Spaces:
Running on Zero
Running on Zero
| #!/usr/bin/env python3 | |
| """ | |
| Generate langid_mapping.py from MADLAD-400 paper Table 9. | |
| Parses Table 9 (Section A.1, pages 16-22) from the MADLAD-400 paper PDF, | |
| assigns regions using script-based defaults and language-specific overrides, | |
| and writes the result to langmap/langid_mapping.py. | |
| Usage: | |
| python scripts/generate_langmap.py <path-to-paper.pdf> | |
| Dev dependencies (requirements-dev.txt): | |
| pdfplumber | |
| """ | |
| import re | |
| import sys | |
| from pathlib import Path | |
| import pdfplumber | |
# Pages 16-22 of the paper contain Table 9 (0-indexed: 15-21).
# These are the indices used to look up pages in the opened PDF.
TABLE_PAGES = range(15, 22)
# On the last page of the table (page 22, 0-indexed 21), the table ends around
# y=430 and is followed by Section A.2. Only words whose "top" coordinate is
# <= this threshold are kept on that page.
PAGE_22_MAX_TOP = 440
# Column x-coordinate boundaries for Table 9. A word belongs to a column when
# MIN <= x0 < MAX (half-open interval on the word's left edge); the data
# columns have no upper bound and take everything with x0 >= COL_DATA_MIN.
COL_BCP47_MIN = 100
COL_BCP47_MAX = 140
COL_NAME_MIN = 140
COL_NAME_MAX = 220
COL_SCRIPT_MIN = 220
COL_SCRIPT_MAX = 260
COL_DATA_MIN = 260
# Maximum difference in "top" coordinate for a name/script/data word to count
# as being in the same row as a BCP-47 cell. Some multi-line name cells have a
# slightly different y-position than the BCP-47 cell.
# NOTE(review): the original comment calls the unit "pixels"; the value is in
# whatever page-coordinate unit pdfplumber reports — confirm.
ROW_TOLERANCE = 5
# BCP-47 code pattern: a base subtag of two or more lowercase letters, followed
# by zero or more hyphen-separated alphanumeric subtags of any case
# (e.g. tly-IR, nan-Latn-TW).
BCP47_RE = re.compile(r"^[a-z]{2,}(-[A-Za-z0-9]+)*$")
# Default region based on writing script.
# assign_region() consults this table only when the language code has no entry
# in LANGUAGE_OVERRIDES. Scripts that are not listed here (there is no "Latn"
# key, for example) fall through to the "Other" region.
SCRIPT_TO_REGION: dict[str, str] = {
    "Arab": "Middle East & North Africa",
    "Armn": "Europe",
    "Beng": "South Asia",
    "Cans": "Americas",
    "Cher": "Americas",
    "Cyrl": "Europe",
    "Deva": "South Asia",
    "Ethi": "Africa",
    "Geor": "Europe",
    "Grek": "Europe",
    "Gujr": "South Asia",
    "Guru": "South Asia",
    "Hang": "East Asia",
    "Hans": "East Asia",
    "Hant": "East Asia",
    "Hebr": "Middle East & North Africa",
    "Jpan": "East Asia",
    "Khmr": "Southeast Asia",
    "Knda": "South Asia",
    "Kore": "East Asia",
    "Laoo": "Southeast Asia",
    "Mlym": "South Asia",
    "Mong": "East Asia",
    "Mymr": "Southeast Asia",
    "Olck": "South Asia",
    "Orya": "South Asia",
    "Sinh": "South Asia",
    "Syrc": "Middle East & North Africa",
    "Taml": "South Asia",
    "Telu": "South Asia",
    "Tfng": "Africa",
    "Thaa": "South Asia",
    "Thai": "Southeast Asia",
    "Tibt": "East Asia",
}
# Language-specific overrides (primarily for Latin-script languages and corrections).
# Keys are the full BCP-47 code exactly as it appears in the table (see
# "kaa-Latn" below); assign_region() does an exact-match lookup here before
# falling back to SCRIPT_TO_REGION, so an entry always wins over the script default.
LANGUAGE_OVERRIDES: dict[str, str] = {
    # Europe (Latin-script)
    "ang": "Europe",
    "br": "Europe",
    "bs": "Europe",
    "ca": "Europe",
    "co": "Europe",
    "cs": "Europe",
    "cy": "Europe",
    "da": "Europe",
    "de": "Europe",
    "en": "Europe",
    "eo": "Europe",
    "es": "Europe",
    "et": "Europe",
    "eu": "Europe",
    "fi": "Europe",
    "fo": "Europe",
    "fr": "Europe",
    "frp": "Europe",
    "fy": "Europe",
    "ga": "Europe",
    "gag": "Europe",
    "gd": "Europe",
    "gl": "Europe",
    "gsw": "Europe",
    "gv": "Europe",
    "hr": "Europe",
    "hu": "Europe",
    "is": "Europe",
    "it": "Europe",
    "kw": "Europe",
    "la": "Europe",
    "lb": "Europe",
    "lt": "Europe",
    "ltg": "Europe",
    "lv": "Europe",
    "mk": "Europe",
    "mt": "Europe",
    "nl": "Europe",
    "nn": "Europe",
    "no": "Europe",
    "oc": "Europe",
    "pl": "Europe",
    "pt": "Europe",
    "rm": "Europe",
    "rmc": "Europe",
    "ro": "Europe",
    "rom": "Europe",
    "se": "Europe",
    "sk": "Europe",
    "sl": "Europe",
    "sq": "Europe",
    "sr": "Europe",
    "stq": "Europe",
    "sv": "Europe",
    "vec": "Europe",
    "wa": "Europe",
    # Africa (Latin-script)
    "aa": "Africa",
    "ach": "Africa",
    "ada": "Africa",
    "adh": "Africa",
    "af": "Africa",
    "ak": "Africa",
    "alz": "Africa",
    "ann": "Africa",
    "bas": "Africa",
    "bci": "Africa",
    "bim": "Africa",
    "bm": "Africa",
    "bqc": "Africa",
    "bum": "Africa",
    "bus": "Africa",
    "cce": "Africa",
    "cjk": "Africa",
    "din": "Africa",
    "dje": "Africa",
    "dov": "Africa",
    "dwr": "Africa",
    "dyu": "Africa",
    "ee": "Africa",
    "ff": "Africa",
    "ffm": "Africa",
    "fip": "Africa",
    "fon": "Africa",
    # NOTE(review): "gub" sits under the Africa heading but is mapped to
    # "Americas" — the value looks intentional; consider moving the entry.
    "gub": "Americas",
    "gvl": "Africa",
    "ha": "Africa",
    "ig": "Africa",
    "ibb": "Africa",
    "iso": "Africa",
    "izz": "Africa",
    "kbp": "Africa",
    "kg": "Africa",
    "kj": "Africa",
    "kmb": "Africa",
    "kri": "Africa",
    "ktu": "Africa",
    "laj": "Africa",
    "lg": "Africa",
    "ln": "Africa",
    "lu": "Africa",
    "mas": "Africa",
    "mfe": "Africa",
    "mg": "Africa",
    "mgh": "Africa",
    "niq": "Africa",
    "nnb": "Africa",
    "nso": "Africa",
    "nr": "Africa",
    "ny": "Africa",
    "nyu": "Africa",
    "nzi": "Africa",
    "om": "Africa",
    "rn": "Africa",
    "rw": "Africa",
    "seh": "Africa",
    "sg": "Africa",
    "sn": "Africa",
    "so": "Africa",
    "spp": "Africa",
    "srr": "Africa",
    "ss": "Africa",
    "st": "Africa",
    "sus": "Africa",
    "sw": "Africa",
    "tbz": "Africa",
    "tdx": "Africa",
    "teo": "Africa",
    "tiv": "Africa",
    "tll": "Africa",
    "tn": "Africa",
    "ts": "Africa",
    "tsc": "Africa",
    "ve": "Africa",
    "wal": "Africa",
    "wo": "Africa",
    "xh": "Africa",
    "yo": "Africa",
    "zne": "Africa",
    "zu": "Africa",
    # Americas (Latin-script)
    "acf": "Americas",
    "agr": "Americas",
    "cab": "Americas",
    "amu": "Americas",
    "arn": "Americas",
    "ay": "Americas",
    "bzj": "Americas",
    "cac": "Americas",
    "cak": "Americas",
    "cni": "Americas",
    "ctu": "Americas",
    "cuk": "Americas",
    "djk": "Americas",
    "emp": "Americas",
    "gn": "Americas",
    "guc": "Americas",
    "guh": "Americas",
    "gui": "Americas",
    "gym": "Americas",
    "gyn": "Americas",
    "haw": "Americas",
    "ht": "Americas",
    "hus": "Americas",
    "inb": "Americas",
    "jac": "Americas",
    "jam": "Americas",
    "jiv": "Americas",
    "kek": "Americas",
    "kl": "Americas",
    "knj": "Americas",
    "kwi": "Americas",
    "mam": "Americas",
    "maz": "Americas",
    "miq": "Americas",
    "ngu": "Americas",
    "nhe": "Americas",
    "noa": "Americas",
    "nv": "Americas",
    "otq": "Americas",
    "pap": "Americas",
    "qu": "Americas",
    "qub": "Americas",
    "quc": "Americas",
    "quf": "Americas",
    "quh": "Americas",
    "qup": "Americas",
    "quy": "Americas",
    "qvc": "Americas",
    "qvi": "Americas",
    "qvz": "Americas",
    "qxr": "Americas",
    "rcf": "Americas",
    "shp": "Americas",
    "sja": "Americas",
    "srm": "Americas",
    "srn": "Americas",
    "tca": "Americas",
    "toj": "Americas",
    "tzh": "Americas",
    "tzj": "Americas",
    "tzo": "Americas",
    "yua": "Americas",
    "zap": "Americas",
    # Southeast Asia (Latin-script)
    "ace": "Southeast Asia",
    "ahk": "Southeast Asia",
    "akb": "Southeast Asia",
    "ban": "Southeast Asia",
    "bbc": "Southeast Asia",
    "bew": "Southeast Asia",
    "bgz": "Southeast Asia",
    "bik": "Southeast Asia",
    "bru": "Southeast Asia",
    "btx": "Southeast Asia",
    "bts": "Southeast Asia",
    "cbk": "Southeast Asia",
    "ceb": "Southeast Asia",
    "cfm": "Southeast Asia",
    "cjm": "Southeast Asia",
    "cnh": "Southeast Asia",
    "dtp": "Southeast Asia",
    "fil": "Southeast Asia",
    "gor": "Southeast Asia",
    "hil": "Southeast Asia",
    "hmn": "Southeast Asia",
    "iba": "Southeast Asia",
    "id": "Southeast Asia",
    "ify": "Southeast Asia",
    "ilo": "Southeast Asia",
    "ium": "Southeast Asia",
    "jax": "Southeast Asia",
    "jv": "Southeast Asia",
    "jvn": "Southeast Asia",
    "kac": "Southeast Asia",
    "krj": "Southeast Asia",
    "lhu": "Southeast Asia",
    "mad": "Southeast Asia",
    "mak": "Southeast Asia",
    "mbt": "Southeast Asia",
    "mel": "Southeast Asia",
    "meo": "Southeast Asia",
    "min": "Southeast Asia",
    "mkn": "Southeast Asia",
    "mqy": "Southeast Asia",
    "mrw": "Southeast Asia",
    "ms": "Southeast Asia",
    "msi": "Southeast Asia",
    "msb": "Southeast Asia",
    "msm": "Southeast Asia",
    "nia": "Southeast Asia",
    "nij": "Southeast Asia",
    "nut": "Southeast Asia",
    "pag": "Southeast Asia",
    "pam": "Southeast Asia",
    "pck": "Southeast Asia",
    "ppk": "Southeast Asia",
    "prk": "Southeast Asia",
    "sda": "Southeast Asia",
    "su": "Southeast Asia",
    "sxn": "Southeast Asia",
    "tet": "Southeast Asia",
    "tl": "Southeast Asia",
    "tsg": "Southeast Asia",
    "tyz": "Southeast Asia",
    "vi": "Southeast Asia",
    "war": "Southeast Asia",
    "xmm": "Southeast Asia",
    # Oceania (Latin-script)
    "abt": "Oceania",
    "ape": "Oceania",
    "bi": "Oceania",
    "ch": "Oceania",
    "chk": "Oceania",
    "enq": "Oceania",
    "fj": "Oceania",
    "gil": "Oceania",
    "hif": "Oceania",
    "ho": "Oceania",
    "hui": "Oceania",
    "hvn": "Oceania",
    "kos": "Oceania",
    "ksd": "Oceania",
    "meu": "Oceania",
    "mh": "Oceania",
    "mi": "Oceania",
    "mps": "Oceania",
    "pau": "Oceania",
    "pis": "Oceania",
    "pon": "Oceania",
    "rwo": "Oceania",
    "sm": "Oceania",
    "to": "Oceania",
    "tuc": "Oceania",
    "tvl": "Oceania",
    "twu": "Oceania",
    "ubu": "Oceania",
    "yap": "Oceania",
    # South Asia (Latin-script)
    "kha": "South Asia",
    "lus": "South Asia",
    "smt": "South Asia",
    "trp": "South Asia",
    # East Asia (Latin-script)
    "mn": "East Asia",
    "za": "East Asia",
    # Middle East & North Africa (Latin-script)
    "bgp": "Middle East & North Africa",
    "ku": "Middle East & North Africa",
    "tr": "Middle East & North Africa",
    "zza": "Middle East & North Africa",
    # Central Asia (Latin-script)
    "az": "Central Asia",
    "kaa-Latn": "Central Asia",
    "kk": "Central Asia",
    "ky": "Central Asia",
    "tg": "Central Asia",
    "tk": "Central Asia",
    "uz": "Central Asia",
    # Explicit regions for languages the original author grouped as
    # Cyrillic-script. Entries mapped to "Europe" restate the "Cyrl" default in
    # SCRIPT_TO_REGION; only "ba" and "sah" actually move a language out of Europe.
    "ba": "Central Asia",
    "ce": "Europe",
    "cv": "Europe",
    "kv": "Europe",
    "os": "Europe",
    "sah": "East Asia",
    "tt": "Europe",
    # Catch-all entries assigned to Americas: a creole grouped with other
    # creoles, a constructed language, and a non-linguistic placeholder.
    "crs": "Americas",  # Seselwa Creole French (Indian Ocean) — grouped with Atlantic/Indian Ocean creoles
    "tlh": "Americas",  # Klingon — constructed language
    "zxx": "Americas",  # Noise/non-linguistic content — placeholder region
}
def fix_name_spacing(name: str) -> str:
    """Re-insert spaces that PDF word extraction dropped from a name.

    Adjacent words that sit close together on the page can come out of the
    extractor glued together. Three repairs are applied, in this order:

    1. a space between a lowercase (or accented lowercase) letter and a
       following ASCII uppercase letter,
    2. a space between a word character and a following ``(``,
    3. a space after a period that is directly followed by an ASCII
       uppercase letter.

    Args:
        name: The raw name text as joined from extracted words.

    Returns:
        The name with the missing spaces restored.
    """
    repairs = (
        (r"([a-zà-öø-ÿ])([A-Z])", r"\1 \2"),  # "HaitianCreole" -> "Haitian Creole"
        (r"(\w)\(", r"\1 ("),  # "Foo(Bar)" -> "Foo (Bar)"
        (r"\.([A-Z])", r". \1"),  # "St.Lucian" -> "St. Lucian"
    )
    for pattern, replacement in repairs:
        name = re.sub(pattern, replacement, name)
    return name
def _column_words(words: list[dict], x_min: float, x_max: float, max_top: float) -> list[dict]:
    """Return the words whose left edge is in [x_min, x_max) and whose top does not exceed max_top."""
    return [w for w in words if x_min <= w["x0"] < x_max and w["top"] <= max_top]


def _row_texts(column: list[dict], row_top: float) -> list[str]:
    """Return, ordered left to right, the texts of column words within ROW_TOLERANCE of row_top."""
    in_row = [w for w in column if abs(w["top"] - row_top) <= ROW_TOLERANCE]
    return [w["text"] for w in sorted(in_row, key=lambda w: w["x0"])]


def parse_table9(pdf_path: str) -> list[dict[str, str]]:
    """Parse Table 9 from the MADLAD-400 paper using word-based extraction.

    pdfplumber's extract_table() does not work on this PDF because the table
    lines are not detected as table borders. Instead, words are extracted and
    assigned to columns by x-coordinate, then matched to each BCP-47 cell by
    "top" coordinate within ROW_TOLERANCE to handle multi-line name cells.

    A row is skipped when its BCP-47 cell does not look like a language code
    (or is "total"/"median"), when the name or script cell is empty, or when
    every data cell is "-" (self-audit omissions).

    Args:
        pdf_path: Path to the paper PDF.

    Returns:
        One dict per kept row with keys "bcp47", "name" and "script", in page
        order.

    Raises:
        IndexError: If the PDF has fewer pages than TABLE_PAGES requires.
    """
    entries: list[dict[str, str]] = []
    # Derive the final table page from TABLE_PAGES instead of repeating the
    # literal index, so the Section A.2 cut-off follows the range if it changes.
    last_table_page = max(TABLE_PAGES, default=-1)
    with pdfplumber.open(pdf_path) as pdf:
        if last_table_page >= len(pdf.pages):
            raise IndexError(
                f"{pdf_path} has {len(pdf.pages)} pages; Table 9 needs page index {last_table_page}"
            )
        for page_num in TABLE_PAGES:
            words = pdf.pages[page_num].extract_words()
            # On the last table page, cut off below the table (avoids Section A.2 code block)
            max_top = PAGE_22_MAX_TOP if page_num == last_table_page else float("inf")
            # Partition words by column; the data columns have no right-hand bound.
            bcp47_col = _column_words(words, COL_BCP47_MIN, COL_BCP47_MAX, max_top)
            name_col = _column_words(words, COL_NAME_MIN, COL_NAME_MAX, max_top)
            script_col = _column_words(words, COL_SCRIPT_MIN, COL_SCRIPT_MAX, max_top)
            data_col = _column_words(words, COL_DATA_MIN, float("inf"), max_top)
            for bcp_w in bcp47_col:
                bcp47 = bcp_w["text"]
                # Skip non-BCP-47 text (headers, prose, code)
                if not BCP47_RE.match(bcp47) or bcp47 in ("total", "median"):
                    continue
                row_top = bcp_w["top"]
                name = fix_name_spacing(" ".join(_row_texts(name_col, row_top)))
                script = " ".join(_row_texts(script_col, row_top))
                # Require both name and script to be present
                if not name or not script:
                    continue
                # Skip rows where all data columns are "-" (self-audit omissions)
                data_texts = _row_texts(data_col, row_top)
                if data_texts and all(d.strip() == "-" for d in data_texts):
                    continue
                entries.append({"bcp47": bcp47, "name": name, "script": script})
    return entries
def assign_region(bcp47: str, script: str) -> str:
    """Pick the geographic region for a language.

    An exact-match entry in LANGUAGE_OVERRIDES takes precedence; otherwise the
    script's default from SCRIPT_TO_REGION is used, and "Other" is returned
    when neither table knows the input.

    Args:
        bcp47: Language code exactly as it appears in the table.
        script: Script code of the language.

    Returns:
        The region name, or "Other" if no rule applies.
    """
    try:
        return LANGUAGE_OVERRIDES[bcp47]
    except KeyError:
        # No per-language rule: fall back to the script default, if any.
        return SCRIPT_TO_REGION.get(script, "Other")
def write_mapping(entries: list[dict[str, str]], output_path: Path) -> None:
    """Write the language mapping to a Python source file.

    Each entry becomes one line of a ``langid_to_language`` dict literal, keyed
    by the ``<2xx>`` token built from the entry's BCP-47 code and carrying the
    language name and its assigned region. Rows are ordered by region, then by
    name within each region.

    Args:
        entries: Parsed rows, each with "bcp47", "name" and "script" keys.
        output_path: Destination file; any existing content is overwritten.
    """
    # Sort by region, then by name within each region
    mapping = sorted(
        (
            (f"<2{entry['bcp47']}>", entry["name"], assign_region(entry["bcp47"], entry["script"]))
            for entry in entries
        ),
        key=lambda row: (row[2], row[1]),
    )
    lines = [
        "# Auto-generated by scripts/generate_langmap.py from MADLAD-400 paper Table 9 (Section A.1)",
        "# Source: https://arxiv.org/pdf/2309.04662",
        f"# {len(mapping)} languages with training data (excludes 79 self-audit omissions)",
        "#",
        "# To regenerate:",
        "# python scripts/generate_langmap.py <path-to-paper.pdf>",
        "langid_to_language = {",
    ]
    for token, name, region in mapping:
        # Escape backslashes first, then quotes, so the name is always a valid
        # double-quoted Python string literal that evaluates to the original text.
        escaped_name = name.replace("\\", "\\\\").replace('"', '\\"')
        lines.append(f' "{token}": {{"name": "{escaped_name}", "region": "{region}"}},')
    lines.append("}")
    lines.append("")
    # Python reads source files as UTF-8 by default, so write the module as
    # UTF-8 explicitly instead of relying on the platform's locale encoding;
    # language names may contain non-ASCII characters.
    output_path.write_text("\n".join(lines), encoding="utf-8")
def main() -> None:
    """Command-line entry point: parse the paper PDF and regenerate the mapping.

    Expects exactly one argument, the path to the paper PDF; otherwise prints
    the usage line and exits with status 1. After parsing, any language whose
    region resolves to "Other" is listed so an override can be added. The
    mapping file is written in either case, and the process exits with status 1
    when such unassigned languages exist.
    """
    args = sys.argv[1:]
    if len(args) != 1:
        print("Usage: python scripts/generate_langmap.py <path-to-paper.pdf>")
        sys.exit(1)

    entries = parse_table9(args[0])
    print(f"Parsed {len(entries)} languages from Table 9")

    # Collect the languages that neither an override nor a script default covers.
    unassigned = [
        (entry["bcp47"], entry["name"])
        for entry in entries
        if assign_region(entry["bcp47"], entry["script"]) == "Other"
    ]
    if unassigned:
        print(f"\n{len(unassigned)} languages assigned to 'Other' region (need manual override):")
        for code, language in unassigned:
            print(f" {code}: {language}")
        print("\nAdd overrides to LANGUAGE_OVERRIDES in the script and re-run.")

    # The output lives in the langmap/ directory next to this script's parent directory.
    output_path = Path(__file__).parents[1] / "langmap" / "langid_mapping.py"
    write_mapping(entries, output_path)
    print(f"\nWrote {len(entries)} entries to {output_path}")

    # Signal incomplete region coverage to callers (e.g. CI) after writing the file.
    if unassigned:
        sys.exit(1)


if __name__ == "__main__":
    main()