madlad-400-translate / scripts /generate_langmap.py
Daryl Lim
chore: remove unused pycountry dev dependency
dbb3bbf
Raw
History Blame
16.4 kB
#!/usr/bin/env python3
"""
Generate langid_mapping.py from MADLAD-400 paper Table 9.

Parses Table 9 (Section A.1, pages 16-22) from the MADLAD-400 paper PDF,
assigns regions using script-based defaults and language-specific overrides,
and writes the result to langmap/langid_mapping.py.

Usage:
    python scripts/generate_langmap.py <path-to-paper.pdf>

Dev dependencies (requirements-dev.txt):
    pdfplumber
"""
import re
import sys
from pathlib import Path
import pdfplumber
# Table 9 spans pages 16-22 of the paper; page indices here are 0-based (15-21).
TABLE_PAGES = range(15, 22)

# Page 22 (index 21) is only partly table: the table ends around y=430 and
# Section A.2 follows. Words whose `top` exceeds this cutoff are ignored there.
PAGE_22_MAX_TOP = 440

# Ranges of word left edges (x0) for the Table 9 columns. The columns are
# contiguous, so each one starts where the previous one ends; the data columns
# are everything from COL_DATA_MIN rightwards.
COL_BCP47_MIN, COL_BCP47_MAX = 100, 140
COL_NAME_MIN, COL_NAME_MAX = COL_BCP47_MAX, 220
COL_SCRIPT_MIN, COL_SCRIPT_MAX = COL_NAME_MAX, 260
COL_DATA_MIN = COL_SCRIPT_MAX

# Largest difference in `top` (same units as the word coordinates) at which a
# name/script/data word still counts as being on the same row as a BCP-47 word.
# Some multi-line name cells sit at a slightly different y-position than the
# BCP-47 cell.
ROW_TOLERANCE = 5

# Shape of a BCP-47 code as it appears in the table: a lowercase base subtag of
# two or more letters, then any number of hyphen-separated alphanumeric subtags
# (e.g. tly-IR, nan-Latn-TW).
BCP47_RE = re.compile(r"^[a-z]{2,}(-[A-Za-z0-9]+)*$")
# Script codes grouped by the region they default to. Latin is deliberately
# absent: Latin-script languages get their region from per-language overrides.
_REGION_SCRIPTS: dict[str, tuple[str, ...]] = {
    "Africa": ("Ethi", "Tfng"),
    "Americas": ("Cans", "Cher"),
    "East Asia": ("Hang", "Hans", "Hant", "Jpan", "Kore", "Mong", "Tibt"),
    "Europe": ("Armn", "Cyrl", "Geor", "Grek"),
    "Middle East & North Africa": ("Arab", "Hebr", "Syrc"),
    "South Asia": (
        "Beng", "Deva", "Gujr", "Guru", "Knda", "Mlym",
        "Olck", "Orya", "Sinh", "Taml", "Telu", "Thaa",
    ),
    "Southeast Asia": ("Khmr", "Laoo", "Mymr", "Thai"),
}

# Default region for a language, looked up by its writing script.
# Flattened from the grouping above and sorted so keys are in alphabetical order.
SCRIPT_TO_REGION: dict[str, str] = dict(
    sorted(
        (script, region)
        for region, scripts in _REGION_SCRIPTS.items()
        for script in scripts
    )
)
# Per-language region overrides, keyed by the exact BCP-47 code from Table 9.
# assign_region() consults this table before SCRIPT_TO_REGION, so an entry here
# wins over the script default. Most entries are Latin-script languages (Latin
# has no script default); the rest correct or restate a script default.
# The "# <Region> (...)" headers below are grouping aids only; the value on each
# line is what is actually used.
LANGUAGE_OVERRIDES: dict[str, str] = {
# Europe (Latin-script)
"ang": "Europe",
"br": "Europe",
"bs": "Europe",
"ca": "Europe",
"co": "Europe",
"cs": "Europe",
"cy": "Europe",
"da": "Europe",
"de": "Europe",
"en": "Europe",
"eo": "Europe",
"es": "Europe",
"et": "Europe",
"eu": "Europe",
"fi": "Europe",
"fo": "Europe",
"fr": "Europe",
"frp": "Europe",
"fy": "Europe",
"ga": "Europe",
"gag": "Europe",
"gd": "Europe",
"gl": "Europe",
"gsw": "Europe",
"gv": "Europe",
"hr": "Europe",
"hu": "Europe",
"is": "Europe",
"it": "Europe",
"kw": "Europe",
"la": "Europe",
"lb": "Europe",
"lt": "Europe",
"ltg": "Europe",
"lv": "Europe",
"mk": "Europe",
"mt": "Europe",
"nl": "Europe",
"nn": "Europe",
"no": "Europe",
"oc": "Europe",
"pl": "Europe",
"pt": "Europe",
"rm": "Europe",
"rmc": "Europe",
"ro": "Europe",
"rom": "Europe",
"se": "Europe",
"sk": "Europe",
"sl": "Europe",
"sq": "Europe",
"sr": "Europe",
"stq": "Europe",
"sv": "Europe",
"vec": "Europe",
"wa": "Europe",
# Africa (Latin-script)
"aa": "Africa",
"ach": "Africa",
"ada": "Africa",
"adh": "Africa",
"af": "Africa",
"ak": "Africa",
"alz": "Africa",
"ann": "Africa",
"bas": "Africa",
"bci": "Africa",
"bim": "Africa",
"bm": "Africa",
"bqc": "Africa",
"bum": "Africa",
"bus": "Africa",
"cce": "Africa",
"cjk": "Africa",
"din": "Africa",
"dje": "Africa",
"dov": "Africa",
"dwr": "Africa",
"dyu": "Africa",
"ee": "Africa",
"ff": "Africa",
"ffm": "Africa",
"fip": "Africa",
"fon": "Africa",
"gub": "Americas",  # NOTE(review): Americas entry listed inside the Africa group; consider moving it
"gvl": "Africa",
"ha": "Africa",
"ig": "Africa",
"ibb": "Africa",
"iso": "Africa",
"izz": "Africa",
"kbp": "Africa",
"kg": "Africa",
"kj": "Africa",
"kmb": "Africa",
"kri": "Africa",
"ktu": "Africa",
"laj": "Africa",
"lg": "Africa",
"ln": "Africa",
"lu": "Africa",
"mas": "Africa",
"mfe": "Africa",
"mg": "Africa",
"mgh": "Africa",
"niq": "Africa",
"nnb": "Africa",
"nso": "Africa",
"nr": "Africa",
"ny": "Africa",
"nyu": "Africa",
"nzi": "Africa",
"om": "Africa",
"rn": "Africa",
"rw": "Africa",
"seh": "Africa",
"sg": "Africa",
"sn": "Africa",
"so": "Africa",
"spp": "Africa",
"srr": "Africa",
"ss": "Africa",
"st": "Africa",
"sus": "Africa",
"sw": "Africa",
"tbz": "Africa",
"tdx": "Africa",
"teo": "Africa",
"tiv": "Africa",
"tll": "Africa",
"tn": "Africa",
"ts": "Africa",
"tsc": "Africa",
"ve": "Africa",
"wal": "Africa",
"wo": "Africa",
"xh": "Africa",
"yo": "Africa",
"zne": "Africa",
"zu": "Africa",
# Americas (Latin-script)
"acf": "Americas",
"agr": "Americas",
"cab": "Americas",
"amu": "Americas",
"arn": "Americas",
"ay": "Americas",
"bzj": "Americas",
"cac": "Americas",
"cak": "Americas",
"cni": "Americas",
"ctu": "Americas",
"cuk": "Americas",
"djk": "Americas",
"emp": "Americas",
"gn": "Americas",
"guc": "Americas",
"guh": "Americas",
"gui": "Americas",
"gym": "Americas",
"gyn": "Americas",
"haw": "Americas",
"ht": "Americas",
"hus": "Americas",
"inb": "Americas",
"jac": "Americas",
"jam": "Americas",
"jiv": "Americas",
"kek": "Americas",
"kl": "Americas",
"knj": "Americas",
"kwi": "Americas",
"mam": "Americas",
"maz": "Americas",
"miq": "Americas",
"ngu": "Americas",
"nhe": "Americas",
"noa": "Americas",
"nv": "Americas",
"otq": "Americas",
"pap": "Americas",
"qu": "Americas",
"qub": "Americas",
"quc": "Americas",
"quf": "Americas",
"quh": "Americas",
"qup": "Americas",
"quy": "Americas",
"qvc": "Americas",
"qvi": "Americas",
"qvz": "Americas",
"qxr": "Americas",
"rcf": "Americas",
"shp": "Americas",
"sja": "Americas",
"srm": "Americas",
"srn": "Americas",
"tca": "Americas",
"toj": "Americas",
"tzh": "Americas",
"tzj": "Americas",
"tzo": "Americas",
"yua": "Americas",
"zap": "Americas",
# Southeast Asia (Latin-script)
"ace": "Southeast Asia",
"ahk": "Southeast Asia",
"akb": "Southeast Asia",
"ban": "Southeast Asia",
"bbc": "Southeast Asia",
"bew": "Southeast Asia",
"bgz": "Southeast Asia",
"bik": "Southeast Asia",
"bru": "Southeast Asia",
"btx": "Southeast Asia",
"bts": "Southeast Asia",
"cbk": "Southeast Asia",
"ceb": "Southeast Asia",
"cfm": "Southeast Asia",
"cjm": "Southeast Asia",
"cnh": "Southeast Asia",
"dtp": "Southeast Asia",
"fil": "Southeast Asia",
"gor": "Southeast Asia",
"hil": "Southeast Asia",
"hmn": "Southeast Asia",
"iba": "Southeast Asia",
"id": "Southeast Asia",
"ify": "Southeast Asia",
"ilo": "Southeast Asia",
"ium": "Southeast Asia",
"jax": "Southeast Asia",
"jv": "Southeast Asia",
"jvn": "Southeast Asia",
"kac": "Southeast Asia",
"krj": "Southeast Asia",
"lhu": "Southeast Asia",
"mad": "Southeast Asia",
"mak": "Southeast Asia",
"mbt": "Southeast Asia",
"mel": "Southeast Asia",
"meo": "Southeast Asia",
"min": "Southeast Asia",
"mkn": "Southeast Asia",
"mqy": "Southeast Asia",
"mrw": "Southeast Asia",
"ms": "Southeast Asia",
"msi": "Southeast Asia",
"msb": "Southeast Asia",
"msm": "Southeast Asia",
"nia": "Southeast Asia",
"nij": "Southeast Asia",
"nut": "Southeast Asia",
"pag": "Southeast Asia",
"pam": "Southeast Asia",
"pck": "Southeast Asia",
"ppk": "Southeast Asia",
"prk": "Southeast Asia",
"sda": "Southeast Asia",
"su": "Southeast Asia",
"sxn": "Southeast Asia",
"tet": "Southeast Asia",
"tl": "Southeast Asia",
"tsg": "Southeast Asia",
"tyz": "Southeast Asia",
"vi": "Southeast Asia",
"war": "Southeast Asia",
"xmm": "Southeast Asia",
# Oceania (Latin-script)
"abt": "Oceania",
"ape": "Oceania",
"bi": "Oceania",
"ch": "Oceania",
"chk": "Oceania",
"enq": "Oceania",
"fj": "Oceania",
"gil": "Oceania",
"hif": "Oceania",
"ho": "Oceania",
"hui": "Oceania",
"hvn": "Oceania",
"kos": "Oceania",
"ksd": "Oceania",
"meu": "Oceania",
"mh": "Oceania",
"mi": "Oceania",
"mps": "Oceania",
"pau": "Oceania",
"pis": "Oceania",
"pon": "Oceania",
"rwo": "Oceania",
"sm": "Oceania",
"to": "Oceania",
"tuc": "Oceania",
"tvl": "Oceania",
"twu": "Oceania",
"ubu": "Oceania",
"yap": "Oceania",
# South Asia (Latin-script)
"kha": "South Asia",
"lus": "South Asia",
"smt": "South Asia",
"trp": "South Asia",
# East Asia (Latin-script)
"mn": "East Asia",
"za": "East Asia",
# Middle East & North Africa (Latin-script)
"bgp": "Middle East & North Africa",
"ku": "Middle East & North Africa",
"tr": "Middle East & North Africa",
"zza": "Middle East & North Africa",
# Central Asia (Latin-script)
"az": "Central Asia",
"kaa-Latn": "Central Asia",
"kk": "Central Asia",
"ky": "Central Asia",
"tg": "Central Asia",
"tk": "Central Asia",
"uz": "Central Asia",
# Explicit regions for Cyrillic languages: some correct the Cyrl -> "Europe"
# script default (ba, sah), the others restate it so the choice is on record.
"ba": "Central Asia",
"ce": "Europe",
"cv": "Europe",
"kv": "Europe",
"os": "Europe",
"sah": "East Asia",
"tt": "Europe",
# Special cases placed in Americas: a constructed language (creator's continent),
# a non-linguistic placeholder, and one creole grouped there (see inline notes).
"crs": "Americas", # Seselwa Creole French (Indian Ocean) — grouped with Atlantic/Indian Ocean creoles
"tlh": "Americas", # Klingon — constructed language
"zxx": "Americas", # Noise/non-linguistic content — placeholder region
}
def fix_name_spacing(name: str) -> str:
    """Restore spaces that PDF word extraction dropped from a language name.

    Adjacent words that sit close together on the page can come out glued
    together. Three repairs are applied, in this order:

    1. a space between a lowercase letter (ASCII or Latin-1 accented) and a
       following ASCII capital;
    2. a space between a word character and a following "(";
    3. a space between a period and a following ASCII capital.

    Returns the repaired name; text matching none of the patterns is returned
    unchanged.
    """
    repairs = (
        (r"([a-zà-öø-ÿ])([A-Z])", r"\1 \2"),
        (r"(\w)\(", r"\1 ("),
        (r"\.([A-Z])", r". \1"),
    )
    fixed = name
    for pattern, replacement in repairs:
        fixed = re.sub(pattern, replacement, fixed)
    return fixed
def parse_table9(pdf_path: str) -> list[dict[str, str]]:
    """Extract the language rows of Table 9 from the MADLAD-400 paper PDF.

    pdfplumber's extract_table() does not work on this PDF because the table
    lines are not detected as table borders. Words are extracted instead,
    bucketed into columns by their left edge (x0), and attached to a row when
    their `top` lies within ROW_TOLERANCE of a BCP-47 code's `top`.

    Args:
        pdf_path: Path to the paper PDF.

    Returns:
        One dict per kept row, in page and extraction order, with the keys
        "bcp47", "name" and "script". A row is dropped when its code does not
        look like a BCP-47 tag (or is "total"/"median"), when its name or
        script is empty, or when every data cell is "-" (self-audit omission).
    """

    def row_texts(column: list[dict], row_top: float) -> list[str]:
        """Return the texts of *column* words on the row at *row_top*, left to right."""
        # NOTE(review): ordering is by x0 only. If both lines of a wrapped name
        # fall within ROW_TOLERANCE, their words would interleave; confirm
        # against the generated output.
        on_row = [w for w in column if abs(w["top"] - row_top) <= ROW_TOLERANCE]
        on_row.sort(key=lambda w: w["x0"])
        return [w["text"] for w in on_row]

    rows: list[dict[str, str]] = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_index in TABLE_PAGES:
            # Only the last table page (index 21) needs a vertical cutoff, to
            # keep the Section A.2 code block below the table out of the rows.
            if page_index == 21:
                cutoff = PAGE_22_MAX_TOP
            else:
                cutoff = float("inf")
            usable = [w for w in pdf.pages[page_index].extract_words() if w["top"] <= cutoff]

            # Bucket the remaining words by column.
            code_words = [w for w in usable if COL_BCP47_MIN <= w["x0"] < COL_BCP47_MAX]
            name_words = [w for w in usable if COL_NAME_MIN <= w["x0"] < COL_NAME_MAX]
            script_words = [w for w in usable if COL_SCRIPT_MIN <= w["x0"] < COL_SCRIPT_MAX]
            data_words = [w for w in usable if w["x0"] >= COL_DATA_MIN]

            for code_word in code_words:
                code = code_word["text"]
                # Headers, prose and summary rows also land in this column.
                if code in ("total", "median") or not BCP47_RE.match(code):
                    continue

                row_top = code_word["top"]
                name = fix_name_spacing(" ".join(row_texts(name_words, row_top)))
                script = " ".join(row_texts(script_words, row_top))
                cells = row_texts(data_words, row_top)

                # A usable row needs both a name and a script.
                if not (name and script):
                    continue
                # All-dash data cells mark a self-audit omission.
                if cells and all(cell.strip() == "-" for cell in cells):
                    continue

                rows.append({"bcp47": code, "name": name, "script": script})
    return rows
def assign_region(bcp47: str, script: str) -> str:
    """Return the region for a language.

    An exact match on *bcp47* in LANGUAGE_OVERRIDES wins; otherwise the default
    for *script* in SCRIPT_TO_REGION is used; if neither table has an entry the
    result is "Other".
    """
    try:
        return LANGUAGE_OVERRIDES[bcp47]
    except KeyError:
        return SCRIPT_TO_REGION.get(script, "Other")
def write_mapping(entries: list[dict[str, str]], output_path: Path) -> None:
    """Write the language mapping to a Python source file.

    Args:
        entries: Parsed table rows; each needs the keys "bcp47", "name" and
            "script".
        output_path: File to create or overwrite.

    The generated module defines ``langid_to_language``, a dict keyed by the
    ``<2xx>`` token whose values are ``{"name": ..., "region": ...}``, ordered
    by region and then by name. The file is always written as UTF-8 because
    language names may contain non-ASCII characters.
    """
    mapping: list[tuple[str, str, str]] = [
        (
            f"<2{entry['bcp47']}>",
            entry["name"],
            assign_region(entry["bcp47"], entry["script"]),
        )
        for entry in entries
    ]
    # Sort by region, then by name within each region
    mapping.sort(key=lambda row: (row[2], row[1]))

    # NOTE: the "79 self-audit omissions" figure in the header is fixed text; it
    # is not derived from `entries`.
    lines = [
        "# Auto-generated by scripts/generate_langmap.py from MADLAD-400 paper Table 9 (Section A.1)",
        "# Source: https://arxiv.org/pdf/2309.04662",
        f"# {len(mapping)} languages with training data (excludes 79 self-audit omissions)",
        "#",
        "# To regenerate:",
        "# python scripts/generate_langmap.py <path-to-paper.pdf>",
        "langid_to_language = {",
    ]
    for token, name, region in mapping:
        # Escape backslashes first, then quotes, so the name survives as a
        # double-quoted Python string literal.
        escaped_name = name.replace("\\", "\\\\").replace('"', '\\"')
        lines.append(f' "{token}": {{"name": "{escaped_name}", "region": "{region}"}},')
    lines.append("}")
    lines.append("")

    # Explicit encoding: the platform default may not be able to encode
    # accented names.
    output_path.write_text("\n".join(lines), encoding="utf-8")
def main() -> None:
    """Command-line entry point: parse the paper PDF and regenerate the mapping.

    Expects exactly one argument, the path to the paper PDF; otherwise prints a
    usage line and exits with status 1. After parsing, languages whose region
    resolves to "Other" are listed so overrides can be added. The mapping file
    is written in either case, and the process exits with status 1 when any
    such language exists.
    """
    if len(sys.argv) != 2:
        print("Usage: python scripts/generate_langmap.py <path-to-paper.pdf>")
        sys.exit(1)

    _, pdf_path = sys.argv
    entries = parse_table9(pdf_path)
    print(f"Parsed {len(entries)} languages from Table 9")

    # Languages that neither the override table nor the script defaults cover.
    unassigned: list[tuple[str, str]] = [
        (entry["bcp47"], entry["name"])
        for entry in entries
        if assign_region(entry["bcp47"], entry["script"]) == "Other"
    ]
    if unassigned:
        print(f"\n{len(unassigned)} languages assigned to 'Other' region (need manual override):")
        for code, label in unassigned:
            print(f" {code}: {label}")
        print("\nAdd overrides to LANGUAGE_OVERRIDES in the script and re-run.")

    # The output lives in langmap/, a sibling of this script's directory.
    project_root = Path(__file__).parent.parent
    output_path = project_root / "langmap" / "langid_mapping.py"
    write_mapping(entries, output_path)
    print(f"\nWrote {len(entries)} entries to {output_path}")

    # Signal failure so an incomplete region assignment does not go unnoticed.
    if unassigned:
        sys.exit(1)


if __name__ == "__main__":
    main()