Spaces:

darylalim
/

madlad-400-translate

Running on Zero

madlad-400-translate / scripts /generate_langmap.py

Daryl Lim

chore: remove unused pycountry dev dependency

dbb3bbf 6 days ago

16.4 kB

	#!/usr/bin/env python3
	"""
	Generate langid_mapping.py from MADLAD-400 paper Table 9.

	Parses Table 9 (Section A.1, pages 16-22) from the MADLAD-400 paper PDF,
	assigns regions using script-based defaults and language-specific overrides,
	and writes the result to langmap/langid_mapping.py.

	Usage:
	python scripts/generate_langmap.py <path-to-paper.pdf>

	Dev dependencies (requirements-dev.txt):
	pdfplumber
	"""

	import re
	import sys
	from pathlib import Path

	import pdfplumber

	# Pages 16-22 of the paper contain Table 9 (0-indexed: 15-21).
	# range() excludes its stop value, so 22 here means "up to and including index 21".
	TABLE_PAGES = range(15, 22)

	# On the last page of the table (page 22, 0-indexed 21), the table ends around
	# y=430 and is followed by Section A.2. Words whose "top" coordinate is greater
	# than this threshold are ignored on that page (the value leaves a small margin
	# past the stated end of the table).
	PAGE_22_MAX_TOP = 440

	# Column x-coordinate boundaries for Table 9.
	# parse_table9 compares each word's "x0" against these as half-open ranges
	# [MIN, MAX); the data column has only a lower bound and extends to the page edge.
	COL_BCP47_MIN = 100
	COL_BCP47_MAX = 140
	COL_NAME_MIN = 140
	COL_NAME_MAX = 220
	COL_SCRIPT_MIN = 220
	COL_SCRIPT_MAX = 260
	COL_DATA_MIN = 260

	# Maximum difference in "top" between a BCP-47 cell and another cell for the two to
	# count as the same row. Some multi-line name cells have a slightly different
	# y-position than the BCP-47 cell.
	# NOTE(review): the original comment called this "pixels"; it is in whatever unit
	# pdfplumber reports word coordinates in (presumably PDF points) — confirm.
	ROW_TOLERANCE = 5

	# BCP-47 code pattern: a base subtag of two or more lowercase letters, followed by
	# zero or more hyphen-separated alphanumeric subtags (e.g. tly-IR, nan-Latn-TW).
	# Plain lowercase words also match, which is why parse_table9 separately rejects
	# the "total" and "median" summary rows.
	BCP47_RE = re.compile(r"^[a-z]{2,}(-[A-Za-z0-9]+)*$")

	# Default region based on writing script.
	# Keys are the four-letter script codes compared against the script column of
	# Table 9. assign_region consults this mapping only when the language code has no
	# entry in LANGUAGE_OVERRIDES; a script missing from this mapping (there is no
	# "Latn" key, for example) makes assign_region return "Other".
	SCRIPT_TO_REGION: dict[str, str] = {
	"Arab": "Middle East & North Africa",
	"Armn": "Europe",
	"Beng": "South Asia",
	"Cans": "Americas",
	"Cher": "Americas",
	"Cyrl": "Europe",
	"Deva": "South Asia",
	"Ethi": "Africa",
	"Geor": "Europe",
	"Grek": "Europe",
	"Gujr": "South Asia",
	"Guru": "South Asia",
	"Hang": "East Asia",
	"Hans": "East Asia",
	"Hant": "East Asia",
	"Hebr": "Middle East & North Africa",
	"Jpan": "East Asia",
	"Khmr": "Southeast Asia",
	"Knda": "South Asia",
	"Kore": "East Asia",
	"Laoo": "Southeast Asia",
	"Mlym": "South Asia",
	"Mong": "East Asia",
	"Mymr": "Southeast Asia",
	"Olck": "South Asia",
	"Orya": "South Asia",
	"Sinh": "South Asia",
	"Syrc": "Middle East & North Africa",
	"Taml": "South Asia",
	"Telu": "South Asia",
	"Tfng": "Africa",
	"Thaa": "South Asia",
	"Thai": "Southeast Asia",
	"Tibt": "East Asia",
	}

	# Language-specific overrides (primarily for Latin-script languages and corrections).
	# Keys are full BCP-47 codes matched exactly (note the "kaa-Latn" key), and an entry
	# here takes precedence over the script-based default in SCRIPT_TO_REGION.
	# The section headings below are organisational only; the value on each line is
	# what assign_region returns.
	LANGUAGE_OVERRIDES: dict[str, str] = {
	# Europe (Latin-script)
	"ang": "Europe",
	"br": "Europe",
	"bs": "Europe",
	"ca": "Europe",
	"co": "Europe",
	"cs": "Europe",
	"cy": "Europe",
	"da": "Europe",
	"de": "Europe",
	"en": "Europe",
	"eo": "Europe",
	"es": "Europe",
	"et": "Europe",
	"eu": "Europe",
	"fi": "Europe",
	"fo": "Europe",
	"fr": "Europe",
	"frp": "Europe",
	"fy": "Europe",
	"ga": "Europe",
	"gag": "Europe",
	"gd": "Europe",
	"gl": "Europe",
	"gsw": "Europe",
	"gv": "Europe",
	"hr": "Europe",
	"hu": "Europe",
	"is": "Europe",
	"it": "Europe",
	"kw": "Europe",
	"la": "Europe",
	"lb": "Europe",
	"lt": "Europe",
	"ltg": "Europe",
	"lv": "Europe",
	"mk": "Europe",
	"mt": "Europe",
	"nl": "Europe",
	"nn": "Europe",
	"no": "Europe",
	"oc": "Europe",
	"pl": "Europe",
	"pt": "Europe",
	"rm": "Europe",
	"rmc": "Europe",
	"ro": "Europe",
	"rom": "Europe",
	"se": "Europe",
	"sk": "Europe",
	"sl": "Europe",
	"sq": "Europe",
	"sr": "Europe",
	"stq": "Europe",
	"sv": "Europe",
	"vec": "Europe",
	"wa": "Europe",
	# Africa (Latin-script)
	"aa": "Africa",
	"ach": "Africa",
	"ada": "Africa",
	"adh": "Africa",
	"af": "Africa",
	"ak": "Africa",
	"alz": "Africa",
	"ann": "Africa",
	"bas": "Africa",
	"bci": "Africa",
	"bim": "Africa",
	"bm": "Africa",
	"bqc": "Africa",
	"bum": "Africa",
	"bus": "Africa",
	"cce": "Africa",
	"cjk": "Africa",
	"din": "Africa",
	"dje": "Africa",
	"dov": "Africa",
	"dwr": "Africa",
	"dyu": "Africa",
	"ee": "Africa",
	"ff": "Africa",
	"ffm": "Africa",
	"fip": "Africa",
	"fon": "Africa",
	"gub": "Americas",  # NOTE(review): "Americas" value listed under the Africa heading — confirm intended section
	"gvl": "Africa",
	"ha": "Africa",
	"ig": "Africa",
	"ibb": "Africa",
	"iso": "Africa",
	"izz": "Africa",
	"kbp": "Africa",
	"kg": "Africa",
	"kj": "Africa",
	"kmb": "Africa",
	"kri": "Africa",
	"ktu": "Africa",
	"laj": "Africa",
	"lg": "Africa",
	"ln": "Africa",
	"lu": "Africa",
	"mas": "Africa",
	"mfe": "Africa",
	"mg": "Africa",
	"mgh": "Africa",
	"niq": "Africa",
	"nnb": "Africa",
	"nso": "Africa",
	"nr": "Africa",
	"ny": "Africa",
	"nyu": "Africa",
	"nzi": "Africa",
	"om": "Africa",
	"rn": "Africa",
	"rw": "Africa",
	"seh": "Africa",
	"sg": "Africa",
	"sn": "Africa",
	"so": "Africa",
	"spp": "Africa",
	"srr": "Africa",
	"ss": "Africa",
	"st": "Africa",
	"sus": "Africa",
	"sw": "Africa",
	"tbz": "Africa",
	"tdx": "Africa",
	"teo": "Africa",
	"tiv": "Africa",
	"tll": "Africa",
	"tn": "Africa",
	"ts": "Africa",
	"tsc": "Africa",
	"ve": "Africa",
	"wal": "Africa",
	"wo": "Africa",
	"xh": "Africa",
	"yo": "Africa",
	"zne": "Africa",
	"zu": "Africa",
	# Americas (Latin-script)
	"acf": "Americas",
	"agr": "Americas",
	"cab": "Americas",
	"amu": "Americas",
	"arn": "Americas",
	"ay": "Americas",
	"bzj": "Americas",
	"cac": "Americas",
	"cak": "Americas",
	"cni": "Americas",
	"ctu": "Americas",
	"cuk": "Americas",
	"djk": "Americas",
	"emp": "Americas",
	"gn": "Americas",
	"guc": "Americas",
	"guh": "Americas",
	"gui": "Americas",
	"gym": "Americas",
	"gyn": "Americas",
	"haw": "Americas",
	"ht": "Americas",
	"hus": "Americas",
	"inb": "Americas",
	"jac": "Americas",
	"jam": "Americas",
	"jiv": "Americas",
	"kek": "Americas",
	"kl": "Americas",
	"knj": "Americas",
	"kwi": "Americas",
	"mam": "Americas",
	"maz": "Americas",
	"miq": "Americas",
	"ngu": "Americas",
	"nhe": "Americas",
	"noa": "Americas",
	"nv": "Americas",
	"otq": "Americas",
	"pap": "Americas",
	"qu": "Americas",
	"qub": "Americas",
	"quc": "Americas",
	"quf": "Americas",
	"quh": "Americas",
	"qup": "Americas",
	"quy": "Americas",
	"qvc": "Americas",
	"qvi": "Americas",
	"qvz": "Americas",
	"qxr": "Americas",
	"rcf": "Americas",
	"shp": "Americas",
	"sja": "Americas",
	"srm": "Americas",
	"srn": "Americas",
	"tca": "Americas",
	"toj": "Americas",
	"tzh": "Americas",
	"tzj": "Americas",
	"tzo": "Americas",
	"yua": "Americas",
	"zap": "Americas",
	# Southeast Asia (Latin-script)
	"ace": "Southeast Asia",
	"ahk": "Southeast Asia",
	"akb": "Southeast Asia",
	"ban": "Southeast Asia",
	"bbc": "Southeast Asia",
	"bew": "Southeast Asia",
	"bgz": "Southeast Asia",
	"bik": "Southeast Asia",
	"bru": "Southeast Asia",
	"btx": "Southeast Asia",
	"bts": "Southeast Asia",
	"cbk": "Southeast Asia",
	"ceb": "Southeast Asia",
	"cfm": "Southeast Asia",
	"cjm": "Southeast Asia",
	"cnh": "Southeast Asia",
	"dtp": "Southeast Asia",
	"fil": "Southeast Asia",
	"gor": "Southeast Asia",
	"hil": "Southeast Asia",
	"hmn": "Southeast Asia",
	"iba": "Southeast Asia",
	"id": "Southeast Asia",
	"ify": "Southeast Asia",
	"ilo": "Southeast Asia",
	"ium": "Southeast Asia",
	"jax": "Southeast Asia",
	"jv": "Southeast Asia",
	"jvn": "Southeast Asia",
	"kac": "Southeast Asia",
	"krj": "Southeast Asia",
	"lhu": "Southeast Asia",
	"mad": "Southeast Asia",
	"mak": "Southeast Asia",
	"mbt": "Southeast Asia",
	"mel": "Southeast Asia",
	"meo": "Southeast Asia",
	"min": "Southeast Asia",
	"mkn": "Southeast Asia",
	"mqy": "Southeast Asia",
	"mrw": "Southeast Asia",
	"ms": "Southeast Asia",
	"msi": "Southeast Asia",
	"msb": "Southeast Asia",
	"msm": "Southeast Asia",
	"nia": "Southeast Asia",
	"nij": "Southeast Asia",
	"nut": "Southeast Asia",
	"pag": "Southeast Asia",
	"pam": "Southeast Asia",
	"pck": "Southeast Asia",
	"ppk": "Southeast Asia",
	"prk": "Southeast Asia",
	"sda": "Southeast Asia",
	"su": "Southeast Asia",
	"sxn": "Southeast Asia",
	"tet": "Southeast Asia",
	"tl": "Southeast Asia",
	"tsg": "Southeast Asia",
	"tyz": "Southeast Asia",
	"vi": "Southeast Asia",
	"war": "Southeast Asia",
	"xmm": "Southeast Asia",
	# Oceania (Latin-script)
	"abt": "Oceania",
	"ape": "Oceania",
	"bi": "Oceania",
	"ch": "Oceania",
	"chk": "Oceania",
	"enq": "Oceania",
	"fj": "Oceania",
	"gil": "Oceania",
	"hif": "Oceania",
	"ho": "Oceania",
	"hui": "Oceania",
	"hvn": "Oceania",
	"kos": "Oceania",
	"ksd": "Oceania",
	"meu": "Oceania",
	"mh": "Oceania",
	"mi": "Oceania",
	"mps": "Oceania",
	"pau": "Oceania",
	"pis": "Oceania",
	"pon": "Oceania",
	"rwo": "Oceania",
	"sm": "Oceania",
	"to": "Oceania",
	"tuc": "Oceania",
	"tvl": "Oceania",
	"twu": "Oceania",
	"ubu": "Oceania",
	"yap": "Oceania",
	# South Asia (Latin-script)
	"kha": "South Asia",
	"lus": "South Asia",
	"smt": "South Asia",
	"trp": "South Asia",
	# East Asia (Latin-script)
	"mn": "East Asia",
	"za": "East Asia",
	# Middle East & North Africa (Latin-script)
	"bgp": "Middle East & North Africa",
	"ku": "Middle East & North Africa",
	"tr": "Middle East & North Africa",
	"zza": "Middle East & North Africa",
	# Central Asia (Latin-script)
	"az": "Central Asia",
	"kaa-Latn": "Central Asia",
	"kk": "Central Asia",
	"ky": "Central Asia",
	"tg": "Central Asia",
	"tk": "Central Asia",
	"uz": "Central Asia",
	# Cyrillic-script languages pinned explicitly. Only "ba" and "sah" move away from
	# the Cyrl default of "Europe" in SCRIPT_TO_REGION; the other entries restate it.
	"ba": "Central Asia",
	"ce": "Europe",
	"cv": "Europe",
	"kv": "Europe",
	"os": "Europe",
	"sah": "East Asia",
	"tt": "Europe",
	# Special cases, all assigned to Americas: a creole grouped by convention, a
	# constructed language (creator's continent), and a non-linguistic placeholder.
	"crs": "Americas", # Seselwa Creole French (Indian Ocean) — grouped with Atlantic/Indian Ocean creoles
	"tlh": "Americas", # Klingon — constructed language
	"zxx": "Americas", # Noise/non-linguistic content — placeholder region
	}


	def fix_name_spacing(name: str) -> str:
	    """Insert spaces lost during PDF word extraction.

	    PDF word extraction sometimes concatenates adjacent words when they appear
	    close together on the page. A single space is inserted at each of these
	    boundaries:

	    * between a lowercase letter and a following uppercase letter,
	    * between a word character (letter, digit or underscore) and a following
	      opening parenthesis,
	    * between a period and a following uppercase letter.

	    Case is decided with ``str.islower`` / ``str.isupper`` so that accented and
	    other non-ASCII letters (e.g. "ā", "Ñ", "Š") are recognised, not only the
	    ASCII and Latin-1 ranges.

	    Args:
	        name: Language name as joined from the extracted PDF words.

	    Returns:
	        The name with the missing spaces restored. Text that already contains
	        the spaces, and the empty string, are returned unchanged.
	    """
	    pieces: list[str] = []
	    prev = ""  # empty string is neither lower, alphanumeric nor "." -> no space at start
	    for ch in name:
	        needs_space = (
	            (prev.islower() and ch.isupper())
	            or (ch == "(" and (prev.isalnum() or prev == "_"))
	            or (prev == "." and ch.isupper())
	        )
	        if needs_space:
	            pieces.append(" ")
	        pieces.append(ch)
	        prev = ch
	    return "".join(pieces)


	def parse_table9(pdf_path: str) -> list[dict[str, str]]:
	    """Extract the usable rows of Table 9 from the MADLAD-400 paper PDF.

	    pdfplumber's extract_table() does not work on this PDF because the table
	    lines are not detected as table borders. Words are therefore extracted
	    individually, bucketed into columns by their ``x0`` coordinate, and then
	    attached to a BCP-47 cell when their ``top`` coordinate lies within
	    ROW_TOLERANCE of it (which copes with multi-line name cells).

	    Args:
	        pdf_path: Path of the paper PDF to open with pdfplumber.

	    Returns:
	        One dict per accepted row with the keys "bcp47", "name" and "script",
	        in the order the codes are encountered. Rows are dropped when the code
	        cell is not a BCP-47 code (or is "total"/"median"), when the name or
	        script is missing, or when every data cell is "-".
	    """

	    def texts_in_row(cells, row_top):
	        # Texts of the cells on the same row as row_top, ordered left to right.
	        aligned = [w for w in cells if abs(w["top"] - row_top) <= ROW_TOLERANCE]
	        aligned.sort(key=lambda w: w["x0"])
	        return [w["text"] for w in aligned]

	    rows: list[dict[str, str]] = []

	    with pdfplumber.open(pdf_path) as pdf:
	        for page_index in TABLE_PAGES:
	            page_words = pdf.pages[page_index].extract_words()

	            # Only the last table page needs a vertical cutoff: below the table
	            # it continues with Section A.2 (a code block).
	            cutoff = float("inf")
	            if page_index == 21:
	                cutoff = PAGE_22_MAX_TOP
	            visible = [w for w in page_words if w["top"] <= cutoff]

	            # Bucket the remaining words by column.
	            code_cells = [w for w in visible if COL_BCP47_MIN <= w["x0"] < COL_BCP47_MAX]
	            name_cells = [w for w in visible if COL_NAME_MIN <= w["x0"] < COL_NAME_MAX]
	            script_cells = [w for w in visible if COL_SCRIPT_MIN <= w["x0"] < COL_SCRIPT_MAX]
	            stat_cells = [w for w in visible if w["x0"] >= COL_DATA_MIN]

	            for code_cell in code_cells:
	                code = code_cell["text"]

	                # Headers, prose, code and the summary rows are not languages.
	                if code in ("total", "median") or not BCP47_RE.match(code):
	                    continue

	                row_top = code_cell["top"]
	                name = fix_name_spacing(" ".join(texts_in_row(name_cells, row_top)))
	                script = " ".join(texts_in_row(script_cells, row_top))
	                stats = texts_in_row(stat_cells, row_top)

	                # A usable row needs both a name and a script.
	                if not (name and script):
	                    continue

	                # All-dash data columns mark self-audit omissions.
	                if stats and all(cell.strip() == "-" for cell in stats):
	                    continue

	                rows.append({"bcp47": code, "name": name, "script": script})

	    return rows


	def assign_region(bcp47: str, script: str) -> str:
	    """Return the geographic region for a language.

	    An exact match of ``bcp47`` in LANGUAGE_OVERRIDES wins; otherwise the
	    default for ``script`` in SCRIPT_TO_REGION is used; if neither mapping
	    has an entry the result is "Other".
	    """
	    region = LANGUAGE_OVERRIDES.get(bcp47)
	    if region is None:
	        region = SCRIPT_TO_REGION.get(script, "Other")
	    return region


	def write_mapping(entries: list[dict[str, str]], output_path: Path) -> None:
	    """Write the language mapping to a Python file.

	    Each entry becomes one ``"<2code>": {"name": ..., "region": ...}`` item of a
	    ``langid_to_language`` dict literal, sorted by region and then by name.

	    Args:
	        entries: Parsed rows; each must provide "bcp47", "name" and "script".
	        output_path: Destination file; it is overwritten if it exists.

	    The file is always written as UTF-8, the default source encoding Python
	    assumes when importing it, so non-ASCII language names do not depend on the
	    platform's locale encoding.
	    """
	    mapping: list[tuple[str, str, str]] = sorted(
	        (
	            (
	                f"<2{entry['bcp47']}>",
	                entry["name"],
	                assign_region(entry["bcp47"], entry["script"]),
	            )
	            for entry in entries
	        ),
	        # Sort by region, then by name within each region
	        key=lambda row: (row[2], row[1]),
	    )

	    lines = [
	        "# Auto-generated by scripts/generate_langmap.py from MADLAD-400 paper Table 9 (Section A.1)",
	        "# Source: https://arxiv.org/pdf/2309.04662",
	        f"# {len(mapping)} languages with training data (excludes 79 self-audit omissions)",
	        "#",
	        "# To regenerate:",
	        "# python scripts/generate_langmap.py <path-to-paper.pdf>",
	        "langid_to_language = {",
	    ]
	    for token, name, region in mapping:
	        # Escape backslashes first, then quotes, so the name is always a valid
	        # double-quoted Python string literal.
	        escaped_name = name.replace("\\", "\\\\").replace('"', '\\"')
	        lines.append(f' "{token}": {{"name": "{escaped_name}", "region": "{region}"}},')
	    lines.append("}")
	    lines.append("")

	    output_path.write_text("\n".join(lines), encoding="utf-8")


	def main() -> None:
	    """Command-line entry point: parse the PDF, report gaps, write the mapping.

	    Expects exactly one argument, the path to the paper PDF; otherwise prints
	    the usage line and exits with status 1. The mapping file is written to
	    ``langmap/langid_mapping.py`` two directories above this script even when
	    some languages fall into the "Other" region, but in that case the process
	    exits with status 1 after writing.
	    """
	    args = sys.argv[1:]
	    if len(args) != 1:
	        print("Usage: python scripts/generate_langmap.py <path-to-paper.pdf>")
	        sys.exit(1)

	    entries = parse_table9(args[0])
	    print(f"Parsed {len(entries)} languages from Table 9")

	    # Languages with neither an override nor a script default
	    unassigned: list[tuple[str, str]] = [
	        (entry["bcp47"], entry["name"])
	        for entry in entries
	        if assign_region(entry["bcp47"], entry["script"]) == "Other"
	    ]

	    if unassigned:
	        print(f"\n{len(unassigned)} languages assigned to 'Other' region (need manual override):")
	        for code, language in unassigned:
	            print(f" {code}: {language}")
	        print("\nAdd overrides to LANGUAGE_OVERRIDES in the script and re-run.")

	    repo_root = Path(__file__).parent.parent
	    output_path = repo_root / "langmap" / "langid_mapping.py"
	    write_mapping(entries, output_path)
	    print(f"\nWrote {len(entries)} entries to {output_path}")

	    # Signal the incomplete mapping to callers (e.g. CI) via the exit status.
	    if unassigned:
	        sys.exit(1)


	if __name__ == "__main__":
	    main()