Spaces:

ljvmiranda921
/

multilinguality-at-the-edge

Running

App Files Files Community

multilinguality-at-the-edge / index.html

ljvmiranda921

Sync from docs/ (2026-06-01T12:13Z)

a74c6b2 23 days ago

Raw

History Blame Contribute Delete

15.5 kB

	<!doctype html>
	<html lang="en">
	<head>
	<meta charset="utf-8">
	<!-- Core metadata: responsive viewport, page title, description, crawler directives, and the canonical URL. -->
	<meta name="viewport" content="width=device-width, initial-scale=1">
	<title>Multilinguality at the Edge: Developing Language Models for the Global South</title>
	<meta name="description" content="An interactive companion to our survey of multilingual edge language models: challenges, analyses, and recommendations for the Global South.">
	<meta name="robots" content="index, follow, max-image-preview:large">
	<link rel="canonical" href="https://ljvmiranda921.github.io/multilinguality-at-the-edge/">

	<!-- Open Graph metadata for link previews; the preview image is declared as 1200x630. -->
	<meta property="og:type" content="website">
	<meta property="og:title" content="Multilinguality at the Edge">
	<meta property="og:description" content="Interactive analyses and findings on multilingual edge LMs for the Global South.">
	<meta property="og:url" content="https://ljvmiranda921.github.io/multilinguality-at-the-edge/">
	<meta property="og:image" content="https://ljvmiranda921.github.io/multilinguality-at-the-edge/assets/social-preview.png">
	<meta property="og:image:width" content="1200">
	<meta property="og:image:height" content="630">

	<!-- Twitter card metadata; reuses the same title, description, and image URL as the Open Graph tags above. -->
	<meta name="twitter:card" content="summary_large_image">
	<meta name="twitter:title" content="Multilinguality at the Edge">
	<meta name="twitter:description" content="Interactive analyses and findings on multilingual edge LMs for the Global South.">
	<meta name="twitter:image" content="https://ljvmiranda921.github.io/multilinguality-at-the-edge/assets/social-preview.png">

	<!-- schema.org structured data (JSON-LD) describing the page as a ScholarlyArticle; sameAs points at the arXiv abstract page. -->
	<script type="application/ld+json">
	{
	"@context": "https://schema.org",
	"@type": "ScholarlyArticle",
	"headline": "Multilinguality at the Edge: Developing Language Models for the Global South",
	"author": [
	{ "@type": "Person", "name": "Lester James V. Miranda" },
	{ "@type": "Person", "name": "Songbo Hu" },
	{ "@type": "Person", "name": "Roi Reichart" },
	{ "@type": "Person", "name": "Anna Korhonen" }
	],
	"url": "https://ljvmiranda921.github.io/multilinguality-at-the-edge/",
	"sameAs": "https://arxiv.org/abs/2604.21637"
	}
	</script>

	<!-- Favicons and stylesheets use relative URLs, so they resolve against whatever location serves this page (unlike the absolute canonical and preview URLs above). -->
	<link rel="icon" href="favicon.ico" sizes="any">
	<link rel="icon" type="image/png" sizes="32x32" href="assets/favicon-32.png">
	<link rel="icon" type="image/png" sizes="16x16" href="assets/favicon-16.png">
	<link rel="apple-touch-icon" href="assets/favicon.png">
	<link rel="stylesheet" href="css/fonts.css">
	<link rel="stylesheet" href="css/style.css">
	</head>
	<body>
	<div class="page-frame">

	<!-- Affiliation logos. role="group" gives the container a role on which aria-label is permitted; a bare <div> has the generic role, where the label may be ignored by assistive technology. -->
	<div class="letterhead" role="group" aria-label="Affiliations">
	<img src="assets/ucam-logo-colour-preferred.png" alt="University of Cambridge">
	<img src="assets/ltl_logo2.svg" alt="Language Technology Lab">
	</div>

	<header class="site-header">
	<h1>Multilinguality at the <span class="title-edge">Edge</span></h1>
	<p class="subtitle">Developing Language Models for the Global South</p>
	<p class="authors"><a href="https://ljvmiranda921.github.io">Lester James V. Miranda</a><a class="email-link" href="mailto:ljvm2@cam.ac.uk" aria-label="Email Lester James V. Miranda"><svg width="13" height="13" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true"><rect x="3" y="5" width="18" height="14" rx="1"/><path d="m3 7 9 6 9-6"/></svg></a>, <a href="https://songbohu.github.io/">Songbo Hu</a>, <a href="https://roireichart.com/">Roi Reichart</a>, <a href="https://sites.google.com/site/annakorhonen/">Anna Korhonen</a></p>
	<nav class="top-links">
	<a href="https://github.com/ljvmiranda921/multilinguality-at-the-edge" target="_blank" rel="noopener noreferrer">code</a> · <a href="https://arxiv.org/abs/2604.21637" target="_blank" rel="noopener noreferrer">read the paper</a>
	</nav>
	</header>

	<section id="intro">
	<p class="callout">
	<strong>The Problem:</strong>
	Communities with the greatest linguistic diversity often face severe infrastructure constraints.
	</p>

	<figure class="figure-row">
	<div id="fig-1" class="figure-mount">
	<p class="placeholder">[ figure loads here ]</p>
	</div>
	<figcaption class="figure-caption">
	<strong>Figure 1. Countries with high linguistic diversity have the most limited network connectivity (upper left).</strong> Internet penetration is sourced from <a href="https://www.itu.int/en/ITU-D/Statistics/Pages/facts/default.aspx">ITU (2025)</a>, number of living languages (log-scale) from the <a href="https://ourworldindata.org/grapher/living-languages">Ethnologue (SIL International, 2025)</a>, and income groups from the <a href="https://ourworldindata.org/grapher/world-bank-income-groups">World Bank (2025)</a>. Hover a point for the country.
	</figcaption>
	</figure>

	<p>
	The field has several names for this: the <em>low-resource double bind</em> <a class="cite" href="https://aclanthology.org/2021.findings-emnlp.282/">(Ahia et al., 2021)</a>, the <em>square-one bias</em> <a class="cite" href="https://aclanthology.org/2022.findings-acl.184/">(Ruder et al., 2022)</a>, <em>Zeno's paradox of language technology</em> <a class="cite" href="https://aclanthology.org/2024.emnlp-main.983/">(Nigatu et al., 2024)</a>, among others.
	</p>

	<p class="callout-question">
	<strong>The Challenge:</strong>
	How can we develop language models that are both multilingual and deployable on-device?
	</p>

	<p class="callout">
	<strong>Our Approach:</strong>
	To understand the state of the art and the challenges of combining the two areas, we survey 232 papers that tackle this problem across the language modelling pipeline.
	</p>

	<div class="figure-pair">
	<figure>
	<div id="fig-2a" class="figure-mount" data-svg="assets/figures/language_coverage.svg">
	<p class="placeholder">[ figure loads here ]</p>
	</div>
	<figcaption><strong>Figure 2a. Reported language coverage of edge LM papers.</strong> We show 78 papers (of 232) that report a concrete number of evaluated languages and bin them into four brackets: monolingual (1), few (2–10), many (11–50), and massive (50+), categorized by research focus.</figcaption>
	</figure>
	<figure>
	<div id="fig-2b" class="figure-mount">
	<p class="placeholder">[ figure loads here ]</p>
	</div>
	<figcaption><strong>Figure 2b. Model sizes (in billion parameters) of various language models.</strong> For each model family in our curated set of released models, we recorded publicly documented parameter counts and plotted the range of available sizes on a log scale.</figcaption>
	</figure>
	</div>
	</section>

	<section id="pipeline-hero">
	<p>
	Deploying on the edge and supporting multilinguality often impose competing requirements, which create challenges across the language modelling pipeline.
	<em>Click</em> on each pipeline stage (or requirement) to read about the challenges and the state of the art.
	</p>
	<div id="pipeline-figure">
	<p class="placeholder">[ pipeline diagram loads here ]</p>
	</div>
	<aside id="pipeline-detail" class="pixel-frame">
	<p class="placeholder">[ click a stage above to read ]</p>
	</aside>
	</section>

	<section id="analysis">
	<h2>Analysis</h2>

	<p>
	We also looked into <strong>edge LM systems</strong>, which we define as completed efforts that have been integrated into real-world applications. To identify them, we manually classified each of the 232 papers on whether an actual model deployment took place, obtaining 36 systems in the process.
	</p>

	<!-- Tabbed analysis views. Each tab names its panel via aria-controls and each panel names its tab via aria-labelledby, so assistive technology can associate the two; the data-tab / data-panel hooks are left unchanged. -->
	<div class="tabs" role="tablist" aria-label="Analysis subsections">
	<button type="button" class="tab is-active" id="analysis-tab-how" role="tab" data-tab="how" aria-selected="true" aria-controls="analysis-panel-how">How are edge LMs developed?</button>
	<button type="button" class="tab" id="analysis-tab-who" role="tab" data-tab="who" aria-selected="false" aria-controls="analysis-panel-who">Who develops edge LMs?</button>
	<button type="button" class="tab" id="analysis-tab-where" role="tab" data-tab="where" aria-selected="false" aria-controls="analysis-panel-where">Which domains are edge LMs deployed to?</button>
	</div>

	<div class="tab-panels">
	<div class="tab-panel is-active" id="analysis-panel-how" role="tabpanel" data-panel="how" aria-labelledby="analysis-tab-how">
	<p>
	To examine how edge LM systems are made, we situate the 36 deployment papers within the broader 232 surveyed papers. We embed each abstract with MiniLM, reduce to 2D with UMAP, and cluster with HDBSCAN; KeyBERT extracts the top keywords per cluster. Hover any cluster to see representative papers.
	</p>
	<figure class="analysis-figure">
	<div id="fig-how" class="figure-mount">
	<p class="placeholder">[ chart loads here ]</p>
	</div>
	<figcaption><strong>Figure 3. Clustering of the 232 surveyed papers by abstract similarity.</strong> Real-world deployments (&starf;) tend to concentrate near a few clusters such as <em>model compression</em> and <em>dialog datasets</em>, while clusters like <em>reasoning performance</em> or <em>prompt compression</em> have little to no representation, suggesting that edge LM deployments favor a relatively narrow set of methods.</figcaption>
	</figure>
	</div>

	<div class="tab-panel" id="analysis-panel-who" role="tabpanel" data-panel="who" aria-labelledby="analysis-tab-who" hidden>
	<p>
	We classified the affiliations of authors across the 36 deployment papers into four sectors: <strong>Academia</strong> (universities and affiliated research institutions), <strong>Industry</strong> (startups to enterprise), <strong>Research collective</strong> (non-profit research organizations), and <strong>Government</strong> (state-affiliated institutes, public sector). Authors with multiple affiliations are counted in each. Cross-sector collaborations are measured by how often each pair of sectors co-occurs within the same paper.
	</p>
	<figure class="analysis-figure">
	<div id="fig-who" class="figure-mount" data-svg="assets/figures/collaboration_sectors.svg">
	<p class="placeholder">[ chart loads here ]</p>
	</div>
	<figcaption><strong>Figure 4. Affiliation type of authors from papers that deployed edge LM systems.</strong> Numbers on each arc show the total count of papers contributing from that sector. Academia has the largest proportion of collaborations, while government participation remains limited and is mostly driven by cross-sector ties with academia.</figcaption>
	</figure>
	</div>

	<div class="tab-panel" id="analysis-panel-where" role="tabpanel" data-panel="where" aria-labelledby="analysis-tab-where" hidden>
	<p>
	In order to map the domains in which an edge LM is deployed, we perform a round of classification by tagging each paper according to their <strong>domain</strong>: Agriculture, Climate, Finance, Healthcare, Legal, Social, and Speech.
	Then, we extract mentions of different methods by keyword matching via KeyBERT, and visualize the domain-method connections as a network graph. Click on any outer domain node to see representative papers for that domain.
	</p>
	<figure class="analysis-figure">
	<div id="fig-where" class="figure-mount">
	<p class="placeholder">[ chart loads here ]</p>
	</div>
	<figcaption><strong>Figure 5. Edge LM real-world deployment domains network.</strong> Central nodes represent methods used to develop and deploy real-world edge LMs. Edge color indicates connectivity, while darker nodes indicate high sharing among domains.</figcaption>
	</figure>
	<aside id="fig-where-detail" class="pixel-frame where-detail">
	<p class="placeholder">[ click a domain to see representative deployment papers ]</p>
	</aside>
	</div>
	</div>
	</section>

	<section id="recommendations">
	<h2>Recommendations</h2>
	<div class="pixel-frame">
	<ol class="recs-list">
	<li><strong>For NLP researchers and model developers:</strong> Evaluate edge models beyond memory (e.g., compute and energy), and explore underrepresented methods since current deployments cluster around a relatively narrow toolkit.</li>
	<li><strong>For deployment practitioners and communities at the edge:</strong> Build cross-sector collaborations (academia, industry, research collectives, government), and involve local communities as active collaborators in development and deployment.</li>
	<li><strong>For policymakers and funders:</strong> Invest not only in model development but also in infrastructure and devices that make deployment feasible in linguistically diverse, lower-resource settings; increase public-sector participation in edge LM efforts.</li>
	</ol>
	</div>
	</section>

	<section id="citation">
	<h2>Citation</h2>
	<div class="bibtex-wrap">
	<pre class="pixel-frame bibtex"><code id="bibtex-code">@misc{miranda2026multilingualityedgedevelopinglanguage,
	title={{M}ultilinguality at the {E}dge: {D}eveloping {L}anguage {M}odels for the {G}lobal {S}outh},
	author={Lester James Validad Miranda and Songbo Hu and Roi Reichart and Anna Korhonen},
	year={2026},
	eprint={2604.21637},
	archivePrefix={arXiv},
	primaryClass={cs.CL},
	url={https://arxiv.org/abs/2604.21637},
	}
	</code></pre>
	<button type="button" class="copy-btn" data-copy-target="#bibtex-code" aria-label="Copy citation to clipboard">Copy</button>
	</div>
	</section>

	<section id="discussion">
	<!-- The inline script at the end of <body> looks this section up by its id and removes it on hf.space hosts; on other hosts it appends the giscus client script instead. -->
	<h2>Discussion</h2>
	<p>Have feedback, questions, or ideas? Join the conversation below.</p>
	<div class="card">
	<!-- Empty placeholder carrying the "giscus" class. NOTE(review): presumably the giscus client renders the comment thread into this element; confirm against the giscus documentation. -->
	<div class="giscus"></div>
	</div>
	</section>

	<!-- Page scripts. The classic scripts below have neither async nor defer, so they are fetched and executed in document order. -->
	<script src="js/pipeline.js"></script>
	<script src="js/main.js"></script>
	<!-- type="module" scripts are deferred by default, so this one runs only after the document has been parsed. -->
	<script type="module" src="js/pipeline-mesh.js"></script>
	<!-- Plotly is loaded from a third-party CDN, pinned to version 2.35.2. NOTE(review): no integrity (SRI) attribute is set; consider adding one together with crossorigin="anonymous". -->
	<script src="https://cdn.plot.ly/plotly-basic-2.35.2.min.js" charset="utf-8"></script>
	<!-- Per-figure scripts. NOTE(review): presumably these depend on the Plotly script loaded above and fill the .figure-mount containers; confirm in js/figures/. -->
	<script src="js/figures/fig-infra-lingdiv.js"></script>
	<script src="js/figures/fig-loader.js"></script>
	<script src="js/figures/fig-model-sizes.js"></script>
	<script src="js/figures/fig-literature-clusters.js"></script>
	<script src="js/figures/fig-domain-network.js"></script>
	<script>
	// Comments are GitHub-backed (giscus). On Hugging Face Spaces (hf.space and
	// its subdomains) the giscus pathname mapping and theme URL don't make sense,
	// so hide the Discussion section entirely there. Everywhere else, load giscus
	// normally.
	(function () {
	  // Drop a trailing dot so a fully-qualified host ("foo.hf.space.") is
	  // treated the same as "foo.hf.space".
	  var host = location.hostname.replace(/\.$/, '');
	  // Match the apex domain or a dot-delimited subdomain only. A bare
	  // endsWith('hf.space') would also match unrelated hosts such as
	  // "examplehf.space".
	  if (host === 'hf.space' || host.endsWith('.hf.space')) {
	    var section = document.getElementById('discussion');
	    if (section) section.remove();
	    return;
	  }
	  // Build the giscus client <script> and copy its configuration onto it as
	  // data-* attributes.
	  var s = document.createElement('script');
	  s.src = 'https://giscus.app/client.js';
	  var attrs = {
	    'data-repo': 'ljvmiranda921/multilinguality-at-the-edge',
	    'data-repo-id': 'R_kgDORoQj6A',
	    'data-category': 'General',
	    'data-category-id': 'DIC_kwDORoQj6M4C7rwu',
	    'data-mapping': 'pathname',
	    'data-strict': '0',
	    'data-reactions-enabled': '0',
	    'data-emit-metadata': '0',
	    'data-input-position': 'top',
	    'data-theme': 'https://ljvmiranda921.github.io/multilinguality-at-the-edge/assets/giscus-theme.css',
	    'data-lang': 'en'
	  };
	  Object.keys(attrs).forEach(function (k) { s.setAttribute(k, attrs[k]); });
	  s.crossOrigin = 'anonymous';
	  s.async = true;
	  document.body.appendChild(s);
	})();
	</script>
	</div>
	</body>
	</html>