#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
app.py
Gradio demo for Kabyle POS Tagger v2.

Pre-splits punctuation and hyphenated clitics, strips hyphens from clitics
before tokenization, then applies a post-processing lookup table to fix
remaining clitic misclassifications.
"""

import re

import gradio as gr
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

MODEL_NAME = "boffire/kabyle-pos-v2"

# Tokenizer and model are loaded once at import time and shared by every
# request; the model is switched to inference mode.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
model.eval()
id2label = model.config.id2label

# Universal Dependencies POS tagset with human-readable descriptions.
POS_DESCRIPTIONS = {
    "ADJ": "Adjective",
    "ADP": "Adposition",
    "ADV": "Adverb",
    "AUX": "Auxiliary verb",
    "CCONJ": "Coordinating conjunction",
    "DET": "Determiner",
    "INTJ": "Interjection",
    "NOUN": "Noun",
    "NUM": "Numeral",
    "PART": "Particle",
    "PRON": "Pronoun",
    "PROPN": "Proper noun",
    "PUNCT": "Punctuation",
    "SCONJ": "Subordinating conjunction",
    "SYM": "Symbol",
    "VERB": "Verb",
    "X": "Other",
}

# High-contrast background palette (dark backgrounds, light text).
POS_COLORS = {
    "NOUN": "#1565c0",   # Dark blue
    "PROPN": "#0d47a1",  # Darker blue
    "PRON": "#0277bd",   # Ocean blue
    "VERB": "#2e7d32",   # Forest green
    "AUX": "#1b5e20",    # Dark green
    "ADJ": "#ef6c00",    # Burnt orange
    "ADV": "#f9a825",    # Golden (dark text)
    "ADP": "#6a1b9a",    # Deep purple
    "PART": "#ad1457",   # Dark pink
    "DET": "#c62828",    # Dark red
    "NUM": "#00838f",    # Teal
    "CCONJ": "#00695c",  # Dark cyan
    "SCONJ": "#004d40",  # Darker cyan
    "INTJ": "#d84315",   # Deep orange
    "PUNCT": "#455a64",  # Blue grey
    "SYM": "#37474f",    # Darker blue grey
    "X": "#5d4037",      # Brown
}

# Foreground colors: white on every tag except ADV, whose golden background
# needs dark text to stay readable.
POS_TEXT_COLORS = {
    tag: "#000000" if tag == "ADV" else "#ffffff" for tag in POS_COLORS
}

# =============================================================================
# POST-PROCESSING: Clitic Lookup Table
# =============================================================================
# These morphemes are closed-class and their POS is deterministic.
# They are split into two tiers:
#   1. Unambiguous forms (override always): multi-letter clitics and the
#      copula/directional particle "d".
#   2. Hyphen-only forms (override only when hyphenated): short subject
#      affixes and preposition-like clitics where standalone usage differs
#      from affix usage (e.g., "i" = ADP preposition, "i-" = PRON subject).
# -----------------------------------------------------------------------------
CLITIC_POS = {
    # --- Unambiguous: override regardless of hyphenation ---
    "d": "PART",  # Directional / copula particle
    # Possessive plural
    **dict.fromkeys(
        (
            "nneɣ", "neɣ", "nteɣ", "nnteɣ", "wen", "nwen", "nkent", "tkent",
            "nsen", "tsen", "nsent", "tsent", "nnek", "nnem", "nnes",
        ),
        "PRON",
    ),
    # Accusative / dative
    **dict.fromkeys(
        (
            "iyi", "yi", "ayi", "kem", "akem", "tt", "itt", "aɣ", "yaɣ",
            "ken", "akent", "ten", "ak", "am", "as", "asen", "aneɣ", "anaɣ",
            "yanaɣ", "atneɣ", "atenteɣ", "awen", "atwen", "atkent", "atsen",
            "sen", "asent", "atsent",
        ),
        "PRON",
    ),
    # Demonstratives / determiners
    **dict.fromkeys(("agi", "a", "nni", "nniḍen", "niḍen"), "DET"),
}

# Short affixes that are ambiguous when standalone (e.g., "i" = preposition ADP,
# "i-" = subject pronoun PRON). Only override if the user wrote them hyphenated.
CLITIC_POS_HYPHEN_ONLY = {
    "ɣ": "PRON",  # 1st sg subject affix
    "t": "PRON",  # 2nd sg / 3rd fem sg subject affix
    "k": "PRON",  # 2nd masc sg
    "m": "PRON",  # 2nd fem sg
    "n": "PRON",  # 1st pl / 3rd masc pl subject affix
    "i": "PRON",  # 3rd masc sg subject affix (vs. standalone prep.)
    # possessive / dative
    "w": "PRON",
    "iw": "PRON",
    "inu": "PRON",
    "ik": "PRON",
    "im": "PRON",
    "is": "PRON",
    "kent": "PRON",
    "sen": "PRON",  # also listed in CLITIC_POS, which is consulted first
    "sent": "PRON",
}

# Token pattern, tried left to right at each position:
#   1. a word (letters/digits/underscore/apostrophes), optionally carrying the
#      leading hyphen of a clitic ("-nneɣ");
#   2. any single non-word, non-space character except apostrophes and the
#      ASCII hyphen (punctuation);
#   3. a lone ASCII hyphen, so that a standalone "-" is kept as its own token
#      instead of being silently dropped.
_TOKEN_RE = re.compile(r"-?[\w'’]+|[^\w\s'’-]|-")


def apply_clitic_override(results):
    """
    Post-process model predictions using the clitic lookup tables.

    Each entry of ``results`` is a dict with the keys ``"word"``,
    ``"entity_group"`` and ``"score"``. For a word whose hyphen-stripped form
    is a known closed-class morpheme, ``"entity_group"`` is replaced by the
    table value and ``"score"`` is raised to at least 0.99.

    Two tiers are consulted in order:
      1. ``CLITIC_POS``: applied whether or not the word is hyphenated.
      2. ``CLITIC_POS_HYPHEN_ONLY``: applied only when the word starts or
         ends with a hyphen (affix context).

    The dicts are modified in place and the same list is returned.
    """
    # NOTE(review): the lookup is case-sensitive, so a sentence-initial "D"
    # is left to the model — confirm this is intended.
    for token in results:
        word = token["word"]
        stripped = word.strip("-")

        # A token made only of hyphens has nothing to look up.
        if not stripped:
            continue

        if stripped in CLITIC_POS:
            forced_tag = CLITIC_POS[stripped]
        elif (
            (word.startswith("-") or word.endswith("-"))
            and stripped in CLITIC_POS_HYPHEN_ONLY
        ):
            forced_tag = CLITIC_POS_HYPHEN_ONLY[stripped]
        else:
            continue

        token["entity_group"] = forced_tag
        token["score"] = max(token["score"], 0.99)

    return results


def _model_form(tok):
    """Return the token as fed to the model: one edge hyphen removed."""
    if len(tok) > 1:
        if tok.startswith("-"):
            return tok[1:]   # "-nneɣ" -> "nneɣ"
        if tok.endswith("-"):
            # Kept for parity with the display/model split; _TOKEN_RE does
            # not currently produce trailing-hyphen tokens.
            return tok[:-1]  # "akent-" -> "akent"
    return tok


def _split_tokens(text):
    """Split text into (display_tokens, model_tokens), index-aligned."""
    display_tokens = _TOKEN_RE.findall(text.strip())
    model_tokens = [_model_form(tok) for tok in display_tokens]
    return display_tokens, model_tokens


def _predict_words(display_tokens, model_tokens):
    """Run the model and return one {"word", "entity_group", "score"} per word."""
    inputs = tokenizer(
        model_tokens,
        is_split_into_words=True,
        return_tensors="pt",
        return_offsets_mapping=False,
    )
    word_ids = inputs.word_ids(batch_index=0)

    with torch.no_grad():
        logits = model(**inputs).logits

    predictions = torch.argmax(logits, dim=-1)[0].tolist()
    scores = torch.softmax(logits, dim=-1)[0].max(dim=-1).values.tolist()

    # Group sub-word pieces by the index of the word they came from;
    # positions whose word id is None are skipped.
    grouped = {}
    for label_id, score, wid in zip(predictions, scores, word_ids):
        if wid is None:
            continue
        labels, piece_scores = grouped.setdefault(wid, ([], []))
        labels.append(id2label[label_id])
        piece_scores.append(score)

    results = []
    for wid in sorted(grouped):
        labels, piece_scores = grouped[wid]
        results.append({
            "word": display_tokens[wid],
            # Majority vote; on a tie the label seen first wins.
            "entity_group": max(labels, key=labels.count),
            "score": sum(piece_scores) / len(piece_scores),
        })
    return results


def _render_html(results):
    """Render tagged tokens as colored inline HTML badges."""
    # Function-scope import so this block does not depend on the file header.
    from html import escape

    # NOTE(review): the original tag markup was lost (the literals were
    # empty/unterminated); the structure below — container, one badge per
    # token holding the word and "LABEL (score)" — is a reconstruction that
    # uses the per-tag colors. Adjust styling as needed.
    parts = ['<div style="line-height: 2.6;">']
    for token in results:
        label = token["entity_group"]
        bg_color = POS_COLORS.get(label, "#333333")
        text_color = POS_TEXT_COLORS.get(label, "#ffffff")
        parts.append(
            '<span style="display: inline-block; margin: 2px 3px; '
            'padding: 2px 8px; border-radius: 6px; '
            f'background-color: {bg_color}; color: {text_color};">'
            # Tokens come from user input: escape so "<" or "&" stay text.
            f'<span style="font-weight: 600;">{escape(token["word"])}</span>'
            '<span style="margin-left: 6px; font-size: 0.75em;">'
            f'{escape(label)} ({token["score"]:.2f})'
            '</span>'
            '</span>'
        )
    parts.append('</div>')
    return "\n".join(parts)


def _render_table(results):
    """Render tagged tokens as a Markdown table."""
    from html import escape

    lines = [
        "| Token | POS Tag | Description | Confidence |",
        "|-------|---------|-------------|------------|",
    ]
    for token in results:
        label = token["entity_group"]
        desc = POS_DESCRIPTIONS.get(label, label)
        # A literal "|" token would otherwise be read as a column separator.
        cell = escape(token["word"]).replace("|", "\\|")
        lines.append(f"| {cell} | `{label}` | {desc} | {token['score']:.3f} |")
    return "\n".join(lines)


def tag_text(text):
    """
    Tag a Kabyle sentence and return ``(html, markdown_table)``.

    The text is split into words, hyphen-prefixed clitics and punctuation;
    the model predicts a tag per sub-word piece; pieces are merged per word
    by majority vote with the mean confidence; finally the clitic lookup
    tables override known closed-class morphemes.

    On empty input, when no tokens are found, or when any step raises, the
    first element is ``""`` and the second is a short message.
    """
    if not text or not text.strip():
        return "", "Please enter some Kabyle text."

    try:
        display_tokens, model_tokens = _split_tokens(text)
        if not display_tokens:
            return "", "No tokens found."
        results = apply_clitic_override(
            _predict_words(display_tokens, model_tokens)
        )
    except Exception as e:
        # UI boundary: report the failure in the output panel.
        return "", f"Error: {e}"

    if not results:
        return "", "No tokens found."

    return _render_html(results), _render_table(results)


examples = [
    "Aṭas n medden i yessen.",
    "Taqbaylit d tutlayt deg Lezzayer.",
    "Yella wuccen ameqqran deg taddart.",
    "Tameddakelt-nneɣ teɣra adlis-is.",
    "D nekkni i d-yusan d imezwura.",
]

demo = gr.Interface(
    fn=tag_text,
    inputs=gr.Textbox(
        label="Kabyle Text",
        placeholder="Enter a sentence in Kabyle (e.g., Aṭas n medden i yessen.)",
        lines=2,
    ),
    outputs=[
        gr.HTML(label="Tagged Visualization"),
        gr.Markdown(label="Results Table"),
    ],
    title="Kabyle POS Tagger v2",
    description="""

Kabyle Part-of-Speech Tagger

Enter a sentence in Kabyle (Berber language) to see POS tags predicted by boffire/kabyle-pos-v2 (XLM-RoBERTa-base, Test F1: 93.8%).

Tags follow the Universal Dependencies POS tagset.

""",
    examples=examples,
)

if __name__ == "__main__":
    demo.launch()