Initial commit

Browse files

Added model, script, vocab, readme

Files changed (4) hide show

best_model.pt +3 -0
model_def_multitask.py +147 -0
readme.md +152 -0
vocab.json +1 -0

best_model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eb33ee6c7f76ed135c8e91832819010e8660fc829d3c7924308a32767354413b
+size 50138134

model_def_multitask.py ADDED Viewed

	@@ -0,0 +1,147 @@

+"""
+facs2dipl / dipl2norm — Multi-task Char-level Transformer
+Model definition, encode/decode helpers, and greedy inference.
+Save this file to: /content/drive/MyDrive/facs2dipl_multitask/model_def.py
+Then in any notebook run: %run /content/drive/MyDrive/facs2dipl_multitask/model_def.py
+"""
+import math
+import torch
+import torch.nn as nn
+# ── Special token indices (must match training) ────────────────────────────
+PAD, SOS, EOS, UNK = 0, 1, 2, 3
+# Task prefix token indices
+DIPL_IDX = 4   # <DIPL> — facs→dipl task
+NORM_IDX = 5   # <NORM> — dipl→norm task
+# ── Model ──────────────────────────────────────────────────────────────────
+class PositionalEncoding(nn.Module):
+    def __init__(self, d_model, max_len, dropout):
+        super().__init__()
+        self.dropout = nn.Dropout(dropout)
+        pe  = torch.zeros(max_len, d_model)
+        pos = torch.arange(max_len).unsqueeze(1)
+        div = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
+        pe[:, 0::2] = torch.sin(pos * div)
+        pe[:, 1::2] = torch.cos(pos * div)
+        self.register_buffer('pe', pe.unsqueeze(0))
+    def forward(self, x):
+        return self.dropout(x + self.pe[:, :x.size(1)])
+class CharSeq2Seq(nn.Module):
+    def __init__(self, vocab_size, d_model, n_heads, n_enc, n_dec, d_ff, max_len, dropout):
+        super().__init__()
+        self.d_model = d_model
+        self.embed   = nn.Embedding(vocab_size, d_model, padding_idx=PAD)
+        self.pos_enc = PositionalEncoding(d_model, max_len, dropout)
+        enc_layer    = nn.TransformerEncoderLayer(d_model, n_heads, d_ff, dropout, batch_first=True, norm_first=True)
+        dec_layer    = nn.TransformerDecoderLayer(d_model, n_heads, d_ff, dropout, batch_first=True, norm_first=True)
+        self.encoder = nn.TransformerEncoder(enc_layer, n_enc, norm=nn.LayerNorm(d_model))
+        self.decoder = nn.TransformerDecoder(dec_layer, n_dec, norm=nn.LayerNorm(d_model))
+        self.proj    = nn.Linear(d_model, vocab_size)
+        self._init_weights()
+    def _init_weights(self):
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+    def encode(self, src, src_key_padding_mask):
+        x = self.pos_enc(self.embed(src) * math.sqrt(self.d_model))
+        return self.encoder(x, src_key_padding_mask=src_key_padding_mask)
+    def decode(self, tgt, memory, tgt_mask, tgt_key_padding_mask, memory_key_padding_mask):
+        x = self.pos_enc(self.embed(tgt) * math.sqrt(self.d_model))
+        return self.decoder(
+            x, memory,
+            tgt_mask=tgt_mask,
+            tgt_key_padding_mask=tgt_key_padding_mask,
+            memory_key_padding_mask=memory_key_padding_mask,
+        )
+    def forward(self, src, tgt):
+        src_pad_mask = (src == PAD)
+        tgt_pad_mask = (tgt == PAD)
+        T            = tgt.size(1)
+        tgt_mask     = nn.Transformer.generate_square_subsequent_mask(T, device=src.device)
+        memory       = self.encode(src, src_pad_mask)
+        out          = self.decode(tgt, memory, tgt_mask, tgt_pad_mask, src_pad_mask)
+        return self.proj(out)
+# ── Helpers ────────────────────────────────────────────────────────────────
+def encode_text(text, task_idx, c2i, max_len):
+    """
+    Encode source text with a task prefix token.
+    Layout: [task_token, char, char, ..., EOS, PAD, ...]
+    """
+    ids  = [task_idx] + [c2i.get(c, UNK) for c in text]
+    ids  = ids[:max_len - 1] + [EOS]
+    ids += [PAD] * (max_len - len(ids))
+    return ids
+def encode_target(text, c2i, max_len):
+    """Encode target: [SOS, char, char, ..., EOS, PAD, ...]"""
+    ids  = [SOS] + [c2i.get(c, UNK) for c in text]
+    ids  = ids[:max_len - 1] + [EOS]
+    ids += [PAD] * (max_len - len(ids))
+    return ids
+def decode_ids(ids, i2c):
+    """Decode token ids to string, stopping at EOS, skipping special tokens."""
+    chars = []
+    skip  = {PAD, SOS, EOS, UNK, DIPL_IDX, NORM_IDX}
+    for i in ids:
+        if i == EOS:
+            break
+        if i not in skip:
+            chars.append(i2c.get(i, ''))
+    return ''.join(chars)
+@torch.no_grad()
+def greedy_decode(model, src, max_len, device, i2c):
+    """
+    Greedy decode a batch. Task is implicit in the src prefix token.
+    Args:
+        model   : CharSeq2Seq in eval mode
+        src     : LongTensor (B, S) — first token is the task prefix
+        max_len : int
+        device  : str or torch.device
+        i2c     : dict[int, str]
+    Returns:
+        list[str] of length B
+    """
+    model.eval()
+    src          = src.to(device)
+    src_pad_mask = (src == PAD)
+    memory       = model.encode(src, src_pad_mask)
+    B    = src.size(0)
+    ys   = torch.full((B, 1), SOS, dtype=torch.long, device=device)
+    done = torch.zeros(B, dtype=torch.bool, device=device)
+    for _ in range(max_len - 1):
+        T        = ys.size(1)
+        tgt_mask = nn.Transformer.generate_square_subsequent_mask(T, device=device)
+        tgt_pad  = (ys == PAD)
+        out      = model.decode(ys, memory, tgt_mask, tgt_pad, src_pad_mask)
+        next_tok = model.proj(out[:, -1]).argmax(-1)
+        next_tok = next_tok.masked_fill(done, PAD)
+        ys       = torch.cat([ys, next_tok.unsqueeze(1)], dim=1)
+        done     = done | (next_tok == EOS)
+        if done.all():
+            break
+    return [decode_ids(row.tolist(), i2c) for row in ys]

readme.md ADDED Viewed

	@@ -0,0 +1,152 @@

+---
+language:
+- non
+tags:
+- text-normalization
+- historical-text
+- old-icelandic
+- seq2seq
+- character-level
+- multi-task
+- medieval
+license: mit
+datasets:
+- custom
+metrics:
+- cer
+- wer
+---
+# Old Icelandic facs2dipl2norm
+This repository contains a character-level transformer model for Old Icelandic manuscript normalisation tasks, specifically facsimile transcription to diplomatic transcription (facs → dipl) and diplomatic transcription to normalised form (dipl → norm).
+The model was trained on all the available MENOTA texts edited by Andrea de Leeuw van Weenen (AM 132 fol., AM 519 a 4to, and AM 677 4to). This is around 75% of all the currently available MENOTA texts, which are normalised, lemmatised, and (at least partially) POS-tagged.
+The model supports two Old Icelandic manuscript normalisation tasks:
+- **facs → dipl**: facsimile transcription → diplomatic transcription (abbreviation expansion, character normalisation)
+- **dipl → norm**: diplomatic transcription → normalised form (orthographic regularisation)
+Task routing is controlled by a prefix token prepended to the source sequence — no architectural changes were necessary between tasks.
+## Model Details
+| Property | Value |
+|---|---|
+| Architecture | Transformer encoder-decoder |
+| Parameters | ~10M |
+| Vocabulary | ~120 characters (data-derived) |
+| Max sequence length | 128 characters |
+| Model dimension | 256 |
+| Attention heads | 4 |
+| Encoder / decoder layers | 3 / 3 |
+| Feed-forward dim | 512 |
+| Task tokens | `<DIPL>` (facs→dipl), `<NORM>` (dipl→norm) |
+| Training data | ~36k line-level triples |
+| Language | Old Icelandic (`non`) |
+## Training Data
+- Corpus size: 36240 text chunks of differing lengths, containing around 400k word tokens.
+- Training-validation-test split: 80-10-10.
+- Sources: <a href="https://clarino.uib.no/menota/catalogue/menota">AM 132 fol., AM 519 a 4to, and AM 677 4to</a>, edited and annotated by Andrea de Leeuw van Weenen.
+## Training
+TODO
+## Performance
+| Task | CER | WER |
+|---|---|---|
+| facs → dipl | 0.0112 | 0.0270 |
+| dipl → norm | 0.0350 | 0.1370 |
+## Intended Use
+This model is intended for researchers and digital humanists working with Old Icelandic manuscript material who need to automate or assist with the production of diplomatic and normalised transcriptions from facsimile-level texts (e.g., HTR output from models such as OICEN-HTR).
+## Usage
+Try it out in <a href="https://colab.research.google.com/drive/13Rq2FZomqRjdG5DyHMNcuWSmv0rbq3qR?usp=sharing">Google Colab</a>!
+```python
+import json, torch
+from model_def_multitask import CharSeq2Seq, encode_text, decode_ids, greedy_decode, DIPL_IDX, NORM_IDX
+# Load vocab
+with open("vocab.json", encoding="utf-8") as f:
+    v = json.load(f)
+c2i = v["c2i"]
+i2c = {int(k): val for k, val in v["i2c"].items()}
+# Load model
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ckpt   = torch.load("best_model.pt", map_location=DEVICE)
+hp     = ckpt["hparams"]
+model = CharSeq2Seq(
+    vocab_size = hp["VOCAB_SIZE"],
+    d_model    = hp["D_MODEL"],
+    n_heads    = hp["N_HEADS"],
+    n_enc      = hp["N_ENC"],
+    n_dec      = hp["N_DEC"],
+    d_ff       = hp["D_FF"],
+    max_len    = hp["MAX_LEN"],
+    dropout    = hp["DROPOUT"],
+).to(DEVICE)
+model.load_state_dict(ckpt["model"])
+model.eval()
+```
+### facs → dipl
+```python
+MAX_LEN = hp["MAX_LEN"]
+def predict_dipl(texts):
+    if isinstance(texts, str):
+        texts = [texts]
+    src = torch.tensor(
+        [encode_text(t, DIPL_IDX, c2i, MAX_LEN) for t in texts],
+        dtype=torch.long
+    )
+    return greedy_decode(model, src, MAX_LEN, DEVICE, i2c)
+predict_dipl("koma egƚ. kappı þınu ⁊ ꝺırꝼð . en ſkaplynꝺı") # random line from test set
+# → "koma eg(il)l kappi þinu (ok) dirfð . en ſkaplyndi"
+```
+### dipl → norm
+```python
+def predict_norm(texts):
+    if isinstance(texts, str):
+        texts = [texts]
+    src = torch.tensor(
+        [encode_text(t, NORM_IDX, c2i, MAX_LEN) for t in texts],
+        dtype=torch.long
+    )
+    return greedy_decode(model, src, MAX_LEN, DEVICE, i2c)
+predict_norm("TODO")
+# → TODO
+```
+### Full pipeline: facs → dipl → norm
+```python
+def predict_pipeline(texts):
+    if isinstance(texts, str):
+        texts = [texts]
+    dipl = predict_dipl(texts)
+    norm = predict_norm(dipl)
+    return list(zip(dipl, norm))
+predict_pipeline("TODO")
+# TODO
+```

vocab.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"c2i": {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3, "<DIPL>": 4, "<NORM>": 5, "\n": 6, " ": 7, "!": 8, "\"": 9, "#": 10, "&": 11, "'": 12, "(": 13, ")": 14, "*": 15, "+": 16, ",": 17, "-": 18, ".": 19, "0": 20, "1": 21, "2": 22, "3": 23, "4": 24, "5": 25, "6": 26, "7": 27, "8": 28, "9": 29, ":": 30, ";": 31, "=": 32, ">": 33, "?": 34, "A": 35, "B": 36, "C": 37, "D": 38, "E": 39, "F": 40, "G": 41, "H": 42, "I": 43, "J": 44, "K": 45, "L": 46, "M": 47, "N": 48, "O": 49, "P": 50, "Q": 51, "R": 52, "S": 53, "T": 54, "U": 55, "V": 56, "X": 57, "Y": 58, "Z": 59, "[": 60, "]": 61, "a": 62, "b": 63, "c": 64, "d": 65, "e": 66, "f": 67, "g": 68, "h": 69, "i": 70, "j": 71, "k": 72, "l": 73, "m": 74, "n": 75, "o": 76, "p": 77, "q": 78, "r": 79, "s": 80, "t": 81, "u": 82, "v": 83, "x": 84, "y": 85, "z": 86, "{": 87, "}": 88, "¯": 89, "Á": 90, "Æ": 91, "Í": 92, "Ó": 93, "Ø": 94, "Ú": 95, "Ý": 96, "Þ": 97, "á": 98, "æ": 99, "é": 100, "í": 101, "ð": 102, "ó": 103, "ø": 104, "ú": 105, "ý": 106, "þ": 107, "ā": 108, "ē": 109, "Ę": 110, "ę": 111, "ġ": 112, "ħ": 113, "ī": 114, "ı": 115, "ń": 116, "ŋ": 117, "ō": 118, "ŕ": 119, "ū": 120, "ſ": 121, "ƀ": 122, "ƚ": 123, "ƞ": 124, "ǣ": 125, "Ǫ": 126, "ǫ": 127, "ǭ": 128, "ǵ": 129, "Ǽ": 130, "ǽ": 131, "Ǿ": 132, "ǿ": 133, "ȳ": 134, "ȷ": 135, "ɢ": 136, "ɪ": 137, "ɴ": 138, "ʀ": 139, "ʙ": 140, "ʜ": 141, "ʟ": 142, "́": 143, "̄": 144, "̅": 145, "̇": 146, "̲": 147, "͛": 148, "ͣ": 149, "ͤ": 150, "ͥ": 151, "ͦ": 152, "ͧ": 153, "ͨ": 154, "ͫ": 155, "ͬ": 156, "ͭ": 157, "ͮ": 158, "ᛘ": 159, "ᴀ": 160, "ᴇ": 161, "ᴍ": 162, "᷑": 163, "᷒": 164, "ᷓ": 165, "ᷔ": 166, "ᷖ": 167, "ᷙ": 168, "ᷜ": 169, "ᷝ": 170, "ᷟ": 171, "ᷠ": 172, "ᷡ": 173, "ᷢ": 174, "ᷦ": 175, "ḗ": 176, "ḿ": 177, "ṁ": 178, "ṅ": 179, "ṓ": 180, "ṗ": 181, "Ṙ": 182, "ṙ": 183, "—": 184, "⁊": 185, "Ↄ": 186, "ↄ": 187, "⋅": 188, "✝": 189, "⟨": 190, "⟩": 191, "⸌": 192, "⸍": 193, "⸫": 194, "ꜳ": 195, "ꜵ": 196, "ꜷ": 197, "ꜹ": 198, "ꝁ": 199, "ꝇ": 200, "ꝉ": 201, "ꝑ": 202, "ꝓ": 203, "ꝗ": 204, "ꝛ": 205, "ꝝ": 206, 
"Ꝥ": 207, "ꝥ": 208, "ꝧ": 209, "ꝩ": 210, "ꝺ": 211, "Ꝼ": 212, "ꝼ": 213, "Ꞇ": 214, "ꞇ": 215, "": 216, "": 217, "": 218, "": 219, "": 220, "": 221, "": 222, "": 223, "": 224, "": 225, "": 226, "": 227, "": 228, "": 229, "": 230, "": 231, "": 232, "": 233, "": 234, "": 235, "": 236, "": 237, "": 238, "": 239, "": 240, "": 241, "": 242, "": 243, "": 244, "": 245, "": 246, "": 247, "": 248, "": 249, "": 250, "": 251, "": 252, "": 253, "": 254, "": 255, "": 256, "": 257, "": 258, "": 259, "": 260, "": 261, "": 262, "": 263, "": 264, "": 265, "": 266, "": 267, "": 268, "": 269, "": 270, "": 271, "": 272, "": 273, "": 274, "": 275, "": 276, "": 277, "": 278, "": 279, "": 280, "": 281, "": 282, "": 283, "": 284, "": 285, "": 286, "": 287, "": 288, "": 289, "": 290, "": 291, "": 292, "": 293, "": 294, "": 295, "": 296, "": 297, "": 298, "": 299, "": 300, "": 301, "": 302, "": 303, "": 304, "": 305, "": 306, "": 307, "": 308, "": 309, "": 310, "": 311, "": 312, "": 313, "": 314, "": 315, "": 316, "": 317, "": 318, "": 319, "": 320, "": 321, "": 322, "": 323, "": 324, "": 325, "": 326, "": 327, "": 328, "": 329, "": 330, "": 331, "": 332, "": 333, "": 334, "": 335, "": 336, "": 337, "": 338, "": 339, "": 340, "": 341, "": 342, "": 343, "": 344, "": 345, "": 346, "": 347, "": 348, "": 349, "": 350, "": 351, "": 352, "": 353, "": 354, "": 355, "": 356, "": 357, "": 358, "": 359, "": 360, "": 361, "": 362, "": 363, "": 364, "": 365, "": 366, "": 367, "": 368, "": 369, "": 370, "": 371, "": 372, "": 373, "": 374, "": 375, "": 376, "": 377, "": 378, "": 379, "": 380, "": 381, "": 382, "": 383, "": 384, "": 385, "": 386, "": 387, "": 388, "": 389, "": 390, "": 391, "": 392, "": 393, "": 394}, "i2c": {"0": "<pad>", "1": "<sos>", "2": "<eos>", "3": "<unk>", "4": "<DIPL>", "5": "<NORM>", "6": "\n", "7": " ", 
"8": "!", "9": "\"", "10": "#", "11": "&", "12": "'", "13": "(", "14": ")", "15": "*", "16": "+", "17": ",", "18": "-", "19": ".", "20": "0", "21": "1", "22": "2", "23": "3", "24": "4", "25": "5", "26": "6", "27": "7", "28": "8", "29": "9", "30": ":", "31": ";", "32": "=", "33": ">", "34": "?", "35": "A", "36": "B", "37": "C", "38": "D", "39": "E", "40": "F", "41": "G", "42": "H", "43": "I", "44": "J", "45": "K", "46": "L", "47": "M", "48": "N", "49": "O", "50": "P", "51": "Q", "52": "R", "53": "S", "54": "T", "55": "U", "56": "V", "57": "X", "58": "Y", "59": "Z", "60": "[", "61": "]", "62": "a", "63": "b", "64": "c", "65": "d", "66": "e", "67": "f", "68": "g", "69": "h", "70": "i", "71": "j", "72": "k", "73": "l", "74": "m", "75": "n", "76": "o", "77": "p", "78": "q", "79": "r", "80": "s", "81": "t", "82": "u", "83": "v", "84": "x", "85": "y", "86": "z", "87": "{", "88": "}", "89": "¯", "90": "Á", "91": "Æ", "92": "Í", "93": "Ó", "94": "Ø", "95": "Ú", "96": "Ý", "97": "Þ", "98": "á", "99": "æ", "100": "é", "101": "í", "102": "ð", "103": "ó", "104": "ø", "105": "ú", "106": "ý", "107": "þ", "108": "ā", "109": "ē", "110": "Ę", "111": "ę", "112": "ġ", "113": "ħ", "114": "ī", "115": "ı", "116": "ń", "117": "ŋ", "118": "ō", "119": "ŕ", "120": "ū", "121": "ſ", "122": "ƀ", "123": "ƚ", "124": "ƞ", "125": "ǣ", "126": "Ǫ", "127": "ǫ", "128": "ǭ", "129": "ǵ", "130": "Ǽ", "131": "ǽ", "132": "Ǿ", "133": "ǿ", "134": "ȳ", "135": "ȷ", "136": "ɢ", "137": "ɪ", "138": "ɴ", "139": "ʀ", "140": "ʙ", "141": "ʜ", "142": "ʟ", "143": "́", "144": "̄", "145": "̅", "146": "̇", "147": "̲", "148": "͛", "149": "ͣ", "150": "ͤ", "151": "ͥ", "152": "ͦ", "153": "ͧ", "154": "ͨ", "155": "ͫ", "156": "ͬ", "157": "ͭ", "158": "ͮ", "159": "ᛘ", "160": "ᴀ", "161": "ᴇ", "162": "ᴍ", "163": "᷑", "164": "᷒", "165": "ᷓ", "166": "ᷔ", "167": "ᷖ", "168": "ᷙ", "169": "ᷜ", "170": "ᷝ", "171": "ᷟ", "172": "ᷠ", "173": "ᷡ", "174": "ᷢ", "175": "ᷦ", "176": "ḗ", "177": "ḿ", "178": "ṁ", "179": "ṅ", "180": "ṓ", "181": "ṗ", 
"182": "Ṙ", "183": "ṙ", "184": "—", "185": "⁊", "186": "Ↄ", "187": "ↄ", "188": "⋅", "189": "✝", "190": "⟨", "191": "⟩", "192": "⸌", "193": "⸍", "194": "⸫", "195": "ꜳ", "196": "ꜵ", "197": "ꜷ", "198": "ꜹ", "199": "ꝁ", "200": "ꝇ", "201": "ꝉ", "202": "ꝑ", "203": "ꝓ", "204": "ꝗ", "205": "ꝛ", "206": "ꝝ", "207": "Ꝥ", "208": "ꝥ", "209": "ꝧ", "210": "ꝩ", "211": "ꝺ", "212": "Ꝼ", "213": "ꝼ", "214": "Ꞇ", "215": "ꞇ", "216": "", "217": "", "218": "", "219": "", "220": "", "221": "", "222": "", "223": "", "224": "", "225": "", "226": "", "227": "", "228": "", "229": "", "230": "", "231": "", "232": "", "233": "", "234": "", "235": "", "236": "", "237": "", "238": "", "239": "", "240": "", "241": "", "242": "", "243": "", "244": "", "245": "", "246": "", "247": "", "248": "", "249": "", "250": "", "251": "", "252": "", "253": "", "254": "", "255": "", "256": "", "257": "", "258": "", "259": "", "260": "", "261": "", "262": "", "263": "", "264": "", "265": "", "266": "", "267": "", "268": "", "269": "", "270": "", "271": "", "272": "", "273": "", "274": "", "275": "", "276": "", "277": "", "278": "", "279": "", "280": "", "281": "", "282": "", "283": "", "284": "", "285": "", "286": "", "287": "", "288": "", "289": "", "290": "", "291": "", "292": "", "293": "", "294": "", "295": "", "296": "", "297": "", "298": "", "299": "", "300": "", "301": "", "302": "", "303": "", "304": "", "305": "", "306": "", "307": "", "308": "", "309": "", "310": "", "311": "", "312": "", "313": "", "314": "", "315": "", "316": "", "317": "", "318": "", "319": "", "320": "", "321": "", "322": "", "323": "", "324": "", "325": "", "326": "", "327": "", "328": "", "329": "", "330": "", "331": "", "332": "", "333": "", "334": "", "335": "", "336": "", "337": "", "338": "", "339": "", "340": "", "341": "", "342": "", "343": "", "344": "", "345": "", "346": "", "347": "", "348": 
"", "349": "", "350": "", "351": "", "352": "", "353": "", "354": "", "355": "", "356": "", "357": "", "358": "", "359": "", "360": "", "361": "", "362": "", "363": "", "364": "", "365": "", "366": "", "367": "", "368": "", "369": "", "370": "", "371": "", "372": "", "373": "", "374": "", "375": "", "376": "", "377": "", "378": "", "379": "", "380": "", "381": "", "382": "", "383": "", "384": "", "385": "", "386": "", "387": "", "388": "", "389": "", "390": "", "391": "", "392": "", "393": "", "394": ""}, "DIPL_IDX": 4, "NORM_IDX": 5}