NKCZ committed on
Commit
9e47315
·
verified ·
1 Parent(s): 632e91c

Initial commit

Browse files

Added model, script, vocab, readme

Files changed (4) hide show
  1. best_model.pt +3 -0
  2. model_def_multitask.py +147 -0
  3. readme.md +152 -0
  4. vocab.json +1 -0
best_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb33ee6c7f76ed135c8e91832819010e8660fc829d3c7924308a32767354413b
3
+ size 50138134
model_def_multitask.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ facs2dipl / dipl2norm — Multi-task Char-level Transformer
3
+ Model definition, encode/decode helpers, and greedy inference.
4
+
5
+ Save this file to: /content/drive/MyDrive/facs2dipl_multitask/model_def.py
6
+ Then in any notebook run: %run /content/drive/MyDrive/facs2dipl_multitask/model_def.py
7
+ """
8
+
9
+ import math
10
+ import torch
11
+ import torch.nn as nn
12
+
13
# ── Special token indices (must match training) ────────────────────────────
# These ids are fixed positions in the vocabulary; the trained checkpoint
# depends on them, so they must not be renumbered.
PAD = 0  # padding
SOS = 1  # start of sequence (first decoder input token)
EOS = 2  # end of sequence
UNK = 3  # fallback id for characters missing from the vocabulary
# Task prefix token indices
DIPL_IDX = 4  # <DIPL> — facs→dipl task
NORM_IDX = 5  # <NORM> — dipl→norm task
18
+
19
+
20
+ # ── Model ──────────────────────────────────────────────────────────────────
21
+
22
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    A table of shape ``(1, max_len, d_model)`` is precomputed once and stored
    as the non-trainable buffer ``pe``. Even feature columns hold sines and
    odd feature columns hold cosines of the same position-dependent angles.

    Args:
        d_model: Size of the feature dimension. Both even and odd values are
            supported.
        max_len: Number of positions in the table; the longest sequence that
            ``forward`` accepts.
        dropout: Dropout probability applied after the table is added.
    """

    def __init__(self, d_model, max_len, dropout):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(max_len).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = pos * div
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one cosine column fewer than sine
        # columns, so drop the surplus angle column instead of crashing.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Leading axis of size 1 lets the table broadcast over the batch.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``dropout(x + pe[:, :x.size(1)])``.

        Args:
            x: Tensor whose axis 1 is the sequence axis and whose last axis
                has size ``d_model``.

        Raises:
            RuntimeError: If the sequence axis of ``x`` is longer than the
                precomputed table.
        """
        seq_len = x.size(1)
        table_len = self.pe.size(1)
        if seq_len > table_len:
            # Fail with an explicit message rather than an opaque broadcasting
            # error (or a silent broadcast when the table has one position).
            raise RuntimeError(
                f"sequence length {seq_len} exceeds positional encoding "
                f"max_len {table_len}"
            )
        return self.dropout(x + self.pe[:, :seq_len])
35
+
36
+
37
class CharSeq2Seq(nn.Module):
    """Character-level Transformer encoder-decoder.

    Source and target share one embedding table and one positional encoding.
    Encoder and decoder stacks use pre-norm layers (``norm_first=True``) with
    a final ``LayerNorm``, and all tensors are batch-first.

    Args:
        vocab_size: Number of token ids (rows of the embedding / output size).
        d_model: Embedding and hidden size.
        n_heads: Attention heads per layer.
        n_enc: Number of encoder layers.
        n_dec: Number of decoder layers.
        d_ff: Feed-forward width inside each layer.
        max_len: Number of positions in the positional-encoding table.
        dropout: Dropout probability used throughout.
    """

    def __init__(self, vocab_size, d_model, n_heads, n_enc, n_dec, d_ff, max_len, dropout):
        super().__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model, padding_idx=PAD)
        self.pos_enc = PositionalEncoding(d_model, max_len, dropout)

        # Encoder and decoder layers share the same hyper-parameters.
        layer_kwargs = dict(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_ff,
            dropout=dropout,
            batch_first=True,
            norm_first=True,
        )
        enc_layer = nn.TransformerEncoderLayer(**layer_kwargs)
        dec_layer = nn.TransformerDecoderLayer(**layer_kwargs)
        self.encoder = nn.TransformerEncoder(
            enc_layer, num_layers=n_enc, norm=nn.LayerNorm(d_model)
        )
        self.decoder = nn.TransformerDecoder(
            dec_layer, num_layers=n_dec, norm=nn.LayerNorm(d_model)
        )
        self.proj = nn.Linear(d_model, vocab_size)
        self._init_weights()

    def _init_weights(self):
        """Xavier-initialise every parameter that has more than one dimension."""
        matrices = (param for param in self.parameters() if param.dim() > 1)
        for param in matrices:
            nn.init.xavier_uniform_(param)

    def _embed(self, tokens):
        """Embed token ids, scale by sqrt(d_model), and add positions."""
        scaled = self.embed(tokens) * math.sqrt(self.d_model)
        return self.pos_enc(scaled)

    def encode(self, src, src_key_padding_mask):
        """Run the encoder stack over embedded source ids and return its output."""
        return self.encoder(
            self._embed(src), src_key_padding_mask=src_key_padding_mask
        )

    def decode(self, tgt, memory, tgt_mask, tgt_key_padding_mask, memory_key_padding_mask):
        """Run the decoder stack over embedded target ids, attending to ``memory``."""
        return self.decoder(
            self._embed(tgt),
            memory,
            tgt_mask=tgt_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )

    def forward(self, src, tgt):
        """Return vocabulary logits for every target position.

        Padding masks are derived from the ``PAD`` id, and a square
        subsequent mask sized to ``tgt.size(1)`` is applied to the target.
        """
        src_is_pad = src == PAD
        tgt_is_pad = tgt == PAD
        causal_mask = nn.Transformer.generate_square_subsequent_mask(
            tgt.size(1), device=src.device
        )
        memory = self.encode(src, src_is_pad)
        hidden = self.decode(tgt, memory, causal_mask, tgt_is_pad, src_is_pad)
        return self.proj(hidden)
76
+
77
+
78
+ # ── Helpers ────────────────────────────────────────────────────────────────
79
+
80
def encode_text(text, task_idx, c2i, max_len):
    """
    Turn a source string into a fixed-length list of token ids.

    Layout: [task_token, char, char, ..., EOS, PAD, ...]

    Characters missing from ``c2i`` map to UNK. The task token plus the
    characters are cut to ``max_len - 1`` ids so that EOS always fits, and the
    result is right-padded with PAD up to ``max_len``.
    """
    char_ids = [c2i.get(ch, UNK) for ch in text]
    prefixed = [task_idx, *char_ids]
    encoded = prefixed[:max_len - 1]
    encoded.append(EOS)
    missing = max_len - len(encoded)
    encoded.extend([PAD] * missing)
    return encoded
89
+
90
+
91
def encode_target(text, c2i, max_len):
    """
    Turn a target string into a fixed-length list of token ids.

    Layout: [SOS, char, char, ..., EOS, PAD, ...]

    Characters missing from ``c2i`` map to UNK. SOS plus the characters are
    cut to ``max_len - 1`` ids so that EOS always fits, and the result is
    right-padded with PAD up to ``max_len``.
    """
    char_ids = [c2i.get(ch, UNK) for ch in text]
    prefixed = [SOS, *char_ids]
    encoded = prefixed[:max_len - 1]
    encoded.append(EOS)
    missing = max_len - len(encoded)
    encoded.extend([PAD] * missing)
    return encoded
97
+
98
+
99
def decode_ids(ids, i2c):
    """
    Convert token ids back to a string.

    Reading stops at the first EOS. PAD, SOS, UNK and the two task-prefix ids
    produce no output, and ids missing from ``i2c`` contribute an empty string.
    """
    silent = {PAD, SOS, EOS, UNK, DIPL_IDX, NORM_IDX}
    pieces = []
    for token in ids:
        if token == EOS:
            break
        if token in silent:
            continue
        pieces.append(i2c.get(token, ''))
    return ''.join(pieces)
109
+
110
+
111
@torch.no_grad()
def greedy_decode(model, src, max_len, device, i2c):
    """
    Greedily decode a batch of source sequences into strings.

    The task is implicit in the prefix token at the start of each ``src`` row.
    The source is encoded once; the decoder is then re-run on the growing
    output, appending the highest-scoring token per row at each step. Rows
    that have emitted EOS receive PAD from then on, and decoding stops early
    once every row has finished.

    Args:
        model   : CharSeq2Seq (switched to eval mode by this function)
        src     : LongTensor (B, S) — first token is the task prefix
        max_len : int — at most ``max_len - 1`` tokens are generated
        device  : str or torch.device
        i2c     : dict[int, str]

    Returns:
        list[str] of length B
    """
    model.eval()
    src = src.to(device)
    src_is_pad = src == PAD
    memory = model.encode(src, src_is_pad)

    batch_size = src.size(0)
    generated = torch.full((batch_size, 1), SOS, dtype=torch.long, device=device)
    finished = torch.zeros(batch_size, dtype=torch.bool, device=device)

    for _step in range(max_len - 1):
        causal_mask = nn.Transformer.generate_square_subsequent_mask(
            generated.size(1), device=device
        )
        hidden = model.decode(
            generated, memory, causal_mask, generated == PAD, src_is_pad
        )
        # Only the last position predicts the next token.
        logits = model.proj(hidden[:, -1])
        chosen = logits.argmax(-1).masked_fill(finished, PAD)
        generated = torch.cat([generated, chosen.unsqueeze(1)], dim=1)
        finished = finished | (chosen == EOS)
        if finished.all():
            break

    return [decode_ids(row_ids, i2c) for row_ids in generated.tolist()]
readme.md ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - non
4
+ tags:
5
+ - text-normalization
6
+ - historical-text
7
+ - old-icelandic
8
+ - seq2seq
9
+ - character-level
10
+ - multi-task
11
+ - medieval
12
+ license: mit
13
+ datasets:
14
+ - custom
15
+ metrics:
16
+ - cer
17
+ - wer
18
+ ---
19
+
20
+ # Old Icelandic facs2dipl2norm
21
+
22
+ This repository contains a character-level transformer model for Old Icelandic manuscript normalisation tasks, specifically facsimile transcription to diplomatic transcription (facs → dipl) and diplomatic transcription to normalised form (dipl → norm).
23
+
24
+ The model was trained on all the available MENOTA texts edited by Andrea de Leeuw van Weenen (AM 132 fol., AM 519 a 4to, and AM 677 4to). This is around 75% of all the currently available MENOTA texts, which are normalised, lemmatised, and (at least partially) POS-tagged.
25
+
26
+ The model supports two Old Icelandic manuscript normalisation tasks:
27
+
28
+ - **facs → dipl**: facsimile transcription → diplomatic transcription (abbreviation expansion, character normalisation)
29
+ - **dipl → norm**: diplomatic transcription → normalised form (orthographic regularisation)
30
+
31
+ Task routing is controlled by a prefix token prepended to the source sequence — no architectural changes were necessary between tasks.
32
+
33
+ ## Model Details
34
+
35
+ | Property | Value |
36
+ |---|---|
37
+ | Architecture | Transformer encoder-decoder |
38
+ | Parameters | ~10M |
39
+ | Vocabulary | ~120 characters (data-derived) |
40
+ | Max sequence length | 128 characters |
41
+ | Model dimension | 256 |
42
+ | Attention heads | 4 |
43
+ | Encoder / decoder layers | 3 / 3 |
44
+ | Feed-forward dim | 512 |
45
+ | Task tokens | `<DIPL>` (facs→dipl), `<NORM>` (dipl→norm) |
46
+ | Training data | ~36k line-level triples |
47
+ | Language | Old Icelandic (`non`) |
48
+
49
+ ## Training Data
50
+
51
+ - Corpus size: 36240 text chunks of differing lengths, containing around 400k word tokens.
52
+
53
+ - Training-validation-test split: 80-10-10.
54
+
55
+ - Sources: <a href="https://clarino.uib.no/menota/catalogue/menota">AM 132 fol., AM 519 a 4to, and AM 677 4to</a>, edited and annotated by Andrea de Leeuw van Weenen.
56
+
57
+ ## Training
58
+ TODO
59
+
60
+
61
+ ## Performance
62
+
63
+ | Task | CER | WER |
64
+ |---|---|---|
65
+ | facs → dipl | 0.0112 | 0.0270 |
66
+ | dipl → norm | 0.0350 | 0.1370 |
67
+
68
+
69
+ ## Intended Use
70
+
71
+ This model is intended for researchers and digital humanists working with Old Icelandic manuscript material who need to automate or assist with the production of diplomatic and normalised transcriptions from facsimile-level texts (e.g., HTR output from models such as OICEN-HTR).
72
+
73
+ ## Usage
74
+
75
+ Try it out in <a href="https://colab.research.google.com/drive/13Rq2FZomqRjdG5DyHMNcuWSmv0rbq3qR?usp=sharing">Google Colab</a>!
76
+
77
+ ```python
78
+ import json, torch
79
+ from model_def import CharSeq2Seq, encode_text, decode_ids, greedy_decode, DIPL_IDX, NORM_IDX
80
+
81
+ # Load vocab
82
+ with open("vocab.json", encoding="utf-8") as f:
83
+ v = json.load(f)
84
+ c2i = v["c2i"]
85
+ i2c = {int(k): val for k, val in v["i2c"].items()}
86
+
87
+ # Load model
88
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
89
+ ckpt = torch.load("best_model.pt", map_location=DEVICE)
90
+ hp = ckpt["hparams"]
91
+
92
+ model = CharSeq2Seq(
93
+ vocab_size = hp["VOCAB_SIZE"],
94
+ d_model = hp["D_MODEL"],
95
+ n_heads = hp["N_HEADS"],
96
+ n_enc = hp["N_ENC"],
97
+ n_dec = hp["N_DEC"],
98
+ d_ff = hp["D_FF"],
99
+ max_len = hp["MAX_LEN"],
100
+ dropout = hp["DROPOUT"],
101
+ ).to(DEVICE)
102
+ model.load_state_dict(ckpt["model"])
103
+ model.eval()
104
+ ```
105
+
106
+ ### facs → dipl
107
+
108
+ ```python
109
+ MAX_LEN = hp["MAX_LEN"]
110
+
111
+ def predict_dipl(texts):
112
+ if isinstance(texts, str):
113
+ texts = [texts]
114
+ src = torch.tensor(
115
+ [encode_text(t, DIPL_IDX, c2i, MAX_LEN) for t in texts],
116
+ dtype=torch.long
117
+ )
118
+ return greedy_decode(model, src, MAX_LEN, DEVICE, i2c)
119
+
120
+ predict_dipl("koma egƚ. kappı þınu ⁊ ꝺırꝼð . en ſkaplynꝺı") # random line from test set
121
+ # → "koma eg(il)l kappi þinu (ok) dirfð . en ſkaplyndi"
122
+ ```
123
+
124
+ ### dipl → norm
125
+
126
+ ```python
127
+ def predict_norm(texts):
128
+ if isinstance(texts, str):
129
+ texts = [texts]
130
+ src = torch.tensor(
131
+ [encode_text(t, NORM_IDX, c2i, MAX_LEN) for t in texts],
132
+ dtype=torch.long
133
+ )
134
+ return greedy_decode(model, src, MAX_LEN, DEVICE, i2c)
135
+
136
+ predict_norm("TODO")
137
+ # → TODO
138
+ ```
139
+
140
+ ### Full pipeline: facs → dipl → norm
141
+
142
+ ```python
143
+ def predict_pipeline(texts):
144
+ if isinstance(texts, str):
145
+ texts = [texts]
146
+ dipl = predict_dipl(texts)
147
+ norm = predict_norm(dipl)
148
+ return list(zip(dipl, norm))
149
+
150
+ predict_pipeline("TODO")
151
+ # TODO
152
+ ```
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"c2i": {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3, "<DIPL>": 4, "<NORM>": 5, "\n": 6, " ": 7, "!": 8, "\"": 9, "#": 10, "&": 11, "'": 12, "(": 13, ")": 14, "*": 15, "+": 16, ",": 17, "-": 18, ".": 19, "0": 20, "1": 21, "2": 22, "3": 23, "4": 24, "5": 25, "6": 26, "7": 27, "8": 28, "9": 29, ":": 30, ";": 31, "=": 32, ">": 33, "?": 34, "A": 35, "B": 36, "C": 37, "D": 38, "E": 39, "F": 40, "G": 41, "H": 42, "I": 43, "J": 44, "K": 45, "L": 46, "M": 47, "N": 48, "O": 49, "P": 50, "Q": 51, "R": 52, "S": 53, "T": 54, "U": 55, "V": 56, "X": 57, "Y": 58, "Z": 59, "[": 60, "]": 61, "a": 62, "b": 63, "c": 64, "d": 65, "e": 66, "f": 67, "g": 68, "h": 69, "i": 70, "j": 71, "k": 72, "l": 73, "m": 74, "n": 75, "o": 76, "p": 77, "q": 78, "r": 79, "s": 80, "t": 81, "u": 82, "v": 83, "x": 84, "y": 85, "z": 86, "{": 87, "}": 88, "¯": 89, "Á": 90, "Æ": 91, "Í": 92, "Ó": 93, "Ø": 94, "Ú": 95, "Ý": 96, "Þ": 97, "á": 98, "æ": 99, "é": 100, "í": 101, "ð": 102, "ó": 103, "ø": 104, "ú": 105, "ý": 106, "þ": 107, "ā": 108, "ē": 109, "Ę": 110, "ę": 111, "ġ": 112, "ħ": 113, "ī": 114, "ı": 115, "ń": 116, "ŋ": 117, "ō": 118, "ŕ": 119, "ū": 120, "ſ": 121, "ƀ": 122, "ƚ": 123, "ƞ": 124, "ǣ": 125, "Ǫ": 126, "ǫ": 127, "ǭ": 128, "ǵ": 129, "Ǽ": 130, "ǽ": 131, "Ǿ": 132, "ǿ": 133, "ȳ": 134, "ȷ": 135, "ɢ": 136, "ɪ": 137, "ɴ": 138, "ʀ": 139, "ʙ": 140, "ʜ": 141, "ʟ": 142, "́": 143, "̄": 144, "̅": 145, "̇": 146, "̲": 147, "͛": 148, "ͣ": 149, "ͤ": 150, "ͥ": 151, "ͦ": 152, "ͧ": 153, "ͨ": 154, "ͫ": 155, "ͬ": 156, "ͭ": 157, "ͮ": 158, "ᛘ": 159, "ᴀ": 160, "ᴇ": 161, "ᴍ": 162, "᷑": 163, "᷒": 164, "ᷓ": 165, "ᷔ": 166, "ᷖ": 167, "ᷙ": 168, "ᷜ": 169, "ᷝ": 170, "ᷟ": 171, "ᷠ": 172, "ᷡ": 173, "ᷢ": 174, "ᷦ": 175, "ḗ": 176, "ḿ": 177, "ṁ": 178, "ṅ": 179, "ṓ": 180, "ṗ": 181, "Ṙ": 182, "ṙ": 183, "—": 184, "⁊": 185, "Ↄ": 186, "ↄ": 187, "⋅": 188, "✝": 189, "⟨": 190, "⟩": 191, "⸌": 192, "⸍": 193, "⸫": 194, "ꜳ": 195, "ꜵ": 196, "ꜷ": 197, "ꜹ": 198, "ꝁ": 199, "ꝇ": 200, "ꝉ": 201, "ꝑ": 202, "ꝓ": 203, "ꝗ": 204, "ꝛ": 205, "ꝝ": 206, 
"Ꝥ": 207, "ꝥ": 208, "ꝧ": 209, "ꝩ": 210, "ꝺ": 211, "Ꝼ": 212, "ꝼ": 213, "Ꞇ": 214, "ꞇ": 215, "": 216, "": 217, "": 218, "": 219, "": 220, "": 221, "": 222, "": 223, "": 224, "": 225, "": 226, "": 227, "": 228, "": 229, "": 230, "": 231, "": 232, "": 233, "": 234, "": 235, "": 236, "": 237, "": 238, "": 239, "": 240, "": 241, "": 242, "": 243, "": 244, "": 245, "": 246, "": 247, "": 248, "": 249, "": 250, "": 251, "": 252, "": 253, "": 254, "": 255, "": 256, "": 257, "": 258, "": 259, "": 260, "": 261, "": 262, "": 263, "": 264, "": 265, "": 266, "": 267, "": 268, "": 269, "": 270, "": 271, "": 272, "": 273, "": 274, "": 275, "": 276, "": 277, "": 278, "": 279, "": 280, "": 281, "": 282, "": 283, "": 284, "": 285, "": 286, "": 287, "": 288, "": 289, "": 290, "": 291, "": 292, "": 293, "": 294, "": 295, "": 296, "": 297, "": 298, "": 299, "": 300, "": 301, "": 302, "": 303, "": 304, "": 305, "": 306, "": 307, "": 308, "": 309, "": 310, "": 311, "": 312, "": 313, "": 314, "": 315, "": 316, "": 317, "": 318, "": 319, "": 320, "": 321, "": 322, "": 323, "": 324, "": 325, "": 326, "": 327, "": 328, "": 329, "": 330, "": 331, "": 332, "": 333, "": 334, "": 335, "": 336, "": 337, "": 338, "": 339, "": 340, "": 341, "": 342, "": 343, "": 344, "": 345, "": 346, "": 347, "": 348, "": 349, "": 350, "": 351, "": 352, "": 353, "": 354, "": 355, "": 356, "": 357, "": 358, "": 359, "": 360, "": 361, "": 362, "": 363, "": 364, "": 365, "": 366, "": 367, "": 368, "": 369, "": 370, "": 371, "": 372, "": 373, "": 374, "": 375, "": 376, "": 377, "": 378, "": 379, "": 380, "": 381, "": 382, "": 383, "": 384, "": 385, "": 386, "": 387, "": 388, "": 389, "": 390, "": 391, "": 392, "": 393, "": 394}, "i2c": {"0": "<pad>", "1": "<sos>", "2": "<eos>", "3": "<unk>", "4": "<DIPL>", "5": "<NORM>", "6": "\n", "7": " ", 
"8": "!", "9": "\"", "10": "#", "11": "&", "12": "'", "13": "(", "14": ")", "15": "*", "16": "+", "17": ",", "18": "-", "19": ".", "20": "0", "21": "1", "22": "2", "23": "3", "24": "4", "25": "5", "26": "6", "27": "7", "28": "8", "29": "9", "30": ":", "31": ";", "32": "=", "33": ">", "34": "?", "35": "A", "36": "B", "37": "C", "38": "D", "39": "E", "40": "F", "41": "G", "42": "H", "43": "I", "44": "J", "45": "K", "46": "L", "47": "M", "48": "N", "49": "O", "50": "P", "51": "Q", "52": "R", "53": "S", "54": "T", "55": "U", "56": "V", "57": "X", "58": "Y", "59": "Z", "60": "[", "61": "]", "62": "a", "63": "b", "64": "c", "65": "d", "66": "e", "67": "f", "68": "g", "69": "h", "70": "i", "71": "j", "72": "k", "73": "l", "74": "m", "75": "n", "76": "o", "77": "p", "78": "q", "79": "r", "80": "s", "81": "t", "82": "u", "83": "v", "84": "x", "85": "y", "86": "z", "87": "{", "88": "}", "89": "¯", "90": "Á", "91": "Æ", "92": "Í", "93": "Ó", "94": "Ø", "95": "Ú", "96": "Ý", "97": "Þ", "98": "á", "99": "æ", "100": "é", "101": "í", "102": "ð", "103": "ó", "104": "ø", "105": "ú", "106": "ý", "107": "þ", "108": "ā", "109": "ē", "110": "Ę", "111": "ę", "112": "ġ", "113": "ħ", "114": "ī", "115": "ı", "116": "ń", "117": "ŋ", "118": "ō", "119": "ŕ", "120": "ū", "121": "ſ", "122": "ƀ", "123": "ƚ", "124": "ƞ", "125": "ǣ", "126": "Ǫ", "127": "ǫ", "128": "ǭ", "129": "ǵ", "130": "Ǽ", "131": "ǽ", "132": "Ǿ", "133": "ǿ", "134": "ȳ", "135": "ȷ", "136": "ɢ", "137": "ɪ", "138": "ɴ", "139": "ʀ", "140": "ʙ", "141": "ʜ", "142": "ʟ", "143": "́", "144": "̄", "145": "̅", "146": "̇", "147": "̲", "148": "͛", "149": "ͣ", "150": "ͤ", "151": "ͥ", "152": "ͦ", "153": "ͧ", "154": "ͨ", "155": "ͫ", "156": "ͬ", "157": "ͭ", "158": "ͮ", "159": "ᛘ", "160": "ᴀ", "161": "ᴇ", "162": "ᴍ", "163": "᷑", "164": "᷒", "165": "ᷓ", "166": "ᷔ", "167": "ᷖ", "168": "ᷙ", "169": "ᷜ", "170": "ᷝ", "171": "ᷟ", "172": "ᷠ", "173": "ᷡ", "174": "ᷢ", "175": "ᷦ", "176": "ḗ", "177": "ḿ", "178": "ṁ", "179": "ṅ", "180": "ṓ", "181": "ṗ", 
"182": "Ṙ", "183": "ṙ", "184": "—", "185": "⁊", "186": "Ↄ", "187": "ↄ", "188": "⋅", "189": "✝", "190": "⟨", "191": "⟩", "192": "⸌", "193": "⸍", "194": "⸫", "195": "ꜳ", "196": "ꜵ", "197": "ꜷ", "198": "ꜹ", "199": "ꝁ", "200": "ꝇ", "201": "ꝉ", "202": "ꝑ", "203": "ꝓ", "204": "ꝗ", "205": "ꝛ", "206": "ꝝ", "207": "Ꝥ", "208": "ꝥ", "209": "ꝧ", "210": "ꝩ", "211": "ꝺ", "212": "Ꝼ", "213": "ꝼ", "214": "Ꞇ", "215": "ꞇ", "216": "", "217": "", "218": "", "219": "", "220": "", "221": "", "222": "", "223": "", "224": "", "225": "", "226": "", "227": "", "228": "", "229": "", "230": "", "231": "", "232": "", "233": "", "234": "", "235": "", "236": "", "237": "", "238": "", "239": "", "240": "", "241": "", "242": "", "243": "", "244": "", "245": "", "246": "", "247": "", "248": "", "249": "", "250": "", "251": "", "252": "", "253": "", "254": "", "255": "", "256": "", "257": "", "258": "", "259": "", "260": "", "261": "", "262": "", "263": "", "264": "", "265": "", "266": "", "267": "", "268": "", "269": "", "270": "", "271": "", "272": "", "273": "", "274": "", "275": "", "276": "", "277": "", "278": "", "279": "", "280": "", "281": "", "282": "", "283": "", "284": "", "285": "", "286": "", "287": "", "288": "", "289": "", "290": "", "291": "", "292": "", "293": "", "294": "", "295": "", "296": "", "297": "", "298": "", "299": "", "300": "", "301": "", "302": "", "303": "", "304": "", "305": "", "306": "", "307": "", "308": "", "309": "", "310": "", "311": "", "312": "", "313": "", "314": "", "315": "", "316": "", "317": "", "318": "", "319": "", "320": "", "321": "", "322": "", "323": "", "324": "", "325": "", "326": "", "327": "", "328": "", "329": "", "330": "", "331": "", "332": "", "333": "", "334": "", "335": "", "336": "", "337": "", "338": "", "339": "", "340": "", "341": "", "342": "", "343": "", "344": "", "345": "", "346": "", "347": "", "348": 
"", "349": "", "350": "", "351": "", "352": "", "353": "", "354": "", "355": "", "356": "", "357": "", "358": "", "359": "", "360": "", "361": "", "362": "", "363": "", "364": "", "365": "", "366": "", "367": "", "368": "", "369": "", "370": "", "371": "", "372": "", "373": "", "374": "", "375": "", "376": "", "377": "", "378": "", "379": "", "380": "", "381": "", "382": "", "383": "", "384": "", "385": "", "386": "", "387": "", "388": "", "389": "", "390": "", "391": "", "392": "", "393": "", "394": ""}, "DIPL_IDX": 4, "NORM_IDX": 5}