sentinel-universal-tokenizer / sentinel_manifold.json
5dimension's picture
🦴 v2.0: 65K text vocab, 30 languages, 300K+ samples
3824578 verified
Raw
History Blame Contribute Delete
1.29 kB
{
"version": "2.0.0",
"framework": "Sentinel Manifold",
"theorem": "lim F'(z)/F(z) = 1/e",
"function": "F(z) = \u03a3 z^n/n^n",
"text_vocab": 65536,
"image_codebook": 16384,
"audio_codebook": 8192,
"video_codebook": 4096,
"total_vocab": 94208,
"training_languages": 30,
"training_samples": 287600,
"training_chars": 465942294,
"constants": {
"INV_E": 0.36787944117144233,
"C1": -0.007994021805952546,
"C2": 0.00020005604296784437
},
"benchmark": {
"Sentinel-v2": {
"compress": 4.3427,
"fertility": 10.5022,
"vocab": 94208,
"efficiency": 0.046097
},
"GPT-2": {
"compress": 2.4381,
"fertility": 28.8158,
"vocab": 50257,
"efficiency": 0.048513
},
"Gemma": {
"compress": 5.3287,
"fertility": 8.348,
"vocab": 256000,
"efficiency": 0.020815
},
"Qwen2": {
"compress": 4.3289,
"fertility": 10.4499,
"vocab": 151936,
"efficiency": 0.028491
}
},
"modality_architecture": {
"text": "ByteLevel BPE (65,536), NFKC, 30 languages",
"image": "VQ codebook (16,384), Cosmos/VQGAN/FSQ compatible",
"audio": "VQ codebook (8,192), EnCodec/SoundStream compatible",
"video": "VQ codebook (4,096), Cosmos-DV compatible"
}
}