diff --git a/Basque/.gitattributes b/Basque/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Basque/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Basque/README.md b/Basque/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Basque/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Basque/added_tokens.json b/Basque/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..9a6e8091709a1384a0e5c220579d695bf8504888 --- /dev/null +++ b/Basque/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 44 +} diff --git a/Basque/special_tokens_map.json b/Basque/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Basque/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Basque/tokenizer.json b/Basque/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1f3e1e2e47f5cb7fd454dab2d15ba2ad8d689bae --- /dev/null +++ b/Basque/tokenizer.json @@ -0,0 +1,159 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "b": 4, + "ai̯": 5, + "e": 6, + "s̪̻": 7, + "ɟ": 8, + "ei̯": 9, + "t̺s̺": 10, + "i": 11, + "oi̯": 12, + "a": 13, + "ɾ": 14, + "k": 15, + "t̠ʃ": 16, + "s̺": 17, + "l": 18, + "p": 19, + "o": 20, + "r": 21, + "t̪": 22, + "u": 23, + "n": 24, + "m": 25, + "ð": 26, + "t̪̻s̪̻": 27, + "β": 28, + "ʎ": 29, + "ɡ": 30, + "ɣ": 31, + "au̯": 32, + "c": 33, + "j": 34, + "d̪": 35, + "ʃ": 36, + "ɲ": 37, + "f": 38, + "eu̯": 39, + "θ": 40, + "x": 41 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Basque/tokenizer_config.json b/Basque/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Basque/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Basque/vocab.json b/Basque/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..8a4570a58693e64f104d5220a47a7a5b5199005b --- /dev/null +++ b/Basque/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"b":4,"ai̯":5,"e":6,"s̪̻":7,"ɟ":8,"ei̯":9,"t̺s̺":10,"i":11,"oi̯":12,"a":13,"ɾ":14,"k":15,"t̠ʃ":16,"s̺":17,"l":18,"p":19,"o":20,"r":21,"t̪":22,"u":23,"n":24,"m":25,"ð":26,"t̪̻s̪̻":27,"β":28,"ʎ":29,"ɡ":30,"ɣ":31,"au̯":32,"c":33,"j":34,"d̪":35,"ʃ":36,"ɲ":37,"f":38,"eu̯":39,"θ":40,"x":41} \ No newline at end of file diff --git a/Cantonese/.gitattributes b/Cantonese/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Cantonese/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Cantonese/README.md b/Cantonese/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Cantonese/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Cantonese/added_tokens.json b/Cantonese/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..b1f8010f65098c268512eb96a20eb712ba0e7dac --- /dev/null +++ b/Cantonese/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 154 +} diff --git a/Cantonese/special_tokens_map.json b/Cantonese/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Cantonese/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Cantonese/tokenizer.json b/Cantonese/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..4740600ed1b2e8271f303caee92ec24bfed0ec39 --- /dev/null +++ b/Cantonese/tokenizer.json @@ -0,0 +1,269 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "WhitespaceSplit" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "j": 4, + "ɐ˥": 5, + "t": 6, + "k": 7, + "ɐu˧˥": 8, + "i˨": 9, + "n": 10, + "i˧˩̰": 11, + "y˨": 12, + "s": 13, + "ɐ˨": 14, + "p": 15, + "ts": 16, + "ɐu˥": 17, + "ɪ̞˧˥": 18, + "ŋ": 19, + "ɵ˧": 20, + "a̞˧": 21, + "l": 22, + "ʊ̟˥": 23, + "a̞˧˩̰": 24, + "ɛ˥": 25, + "ei˩˧": 26, + "w": 27, + "a̞˨": 28, + "ɐi˧˥": 29, + "a̞˧˥": 30, + "m̩˧˥": 31, + "m": 32, + "ou˥": 33, + "ei˥": 34, + "i˧": 35, + "ɔ̽˧˥": 36, + "tʰ": 37, + "i˥": 38, + "f": 39, + "aːĭ˧": 40, + "h": 41, + "ɵy˧": 42, + "a̞˥": 43, + "ei˧˩̰": 44, + "ou˨": 45, + "ɔ̽˧": 46, + "ɐi˧˩̰": 47, + "u˧": 48, + "ɔːĭ˥": 49, + "ɐu˨": 50, + "ei˧˥": 51, + "ɐi˨": 52, + "ʊ̟˧˩̰": 53, + "ʊ̟˨": 54, + "a̞˩˧": 55, + "ou˧˥": 56, + "aːĭ˧˥": 57, + "ɔ̽˨": 58, + "ɛ˩˧": 59, + "ɪ̞˨": 60, + "iːŭ˧": 61, + "ɛ˧˩̰": 62, + "m̩˧˩̰": 63, + "ɵ˧˥": 64, + "ei˧": 65, + "ɐu˧˩̰": 66, + "m̩˧": 67, + "ɐ˧˥": 68, + "ɐu˩˧": 69, + "ɐi˥": 70, + "ɔ̽˥": 71, + "ɔ̽˧˩̰": 72, + "ɔːĭ˧": 73, + "ou˩˧": 74, + "m̩˥": 75, + "ɐ˧": 76, + "tsʰ": 77, + "ɛ˧˥": 78, + "i˧˥": 79, + "ɔ̽˩˧": 80, + "kʰ": 81, + "ɐ˧˩̰": 82, + "aːŭ˧˥": 83, + "pʰ": 84, + "aːĭ˧˩̰": 85, + "ɵy˩˧": 86, + "ɛ˧": 87, + "u˧˥": 88, + "ɛ˨": 89, + "ʊ̟˧": 90, + "u˥": 91, + "m̩˩˧": 92, + "aːŭ˧": 93, + "œ̞˩˧": 94, + "i˩˧": 95, + "ɪ̞˧˩̰": 96, + "u˨": 97, + "ɪ̞˥": 98, + "iːŭ˧˩̰": 99, + "œ̞˧˥": 100, + "y˧": 101, + "uːĭ˩˧": 102, + "uːĭ˥": 103, + "ɵy˧˥": 104, + "y˧˩̰": 105, + "ɔːĭ˧˥": 106, + "ɛ": 107, + "ou˧": 108, + "ei˨": 109, + "ɵ˥": 110, + "u˧˩̰": 111, + "y˥": 112, + "œ̞˥": 113, + "œ̞˧˩̰": 114, + "aːĭ˨": 115, + "ɐ˩˧": 116, + "œ̞˧": 117, + "uːĭ˧˥": 118, + "ɐu˧": 119, + "ɐi˩˧": 120, + "ɐi˧": 121, + "ou˧˩̰": 122, + "aːĭ˥": 123, + "aːŭ˥": 124, + "ŋ˩˧": 125, + "y˧˥": 126, + "iːŭ˥": 127, + "ɔːĭ˨": 128, + "ʊ̟˧˥": 129, + "iːŭ˧˥": 130, + "ɵy˥": 131, + "ɔːĭ˧˩̰": 132, + "uːĭ˧": 133, + "ɵy˧˩̰": 134, + "œ̞˨": 135, + "m̩˨": 136, + "aːŭ˧˩̰": 137, + "y˩˧": 138, + "aːŭ˩˧": 139, + "aːĭ˩˧": 140, + "uːĭ˨": 141, + "ɵy˨": 142, + "aːŭ˨": 143, + "ɪ̞˧": 144, + "ɵ˨": 145, + "iːŭ˩˧": 146, + "iːŭ˨": 147, + "ɵ˧˩̰": 148, + "uːĭ˧˩̰": 149, + "u˩˧": 150, + "ŋ˧˩̰": 151 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Cantonese/tokenizer_config.json b/Cantonese/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Cantonese/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Cantonese/vocab.json b/Cantonese/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..e1046969f94bc52e30222072b560ca895d699c9c --- /dev/null +++ b/Cantonese/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"j":4,"ɐ˥":5,"t":6,"k":7,"ɐu˧˥":8,"i˨":9,"n":10,"i˧˩̰":11,"y˨":12,"s":13,"ɐ˨":14,"p":15,"ts":16,"ɐu˥":17,"ɪ̞˧˥":18,"ŋ":19,"ɵ˧":20,"a̞˧":21,"l":22,"ʊ̟˥":23,"a̞˧˩̰":24,"ɛ˥":25,"ei˩˧":26,"w":27,"a̞˨":28,"ɐi˧˥":29,"a̞˧˥":30,"m̩˧˥":31,"m":32,"ou˥":33,"ei˥":34,"i˧":35,"ɔ̽˧˥":36,"tʰ":37,"i˥":38,"f":39,"aːĭ˧":40,"h":41,"ɵy˧":42,"a̞˥":43,"ei˧˩̰":44,"ou˨":45,"ɔ̽˧":46,"ɐi˧˩̰":47,"u˧":48,"ɔːĭ˥":49,"ɐu˨":50,"ei˧˥":51,"ɐi˨":52,"ʊ̟˧˩̰":53,"ʊ̟˨":54,"a̞˩˧":55,"ou˧˥":56,"aːĭ˧˥":57,"ɔ̽˨":58,"ɛ˩˧":59,"ɪ̞˨":60,"iːŭ˧":61,"ɛ˧˩̰":62,"m̩˧˩̰":63,"ɵ˧˥":64,"ei˧":65,"ɐu˧˩̰":66,"m̩˧":67,"ɐ˧˥":68,"ɐu˩˧":69,"ɐi˥":70,"ɔ̽˥":71,"ɔ̽˧˩̰":72,"ɔːĭ˧":73,"ou˩˧":74,"m̩˥":75,"ɐ˧":76,"tsʰ":77,"ɛ˧˥":78,"i˧˥":79,"ɔ̽˩˧":80,"kʰ":81,"ɐ˧˩̰":82,"aːŭ˧˥":83,"pʰ":84,"aːĭ˧˩̰":85,"ɵy˩˧":86,"ɛ˧":87,"u˧˥":88,"ɛ˨":89,"ʊ̟˧":90,"u˥":91,"m̩˩˧":92,"aːŭ˧":93,"œ̞˩˧":94,"i˩˧":95,"ɪ̞˧˩̰":96,"u˨":97,"ɪ̞˥":98,"iːŭ˧˩̰":99,"œ̞˧˥":100,"y˧":101,"uːĭ˩˧":102,"uːĭ˥":103,"ɵy˧˥":104,"y˧˩̰":105,"ɔːĭ˧˥":106,"ɛ":107,"ou˧":108,"ei˨":109,"ɵ˥":110,"u˧˩̰":111,"y˥":112,"œ̞˥":113,"œ̞˧˩̰":114,"aːĭ˨":115,"ɐ˩˧":116,"œ̞˧":117,"uːĭ˧˥":118,"ɐu˧":119,"ɐi˩˧":120,"ɐi˧":121,"ou˧˩̰":122,"aːĭ˥":123,"aːŭ˥":124,"ŋ˩˧":125,"y˧˥":126,"iːŭ˥":127,"ɔːĭ˨":128,"ʊ̟˧˥":129,"iːŭ˧˥":130,"ɵy˥":131,"ɔːĭ˧˩̰":132,"uːĭ˧":133,"ɵy˧˩̰":134,"œ̞˨":135,"m̩˨":136,"aːŭ˧˩̰":137,"y˩˧":138,"aːŭ˩˧":139,"aːĭ˩˧":140,"uːĭ˨":141,"ɵy˨":142,"aːŭ˨":143,"ɪ̞˧":144,"ɵ˨":145,"iːŭ˩˧":146,"iːŭ˨":147,"ɵ˧˩̰":148,"uːĭ˧˩̰":149,"u˩˧":150,"ŋ˧˩̰":151} \ No newline at end of file diff --git a/Catalan/.gitattributes b/Catalan/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Catalan/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Catalan/README.md b/Catalan/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Catalan/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Catalan/added_tokens.json b/Catalan/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..173b4e3e8487e918f68d170c4edb9aeb93fb119f --- /dev/null +++ b/Catalan/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 45 +} diff --git a/Catalan/special_tokens_map.json b/Catalan/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Catalan/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Catalan/tokenizer.json b/Catalan/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..2f47ac4a217d546ab7159e3ee6b17383272385de --- /dev/null +++ b/Catalan/tokenizer.json @@ -0,0 +1,157 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "a": 4, + "w": 5, + "ɛ": 6, + "ə": 7, + "ð": 8, + "t̪": 9, + "j": 10, + "i": 11, + "ɔ": 12, + "n̺": 13, + "z̺": 14, + "d̪": 15, + "s̺": 16, + "β": 17, + "m": 18, + "e": 19, + "f": 20, + "ɾ̺": 21, + "r̺": 22, + "u̯": 23, + "k": 24, + "u": 25, + "b": 26, + "p": 27, + "ɣ": 28, + "ɡ": 29, + "ŋ": 30, + "o": 31, + "ɫ̺": 32, + "ɲ̟": 33, + "ʒ": 34, + "ʃ": 35, + "ʎ̟": 36, + "t̠ʃ": 37, + "d̠ʒ": 38, + "ts̺": 39 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Catalan/tokenizer_config.json b/Catalan/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Catalan/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Catalan/vocab.json b/Catalan/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..cb92cd365bcaba617e0e8acc7f26b5442d6531a8 --- /dev/null +++ b/Catalan/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"a":4,"w":5,"ɛ":6,"ə":7,"ð":8,"t̪":9,"j":10,"i":11,"ɔ":12,"n̺":13,"z̺":14,"d̪":15,"s̺":16,"β":17,"m":18,"e":19,"f":20,"ɾ̺":21,"r̺":22,"u̯":23,"k":24,"u":25,"b":26,"p":27,"ɣ":28,"ɡ":29,"ŋ":30,"o":31,"ɫ̺":32,"ɲ̟":33,"ʒ":34,"ʃ":35,"ʎ̟":36,"t̠ʃ":37,"d̠ʒ":38,"ts̺":39} \ No newline at end of file diff --git a/Croatian/.gitattributes b/Croatian/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Croatian/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Croatian/README.md b/Croatian/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Croatian/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Croatian/added_tokens.json b/Croatian/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..21f9e91b654510683215be785ef011a9205f68c8 --- /dev/null +++ b/Croatian/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 48 +} diff --git a/Croatian/special_tokens_map.json b/Croatian/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Croatian/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Croatian/tokenizer.json b/Croatian/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..e4a3da73872c082aa8e19c6fbe3bacc5b6e8595f --- /dev/null +++ b/Croatian/tokenizer.json @@ -0,0 +1,152 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "e": 4, + "a": 5, + "u": 6, + "x": 7, + "k": 8, + "t̪": 9, + "n": 10, + "o": 11, + "d̪": 12, + "i": 13, + "r": 14, + "m": 15, + "ʃ": 16, + "p": 17, + "s": 18, + "ʋ": 19, + "j": 20, + "t̠ʃ": 21, + "l": 22, + "ɡ": 23, + "ʒ": 24, + "b": 25, + "t̪s": 26, + "z": 27, + "d̠ʒ": 28, + "ʎ": 29, + "f": 30, + "ɲ": 31, + "y": 32, + "q": 33, + "w": 34 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Croatian/tokenizer_config.json b/Croatian/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Croatian/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Croatian/vocab.json b/Croatian/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..7af04ca32de518837dfdd7a7e0abe4b26f2de65f --- /dev/null +++ b/Croatian/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"e":4,"a":5,"u":6,"x":7,"k":8,"t̪":9,"n":10,"o":11,"d̪":12,"i":13,"r":14,"m":15,"ʃ":16,"p":17,"s":18,"ʋ":19,"j":20,"t̠ʃ":21,"l":22,"ɡ":23,"ʒ":24,"b":25,"t̪s":26,"z":27,"d̠ʒ":28,"ʎ":29,"f":30,"ɲ":31,"y":32,"q":33,"w":34} \ No newline at end of file diff --git a/Danish/.gitattributes b/Danish/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Danish/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Danish/README.md b/Danish/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Danish/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Danish/added_tokens.json b/Danish/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..35c56cfce187b0a20457d3e9ccf194ef3dcfcdfa --- /dev/null +++ b/Danish/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 47 +} diff --git a/Danish/special_tokens_map.json b/Danish/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Danish/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Danish/tokenizer.json b/Danish/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..c8d4d2291abfac684584a936a36c9fb5897b4047 --- /dev/null +++ b/Danish/tokenizer.json @@ -0,0 +1,172 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "n": 4, + "oˤ": 5, + "t": 6, + "y": 7, + "ə": 8, + "ð": 9, + "ʁ": 10, + "ɑˤː": 11, + "s": 12, + "k": 13, + "i": 14, + "b": 15, + "eˤ": 16, + "t̠ʃ": 17, + "a": 18, + "l": 19, + "d": 20, + "ɡ": 21, + "f": 22, + "e": 23, + "ɛ": 24, + "r": 25, + "ɔ": 26, + "w": 27, + "ɔˤ": 28, + "m": 29, + "uˤ": 30, + "j": 31, + "ɑ": 32, + "u": 33, + "ɒ": 34, + "iˤ": 35, + "ʋ": 36, + "h": 37, + "œ": 38, + "p": 39, + "ɕ": 40, + "o": 41, + "ŋ": 42, + "ɒː": 43, + "aˤ": 44, + "ɜ": 45, + "œː": 46, + "eː": 47, + "aː": 48, + "d̠ʒ": 49, + "uː": 50, + "ɔː": 51, + "oː": 52, + "iː": 53, + "yː": 54 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Danish/tokenizer_config.json b/Danish/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Danish/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Danish/vocab.json b/Danish/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..644d9d176c7fa0a71490f9530f4c8ee1f944b4f9 --- /dev/null +++ b/Danish/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"n":4,"oˤ":5,"t":6,"y":7,"ə":8,"ð":9,"ʁ":10,"ɑˤː":11,"s":12,"k":13,"i":14,"b":15,"eˤ":16,"t̠ʃ":17,"a":18,"l":19,"d":20,"ɡ":21,"f":22,"e":23,"ɛ":24,"r":25,"ɔ":26,"w":27,"ɔˤ":28,"m":29,"uˤ":30,"j":31,"ɑ":32,"u":33,"ɒ":34,"iˤ":35,"ʋ":36,"h":37,"œ":38,"p":39,"ɕ":40,"o":41,"ŋ":42,"ɒː":43,"aˤ":44,"ɜ":45,"œː":46,"eː":47,"aː":48,"d̠ʒ":49,"uː":50,"ɔː":51,"oː":52,"iː":53,"yː":54} \ No newline at end of file diff --git a/Dutch/.gitattributes b/Dutch/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Dutch/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Dutch/README.md b/Dutch/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Dutch/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Dutch/added_tokens.json b/Dutch/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..ed3360c4935f5ee7ddf3fb06e0c313f86da320d8 --- /dev/null +++ b/Dutch/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 55 +} diff --git a/Dutch/special_tokens_map.json b/Dutch/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Dutch/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Dutch/tokenizer.json b/Dutch/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..8301a1f0cd6e7d4df60263df3ad8cf94f491363a --- /dev/null +++ b/Dutch/tokenizer.json @@ -0,0 +1,167 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "z": 4, + "oː": 5, + "j": 6, + "ãː": 7, + "ɦ": 8, + "ɾ": 9, + "d": 10, + "i": 11, + "ɛ": 12, + "p": 13, + "ɪ": 14, + "k": 15, + "ɑ": 16, + "l": 17, + "ɛː": 18, + "n": 19, + "s": 20, + "v": 21, + "ə": 22, + "ɛi": 23, + "ʋ": 24, + "t": 25, + "m": 26, + "ɣ": 27, + "ʏ": 28, + "ɔ": 29, + "x": 30, + "u": 31, + "f": 32, + "ŋ": 33, + "øː": 34, + "b": 35, + "ɔː": 36, + "ʌu": 37, + "y": 38, + "œy": 39, + "tʲ": 40, + "w": 41, + "ʃ": 42, + "t̠ʃ": 43, + "ɲ": 44, + "ʒ": 45, + "iː": 46, + "ɡ": 47, + "d̠ʒ": 48, + "ã": 49 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Dutch/tokenizer_config.json b/Dutch/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Dutch/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Dutch/vocab.json b/Dutch/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..e32f88f0726392ee8f6b60a3baa5159263318ee0 --- /dev/null +++ b/Dutch/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"z":4,"oː":5,"j":6,"ãː":7,"ɦ":8,"ɾ":9,"d":10,"i":11,"ɛ":12,"p":13,"ɪ":14,"k":15,"ɑ":16,"l":17,"ɛː":18,"n":19,"s":20,"v":21,"ə":22,"ɛi":23,"ʋ":24,"t":25,"m":26,"ɣ":27,"ʏ":28,"ɔ":29,"x":30,"u":31,"f":32,"ŋ":33,"øː":34,"b":35,"ɔː":36,"ʌu":37,"y":38,"œy":39,"tʲ":40,"w":41,"ʃ":42,"t̠ʃ":43,"ɲ":44,"ʒ":45,"iː":46,"ɡ":47,"d̠ʒ":48,"ã":49} \ No newline at end of file diff --git a/EnglishUK/.gitattributes b/EnglishUK/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/EnglishUK/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/EnglishUK/README.md b/EnglishUK/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/EnglishUK/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/EnglishUK/added_tokens.json b/EnglishUK/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..4e63d7e4bd7282359e3ce957e01e29a1c1265211 --- /dev/null +++ b/EnglishUK/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 56 +} diff --git a/EnglishUK/special_tokens_map.json b/EnglishUK/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/EnglishUK/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/EnglishUK/tokenizer.json b/EnglishUK/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..cc19c38fcc9ecbc578817f1093baceee41e44cb0 --- /dev/null +++ b/EnglishUK/tokenizer.json @@ -0,0 +1,168 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "ð": 4, + "æ": 5, + "tʰ": 6, + "ɡ": 7, + "ʊ": 8, + "d": 9, + "ɑː": 10, + "l": 11, + "ɪ": 12, + "n": 13, + "eɪ": 14, + "t̠ʃ": 15, + "w": 16, + "ɒ": 17, + "ʌ": 18, + "z": 19, + "m": 20, + "iː": 21, + "aɪ": 22, + "h": 23, + "e": 24, + "kʰ": 25, + "s": 26, + "ə": 27, + "ɔː": 28, + "ɹ": 29, + "i": 30, + "əʊ": 31, + "uː": 32, + "j": 33, + "ɪə": 34, + "ɔɪ": 35, + "v": 36, + "f": 37, + "ɜː": 38, + "b": 39, + "pʰ": 40, + "d̠ʒ": 41, + "ɐ": 42, + "eə": 43, + "ʃ": 44, + "θ": 45, + "ŋ": 46, + "aʊ": 47, + "ʊə": 48, + "n̩": 49, + "ʒ": 50 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/EnglishUK/tokenizer_config.json b/EnglishUK/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/EnglishUK/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/EnglishUK/vocab.json b/EnglishUK/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..fa1afb7415edd0a141f1a813765ac30449e0a552 --- /dev/null +++ b/EnglishUK/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"ð":4,"æ":5,"tʰ":6,"ɡ":7,"ʊ":8,"d":9,"ɑː":10,"l":11,"ɪ":12,"n":13,"eɪ":14,"t̠ʃ":15,"w":16,"ɒ":17,"ʌ":18,"z":19,"m":20,"iː":21,"aɪ":22,"h":23,"e":24,"kʰ":25,"s":26,"ə":27,"ɔː":28,"ɹ":29,"i":30,"əʊ":31,"uː":32,"j":33,"ɪə":34,"ɔɪ":35,"v":36,"f":37,"ɜː":38,"b":39,"pʰ":40,"d̠ʒ":41,"ɐ":42,"eə":43,"ʃ":44,"θ":45,"ŋ":46,"aʊ":47,"ʊə":48,"n̩":49,"ʒ":50} \ No newline at end of file diff --git a/Estonian/.gitattributes b/Estonian/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Estonian/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Estonian/README.md b/Estonian/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Estonian/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Estonian/added_tokens.json b/Estonian/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..4e63d7e4bd7282359e3ce957e01e29a1c1265211 --- /dev/null +++ b/Estonian/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 56 +} diff --git a/Estonian/special_tokens_map.json b/Estonian/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Estonian/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Estonian/tokenizer.json b/Estonian/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..bacb89b3d32ff26618f7e63715ad993083909ff6 --- /dev/null +++ b/Estonian/tokenizer.json @@ -0,0 +1,185 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "n": 4, + "o": 5, + "t": 6, + "ʃ": 7, + "a": 8, + "uː": 9, + "m": 10, + "u": 11, + "tʲ": 12, + "i": 13, + "s": 14, + "eː": 15, + "d": 16, + "iː": 17, + "k": 18, + "ɡ": 19, + "ɑ": 20, + "ɤ": 21, + "ʊ": 22, + "sʲ": 23, + "j": 24, + "aː": 25, + "h": 26, + "v": 27, + "æi": 28, + "kː": 29, + "e": 30, + "ɪ": 31, + "tː": 32, + "r": 33, + "ɛ": 34, + "mː": 35, + "p": 36, + "sː": 37, + "æ": 38, + "l": 39, + "pː": 40, + "yː": 41, + "æː": 42, + "b": 43, + "ɔ": 44, + "ɤː": 45, + "lː": 46, + "ø": 47, + "øː": 48, + "ŋ": 49, + "y": 50, + "oː": 51, + "rː": 52, + "ɲ": 53, + "nː": 54, + "w": 55, + "tʲː": 56, + "øɪ̯": 57, + "f": 58, + "dʲ": 59, + "sʲː": 60, + "t̠ʃ": 61, + "ʃː": 62, + "ʒ": 63, + "z": 64, + "fː": 65, + "dː": 66, + "yi": 67 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Estonian/tokenizer_config.json b/Estonian/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Estonian/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Estonian/vocab.json b/Estonian/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..f1616a438e1ce75a34684c1c7c797df4df2762e9 --- /dev/null +++ b/Estonian/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"n":4,"o":5,"t":6,"ʃ":7,"a":8,"uː":9,"m":10,"u":11,"tʲ":12,"i":13,"s":14,"eː":15,"d":16,"iː":17,"k":18,"ɡ":19,"ɑ":20,"ɤ":21,"ʊ":22,"sʲ":23,"j":24,"aː":25,"h":26,"v":27,"æi":28,"kː":29,"e":30,"ɪ":31,"tː":32,"r":33,"ɛ":34,"mː":35,"p":36,"sː":37,"æ":38,"l":39,"pː":40,"yː":41,"æː":42,"b":43,"ɔ":44,"ɤː":45,"lː":46,"ø":47,"øː":48,"ŋ":49,"y":50,"oː":51,"rː":52,"ɲ":53,"nː":54,"w":55,"tʲː":56,"øɪ̯":57,"f":58,"dʲ":59,"sʲː":60,"t̠ʃ":61,"ʃː":62,"ʒ":63,"z":64,"fː":65,"dː":66,"yi":67} \ No newline at end of file diff --git a/Farsi/.gitattributes b/Farsi/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Farsi/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Farsi/README.md b/Farsi/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Farsi/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Farsi/added_tokens.json b/Farsi/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..e8d07e099125b3795d380ff6f412b450a7ee1edd --- /dev/null +++ b/Farsi/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 31 +} diff --git a/Farsi/special_tokens_map.json b/Farsi/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Farsi/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Farsi/tokenizer.json b/Farsi/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..aae4e6922e8be600b5dd29b5d41e66aea2a4aad8 --- /dev/null +++ b/Farsi/tokenizer.json @@ -0,0 +1,148 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "m": 4, + "a̟": 5, + "b": 6, + "s": 7, + "e": 8, + "r": 9, + "j": 10, + "h": 11, + "t̠ʃ": 12, + "kʰ": 13, + "d̪": 14, + "n̪": 15, + "z": 16, + "ʃ": 17, + "ɡ": 18, + "i": 19, + "u": 20, + "o": 21, + "f": 22, + "t̪ʰ": 23, + "ɑ": 24, + "d̠ʒ": 25, + "v": 26, + "pʰ": 27, + "l": 28, + "w": 29, + "ɢ": 30 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Farsi/tokenizer_config.json b/Farsi/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Farsi/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Farsi/vocab.json b/Farsi/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..3ab332b96d95f3b8bb06d8bbb6d6a98fb5972d22 --- /dev/null +++ b/Farsi/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"m":4,"a̟":5,"b":6,"s":7,"e":8,"r":9,"j":10,"h":11,"t̠ʃ":12,"kʰ":13,"d̪":14,"n̪":15,"z":16,"ʃ":17,"ɡ":18,"i":19,"u":20,"o":21,"f":22,"t̪ʰ":23,"ɑ":24,"d̠ʒ":25,"v":26,"pʰ":27,"l":28,"w":29,"ɢ":30} \ No newline at end of file diff --git a/French/.gitattributes b/French/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/French/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/French/README.md b/French/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/French/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/French/added_tokens.json b/French/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..9a6e8091709a1384a0e5c220579d695bf8504888 --- /dev/null +++ b/French/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 44 +} diff --git a/French/special_tokens_map.json b/French/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/French/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/French/tokenizer.json b/French/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..8369a41339817f9ef945761844d45326b7f8500b --- /dev/null +++ b/French/tokenizer.json @@ -0,0 +1,156 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "m": 4, + "a": 5, + "ɑ̃": 6, + "d": 7, + "ɔ": 8, + "n": 9, + "b": 10, + "ʁ": 11, + "ə": 12, + "ɡ": 13, + "ʒ": 14, + "i": 15, + "v": 16, + "t": 17, + "k": 18, + "o": 19, + "ɛ̃": 20, + "w": 21, + "y": 22, + "j": 23, + "e": 24, + "ɔ̃": 25, + "p": 26, + "ɛ": 27, + "f": 28, + "s": 29, + "z": 30, + "l": 31, + "u": 32, + "ʃ": 33, + "œ": 34, + "ø": 35, + "ɲ": 36, + "t̠ʃ": 37, + "d̠ʒ": 38 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/French/tokenizer_config.json b/French/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/French/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/French/vocab.json b/French/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..2471ecaab8cd646a1139ff621f701ce8867685a9 --- /dev/null +++ b/French/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"m":4,"a":5,"ɑ̃":6,"d":7,"ɔ":8,"n":9,"b":10,"ʁ":11,"ə":12,"ɡ":13,"ʒ":14,"i":15,"v":16,"t":17,"k":18,"o":19,"ɛ̃":20,"w":21,"y":22,"j":23,"e":24,"ɔ̃":25,"p":26,"ɛ":27,"f":28,"s":29,"z":30,"l":31,"u":32,"ʃ":33,"œ":34,"ø":35,"ɲ":36,"t̠ʃ":37,"d̠ʒ":38} \ No newline at end of file diff --git a/German/.gitattributes b/German/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/German/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/German/README.md b/German/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/German/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/German/added_tokens.json b/German/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..4e63d7e4bd7282359e3ce957e01e29a1c1265211 --- /dev/null +++ b/German/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 56 +} diff --git a/German/special_tokens_map.json b/German/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/German/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/German/tokenizer.json b/German/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..bb84b037c35c509567e630f5f2d95fcc3a4e8acc --- /dev/null +++ b/German/tokenizer.json @@ -0,0 +1,162 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "aː": 4, + "oː": 5, + "a": 6, + "b": 7, + "x": 8, + "v": 9, + "øː": 10, + "n": 11, + "ɛː": 12, + "f": 13, + "l": 14, + "iː": 15, + "yː": 16, + "j": 17, + "uː": 18, + "h": 19, + "ʊ": 20, + "m": 21, + "ɔ": 22, + "ɪ": 23, + "eː": 24, + "ə": 25, + "d̺": 26, + "t̺ʰ": 27, + "ɛ": 28, + "ŋ": 29, + "ç": 30, + "œ": 31, + "kʰ": 32, + "ʀ": 33, + "ɡ": 34, + "pʰ": 35, + "ʏ": 36, + "s": 37, + "z": 38, + "ts": 39, + "ʃ": 40, + "ɐ": 41, + "pf": 42, + "t̠ʃ": 43, + "d̠ʒ": 44 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/German/tokenizer_config.json b/German/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/German/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/German/vocab.json b/German/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..9986a47024c76582fbfeb1af3b23d5c6084084f1 --- /dev/null +++ b/German/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"aː":4,"oː":5,"a":6,"b":7,"x":8,"v":9,"øː":10,"n":11,"ɛː":12,"f":13,"l":14,"iː":15,"yː":16,"j":17,"uː":18,"h":19,"ʊ":20,"m":21,"ɔ":22,"ɪ":23,"eː":24,"ə":25,"d̺":26,"t̺ʰ":27,"ɛ":28,"ŋ":29,"ç":30,"œ":31,"kʰ":32,"ʀ":33,"ɡ":34,"pʰ":35,"ʏ":36,"s":37,"z":38,"ts":39,"ʃ":40,"ɐ":41,"pf":42,"t̠ʃ":43,"d̠ʒ":44} \ No newline at end of file diff --git a/Hungarian/.gitattributes b/Hungarian/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Hungarian/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Hungarian/README.md b/Hungarian/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Hungarian/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Hungarian/added_tokens.json b/Hungarian/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..ed3360c4935f5ee7ddf3fb06e0c313f86da320d8 --- /dev/null +++ b/Hungarian/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 55 +} diff --git a/Hungarian/special_tokens_map.json b/Hungarian/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Hungarian/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Hungarian/tokenizer.json b/Hungarian/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..841476489072874d3344486e3e232b14e55732c4 --- /dev/null +++ b/Hungarian/tokenizer.json @@ -0,0 +1,182 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "i": 4, + "d̪": 5, + "ɛ": 6, + "b": 7, + "aː": 8, + "t̠ʃ": 9, + "m": 10, + "l̪": 11, + "s̻": 12, + "z̻": 13, + "ɡ": 14, + "k": 15, + "o": 16, + "ɑ": 17, + "t̪ː": 18, + "j": 19, + "ø": 20, + "n̪": 21, + "ɲ": 22, + "u": 23, + "t̻s̻": 24, + "y": 25, + "r̪": 26, + "h": 27, + "oː": 28, + "v": 29, + "d̠ʒ": 30, + "t̪": 31, + "eː": 32, + "ʃ": 33, + "ɟʝ": 34, + "s̻ː": 35, + "p": 36, + "øː": 37, + "mː": 38, + "z̻ː": 39, + "l̪ː": 40, + "f": 41, + "ɟʝː": 42, + "uː": 43, + "n̪ː": 44, + "iː": 45, + "ɲː": 46, + "ʃː": 47, + "r̪ː": 48, + "kː": 49, + "ŋ": 50, + "t̠ʃː": 51, + "jː": 52, + "bː": 53, + "cç": 54, + "t̻s̻ː": 55, + "d̪ː": 56, + "ɡː": 57, + "pː": 58, + "ʒ": 59, + "vː": 60, + "cçː": 61, + "fː": 62, + "hː": 63, + "yː": 64 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Hungarian/tokenizer_config.json b/Hungarian/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Hungarian/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Hungarian/vocab.json b/Hungarian/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..cf891f8dff077b7e40a98547a62dd413d883841d --- /dev/null +++ b/Hungarian/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"i":4,"d̪":5,"ɛ":6,"b":7,"aː":8,"t̠ʃ":9,"m":10,"l̪":11,"s̻":12,"z̻":13,"ɡ":14,"k":15,"o":16,"ɑ":17,"t̪ː":18,"j":19,"ø":20,"n̪":21,"ɲ":22,"u":23,"t̻s̻":24,"y":25,"r̪":26,"h":27,"oː":28,"v":29,"d̠ʒ":30,"t̪":31,"eː":32,"ʃ":33,"ɟʝ":34,"s̻ː":35,"p":36,"øː":37,"mː":38,"z̻ː":39,"l̪ː":40,"f":41,"ɟʝː":42,"uː":43,"n̪ː":44,"iː":45,"ɲː":46,"ʃː":47,"r̪ː":48,"kː":49,"ŋ":50,"t̠ʃː":51,"jː":52,"bː":53,"cç":54,"t̻s̻ː":55,"d̪ː":56,"ɡː":57,"pː":58,"ʒ":59,"vː":60,"cçː":61,"fː":62,"hː":63,"yː":64} \ No newline at end of file diff --git a/Icelandic/.gitattributes b/Icelandic/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Icelandic/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Icelandic/README.md b/Icelandic/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Icelandic/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Icelandic/added_tokens.json b/Icelandic/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..ef00004cc13e059472f0294cdfabbcf80546b84e --- /dev/null +++ b/Icelandic/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 54 +} diff --git a/Icelandic/special_tokens_map.json b/Icelandic/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Icelandic/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Icelandic/tokenizer.json b/Icelandic/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..4a58a112fa9bca1ed96d0cf923dc60315f01dc44 --- /dev/null +++ b/Icelandic/tokenizer.json @@ -0,0 +1,175 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "pʰ": 4, + "iː": 5, + "i": 6, + "aː": 7, + "r̥": 8, + "ɪ": 9, + "ɛ": 10, + "t̪ʰ": 11, + "s̺": 12, + "j": 13, + "ä": 14, + "k": 15, + "ʋ": 16, + "ɛː": 17, + "r": 18, + "ei̯": 19, + "θ̻": 20, + "l": 21, + "n̪": 22, + "t̪": 23, + "ɬ": 24, + "uː": 25, + "ð̺̞": 26, + "ɡ": 27, + "c": 28, + "h": 29, + "ɔ": 30, + "n̪̥": 31, + "äu̯": 32, + "ŋ̥": 33, + "ʏ": 34, + "m": 35, + "f": 36, + "ɔː": 37, + "x": 38, + "cʰ": 39, + "ou̯": 40, + "p": 41, + "ŋ": 42, + "øɪ̯": 43, + "äi̯": 44, + "ɰ": 45, + "ʏː": 46, + "u": 47, + "ɪː": 48, + "œ": 49, + "ç": 50, + "ə": 51, + "œː": 52, + "ɲ": 53, + "m̥": 54, + "ɔi̯": 55, + "z": 56, + "ɲ̥": 57 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Icelandic/tokenizer_config.json b/Icelandic/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Icelandic/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Icelandic/vocab.json b/Icelandic/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..e41464442ca5233ae7db89717d0d6619677f1541 --- /dev/null +++ b/Icelandic/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"pʰ":4,"iː":5,"i":6,"aː":7,"r̥":8,"ɪ":9,"ɛ":10,"t̪ʰ":11,"s̺":12,"j":13,"ä":14,"k":15,"ʋ":16,"ɛː":17,"r":18,"ei̯":19,"θ̻":20,"l":21,"n̪":22,"t̪":23,"ɬ":24,"uː":25,"ð̺̞":26,"ɡ":27,"c":28,"h":29,"ɔ":30,"n̪̥":31,"äu̯":32,"ŋ̥":33,"ʏ":34,"m":35,"f":36,"ɔː":37,"x":38,"cʰ":39,"ou̯":40,"p":41,"ŋ":42,"øɪ̯":43,"äi̯":44,"ɰ":45,"ʏː":46,"u":47,"ɪː":48,"œ":49,"ç":50,"ə":51,"œː":52,"ɲ":53,"m̥":54,"ɔi̯":55,"z":56,"ɲ̥":57} \ No newline at end of file diff --git a/Indonesian/.gitattributes b/Indonesian/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Indonesian/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Indonesian/README.md b/Indonesian/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Indonesian/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Indonesian/added_tokens.json b/Indonesian/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..670c79b0d6ef88e9bb0d7778a9cb047babf38484 --- /dev/null +++ b/Indonesian/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 41 +} diff --git a/Indonesian/special_tokens_map.json b/Indonesian/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Indonesian/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Indonesian/tokenizer.json b/Indonesian/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f47cee8f3938210bf990747730d1470d052cec1a --- /dev/null +++ b/Indonesian/tokenizer.json @@ -0,0 +1,148 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "s": 4, + "i": 5, + "n": 6, + "m": 7, + "a": 8, + "j": 9, + "u": 10, + "k": 11, + "o": 12, + "h": 13, + "l": 14, + "t": 15, + "w": 16, + "d̠ʒ": 17, + "ŋ": 18, + "ə": 19, + "d": 20, + "p": 21, + "ɡ": 22, + "b": 23, + "r": 24, + "ɲ": 25, + "t̠ʃ": 26, + "f": 27, + "z": 28, + "ʃ": 29, + "x": 30 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Indonesian/tokenizer_config.json b/Indonesian/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Indonesian/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Indonesian/vocab.json b/Indonesian/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..8b94ac5a898bb0640a5ca1123876126507b5f5ca --- /dev/null +++ b/Indonesian/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"s":4,"i":5,"n":6,"m":7,"a":8,"j":9,"u":10,"k":11,"o":12,"h":13,"l":14,"t":15,"w":16,"d̠ʒ":17,"ŋ":18,"ə":19,"d":20,"p":21,"ɡ":22,"b":23,"r":24,"ɲ":25,"t̠ʃ":26,"f":27,"z":28,"ʃ":29,"x":30} \ No newline at end of file diff --git a/Irish/.gitattributes b/Irish/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Irish/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Irish/README.md b/Irish/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Irish/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Irish/added_tokens.json b/Irish/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..8463d05e7c1a8891821a262143d9a349ef7da72e --- /dev/null +++ b/Irish/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 66 +} diff --git a/Irish/special_tokens_map.json b/Irish/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Irish/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Irish/tokenizer.json b/Irish/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..de5c678a1c9403291999bd5c1b8019ecb5e3faf6 --- /dev/null +++ b/Irish/tokenizer.json @@ -0,0 +1,171 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "kʰ": 4, + "a": 5, + "ɾ̪ʲ": 6, + "d̪ˠ": 7, + "eː": 8, + "ʃ": 9, + "ɪ": 10, + "n̪ˠ": 11, + "ə": 12, + "w": 13, + "l̪ˠ": 14, + "ɛ̝": 15, + "ɡ": 16, + "ɾ̪ˠ": 17, + "mˠ": 18, + "x": 19, + "iː": 20, + "sˠ": 21, + "bˠ": 22, + "pˠʰ": 23, + "t̪ʲʰ": 24, + "ɔ̝": 25, + "cʰ": 26, + "t̪ˠʰ": 27, + "h": 28, + "vˠ": 29, + "ʊ": 30, + "j": 31, + "oː": 32, + "ɑː": 33, + "fˠ": 34, + "d̠ʒ": 35, + "l̪ʲ": 36, + "iːə": 37, + "uːe": 38, + "uː": 39, + "n̪ʲ": 40, + "d̪ʲ": 41, + "ɐ": 42, + "mʲ": 43, + "pʲʰ": 44, + "ɣ": 45, + "ɐɪ": 46, + "ŋ": 47, + "i̞": 48, + "ç": 49, + "z": 50, + "fʲ": 51, + "ʒ": 52, + "bʲ": 53 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Irish/tokenizer_config.json b/Irish/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Irish/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Irish/vocab.json b/Irish/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..5786f979abc1d824413dd431d591e0f9874595f9 --- /dev/null +++ b/Irish/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"kʰ":4,"a":5,"ɾ̪ʲ":6,"d̪ˠ":7,"eː":8,"ʃ":9,"ɪ":10,"n̪ˠ":11,"ə":12,"w":13,"l̪ˠ":14,"ɛ̝":15,"ɡ":16,"ɾ̪ˠ":17,"mˠ":18,"x":19,"iː":20,"sˠ":21,"bˠ":22,"pˠʰ":23,"t̪ʲʰ":24,"ɔ̝":25,"cʰ":26,"t̪ˠʰ":27,"h":28,"vˠ":29,"ʊ":30,"j":31,"oː":32,"ɑː":33,"fˠ":34,"d̠ʒ":35,"l̪ʲ":36,"iːə":37,"uːe":38,"uː":39,"n̪ʲ":40,"d̪ʲ":41,"ɐ":42,"mʲ":43,"pʲʰ":44,"ɣ":45,"ɐɪ":46,"ŋ":47,"i̞":48,"ç":49,"z":50,"fʲ":51,"ʒ":52,"bʲ":53} \ No newline at end of file diff --git a/Italian/.gitattributes b/Italian/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Italian/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Italian/README.md b/Italian/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Italian/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Italian/added_tokens.json b/Italian/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..d46d8c657b30f16aca565eecf8b5b8dbeb7dd552 --- /dev/null +++ b/Italian/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 60 +} diff --git a/Italian/special_tokens_map.json b/Italian/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Italian/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Italian/tokenizer.json b/Italian/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..2840c8268845111e2326e21631ccef7b62599f94 --- /dev/null +++ b/Italian/tokenizer.json @@ -0,0 +1,174 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "ɛ": 4, + "kː": 5, + "o": 6, + "pː": 7, + "l": 8, + "ɐ": 9, + "n": 10, + "i": 11, + "m": 12, + "k": 13, + "s": 14, + "t": 15, + "ɔ": 16, + "z": 17, + "f": 18, + "v": 19, + "e": 20, + "d": 21, + "j": 22, + "t̠ʃ": 23, + "b": 24, + "w": 25, + "ɛː": 26, + "p": 27, + "r": 28, + "u": 29, + "ɡ": 30, + "ʎ": 31, + "d̠ʒ": 32, + "tː": 33, + "ɐː": 34, + "ts": 35, + "dː": 36, + "oː": 37, + "iː": 38, + "sː": 39, + "t̠ʃː": 40, + "ɾ": 41, + "eː": 42, + "dz": 43, + "bː": 44, + "d̠ʒː": 45, + "ɲ": 46, + "tsː": 47, + "ʃ": 48, + "a": 49, + "ɔː": 50, + "dzː": 51, + "ŋ": 52, + "h": 53, + "uː": 54, + "ɡː": 55, + "ʒ": 56 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Italian/tokenizer_config.json b/Italian/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Italian/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Italian/vocab.json b/Italian/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..38956f9bd2a248f9d372e745325ea39cae92e8b5 --- /dev/null +++ b/Italian/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"ɛ":4,"kː":5,"o":6,"pː":7,"l":8,"ɐ":9,"n":10,"i":11,"m":12,"k":13,"s":14,"t":15,"ɔ":16,"z":17,"f":18,"v":19,"e":20,"d":21,"j":22,"t̠ʃ":23,"b":24,"w":25,"ɛː":26,"p":27,"r":28,"u":29,"ɡ":30,"ʎ":31,"d̠ʒ":32,"tː":33,"ɐː":34,"ts":35,"dː":36,"oː":37,"iː":38,"sː":39,"t̠ʃː":40,"ɾ":41,"eː":42,"dz":43,"bː":44,"d̠ʒː":45,"ɲ":46,"tsː":47,"ʃ":48,"a":49,"ɔː":50,"dzː":51,"ŋ":52,"h":53,"uː":54,"ɡː":55,"ʒ":56} \ No newline at end of file diff --git a/Japanese/.gitattributes b/Japanese/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Japanese/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Japanese/README.md b/Japanese/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Japanese/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Japanese/added_tokens.json b/Japanese/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..8da40da5ff93186649f03c9fc0af239f77a4c360 --- /dev/null +++ b/Japanese/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 38 +} diff --git a/Japanese/special_tokens_map.json b/Japanese/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Japanese/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Japanese/tokenizer.json b/Japanese/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..95858452a2c7bd2841da3e92ce16ef8b8f434396 --- /dev/null +++ b/Japanese/tokenizer.json @@ -0,0 +1,157 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "kʲ": 4, + "aː": 5, + "o": 6, + "ts": 7, + "ɯ": 8, + "k": 9, + "a": 10, + "i": 11, + "w": 12, + "d̠ʒ": 13, + "t": 14, + "e": 15, + "n": 16, + "ʃ": 17, + "d": 18, + "b": 19, + "s": 20, + "m": 21, + "h": 22, + "ɾ": 23, + "t̠ʃ": 24, + "ɯː": 25, + "p": 26, + "j": 27, + "ɡʲ": 28, + "ɸ": 29, + "ɡ": 30, + "oː": 31, + "ɲ": 32, + "z": 33, + "eː": 34, + "pʲ": 35, + "ɾʲ": 36, + "ç": 37, + "bʲ": 38, + "mʲ": 39 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Japanese/tokenizer_config.json b/Japanese/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Japanese/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Japanese/vocab.json b/Japanese/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..9b568f9d25f492a50cb1cb57b6006e226177fc38 --- /dev/null +++ b/Japanese/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"kʲ":4,"aː":5,"o":6,"ts":7,"ɯ":8,"k":9,"a":10,"i":11,"w":12,"d̠ʒ":13,"t":14,"e":15,"n":16,"ʃ":17,"d":18,"b":19,"s":20,"m":21,"h":22,"ɾ":23,"t̠ʃ":24,"ɯː":25,"p":26,"j":27,"ɡʲ":28,"ɸ":29,"ɡ":30,"oː":31,"ɲ":32,"z":33,"eː":34,"pʲ":35,"ɾʲ":36,"ç":37,"bʲ":38,"mʲ":39} \ No newline at end of file diff --git a/Korean/.gitattributes b/Korean/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Korean/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Korean/README.md b/Korean/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Korean/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Korean/added_tokens.json b/Korean/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..b256316a61c8918eaeca1e8cc76aa2ee66042e3f --- /dev/null +++ b/Korean/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 35 +} diff --git a/Korean/special_tokens_map.json b/Korean/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Korean/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Korean/tokenizer.json b/Korean/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..4cd26b8cba5660e4bd31a22f64bde19546e0953c --- /dev/null +++ b/Korean/tokenizer.json @@ -0,0 +1,150 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "i": 4, + "ɾ": 5, + "ɯ": 6, + "m": 7, + "a": 8, + "u": 9, + "j": 10, + "ɤ̞": 11, + "ɡ": 12, + "ŋ": 13, + "h": 14, + "æ": 15, + "p": 16, + "o": 17, + "dʑ": 18, + "w": 19, + "n̪": 20, + "d": 21, + "e": 22, + "l": 23, + "t̠ʃ": 24, + "b": 25, + "s̪": 26, + "k": 27, + "t̪": 28, + "pʰ": 29, + "kʰ": 30, + "ɯi": 31, + "t̠ʃʰ": 32 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Korean/tokenizer_config.json b/Korean/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Korean/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Korean/vocab.json b/Korean/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..fb2494d3b5f743d94db034cf9e76469ac53a564d --- /dev/null +++ b/Korean/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"i":4,"ɾ":5,"ɯ":6,"m":7,"a":8,"u":9,"j":10,"ɤ̞":11,"ɡ":12,"ŋ":13,"h":14,"æ":15,"p":16,"o":17,"dʑ":18,"w":19,"n̪":20,"d":21,"e":22,"l":23,"t̠ʃ":24,"b":25,"s̪":26,"k":27,"t̪":28,"pʰ":29,"kʰ":30,"ɯi":31,"t̠ʃʰ":32} \ No newline at end of file diff --git a/Mandarin/.gitattributes b/Mandarin/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Mandarin/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Mandarin/README.md b/Mandarin/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Mandarin/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Mandarin/added_tokens.json b/Mandarin/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..48157e5b67a9156b729d96e0fda85215b712d141 --- /dev/null +++ b/Mandarin/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 117 +} diff --git a/Mandarin/special_tokens_map.json b/Mandarin/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Mandarin/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Mandarin/tokenizer.json b/Mandarin/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..bde39543251170a9ec815f9656e9ac15e296f0e1 --- /dev/null +++ b/Mandarin/tokenizer.json @@ -0,0 +1,232 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "WhitespaceSplit" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "a˧˥": 4, + "u˧˥": 5, + "a˥": 6, + "au": 7, + "n": 8, + "a˥˩": 9, + "ʃ̺": 10, + "ɻ̩˥˩": 11, + "ə˧˥": 12, + "m": 13, + "ɤ": 14, + "p": 15, + "j": 16, + "e˧˥": 17, + "kʰ": 18, + "k": 19, + "ɤ˥˩": 20, + "w": 21, + "o˥": 22, + "t̠ʃ̺ʰ": 23, + "ə˥": 24, + "ŋ": 25, + "t": 26, + "ʊ˥": 27, + "ɕ": 28, + "i": 29, + "a": 30, + "l": 31, + "au˧˩˧": 32, + "x": 33, + "u˧˩˧": 34, + "i˥": 35, + "ei˧˩˧": 36, + "pʰ": 37, + "i˧˥": 38, + "ai˧˥": 39, + "ou˧˩˧": 40, + "ɤ˧˥": 41, + "o˧˩˧": 42, + "tɕ": 43, + "au˥˩": 44, + "ts": 45, + "ə˧˩˧": 46, + "ɤ˥": 47, + "ei˧˥": 48, + "ʊ˧˥": 49, + "i˧˩˧": 50, + "t̠ʃ̺": 51, + "ɻ̩˧˩˧": 52, + "ei˥˩": 53, + "s": 54, + "u˥˩": 55, + "ɹ̪̩": 56, + "ai˥": 57, + "u˥": 58, + "tɕʰ": 59, + "a˧˩˧": 60, + "ai˥˩": 61, + "ɛ˥˩": 62, + "f": 63, + "i˥˩": 64, + "y˥˩": 65, + "au˧˥": 66, + "ɻ": 67, + "ou˥˩": 68, + "e˥": 69, + "tʰ": 70, + "ɹ̪̩˥˩": 71, + "ɛ˧˥": 72, + "au˥": 73, + "ou˧˥": 74, + "e˧˩˧": 75, + "ɛ˥": 76, + "ɻ̩˥": 77, + "ɥ": 78, + "ɹ̪̩˧˩˧": 79, + "ai˧˩˧": 80, + "ou˥": 81, + "o˥˩": 82, + "ɛ˧˩˧": 83, + "ʊ˧˩˧": 84, + "ɔ˥": 85, + "tsʰ": 86, + "ei": 87, + "ə˥˩": 88, + "o": 89, + "ʊ˥˩": 90, + "ou": 91, + "ɤ˧˩˧": 92, + "o˧˥": 93, + "ei˥": 94, + "e˥˩": 95, + "ɚ˧˩˧": 96, + "y˥": 97, + "ɚ˥˩": 98, + "y˧˥": 99, + "ɻ̩": 100, + "y˧˩˧": 101, + "ɹ̪̩˥": 102, + "ɻ̩˧˥": 103, + "u": 104, + "ə": 105, + "ai": 106, + "ʊ": 107, + "e": 108, + "ɚ˧˥": 109, + "ɔ˥˩": 110, + "ɹ̪̩˧˥": 111, + "ɛ": 112, + "y": 113, + "m˧˥": 114 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Mandarin/tokenizer_config.json b/Mandarin/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Mandarin/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Mandarin/vocab.json b/Mandarin/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..6bb491736dd51d7b5d7477fa47e059950d7fdb25 --- /dev/null +++ b/Mandarin/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"a˧˥":4,"u˧˥":5,"a˥":6,"au":7,"n":8,"a˥˩":9,"ʃ̺":10,"ɻ̩˥˩":11,"ə˧˥":12,"m":13,"ɤ":14,"p":15,"j":16,"e˧˥":17,"kʰ":18,"k":19,"ɤ˥˩":20,"w":21,"o˥":22,"t̠ʃ̺ʰ":23,"ə˥":24,"ŋ":25,"t":26,"ʊ˥":27,"ɕ":28,"i":29,"a":30,"l":31,"au˧˩˧":32,"x":33,"u˧˩˧":34,"i˥":35,"ei˧˩˧":36,"pʰ":37,"i˧˥":38,"ai˧˥":39,"ou˧˩˧":40,"ɤ˧˥":41,"o˧˩˧":42,"tɕ":43,"au˥˩":44,"ts":45,"ə˧˩˧":46,"ɤ˥":47,"ei˧˥":48,"ʊ˧˥":49,"i˧˩˧":50,"t̠ʃ̺":51,"ɻ̩˧˩˧":52,"ei˥˩":53,"s":54,"u˥˩":55,"ɹ̪̩":56,"ai˥":57,"u˥":58,"tɕʰ":59,"a˧˩˧":60,"ai˥˩":61,"ɛ˥˩":62,"f":63,"i˥˩":64,"y˥˩":65,"au˧˥":66,"ɻ":67,"ou˥˩":68,"e˥":69,"tʰ":70,"ɹ̪̩˥˩":71,"ɛ˧˥":72,"au˥":73,"ou˧˥":74,"e˧˩˧":75,"ɛ˥":76,"ɻ̩˥":77,"ɥ":78,"ɹ̪̩˧˩˧":79,"ai˧˩˧":80,"ou˥":81,"o˥˩":82,"ɛ˧˩˧":83,"ʊ˧˩˧":84,"ɔ˥":85,"tsʰ":86,"ei":87,"ə˥˩":88,"o":89,"ʊ˥˩":90,"ou":91,"ɤ˧˩˧":92,"o˧˥":93,"ei˥":94,"e˥˩":95,"ɚ˧˩˧":96,"y˥":97,"ɚ˥˩":98,"y˧˥":99,"ɻ̩":100,"y˧˩˧":101,"ɹ̪̩˥":102,"ɻ̩˧˥":103,"u":104,"ə":105,"ai":106,"ʊ":107,"e":108,"ɚ˧˥":109,"ɔ˥˩":110,"ɹ̪̩˧˥":111,"ɛ":112,"y":113,"m˧˥":114} \ No newline at end of file diff --git a/Norwegian/.gitattributes b/Norwegian/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Norwegian/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Norwegian/README.md b/Norwegian/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Norwegian/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Norwegian/added_tokens.json b/Norwegian/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..8dd7be93aea013162779ff86a346958694f973d7 --- /dev/null +++ b/Norwegian/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 59 +} diff --git a/Norwegian/special_tokens_map.json b/Norwegian/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Norwegian/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Norwegian/tokenizer.json b/Norwegian/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9d0bba92e7724ea9ed3f5c99fa6b6a71df7bf6df --- /dev/null +++ b/Norwegian/tokenizer.json @@ -0,0 +1,171 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "t̪ʰ": 4, + "ɑ": 5, + "kː": 6, + "ʋ": 7, + "a": 8, + "ɾ": 9, + "ʃ": 10, + "o̞ː": 11, + "ɡ": 12, + "uː": 13, + "d̪": 14, + "eː": 15, + "e̞": 16, + "s": 17, + "h": 18, + "ʉː": 19, + "tː": 20, + "n̪": 21, + "pː": 22, + "ə": 23, + "l": 24, + "ɪ": 25, + "b": 26, + "iː": 27, + "æ": 28, + "j": 29, + "kʰ": 30, + "ʉ": 31, + "ɒ̝": 32, + "m": 33, + "ø̞ː": 34, + "f": 35, + "yː": 36, + "ai": 37, + "pʰ": 38, + "øy": 39, + "ŋ": 40, + "dː": 41, + "œ": 42, + "bː": 43, + "ç": 44, + "æː": 45, + "ɑː": 46, + "ʏ": 47, + "æʉ": 48, + "ʊ": 49, + "ɡː": 50, + "ɔy": 51, + "ʂ": 52, + "w": 53 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Norwegian/tokenizer_config.json b/Norwegian/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Norwegian/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Norwegian/vocab.json b/Norwegian/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..d073a2aebece4b61067c1a63c20e220b5ab96052 --- /dev/null +++ b/Norwegian/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"t̪ʰ":4,"ɑ":5,"kː":6,"ʋ":7,"a":8,"ɾ":9,"ʃ":10,"o̞ː":11,"ɡ":12,"uː":13,"d̪":14,"eː":15,"e̞":16,"s":17,"h":18,"ʉː":19,"tː":20,"n̪":21,"pː":22,"ə":23,"l":24,"ɪ":25,"b":26,"iː":27,"æ":28,"j":29,"kʰ":30,"ʉ":31,"ɒ̝":32,"m":33,"ø̞ː":34,"f":35,"yː":36,"ai":37,"pʰ":38,"øy":39,"ŋ":40,"dː":41,"œ":42,"bː":43,"ç":44,"æː":45,"ɑː":46,"ʏ":47,"æʉ":48,"ʊ":49,"ɡː":50,"ɔy":51,"ʂ":52,"w":53} \ No newline at end of file diff --git a/Polish/.gitattributes b/Polish/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Polish/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Polish/README.md b/Polish/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Polish/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Polish/special_tokens_map.json b/Polish/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Polish/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Polish/tokenizer.json b/Polish/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ae540657a817beb82468694a9cc16d71be0b355 --- /dev/null +++ b/Polish/tokenizer.json @@ -0,0 +1,160 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "e": 4, + "d̪": 5, + "l̪": 6, + "v": 7, + "o": 8, + "w": 9, + "a": 10, + "j": 11, + "b": 12, + "r": 13, + "ɲ": 14, + "i": 15, + "ɕ": 16, + "u": 17, + "x": 18, + "tɕ": 19, + "t̪": 20, + "k": 21, + "p": 22, + "ɨ": 23, + "dʑ": 24, + "z̪": 25, + "n̪": 26, + "f": 27, + "ʑ": 28, + "m": 29, + "z̻": 30, + "s̻": 31, + "t̻s̻": 32, + "t̪s̪": 33, + "ɡ": 34, + "s̪": 35, + "ŋ": 36, + "kʲ": 37, + "t": 38, + "ɡʲ": 39, + "ɣ": 40, + "ẽ": 41, + "d̻z̻": 42 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Polish/tokenizer_config.json b/Polish/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Polish/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Polish/vocab.json b/Polish/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..87beef69d550700a0998c1d79f275489c7e907e6 --- /dev/null +++ b/Polish/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"e":4,"d̪":5,"l̪":6,"v":7,"o":8,"w":9,"a":10,"j":11,"b":12,"r":13,"ɲ":14,"i":15,"ɕ":16,"u":17,"x":18,"tɕ":19,"t̪":20,"k":21,"p":22,"ɨ":23,"dʑ":24,"z̪":25,"n̪":26,"f":27,"ʑ":28,"m":29,"z̻":30,"s̻":31,"t̻s̻":32,"t̪s̪":33,"ɡ":34,"s̪":35,"ŋ":36,"kʲ":37,"t":38,"ɡʲ":39,"ɣ":40,"ẽ":41,"d̻z̻":42} \ No newline at end of file diff --git a/PortugueseBr/.gitattributes b/PortugueseBr/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/PortugueseBr/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/PortugueseBr/README.md b/PortugueseBr/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/PortugueseBr/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/PortugueseBr/added_tokens.json b/PortugueseBr/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..348c85742bbb564b92ef6cc49149635926897725 --- /dev/null +++ b/PortugueseBr/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 52 +} diff --git a/PortugueseBr/special_tokens_map.json b/PortugueseBr/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/PortugueseBr/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/PortugueseBr/tokenizer.json b/PortugueseBr/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..e7c0e86057b98488e1ffc4be4c4b8833318b6b14 --- /dev/null +++ b/PortugueseBr/tokenizer.json @@ -0,0 +1,168 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "m": 4, + "a": 5, + "s̪": 6, + "k": 7, + "ɛ": 8, + "ɾ": 9, + "u": 10, + "b": 11, + "e": 12, + "aʊ̯": 13, + "ɡ": 14, + "ɐ": 15, + "oɪ̯": 16, + "z": 17, + "i": 18, + "õ": 19, + "t̪": 20, + "eʊ̯": 21, + "n̪": 22, + "v": 23, + "d̪": 24, + "ɐ̃ʊ̯̃": 25, + "eɪ̯": 26, + "d̠ʒ": 27, + "ẽɪ̯̃": 28, + "p": 29, + "r": 30, + "ɔ": 31, + "o": 32, + "l": 33, + "ɐ̃": 34, + "ĩ": 35, + "f": 36, + "ɲ": 37, + "ũ": 38, + "uɪ̯": 39, + "w": 40, + "ʒ": 41, + "iʊ̯": 42, + "ʃ": 43, + "oʊ̯": 44, + "aɪ̯": 45, + "ɔɪ̯": 46, + "ɣ": 47, + "ɛɪ̯": 48, + "ɛʊ̯": 49, + "ɪ̯": 50 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/PortugueseBr/tokenizer_config.json b/PortugueseBr/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/PortugueseBr/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/PortugueseBr/vocab.json b/PortugueseBr/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..0fa5b78d53c6e619b0509f72bf046c69da146942 --- /dev/null +++ b/PortugueseBr/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"m":4,"a":5,"s̪":6,"k":7,"ɛ":8,"ɾ":9,"u":10,"b":11,"e":12,"aʊ̯":13,"ɡ":14,"ɐ":15,"oɪ̯":16,"z":17,"i":18,"õ":19,"t̪":20,"eʊ̯":21,"n̪":22,"v":23,"d̪":24,"ɐ̃ʊ̯̃":25,"eɪ̯":26,"d̠ʒ":27,"ẽɪ̯̃":28,"p":29,"r":30,"ɔ":31,"o":32,"l":33,"ɐ̃":34,"ĩ":35,"f":36,"ɲ":37,"ũ":38,"uɪ̯":39,"w":40,"ʒ":41,"iʊ̯":42,"ʃ":43,"oʊ̯":44,"aɪ̯":45,"ɔɪ̯":46,"ɣ":47,"ɛɪ̯":48,"ɛʊ̯":49,"ɪ̯":50} \ No newline at end of file diff --git a/PortuguesePt/.gitattributes b/PortuguesePt/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/PortuguesePt/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/PortuguesePt/README.md b/PortuguesePt/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/PortuguesePt/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/PortuguesePt/added_tokens.json b/PortuguesePt/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..db6f09e14103e0fd1ae90a5c6c73b19ee843d2f0 --- /dev/null +++ b/PortuguesePt/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 61 +} diff --git a/PortuguesePt/special_tokens_map.json b/PortuguesePt/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/PortuguesePt/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/PortuguesePt/tokenizer.json b/PortuguesePt/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..54a51047797a2f7be790018ae09fe210bf194437 --- /dev/null +++ b/PortuguesePt/tokenizer.json @@ -0,0 +1,169 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "ɔ": 4, + "l̪ˠ": 5, + "a": 6, + "p": 7, + "ɐ": 8, + "i": 9, + "n̪": 10, + "e": 11, + "ʃ": 12, + "f": 13, + "ɾ": 14, + "ɐ̃": 15, + "d̪": 16, + "m": 17, + "ʒ": 18, + "b": 19, + "ɯ": 20, + "ɛ": 21, + "ɐ̃i": 22, + "ʁ": 23, + "t̪": 24, + "s": 25, + "o": 26, + "ɐ̃u̜": 27, + "ũ": 28, + "ɡ": 29, + "u": 30, + "k": 31, + "z": 32, + "au̜": 33, + "ai": 34, + "eu̜": 35, + "ɐi": 36, + "ɲ": 37, + "ɛu̜": 38, + "ĩ": 39, + "ũi": 40, + "ɔi": 41, + "õ": 42, + "õi": 43, + "ẽ": 44, + "v": 45, + "oi": 46, + "ʎ": 47, + "iu̜": 48, + "ui": 49, + "ɛi": 50, + "ts": 51 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/PortuguesePt/tokenizer_config.json b/PortuguesePt/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/PortuguesePt/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/PortuguesePt/vocab.json b/PortuguesePt/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..eed8073e2f1cb4ec378ba0dbe3cc8222008fc393 --- /dev/null +++ b/PortuguesePt/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"ɔ":4,"l̪ˠ":5,"a":6,"p":7,"ɐ":8,"i":9,"n̪":10,"e":11,"ʃ":12,"f":13,"ɾ":14,"ɐ̃":15,"d̪":16,"m":17,"ʒ":18,"b":19,"ɯ":20,"ɛ":21,"ɐ̃i":22,"ʁ":23,"t̪":24,"s":25,"o":26,"ɐ̃u̜":27,"ũ":28,"ɡ":29,"u":30,"k":31,"z":32,"au̜":33,"ai":34,"eu̜":35,"ɐi":36,"ɲ":37,"ɛu̜":38,"ĩ":39,"ũi":40,"ɔi":41,"õ":42,"õi":43,"ẽ":44,"v":45,"oi":46,"ʎ":47,"iu̜":48,"ui":49,"ɛi":50,"ts":51} \ No newline at end of file diff --git a/Quechua/.gitattributes b/Quechua/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Quechua/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Quechua/README.md b/Quechua/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Quechua/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Quechua/added_tokens.json b/Quechua/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..5572cc119e65e5e3951dd3330b93c0f864ee62fb --- /dev/null +++ b/Quechua/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 34 +} diff --git a/Quechua/special_tokens_map.json b/Quechua/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Quechua/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Quechua/tokenizer.json b/Quechua/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a24893772d1e4034e808ae9754165725383087e1 --- /dev/null +++ b/Quechua/tokenizer.json @@ -0,0 +1,153 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "ɛ": 4, + "l": 5, + "β": 6, + "ɪ": 7, + "n": 8, + "a": 9, + "s": 10, + "t": 11, + "r": 12, + "d": 13, + "aː": 14, + "t̠ʃ": 15, + "m": 16, + "ɔ": 17, + "h": 18, + "p": 19, + "ʊ": 20, + "ɡ": 21, + "k": 22, + "q": 23, + "f": 24, + "j": 25, + "w": 26, + "ʎ": 27, + "pʼ": 28, + "ʔ": 29, + "tʼ": 30, + "t̠ʃʼ": 31, + "kʼ": 32, + "ɪː": 33, + "qʼ": 34, + "ɛː": 35 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Quechua/tokenizer_config.json b/Quechua/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Quechua/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Quechua/vocab.json b/Quechua/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..33809cf6fa772d8a89c866adb527103944d8caeb --- /dev/null +++ b/Quechua/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"ɛ":4,"l":5,"β":6,"ɪ":7,"n":8,"a":9,"s":10,"t":11,"r":12,"d":13,"aː":14,"t̠ʃ":15,"m":16,"ɔ":17,"h":18,"p":19,"ʊ":20,"ɡ":21,"k":22,"q":23,"f":24,"j":25,"w":26,"ʎ":27,"pʼ":28,"ʔ":29,"tʼ":30,"t̠ʃʼ":31,"kʼ":32,"ɪː":33,"qʼ":34,"ɛː":35} \ No newline at end of file diff --git a/Romanian/.gitattributes b/Romanian/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Romanian/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Romanian/README.md b/Romanian/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Romanian/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Romanian/added_tokens.json b/Romanian/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..db6f09e14103e0fd1ae90a5c6c73b19ee843d2f0 --- /dev/null +++ b/Romanian/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 61 +} diff --git a/Romanian/special_tokens_map.json b/Romanian/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Romanian/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Romanian/tokenizer.json b/Romanian/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..488a7d5438b1ae62beca7457dbd6dfb861a3fe89 --- /dev/null +++ b/Romanian/tokenizer.json @@ -0,0 +1,180 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "m": 4, + "ä": 5, + "n̪": 6, + "d̠ʒ": 7, + "i": 8, + "v": 9, + "e̞": 10, + "h": 11, + "u": 12, + "ʒ": 13, + "d̪": 14, + "o̞": 15, + "l": 16, + "ɾ̪": 17, + "t̠ʃ": 18, + "p": 19, + "j": 20, + "s̪": 21, + "oʊ": 22, + "t̪": 23, + "aɪ": 24, + "k": 25, + "w": 26, + "ɡ": 27, + "b": 28, + "t̠ʃʲ": 29, + "e̯ä": 30, + "ʃ": 31, + "ʃʲ": 32, + "ə": 33, + "o̯ä": 34, + "ɨ": 35, + "uɪ": 36, + "f": 37, + "t̪s̪": 38, + "z̪": 39, + "əɪ": 40, + "eɪ": 41, + "tsʲ": 42, + "zʲ": 43, + "iɪ": 44, + "aʊ": 45, + "tʲ": 46, + "nʲ": 47, + "eʊ": 48, + "iʊ": 49, + "ɾʲ": 50, + "mʲ": 51, + "bʲ": 52, + "sʲ": 53, + "kʲ": 54, + "lʲ": 55, + "eo": 56, + "d̠ʒʲ": 57, + "dʲ": 58, + "pʲ": 59, + "əʊ": 60, + "fʲ": 61, + "oɪ": 62 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Romanian/tokenizer_config.json b/Romanian/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Romanian/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Romanian/vocab.json b/Romanian/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..26238fcce498ebb7b67a22f7c596224053e541fd --- /dev/null +++ b/Romanian/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"m":4,"ä":5,"n̪":6,"d̠ʒ":7,"i":8,"v":9,"e̞":10,"h":11,"u":12,"ʒ":13,"d̪":14,"o̞":15,"l":16,"ɾ̪":17,"t̠ʃ":18,"p":19,"j":20,"s̪":21,"oʊ":22,"t̪":23,"aɪ":24,"k":25,"w":26,"ɡ":27,"b":28,"t̠ʃʲ":29,"e̯ä":30,"ʃ":31,"ʃʲ":32,"ə":33,"o̯ä":34,"ɨ":35,"uɪ":36,"f":37,"t̪s̪":38,"z̪":39,"əɪ":40,"eɪ":41,"tsʲ":42,"zʲ":43,"iɪ":44,"aʊ":45,"tʲ":46,"nʲ":47,"eʊ":48,"iʊ":49,"ɾʲ":50,"mʲ":51,"bʲ":52,"sʲ":53,"kʲ":54,"lʲ":55,"eo":56,"d̠ʒʲ":57,"dʲ":58,"pʲ":59,"əʊ":60,"fʲ":61,"oɪ":62} \ No newline at end of file diff --git a/Serbian/.gitattributes b/Serbian/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Serbian/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Serbian/README.md b/Serbian/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Serbian/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Serbian/special_tokens_map.json b/Serbian/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Serbian/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Serbian/tokenizer.json b/Serbian/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..3534dcd8119733ce1c710c573f85d49def54ef3c --- /dev/null +++ b/Serbian/tokenizer.json @@ -0,0 +1,151 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "j": 4, + "e̞": 5, + "s̪̻": 6, + "t̪̻": 7, + "u": 8, + "l": 9, + "o̞": 10, + "ʒ̺": 11, + "i": 12, + "ʋ": 13, + "d̪̻": 14, + "ä": 15, + "m": 16, + "n": 17, + "r": 18, + "k": 19, + "t̪̻s̪̻": 20, + "p": 21, + "ʃ̺": 22, + "x": 23, + "b": 24, + "ɡ": 25, + "t̻ʃ̻": 26, + "f": 27, + "z̪̻": 28, + "ɲ": 29, + "ʎ": 30, + "d̻ʒ̻": 31, + "y": 32, + "w": 33 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Serbian/tokenizer_config.json b/Serbian/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Serbian/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Serbian/vocab.json b/Serbian/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..147646ea2e4397f80112d624f240c9167cd81010 --- /dev/null +++ b/Serbian/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"j":4,"e̞":5,"s̪̻":6,"t̪̻":7,"u":8,"l":9,"o̞":10,"ʒ̺":11,"i":12,"ʋ":13,"d̪̻":14,"ä":15,"m":16,"n":17,"r":18,"k":19,"t̪̻s̪̻":20,"p":21,"ʃ̺":22,"x":23,"b":24,"ɡ":25,"t̻ʃ̻":26,"f":27,"z̪̻":28,"ɲ":29,"ʎ":30,"d̻ʒ̻":31,"y":32,"w":33} \ No newline at end of file diff --git a/Spanish/.gitattributes b/Spanish/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Spanish/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Spanish/README.md b/Spanish/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Spanish/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Spanish/added_tokens.json b/Spanish/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..02ad31246cce94ec78bd9aba99df1840f8e80c17 --- /dev/null +++ b/Spanish/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 46 +} diff --git a/Spanish/special_tokens_map.json b/Spanish/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Spanish/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Spanish/tokenizer.json b/Spanish/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..d78f791ada70b614356e12438ea827961ac57982 --- /dev/null +++ b/Spanish/tokenizer.json @@ -0,0 +1,148 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "a": 4, + "i": 5, + "ɾ": 6, + "e̞": 7, + "n": 8, + "k": 9, + "ɲ": 10, + "o̞": 11, + "m": 12, + "s": 13, + "u": 14, + "p": 15, + "d": 16, + "l": 17, + "t": 18, + "β": 19, + "ɡ": 20, + "w": 21, + "ʝ": 22, + "f": 23, + "x": 24, + "j": 25, + "r": 26, + "t̠ʃ": 27, + "ʃ": 28, + "tl": 29, + "ts": 30 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Spanish/tokenizer_config.json b/Spanish/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Spanish/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Spanish/vocab.json b/Spanish/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..e63939d763931eed0ff9e6d10ba29c58a4c91b81 --- /dev/null +++ b/Spanish/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"a":4,"i":5,"ɾ":6,"e̞":7,"n":8,"k":9,"ɲ":10,"o̞":11,"m":12,"s":13,"u":14,"p":15,"d":16,"l":17,"t":18,"β":19,"ɡ":20,"w":21,"ʝ":22,"f":23,"x":24,"j":25,"r":26,"t̠ʃ":27,"ʃ":28,"tl":29,"ts":30} \ No newline at end of file diff --git a/Swedish/.gitattributes b/Swedish/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Swedish/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Swedish/README.md b/Swedish/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Swedish/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Swedish/added_tokens.json b/Swedish/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..02ad31246cce94ec78bd9aba99df1840f8e80c17 --- /dev/null +++ b/Swedish/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 46 +} diff --git a/Swedish/special_tokens_map.json b/Swedish/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Swedish/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Swedish/tokenizer.json b/Swedish/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..91dfc8d529da584e2e51e3f4aff51c09d488e872 --- /dev/null +++ b/Swedish/tokenizer.json @@ -0,0 +1,160 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "ɔ": 4, + "ʝ": 5, + "k": 6, + "l": 7, + "ɛ": 8, + "m": 9, + "d̪": 10, + "e": 11, + "ʉ̟": 12, + "f": 13, + "ɪ": 14, + "ŋ": 15, + "ɹ": 16, + "a": 17, + "n̪": 18, + "iː": 19, + "ɑː": 20, + "ɛː": 21, + "t̪": 22, + "s̪": 23, + "v": 24, + "oː": 25, + "uː": 26, + "eː": 27, + "ʊ": 28, + "p": 29, + "b": 30, + "h": 31, + "øː": 32, + "yː": 33, + "ʂ": 34, + "ɡ": 35, + "ɵ": 36, + "ʃ": 37, + "œ": 38, + "ɕ": 39, + "ʏ": 40, + "ɧ": 41, + "z": 42 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Swedish/tokenizer_config.json b/Swedish/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Swedish/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Swedish/vocab.json b/Swedish/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..373a50086871da331904483c3445e41128b1d40f --- /dev/null +++ b/Swedish/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"ɔ":4,"ʝ":5,"k":6,"l":7,"ɛ":8,"m":9,"d̪":10,"e":11,"ʉ̟":12,"f":13,"ɪ":14,"ŋ":15,"ɹ":16,"a":17,"n̪":18,"iː":19,"ɑː":20,"ɛː":21,"t̪":22,"s̪":23,"v":24,"oː":25,"uː":26,"eː":27,"ʊ":28,"p":29,"b":30,"h":31,"øː":32,"yː":33,"ʂ":34,"ɡ":35,"ɵ":36,"ʃ":37,"œ":38,"ɕ":39,"ʏ":40,"ɧ":41,"z":42} \ No newline at end of file diff --git a/Turkish/.gitattributes b/Turkish/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Turkish/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Turkish/README.md b/Turkish/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Turkish/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Turkish/added_tokens.json b/Turkish/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..af206b510ed8ecbd024cef6b6af83152104beead --- /dev/null +++ b/Turkish/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 49 +} diff --git a/Turkish/special_tokens_map.json b/Turkish/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Turkish/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Turkish/tokenizer.json b/Turkish/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..5c3784f91d2b994af93259afb89960796bf983bc --- /dev/null +++ b/Turkish/tokenizer.json @@ -0,0 +1,158 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "a": 4, + "m": 5, + "h": 6, + "e": 7, + "ɾ": 8, + "k": 9, + "lʲ": 10, + "iː": 11, + "b": 12, + "f": 13, + "l̪ˠ": 14, + "n̪": 15, + "ɯ": 16, + "j": 17, + "o": 18, + "z̪": 19, + "s̪": 20, + "v": 21, + "d̪": 22, + "i": 23, + "p": 24, + "ɟ": 25, + "œ": 26, + "y": 27, + "eː": 28, + "d̠ʒ": 29, + "ʃ": 30, + "u": 31, + "ɡ": 32, + "t̪": 33, + "t̠ʃ": 34, + "aː": 35, + "pː": 36, + "ʒ": 37, + "uː": 38, + "c": 39, + "w": 40 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Turkish/tokenizer_config.json b/Turkish/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Turkish/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Turkish/vocab.json b/Turkish/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..a120f9062fc379757b6a6030f7a5ac8520899afe --- /dev/null +++ b/Turkish/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"a":4,"m":5,"h":6,"e":7,"ɾ":8,"k":9,"lʲ":10,"iː":11,"b":12,"f":13,"l̪ˠ":14,"n̪":15,"ɯ":16,"j":17,"o":18,"z̪":19,"s̪":20,"v":21,"d̪":22,"i":23,"p":24,"ɟ":25,"œ":26,"y":27,"eː":28,"d̠ʒ":29,"ʃ":30,"u":31,"ɡ":32,"t̪":33,"t̠ʃ":34,"aː":35,"pː":36,"ʒ":37,"uː":38,"c":39,"w":40} \ No newline at end of file diff --git a/Welsh/.gitattributes b/Welsh/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Welsh/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Welsh/README.md b/Welsh/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bc5f30d6632ac0efdc7be2e9095e9e9579af2e33 --- /dev/null +++ b/Welsh/README.md @@ -0,0 +1,199 @@ +--- +library_name: transformers +tags: [] +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + +This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] \ No newline at end of file diff --git a/Welsh/added_tokens.json b/Welsh/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..ef00004cc13e059472f0294cdfabbcf80546b84e --- /dev/null +++ b/Welsh/added_tokens.json @@ -0,0 +1,3 @@ +{ + "<|endoftext|>": 54 +} diff --git a/Welsh/special_tokens_map.json b/Welsh/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..fec44445c7dd84b325cdb731fface5fdf4649cd0 --- /dev/null +++ b/Welsh/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "bos_token": "UTT_BOUNDARY", + "eos_token": "UTT_BOUNDARY", + "pad_token": "PAD", + "unk_token": "UNK" +} diff --git a/Welsh/tokenizer.json b/Welsh/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9be771c635ece0839f29fed9499005f32da6f6e9 --- /dev/null +++ b/Welsh/tokenizer.json @@ -0,0 +1,165 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "UNK", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "PAD", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "WORD_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "UTT_BOUNDARY", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "Sequence", + "normalizers": [ + { + "type": "Strip", + "strip_left": true, + "strip_right": true + } + ] + }, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "UTT_BOUNDARY", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "UTT_BOUNDARY": { + "id": "UTT_BOUNDARY", + "ids": [ + 3 + ], + "tokens": [ + "UTT_BOUNDARY" + ] + } + } + }, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "UNK": 0, + "PAD": 1, + "WORD_BOUNDARY": 2, + "UTT_BOUNDARY": 3, + "ɔ": 4, + "h": 5, + "m": 6, + "ai": 7, + "ɛ": 8, + "r": 9, + "t": 10, + "ɑː": 11, + "p": 12, + "d": 13, + "iː": 14, + "b": 15, + "oː": 16, + "f": 17, + "eː": 18, + "χ": 19, + "w": 20, + "a": 21, + "n": 22, + "ø": 23, + "j": 24, + "au": 25, + "ə": 26, + "ɔi": 27, + "ð": 28, + "ɪ": 29, + "s": 30, + "ɡ": 31, + "ʊi": 32, + "ʊ": 33, + "əi": 34, + "θ": 35, + "l": 36, + "ʌ": 37, + "ŋ": 38, + "v": 39, + "k": 40, + "ɬ": 41, + "ɪu": 42, + "uː": 43, + "ʃ": 44, + "ɛu": 45, + "d̠ʒ": 46, + "z": 47 + }, + "unk_token": "UNK" + } +} \ No newline at end of file diff --git a/Welsh/tokenizer_config.json b/Welsh/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9edd32e2e292fdae209686c2d75a305decdd934 --- /dev/null +++ b/Welsh/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "UNK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "PAD", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "WORD_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "UTT_BOUNDARY", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "UTT_BOUNDARY", + "clean_up_tokenization_spaces": true, + "eos_token": "UTT_BOUNDARY", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "PAD", + "tokenizer_class": "GPT2Tokenizer", + "unk_token": "UNK" +} diff --git a/Welsh/vocab.json b/Welsh/vocab.json new file mode 100644 index 0000000000000000000000000000000000000000..432cadeb35fc1822695df3a08874fe3c24cdef04 --- /dev/null +++ b/Welsh/vocab.json @@ -0,0 +1 @@ +{"UNK":0,"PAD":1,"WORD_BOUNDARY":2,"UTT_BOUNDARY":3,"ɔ":4,"h":5,"m":6,"ai":7,"ɛ":8,"r":9,"t":10,"ɑː":11,"p":12,"d":13,"iː":14,"b":15,"oː":16,"f":17,"eː":18,"χ":19,"w":20,"a":21,"n":22,"ø":23,"j":24,"au":25,"ə":26,"ɔi":27,"ð":28,"ɪ":29,"s":30,"ɡ":31,"ʊi":32,"ʊ":33,"əi":34,"θ":35,"l":36,"ʌ":37,"ŋ":38,"v":39,"k":40,"ɬ":41,"ɪu":42,"uː":43,"ʃ":44,"ɛu":45,"d̠ʒ":46,"z":47} \ No newline at end of file