| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """ Tokenization classes for python tokenizers. |
| For fast tokenizers (provided by HuggingFace's tokenizers library) see tokenization_utils_fast.py |
| """ |
|
|
| import itertools |
| import logging |
| import re |
| import unicodedata |
| from typing import Dict, List, Optional, Tuple, Union |
|
|
| from .file_utils import add_end_docstrings |
| from .tokenization_utils_base import ( |
| ENCODE_KWARGS_DOCSTRING, |
| ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING, |
| AddedToken, |
| BatchEncoding, |
| EncodedInput, |
| EncodedInputPair, |
| PaddingStrategy, |
| PreTokenizedInput, |
| PreTokenizedInputPair, |
| PreTrainedTokenizerBase, |
| TensorType, |
| TextInput, |
| TextInputPair, |
| TruncationStrategy, |
| ) |
|
|
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
| def _is_whitespace(char): |
| """Checks whether `chars` is a whitespace character.""" |
| |
| |
| if char == " " or char == "\t" or char == "\n" or char == "\r": |
| return True |
| cat = unicodedata.category(char) |
| if cat == "Zs": |
| return True |
| return False |
|
|
|
|
| def _is_control(char): |
| """Checks whether `chars` is a control character.""" |
| |
| |
| if char == "\t" or char == "\n" or char == "\r": |
| return False |
| cat = unicodedata.category(char) |
| if cat.startswith("C"): |
| return True |
| return False |
|
|
|
|
| def _is_punctuation(char): |
| """Checks whether `chars` is a punctuation character.""" |
| cp = ord(char) |
| |
| |
| |
| |
| if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): |
| return True |
| cat = unicodedata.category(char) |
| if cat.startswith("P"): |
| return True |
| return False |
|
|
|
|
| def _is_end_of_word(text): |
| """Checks whether the last character in text is one of a punctuation, control or whitespace character.""" |
| last_char = text[-1] |
| return bool(_is_control(last_char) | _is_punctuation(last_char) | _is_whitespace(last_char)) |
|
|
|
|
| def _is_start_of_word(text): |
| """Checks whether the first character in text is one of a punctuation, control or whitespace character.""" |
| first_char = text[0] |
| return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char)) |
|
|
|
|
| class PreTrainedTokenizer(PreTrainedTokenizerBase): |
| """ Base class for all slow tokenizers. |
| |
| Handle all the shared methods for tokenization and special tokens as well as methods |
| downloading/caching/loading pretrained tokenizers as well as adding tokens to the vocabulary. |
| |
| This class also contains the added tokens in a unified way on top of all tokenizers so we don't |
| have to handle the specific vocabulary augmentation methods of the various underlying |
| dictionary structures (BPE, sentencepiece...). |
| |
| Class attributes (overridden by derived classes): |
| |
| - ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file |
| required by the model, and as associated values, the filename for saving the associated file (string). |
| - ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys |
| being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the |
| `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the |
| associated pretrained vocabulary file. |
| - ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained |
| models, and as associated values, the maximum length of the sequence inputs of this model, or None if the |
| model has no maximum input size. |
| - ``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the |
| pretrained models, and as associated values, a dictionary of specific arguments to pass to the |
| ``__init__`` method of the tokenizer class for this pretrained model when loading the tokenizer with the |
| ``from_pretrained()`` method. |
| |
| Args: |
| - ``model_max_length``: (`Optional`) int: the maximum length in number of tokens for the inputs to the transformer model. |
| When the tokenizer is loaded with `from_pretrained`, this will be set to the value stored for the associated |
| model in ``max_model_input_sizes`` (see above). If no value is provided, or if |
| no associated max_length can be found in ``max_model_input_sizes``, it defaults to VERY_LARGE_INTEGER (`int(1e30)`). |
| - ``padding_side``: (`Optional`) string: the side on which the model should have padding applied. |
| Should be selected between ['right', 'left'] |
| - ``model_input_names``: (`Optional`) List[string]: the list of the forward pass inputs accepted by the |
| model ("token_type_ids", "attention_mask"...). |
| - ``bos_token``: (`Optional`) string: a beginning of sentence token. |
| Will be associated to ``self.bos_token`` and ``self.bos_token_id`` |
| - ``eos_token``: (`Optional`) string: an end of sentence token. |
| Will be associated to ``self.eos_token`` and ``self.eos_token_id`` |
| - ``unk_token``: (`Optional`) string: an unknown token. |
| Will be associated to ``self.unk_token`` and ``self.unk_token_id`` |
| - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). |
| Will be associated to ``self.sep_token`` and ``self.sep_token_id`` |
| - ``pad_token``: (`Optional`) string: a padding token. |
| Will be associated to ``self.pad_token`` and ``self.pad_token_id`` |
| - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence |
| leveraging self-attention along the full depth of the model). |
| Will be associated to ``self.cls_token`` and ``self.cls_token_id`` |
| - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language |
| modeling). Will be associated to ``self.mask_token`` and ``self.mask_token_id`` |
| - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. |
| Adding all special tokens here ensure they won't be split by the tokenization process. |
| Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids`` |
| |
| |
| .. automethod:: __call__ |
| """ |
|
|
def __init__(self, **kwargs):
    """Initialise the parent class, then the added-token bookkeeping.

    All keyword arguments are forwarded unchanged to the parent initializer.
    """
    super().__init__(**kwargs)

    # Tokens added on top of the model-specific vocabulary are tracked here:
    # token -> index, index -> token, and the list of tokens that
    # ``tokenize`` must keep whole.
    self.added_tokens_encoder: Dict[str, int] = {}
    self.added_tokens_decoder: Dict[int, str] = {}
    self.unique_no_split_tokens: List[str] = []
|
|
@property
def is_fast(self) -> bool:
    """Whether this is a "fast" tokenizer; always ``False`` for this class."""
    return False
|
|
@property
def vocab_size(self) -> int:
    """Size of the base vocabulary, not counting the added tokens.

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError
|
|
def get_vocab(self):
    """Return the vocabulary as a dict of {token: index} pairs.

    `tokenizer.get_vocab()[token]` is equivalent to
    `tokenizer.convert_tokens_to_ids(token)` when `token` is in the vocab.

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError()
|
|
def get_added_vocab(self) -> Dict[str, int]:
    """Return the {token: index} mapping of the added tokens.

    The returned object is ``self.added_tokens_encoder`` itself, not a copy.
    """
    return self.added_tokens_encoder
|
|
| def __len__(self): |
| """ Size of the full vocabulary with the added tokens """ |
| return self.vocab_size + len(self.added_tokens_encoder) |
|
|
def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens=False) -> int:
    """
    Add new tokens to the tokenizer, with indices starting from the current length of the vocabulary.

    A token is added only when it is not the ``unk_token`` itself and the tokenizer currently maps it to the
    same id as the ``unk_token`` (i.e. it is not already known). Duplicates within ``new_tokens`` are added once.

    Args:
        new_tokens: list of strings or ``AddedToken``; every entry is converted with ``str()``.
        special_tokens: when False, tokens are lower-cased first if the tokenizer was created with
            ``do_lower_case``. When True, tokens are left as given and every entry of ``new_tokens``
            (not only the newly added ones) is put in ``self.unique_no_split_tokens``.

    Returns:
        Number of tokens added to the vocabulary.

    Examples::

        # Let's see how to increase the vocabulary of Bert model and tokenizer
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased')

        num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
        print('We have added', num_added_toks, 'tokens')
        model.resize_token_embeddings(len(tokenizer))  # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
    """
    new_tokens = [str(tok) for tok in new_tokens]

    tokens_to_add = []
    for candidate in new_tokens:
        if not special_tokens and self.init_kwargs.get("do_lower_case", False):
            candidate = candidate.lower()
        if candidate == self.unk_token:
            continue
        # A token the tokenizer already knows maps to something other than the unknown id.
        if self.convert_tokens_to_ids(candidate) != self.convert_tokens_to_ids(self.unk_token):
            continue
        if candidate in tokens_to_add:
            continue
        tokens_to_add.append(candidate)
        if self.verbose:
            logger.info("Adding %s to the vocabulary", candidate)

    # New indices continue after the full current vocabulary (base + previously added tokens).
    added_tok_encoder = {tok: len(self) + offset for offset, tok in enumerate(tokens_to_add)}
    added_tok_decoder = {index: tok for tok, index in added_tok_encoder.items()}
    self.added_tokens_encoder.update(added_tok_encoder)
    self.added_tokens_decoder.update(added_tok_decoder)

    # Register the tokens that must never be split by ``tokenize``.
    no_split_additions = new_tokens if special_tokens else tokens_to_add
    self.unique_no_split_tokens = list(set(self.unique_no_split_tokens).union(set(no_split_additions)))

    return len(tokens_to_add)
|
|
def num_special_tokens_to_add(self, pair=False):
    """
    Return the number of tokens added when encoding a sequence with special tokens.

    Note:
        This builds model inputs from empty sequences and counts the result, and is therefore not
        efficient. Do not put this inside your training loop.

    Args:
        pair: if True, count the tokens added for a sequence pair; if False, for a single sequence.

    Returns:
        Number of tokens added to sequences
    """
    # With empty sequences, everything left in the built input is a special token.
    second_sequence = [] if pair else None
    return len(self.build_inputs_with_special_tokens([], second_sequence))
|
|
def tokenize(self, text: TextInput, **kwargs):
    """ Converts a string in a sequence of tokens (string), using the tokenizer.
        Split in words for word-based vocabulary or sub-words for sub-word-based
        vocabularies (BPE/SentencePieces/WordPieces).

        Take care of added tokens: the text is first split on every token of
        ``self.unique_no_split_tokens``; those tokens are kept whole and only the
        remaining pieces are passed to ``self._tokenize``.

        Args:
            text (:obj:`string`): The sequence to be encoded.
            **kwargs (:obj: `dict`): Arguments passed to the model-specific `prepare_for_tokenization` preprocessing method.

        Returns:
            The list of token strings. Empty when ``text`` is empty or only whitespace.
    """
    # Only AddedToken instances carry lstrip/rstrip/single_word options; index them by their string form.
    all_special_tokens_extended = {
        str(t): t for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
    }

    text, kwargs = self.prepare_for_tokenization(text, **kwargs)

    # prepare_for_tokenization is expected to consume the kwargs it understands.
    if kwargs:
        logger.warning("Keyword arguments %s not recognized.", kwargs)

    if self.init_kwargs.get("do_lower_case", False):
        # Lower-case everything except the special tokens, which must stay verbatim.
        escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens]
        if escaped_special_toks:
            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
            text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
        else:
            # Without special tokens the first group would be empty and match everywhere,
            # leaving the second group as None and crashing on `.lower()`.
            text = text.lower()

    # Set copy for O(1) membership tests; iteration order still comes from the list.
    no_split_lookup = frozenset(self.unique_no_split_tokens)

    def split_on_token(tok, text):
        """Split `text` around every occurrence of `tok`, keeping `tok` as its own item."""
        result = []
        tok_extended = all_special_tokens_extended.get(tok, None)
        split_text = text.split(tok)
        last_index = len(split_text) - 1
        full_word = ""
        for i, sub_text in enumerate(split_text):
            if isinstance(tok_extended, AddedToken):
                # An AddedToken controls how whitespace around it is stripped.
                if tok_extended.single_word:
                    # Try to avoid splitting when the token sits inside a word.
                    if (
                        i < last_index
                        and not _is_end_of_word(sub_text)
                        and not _is_start_of_word(split_text[i + 1])
                    ):
                        # Accumulate the un-split word.
                        # NOTE(review): this branch does not `continue`, so the piece and the
                        # token are still appended below as well - confirm this is intended.
                        full_word += sub_text + tok
                    elif full_word:
                        full_word += sub_text
                        result.append(full_word)
                        full_word = ""
                        continue
                # `rstrip` on the token means it swallows the whitespace to its right,
                # i.e. the left side of the piece that follows it.
                if tok_extended.rstrip and i > 0:
                    sub_text = sub_text.lstrip()
                # `lstrip` on the token means it swallows the whitespace to its left,
                # i.e. the right side of the piece that precedes it.
                if tok_extended.lstrip and i < last_index:
                    sub_text = sub_text.rstrip()
            else:
                # Plain string tokens: strip on both sides of the token.
                if i < last_index:
                    sub_text = sub_text.rstrip()
                if i > 0:
                    sub_text = sub_text.lstrip()

            if i == 0 and not sub_text:
                # Text starts with the token.
                result.append(tok)
            elif i == last_index:
                # Last piece: nothing follows, so no token to append.
                if sub_text:
                    result.append(sub_text)
            else:
                if sub_text:
                    result.append(sub_text)
                result.append(tok)
        return result

    def split_on_tokens(tok_list, text):
        """Split `text` on every token of `tok_list`, then tokenize the remaining pieces."""
        if not text.strip():
            return []
        if not tok_list:
            return self._tokenize(text)

        tokenized_text = []
        text_list = [text]
        for tok in tok_list:
            tokenized_text = []
            for sub_text in text_list:
                if sub_text not in no_split_lookup:
                    tokenized_text += split_on_token(tok, sub_text)
                else:
                    # Already isolated as a no-split token: keep whole.
                    tokenized_text.append(sub_text)
            text_list = tokenized_text

        return list(
            itertools.chain.from_iterable(
                self._tokenize(token) if token not in no_split_lookup else [token]
                for token in tokenized_text
            )
        )

    no_split_token = self.unique_no_split_tokens
    tokenized_text = split_on_tokens(no_split_token, text)
    return tokenized_text
|
|
| def _tokenize(self, text, **kwargs): |
| """ Converts a string in a sequence of tokens (string), using the tokenizer. |
| Split in words for word-based vocabulary or sub-words for sub-word-based |
| vocabularies (BPE/SentencePieces/WordPieces). |
| |
| Do NOT take care of added tokens. |
| """ |
| raise NotImplementedError |
|
|
def convert_tokens_to_ids(self, tokens):
    """ Converts a token string (or a sequence of tokens) in a single integer id
        (or a sequence of ids), using the vocabulary and the added tokens.

        Returns ``None`` when ``tokens`` is ``None``, a single id for a ``str``,
        and a list of ids for any other iterable of tokens.
    """
    if tokens is None:
        return None

    to_id = self._convert_token_to_id_with_added_voc
    if isinstance(tokens, str):
        return to_id(tokens)
    return [to_id(token) for token in tokens]
|
|
| def _convert_token_to_id_with_added_voc(self, token): |
| if token is None: |
| return None |
|
|
| if token in self.added_tokens_encoder: |
| return self.added_tokens_encoder[token] |
| return self._convert_token_to_id(token) |
|
|
| def _convert_token_to_id(self, token): |
| raise NotImplementedError |
|
|
def _encode_plus(
    self,
    text: Union[TextInput, PreTokenizedInput, EncodedInput],
    text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
    add_special_tokens: bool = True,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
    max_length: Optional[int] = None,
    stride: int = 0,
    is_pretokenized: bool = False,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs
) -> BatchEncoding:
    """Encode one sequence (or one pair of sequences) and prepare it for the model.

    ``text`` and ``text_pair`` are converted to ids, then everything else is delegated to
    ``self.prepare_for_model``. Extra ``kwargs`` are forwarded to ``self.tokenize``.

    Raises:
        NotImplementedError: if ``return_offsets_mapping`` is True.
        ValueError: if an input is not a string, a non-empty list/tuple of strings or a
            non-empty list/tuple of integers.
    """

    def get_input_ids(text):
        """Turn a string, a list of tokens/words, or a list of ids into a list of ids."""
        if isinstance(text, str):
            tokens = self.tokenize(text, **kwargs)
            return self.convert_tokens_to_ids(tokens)
        elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
            if is_pretokenized:
                # Each item is a word that still has to be tokenized.
                tokens = list(itertools.chain(*(self.tokenize(t, is_pretokenized=True, **kwargs) for t in text)))
                return self.convert_tokens_to_ids(tokens)
            else:
                # Each item is already a token.
                return self.convert_tokens_to_ids(text)
        elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
            # Already ids.
            return text
        else:
            if is_pretokenized:
                raise ValueError(
                    f"Input {text} is not valid. Should be a string or a list/tuple of strings when `is_pretokenized=True`."
                )
            else:
                raise ValueError(
                    f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
                )

    if return_offsets_mapping:
        # Message fixed: the concatenated sentences lacked separating spaces and the
        # argument name was misspelled.
        raise NotImplementedError(
            "return_offsets_mapping is not available when using Python tokenizers. "
            "To use this feature, change your tokenizer to one deriving from "
            "transformers.PreTrainedTokenizerFast. "
            "More information on available tokenizers at "
            "https://github.com/huggingface/transformers/pull/2674"
        )

    first_ids = get_input_ids(text)
    second_ids = get_input_ids(text_pair) if text_pair is not None else None

    return self.prepare_for_model(
        first_ids,
        pair_ids=second_ids,
        add_special_tokens=add_special_tokens,
        padding=padding_strategy.value,
        truncation=truncation_strategy.value,
        max_length=max_length,
        stride=stride,
        pad_to_multiple_of=pad_to_multiple_of,
        return_tensors=return_tensors,
        prepend_batch_axis=True,
        return_attention_mask=return_attention_mask,
        return_token_type_ids=return_token_type_ids,
        return_overflowing_tokens=return_overflowing_tokens,
        return_special_tokens_mask=return_special_tokens_mask,
        return_length=return_length,
        verbose=verbose,
    )
|
|
def _batch_encode_plus(
    self,
    batch_text_or_text_pairs: Union[
        List[TextInput],
        List[TextInputPair],
        List[PreTokenizedInput],
        List[PreTokenizedInputPair],
        List[EncodedInput],
        List[EncodedInputPair],
    ],
    add_special_tokens: bool = True,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
    max_length: Optional[int] = None,
    stride: int = 0,
    is_pretokenized: bool = False,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs
) -> BatchEncoding:
    """Encode a batch of sequences (or pairs of sequences) and prepare them for the model.

    Every batch item is converted to ids, then the whole batch is handed to
    ``self._batch_prepare_for_model``. Extra ``kwargs`` are forwarded to ``self.tokenize``.

    Raises:
        NotImplementedError: if ``return_offsets_mapping`` is True.
        ValueError: if an input is not a string, a non-empty list/tuple of strings or a
            non-empty list/tuple of integers.
    """

    def get_input_ids(text):
        """Turn a string, a list of tokens/words, or a list of ids into a list of ids."""
        if isinstance(text, str):
            tokens = self.tokenize(text, **kwargs)
            return self.convert_tokens_to_ids(tokens)
        elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
            if is_pretokenized:
                # Each item is a word that still has to be tokenized.
                tokens = list(itertools.chain(*(self.tokenize(t, is_pretokenized=True, **kwargs) for t in text)))
                return self.convert_tokens_to_ids(tokens)
            else:
                # Each item is already a token.
                return self.convert_tokens_to_ids(text)
        elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
            # Already ids.
            return text
        else:
            raise ValueError(
                "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
            )

    if return_offsets_mapping:
        # Message fixed: the concatenated sentences lacked a separating space and the
        # argument name was misspelled.
        raise NotImplementedError(
            "return_offsets_mapping is not available when using Python tokenizers. "
            "To use this feature, change your tokenizer to one deriving from "
            "transformers.PreTrainedTokenizerFast."
        )

    input_ids = []
    for ids_or_pair_ids in batch_text_or_text_pairs:
        if not isinstance(ids_or_pair_ids, (list, tuple)):
            # A single, non-sequence input (e.g. one string).
            ids, pair_ids = ids_or_pair_ids, None
        elif is_pretokenized and not isinstance(ids_or_pair_ids[0], (list, tuple)):
            # A single pre-tokenized sequence (a flat list of words).
            ids, pair_ids = ids_or_pair_ids, None
        else:
            # A pair of inputs.
            ids, pair_ids = ids_or_pair_ids

        first_ids = get_input_ids(ids)
        second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
        input_ids.append((first_ids, second_ids))

    batch_outputs = self._batch_prepare_for_model(
        input_ids,
        add_special_tokens=add_special_tokens,
        padding_strategy=padding_strategy,
        truncation_strategy=truncation_strategy,
        max_length=max_length,
        stride=stride,
        pad_to_multiple_of=pad_to_multiple_of,
        return_attention_mask=return_attention_mask,
        return_token_type_ids=return_token_type_ids,
        return_overflowing_tokens=return_overflowing_tokens,
        return_special_tokens_mask=return_special_tokens_mask,
        return_length=return_length,
        return_tensors=return_tensors,
        verbose=verbose,
    )

    return BatchEncoding(batch_outputs)
|
|
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def _batch_prepare_for_model(
    self,
    batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]],
    add_special_tokens: bool = True,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
    max_length: Optional[int] = None,
    stride: int = 0,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[str] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_length: bool = False,
    verbose: bool = True,
) -> BatchEncoding:
    """ Prepares a batch of input ids, or pairs of input ids, so that it can be used by the model.
    Each item goes through ``prepare_for_model`` without padding and without tensor conversion;
    the collected outputs are then padded together and wrapped in a ``BatchEncoding``.

    Args:
        batch_ids_pairs: list of tokenized input ids or input ids pairs
    """
    collected = {}
    for first_ids, second_ids in batch_ids_pairs:
        # Padding, attention masks and tensor conversion are deferred to the batch level below.
        item_outputs = self.prepare_for_model(
            first_ids,
            second_ids,
            add_special_tokens=add_special_tokens,
            padding=PaddingStrategy.DO_NOT_PAD.value,
            truncation=truncation_strategy.value,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=None,
            return_attention_mask=False,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            return_tensors=None,
            prepend_batch_axis=False,
            verbose=verbose,
        )

        for key, value in item_outputs.items():
            collected.setdefault(key, []).append(value)

    padded = self.pad(
        collected,
        padding=padding_strategy.value,
        max_length=max_length,
        pad_to_multiple_of=pad_to_multiple_of,
        return_attention_mask=return_attention_mask,
    )

    return BatchEncoding(padded, tensor_type=return_tensors)
|
|
def prepare_for_tokenization(self, text: str, is_pretokenized=False, **kwargs) -> (str, dict):
    """ Performs any necessary transformations before tokenization.

        This base implementation returns ``text`` unchanged together with the keyword
        arguments it did not consume (``is_pretokenized`` is consumed by the signature).

        Overrides should pop the arguments they use from kwargs and return kwargs as well:
        the leftover kwargs are checked afterwards to be sure all the arguments have been used.
    """
    unused_kwargs = kwargs
    return text, unused_kwargs
|
|
def get_special_tokens_mask(
    self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
) -> List[int]:
    """
    Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
    special tokens using the tokenizer ``prepare_for_model`` method.

    This base implementation marks every position as a sequence token and does not look at
    ``already_has_special_tokens``.

    Args:
        token_ids_0: list of ids (must not contain special tokens)
        token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
            for sequence pairs
        already_has_special_tokens: (default False) Set to True if the token list is already formatted with
            special tokens for the model

    Returns:
        A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
    """
    pair_length = len(token_ids_1) if token_ids_1 else 0
    total_length = pair_length + len(token_ids_0)
    return [0] * total_length
|
|
def convert_ids_to_tokens(
    self, ids: Union[int, List[int]], skip_special_tokens: bool = False
) -> Union[str, List[str]]:
    """ Converts a single index or a sequence of indices (integers) in a token
        (resp.) a sequence of tokens (str), using the vocabulary and added tokens.

        Args:
            skip_special_tokens: Don't decode special tokens (ids in self.all_special_ids). Default: False.
                Only applies to a sequence of ids; a single int is always converted.
    """
    if isinstance(ids, int):
        if ids in self.added_tokens_decoder:
            return self.added_tokens_decoder[ids]
        return self._convert_id_to_token(ids)

    tokens = []
    for raw_index in ids:
        index = int(raw_index)
        if skip_special_tokens and index in self.all_special_ids:
            continue
        # Added tokens take precedence over the base vocabulary.
        token = (
            self.added_tokens_decoder[index]
            if index in self.added_tokens_decoder
            else self._convert_id_to_token(index)
        )
        tokens.append(token)
    return tokens
|
|
| def _convert_id_to_token(self, index: int) -> str: |
| raise NotImplementedError |
|
|
def convert_tokens_to_string(self, tokens: List[str]) -> str:
    """ Converts a sequence of tokens (string) in a single string.

        This base implementation simply joins the tokens with a single space. Subclasses
        often override it to remove sub-word tokenization artifacts at the same time.

        Args:
            tokens: the token strings to join.

        Returns:
            The tokens joined by single spaces ("" for an empty list).
    """
    # `tokens` are already strings: routing them through `convert_ids_to_tokens` (as was done
    # before) calls `int()` on each of them and fails for any non-numeric token.
    return " ".join(tokens)
|
|
def decode(
    self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
) -> str:
    """Convert a sequence of ids back into a string.

    Added tokens are emitted verbatim; runs of regular tokens between them are joined with
    ``self.convert_tokens_to_string``. The resulting pieces are joined with single spaces.

    Args:
        token_ids: the ids to decode. A single ``int`` is also accepted and treated as a
            one-element list.
        skip_special_tokens: drop the ids found in ``self.all_special_ids`` before decoding.
        clean_up_tokenization_spaces: pass the result through ``self.clean_up_tokenization``.

    Returns:
        The decoded string.
    """
    # `convert_ids_to_tokens` returns a bare string for a single int, which the loop below
    # would then walk character by character; wrap it so it is decoded as one token.
    if isinstance(token_ids, int):
        token_ids = [token_ids]

    # Special tokens are filtered here, by id. The former in-loop check compared token
    # *strings* against `all_special_ids` (integers) and could never match, so it is gone.
    filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)

    sub_texts = []
    current_sub_text = []
    for token in filtered_tokens:
        if token in self.added_tokens_encoder:
            # Flush the pending run of regular tokens, then emit the added token as-is.
            if current_sub_text:
                sub_texts.append(self.convert_tokens_to_string(current_sub_text))
                current_sub_text = []
            sub_texts.append(token)
        else:
            current_sub_text.append(token)
    if current_sub_text:
        sub_texts.append(self.convert_tokens_to_string(current_sub_text))
    text = " ".join(sub_texts)

    if clean_up_tokenization_spaces:
        return self.clean_up_tokenization(text)
    return text
|
|
def save_vocabulary(self, save_directory) -> Tuple[str]:
    """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
        and special token mappings.

        Please use :func:`~transformers.PreTrainedTokenizer.save_pretrained` `()` to save the full
        Tokenizer state if you want to reload it using the :func:`~transformers.PreTrainedTokenizer.from_pretrained`
        class method.

        Raises:
            NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError
|
|