# Source code for libai.tokenizer.tokenization_t5

# coding=utf-8
# Copyright 2018 T5 Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenization class for Google T5 (sentence piece)."""

import logging
import os
from shutil import copyfile

import regex as re
import sentencepiece as spm

from .tokenization_base import PreTrainedTokenizer

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# File name used for the SentencePiece model when the vocabulary is saved
# (see ``save_vocabulary``).
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

# Download URL of the vocabulary file, keyed by pretrained model name.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {"t5-base": "https://huggingface.co/t5-base/resolve/main/spiece.model"}
}

# Size associated with each pretrained model name; exposed on the tokenizer
# class as ``max_model_input_sizes``.
# NOTE(review): presumably the maximum input sequence length -- confirm against
# the base class.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "t5-base": 512,
}


class GoogleT5Tokenizer(PreTrainedTokenizer):
    """
    Construct a T5 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.

    Args:
        vocab_file (:obj:`str`):
            Path to the vocabulary file.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The end of sequence token.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID
            and is set to be this token instead.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (:obj:`int`, `optional`, defaults to 100):
            Add a number of extra ids added to the end of the vocabulary for use as sentinels.
            These tokens are accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0
            and extra_ids-1. Extra tokens are indexed from the end of the vocabulary up to
            beginning ("<extra_id_0>" is the last token in the vocabulary like in T5 preprocessing
            see `here
            <https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
        additional_special_tokens (:obj:`List[str]`, `optional`):
            Additional special tokens used by the tokenizer.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self,
        vocab_file,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        extra_ids=100,
        additional_special_tokens=None,
        **kwargs,
    ):
        """Load the SentencePiece model and register the sentinel tokens.

        Raises:
            ValueError: If both ``extra_ids`` (> 0) and ``additional_special_tokens`` are
                given but the number of distinct tokens containing ``"extra_id"`` differs
                from ``extra_ids``.
        """
        # Add extra_ids to the special token list
        if extra_ids > 0 and additional_special_tokens is None:
            additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
        elif extra_ids > 0 and additional_special_tokens is not None:
            # Count the distinct user-supplied tokens that look like sentinels; the count
            # must agree with extra_ids, otherwise the id arithmetic below is inconsistent.
            extra_tokens = len(
                {tok for tok in additional_special_tokens if "extra_id" in str(tok)}
            )
            if extra_tokens != extra_ids:
                raise ValueError(
                    f"Both extra_ids ({extra_ids}) and additional_special_tokens "
                    f"({additional_special_tokens}) are provided to T5Tokenizer. "
                    "In this case the additional_special_tokens must include the extra_ids tokens"
                )

        super().__init__(
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )

        self.vocab_file = vocab_file
        self._extra_ids = extra_ids

        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

    @property
    def vocab_size(self):
        """Number of SentencePiece pieces plus the number of extra (sentinel) ids."""
        return self.sp_model.get_piece_size() + self._extra_ids

    def get_vocab(self):
        """Return a token -> id dict covering every id below ``vocab_size``,
        updated with the entries of ``self.added_tokens_encoder``."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Tokenize a string into SentencePiece pieces (returned as strings)."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab.

        Sentinel tokens ``<extra_id_N>`` are mapped to ``vocab_size - N - 1`` so that
        ``<extra_id_0>`` is the last id. Every other token is looked up in the
        SentencePiece model.
        """
        if token.startswith("<extra_id_"):
            match = re.match(r"<extra_id_(\d+)>", token)
            # A token may share the sentinel prefix without being a well-formed sentinel
            # (e.g. "<extra_id_x>"); in that case fall through to the regular lookup
            # instead of failing on a None match.
            if match is not None:
                num = int(match.group(1))
                return self.vocab_size - num - 1
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab.

        Indices below the SentencePiece piece count are resolved by the model; any
        other index is rendered as a sentinel token counted from the end of the
        vocabulary.
        """
        if index < self.sp_model.get_piece_size():
            return self.sp_model.IdToPiece(index)
        return f"<extra_id_{self.vocab_size - 1 - index}>"

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) to a single string.

        Runs of ordinary tokens are decoded with the SentencePiece model; special
        tokens are emitted verbatim followed by a space. The result is stripped of
        leading and trailing whitespace.
        """
        current_sub_tokens = []
        chunks = []
        for token in tokens:
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                chunks.append(self.sp_model.decode_pieces(current_sub_tokens) + token + " ")
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
        chunks.append(self.sp_model.decode_pieces(current_sub_tokens))
        return "".join(chunks).strip()

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Save the tokenizer vocabulary to a directory.

        Args:
            save_directory: Existing directory that receives the vocabulary file.
            filename_prefix: Optional prefix; when given, the file is named
                ``"<prefix>-spiece.model"``.

        Returns:
            A one-element tuple with the path of the saved file, or ``None`` (after
            logging an error) when ``save_directory`` is not a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"],
        )

        # Skip the copy when source and destination resolve to the same path.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
            logger.info(f"Copy vocab file to {out_vocab_file}")

        return (out_vocab_file,)