# coding=utf-8
# Copyright 2018 T5 Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization class for Google T5 (sentence piece)."""
import logging
import os
from shutil import copyfile
import regex as re
import sentencepiece as spm
from .tokenization_base import PreTrainedTokenizer
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {"t5-base": "https://huggingface.co/t5-base/resolve/main/spiece.model"}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"t5-base": 512,
}
[docs]class GoogleT5Tokenizer(PreTrainedTokenizer):
"""
Construct a T5 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot
be converted to an ID and is set to be this token instead.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
extra_ids (:obj:`int`, `optional`, defaults to 100):
Add a number of extra ids added to the end of the vocabulary for use
as sentinels. These tokens are accessible as "<extra_id_{%d}>" where
"{%d}" is a number between 0 and extra_ids-1. Extra tokens are indexed
from the end of the vocabulary up to beginning ("<extra_id_0>" is the
last token in the vocabulary like in T5 preprocessing see `here
<https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
additional_special_tokens (:obj:`List[str]`, `optional`):
Additional special tokens used by the tokenizer.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
eos_token="</s>",
unk_token="<unk>",
pad_token="<pad>",
extra_ids=100,
additional_special_tokens=None,
**kwargs,
):
# Add extra_ids to the special token list
if extra_ids > 0 and additional_special_tokens is None:
additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
elif extra_ids > 0 and additional_special_tokens is not None:
extra_tokens = len(
set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens))
)
if extra_tokens != extra_ids:
raise ValueError(
f"Both extra_ids ({extra_ids}) and additional_special_tokens "
f"({additional_special_tokens}) are privided to T5Tokenizer. "
"In this case the additional_special_tokens must include the extra_ids tokens"
)
super().__init__(
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
self.vocab_file = vocab_file
self._extra_ids = extra_ids
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(vocab_file)
@property
def vocab_size(self):
return self.sp_model.get_piece_size() + self._extra_ids
[docs] def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text):
"""Tokenize a string."""
pieces = self.sp_model.encode(text, out_type=str)
return pieces
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
if token.startswith("<extra_id_"):
match = re.match(r"<extra_id_(\d+)>", token)
num = int(match.group(1))
return self.vocab_size - num - 1
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
if index < self.sp_model.get_piece_size():
token = self.sp_model.IdToPiece(index)
else:
token = f"<extra_id_{self.vocab_size - 1 - index}>"
return token
[docs] def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) to a single string."""
current_sub_tokens = []
out_string = ""
for token in tokens:
# make sure that special tokens are not decoded using sentencepiece model
if token in self.all_special_tokens:
out_string += self.sp_model.decode_pieces(current_sub_tokens) + token + " "
current_sub_tokens = []
else:
current_sub_tokens.append(token)
out_string += self.sp_model.decode_pieces(current_sub_tokens)
return out_string.strip()
[docs] def save_vocabulary(self, save_directory, filename_prefix=None):
"""Save the tokenizer vocabulary to a directory or file."""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"],
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
logger.info(f"Copy vocab file to {out_vocab_file}")
return (out_vocab_file,)