Source code for libai.data.datasets.t5_dataset

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""T5 Style dataset."""

import collections

import numpy as np
import oneflow as flow

from libai.data.structures import DistTensorData, Instance

from ..data_utils import create_masked_lm_predictions, get_samples_mapping


class T5Dataset(flow.utils.data.Dataset):
    """
    Dataset containing sentences for T5 training.

    Args:
        name: Name of dataset.
        tokenizer: Tokenizer to use.
        data_prefix (str): Path to the training dataset.
        indexed_dataset: Indexed dataset to use.
        max_num_samples (int, optional): Maximum number of samples to draw from the dataset.
        masked_lm_prob (float, optional): Probability of masking a token. Defaults to 0.15.
        max_seq_length (int, optional): Maximum length of the sequence passed to the encoder.
            All values are padded to this length. Defaults to 512.
        max_seq_length_dec (int, optional): Maximum length of the sequence passed to the decoder.
            All values are padded to this length. Defaults to 128.
        short_seq_prob (float, optional): Probability of producing a short sequence.
            Defaults to 0.0.
        seed (int, optional): Seed of the random number generator, for reproducibility.
            Defaults to 1234.
    """

    def __init__(
        self,
        name,
        tokenizer,
        indexed_dataset,
        data_prefix,
        max_num_samples,
        masked_lm_prob,
        max_seq_length,
        max_seq_length_dec,
        short_seq_prob,
        seed,
    ):
        # Params to store.
        self.name = name
        self.seed = seed
        self.masked_lm_prob = masked_lm_prob
        self.max_seq_length = max_seq_length
        self.max_seq_length_dec = max_seq_length_dec

        # Dataset.
        self.indexed_dataset = indexed_dataset

        # Build the samples mapping.
        self.samples_mapping = get_samples_mapping(
            self.indexed_dataset,
            data_prefix,
            None,
            max_num_samples,
            self.max_seq_length - 2,  # account for added tokens
            short_seq_prob,
            self.seed,
            self.name,
            False,
        )

        # Vocab stuff.
        self.tokenizer = tokenizer
        tokenizer.add_tokens(
            [tokenizer._bos_token, tokenizer._eos_token, *tokenizer._additional_special_tokens]
        )
        vocab = tokenizer.get_vocab()
        inv_vocab = {v: k for k, v in vocab.items()}
        self.vocab_id_list = list(inv_vocab.keys())
        self.vocab_id_to_token_dict = inv_vocab
        self.cls_id = vocab[tokenizer._cls_token]
        self.sep_id = vocab[tokenizer._sep_token]
        self.mask_id = vocab[tokenizer._mask_token]
        self.pad_id = vocab[tokenizer._pad_token]
        self.bos_id = vocab[tokenizer._bos_token]
        self.eos_id = vocab[tokenizer._eos_token]
        self.sentinel_tokens = [vocab[x] for x in tokenizer._additional_special_tokens]
        assert len(self.sentinel_tokens) > 0

    def __len__(self):
        return self.samples_mapping.shape[0]

    def __getitem__(self, idx):
        start_index, end_index, seq_length = self.samples_mapping[idx]
        sample = []
        for index in range(start_index, end_index):
            sample.append(self.indexed_dataset[index])
        # Note that this rng state should be numpy and not python since
        # python randint is inclusive whereas the numpy one is exclusive.
        np_rng = np.random.RandomState(seed=(self.seed + idx))
        return build_training_sample(
            self.tokenizer,
            sample,
            seq_length,
            self.max_seq_length,  # needed for padding
            self.max_seq_length_dec,
            self.vocab_id_list,
            self.vocab_id_to_token_dict,
            self.cls_id,
            self.sep_id,
            self.mask_id,
            self.pad_id,
            self.masked_lm_prob,
            np_rng,
            self.bos_id,
            self.eos_id,
            self.sentinel_tokens,
        )
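# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the helper below shows,
# on toy token ids, how the span-corruption packing implemented by
# `build_training_sample` / `pad_and_convert_to_numpy` turns masked spans into
# the encoder input and the decoder input/output sequences. All ids here
# (MASK = 99, sentinels 90/91, BOS = 1, EOS = 2) are made up for the example.
# ---------------------------------------------------------------------------
def _span_packing_example():
    Span = collections.namedtuple("Span", ["index", "label"])

    # Encoder tokens after masking: positions 1-2 and 4 were replaced by MASK (99).
    tokens = [10, 99, 99, 13, 99, 15]
    masked_spans = [Span(index=[1, 2], label=[11, 12]), Span(index=[4], label=[14])]
    sentinels = collections.deque([90, 91])
    bos_id, eos_id = 1, 2

    t5_input, dec_in, dec_out = [], [bos_id], []
    start = 0
    for span in masked_spans:
        flag = sentinels.popleft()
        dec_in += [flag] + span.label        # decoder input: sentinel + original span
        dec_out += [flag] + span.label       # decoder target mirrors it ...
        t5_input += tokens[start:span.index[0]] + [flag]
        start = span.index[-1] + 1
    t5_input += tokens[start:]
    dec_out.append(eos_id)                   # ... shifted by one via BOS/EOS

    assert t5_input == [10, 90, 13, 91, 15]
    assert dec_in == [1, 90, 11, 12, 91, 14]
    assert dec_out == [90, 11, 12, 91, 14, 2]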
def build_training_sample(
    tokenizer,
    sample,
    target_seq_length,
    max_seq_length,
    max_seq_length_dec,
    vocab_id_list,
    vocab_id_to_token_dict,
    cls_id,
    sep_id,
    mask_id,
    pad_id,
    masked_lm_prob,
    np_rng,
    bos_id=None,
    eos_id=None,
    sentinel_tokens=None,
):
    """Build a training sample.

    Arguments:
        tokenizer: Tokenizer used to build the vocabulary.
        sample: A list of sentences in which each sentence is a list of token ids.
        target_seq_length: Desired sequence length.
        max_seq_length: Maximum length of the encoder sequence. All values are padded
            to this length.
        max_seq_length_dec: Maximum length of the decoder sequence.
        vocab_id_list: List of vocabulary ids. Used to pick a random id.
        vocab_id_to_token_dict: A dictionary from vocab ids to text tokens.
        cls_id: Start of example id.
        sep_id: Separator id.
        mask_id: Mask token id.
        pad_id: Padding token id.
        masked_lm_prob: Probability to mask tokens.
        np_rng: Random number generator. Note that this rng state should be numpy
            and not python since python randint is inclusive for the upper bound
            whereas the numpy one is exclusive.
        bos_id: Start-of-decoder-example id.
        eos_id: End-of-generation id.
        sentinel_tokens: Unique value to be substituted for every replaced span.
    """
    assert target_seq_length <= max_seq_length

    # Flatten the sentences into one list.
    tokens = [token for sentence in sample for token in sentence]

    # Truncate to `target_seq_length`.
    max_num_tokens = target_seq_length
    tokens = tokens[:max_num_tokens]

    # Masking.
    max_predictions_per_seq = masked_lm_prob * max_num_tokens
    (tokens, masked_positions, masked_labels, _, masked_spans) = create_masked_lm_predictions(
        tokenizer,
        tokens,
        vocab_id_list,
        vocab_id_to_token_dict,
        masked_lm_prob,
        cls_id,
        sep_id,
        mask_id,
        max_predictions_per_seq,
        np_rng,
        max_ngrams=10,
        geometric_dist=True,
        masking_style="t5",
    )

    # Padding.
    (
        tokens_enc,
        tokens_dec_in,
        labels,
        enc_mask,
        dec_mask,
        enc_dec_mask,
        loss_mask,
    ) = pad_and_convert_to_numpy(
        tokens,
        masked_positions,
        masked_labels,
        pad_id,
        max_seq_length,
        max_seq_length_dec,
        masked_spans,
        bos_id,
        eos_id,
        sentinel_tokens,
    )

    sample = Instance(
        encoder_input_ids=DistTensorData(tokens_enc),
        decoder_input_ids=DistTensorData(tokens_dec_in),
        encoder_attn_mask=DistTensorData(enc_mask),
        decoder_attn_mask=DistTensorData(dec_mask),
        encoder_decoder_attn_mask=DistTensorData(enc_dec_mask),
        lm_labels=DistTensorData(labels, placement_idx=-1),
        loss_mask=DistTensorData(loss_mask, placement_idx=-1),
    )
    return sample


def pad_and_convert_to_numpy(
    tokens,
    masked_positions,
    masked_labels,
    pad_id,
    max_seq_length,
    max_seq_length_dec,
    masked_spans=None,
    bos_id=None,
    eos_id=None,
    sentinel_tokens=None,
):
    """Pad sequences and convert them to numpy arrays."""

    sentinel_tokens = collections.deque(sentinel_tokens)
    t5_input = []
    (t5_decoder_in, t5_decoder_out) = ([bos_id], [])
    (start_index, end_index) = (0, None)
    for span in masked_spans:
        flag = sentinel_tokens.popleft()

        # Append the same tokens to the decoder input and output.
        t5_decoder_in.append(flag)
        t5_decoder_in.extend(span.label)
        t5_decoder_out.append(flag)
        t5_decoder_out.extend(span.label)

        end_index = span.index[0]
        t5_input.extend(tokens[start_index:end_index])
        t5_input.append(flag)

        # The next start index is the token after the last span token.
        start_index = span.index[-1] + 1

    # Add <eos> token to t5_decoder_out.
    t5_decoder_out.append(eos_id)

    # Add the remaining tokens to the t5 input.
    t5_input.extend(tokens[start_index:])

    # assert (len(t5_input) - len(masked_spans)) + \
    #     (len(t5_decoder_in) - (len(masked_spans) + 1)) == len(tokens)

    # Some checks.

    # Encoder-side padding mask.
    num_tokens = len(t5_input)
    padding_length = max_seq_length - num_tokens
    assert padding_length >= 0
    assert len(masked_positions) == len(masked_labels)

    # Tokens.
    filler = [pad_id] * padding_length
    tokens_enc = np.array(t5_input + filler, dtype=np.int64)

    # Decoder-side padding mask.
    num_tokens_dec = len(t5_decoder_in)
    padding_length_dec = max_seq_length_dec - num_tokens_dec
    assert padding_length_dec >= 0
    filler_dec = [pad_id] * padding_length_dec
    tokens_dec_in = np.array(t5_decoder_in + filler_dec, dtype=np.int64)

    # Create attention masks.
    enc_mask = make_attention_mask(tokens_enc, tokens_enc)
    enc_dec_mask = make_attention_mask(tokens_dec_in, tokens_enc)
    dec_mask = make_attention_mask(tokens_dec_in, tokens_dec_in)
    dec_mask = dec_mask * make_history_mask(tokens_dec_in)

    # Labels mask.
    labels = t5_decoder_out + ([-1] * padding_length_dec)
    labels = np.array(labels, dtype=np.int64)

    # Loss mask.
    loss_mask = ([1] * num_tokens_dec) + ([0] * padding_length_dec)
    loss_mask = np.array(loss_mask, dtype=np.bool_)

    tokens_enc = flow.tensor(tokens_enc, dtype=flow.long)
    tokens_dec_in = flow.tensor(tokens_dec_in, dtype=flow.long)
    labels = flow.tensor(labels, dtype=flow.long)
    enc_mask = flow.tensor(enc_mask, dtype=flow.bool)
    dec_mask = flow.tensor(dec_mask, dtype=flow.bool)
    enc_dec_mask = flow.tensor(enc_dec_mask, dtype=flow.bool)
    loss_mask = flow.tensor(loss_mask, dtype=flow.bool)

    return tokens_enc, tokens_dec_in, labels, enc_mask, dec_mask, enc_dec_mask, loss_mask


def make_attention_mask(source_block, target_block):
    """
    Returns a 2-dimensional (2-D) attention mask.

    :param source_block: 1-D array
    :param target_block: 1-D array
    """
    mask = (target_block[None, :] >= 1) * (source_block[:, None] >= 1)
    mask = mask.astype(np.int64)
    # (source_length, target_length)
    return mask


def make_history_mask(block):
    """Returns a lower-triangular (causal) mask for a 1-D block of token ids."""
    length = block.shape[0]
    arange = np.arange(length)
    history_mask = arange[None, :] <= arange[:, None]
    history_mask = history_mask.astype(np.int64)
    return history_mask
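

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): a quick check of the
# mask helpers above on a toy, partially padded sequence. The pad id is assumed
# to be 0 here, which is what the `>= 1` test in `make_attention_mask` relies on.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    seq = np.array([5, 6, 7, 0, 0], dtype=np.int64)  # 3 real tokens, 2 pads

    # Bidirectional mask: 1 wherever both the query and key positions hold real tokens.
    print(make_attention_mask(seq, seq))
    # [[1 1 1 0 0]
    #  [1 1 1 0 0]
    #  [1 1 1 0 0]
    #  [0 0 0 0 0]
    #  [0 0 0 0 0]]

    # Causal mask: lower triangular, position i may attend to positions <= i.
    print(make_history_mask(seq))
    # [[1 0 0 0 0]
    #  [1 1 0 0 0]
    #  [1 1 1 0 0]
    #  [1 1 1 1 0]
    #  [1 1 1 1 1]]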