diff --git a/.gitignore b/.gitignore index 6b6db06fb..267995984 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,13 @@ +## HIP-compiled kernels etc. +*hip* +# +local_examples/ +logs/ +trash/ +kb-runs-gpt/ +ds_configs/ +gpt2-tokenizer/ +smi-output/ # tests # megatron autogenerated indices tests/data/*/*npy diff --git a/examples/run_evalharness_deepspeed.md b/examples/run_evalharness_deepspeed.md index 695d9d0aa..60f380d9c 100644 --- a/examples/run_evalharness_deepspeed.md +++ b/examples/run_evalharness_deepspeed.md @@ -15,6 +15,7 @@ Get lm-eval harness (https://github.com/EleutherAI/lm-evaluation-harness) and `b start-prod pip install best-download==0.0.7 pip install git+https://github.com/EleutherAI/lm-evaluation-harness +pip install --upgrade scipy ``` 2. Pre-download needed datasets diff --git a/examples/run_evalharness_lumi.sh b/examples/run_evalharness_lumi.sh new file mode 100644 index 000000000..1721d91d2 --- /dev/null +++ b/examples/run_evalharness_lumi.sh @@ -0,0 +1,113 @@ +#!/bin/bash +#SBATCH --exclude=nid005159 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p eap +#SBATCH -t 2-0:00:00 +#SBATCH --gpus-per-node=mi250:1 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest_eval.out and logs/latest_eval.err +ln -f -s $SLURM_JOB_ID.out logs/latest_eval.out +ln -f -s $SLURM_JOB_ID.err logs/latest_eval.err + +# Data +CHECKPOINT_PATH=/scratch/project_462000119/muennighoff/nov-2022-optimization/checkpoints/global_step10 +VARIANT=global_step10 + +export HF_DATASETS_OFFLINE=1 +export HF_DATASETS_CACHE=/scratch/project_462000119/ds_cache + +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +PP_SIZE=1 +TP_SIZE=1 +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 +MICRO_BS_MULTIPLIER=1 + +# Model parameters +SEQ_LEN=2048 + +# Dummy arguments +MEGATRON_REQUIRED_ARGS=" \ + --num-layers -1 \ + --hidden-size -1 \ + --num-attention-heads -1 \ + --seq-length -1 \ + --max-position-embeddings -1 \ +" + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD="Megatron-DeepSpeed/tasks/eval_harness/evaluate.py \ + --load $CHECKPOINT_PATH \ + --results_path $VARIANT-results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --bf16 \ + --inference \ + --seq-length $SEQ_LEN \ + --task_list 
copa,piqa,rte,winogrande,hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier $MICRO_BS_MULTIPLIER \ + $MEGATRON_REQUIRED_ARGS \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" + diff --git a/finetune_t0_non_causal_decoder.py b/finetune_t0_non_causal_decoder.py index 14650a6e5..13a758a9a 100644 --- a/finetune_t0_non_causal_decoder.py +++ b/finetune_t0_non_causal_decoder.py @@ -33,6 +33,7 @@ def model_provider(pre_process=True, post_process=True): enabled=args.zero_stage == 3, mpu=mpu): if args.deepspeed: + args.pretrain_causal_attention = False model = GPTModelPipe( num_tokentypes=0, parallel_output=True, diff --git a/megatron/arguments.py b/megatron/arguments.py index c18235a78..e6a222002 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -24,7 +24,7 @@ import torch import deepspeed -from megatron.enums import PositionEmbeddingType +from megatron.enums import PositionEmbeddingType, UL2ModelType import megatron from megatron.logging import log_levels @@ -49,6 +49,7 @@ def parse_args(extra_args_provider=None, defaults={}, parser = _add_autoresume_args(parser) parser = _add_biencoder_args(parser) parser = _add_vit_args(parser) + parser = _add_ul2_args(parser) parser = _add_logging_args(parser) parser = _add_zero_args(parser) parser = _add_memoryopt_args(parser) @@ -309,6 +310,17 @@ def parse_args(extra_args_provider=None, defaults={}, "skip train iterations should be specified as two numbers, i.e. 
start-end"
            )
     args.skip_train_iteration_range = skip_train_iteration_range
+
+    args.ul2_model_type = UL2ModelType(args.ul2_model_type)
+    if (
+        args.ul2_model_type is not UL2ModelType.ENCODER_DECODER
+        and args.decoder_seq_length is not None
+    ):
+        print(
+            f'WARNING: `--decoder_seq_length` is ignored when '
+            f'`--ul2-model-type` is not '
+            f'"{UL2ModelType.ENCODER_DECODER.value}"!'
+        )
 
     if args.use_bnb_optimizer:
         try:
@@ -549,6 +561,12 @@ def _add_training_args(parser):
     group.add_argument('--no-bias-dropout-fusion', action='store_false',
                        help='Disable bias and dropout fusion.',
                        dest='bias_dropout_fusion')
+    group.add_argument('--no-layer-norm-fusion', action='store_false',
+                       help='Disable fused layer norm.',
+                       dest='layer_norm_fusion')
+    group.add_argument('--no-optimizer-fusion', action='store_false',
+                       help='Disable FusedAdam/FusedSGD fusion.',
+                       dest='optimizer_fusion')
     group.add_argument('--optimizer', type=str, default='adam',
                        choices=['adam', 'sgd'],
                        help='Optimizer function')
@@ -604,7 +622,7 @@ def _add_learning_rate_args(parser):
                        'and initial warmup, the learing rate at each '
                        'iteration would be different.')
     group.add_argument('--lr-decay-style', type=str, default='linear',
-                       choices=['constant', 'linear', 'cosine'],
+                       choices=['constant', 'linear', 'cosine', 'inverse_sqrt'],
                        help='Learning rate decay function.')
     group.add_argument('--lr-decay-iters', type=int, default=None,
                        help='number of iterations to decay learning rate over,'
@@ -615,6 +633,9 @@
     group.add_argument('--lr-decay-tokens', type=int, default=None,
                        help='number of tokens to decay learning rate over,'
                        ' If not None will override iter/sample-based decay')
+    group.add_argument('--lr-warmup-style', type=str, default='linear',
+                       choices=['constant', 'linear'], help='Learning rate '
+                       'warmup function.')
     group.add_argument('--lr-warmup-fraction', type=float, default=None,
                        help='fraction of lr-warmup-(iters/samples) to use '
                        'for warmup (as a float)')
@@ -643,7 +664,8 @@
                        'from checkpoint and ignore input arguments.')
     group.add_argument('--universal-checkpoint', action='store_true',
                        help='Loading a universal format checkpoint.')
-
+    group.add_argument('--reset-progress', action='store_true', default=None,
+                       help='Reset iteration to 0 & do not load args.')
     return parser
 
 
@@ -1023,6 +1045,42 @@ def _add_vit_args(parser):
     return parser
 
 
+def _add_ul2_args(parser):
+    group = parser.add_argument_group(title="UL2")
+
+    group.add_argument('--ul2-model-type', type=str, default='ED',
+                       choices=['ED', 'ND', 'CD'],
+                       help='What type of model to use for UL2 pretraining. '
+                       'ED = encoder-decoder; ND = non-causal decoder-only; '
+                       'CD = causal decoder-only')
+    group.add_argument('--ul2-denoiser-ratios', nargs='+', type=float,
+                       default=None,
+                       help='Probability of each denoising objective to be '
+                       'selected. Uniform distribution by default.')
+    group.add_argument('--ul2-denoisers', nargs='+', type=str,
+                       default=['R', 'R', 'S', 'X', 'X', 'X', 'X'],
+                       choices=['R', 'S', 'X'],
+                       help='What type of UL2 denoising objective the other '
+                       'UL2 configurations refer to.')
+    group.add_argument('--ul2-mean-span-lengths', nargs='+', type=float,
+                       default=[3, 8, 0.25, 3, 8, 64, 64],
+                       help='Mean length for sampling span lengths. 
' + 'Numbers < 1 indicate a mean length of the sequence ' + 'length times that number.') + group.add_argument('--ul2-mask-ratios', nargs='+', type=float, + default=[0.15, 0.15, 0.25, 0.5, 0.5, 0.15, 0.5], + help='Ratio of masked token in the full sequence.') + group.add_argument('--ul2-r-denoiser-token', type=str, default='[R]', + help='What token to prepend for the UL2 R-denoising ' + 'objective.') + group.add_argument('--ul2-s-denoiser-token', type=str, default='[S]', + help='What token to prepend for the UL2 S-denoising ' + 'objective.') + group.add_argument('--ul2-x-denoiser-token', type=str, default='[X]', + help='What token to prepend for the UL2 X-denoising ' + 'objective.') + + return parser def _add_zero_args(parser): """Text generate arguments.""" diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index dacbec7dc..ebf93a986 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -342,7 +342,7 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True set_checkpoint_version(state_dict.get('checkpoint_version', 0)) # Set iteration. - if args.finetune or release: + if args.finetune or release or args.reset_progress: iteration = 0 else: try: @@ -361,7 +361,7 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True # Check arguments. assert args.consumed_train_samples == 0 assert args.consumed_valid_samples == 0 - if 'args' in state_dict: + if 'args' in state_dict and not args.reset_progress: checkpoint_args = state_dict['args'] if not args.universal_checkpoint: check_checkpoint_args(checkpoint_args) @@ -480,4 +480,4 @@ def _checkpoint_info(): return { "padded_vocab_size": args.padded_vocab_size, "original_vocab_size": tokenizer.vocab_size, - } \ No newline at end of file + } diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 3841e263e..a3ce376b5 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -18,6 +18,8 @@ # https://github.com/google-research/albert/blob/master/create_pretraining_data.py # with some modifications. +import bisect +from enum import Enum import math import os import time @@ -37,8 +39,16 @@ DSET_TYPE_BERT = 'standard_bert' DSET_TYPE_ICT = 'ict' DSET_TYPE_T5 = 't5' +DSET_TYPE_UL2 = 'ul2' -DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, DSET_TYPE_T5] +DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, DSET_TYPE_T5, DSET_TYPE_UL2] + + +class SamplingStyle(Enum): + POISSON = 'poisson' + GEOMETRIC = 'geometric' + UNIFORM = 'uniform' + NORMAL = 'normal' def analyze_data_prefix(data_prefix): @@ -182,6 +192,36 @@ def is_start_piece(piece): # append it to the previous set of word indexes. return not piece.startswith("##") +def get_ngram_indices( + idx, + ngrams, + cand_indexes, + num_to_predict, + num_filtered_tokens, + prefix_lm, +): + if prefix_lm: + # Find first index which is greater than the number of + # predictions. + first_gt_index = bisect.bisect_right( + cand_indexes, + [num_filtered_tokens - num_to_predict], + ) + # Then move one index before to get less than or equal to the + # number of predictions, handling not going below 0. 
+ first_le_index = max(1, first_gt_index) - 1 + + tail_cand_indexes = cand_indexes[first_le_index:] + ngram_index = [ + tail_cand_indexes[i:] + for i in range(len(tail_cand_indexes)) + ] + else: + ngram_index = [] + for n in ngrams: + ngram_index.append(cand_indexes[idx:idx + n]) + return ngram_index + def create_masked_lm_predictions(tokens, vocab_id_list, vocab_id_to_token_dict, @@ -194,15 +234,23 @@ def create_masked_lm_predictions(tokens, favor_longer_ngram=False, do_permutation=False, geometric_dist=False, - masking_style="bert"): + masking_style="bert", + sampling_style=SamplingStyle.POISSON, + prefix_lm=False): """Creates the predictions for the masked LM objective. Note: Tokens here are vocab ids and not text tokens.""" + if not isinstance(sampling_style, SamplingStyle): + sampling_style = SamplingStyle(sampling_style) + # Backward-compatibility + if geometric_dist: + sampling_style = SamplingStyle.GEOMETRIC cand_indexes = [] # Note(mingdachen): We create a list for recording if the piece is # the starting piece of current token, where 1 means true, so that # on-the-fly whole word masking is possible. token_boundary = [0] * len(tokens) + num_filtered_tokens = 0 for (i, token) in enumerate(tokens): if token == cls_id or token == sep_id: @@ -221,6 +269,7 @@ def create_masked_lm_predictions(tokens, cand_indexes.append([i]) if is_start_piece(vocab_id_to_token_dict[token]): token_boundary[i] = 1 + num_filtered_tokens += 1 output_tokens = list(tokens) @@ -231,11 +280,18 @@ def create_masked_lm_predictions(tokens, return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary) - num_to_predict = min(max_predictions_per_seq, - max(1, int(round(len(tokens) * masked_lm_prob)))) + if sampling_style is SamplingStyle.NORMAL: + # First, we get the center of our normal distribution from + # `max_ngrams`. Keeping the meaning of `max_ngrams` this way + # plays nicely with the other probability distributions in terms + # of math. + normal_mean = (max_ngrams + 1) / 2 + # However, we do not want to bound the maximum number of + # n-grams. + max_ngrams = num_filtered_tokens - 1 ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64) - if not geometric_dist: + if sampling_style is SamplingStyle.POISSON: # Note(mingdachen): # By default, we set the probilities to favor shorter ngram sequences. pvals = 1. / np.arange(1, max_ngrams + 1) @@ -243,14 +299,30 @@ def create_masked_lm_predictions(tokens, if favor_longer_ngram: pvals = pvals[::-1] - ngram_indexes = [] - for idx in range(len(cand_indexes)): - ngram_index = [] - for n in ngrams: - ngram_index.append(cand_indexes[idx:idx + n]) - ngram_indexes.append(ngram_index) + if prefix_lm: + # We only do one span searching loop anyway, so this does not + # matter in terms of random search. However, we do want to allow + # sequences greater than the mean ratio. 
+ num_to_predict = max_predictions_per_seq - np_rng.shuffle(ngram_indexes) + ngram_index_indexes = np.array([0]) + else: + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) + + ngram_index_indexes = np.arange(len(cand_indexes)) + np_rng.shuffle(ngram_index_indexes) + + def get_ngram_indices_(idx): + return get_ngram_indices( + idx, + ngrams, + cand_indexes, + num_to_predict, + num_filtered_tokens, + prefix_lm, + ) + ngram_indexes = map(get_ngram_indices_, ngram_index_indexes) (masked_lms, masked_spans) = ([], []) covered_indexes = set() @@ -266,15 +338,25 @@ def create_masked_lm_predictions(tokens, if index in covered_indexes: continue - if not geometric_dist: + if sampling_style is SamplingStyle.POISSON: n = np_rng.choice(ngrams[:len(cand_index_set)], p=pvals[:len(cand_index_set)] / pvals[:len(cand_index_set)].sum(keepdims=True)) - else: + elif sampling_style is SamplingStyle.GEOMETRIC: # Sampling "n" from the geometric distribution and clipping it to # the max_ngrams. Using p=0.2 default from the SpanBERT paper # https://arxiv.org/pdf/1907.10529.pdf (Sec 3.1) n = min(np_rng.geometric(0.2), max_ngrams) + elif sampling_style is SamplingStyle.UNIFORM: + n = np_rng.choice(ngrams[:len(cand_index_set)]) + elif sampling_style is SamplingStyle.NORMAL: + n = round(np.clip( + np_rng.normal(loc=normal_mean), + 1, + len(cand_index_set), + )) + else: + raise ValueError('unknown sampling style') index_set = sum(cand_index_set[n - 1], []) n -= 1 @@ -324,7 +406,8 @@ def create_masked_lm_predictions(tokens, label=[tokens[index] for index in index_set])) assert len(masked_lms) <= num_to_predict - np_rng.shuffle(ngram_indexes) + np_rng.shuffle(ngram_index_indexes) + ngram_indexes = map(get_ngram_indices_, ngram_index_indexes) select_indexes = set() if do_permutation: @@ -522,6 +605,7 @@ def build_dataset(index, name): from megatron.data.bert_dataset import BertDataset from megatron.data.ict_dataset import ICTDataset from megatron.data.t5_dataset import T5Dataset + from megatron.data.ul2_dataset import UL2Dataset dataset = None if splits[index + 1] > splits[index]: # Get the pointer to the original doc-idx so we can set it later. 
@@ -560,6 +644,24 @@ def build_dataset(index, name): short_seq_prob=short_seq_prob, **kwargs ) + elif dataset_type == DSET_TYPE_UL2: + args = get_args() + dataset = UL2Dataset( + indexed_dataset=indexed_dataset, + model_type=args.ul2_model_type, + denoiser_ratios=args.ul2_denoiser_ratios, + denoisers=args.ul2_denoisers, + mean_span_lengths=args.ul2_mean_span_lengths, + mask_ratios=args.ul2_mask_ratios, + denoiser_tokens={ + 'R': args.ul2_r_denoiser_token, + 'S': args.ul2_s_denoiser_token, + 'X': args.ul2_x_denoiser_token, + }, + max_seq_length_dec=max_seq_length_dec, + short_seq_prob=short_seq_prob, + **kwargs, + ) elif dataset_type == DSET_TYPE_BERT: dataset = BertDataset( indexed_dataset=indexed_dataset, diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 0db1aa2fe..534d3bc3c 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -21,12 +21,12 @@ import numpy as np import torch -from megatron import mpu, print_rank_0 +from megatron import get_args, mpu, print_rank_0 from megatron.data.blendable_dataset import BlendableDataset from megatron.data.dataset_utils import get_datasets_weights_and_num_samples from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_ from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset - +from megatron.data.ul2_dataset import UL2Dataset def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, train_valid_test_num_samples, @@ -154,10 +154,35 @@ def build_dataset(name): if splits[1] > splits[0]: documents = np.arange(start=splits[0], stop=splits[1], step=1, dtype=np.int32) - dataset = GPTDataset(name, data_prefix, - documents, indexed_dataset, - train_valid_test_num_samples[index], - seq_length, seed) + + args = get_args() + if args.ul2_model_type: + dataset = UL2Dataset( + name=name, + data_prefix=data_prefix, + num_epochs=None, + max_num_samples=train_valid_test_num_samples[index], + max_seq_length=seq_length, + seed=seed, + indexed_dataset=indexed_dataset, + model_type=args.ul2_model_type, + denoiser_ratios=args.ul2_denoiser_ratios, + denoisers=args.ul2_denoisers, + mean_span_lengths=args.ul2_mean_span_lengths, + mask_ratios=args.ul2_mask_ratios, + denoiser_tokens={ + 'R': args.ul2_r_denoiser_token, + 'S': args.ul2_s_denoiser_token, + 'X': args.ul2_x_denoiser_token, + }, + max_seq_length_dec=seq_length, + short_seq_prob=args.short_seq_prob, + ) + else: + dataset = GPTDataset(name, data_prefix, + documents, indexed_dataset, + train_valid_test_num_samples[index], + seq_length, seed) return dataset dataset = build_dataset(dataset_group_name) diff --git a/megatron/data/indexed_dataset.py b/megatron/data/indexed_dataset.py index d0d312544..403e4b5d2 100644 --- a/megatron/data/indexed_dataset.py +++ b/megatron/data/indexed_dataset.py @@ -513,7 +513,7 @@ def __getstate__(self): return self._path def __setstate__(self, state): - self._do_init(state) + self._do_init(state, True) def _do_init(self, path, skip_warmup): self._path = path diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index 42110b923..f3ac34bf0 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -26,6 +26,27 @@ get_samples_mapping ) + +class LengthExceededError(ValueError): + def __init__(self, msg=None): + if msg is None: + msg = ( + 'The sequence input became too long. ' + 'Try to increase `--seq-length` or `--encoder-seq-length`.' 
+ ) + super().__init__(msg) + + +class DecoderLengthExceededError(ValueError): + def __init__(self, msg=None): + if msg is None: + msg = ( + 'The sequence input for the decoder became too long. ' + 'Try to increase `--decoder-seq-length`.' + ) + super().__init__(msg) + + class T5Dataset(torch.utils.data.Dataset): def __init__(self, name, indexed_dataset, data_prefix, @@ -104,6 +125,8 @@ def build_training_sample(sample, target_seq_length, target_seq_length: Desired sequence length. max_seq_length: Maximum length of the sequence. All values are padded to this length. + max_seq_length_dec: Maximum length of the decoder input sequence. All + values are padded to this length. vocab_id_list: List of vocabulary ids. Used to pick a random id. vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. cls_id: Start of example id. @@ -157,29 +180,30 @@ def build_training_sample(sample, target_seq_length, return train_sample -def pad_and_convert_to_numpy(tokens, masked_positions, - masked_labels, pad_id, - max_seq_length, max_seq_length_dec, - masked_spans=None, bos_id=None, - eos_id=None, sentinel_tokens=None): - """Pad sequences and convert them to numpy.""" - - sentinel_tokens = collections.deque(sentinel_tokens) +def merge_subsequent_masks(tokens, masked_spans=None, bos_id=None, + eos_id=None, sentinel_tokens=None, prefix_lm=False): + if prefix_lm: + assert len(masked_spans) <= 1, \ + 'Received more than one masked span for PrefixLM masking' + else: + sentinel_tokens = collections.deque(sentinel_tokens) t5_input = [] (t5_decoder_in, t5_decoder_out) = ([bos_id], []) (start_index, end_index) = (0, None) for span in masked_spans: - flag = sentinel_tokens.popleft() + if not prefix_lm: + flag = sentinel_tokens.popleft() + # Append the same tokens in decoder input and output + t5_decoder_in.append(flag) + t5_decoder_out.append(flag) - # Append the same tokens in decoder input and output - t5_decoder_in.append(flag) t5_decoder_in.extend(span.label) - t5_decoder_out.append(flag) t5_decoder_out.extend(span.label) end_index = span.index[0] t5_input.extend(tokens[start_index: end_index]) - t5_input.append(flag) + if not prefix_lm: + t5_input.append(flag) # the next start index is the token after the last span token start_index = span.index[-1] + 1 @@ -189,6 +213,18 @@ def pad_and_convert_to_numpy(tokens, masked_positions, # Add the remaining tokens to the t5 input t5_input.extend(tokens[start_index:]) + return t5_input, t5_decoder_in, t5_decoder_out + + +def pad_and_convert_to_numpy(tokens, masked_positions, + masked_labels, pad_id, + max_seq_length, max_seq_length_dec, + masked_spans=None, bos_id=None, + eos_id=None, sentinel_tokens=None, prefix_lm=False): + """Pad sequences and convert them to numpy.""" + + t5_input, t5_decoder_in, t5_decoder_out = merge_subsequent_masks( + tokens, masked_spans, bos_id, eos_id, sentinel_tokens, prefix_lm) # assert (len(t5_input) - len(masked_spans)) + \ # (len(t5_decoder_in) - (len(masked_spans) + 1)) == len(tokens) @@ -198,7 +234,8 @@ def pad_and_convert_to_numpy(tokens, masked_positions, # Encoder-side padding mask. num_tokens = len(t5_input) padding_length = max_seq_length - num_tokens - assert padding_length >= 0 + if padding_length < 0: + raise LengthExceededError() assert len(masked_positions) == len(masked_labels) # Tokens.. @@ -208,7 +245,8 @@ def pad_and_convert_to_numpy(tokens, masked_positions, # Decoder-side padding mask. 
num_tokens_dec = len(t5_decoder_in) padding_length_dec = max_seq_length_dec - num_tokens_dec - assert padding_length_dec >= 0 + if padding_length_dec < 0: + raise DecoderLengthExceededError() filler_dec = [pad_id] * padding_length_dec tokens_dec_in = np.array(t5_decoder_in + filler_dec, dtype=np.int64) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py new file mode 100644 index 000000000..4da73d9e7 --- /dev/null +++ b/megatron/data/ul2_dataset.py @@ -0,0 +1,326 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""UL2-style dataset.""" + +import math +import numpy as np + +from megatron import get_tokenizer +from megatron.data.dataset_utils import ( + create_masked_lm_predictions, + get_samples_mapping, + SamplingStyle +) +from megatron.data.t5_dataset import ( + LengthExceededError, + make_history_mask, + merge_subsequent_masks, + pad_and_convert_to_numpy, + T5Dataset, +) +from megatron.enums import UL2ModelType + + +def is_decoder_only(ul2_model_type): + """Return whether we use a decoder-only model.""" + assert isinstance(ul2_model_type, UL2ModelType) + return ul2_model_type is not UL2ModelType.ENCODER_DECODER + + +def is_prefix_lm(ul2_model_type): + """Return whether we use a non-causal decoder-only model.""" + assert isinstance(ul2_model_type, UL2ModelType) + return ul2_model_type is UL2ModelType.NON_CAUSAL_DECODER + + +class UL2Dataset(T5Dataset): + + def __init__(self, name, indexed_dataset, data_prefix, + num_epochs, max_num_samples, model_type, + denoiser_ratios, denoisers, mean_span_lengths, + mask_ratios, denoiser_tokens, max_seq_length, + max_seq_length_dec, short_seq_prob, seed): + + if denoiser_ratios is None: + # Uniform distribution by default. + denoiser_ratios = [1 / len(denoisers)] * len(denoisers) + + assert ( + len(denoiser_ratios) == len(denoisers) + == len(mean_span_lengths) == len(mask_ratios) + ), ( + 'some UL2 configurations do not correspond to the amount of ' + 'denoising objectives' + ) + + # Params to store. + self.name = name + self.seed = seed + self.masked_lm_prob = short_seq_prob + self.max_seq_length = max_seq_length + self.max_seq_length_dec = max_seq_length_dec + + # Dataset. + self.indexed_dataset = indexed_dataset + + # Build the samples mapping. + self.samples_mapping = get_samples_mapping(self.indexed_dataset, + data_prefix, + num_epochs, + max_num_samples, + self.max_seq_length - 2, # account for added tokens + short_seq_prob, + self.seed, + self.name, + False) + + # Vocab stuff. 
+ tokenizer = get_tokenizer() + self.vocab_id_list = list(tokenizer.inv_vocab.keys()) + self.vocab_id_to_token_dict = tokenizer.inv_vocab + # self.cls_id = tokenizer.cls + self.sep_id = tokenizer.sep + self.mask_id = tokenizer.mask + self.pad_id = tokenizer.pad + self.bos_id = tokenizer.bos_token_id + self.eos_id = tokenizer.eos_token_id + self.sentinel_tokens = tokenizer.additional_special_tokens_ids + assert len(self.sentinel_tokens) > 0, "Provide the argument --vocab-extra-ids 100 to the script" + + # Params to store. + self.model_type = model_type + self.denoiser_ratios = [ + denoiser_ratio / sum(denoiser_ratios) + for denoiser_ratio in denoiser_ratios + ] + self.denoisers = [denoiser.upper() for denoiser in denoisers] + self.mean_span_lengths = mean_span_lengths + self.mask_ratios = mask_ratios + + # Vocab stuff. + tokenizer = get_tokenizer() + # Remove CLS token because we don't need it. + # del self.cls_id + self.cls_ids = { + denoiser: tokenizer.vocab[token] + for (denoiser, token) in denoiser_tokens.items() + } + # cls_token = self.vocab_id_to_token_dict[tokenizer.cls] + # if cls_token not in self.cls_ids: + # self.cls_ids[cls_token] = tokenizer.cls + + # Filter out denoiser tokens. + self.sentinel_tokens = [ + token + for token in tokenizer.additional_special_tokens_ids + if token not in self.cls_ids.values() + ] + assert len(self.sentinel_tokens) > 0, \ + "Provide the argument --vocab-extra-ids 100 to the script" + + def __getitem__(self, idx): + + start_index, end_index, seq_length = self.samples_mapping[idx] + sample = [] + for index in range(start_index, end_index): + sample.append(self.indexed_dataset[index]) + # Note that this rng state should be numpy and not python since + # python randint is inclusive whereas the numpy one is exclusive. + np_rng = np.random.RandomState(seed=(self.seed + idx)) + return build_training_sample(sample, seq_length, + self.max_seq_length, # needed for padding + self.max_seq_length_dec, + self.vocab_id_list, + self.vocab_id_to_token_dict, + self.cls_ids, self.sep_id, + self.mask_id, self.pad_id, + self.model_type, self.denoiser_ratios, + self.denoisers, self.mean_span_lengths, + self.mask_ratios, np_rng, self.bos_id, + self.eos_id, self.sentinel_tokens) + + +def build_training_sample(sample, target_seq_length, + max_seq_length, max_seq_length_dec, + vocab_id_list, vocab_id_to_token_dict, + cls_ids, sep_id, mask_id, pad_id, + model_type, denoiser_ratios, + denoisers, mean_span_lengths, + mask_ratios, np_rng, + bos_id=None, eos_id=None, + sentinel_tokens=None): + """Build training sample. + Arguments: + sample: A list of sentences in which each sentence is a list token ids. + target_seq_length: Desired sequence length. + max_seq_length: Maximum length of the sequence. All values are padded to + this length. + max_seq_length_dec: Maximum length of the decoder input sequence. All + values are padded to this length. + vocab_id_list: List of vocabulary ids. Used to pick a random id. + vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. + cls_ids: Start of example ids. + sep_id: Separator id. + mask_id: Mask token id. + pad_id: Padding token id. + model_type: What type of model is used. + denoiser_ratios: Probability of each denoising objective to be selected. + denoisers: What type of UL2 denoising objective the other UL2 + configurations refer to. + mean_span_lengths: Mean length for sampling span lengths. Numbers < 1 + indicate a mean length of the sequence length times that number. 
+        mask_ratios: Ratio of masked tokens in the full sequence.
+        np_rng: Random number generator. Note that this rng state should be
+            numpy and not python since python randint is inclusive for
+            the upper bound whereas the numpy one is exclusive.
+        bos_id: start of decoder example id
+        eos_id: end of generation id
+        sentinel_tokens: unique value to be substituted for every replaced span
+    """
+
+    # Denoiser selection
+    denoiser_index = np_rng.choice(np.arange(len(denoisers)), p=denoiser_ratios)
+    denoiser = denoisers[denoiser_index]
+    masked_lm_prob = mask_ratios[denoiser_index]
+
+    assert target_seq_length <= max_seq_length
+
+    # flatten sentences into one list
+    tokens = [token for sentence in sample for token in sentence]
+
+    max_num_tokens = target_seq_length
+    if is_decoder_only(model_type):
+        # # Keep space for repeated `extra_id` tokens; not the most data
+        # # efficient since we calculate this based on the maximum number
+        # # of possible `extra_id` tokens.
+        safe_max_seq_len = math.floor(max_num_tokens / (1 + masked_lm_prob))
+        truncated = len(tokens) > safe_max_seq_len
+        tokens = tokens[:safe_max_seq_len]
+    else:
+        # Truncate to `target_sequence_length`.
+        truncated = len(tokens) > max_num_tokens
+        tokens = tokens[:max_num_tokens]
+
+    # Prepend objective token.
+    cls_id = cls_ids.get(denoiser)
+    if cls_id is None:
+        raise ValueError('unknown denoiser')
+    tokens = [cls_id] + tokens
+
+    # Masking.
+    mean_ngrams = mean_span_lengths[denoiser_index]
+    if mean_ngrams < 1:
+        # Ensure we always obtain at least one `max_ngrams`.
+        mean_ngrams = max(1, round(len(tokens) * mean_ngrams))
+    max_ngrams = mean_ngrams * 2 - 1
+
+    if denoiser == 'R' or denoiser == 'X':
+        sampling_style = SamplingStyle.NORMAL
+        prefix_lm = False
+        max_predictions_per_seq = len(tokens) - 1
+    elif denoiser == 'S':
+        sampling_style = SamplingStyle.UNIFORM
+        prefix_lm = True
+        max_predictions_per_seq = min(
+            round(masked_lm_prob * len(tokens)) * 2 - 1,
+            len(tokens) - 1,
+        )
+    else:
+        raise ValueError('unknown denoiser')
+    # Ensure we always have at least one prediction.
+    max_predictions_per_seq = max(1, max_predictions_per_seq)
+    (
+        tokens, masked_positions, masked_labels, _, masked_spans,
+    ) = create_masked_lm_predictions(
+        tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob,
+        cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng,
+        max_ngrams=max_ngrams, masking_style="t5",
+        sampling_style=sampling_style, prefix_lm=prefix_lm,
+    )
+
+    if is_decoder_only(model_type):
+        # Concatenate to one sequence.
+        tokens_enc, tokens_dec_in, labels = merge_subsequent_masks(
+            tokens, masked_spans, bos_id, eos_id, sentinel_tokens, prefix_lm)
+
+        # Move EOS tokens to end of sequence.
+        while tokens_enc[-1] == eos_id:
+            del tokens_enc[-1]
+            tokens_dec_in.append(eos_id)
+            labels.append(eos_id)
+
+        num_labels = len(labels)
+
+        # Move BOS token to start of sequence.
+        tokens_dec_in = tokens_dec_in[1:]
+        tokens = (
+            [bos_id]
+            + tokens_enc
+            + [sep_id]
+            + tokens_dec_in
+        )
+
+        # Pad and convert to NumPy. 
+ padding_length = max_seq_length - len(tokens) + if padding_length < 0: + raise LengthExceededError() + filler = [pad_id] * padding_length + + tokens = np.array(tokens + filler, dtype=np.int64) + labels = np.array(( + tokens_enc + + [sep_id] + + labels + + filler + ), dtype=np.int64) + + loss_mask = np.zeros(len(tokens), dtype=np.int64) + labels_start_neg_index = -(num_labels + padding_length) + labels_end_neg_index = -padding_length if padding_length > 0 else None + loss_mask[labels_start_neg_index:labels_end_neg_index] = 1 + + dec_mask = make_history_mask(tokens) + if is_prefix_lm(model_type): + dec_mask[:labels_start_neg_index, :labels_start_neg_index] = 1 + + train_sample = { + 'text': tokens, + 'labels': labels, + 'loss_mask': loss_mask, + 'truncated': int(truncated), + 'dec_mask': dec_mask, + } + else: + # Padding. + tokens_enc, tokens_dec_in, labels, enc_mask, \ + dec_mask, enc_dec_mask, loss_mask \ + = pad_and_convert_to_numpy(tokens, masked_positions, + masked_labels, pad_id, max_seq_length, + max_seq_length_dec, masked_spans, + bos_id, eos_id, sentinel_tokens, + prefix_lm) + + train_sample = { + 'text_enc': tokens_enc, + 'text_dec': tokens_dec_in, + 'labels': labels, + 'loss_mask': loss_mask, + 'truncated': int(truncated), + 'enc_mask': enc_mask, + 'dec_mask': dec_mask, + 'enc_dec_mask': enc_dec_mask, + } + return train_sample diff --git a/megatron/enums.py b/megatron/enums.py index 90d00a071..2961cbb66 100644 --- a/megatron/enums.py +++ b/megatron/enums.py @@ -33,3 +33,8 @@ class PositionEmbeddingType(enum.Enum): rotary = 1 absolute = 2 alibi = 3 + +class UL2ModelType(enum.Enum): + ENCODER_DECODER = 'ED' + NON_CAUSAL_DECODER = 'ND' + CAUSAL_DECODER = 'CD' diff --git a/megatron/fused_kernels/__init__.py b/megatron/fused_kernels/__init__.py index e2ac2567b..bdc654c39 100644 --- a/megatron/fused_kernels/__init__.py +++ b/megatron/fused_kernels/__init__.py @@ -17,81 +17,97 @@ import pathlib import subprocess +import torch from torch.utils import cpp_extension +# Setting this param to a list has a problem of generating different +# compilation commands (with diferent order of architectures) and +# leading to recompilation of fused kernels. Set it to empty string +# to avoid recompilation and assign arch flags explicity in +# extra_cuda_cflags below +os.environ["TORCH_CUDA_ARCH_LIST"] = "" + def load(args): - # Setting this param to a list has a problem of generating different - # compilation commands (with diferent order of architectures) and - # leading to recompilation of fused kernels. 
Set it to empty string - # to avoid recompilation and assign arch flags explicity in - # extra_cuda_cflags below - # - # but if a user wants to set an explicit list of archs to compile to, then let that list - # through: - arch_list = os.environ.get('TORCH_CUDA_ARCH_LIST', None) - if arch_list is None: - os.environ["TORCH_CUDA_ARCH_LIST"] = "" - - # # Check if cuda 11 is installed for compute capability 8.0 - # cc_flag = [] - # _, bare_metal_major, _ = _get_cuda_bare_metal_version( - # cpp_extension.CUDA_HOME) - # if int(bare_metal_major) >= 11: - # cc_flag.append('-gencode') - # cc_flag.append('arch=compute_80,code=sm_80') + # Check if cuda 11 is installed for compute capability 8.0 + cc_flag = [] + if torch.version.hip is None: + _, bare_metal_major, _ = _get_cuda_bare_metal_version( + cpp_extension.CUDA_HOME) + if int(bare_metal_major) >= 11: + cc_flag.append('-gencode') + cc_flag.append('arch=compute_80,code=sm_80') # Build path srcpath = pathlib.Path(__file__).parent.absolute() buildpath = srcpath / 'build' - buildpath.mkdir(parents=True, exist_ok=True) + _create_build_dir(buildpath) # Helper function to build the kernels. - def _cpp_extention_load_helper(name, sources, extra_cuda_flags): + def _cpp_extention_load_helper(name, sources, extra_cuda_flags, extra_include_paths): + if torch.version.hip is not None: + extra_cuda_cflags=['-O3'] + extra_cuda_flags + cc_flag + else: + extra_cuda_cflags=['-O3', + '-gencode', 'arch=compute_70,code=sm_70', + '--use_fast_math'] + extra_cuda_flags + cc_flag + return cpp_extension.load( name=name, sources=sources, build_directory=buildpath, extra_cflags=['-O3',], - extra_cuda_cflags=['-O3', - '--use_fast_math'] + extra_cuda_flags, + extra_cuda_cflags=extra_cuda_cflags, + extra_include_paths=extra_include_paths, verbose=(args.rank == 0) ) - # '-gencode', 'arch=compute_70,code=sm_70', # ============== # Fused softmax. # ============== + if torch.version.hip is not None: + extra_include_paths=[os.path.abspath(srcpath)] + else: + extra_include_paths=[] + if args.masked_softmax_fusion: - extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '--expt-relaxed-constexpr', - '--expt-extended-lambda'] + if torch.version.hip is not None: + extra_cuda_flags = ['-D__HIP_NO_HALF_OPERATORS__=1', + '-D__HIP_NO_HALF_CONVERSIONS__=1'] + else: + extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__', + '--expt-relaxed-constexpr', + '--expt-extended-lambda'] # Upper triangular softmax. sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp', srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu'] scaled_upper_triang_masked_softmax_cuda = _cpp_extention_load_helper( "scaled_upper_triang_masked_softmax_cuda", - sources, extra_cuda_flags) + sources, extra_cuda_flags, extra_include_paths) # Masked softmax. sources=[srcpath / 'scaled_masked_softmax.cpp', srcpath / 'scaled_masked_softmax_cuda.cu'] scaled_masked_softmax_cuda = _cpp_extention_load_helper( - "scaled_masked_softmax_cuda", sources, extra_cuda_flags) + "scaled_masked_softmax_cuda", sources, extra_cuda_flags, extra_include_paths) # ================================= # Mixed precision fused layer norm. 
# ================================= - extra_cuda_flags = ['-maxrregcount=50'] + if torch.version.hip is not None: + extra_cuda_flags = [] + else: + extra_cuda_flags = ['-maxrregcount=50'] + sources=[srcpath / 'layer_norm_cuda.cpp', srcpath / 'layer_norm_cuda_kernel.cu'] fused_mix_prec_layer_norm_cuda = _cpp_extention_load_helper( - "fused_mix_prec_layer_norm_cuda", sources, extra_cuda_flags) + "fused_mix_prec_layer_norm_cuda", sources, extra_cuda_flags, extra_include_paths) def _get_cuda_bare_metal_version(cuda_dir): diff --git a/megatron/fused_kernels/layer_norm_cuda_kernel.cu b/megatron/fused_kernels/layer_norm_cuda_kernel.cu index 28a579e1a..aae0c993c 100644 --- a/megatron/fused_kernels/layer_norm_cuda_kernel.cu +++ b/megatron/fused_kernels/layer_norm_cuda_kernel.cu @@ -76,7 +76,8 @@ void cuWelfordMuSigma2( const int i1, U& mu, U& sigma2, - U* buf) + U* buf, + const int GPU_WARP_SIZE) { // Assumptions: // 1) blockDim.x == warpSize @@ -106,12 +107,11 @@ void cuWelfordMuSigma2( cuWelfordOnlineSum(curr,mu,sigma2,count); } // intra-warp reductions - for (int l = 0; l <= 4; ++l) { - int srcLaneB = (threadIdx.x+(1<(muB,sigma2B,countB,mu,sigma2,count); + for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) { + U sigma2B = WARP_SHFL_DOWN(sigma2, stride); + U muB = WARP_SHFL_DOWN(mu, stride); + U countB = WARP_SHFL_DOWN(count, stride); + cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); } // threadIdx.x == 0 has correct values for each warp // inter-warp reductions @@ -160,7 +160,8 @@ void cuWelfordMuSigma2( const int i1, float& mu, float& sigma2, - float* buf) + float* buf, + const int GPU_WARP_SIZE) { // Assumptions: // 1) blockDim.x == warpSize @@ -201,12 +202,11 @@ void cuWelfordMuSigma2( cuWelfordOnlineSum(curr,mu,sigma2,count); } // intra-warp reductions - for (int l = 0; l <= 4; ++l) { - int srcLaneB = (threadIdx.x+(1< 0; stride /= 2) { + float sigma2B = WARP_SHFL_DOWN(sigma2, stride); + float muB = WARP_SHFL_DOWN(mu, stride); + float countB = WARP_SHFL_DOWN(count, stride); + cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); } // threadIdx.x == 0 has correct values for each warp // inter-warp reductions @@ -246,14 +246,25 @@ void cuWelfordMuSigma2( } } } - +#ifndef __HIP_PLATFORM_HCC__ template U rsqrt(U v) { +#else +template __device__ U rsqrt(U v) { +#endif return U(1) / sqrt(v); } +#ifndef __HIP_PLATFORM_HCC__ template<> float rsqrt(float v) { +#else +template<> __device__ float rsqrt(float v) { +#endif return rsqrtf(v); } +#ifndef __HIP_PLATFORM_HCC__ template<> double rsqrt(double v) { +#else +template<> __device__ double rsqrt(double v) { +#endif return rsqrt(v); } @@ -297,18 +308,23 @@ void cuApplyLayerNorm( const int n2, const U epsilon, const V* __restrict__ gamma, - const V* __restrict__ beta + const V* __restrict__ beta, + const int GPU_WARP_SIZE ) { // Assumptions: // 1) blockDim.x == warpSize // 2) Tensors are contiguous // +#ifndef __HIP_PLATFORM_HCC__ for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { +#else + for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { +#endif SharedMemory shared; U* buf = shared.getPointer(); U mu,sigma2; - cuWelfordMuSigma2(vals,n1,n2,i1,mu,sigma2,buf); + cuWelfordMuSigma2(vals,n1,n2,i1,mu,sigma2,buf,GPU_WARP_SIZE); const T* lvals = vals + i1*n2; V* ovals = output_vals + i1*n2; U c_invvar = rsqrt(sigma2 + epsilon); @@ -543,7 +559,11 @@ void cuComputeGradInput( const V* gamma, T* grad_input) { +#ifndef __HIP_PLATFORM_HCC__ for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { +#else + for (int i1=blockIdx.y; i1 < n1; i1 
+= gridDim.y) { +#endif U sum_loss1 = U(0); U sum_loss2 = U(0); const U c_mean = mean[i1]; @@ -667,7 +687,11 @@ void HostApplyLayerNorm( ) { auto stream = at::cuda::getCurrentCUDAStream().stream(); - const dim3 threads(32,4,1); + const int warp_size = at::cuda::warp_size(); + dim3 threads(warp_size,4,1); +#ifndef __HIP_PLATFORM_HCC__ + threads.y = 1; +#endif const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1); @@ -682,7 +706,9 @@ void HostApplyLayerNorm( input, n1,n2, U(epsilon), - gamma,beta); + gamma, + beta, + warp_size); } @@ -735,11 +761,16 @@ void HostLayerNormGradient( ) { auto stream = at::cuda::getCurrentCUDAStream().stream(); + const int warp_size = at::cuda::warp_size(); if (gamma != NULL && beta != NULL) { // compute grad_gamma(j) and grad_beta(j) +#ifndef __HIP_PLATFORM_HCC__ + const int part_size = warp_size; +#else const int part_size = 16; - const dim3 threads2(32,4,1); +#endif + const dim3 threads2(warp_size,4,1); const dim3 blocks2((n2+threads2.x-1)/threads2.x,part_size,1); const int nshared2_a = 2 * sizeof(U) * threads2.y * threads2.y * (threads2.x + 1); @@ -758,7 +789,7 @@ void HostLayerNormGradient( part_grad_gamma.DATA_PTR(), part_grad_beta.DATA_PTR()); - const dim3 threads3(32,8,1); + const dim3 threads3(warp_size,8,1); const dim3 blocks3((n2+threads2.x-1)/threads2.x,1,1); const int nshared3 = threads3.x * threads3.y * sizeof(U); cuComputeGradGammaBeta<<>>( @@ -774,7 +805,10 @@ void HostLayerNormGradient( const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; const dim3 blocks1(1, std::min((uint64_t)n1, maxGridY), 1); - const dim3 threads1(32,4,1); + dim3 threads1(warp_size,4,1); +#ifndef __HIP_PLATFORM_HCC__ + threads1.y = 2; +#endif int nshared = threads1.y > 1 ? 
threads1.y*threads1.x*sizeof(U) : diff --git a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu index 2efee39a6..0c068c7cb 100644 --- a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu +++ b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu @@ -18,7 +18,9 @@ #include #include #include +#ifndef __HIP_PLATFORM_HCC__ #include +#endif #include #include #include "scaled_masked_softmax.h" diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h b/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h index 6df83fc10..4e6a467ce 100644 --- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h +++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h @@ -17,7 +17,8 @@ #pragma once #include -#include +#include +// #include #include #include #include @@ -340,7 +341,7 @@ void dispatch_scaled_upper_triang_masked_softmax_forward( int softmax_elements_stride, int attn_batches) { - TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 2048 ); + TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 8192 ); if (softmax_elements == 0) { return; } else { @@ -415,6 +416,14 @@ void dispatch_scaled_upper_triang_masked_softmax_forward( scaled_upper_triang_masked_softmax_warp_forward <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); break; + case 12: // 4096 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 13: // 8192 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; default: break; } @@ -431,7 +440,7 @@ void dispatch_scaled_upper_triang_masked_softmax_backward( int softmax_elements_stride, int attn_batches) { - TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 2048 ); + TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 8192 ); if (softmax_elements == 0) { return; } else { @@ -506,6 +515,14 @@ void dispatch_scaled_upper_triang_masked_softmax_backward( scaled_upper_triang_masked_softmax_warp_backward <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); break; + case 12: // 4096 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 13: // 8192 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; default: break; } diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu index 5efc3d412..4aa9a702a 100644 --- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu +++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu @@ -18,7 +18,9 @@ #include #include #include +#ifndef __HIP_PLATFORM_HCC__ #include +#endif #include #include #include "scaled_upper_triang_masked_softmax.h" @@ -35,7 +37,7 @@ torch::Tensor fwd_cuda( // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] const int attn_batches = input.size(0); const int seq_len = input.size(1); - TORCH_INTERNAL_ASSERT(seq_len <= 2048); + TORCH_INTERNAL_ASSERT(seq_len <= 8192); // Output auto act_options = input.options().requires_grad(false); diff --git a/megatron/fused_kernels/tests/test_fused_kernels.py 
b/megatron/fused_kernels/tests/test_fused_kernels.py index f8d5027a1..3e7deb913 100644 --- a/megatron/fused_kernels/tests/test_fused_kernels.py +++ b/megatron/fused_kernels/tests/test_fused_kernels.py @@ -3,6 +3,10 @@ import torch from torch.nn import LayerNorm +import sys +# add to path +sys.path.append("/home/nouamane/projects/Megatron-DeepSpeed/") +import megatron from megatron.model.enums import AttnMaskType from megatron.model.fused_layer_norm import MixedFusedLayerNorm from megatron.model.fused_softmax import FusedScaleMaskSoftmax diff --git a/megatron/initialize.py b/megatron/initialize.py index b967a228a..2e92b15af 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -194,7 +194,7 @@ def _compile_dependencies(): args.micro_batch_size # Constraints on sequence length and attn_batch_size to enable warp based # optimization and upper triangular optimization (for causal mask) - custom_kernel_constraint = seq_len > 16 and seq_len <=2048 and \ + custom_kernel_constraint = seq_len > 16 and seq_len <=8192 and \ seq_len % 4 == 0 and attn_batch_size % 4 == 0 # Print a warning. if not ((args.fp16 or args.bf16) and diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index ae1fcdb2b..c886b9814 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -23,7 +23,7 @@ class AnnealingLR(object): """Anneals the learning rate.""" def __init__(self, optimizer, max_lr, min_lr, - warmup_steps, decay_steps, decay_style, + warmup_steps, decay_steps, decay_style, warmup_style, use_checkpoint_lr_scheduler=True, override_lr_scheduler=False): args = get_args() @@ -46,6 +46,7 @@ def __init__(self, optimizer, max_lr, min_lr, self.warmup_tokens = 0 self.decay_style = decay_style + self.warmup_style = warmup_style self.override_lr_scheduler = override_lr_scheduler self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler @@ -63,18 +64,33 @@ def get_lr(self): """Learning rate decay functions from: https://openreview.net/pdf?id=BJYwwY9ll pg. 4""" - # Use linear warmup for the initial part. + + # Use warmup for the initial part. if self.warmup_steps > 0 and self.num_steps <= self.warmup_steps: if self.num_steps == self.warmup_steps and \ self.decay_tokens is not None: self.warmup_tokens = self.num_tokens - return self.max_lr * float(self.num_steps) / \ - float(self.warmup_steps) + if self.warmup_style == 'linear': + return self.max_lr * float(self.num_steps) / \ + float(self.warmup_steps) + elif self.warmup_style == 'constant': + return self.max_lr + else: + raise ValueError('Unknown warmup style: {}'.format( + self.warmup_style)) # If the learning rate is constant, just return the initial value. 
if self.decay_style == 'constant':
             return self.max_lr
+
+        # If inverse_sqrt decay style
+        # In warmup phase: lr = max_lr
+        # In decay phase: lr = max_lr * sqrt(warmup_steps) / sqrt(num_steps)
+        # Note: To replicate t5x check https://github.com/TurkuNLP/Megatron-DeepSpeed/pull/2
+        if self.decay_style == 'inverse_sqrt':
+            return self.max_lr * (max(self.warmup_steps, 1) / max(self.num_steps, 1))**0.5
+
         if self.decay_tokens is None:
             # step-based decay
diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py
index 55e9c9dd8..7b1d7eaa7 100644
--- a/megatron/model/fused_layer_norm.py
+++ b/megatron/model/fused_layer_norm.py
@@ -86,10 +86,13 @@ def __init__(self, normalized_shape, eps=1e-5):
         args = get_args()
         self.layernorm_tp_auto_sync = args.sync_tp_duplicated_parameters
 
-        self.use_meg_ds_fused_layer_norm = (
-            args.bf16 # Current Meg-DS cuda kernel has better throughput than torch.nn.LayerNorm
-            or version.parse(torch.__version__) >= version.parse("1.11.0") # https://github.com/pytorch/pytorch/pull/66920
-        )
+        if not args.layer_norm_fusion:
+            self.use_meg_ds_fused_layer_norm = False
+        else:
+            self.use_meg_ds_fused_layer_norm = (
+                args.bf16 # Current Meg-DS cuda kernel has better throughput than torch.nn.LayerNorm
+                or version.parse(torch.__version__) >= version.parse("1.11.0") # https://github.com/pytorch/pytorch/pull/66920
+            )
 
     def reset_parameters(self):
diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py
index 07192e2bf..ccdfbaaef 100644
--- a/megatron/model/fused_softmax.py
+++ b/megatron/model/fused_softmax.py
@@ -167,11 +167,11 @@ def is_kernel_available(self, mask, b, np, sq, sk):
         if (
             self.scaled_masked_softmax_fusion  # user want to fuse
             and self.input_in_float16  # input must be fp16
-            and 16 < sk <= 4096  # sk must be 16 ~ 4096
+            and 16 < sk <= 8192  # sk must be 16 ~ 8192
             and sq % 4 == 0  # sq must be divisor of 4
             and attn_batches % 4 == 0  # np * b must be divisor of 4
         ):
-            if 0 <= sk <= 4096:
+            if 0 <= sk <= 8192:
                 batch_per_block = self.get_batch_per_block(sq, sk, b, np)
 
                 if self.attn_mask_type == AttnMaskType.causal:
@@ -214,7 +214,7 @@ def forward_torch_softmax(self, input, mask):
         if self.scale is not None:
             input = input * self.scale
 
-        if self.attn_mask_type == AttnMaskType.causal:
+        if self.attn_mask_type == AttnMaskType.causal and mask is None:
             assert mask is None
             assert input.shape[2] == input.shape[3]
             mask = self.get_causal_mask(input.shape[2])
diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py
index 738717d55..b492abfb4 100644
--- a/megatron/optimizer/__init__.py
+++ b/megatron/optimizer/__init__.py
@@ -13,8 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from apex.optimizers import FusedAdam as Adam -from apex.optimizers import FusedSGD as SGD +from torch.optim import AdamW +from torch.optim import SGD +from apex.optimizers import FusedAdam +from apex.optimizers import FusedSGD from megatron import get_args from megatron.model.fused_layer_norm import MixedFusedLayerNorm as LayerNorm @@ -72,18 +74,24 @@ def get_megatron_optimizer(model): if args.use_bnb_optimizer: import bitsandbytes as bnb adam_optimizer = bnb.optim.Adam8bit + elif args.optimizer_fusion: + adam_optimizer = FusedAdam else: - adam_optimizer = Adam + adam_optimizer = AdamW optimizer = adam_optimizer(param_groups, lr=args.lr, weight_decay=args.weight_decay, betas=(args.adam_beta1, args.adam_beta2), eps=args.adam_eps) elif args.optimizer == 'sgd': - optimizer = SGD(param_groups, - lr=args.lr, - weight_decay=args.weight_decay, - momentum=args.sgd_momentum) + if args.optimizer_fusion: + sgd_optimizer = FusedSGD + else: + sgd_optimizer = SGD + optimizer = sgd_optimizer(param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + momentum=args.sgd_momentum) else: raise Exception('{} optimizer is not supported.'.format( args.optimizer)) diff --git a/megatron/training.py b/megatron/training.py index bd00bc77e..d86c51440 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -361,6 +361,7 @@ def get_learning_rate_scheduler(optimizer): warmup_steps=warmup_steps, decay_steps=decay_steps, decay_style=args.lr_decay_style, + warmup_style=args.lr_warmup_style, use_checkpoint_lr_scheduler=args.use_checkpoint_lr_scheduler, override_lr_scheduler=args.override_lr_scheduler) diff --git a/pretrain_ul2.py b/pretrain_ul2.py new file mode 100644 index 000000000..89c936666 --- /dev/null +++ b/pretrain_ul2.py @@ -0,0 +1,299 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Pretrain UL2""" + +from functools import partial + +import deepspeed +import torch + +from megatron import ( + get_args, + get_timers, + mpu, + print_rank_0 +) +from megatron.data.dataset_utils import build_train_valid_test_datasets +from megatron.data.gpt_dataset import build_dataset_group +from megatron.data.ul2_dataset import ( + is_decoder_only as _is_decoder_only, + is_prefix_lm as _is_prefix_lm, +) +from megatron.enums import AttnMaskType +from megatron.model.gpt_model import GPTModel, GPTModelPipe +from megatron.model.t5_model import T5Model, t5_position_ids +from megatron.training import pretrain +from megatron.utils import average_losses_across_data_parallel_group + + +def is_decoder_only(): + """Return whether we use a decoder-only model.""" + args = get_args() + return _is_decoder_only(args.ul2_model_type) + + +def is_prefix_lm(): + """Return whether we use a non-causal decoder-only model.""" + args = get_args() + return _is_prefix_lm(args.ul2_model_type) + + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + assert pre_process and post_process, "UL2 doesn't yet support pipelining" + + print_rank_0('building UL2 model ...') + args = get_args() + + with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(), + remote_device=None if args.remote_device == 'none' else args.remote_device, + config_dict_or_path=args.deepspeed_config, + enabled=args.zero_stage == 3, + mpu=mpu): + if args.deepspeed and is_decoder_only(): + if is_prefix_lm(): + args.pretrain_causal_attention = False + print_rank_0('Using prefix LM UL2 model.') + model = GPTModelPipe( + num_tokentypes=0, + parallel_output=True, + attn_mask_type=AttnMaskType.custom + ) + model._megatron_batch_fn = get_batch_pipe + else: + args.pretrain_causal_attention = True + print_rank_0('Using decoder-only causal UL2 model.') + model = GPTModelPipe( + num_tokentypes=0, + parallel_output=True, + attn_mask_type=AttnMaskType.causal + ) + # This is a hack to give us a reference to get_batch_pipe from within training.py + # We need to call model.set_batch_fn after deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe + elif is_decoder_only(): + print_rank_0('Using decoder-only UL2 model.') + model = GPTModel( + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + prefix_lm=is_prefix_lm(), + ) + else: + print_rank_0('Using encoder-decoder UL2 model.') + model = T5Model(num_tokentypes=0, parallel_output=True) + return model + +from megatron.global_vars import get_tokenizer +def visualize_model_inputs(tokens, attention_mask, labels, loss_mask): + print("SHAPES", tokens.shape, attention_mask.shape, labels.shape, loss_mask.shape) + tok = get_tokenizer() + + print("TOKENS:", tok.detokenize(tokens[0, :].cpu().numpy().tolist())) + print("LABELS:", tok.detokenize(labels[0, :].cpu().numpy().tolist())) + + print("ATTN:", attention_mask[:100]) + print("LOSSMSK:", loss_mask[:100]) + +def get_batch_pipe(data): + """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" + if is_decoder_only(): + keys = ['text', 'labels', 'loss_mask', 'dec_mask'] + else: + keys = ['text_enc', 'text_dec', 'labels', 'loss_mask', + 'enc_mask', 'dec_mask', 'enc_dec_mask'] + datatype = torch.int64 + + data_b = mpu.broadcast_data(keys, data, datatype) + + + # print( + # visualize_model_inputs( + # data_b['text'], + # data_b['dec_mask'], + # data_b['labels'], + # data_b['loss_mask'], + # ) + # ) + + tokens = data_b['text'].long() + 
+    labels = data_b['labels'].long()
+    loss_mask = data_b['loss_mask'].float()
+
+    dec_mask = (data_b['dec_mask'] < 0.5)
+    dec_mask = dec_mask.unsqueeze(1)
+
+    position_ids = t5_position_ids(tokens)
+
+    return (tokens, position_ids, dec_mask), (labels, loss_mask)
+
+
+def get_batch(data_iterator):
+    """Build the batch."""
+
+    if is_decoder_only():
+        keys = ['text', 'labels', 'loss_mask', 'dec_mask']
+    else:
+        keys = ['text_enc', 'text_dec', 'labels', 'loss_mask',
+                'enc_mask', 'dec_mask', 'enc_dec_mask']
+    datatype = torch.int64
+
+    # Broadcast data.
+    if data_iterator is not None:
+        data = next(data_iterator)
+    else:
+        data = None
+    data_b = mpu.broadcast_data(keys, data, datatype)
+
+    # Debug helper only: visualize_model_inputs prints its inputs and returns
+    # None, and data_b['text'] only exists for the decoder-only keys, so keep
+    # this disabled by default (mirroring get_batch_pipe above).
+    # visualize_model_inputs(
+    #     data_b['text'],
+    #     data_b['dec_mask'],
+    #     data_b['labels'],
+    #     data_b['loss_mask'],
+    # )
+
+    # Unpack.
+    if is_decoder_only():
+        tokens = data_b['text'].long()
+        labels = data_b['labels'].long()
+        loss_mask = data_b['loss_mask'].float()
+
+        dec_mask = (data_b['dec_mask'] < 0.5)
+        dec_mask = dec_mask.unsqueeze(1)
+        return tokens, loss_mask, labels, dec_mask
+    else:
+        tokens_enc = data_b['text_enc'].long()
+        tokens_dec = data_b['text_dec'].long()
+        labels = data_b['labels'].long()
+        loss_mask = data_b['loss_mask'].float()
+
+        enc_mask = (data_b['enc_mask'] < 0.5)
+        dec_mask = (data_b['dec_mask'] < 0.5)
+        enc_dec_mask = (data_b['enc_dec_mask'] < 0.5)
+
+        return tokens_enc, tokens_dec, loss_mask, labels, \
+               enc_mask, dec_mask, enc_dec_mask
+
+
+def loss_func(loss_mask, output_tensor):
+    if is_decoder_only():
+        lm_loss_ = output_tensor
+    else:
+        lm_loss_, _ = output_tensor
+
+    lm_loss_ = lm_loss_.float()
+    lm_loss = torch.sum(
+        lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum()
+
+    loss = lm_loss
+    averaged_losses = average_losses_across_data_parallel_group([lm_loss])
+
+    return loss, {'lm loss': averaged_losses[0]}
+
+
+def forward_step(data_iterator, model):
+    """Forward step."""
+    args = get_args()
+    timers = get_timers()
+
+    # Get the batch.
+ timers('batch generator').start() + if is_decoder_only(): + (tokens, loss_mask, lm_labels, dec_mask) = get_batch(data_iterator) + else: + ( + tokens_enc, tokens_dec, loss_mask, lm_labels, + enc_mask, dec_mask, enc_dec_mask, + ) = get_batch(data_iterator) + timers('batch generator').stop() + + # Forward model lm_labels + if is_decoder_only(): + position_ids = t5_position_ids(tokens) + output_tensor = model(tokens, position_ids, dec_mask, + labels=lm_labels) + else: + output_tensor = model(tokens_enc, + tokens_dec, + enc_mask, + dec_mask, + enc_dec_mask, + tokentype_ids=None, + lm_labels=lm_labels) + + return output_tensor, partial(loss_func, loss_mask) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + train_ds, valid_ds, test_ds = None, None, None + + print_rank_0('> building train, validation, and test datasets ' + 'for UL2 ...') + if args.data_path: + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + max_seq_length=args.encoder_seq_length, + max_seq_length_dec=args.decoder_seq_length, + masked_lm_prob=args.mask_prob, + short_seq_prob=args.short_seq_prob, + seed=args.seed, + skip_warmup=(not args.mmap_warmup), + dataset_type='ul2') + elif args.train_weighted_split_paths: + assigned_train_valid_test = [] + if args.train_weighted_split_paths is not None: + train_ds = [] + assigned_train_valid_test.append("train") + if args.valid_weighted_split_paths is not None: + valid_ds = [] + assigned_train_valid_test.append("valid") + if args.test_weighted_split_paths is not None: + test_ds = [] + assigned_train_valid_test.append("test") + + for s in assigned_train_valid_test: + data_groups = zip(eval(f"args.{s}_weighted_split_paths"), + eval(f"args.{s}_weighted_split_weights"), + eval(f"args.{s}_weighted_split_splits"), + eval(f"args.{s}_weighted_split_names")) + for paths, weights, splits, name in data_groups: + d = build_dataset_group(name, paths, weights, splits, + args.data_impl, + train_val_test_num_samples, + args.seq_length, args.seed, + (not args.mmap_warmup), + train_valid_test=s) + eval(f"{s}_ds").append(d) + else: + raise NotImplementedError("No dataloading argument passed") + + print_rank_0("> finished creating UL2 datasets ...") + + return train_ds, valid_ds, test_ds + + +if __name__ == "__main__": + + pretrain(train_valid_test_datasets_provider, model_provider, forward_step, + args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}) diff --git a/tasks/eval_harness/download.py b/tasks/eval_harness/download.py index d2abcd83a..be5d5c303 100644 --- a/tasks/eval_harness/download.py +++ b/tasks/eval_harness/download.py @@ -1,6 +1,7 @@ # Downloads the specified taks in the evaluation harness # This is particularly useful when running in environments where the GPU nodes # do not have internet access. This way we can pre-download them and use the cached data-set during evaluation. +# May want to set a cache before, e.g. 
export HF_DATASETS_CACHE=/scratch/project_462000119/ds_cache from lm_eval import tasks from lm_eval.tasks import ALL_TASKS diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 68dd649fd..3511cc601 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -42,6 +42,8 @@ def __init__(self, model, tokenizer): self.tokenizer = tokenizer self.VOCAB_SIZE = tokenizer.vocab_size self.EOT_TOKEN_ID = tokenizer.eod + self.add_denoiser = args.add_denoiser + self.DENOISER_TOKEN_ID = tokenizer.tokenize("[S]")[0] self._max_length = args.seq_length @@ -80,9 +82,15 @@ def loglikelihood(self, requests): for context, continuation in requests: if context == "": # end of text as context - context_enc = [self.EOT_TOKEN_ID] + if self.add_denoiser: + context_enc = [self.DENOISER_TOKEN_ID] + [self.EOT_TOKEN_ID] + else: + context_enc = [self.EOT_TOKEN_ID] else: - context_enc = self.tokenizer_encode(context) + if self.add_denoiser: + context_enc = [self.DENOISER_TOKEN_ID] + self.tokenizer_encode(context) + else: + context_enc = self.tokenizer_encode(context) continuation_enc = self.tokenizer_encode(continuation) @@ -260,7 +268,7 @@ def tokenizer_encode(self, text): from megatron.initialize import initialize_megatron import megatron -from tools.convert_checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint +from deepspeed.checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint from tools.convert_checkpoint.deepspeed_to_megatron import _create_rank_checkpoint def override_args(args, override_args, skip_keys, skip_if_specified_keys): @@ -390,6 +398,9 @@ def tasks_args(parser): group.add_argument('--intermed_results', default = False, action='store_true', help='Whether to print & write intermediate results for each task') group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr estimation') group.add_argument('--micro_bs_multiplier', type=int, default=1, help='Increase the global batch size to remove bubble when pipeline parallel') + group.add_argument('--fewshots', type=int, default=0, help='Num fewshots') + group.add_argument('--limit', type=int, default=None, help='Limit samples') + group.add_argument('--add_denoiser', default = False, action='store_true', help='Whether to add a denoiser to the model') return parser from megatron.global_vars import _parse_args @@ -398,6 +409,10 @@ def main(): # parse the megatron args. But wait with initalizing megatron. # avoid printing the arguments, since they will later be overridden. 
args = _parse_args(tasks_args) + if os.path.exists(args.results_path): + print("Exists ", args.results_path) + exit() + load_path = args.load model = load_ds_checkpoint_and_setup_megatron(args) @@ -422,11 +437,11 @@ def main(): global_results = {"results": {}, "versions": {}} timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') iteration_id = load_path.split("/")[-1].replace("/", "") - results_path = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}.json") + results_path = args.results_path#.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_{args.fewshots}shots.json") # Backup file in case of interruption during writing - results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_backup.json") + results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_{args.fewshots}shots_backup.json") for task_name, task in task_dict.items(): - results = evaluator.evaluate(adaptor, {task_name: task}, False, 0, None, bootstrap_iters=args.bootstrap_iters) + results = evaluator.evaluate(adaptor, {task_name: task}, False, args.fewshots, bootstrap_iters=args.bootstrap_iters, limit=args.limit) global_results["results"] = {**global_results["results"], **results["results"]} global_results["versions"] = {**global_results["versions"], **results["versions"]} if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: @@ -436,7 +451,7 @@ def main(): with open(results_path_backup, 'w') as outfile: json.dump(global_results, outfile, indent=4) else: - global_results = evaluator.evaluate(adaptor, task_dict, False, 0, None, bootstrap_iters=args.bootstrap_iters) + global_results = evaluator.evaluate(adaptor, task_dict, False, args.fewshots, bootstrap_iters=args.bootstrap_iters, limit=args.limit) if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: print(json.dumps(global_results, indent=2)) with open(args.results_path, 'w') as outfile: diff --git a/tools/convert_checkpoint/deepspeed_to_megatron.py b/tools/convert_checkpoint/deepspeed_to_megatron.py index 74e5ca7c9..08471d0bb 100755 --- a/tools/convert_checkpoint/deepspeed_to_megatron.py +++ b/tools/convert_checkpoint/deepspeed_to_megatron.py @@ -4,7 +4,7 @@ import os import torch from collections import OrderedDict -from .deepspeed_checkpoint import ARGS_KEY, DeepSpeedCheckpoint +from deepspeed.checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint MODEL_KEY = 'model' ARGS_KEY = 'args'
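# A minimal sketch of what the new --add_denoiser flag does in the eval-harness
# adaptor above: the "[S]" sentinel token is prepended to every context before
# scoring. The `tokenizer` argument here is a stand-in with `tokenize` and `eod`
# members; the real adaptor uses the Megatron tokenizer and its own
# tokenizer_encode helper.
def encode_context(tokenizer, context, add_denoiser=False):
    if context == "":
        # Empty context: continuations are scored against end-of-text alone.
        context_enc = [tokenizer.eod]
    else:
        context_enc = tokenizer.tokenize(context)
    if add_denoiser:
        # Prepend the denoiser sentinel, mirroring DENOISER_TOKEN_ID above.
        context_enc = [tokenizer.tokenize("[S]")[0]] + context_enc
    return context_enc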