diff --git a/.gitignore b/.gitignore index 6b6db06fb..267995984 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,13 @@ +## HIP-compiled kernels etc. +*hip* +# +local_examples/ +logs/ +trash/ +kb-runs-gpt/ +ds_configs/ +gpt2-tokenizer/ +smi-output/ # tests # megatron autogenerated indices tests/data/*/*npy diff --git a/examples/run_evalharness_deepspeed.md b/examples/run_evalharness_deepspeed.md index 695d9d0aa..60f380d9c 100644 --- a/examples/run_evalharness_deepspeed.md +++ b/examples/run_evalharness_deepspeed.md @@ -15,6 +15,7 @@ Get lm-eval harness (https://github.com/EleutherAI/lm-evaluation-harness) and `b start-prod pip install best-download==0.0.7 pip install git+https://github.com/EleutherAI/lm-evaluation-harness +pip install --upgrade scipy ``` 2. Pre-download needed datasets diff --git a/examples/run_evalharness_lumi.sh b/examples/run_evalharness_lumi.sh new file mode 100644 index 000000000..1721d91d2 --- /dev/null +++ b/examples/run_evalharness_lumi.sh @@ -0,0 +1,113 @@ +#!/bin/bash +#SBATCH --exclude=nid005159 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p eap +#SBATCH -t 2-0:00:00 +#SBATCH --gpus-per-node=mi250:1 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest_eval.out and logs/latest_eval.err +ln -f -s $SLURM_JOB_ID.out logs/latest_eval.out +ln -f -s $SLURM_JOB_ID.err logs/latest_eval.err + +# Data +CHECKPOINT_PATH=/scratch/project_462000119/muennighoff/nov-2022-optimization/checkpoints/global_step10 +VARIANT=global_step10 + +export HF_DATASETS_OFFLINE=1 +export HF_DATASETS_CACHE=/scratch/project_462000119/ds_cache + +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +PP_SIZE=1 +TP_SIZE=1 +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 +MICRO_BS_MULTIPLIER=1 + +# Model parameters +SEQ_LEN=2048 + +# Dummy arguments +MEGATRON_REQUIRED_ARGS=" \ + --num-layers -1 \ + --hidden-size -1 \ + --num-attention-heads -1 \ + --seq-length -1 \ + --max-position-embeddings -1 \ +" + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD="Megatron-DeepSpeed/tasks/eval_harness/evaluate.py \ + --load $CHECKPOINT_PATH \ + --results_path $VARIANT-results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --bf16 \ + --inference \ + --seq-length $SEQ_LEN \ + --task_list 
copa,piqa,rte,winogrande,hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier $MICRO_BS_MULTIPLIER \ + $MEGATRON_REQUIRED_ARGS \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" + diff --git a/finetune_t0_non_causal_decoder.py b/finetune_t0_non_causal_decoder.py index 14650a6e5..13a758a9a 100644 --- a/finetune_t0_non_causal_decoder.py +++ b/finetune_t0_non_causal_decoder.py @@ -33,6 +33,7 @@ def model_provider(pre_process=True, post_process=True): enabled=args.zero_stage == 3, mpu=mpu): if args.deepspeed: + args.pretrain_causal_attention = False model = GPTModelPipe( num_tokentypes=0, parallel_output=True, diff --git a/megatron/arguments.py b/megatron/arguments.py index c18235a78..e6a222002 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -24,7 +24,7 @@ import torch import deepspeed -from megatron.enums import PositionEmbeddingType +from megatron.enums import PositionEmbeddingType, UL2ModelType import megatron from megatron.logging import log_levels @@ -49,6 +49,7 @@ def parse_args(extra_args_provider=None, defaults={}, parser = _add_autoresume_args(parser) parser = _add_biencoder_args(parser) parser = _add_vit_args(parser) + parser = _add_ul2_args(parser) parser = _add_logging_args(parser) parser = _add_zero_args(parser) parser = _add_memoryopt_args(parser) @@ -309,6 +310,17 @@ def parse_args(extra_args_provider=None, defaults={}, "skip train iterations should be specified as two numbers, i.e. 
start-end"
            )
     args.skip_train_iteration_range = skip_train_iteration_range
+
+    args.ul2_model_type = UL2ModelType(args.ul2_model_type)
+    if (
+        args.ul2_model_type is not UL2ModelType.ENCODER_DECODER
+        and args.decoder_seq_length is not None
+    ):
+        print(
+            f'WARNING: `--decoder_seq_length` is ignored when '
+            f'`--ul2-model-type` is not '
+            f'"{UL2ModelType.ENCODER_DECODER.value}"!'
+        )
 
     if args.use_bnb_optimizer:
         try:
@@ -549,6 +561,12 @@ def _add_training_args(parser):
     group.add_argument('--no-bias-dropout-fusion', action='store_false',
                        help='Disable bias and dropout fusion.',
                        dest='bias_dropout_fusion')
+    group.add_argument('--no-layer-norm-fusion', action='store_false',
+                       help='Disable fused layer norm.',
+                       dest='layer_norm_fusion')
+    group.add_argument('--no-optimizer-fusion', action='store_false',
+                       help='Disable FusedAdam/FusedSGD fusion.',
+                       dest='optimizer_fusion')
     group.add_argument('--optimizer', type=str, default='adam',
                        choices=['adam', 'sgd'],
                        help='Optimizer function')
@@ -604,7 +622,7 @@ def _add_learning_rate_args(parser):
                        'and initial warmup, the learing rate at each '
                        'iteration would be different.')
     group.add_argument('--lr-decay-style', type=str, default='linear',
-                       choices=['constant', 'linear', 'cosine'],
+                       choices=['constant', 'linear', 'cosine', 'inverse_sqrt'],
                        help='Learning rate decay function.')
     group.add_argument('--lr-decay-iters', type=int, default=None,
                        help='number of iterations to decay learning rate over,'
@@ -615,6 +633,9 @@
     group.add_argument('--lr-decay-tokens', type=int, default=None,
                        help='number of tokens to decay learning rate over,'
                        ' If not None will override iter/sample-based decay')
+    group.add_argument('--lr-warmup-style', type=str, default='linear',
+                       choices=['constant', 'linear'], help='Learning rate '
+                       'warmup function.')
     group.add_argument('--lr-warmup-fraction', type=float, default=None,
                        help='fraction of lr-warmup-(iters/samples) to use '
                        'for warmup (as a float)')
@@ -643,7 +664,8 @@
                        'from checkpoint and ignore input arguments.')
     group.add_argument('--universal-checkpoint', action='store_true',
                        help='Loading a universal format checkpoint.')
-
+    group.add_argument('--reset-progress', action='store_true', default=None,
+                       help='Reset iteration to 0 & do not load args.')
     return parser
 
 
@@ -1023,6 +1045,42 @@ def _add_vit_args(parser):
     return parser
 
 
+def _add_ul2_args(parser):
+    group = parser.add_argument_group(title="UL2")
+
+    group.add_argument('--ul2-model-type', type=str, default='ED',
+                       choices=['ED', 'ND', 'CD'],
+                       help='What type of model to use for UL2 pretraining. '
+                       'ED = encoder-decoder; ND = non-causal decoder-only; '
+                       'CD = causal decoder-only')
+    group.add_argument('--ul2-denoiser-ratios', nargs='+', type=float,
+                       default=None,
+                       help='Probability of each denoising objective to be '
+                       'selected. Uniform distribution by default.')
+    group.add_argument('--ul2-denoisers', nargs='+', type=str,
+                       default=['R', 'R', 'S', 'X', 'X', 'X', 'X'],
+                       choices=['R', 'S', 'X'],
+                       help='What type of UL2 denoising objective the other '
+                       'UL2 configurations refer to.')
+    group.add_argument('--ul2-mean-span-lengths', nargs='+', type=float,
+                       default=[3, 8, 0.25, 3, 8, 64, 64],
+                       help='Mean length for sampling span lengths. 
' + 'Numbers < 1 indicate a mean length of the sequence ' + 'length times that number.') + group.add_argument('--ul2-mask-ratios', nargs='+', type=float, + default=[0.15, 0.15, 0.25, 0.5, 0.5, 0.15, 0.5], + help='Ratio of masked token in the full sequence.') + group.add_argument('--ul2-r-denoiser-token', type=str, default='[R]', + help='What token to prepend for the UL2 R-denoising ' + 'objective.') + group.add_argument('--ul2-s-denoiser-token', type=str, default='[S]', + help='What token to prepend for the UL2 S-denoising ' + 'objective.') + group.add_argument('--ul2-x-denoiser-token', type=str, default='[X]', + help='What token to prepend for the UL2 X-denoising ' + 'objective.') + + return parser def _add_zero_args(parser): """Text generate arguments.""" diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index dacbec7dc..ebf93a986 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -342,7 +342,7 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True set_checkpoint_version(state_dict.get('checkpoint_version', 0)) # Set iteration. - if args.finetune or release: + if args.finetune or release or args.reset_progress: iteration = 0 else: try: @@ -361,7 +361,7 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True # Check arguments. assert args.consumed_train_samples == 0 assert args.consumed_valid_samples == 0 - if 'args' in state_dict: + if 'args' in state_dict and not args.reset_progress: checkpoint_args = state_dict['args'] if not args.universal_checkpoint: check_checkpoint_args(checkpoint_args) @@ -480,4 +480,4 @@ def _checkpoint_info(): return { "padded_vocab_size": args.padded_vocab_size, "original_vocab_size": tokenizer.vocab_size, - } \ No newline at end of file + } diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 3841e263e..a3ce376b5 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -18,6 +18,8 @@ # https://github.com/google-research/albert/blob/master/create_pretraining_data.py # with some modifications. +import bisect +from enum import Enum import math import os import time @@ -37,8 +39,16 @@ DSET_TYPE_BERT = 'standard_bert' DSET_TYPE_ICT = 'ict' DSET_TYPE_T5 = 't5' +DSET_TYPE_UL2 = 'ul2' -DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, DSET_TYPE_T5] +DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, DSET_TYPE_T5, DSET_TYPE_UL2] + + +class SamplingStyle(Enum): + POISSON = 'poisson' + GEOMETRIC = 'geometric' + UNIFORM = 'uniform' + NORMAL = 'normal' def analyze_data_prefix(data_prefix): @@ -182,6 +192,36 @@ def is_start_piece(piece): # append it to the previous set of word indexes. return not piece.startswith("##") +def get_ngram_indices( + idx, + ngrams, + cand_indexes, + num_to_predict, + num_filtered_tokens, + prefix_lm, +): + if prefix_lm: + # Find first index which is greater than the number of + # predictions. + first_gt_index = bisect.bisect_right( + cand_indexes, + [num_filtered_tokens - num_to_predict], + ) + # Then move one index before to get less than or equal to the + # number of predictions, handling not going below 0. 
+ first_le_index = max(1, first_gt_index) - 1 + + tail_cand_indexes = cand_indexes[first_le_index:] + ngram_index = [ + tail_cand_indexes[i:] + for i in range(len(tail_cand_indexes)) + ] + else: + ngram_index = [] + for n in ngrams: + ngram_index.append(cand_indexes[idx:idx + n]) + return ngram_index + def create_masked_lm_predictions(tokens, vocab_id_list, vocab_id_to_token_dict, @@ -194,15 +234,23 @@ def create_masked_lm_predictions(tokens, favor_longer_ngram=False, do_permutation=False, geometric_dist=False, - masking_style="bert"): + masking_style="bert", + sampling_style=SamplingStyle.POISSON, + prefix_lm=False): """Creates the predictions for the masked LM objective. Note: Tokens here are vocab ids and not text tokens.""" + if not isinstance(sampling_style, SamplingStyle): + sampling_style = SamplingStyle(sampling_style) + # Backward-compatibility + if geometric_dist: + sampling_style = SamplingStyle.GEOMETRIC cand_indexes = [] # Note(mingdachen): We create a list for recording if the piece is # the starting piece of current token, where 1 means true, so that # on-the-fly whole word masking is possible. token_boundary = [0] * len(tokens) + num_filtered_tokens = 0 for (i, token) in enumerate(tokens): if token == cls_id or token == sep_id: @@ -221,6 +269,7 @@ def create_masked_lm_predictions(tokens, cand_indexes.append([i]) if is_start_piece(vocab_id_to_token_dict[token]): token_boundary[i] = 1 + num_filtered_tokens += 1 output_tokens = list(tokens) @@ -231,11 +280,18 @@ def create_masked_lm_predictions(tokens, return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary) - num_to_predict = min(max_predictions_per_seq, - max(1, int(round(len(tokens) * masked_lm_prob)))) + if sampling_style is SamplingStyle.NORMAL: + # First, we get the center of our normal distribution from + # `max_ngrams`. Keeping the meaning of `max_ngrams` this way + # plays nicely with the other probability distributions in terms + # of math. + normal_mean = (max_ngrams + 1) / 2 + # However, we do not want to bound the maximum number of + # n-grams. + max_ngrams = num_filtered_tokens - 1 ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64) - if not geometric_dist: + if sampling_style is SamplingStyle.POISSON: # Note(mingdachen): # By default, we set the probilities to favor shorter ngram sequences. pvals = 1. / np.arange(1, max_ngrams + 1) @@ -243,14 +299,30 @@ def create_masked_lm_predictions(tokens, if favor_longer_ngram: pvals = pvals[::-1] - ngram_indexes = [] - for idx in range(len(cand_indexes)): - ngram_index = [] - for n in ngrams: - ngram_index.append(cand_indexes[idx:idx + n]) - ngram_indexes.append(ngram_index) + if prefix_lm: + # We only do one span searching loop anyway, so this does not + # matter in terms of random search. However, we do want to allow + # sequences greater than the mean ratio. 
+ num_to_predict = max_predictions_per_seq - np_rng.shuffle(ngram_indexes) + ngram_index_indexes = np.array([0]) + else: + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) + + ngram_index_indexes = np.arange(len(cand_indexes)) + np_rng.shuffle(ngram_index_indexes) + + def get_ngram_indices_(idx): + return get_ngram_indices( + idx, + ngrams, + cand_indexes, + num_to_predict, + num_filtered_tokens, + prefix_lm, + ) + ngram_indexes = map(get_ngram_indices_, ngram_index_indexes) (masked_lms, masked_spans) = ([], []) covered_indexes = set() @@ -266,15 +338,25 @@ def create_masked_lm_predictions(tokens, if index in covered_indexes: continue - if not geometric_dist: + if sampling_style is SamplingStyle.POISSON: n = np_rng.choice(ngrams[:len(cand_index_set)], p=pvals[:len(cand_index_set)] / pvals[:len(cand_index_set)].sum(keepdims=True)) - else: + elif sampling_style is SamplingStyle.GEOMETRIC: # Sampling "n" from the geometric distribution and clipping it to # the max_ngrams. Using p=0.2 default from the SpanBERT paper # https://arxiv.org/pdf/1907.10529.pdf (Sec 3.1) n = min(np_rng.geometric(0.2), max_ngrams) + elif sampling_style is SamplingStyle.UNIFORM: + n = np_rng.choice(ngrams[:len(cand_index_set)]) + elif sampling_style is SamplingStyle.NORMAL: + n = round(np.clip( + np_rng.normal(loc=normal_mean), + 1, + len(cand_index_set), + )) + else: + raise ValueError('unknown sampling style') index_set = sum(cand_index_set[n - 1], []) n -= 1 @@ -324,7 +406,8 @@ def create_masked_lm_predictions(tokens, label=[tokens[index] for index in index_set])) assert len(masked_lms) <= num_to_predict - np_rng.shuffle(ngram_indexes) + np_rng.shuffle(ngram_index_indexes) + ngram_indexes = map(get_ngram_indices_, ngram_index_indexes) select_indexes = set() if do_permutation: @@ -522,6 +605,7 @@ def build_dataset(index, name): from megatron.data.bert_dataset import BertDataset from megatron.data.ict_dataset import ICTDataset from megatron.data.t5_dataset import T5Dataset + from megatron.data.ul2_dataset import UL2Dataset dataset = None if splits[index + 1] > splits[index]: # Get the pointer to the original doc-idx so we can set it later. 
@@ -560,6 +644,24 @@ def build_dataset(index, name): short_seq_prob=short_seq_prob, **kwargs ) + elif dataset_type == DSET_TYPE_UL2: + args = get_args() + dataset = UL2Dataset( + indexed_dataset=indexed_dataset, + model_type=args.ul2_model_type, + denoiser_ratios=args.ul2_denoiser_ratios, + denoisers=args.ul2_denoisers, + mean_span_lengths=args.ul2_mean_span_lengths, + mask_ratios=args.ul2_mask_ratios, + denoiser_tokens={ + 'R': args.ul2_r_denoiser_token, + 'S': args.ul2_s_denoiser_token, + 'X': args.ul2_x_denoiser_token, + }, + max_seq_length_dec=max_seq_length_dec, + short_seq_prob=short_seq_prob, + **kwargs, + ) elif dataset_type == DSET_TYPE_BERT: dataset = BertDataset( indexed_dataset=indexed_dataset, diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 0db1aa2fe..534d3bc3c 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -21,12 +21,12 @@ import numpy as np import torch -from megatron import mpu, print_rank_0 +from megatron import get_args, mpu, print_rank_0 from megatron.data.blendable_dataset import BlendableDataset from megatron.data.dataset_utils import get_datasets_weights_and_num_samples from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_ from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset - +from megatron.data.ul2_dataset import UL2Dataset def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, train_valid_test_num_samples, @@ -154,10 +154,35 @@ def build_dataset(name): if splits[1] > splits[0]: documents = np.arange(start=splits[0], stop=splits[1], step=1, dtype=np.int32) - dataset = GPTDataset(name, data_prefix, - documents, indexed_dataset, - train_valid_test_num_samples[index], - seq_length, seed) + + args = get_args() + if args.ul2_model_type: + dataset = UL2Dataset( + name=name, + data_prefix=data_prefix, + num_epochs=None, + max_num_samples=train_valid_test_num_samples[index], + max_seq_length=seq_length, + seed=seed, + indexed_dataset=indexed_dataset, + model_type=args.ul2_model_type, + denoiser_ratios=args.ul2_denoiser_ratios, + denoisers=args.ul2_denoisers, + mean_span_lengths=args.ul2_mean_span_lengths, + mask_ratios=args.ul2_mask_ratios, + denoiser_tokens={ + 'R': args.ul2_r_denoiser_token, + 'S': args.ul2_s_denoiser_token, + 'X': args.ul2_x_denoiser_token, + }, + max_seq_length_dec=seq_length, + short_seq_prob=args.short_seq_prob, + ) + else: + dataset = GPTDataset(name, data_prefix, + documents, indexed_dataset, + train_valid_test_num_samples[index], + seq_length, seed) return dataset dataset = build_dataset(dataset_group_name) diff --git a/megatron/data/indexed_dataset.py b/megatron/data/indexed_dataset.py index d0d312544..403e4b5d2 100644 --- a/megatron/data/indexed_dataset.py +++ b/megatron/data/indexed_dataset.py @@ -513,7 +513,7 @@ def __getstate__(self): return self._path def __setstate__(self, state): - self._do_init(state) + self._do_init(state, True) def _do_init(self, path, skip_warmup): self._path = path diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index 42110b923..f3ac34bf0 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -26,6 +26,27 @@ get_samples_mapping ) + +class LengthExceededError(ValueError): + def __init__(self, msg=None): + if msg is None: + msg = ( + 'The sequence input became too long. ' + 'Try to increase `--seq-length` or `--encoder-seq-length`.' 
+ ) + super().__init__(msg) + + +class DecoderLengthExceededError(ValueError): + def __init__(self, msg=None): + if msg is None: + msg = ( + 'The sequence input for the decoder became too long. ' + 'Try to increase `--decoder-seq-length`.' + ) + super().__init__(msg) + + class T5Dataset(torch.utils.data.Dataset): def __init__(self, name, indexed_dataset, data_prefix, @@ -104,6 +125,8 @@ def build_training_sample(sample, target_seq_length, target_seq_length: Desired sequence length. max_seq_length: Maximum length of the sequence. All values are padded to this length. + max_seq_length_dec: Maximum length of the decoder input sequence. All + values are padded to this length. vocab_id_list: List of vocabulary ids. Used to pick a random id. vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. cls_id: Start of example id. @@ -157,29 +180,30 @@ def build_training_sample(sample, target_seq_length, return train_sample -def pad_and_convert_to_numpy(tokens, masked_positions, - masked_labels, pad_id, - max_seq_length, max_seq_length_dec, - masked_spans=None, bos_id=None, - eos_id=None, sentinel_tokens=None): - """Pad sequences and convert them to numpy.""" - - sentinel_tokens = collections.deque(sentinel_tokens) +def merge_subsequent_masks(tokens, masked_spans=None, bos_id=None, + eos_id=None, sentinel_tokens=None, prefix_lm=False): + if prefix_lm: + assert len(masked_spans) <= 1, \ + 'Received more than one masked span for PrefixLM masking' + else: + sentinel_tokens = collections.deque(sentinel_tokens) t5_input = [] (t5_decoder_in, t5_decoder_out) = ([bos_id], []) (start_index, end_index) = (0, None) for span in masked_spans: - flag = sentinel_tokens.popleft() + if not prefix_lm: + flag = sentinel_tokens.popleft() + # Append the same tokens in decoder input and output + t5_decoder_in.append(flag) + t5_decoder_out.append(flag) - # Append the same tokens in decoder input and output - t5_decoder_in.append(flag) t5_decoder_in.extend(span.label) - t5_decoder_out.append(flag) t5_decoder_out.extend(span.label) end_index = span.index[0] t5_input.extend(tokens[start_index: end_index]) - t5_input.append(flag) + if not prefix_lm: + t5_input.append(flag) # the next start index is the token after the last span token start_index = span.index[-1] + 1 @@ -189,6 +213,18 @@ def pad_and_convert_to_numpy(tokens, masked_positions, # Add the remaining tokens to the t5 input t5_input.extend(tokens[start_index:]) + return t5_input, t5_decoder_in, t5_decoder_out + + +def pad_and_convert_to_numpy(tokens, masked_positions, + masked_labels, pad_id, + max_seq_length, max_seq_length_dec, + masked_spans=None, bos_id=None, + eos_id=None, sentinel_tokens=None, prefix_lm=False): + """Pad sequences and convert them to numpy.""" + + t5_input, t5_decoder_in, t5_decoder_out = merge_subsequent_masks( + tokens, masked_spans, bos_id, eos_id, sentinel_tokens, prefix_lm) # assert (len(t5_input) - len(masked_spans)) + \ # (len(t5_decoder_in) - (len(masked_spans) + 1)) == len(tokens) @@ -198,7 +234,8 @@ def pad_and_convert_to_numpy(tokens, masked_positions, # Encoder-side padding mask. num_tokens = len(t5_input) padding_length = max_seq_length - num_tokens - assert padding_length >= 0 + if padding_length < 0: + raise LengthExceededError() assert len(masked_positions) == len(masked_labels) # Tokens.. @@ -208,7 +245,8 @@ def pad_and_convert_to_numpy(tokens, masked_positions, # Decoder-side padding mask. 
num_tokens_dec = len(t5_decoder_in) padding_length_dec = max_seq_length_dec - num_tokens_dec - assert padding_length_dec >= 0 + if padding_length_dec < 0: + raise DecoderLengthExceededError() filler_dec = [pad_id] * padding_length_dec tokens_dec_in = np.array(t5_decoder_in + filler_dec, dtype=np.int64) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py new file mode 100644 index 000000000..4da73d9e7 --- /dev/null +++ b/megatron/data/ul2_dataset.py @@ -0,0 +1,326 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""UL2-style dataset.""" + +import math +import numpy as np + +from megatron import get_tokenizer +from megatron.data.dataset_utils import ( + create_masked_lm_predictions, + get_samples_mapping, + SamplingStyle +) +from megatron.data.t5_dataset import ( + LengthExceededError, + make_history_mask, + merge_subsequent_masks, + pad_and_convert_to_numpy, + T5Dataset, +) +from megatron.enums import UL2ModelType + + +def is_decoder_only(ul2_model_type): + """Return whether we use a decoder-only model.""" + assert isinstance(ul2_model_type, UL2ModelType) + return ul2_model_type is not UL2ModelType.ENCODER_DECODER + + +def is_prefix_lm(ul2_model_type): + """Return whether we use a non-causal decoder-only model.""" + assert isinstance(ul2_model_type, UL2ModelType) + return ul2_model_type is UL2ModelType.NON_CAUSAL_DECODER + + +class UL2Dataset(T5Dataset): + + def __init__(self, name, indexed_dataset, data_prefix, + num_epochs, max_num_samples, model_type, + denoiser_ratios, denoisers, mean_span_lengths, + mask_ratios, denoiser_tokens, max_seq_length, + max_seq_length_dec, short_seq_prob, seed): + + if denoiser_ratios is None: + # Uniform distribution by default. + denoiser_ratios = [1 / len(denoisers)] * len(denoisers) + + assert ( + len(denoiser_ratios) == len(denoisers) + == len(mean_span_lengths) == len(mask_ratios) + ), ( + 'some UL2 configurations do not correspond to the amount of ' + 'denoising objectives' + ) + + # Params to store. + self.name = name + self.seed = seed + self.masked_lm_prob = short_seq_prob + self.max_seq_length = max_seq_length + self.max_seq_length_dec = max_seq_length_dec + + # Dataset. + self.indexed_dataset = indexed_dataset + + # Build the samples mapping. + self.samples_mapping = get_samples_mapping(self.indexed_dataset, + data_prefix, + num_epochs, + max_num_samples, + self.max_seq_length - 2, # account for added tokens + short_seq_prob, + self.seed, + self.name, + False) + + # Vocab stuff. 
+ tokenizer = get_tokenizer() + self.vocab_id_list = list(tokenizer.inv_vocab.keys()) + self.vocab_id_to_token_dict = tokenizer.inv_vocab + # self.cls_id = tokenizer.cls + self.sep_id = tokenizer.sep + self.mask_id = tokenizer.mask + self.pad_id = tokenizer.pad + self.bos_id = tokenizer.bos_token_id + self.eos_id = tokenizer.eos_token_id + self.sentinel_tokens = tokenizer.additional_special_tokens_ids + assert len(self.sentinel_tokens) > 0, "Provide the argument --vocab-extra-ids 100 to the script" + + # Params to store. + self.model_type = model_type + self.denoiser_ratios = [ + denoiser_ratio / sum(denoiser_ratios) + for denoiser_ratio in denoiser_ratios + ] + self.denoisers = [denoiser.upper() for denoiser in denoisers] + self.mean_span_lengths = mean_span_lengths + self.mask_ratios = mask_ratios + + # Vocab stuff. + tokenizer = get_tokenizer() + # Remove CLS token because we don't need it. + # del self.cls_id + self.cls_ids = { + denoiser: tokenizer.vocab[token] + for (denoiser, token) in denoiser_tokens.items() + } + # cls_token = self.vocab_id_to_token_dict[tokenizer.cls] + # if cls_token not in self.cls_ids: + # self.cls_ids[cls_token] = tokenizer.cls + + # Filter out denoiser tokens. + self.sentinel_tokens = [ + token + for token in tokenizer.additional_special_tokens_ids + if token not in self.cls_ids.values() + ] + assert len(self.sentinel_tokens) > 0, \ + "Provide the argument --vocab-extra-ids 100 to the script" + + def __getitem__(self, idx): + + start_index, end_index, seq_length = self.samples_mapping[idx] + sample = [] + for index in range(start_index, end_index): + sample.append(self.indexed_dataset[index]) + # Note that this rng state should be numpy and not python since + # python randint is inclusive whereas the numpy one is exclusive. + np_rng = np.random.RandomState(seed=(self.seed + idx)) + return build_training_sample(sample, seq_length, + self.max_seq_length, # needed for padding + self.max_seq_length_dec, + self.vocab_id_list, + self.vocab_id_to_token_dict, + self.cls_ids, self.sep_id, + self.mask_id, self.pad_id, + self.model_type, self.denoiser_ratios, + self.denoisers, self.mean_span_lengths, + self.mask_ratios, np_rng, self.bos_id, + self.eos_id, self.sentinel_tokens) + + +def build_training_sample(sample, target_seq_length, + max_seq_length, max_seq_length_dec, + vocab_id_list, vocab_id_to_token_dict, + cls_ids, sep_id, mask_id, pad_id, + model_type, denoiser_ratios, + denoisers, mean_span_lengths, + mask_ratios, np_rng, + bos_id=None, eos_id=None, + sentinel_tokens=None): + """Build training sample. + Arguments: + sample: A list of sentences in which each sentence is a list token ids. + target_seq_length: Desired sequence length. + max_seq_length: Maximum length of the sequence. All values are padded to + this length. + max_seq_length_dec: Maximum length of the decoder input sequence. All + values are padded to this length. + vocab_id_list: List of vocabulary ids. Used to pick a random id. + vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. + cls_ids: Start of example ids. + sep_id: Separator id. + mask_id: Mask token id. + pad_id: Padding token id. + model_type: What type of model is used. + denoiser_ratios: Probability of each denoising objective to be selected. + denoisers: What type of UL2 denoising objective the other UL2 + configurations refer to. + mean_span_lengths: Mean length for sampling span lengths. Numbers < 1 + indicate a mean length of the sequence length times that number. 
+        mask_ratios: Ratio of masked tokens in the full sequence.
+        np_rng: Random number generator. Note that this rng state should be
+            numpy and not python since python randint is inclusive for
+            the upper bound whereas the numpy one is exclusive.
+        bos_id: start of decoder example id
+        eos_id: end of generation id
+        sentinel_tokens: unique value to be substituted for every replaced span
+    """
+
+    # Denoiser selection
+    denoiser_index = np_rng.choice(np.arange(len(denoisers)), p=denoiser_ratios)
+    denoiser = denoisers[denoiser_index]
+    masked_lm_prob = mask_ratios[denoiser_index]
+
+    assert target_seq_length <= max_seq_length
+
+    # flatten sentences into one list
+    tokens = [token for sentence in sample for token in sentence]
+
+    max_num_tokens = target_seq_length
+    if is_decoder_only(model_type):
+        # # Keep space for repeated `extra_id` tokens; not the most data
+        # # efficient since we calculate this based on the maximum number
+        # # of possible `extra_id` tokens.
+        safe_max_seq_len = math.floor(max_num_tokens / (1 + masked_lm_prob))
+        truncated = len(tokens) > safe_max_seq_len
+        tokens = tokens[:safe_max_seq_len]
+    else:
+        # Truncate to `target_sequence_length`.
+        truncated = len(tokens) > max_num_tokens
+        tokens = tokens[:max_num_tokens]
+
+    # Prepend objective token.
+    cls_id = cls_ids.get(denoiser)
+    if cls_id is None:
+        raise ValueError('unknown denoiser')
+    tokens = [cls_id] + tokens
+
+    # Masking.
+    mean_ngrams = mean_span_lengths[denoiser_index]
+    if mean_ngrams < 1:
+        # Ensure we always obtain at least one `max_ngrams`.
+        mean_ngrams = max(1, round(len(tokens) * mean_ngrams))
+    max_ngrams = mean_ngrams * 2 - 1
+
+    if denoiser == 'R' or denoiser == 'X':
+        sampling_style = SamplingStyle.NORMAL
+        prefix_lm = False
+        max_predictions_per_seq = len(tokens) - 1
+    elif denoiser == 'S':
+        sampling_style = SamplingStyle.UNIFORM
+        prefix_lm = True
+        max_predictions_per_seq = min(
+            round(masked_lm_prob * len(tokens)) * 2 - 1,
+            len(tokens) - 1,
+        )
+    else:
+        raise ValueError('unknown denoiser')
+    # Ensure we always have at least one prediction.
+    max_predictions_per_seq = max(1, max_predictions_per_seq)
+    (
+        tokens, masked_positions, masked_labels, _, masked_spans,
+    ) = create_masked_lm_predictions(
+        tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob,
+        cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng,
+        max_ngrams=max_ngrams, masking_style="t5",
+        sampling_style=sampling_style, prefix_lm=prefix_lm,
+    )
+
+    if is_decoder_only(model_type):
+        # Concatenate to one sequence.
+        tokens_enc, tokens_dec_in, labels = merge_subsequent_masks(
+            tokens, masked_spans, bos_id, eos_id, sentinel_tokens, prefix_lm)
+
+        # Move EOS tokens to end of sequence.
+        while tokens_enc[-1] == eos_id:
+            del tokens_enc[-1]
+            tokens_dec_in.append(eos_id)
+            labels.append(eos_id)
+
+        num_labels = len(labels)
+
+        # Move BOS token to start of sequence.
+        tokens_dec_in = tokens_dec_in[1:]
+        tokens = (
+            [bos_id]
+            + tokens_enc
+            + [sep_id]
+            + tokens_dec_in
+        )
+
+        # Pad and convert to NumPy. 
+ padding_length = max_seq_length - len(tokens) + if padding_length < 0: + raise LengthExceededError() + filler = [pad_id] * padding_length + + tokens = np.array(tokens + filler, dtype=np.int64) + labels = np.array(( + tokens_enc + + [sep_id] + + labels + + filler + ), dtype=np.int64) + + loss_mask = np.zeros(len(tokens), dtype=np.int64) + labels_start_neg_index = -(num_labels + padding_length) + labels_end_neg_index = -padding_length if padding_length > 0 else None + loss_mask[labels_start_neg_index:labels_end_neg_index] = 1 + + dec_mask = make_history_mask(tokens) + if is_prefix_lm(model_type): + dec_mask[:labels_start_neg_index, :labels_start_neg_index] = 1 + + train_sample = { + 'text': tokens, + 'labels': labels, + 'loss_mask': loss_mask, + 'truncated': int(truncated), + 'dec_mask': dec_mask, + } + else: + # Padding. + tokens_enc, tokens_dec_in, labels, enc_mask, \ + dec_mask, enc_dec_mask, loss_mask \ + = pad_and_convert_to_numpy(tokens, masked_positions, + masked_labels, pad_id, max_seq_length, + max_seq_length_dec, masked_spans, + bos_id, eos_id, sentinel_tokens, + prefix_lm) + + train_sample = { + 'text_enc': tokens_enc, + 'text_dec': tokens_dec_in, + 'labels': labels, + 'loss_mask': loss_mask, + 'truncated': int(truncated), + 'enc_mask': enc_mask, + 'dec_mask': dec_mask, + 'enc_dec_mask': enc_dec_mask, + } + return train_sample diff --git a/megatron/enums.py b/megatron/enums.py index 90d00a071..2961cbb66 100644 --- a/megatron/enums.py +++ b/megatron/enums.py @@ -33,3 +33,8 @@ class PositionEmbeddingType(enum.Enum): rotary = 1 absolute = 2 alibi = 3 + +class UL2ModelType(enum.Enum): + ENCODER_DECODER = 'ED' + NON_CAUSAL_DECODER = 'ND' + CAUSAL_DECODER = 'CD' diff --git a/megatron/fused_kernels/__init__.py b/megatron/fused_kernels/__init__.py index e2ac2567b..bdc654c39 100644 --- a/megatron/fused_kernels/__init__.py +++ b/megatron/fused_kernels/__init__.py @@ -17,81 +17,97 @@ import pathlib import subprocess +import torch from torch.utils import cpp_extension +# Setting this param to a list has a problem of generating different +# compilation commands (with diferent order of architectures) and +# leading to recompilation of fused kernels. Set it to empty string +# to avoid recompilation and assign arch flags explicity in +# extra_cuda_cflags below +os.environ["TORCH_CUDA_ARCH_LIST"] = "" + def load(args): - # Setting this param to a list has a problem of generating different - # compilation commands (with diferent order of architectures) and - # leading to recompilation of fused kernels. 
Set it to empty string - # to avoid recompilation and assign arch flags explicity in - # extra_cuda_cflags below - # - # but if a user wants to set an explicit list of archs to compile to, then let that list - # through: - arch_list = os.environ.get('TORCH_CUDA_ARCH_LIST', None) - if arch_list is None: - os.environ["TORCH_CUDA_ARCH_LIST"] = "" - - # # Check if cuda 11 is installed for compute capability 8.0 - # cc_flag = [] - # _, bare_metal_major, _ = _get_cuda_bare_metal_version( - # cpp_extension.CUDA_HOME) - # if int(bare_metal_major) >= 11: - # cc_flag.append('-gencode') - # cc_flag.append('arch=compute_80,code=sm_80') + # Check if cuda 11 is installed for compute capability 8.0 + cc_flag = [] + if torch.version.hip is None: + _, bare_metal_major, _ = _get_cuda_bare_metal_version( + cpp_extension.CUDA_HOME) + if int(bare_metal_major) >= 11: + cc_flag.append('-gencode') + cc_flag.append('arch=compute_80,code=sm_80') # Build path srcpath = pathlib.Path(__file__).parent.absolute() buildpath = srcpath / 'build' - buildpath.mkdir(parents=True, exist_ok=True) + _create_build_dir(buildpath) # Helper function to build the kernels. - def _cpp_extention_load_helper(name, sources, extra_cuda_flags): + def _cpp_extention_load_helper(name, sources, extra_cuda_flags, extra_include_paths): + if torch.version.hip is not None: + extra_cuda_cflags=['-O3'] + extra_cuda_flags + cc_flag + else: + extra_cuda_cflags=['-O3', + '-gencode', 'arch=compute_70,code=sm_70', + '--use_fast_math'] + extra_cuda_flags + cc_flag + return cpp_extension.load( name=name, sources=sources, build_directory=buildpath, extra_cflags=['-O3',], - extra_cuda_cflags=['-O3', - '--use_fast_math'] + extra_cuda_flags, + extra_cuda_cflags=extra_cuda_cflags, + extra_include_paths=extra_include_paths, verbose=(args.rank == 0) ) - # '-gencode', 'arch=compute_70,code=sm_70', # ============== # Fused softmax. # ============== + if torch.version.hip is not None: + extra_include_paths=[os.path.abspath(srcpath)] + else: + extra_include_paths=[] + if args.masked_softmax_fusion: - extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '--expt-relaxed-constexpr', - '--expt-extended-lambda'] + if torch.version.hip is not None: + extra_cuda_flags = ['-D__HIP_NO_HALF_OPERATORS__=1', + '-D__HIP_NO_HALF_CONVERSIONS__=1'] + else: + extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__', + '--expt-relaxed-constexpr', + '--expt-extended-lambda'] # Upper triangular softmax. sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp', srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu'] scaled_upper_triang_masked_softmax_cuda = _cpp_extention_load_helper( "scaled_upper_triang_masked_softmax_cuda", - sources, extra_cuda_flags) + sources, extra_cuda_flags, extra_include_paths) # Masked softmax. sources=[srcpath / 'scaled_masked_softmax.cpp', srcpath / 'scaled_masked_softmax_cuda.cu'] scaled_masked_softmax_cuda = _cpp_extention_load_helper( - "scaled_masked_softmax_cuda", sources, extra_cuda_flags) + "scaled_masked_softmax_cuda", sources, extra_cuda_flags, extra_include_paths) # ================================= # Mixed precision fused layer norm. 
# ================================= - extra_cuda_flags = ['-maxrregcount=50'] + if torch.version.hip is not None: + extra_cuda_flags = [] + else: + extra_cuda_flags = ['-maxrregcount=50'] + sources=[srcpath / 'layer_norm_cuda.cpp', srcpath / 'layer_norm_cuda_kernel.cu'] fused_mix_prec_layer_norm_cuda = _cpp_extention_load_helper( - "fused_mix_prec_layer_norm_cuda", sources, extra_cuda_flags) + "fused_mix_prec_layer_norm_cuda", sources, extra_cuda_flags, extra_include_paths) def _get_cuda_bare_metal_version(cuda_dir): diff --git a/megatron/fused_kernels/layer_norm_cuda_kernel.cu b/megatron/fused_kernels/layer_norm_cuda_kernel.cu index 28a579e1a..aae0c993c 100644 --- a/megatron/fused_kernels/layer_norm_cuda_kernel.cu +++ b/megatron/fused_kernels/layer_norm_cuda_kernel.cu @@ -76,7 +76,8 @@ void cuWelfordMuSigma2( const int i1, U& mu, U& sigma2, - U* buf) + U* buf, + const int GPU_WARP_SIZE) { // Assumptions: // 1) blockDim.x == warpSize @@ -106,12 +107,11 @@ void cuWelfordMuSigma2( cuWelfordOnlineSum(curr,mu,sigma2,count); } // intra-warp reductions - for (int l = 0; l <= 4; ++l) { - int srcLaneB = (threadIdx.x+(1<(muB,sigma2B,countB,mu,sigma2,count); + for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) { + U sigma2B = WARP_SHFL_DOWN(sigma2, stride); + U muB = WARP_SHFL_DOWN(mu, stride); + U countB = WARP_SHFL_DOWN(count, stride); + cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); } // threadIdx.x == 0 has correct values for each warp // inter-warp reductions @@ -160,7 +160,8 @@ void cuWelfordMuSigma2( const int i1, float& mu, float& sigma2, - float* buf) + float* buf, + const int GPU_WARP_SIZE) { // Assumptions: // 1) blockDim.x == warpSize @@ -201,12 +202,11 @@ void cuWelfordMuSigma2( cuWelfordOnlineSum(curr,mu,sigma2,count); } // intra-warp reductions - for (int l = 0; l <= 4; ++l) { - int srcLaneB = (threadIdx.x+(1< 0; stride /= 2) { + float sigma2B = WARP_SHFL_DOWN(sigma2, stride); + float muB = WARP_SHFL_DOWN(mu, stride); + float countB = WARP_SHFL_DOWN(count, stride); + cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); } // threadIdx.x == 0 has correct values for each warp // inter-warp reductions @@ -246,14 +246,25 @@ void cuWelfordMuSigma2( } } } - +#ifndef __HIP_PLATFORM_HCC__ template U rsqrt(U v) { +#else +template __device__ U rsqrt(U v) { +#endif return U(1) / sqrt(v); } +#ifndef __HIP_PLATFORM_HCC__ template<> float rsqrt(float v) { +#else +template<> __device__ float rsqrt(float v) { +#endif return rsqrtf(v); } +#ifndef __HIP_PLATFORM_HCC__ template<> double rsqrt(double v) { +#else +template<> __device__ double rsqrt(double v) { +#endif return rsqrt(v); } @@ -297,18 +308,23 @@ void cuApplyLayerNorm( const int n2, const U epsilon, const V* __restrict__ gamma, - const V* __restrict__ beta + const V* __restrict__ beta, + const int GPU_WARP_SIZE ) { // Assumptions: // 1) blockDim.x == warpSize // 2) Tensors are contiguous // +#ifndef __HIP_PLATFORM_HCC__ for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { +#else + for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { +#endif SharedMemory shared; U* buf = shared.getPointer(); U mu,sigma2; - cuWelfordMuSigma2(vals,n1,n2,i1,mu,sigma2,buf); + cuWelfordMuSigma2(vals,n1,n2,i1,mu,sigma2,buf,GPU_WARP_SIZE); const T* lvals = vals + i1*n2; V* ovals = output_vals + i1*n2; U c_invvar = rsqrt(sigma2 + epsilon); @@ -543,7 +559,11 @@ void cuComputeGradInput( const V* gamma, T* grad_input) { +#ifndef __HIP_PLATFORM_HCC__ for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { +#else + for (int i1=blockIdx.y; i1 < n1; i1 
+= gridDim.y) { +#endif U sum_loss1 = U(0); U sum_loss2 = U(0); const U c_mean = mean[i1]; @@ -667,7 +687,11 @@ void HostApplyLayerNorm( ) { auto stream = at::cuda::getCurrentCUDAStream().stream(); - const dim3 threads(32,4,1); + const int warp_size = at::cuda::warp_size(); + dim3 threads(warp_size,4,1); +#ifndef __HIP_PLATFORM_HCC__ + threads.y = 1; +#endif const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1); @@ -682,7 +706,9 @@ void HostApplyLayerNorm( input, n1,n2, U(epsilon), - gamma,beta); + gamma, + beta, + warp_size); } @@ -735,11 +761,16 @@ void HostLayerNormGradient( ) { auto stream = at::cuda::getCurrentCUDAStream().stream(); + const int warp_size = at::cuda::warp_size(); if (gamma != NULL && beta != NULL) { // compute grad_gamma(j) and grad_beta(j) +#ifndef __HIP_PLATFORM_HCC__ + const int part_size = warp_size; +#else const int part_size = 16; - const dim3 threads2(32,4,1); +#endif + const dim3 threads2(warp_size,4,1); const dim3 blocks2((n2+threads2.x-1)/threads2.x,part_size,1); const int nshared2_a = 2 * sizeof(U) * threads2.y * threads2.y * (threads2.x + 1); @@ -758,7 +789,7 @@ void HostLayerNormGradient( part_grad_gamma.DATA_PTR(), part_grad_beta.DATA_PTR()); - const dim3 threads3(32,8,1); + const dim3 threads3(warp_size,8,1); const dim3 blocks3((n2+threads2.x-1)/threads2.x,1,1); const int nshared3 = threads3.x * threads3.y * sizeof(U); cuComputeGradGammaBeta<<>>( @@ -774,7 +805,10 @@ void HostLayerNormGradient( const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; const dim3 blocks1(1, std::min((uint64_t)n1, maxGridY), 1); - const dim3 threads1(32,4,1); + dim3 threads1(warp_size,4,1); +#ifndef __HIP_PLATFORM_HCC__ + threads1.y = 2; +#endif int nshared = threads1.y > 1 ? 
threads1.y*threads1.x*sizeof(U) : diff --git a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu index 2efee39a6..0c068c7cb 100644 --- a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu +++ b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu @@ -18,7 +18,9 @@ #include #include #include +#ifndef __HIP_PLATFORM_HCC__ #include +#endif #include #include #include "scaled_masked_softmax.h" diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h b/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h index 6df83fc10..4e6a467ce 100644 --- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h +++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h @@ -17,7 +17,8 @@ #pragma once #include -#include +#include +// #include #include #include #include @@ -340,7 +341,7 @@ void dispatch_scaled_upper_triang_masked_softmax_forward( int softmax_elements_stride, int attn_batches) { - TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 2048 ); + TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 8192 ); if (softmax_elements == 0) { return; } else { @@ -415,6 +416,14 @@ void dispatch_scaled_upper_triang_masked_softmax_forward( scaled_upper_triang_masked_softmax_warp_forward <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); break; + case 12: // 4096 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 13: // 8192 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; default: break; } @@ -431,7 +440,7 @@ void dispatch_scaled_upper_triang_masked_softmax_backward( int softmax_elements_stride, int attn_batches) { - TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 2048 ); + TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 8192 ); if (softmax_elements == 0) { return; } else { @@ -506,6 +515,14 @@ void dispatch_scaled_upper_triang_masked_softmax_backward( scaled_upper_triang_masked_softmax_warp_backward <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); break; + case 12: // 4096 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 13: // 8192 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; default: break; } diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu index 5efc3d412..4aa9a702a 100644 --- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu +++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu @@ -18,7 +18,9 @@ #include #include #include +#ifndef __HIP_PLATFORM_HCC__ #include +#endif #include #include #include "scaled_upper_triang_masked_softmax.h" @@ -35,7 +37,7 @@ torch::Tensor fwd_cuda( // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] const int attn_batches = input.size(0); const int seq_len = input.size(1); - TORCH_INTERNAL_ASSERT(seq_len <= 2048); + TORCH_INTERNAL_ASSERT(seq_len <= 8192); // Output auto act_options = input.options().requires_grad(false); diff --git a/megatron/fused_kernels/tests/test_fused_kernels.py 
b/megatron/fused_kernels/tests/test_fused_kernels.py index f8d5027a1..3e7deb913 100644 --- a/megatron/fused_kernels/tests/test_fused_kernels.py +++ b/megatron/fused_kernels/tests/test_fused_kernels.py @@ -3,6 +3,10 @@ import torch from torch.nn import LayerNorm +import sys +# add to path +sys.path.append("/home/nouamane/projects/Megatron-DeepSpeed/") +import megatron from megatron.model.enums import AttnMaskType from megatron.model.fused_layer_norm import MixedFusedLayerNorm from megatron.model.fused_softmax import FusedScaleMaskSoftmax diff --git a/megatron/initialize.py b/megatron/initialize.py index b967a228a..2e92b15af 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -194,7 +194,7 @@ def _compile_dependencies(): args.micro_batch_size # Constraints on sequence length and attn_batch_size to enable warp based # optimization and upper triangular optimization (for causal mask) - custom_kernel_constraint = seq_len > 16 and seq_len <=2048 and \ + custom_kernel_constraint = seq_len > 16 and seq_len <=8192 and \ seq_len % 4 == 0 and attn_batch_size % 4 == 0 # Print a warning. if not ((args.fp16 or args.bf16) and diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index ae1fcdb2b..c886b9814 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -23,7 +23,7 @@ class AnnealingLR(object): """Anneals the learning rate.""" def __init__(self, optimizer, max_lr, min_lr, - warmup_steps, decay_steps, decay_style, + warmup_steps, decay_steps, decay_style, warmup_style, use_checkpoint_lr_scheduler=True, override_lr_scheduler=False): args = get_args() @@ -46,6 +46,7 @@ def __init__(self, optimizer, max_lr, min_lr, self.warmup_tokens = 0 self.decay_style = decay_style + self.warmup_style = warmup_style self.override_lr_scheduler = override_lr_scheduler self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler @@ -63,18 +64,33 @@ def get_lr(self): """Learning rate decay functions from: https://openreview.net/pdf?id=BJYwwY9ll pg. 4""" - # Use linear warmup for the initial part. + + # Use warmup for the initial part. if self.warmup_steps > 0 and self.num_steps <= self.warmup_steps: if self.num_steps == self.warmup_steps and \ self.decay_tokens is not None: self.warmup_tokens = self.num_tokens - return self.max_lr * float(self.num_steps) / \ - float(self.warmup_steps) + if self.warmup_style == 'linear': + return self.max_lr * float(self.num_steps) / \ + float(self.warmup_steps) + elif self.warmup_style == 'constant': + return self.max_lr + else: + raise ValueError('Unknown warmup style: {}'.format( + self.warmup_style)) # If the learning rate is constant, just return the initial value. 
if self.decay_style == 'constant':
             return self.max_lr
+
+        # If inverse_sqrt decay style
+        # In warmup phase: lr = max_lr
+        # In decay phase: lr = max_lr * sqrt(warmup_steps) / sqrt(num_steps)
+        # Note: To replicate t5x check https://github.com/TurkuNLP/Megatron-DeepSpeed/pull/2
+        if self.decay_style == 'inverse_sqrt':
+            return self.max_lr * (max(self.warmup_steps, 1) / max(self.num_steps, 1))**0.5
+
         if self.decay_tokens is None:
             # step-based decay
diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py
index 55e9c9dd8..7b1d7eaa7 100644
--- a/megatron/model/fused_layer_norm.py
+++ b/megatron/model/fused_layer_norm.py
@@ -86,10 +86,13 @@ def __init__(self, normalized_shape, eps=1e-5):
         args = get_args()
         self.layernorm_tp_auto_sync = args.sync_tp_duplicated_parameters
 
-        self.use_meg_ds_fused_layer_norm = (
-            args.bf16 # Current Meg-DS cuda kernel has better throughput than torch.nn.LayerNorm
-            or version.parse(torch.__version__) >= version.parse("1.11.0") # https://github.com/pytorch/pytorch/pull/66920
-        )
+        if not args.layer_norm_fusion:
+            self.use_meg_ds_fused_layer_norm = False
+        else:
+            self.use_meg_ds_fused_layer_norm = (
+                args.bf16 # Current Meg-DS cuda kernel has better throughput than torch.nn.LayerNorm
+                or version.parse(torch.__version__) >= version.parse("1.11.0") # https://github.com/pytorch/pytorch/pull/66920
+            )
 
     def reset_parameters(self):
diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py
index 07192e2bf..ccdfbaaef 100644
--- a/megatron/model/fused_softmax.py
+++ b/megatron/model/fused_softmax.py
@@ -167,11 +167,11 @@ def is_kernel_available(self, mask, b, np, sq, sk):
         if (
             self.scaled_masked_softmax_fusion  # user want to fuse
             and self.input_in_float16  # input must be fp16
-            and 16 < sk <= 4096  # sk must be 16 ~ 4096
+            and 16 < sk <= 8192  # sk must be 16 ~ 8192
             and sq % 4 == 0  # sq must be divisor of 4
             and attn_batches % 4 == 0  # np * b must be divisor of 4
         ):
-            if 0 <= sk <= 4096:
+            if 0 <= sk <= 8192:
                 batch_per_block = self.get_batch_per_block(sq, sk, b, np)
 
                 if self.attn_mask_type == AttnMaskType.causal:
@@ -214,7 +214,7 @@ def forward_torch_softmax(self, input, mask):
         if self.scale is not None:
             input = input * self.scale
 
-        if self.attn_mask_type == AttnMaskType.causal:
+        if self.attn_mask_type == AttnMaskType.causal and mask is None:
             assert mask is None
             assert input.shape[2] == input.shape[3]
             mask = self.get_causal_mask(input.shape[2])
diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py
index 738717d55..b492abfb4 100644
--- a/megatron/optimizer/__init__.py
+++ b/megatron/optimizer/__init__.py
@@ -13,8 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from apex.optimizers import FusedAdam as Adam -from apex.optimizers import FusedSGD as SGD +from torch.optim import AdamW +from torch.optim import SGD +from apex.optimizers import FusedAdam +from apex.optimizers import FusedSGD from megatron import get_args from megatron.model.fused_layer_norm import MixedFusedLayerNorm as LayerNorm @@ -72,18 +74,24 @@ def get_megatron_optimizer(model): if args.use_bnb_optimizer: import bitsandbytes as bnb adam_optimizer = bnb.optim.Adam8bit + elif args.optimizer_fusion: + adam_optimizer = FusedAdam else: - adam_optimizer = Adam + adam_optimizer = AdamW optimizer = adam_optimizer(param_groups, lr=args.lr, weight_decay=args.weight_decay, betas=(args.adam_beta1, args.adam_beta2), eps=args.adam_eps) elif args.optimizer == 'sgd': - optimizer = SGD(param_groups, - lr=args.lr, - weight_decay=args.weight_decay, - momentum=args.sgd_momentum) + if args.optimizer_fusion: + sgd_optimizer = FusedSGD + else: + sgd_optimizer = SGD + optimizer = sgd_optimizer(param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + momentum=args.sgd_momentum) else: raise Exception('{} optimizer is not supported.'.format( args.optimizer)) diff --git a/megatron/training.py b/megatron/training.py index bd00bc77e..d86c51440 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -361,6 +361,7 @@ def get_learning_rate_scheduler(optimizer): warmup_steps=warmup_steps, decay_steps=decay_steps, decay_style=args.lr_decay_style, + warmup_style=args.lr_warmup_style, use_checkpoint_lr_scheduler=args.use_checkpoint_lr_scheduler, override_lr_scheduler=args.override_lr_scheduler) diff --git a/pretrain_ul2.py b/pretrain_ul2.py new file mode 100644 index 000000000..89c936666 --- /dev/null +++ b/pretrain_ul2.py @@ -0,0 +1,299 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Pretrain UL2""" + +from functools import partial + +import deepspeed +import torch + +from megatron import ( + get_args, + get_timers, + mpu, + print_rank_0 +) +from megatron.data.dataset_utils import build_train_valid_test_datasets +from megatron.data.gpt_dataset import build_dataset_group +from megatron.data.ul2_dataset import ( + is_decoder_only as _is_decoder_only, + is_prefix_lm as _is_prefix_lm, +) +from megatron.enums import AttnMaskType +from megatron.model.gpt_model import GPTModel, GPTModelPipe +from megatron.model.t5_model import T5Model, t5_position_ids +from megatron.training import pretrain +from megatron.utils import average_losses_across_data_parallel_group + + +def is_decoder_only(): + """Return whether we use a decoder-only model.""" + args = get_args() + return _is_decoder_only(args.ul2_model_type) + + +def is_prefix_lm(): + """Return whether we use a non-causal decoder-only model.""" + args = get_args() + return _is_prefix_lm(args.ul2_model_type) + + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + assert pre_process and post_process, "UL2 doesn't yet support pipelining" + + print_rank_0('building UL2 model ...') + args = get_args() + + with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(), + remote_device=None if args.remote_device == 'none' else args.remote_device, + config_dict_or_path=args.deepspeed_config, + enabled=args.zero_stage == 3, + mpu=mpu): + if args.deepspeed and is_decoder_only(): + if is_prefix_lm(): + args.pretrain_causal_attention = False + print_rank_0('Using prefix LM UL2 model.') + model = GPTModelPipe( + num_tokentypes=0, + parallel_output=True, + attn_mask_type=AttnMaskType.custom + ) + model._megatron_batch_fn = get_batch_pipe + else: + args.pretrain_causal_attention = True + print_rank_0('Using decoder-only causal UL2 model.') + model = GPTModelPipe( + num_tokentypes=0, + parallel_output=True, + attn_mask_type=AttnMaskType.causal + ) + # This is a hack to give us a reference to get_batch_pipe from within training.py + # We need to call model.set_batch_fn after deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe + elif is_decoder_only(): + print_rank_0('Using decoder-only UL2 model.') + model = GPTModel( + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + prefix_lm=is_prefix_lm(), + ) + else: + print_rank_0('Using encoder-decoder UL2 model.') + model = T5Model(num_tokentypes=0, parallel_output=True) + return model + +from megatron.global_vars import get_tokenizer +def visualize_model_inputs(tokens, attention_mask, labels, loss_mask): + print("SHAPES", tokens.shape, attention_mask.shape, labels.shape, loss_mask.shape) + tok = get_tokenizer() + + print("TOKENS:", tok.detokenize(tokens[0, :].cpu().numpy().tolist())) + print("LABELS:", tok.detokenize(labels[0, :].cpu().numpy().tolist())) + + print("ATTN:", attention_mask[:100]) + print("LOSSMSK:", loss_mask[:100]) + +def get_batch_pipe(data): + """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" + if is_decoder_only(): + keys = ['text', 'labels', 'loss_mask', 'dec_mask'] + else: + keys = ['text_enc', 'text_dec', 'labels', 'loss_mask', + 'enc_mask', 'dec_mask', 'enc_dec_mask'] + datatype = torch.int64 + + data_b = mpu.broadcast_data(keys, data, datatype) + + + # print( + # visualize_model_inputs( + # data_b['text'], + # data_b['dec_mask'], + # data_b['labels'], + # data_b['loss_mask'], + # ) + # ) + + tokens = data_b['text'].long() + 
+    labels = data_b['labels'].long()
+    loss_mask = data_b['loss_mask'].float()
+
+    dec_mask = (data_b['dec_mask'] < 0.5)
+    dec_mask = dec_mask.unsqueeze(1)
+
+    position_ids = t5_position_ids(tokens)
+
+    return (tokens, position_ids, dec_mask), (labels, loss_mask)
+
+
+def get_batch(data_iterator):
+    """Build the batch."""
+
+    if is_decoder_only():
+        keys = ['text', 'labels', 'loss_mask', 'dec_mask']
+    else:
+        keys = ['text_enc', 'text_dec', 'labels', 'loss_mask',
+                'enc_mask', 'dec_mask', 'enc_dec_mask']
+    datatype = torch.int64
+
+    # Broadcast data.
+    if data_iterator is not None:
+        data = next(data_iterator)
+    else:
+        data = None
+    data_b = mpu.broadcast_data(keys, data, datatype)
+
+    # Debug helper only: visualize_model_inputs prints its inputs and returns
+    # None, and data_b['text'] only exists for the decoder-only keys, so keep
+    # this disabled by default (mirroring get_batch_pipe above).
+    # visualize_model_inputs(
+    #     data_b['text'],
+    #     data_b['dec_mask'],
+    #     data_b['labels'],
+    #     data_b['loss_mask'],
+    # )
+
+    # Unpack.
+    if is_decoder_only():
+        tokens = data_b['text'].long()
+        labels = data_b['labels'].long()
+        loss_mask = data_b['loss_mask'].float()
+
+        dec_mask = (data_b['dec_mask'] < 0.5)
+        dec_mask = dec_mask.unsqueeze(1)
+        return tokens, loss_mask, labels, dec_mask
+    else:
+        tokens_enc = data_b['text_enc'].long()
+        tokens_dec = data_b['text_dec'].long()
+        labels = data_b['labels'].long()
+        loss_mask = data_b['loss_mask'].float()
+
+        enc_mask = (data_b['enc_mask'] < 0.5)
+        dec_mask = (data_b['dec_mask'] < 0.5)
+        enc_dec_mask = (data_b['enc_dec_mask'] < 0.5)
+
+        return tokens_enc, tokens_dec, loss_mask, labels, \
+               enc_mask, dec_mask, enc_dec_mask
+
+
+def loss_func(loss_mask, output_tensor):
+    if is_decoder_only():
+        lm_loss_ = output_tensor
+    else:
+        lm_loss_, _ = output_tensor
+
+    lm_loss_ = lm_loss_.float()
+    lm_loss = torch.sum(
+        lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum()
+
+    loss = lm_loss
+    averaged_losses = average_losses_across_data_parallel_group([lm_loss])
+
+    return loss, {'lm loss': averaged_losses[0]}
+
+
+def forward_step(data_iterator, model):
+    """Forward step."""
+    args = get_args()
+    timers = get_timers()
+
+    # Get the batch.
+ timers('batch generator').start() + if is_decoder_only(): + (tokens, loss_mask, lm_labels, dec_mask) = get_batch(data_iterator) + else: + ( + tokens_enc, tokens_dec, loss_mask, lm_labels, + enc_mask, dec_mask, enc_dec_mask, + ) = get_batch(data_iterator) + timers('batch generator').stop() + + # Forward model lm_labels + if is_decoder_only(): + position_ids = t5_position_ids(tokens) + output_tensor = model(tokens, position_ids, dec_mask, + labels=lm_labels) + else: + output_tensor = model(tokens_enc, + tokens_dec, + enc_mask, + dec_mask, + enc_dec_mask, + tokentype_ids=None, + lm_labels=lm_labels) + + return output_tensor, partial(loss_func, loss_mask) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + train_ds, valid_ds, test_ds = None, None, None + + print_rank_0('> building train, validation, and test datasets ' + 'for UL2 ...') + if args.data_path: + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + max_seq_length=args.encoder_seq_length, + max_seq_length_dec=args.decoder_seq_length, + masked_lm_prob=args.mask_prob, + short_seq_prob=args.short_seq_prob, + seed=args.seed, + skip_warmup=(not args.mmap_warmup), + dataset_type='ul2') + elif args.train_weighted_split_paths: + assigned_train_valid_test = [] + if args.train_weighted_split_paths is not None: + train_ds = [] + assigned_train_valid_test.append("train") + if args.valid_weighted_split_paths is not None: + valid_ds = [] + assigned_train_valid_test.append("valid") + if args.test_weighted_split_paths is not None: + test_ds = [] + assigned_train_valid_test.append("test") + + for s in assigned_train_valid_test: + data_groups = zip(eval(f"args.{s}_weighted_split_paths"), + eval(f"args.{s}_weighted_split_weights"), + eval(f"args.{s}_weighted_split_splits"), + eval(f"args.{s}_weighted_split_names")) + for paths, weights, splits, name in data_groups: + d = build_dataset_group(name, paths, weights, splits, + args.data_impl, + train_val_test_num_samples, + args.seq_length, args.seed, + (not args.mmap_warmup), + train_valid_test=s) + eval(f"{s}_ds").append(d) + else: + raise NotImplementedError("No dataloading argument passed") + + print_rank_0("> finished creating UL2 datasets ...") + + return train_ds, valid_ds, test_ds + + +if __name__ == "__main__": + + pretrain(train_valid_test_datasets_provider, model_provider, forward_step, + args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}) diff --git a/tasks/eval_harness/download.py b/tasks/eval_harness/download.py index d2abcd83a..be5d5c303 100644 --- a/tasks/eval_harness/download.py +++ b/tasks/eval_harness/download.py @@ -1,6 +1,7 @@ # Downloads the specified taks in the evaluation harness # This is particularly useful when running in environments where the GPU nodes # do not have internet access. This way we can pre-download them and use the cached data-set during evaluation. +# May want to set a cache before, e.g. 
export HF_DATASETS_CACHE=/scratch/project_462000119/ds_cache from lm_eval import tasks from lm_eval.tasks import ALL_TASKS diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 68dd649fd..3511cc601 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -42,6 +42,8 @@ def __init__(self, model, tokenizer): self.tokenizer = tokenizer self.VOCAB_SIZE = tokenizer.vocab_size self.EOT_TOKEN_ID = tokenizer.eod + self.add_denoiser = args.add_denoiser + self.DENOISER_TOKEN_ID = tokenizer.tokenize("[S]")[0] self._max_length = args.seq_length @@ -80,9 +82,15 @@ def loglikelihood(self, requests): for context, continuation in requests: if context == "": # end of text as context - context_enc = [self.EOT_TOKEN_ID] + if self.add_denoiser: + context_enc = [self.DENOISER_TOKEN_ID] + [self.EOT_TOKEN_ID] + else: + context_enc = [self.EOT_TOKEN_ID] else: - context_enc = self.tokenizer_encode(context) + if self.add_denoiser: + context_enc = [self.DENOISER_TOKEN_ID] + self.tokenizer_encode(context) + else: + context_enc = self.tokenizer_encode(context) continuation_enc = self.tokenizer_encode(continuation) @@ -260,7 +268,7 @@ def tokenizer_encode(self, text): from megatron.initialize import initialize_megatron import megatron -from tools.convert_checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint +from deepspeed.checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint from tools.convert_checkpoint.deepspeed_to_megatron import _create_rank_checkpoint def override_args(args, override_args, skip_keys, skip_if_specified_keys): @@ -390,6 +398,9 @@ def tasks_args(parser): group.add_argument('--intermed_results', default = False, action='store_true', help='Whether to print & write intermediate results for each task') group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr estimation') group.add_argument('--micro_bs_multiplier', type=int, default=1, help='Increase the global batch size to remove bubble when pipeline parallel') + group.add_argument('--fewshots', type=int, default=0, help='Num fewshots') + group.add_argument('--limit', type=int, default=None, help='Limit samples') + group.add_argument('--add_denoiser', default = False, action='store_true', help='Whether to add a denoiser to the model') return parser from megatron.global_vars import _parse_args @@ -398,6 +409,10 @@ def main(): # parse the megatron args. But wait with initalizing megatron. # avoid printing the arguments, since they will later be overridden. 
args = _parse_args(tasks_args) + if os.path.exists(args.results_path): + print("Exists ", args.results_path) + exit() + load_path = args.load model = load_ds_checkpoint_and_setup_megatron(args) @@ -422,11 +437,11 @@ def main(): global_results = {"results": {}, "versions": {}} timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') iteration_id = load_path.split("/")[-1].replace("/", "") - results_path = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}.json") + results_path = args.results_path#.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_{args.fewshots}shots.json") # Backup file in case of interruption during writing - results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_backup.json") + results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_{args.fewshots}shots_backup.json") for task_name, task in task_dict.items(): - results = evaluator.evaluate(adaptor, {task_name: task}, False, 0, None, bootstrap_iters=args.bootstrap_iters) + results = evaluator.evaluate(adaptor, {task_name: task}, False, args.fewshots, bootstrap_iters=args.bootstrap_iters, limit=args.limit) global_results["results"] = {**global_results["results"], **results["results"]} global_results["versions"] = {**global_results["versions"], **results["versions"]} if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: @@ -436,7 +451,7 @@ def main(): with open(results_path_backup, 'w') as outfile: json.dump(global_results, outfile, indent=4) else: - global_results = evaluator.evaluate(adaptor, task_dict, False, 0, None, bootstrap_iters=args.bootstrap_iters) + global_results = evaluator.evaluate(adaptor, task_dict, False, args.fewshots, bootstrap_iters=args.bootstrap_iters, limit=args.limit) if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: print(json.dumps(global_results, indent=2)) with open(args.results_path, 'w') as outfile: diff --git a/tools/convert_checkpoint/deepspeed_to_megatron.py b/tools/convert_checkpoint/deepspeed_to_megatron.py index 74e5ca7c9..08471d0bb 100755 --- a/tools/convert_checkpoint/deepspeed_to_megatron.py +++ b/tools/convert_checkpoint/deepspeed_to_megatron.py @@ -4,7 +4,7 @@ import os import torch from collections import OrderedDict -from .deepspeed_checkpoint import ARGS_KEY, DeepSpeedCheckpoint +from deepspeed.checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint MODEL_KEY = 'model' ARGS_KEY = 'args'
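# A minimal sketch of what the new --add_denoiser flag does in the eval-harness
# adaptor above: the "[S]" sentinel token is prepended to every context before
# scoring. The `tokenizer` argument here is a stand-in with `tokenize` and `eod`
# members; the real adaptor uses the Megatron tokenizer and its own
# tokenizer_encode helper.
def encode_context(tokenizer, context, add_denoiser=False):
    if context == "":
        # Empty context: continuations are scored against end-of-text alone.
        context_enc = [tokenizer.eod]
    else:
        context_enc = tokenizer.tokenize(context)
    if add_denoiser:
        # Prepend the denoiser sentinel, mirroring DENOISER_TOKEN_ID above.
        context_enc = [tokenizer.tokenize("[S]")[0]] + context_enc
    return context_enc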