From 71728c2f33dd8135eb29811e14f584e64629f27d Mon Sep 17 00:00:00 2001 From: Daniel Hesslow Date: Sun, 28 Nov 2021 21:47:24 +0100 Subject: [PATCH 01/69] Add functionality for running the evaluation harness on single gpu --- examples/run_evalharness.sh | 70 ++++++++++ tasks/eval_harness/download.py | 30 ++++ tasks/eval_harness/evaluate.py | 243 +++++++++++++++++++++++++++++++++ 3 files changed, 343 insertions(+) create mode 100644 examples/run_evalharness.sh create mode 100644 tasks/eval_harness/download.py create mode 100644 tasks/eval_harness/evaluate.py diff --git a/examples/run_evalharness.sh b/examples/run_evalharness.sh new file mode 100644 index 000000000..a803045d1 --- /dev/null +++ b/examples/run_evalharness.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +# Example file to run the evaluation harness. +# + +export HF_DATASETS_CACHE=$SCRATCH/cache/ + + +CHECKPOINT_PATH=checkpoints/gpt2 +#_pipeline +VOCAB_FILE=gpt2-vocab.json +MERGE_FILE=gpt2-merges.txt +DATA_PATH=my-gpt2_text_document + +GPT_ARGS=" \ + --num-layers 12 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --seq-length 1024 \ + --max-position-embeddings 1024 \ + --micro-batch-size 4 \ + --global-batch-size 8 \ + --lr 0.00015 \ + --train-iters 500000 \ + --lr-decay-iters 320000 \ + --lr-decay-style cosine \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --lr-warmup-fraction .01 \ + --fp16 \ + --pipeline-model-parallel-size 2\ + --tensor-model-parallel-size 1\ + " + + +DATA_ARGS=" \ + --load $CHECKPOINT_PATH \ + --tokenizer-type GPT2BPETokenizer + " +# --load $CHECKPOINT_PATH \ + + + +CMD="./tasks/eval_harness/evaluate.py $GPT_ARGS $DATA_ARGS" +N_GPUS=1 +LAUNCHER="deepspeed --num_gpus $N_GPUS" + +$LAUNCHER $CMD + +""" +python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/eval_harness/evaluate.py \ + --task_list piqa,boolq,wikitext \ + --tokenizer-type GPT2BPETokenizer \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --load $CHECKPOINT \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --num-layers 12 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --micro-batch-size 1 \ + --checkpoint-activations \ + --seq-length 1024 \ + --max-position-embeddings 1024 \ + --log-interval 10 \ + --fp16 \ + --no-load-optim \ + --no-load-rng +""" \ No newline at end of file diff --git a/tasks/eval_harness/download.py b/tasks/eval_harness/download.py new file mode 100644 index 000000000..0656cd656 --- /dev/null +++ b/tasks/eval_harness/download.py @@ -0,0 +1,30 @@ +# Downloads the specified taks in the evaluation harness +# This is particularly useful when running in environments where the GPU nodes +# do not have internet access. This way we can pre-download them and use the cached data-set during evaluation. 
+ +from lm_eval.base import LM +from lm_eval import evaluator, tasks +from lm_eval.tasks import ALL_TASKS +import argparse +import os + + +parser = argparse.ArgumentParser(description='Download evaluation harness', allow_abbrev=False) +parser.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks to download.') +parser.add_argument('--save_path', type=str, default = "./task_cache.pickle", help='Path to where the downloaded data tasks will be stored.') +args = parser.parse_args() + +import pickle + +def main(): + task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') + task_dict = tasks.get_task_dict(task_list) + with open(args.save_path, 'wb') as file: + pickle.dump(task_dict, file, protocol=pickle.HIGHEST_PROTOCOL) + print(f"Tasks have been saved to {args.save_path}!") + +if __name__ == '__main__': + main() + + + \ No newline at end of file diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py new file mode 100644 index 000000000..4c3710a14 --- /dev/null +++ b/tasks/eval_harness/evaluate.py @@ -0,0 +1,243 @@ + +import os +import sys +import numpy as np +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), + os.path.pardir,os.path.pardir))) + +# Downloads the taks in the evaluation harness +# This is particularly useful when running in environments where the GPU nodes +# do not have internet access. This way we can pre-download them and use the cached data-set during evaluation. + +from lm_eval.models.gpt2 import GPT2LM + +from lm_eval import evaluator, tasks, utils +from megatron.global_vars import get_current_global_batch_size +from lm_eval.base import CacheHook +from tqdm import tqdm +import torch.nn.functional as F + +from lm_eval.tasks import ALL_TASKS +from pretrain_gpt import model_provider +import math + +import torch +import megatron.model +from megatron import get_args +from megatron import print_rank_0, is_last_rank +from megatron import get_tokenizer +from megatron import mpu +from megatron.checkpointing import load_checkpoint +from megatron.model.gpt_model import GPTModel +from megatron.training import get_model,setup_model_and_optimizer + +from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model +from megatron.p2p_communication import recv_forward, send_forward +from tasks.finetune_utils import build_data_loader + + +#from .datasets import build_dataset + +# These are needed to unwrap the model, would be nice to put these in megatron.utils if possible? +from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP +from megatron.model.distributed import DistributedDataParallel as LocalDDP +from megatron.model.module import Float16Module + +class EvalHarnessAdaptor(GPT2LM): + def __init__(self, model, tokenizer): + args = get_args() + self.args = args + self.model = model + self.tokenizer = tokenizer + self.tokenizer.encode = self.tokenizer.tokenize + self.VOCAB_SIZE = tokenizer.vocab_size + self.EOT_TOKEN_ID = tokenizer.eod + + # Comes form the neox code. Should this not be args.max_position_embeddings - max_gen_tokens? 
+ self.max_length = args.max_position_embeddings // 2 + self.max_gen_toks = 128 + self.batch_size = get_current_global_batch_size() + self.cache_hook = CacheHook(None) + self.is_main = args.rank == 0 + self.is_local_main = args.local_rank == 0 + self.device = f"cuda:{args.local_rank}" + # TODO + #self.is_model_parallel = neox_args.model_parallel_size > 1 + #self.is_pipe_parallel = self.model.is_pipe_parallel + #self.is_data_parallel = self.model.is_data_parallel + self.is_model_parallel = mpu.get_tensor_model_parallel_world_size() > 1 + self.is_pipe_parallel = mpu.get_pipeline_model_parallel_world_size() > 1 + self.is_data_parallel = mpu.get_data_parallel_world_size() > 1 + + if self.is_pipe_parallel: + print("***") + print("***") + print("RUNNING PIPELINE PARALLEL!") + print("***") + print("***") + + print(self.model) + + + #if self.is_model_parallel: + # raise NotImplementedError("Tensor parallelism is currently not supported for evaluation") + if self.is_data_parallel: + raise NotImplementedError("Data parallelism is currently not supported for evaluation") + + + + self.is_last_stage = True if not self.is_pipe_parallel else mpu.is_pipeline_last_stage() # only the last stage of the pipeline model will receive the logits + + def _loglikelihood_tokens(self, requests, disable_tqdm=False): + disable_tqdm = disable_tqdm if self.is_main else True + res = [] + res_len = 0 # storing the result length for later + with torch.no_grad(): + def _collate(x): + toks = x[1] + x[2] + return (-len(toks), tuple(toks)) + + reord = utils.Reorderer(requests, _collate) + for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size): + inps, contlens, inplens, padding_length = [], [], [], None + for _, context_enc, continuation_enc in chunk: + # when too long to fit in context, truncate from the left + inp = torch.tensor( + (context_enc + continuation_enc)[-(self.max_length + 1):][:-1] + , dtype=torch.long).to(self.device) + inplen, = inp.shape + + cont = continuation_enc + + # since in _collate we make sure length is descending, the longest is always the first one. 
+ padding_length = padding_length if padding_length is not None else inplen + + # pad to length + inp = torch.cat([ + inp, # [seq] + torch.zeros(padding_length - inplen, dtype=torch.long).to(inp.device) # [padding_length - seq] + ], dim=0) + + inps.append(inp.unsqueeze(0)) + contlens.append(cont) + inplens.append(inplen) + + logits = self._model_call(torch.cat(inps, dim=0)) + res_len += len(chunk) + + if logits is not None: + multi_logits = F.log_softmax(logits, dim=-1) # [batch, seq, vocab] + for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(chunk, multi_logits, inps, inplens, + contlens): + contlen = len(cont_toks) + logits = logits[inplen - contlen:inplen].unsqueeze(0) # [1, seq, vocab] + greedy_tokens = logits.argmax(dim=-1) + # cont_toks :: [1, seq] + cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0).to(multi_logits.device) + max_equal = (greedy_tokens == cont_toks).all() + logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [1, seq] + answer = (float(logits.sum()), bool(max_equal)) + + # partial caching + if cache_key is not None: + self.cache_hook.add_partial("loglikelihood", cache_key, answer) + + res.append(answer) + + # broadcast results to all ranks + if self.is_pipe_parallel: + src_rank = mpu.get_pipeline_model_parallel_last_rank() + + if res: + logits_sums, max_equals = list(zip(*res)) + logits_sums = torch.FloatTensor(logits_sums).cuda() + max_equals = torch.LongTensor(max_equals).cuda() + else: + logits_sums = torch.zeros(res_len, dtype=torch.float32).cuda() + max_equals = torch.zeros(res_len, dtype=torch.int64).cuda() + torch.distributed.broadcast(tensor=logits_sums, src=src_rank) + torch.distributed.broadcast(tensor=max_equals, src=src_rank) + max_equals = [bool(i) for i in max_equals.tolist()] + logits_sums = logits_sums.tolist() + res = list(zip(logits_sums, max_equals)) + + return reord.get_original(res) + + def _model_call(self, inps): + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + inps, + self.tokenizer.eod, + self.args.reset_position_ids, + self.args.reset_attention_mask, + self.args.eod_mask_loss, + prefix_indices=None, + loss_on_targets_only=self.args.loss_on_targets_only + ) + if False: + args = get_args() + self.model.eval() + args.attn_mask = attention_mask + self.model.module.activation_checkpoint_interval = 0 + self.model._compute_loss = False + self.model.fwd_outputs = [] + self.model.module.pipe_buffers["inputs"].append((inps, position_ids, attention_mask)) + self.model.module.pipe_buffers["outputs"].append(None) + + with torch.no_grad(): + self.model._exec_forward_pass(buffer_id=0) + + logits = self.model.pipe_buffers["outputs"][0] + + logits = self.model(inps, position_ids, attention_mask) + return logits + + + + +from megatron.initialize import initialize_megatron + +def get_tasks_args(parser): + """Provide extra arguments required for tasks.""" + group = parser.add_argument_group(title='eval harness') + group.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks.') + group.add_argument('--task_load_path', type=str, default = "./task_cache.pickle", help='Path to where the downloaded tasks are stored, or None if download is possible.') + return parser + +import pickle + +def main(): + + initialize_megatron(extra_args_provider=get_tasks_args) + args = get_args() + if args.num_layers_per_virtual_pipeline_stage is not None: + print("Interleaved pipeline schedule is not yet supported for text generation.") + exit() + + #load eval 
harness task dict + if args.task_load_path != 'None': + with open(args.task_load_path, 'rb') as file: + task_dict = pickle.load(file) + + if args.task_list != 'all': + task_list = args.task_list.split(',') + task_dict = dict((k,task_dict[k]) for k in task_list) + + else: + task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') + task_dict = tasks.get_task_dict(task_list) + + # Set up model and load checkpoint. + model = get_model(model_provider) + if args.load is not None: + _ = load_checkpoint(model, None, None) + + assert len(model) == 1, "Above condition should have caught this" + model = model[0] + + tokenizer = get_tokenizer() + adaptor = EvalHarnessAdaptor(model, tokenizer) + results = evaluator.evaluate(adaptor, task_dict, False, 0, None) + print(results) + +if __name__ == '__main__': + main() \ No newline at end of file From 97a2339c183bceb5b9c7e5e46c7daf95f6c2dd12 Mon Sep 17 00:00:00 2001 From: Daniel Hesslow Date: Mon, 29 Nov 2021 02:06:12 +0100 Subject: [PATCH 02/69] Add support for pipelining --- examples/run_evalharness.sh | 31 ++-------------- tasks/eval_harness/evaluate.py | 66 ++++++++++++++-------------------- 2 files changed, 29 insertions(+), 68 deletions(-) diff --git a/examples/run_evalharness.sh b/examples/run_evalharness.sh index a803045d1..978b075c3 100644 --- a/examples/run_evalharness.sh +++ b/examples/run_evalharness.sh @@ -5,9 +5,7 @@ export HF_DATASETS_CACHE=$SCRATCH/cache/ - -CHECKPOINT_PATH=checkpoints/gpt2 -#_pipeline +CHECKPOINT_PATH=checkpoints/gpt2_pipeline VOCAB_FILE=gpt2-vocab.json MERGE_FILE=gpt2-merges.txt DATA_PATH=my-gpt2_text_document @@ -32,39 +30,14 @@ GPT_ARGS=" \ --tensor-model-parallel-size 1\ " - DATA_ARGS=" \ --load $CHECKPOINT_PATH \ --tokenizer-type GPT2BPETokenizer " -# --load $CHECKPOINT_PATH \ - CMD="./tasks/eval_harness/evaluate.py $GPT_ARGS $DATA_ARGS" -N_GPUS=1 +N_GPUS=2 LAUNCHER="deepspeed --num_gpus $N_GPUS" $LAUNCHER $CMD - -""" -python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/eval_harness/evaluate.py \ - --task_list piqa,boolq,wikitext \ - --tokenizer-type GPT2BPETokenizer \ - --vocab-file $VOCAB_FILE \ - --merge-file $MERGE_FILE \ - --load $CHECKPOINT \ - --tensor-model-parallel-size 1 \ - --pipeline-model-parallel-size 1 \ - --num-layers 12 \ - --hidden-size 1024 \ - --num-attention-heads 16 \ - --micro-batch-size 1 \ - --checkpoint-activations \ - --seq-length 1024 \ - --max-position-embeddings 1024 \ - --log-interval 10 \ - --fp16 \ - --no-load-optim \ - --no-load-rng -""" \ No newline at end of file diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 4c3710a14..483a00223 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -35,7 +35,6 @@ from megatron.p2p_communication import recv_forward, send_forward from tasks.finetune_utils import build_data_loader - #from .datasets import build_dataset # These are needed to unwrap the model, would be nice to put these in megatron.utils if possible? @@ -53,14 +52,13 @@ def __init__(self, model, tokenizer): self.VOCAB_SIZE = tokenizer.vocab_size self.EOT_TOKEN_ID = tokenizer.eod - # Comes form the neox code. Should this not be args.max_position_embeddings - max_gen_tokens? 
- self.max_length = args.max_position_embeddings // 2 + self.max_length = args.max_position_embeddings self.max_gen_toks = 128 - self.batch_size = get_current_global_batch_size() + self.batch_size = args.micro_batch_size self.cache_hook = CacheHook(None) self.is_main = args.rank == 0 self.is_local_main = args.local_rank == 0 - self.device = f"cuda:{args.local_rank}" + self.device = torch.cuda.current_device() # TODO #self.is_model_parallel = neox_args.model_parallel_size > 1 #self.is_pipe_parallel = self.model.is_pipe_parallel @@ -69,29 +67,18 @@ def __init__(self, model, tokenizer): self.is_pipe_parallel = mpu.get_pipeline_model_parallel_world_size() > 1 self.is_data_parallel = mpu.get_data_parallel_world_size() > 1 - if self.is_pipe_parallel: - print("***") - print("***") - print("RUNNING PIPELINE PARALLEL!") - print("***") - print("***") - - print(self.model) - - #if self.is_model_parallel: # raise NotImplementedError("Tensor parallelism is currently not supported for evaluation") if self.is_data_parallel: raise NotImplementedError("Data parallelism is currently not supported for evaluation") - - self.is_last_stage = True if not self.is_pipe_parallel else mpu.is_pipeline_last_stage() # only the last stage of the pipeline model will receive the logits def _loglikelihood_tokens(self, requests, disable_tqdm=False): disable_tqdm = disable_tqdm if self.is_main else True res = [] res_len = 0 # storing the result length for later + self.model.eval() with torch.no_grad(): def _collate(x): toks = x[1] + x[2] @@ -124,7 +111,6 @@ def _collate(x): logits = self._model_call(torch.cat(inps, dim=0)) res_len += len(chunk) - if logits is not None: multi_logits = F.log_softmax(logits, dim=-1) # [batch, seq, vocab] for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(chunk, multi_logits, inps, inplens, @@ -163,6 +149,7 @@ def _collate(x): return reord.get_original(res) + def _model_call(self, inps): attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( inps, @@ -173,26 +160,28 @@ def _model_call(self, inps): prefix_indices=None, loss_on_targets_only=self.args.loss_on_targets_only ) - if False: - args = get_args() - self.model.eval() - args.attn_mask = attention_mask - self.model.module.activation_checkpoint_interval = 0 - self.model._compute_loss = False - self.model.fwd_outputs = [] - self.model.module.pipe_buffers["inputs"].append((inps, position_ids, attention_mask)) - self.model.module.pipe_buffers["outputs"].append(None) - - with torch.no_grad(): - self.model._exec_forward_pass(buffer_id=0) - - logits = self.model.pipe_buffers["outputs"][0] - - logits = self.model(inps, position_ids, attention_mask) - return logits + + # Since the shape of the micro-batch will change + # We need set the correct shapes here + # So that latter pipeline stages knows which shapes to expect. + # Otherwise we will deadlock. + args = get_args() + args.micro_batch_size = len(inps) + args.seq_length = len(inps[0]) + input_tensor = recv_forward() + # Forward pass through the model. 
+ unwrapped_model = unwrap_model(self.model, (torchDDP, LocalDDP, Float16Module)) + unwrapped_model.set_input_tensor(input_tensor) + output = self.model(inps, position_ids, attention_mask) + + send_forward(output) + if mpu.is_pipeline_last_stage(): + return output + else: + return None from megatron.initialize import initialize_megatron @@ -208,6 +197,7 @@ def get_tasks_args(parser): def main(): initialize_megatron(extra_args_provider=get_tasks_args) + args = get_args() if args.num_layers_per_virtual_pipeline_stage is not None: print("Interleaved pipeline schedule is not yet supported for text generation.") @@ -227,10 +217,8 @@ def main(): task_dict = tasks.get_task_dict(task_list) # Set up model and load checkpoint. - model = get_model(model_provider) - if args.load is not None: - _ = load_checkpoint(model, None, None) - + model, _, _ = setup_model_and_optimizer(model_provider) + assert len(model) == 1, "Above condition should have caught this" model = model[0] From e72a18f5ab03672849023bbba0b29446863c215b Mon Sep 17 00:00:00 2001 From: Daniel Hesslow Date: Mon, 29 Nov 2021 11:24:04 +0100 Subject: [PATCH 03/69] support tensor parallel --- examples/run_evalharness.sh | 6 +++--- tasks/eval_harness/evaluate.py | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/run_evalharness.sh b/examples/run_evalharness.sh index 978b075c3..076a898d4 100644 --- a/examples/run_evalharness.sh +++ b/examples/run_evalharness.sh @@ -5,7 +5,7 @@ export HF_DATASETS_CACHE=$SCRATCH/cache/ -CHECKPOINT_PATH=checkpoints/gpt2_pipeline +CHECKPOINT_PATH=checkpoints/gpt2_tensor VOCAB_FILE=gpt2-vocab.json MERGE_FILE=gpt2-merges.txt DATA_PATH=my-gpt2_text_document @@ -26,8 +26,8 @@ GPT_ARGS=" \ --merge-file $MERGE_FILE \ --lr-warmup-fraction .01 \ --fp16 \ - --pipeline-model-parallel-size 2\ - --tensor-model-parallel-size 1\ + --pipeline-model-parallel-size 1\ + --tensor-model-parallel-size 2\ " DATA_ARGS=" \ diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 483a00223..9caebdd82 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -30,6 +30,7 @@ from megatron.checkpointing import load_checkpoint from megatron.model.gpt_model import GPTModel from megatron.training import get_model,setup_model_and_optimizer +from megatron.mpu.mappings import gather_from_tensor_model_parallel_region from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model from megatron.p2p_communication import recv_forward, send_forward @@ -110,6 +111,7 @@ def _collate(x): inplens.append(inplen) logits = self._model_call(torch.cat(inps, dim=0)) + res_len += len(chunk) if logits is not None: multi_logits = F.log_softmax(logits, dim=-1) # [batch, seq, vocab] @@ -177,9 +179,8 @@ def _model_call(self, inps): output = self.model(inps, position_ids, attention_mask) send_forward(output) - if mpu.is_pipeline_last_stage(): - return output + return gather_from_tensor_model_parallel_region(output) else: return None From 0a82965143b06bf5036ea9c98c2594d02f96b1ea Mon Sep 17 00:00:00 2001 From: Daniel Hesslow Date: Mon, 29 Nov 2021 11:34:28 +0100 Subject: [PATCH 04/69] save the results --- tasks/eval_harness/evaluate.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 9caebdd82..6bb66fcdc 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -191,10 +191,11 @@ def get_tasks_args(parser): group = parser.add_argument_group(title='eval harness') 
group.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks.') group.add_argument('--task_load_path', type=str, default = "./task_cache.pickle", help='Path to where the downloaded tasks are stored, or None if download is possible.') + group.add_argument('--results_path', type=str, default = "./results.json", help='Path to where the results will be stored.') return parser import pickle - +import json def main(): initialize_megatron(extra_args_provider=get_tasks_args) @@ -226,7 +227,12 @@ def main(): tokenizer = get_tokenizer() adaptor = EvalHarnessAdaptor(model, tokenizer) results = evaluator.evaluate(adaptor, task_dict, False, 0, None) - print(results) + + print_rank_0(json.dumps(results, indent=2)) + if args.rank==0: + with open(args.results_path, 'w') as outfile: + json.dump(results, outfile, indent = 4) + if __name__ == '__main__': main() \ No newline at end of file From ceddfc5b8a0cc58445bdbdac02ee82633eef8e0e Mon Sep 17 00:00:00 2001 From: Daniel Hesslow Date: Mon, 29 Nov 2021 11:44:53 +0100 Subject: [PATCH 05/69] Minor cleanup --- tasks/eval_harness/evaluate.py | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 6bb66fcdc..b02f4bc73 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -1,7 +1,6 @@ import os import sys -import numpy as np sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir,os.path.pardir))) @@ -10,35 +9,27 @@ # do not have internet access. This way we can pre-download them and use the cached data-set during evaluation. from lm_eval.models.gpt2 import GPT2LM - from lm_eval import evaluator, tasks, utils -from megatron.global_vars import get_current_global_batch_size from lm_eval.base import CacheHook from tqdm import tqdm import torch.nn.functional as F from lm_eval.tasks import ALL_TASKS from pretrain_gpt import model_provider -import math import torch -import megatron.model from megatron import get_args -from megatron import print_rank_0, is_last_rank +from megatron import print_rank_0 from megatron import get_tokenizer from megatron import mpu -from megatron.checkpointing import load_checkpoint -from megatron.model.gpt_model import GPTModel -from megatron.training import get_model,setup_model_and_optimizer +from megatron.training import setup_model_and_optimizer from megatron.mpu.mappings import gather_from_tensor_model_parallel_region from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model from megatron.p2p_communication import recv_forward, send_forward -from tasks.finetune_utils import build_data_loader - -#from .datasets import build_dataset +import pickle +import json -# These are needed to unwrap the model, would be nice to put these in megatron.utils if possible? 
from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP from megatron.model.distributed import DistributedDataParallel as LocalDDP from megatron.model.module import Float16Module @@ -60,16 +51,10 @@ def __init__(self, model, tokenizer): self.is_main = args.rank == 0 self.is_local_main = args.local_rank == 0 self.device = torch.cuda.current_device() - # TODO - #self.is_model_parallel = neox_args.model_parallel_size > 1 - #self.is_pipe_parallel = self.model.is_pipe_parallel - #self.is_data_parallel = self.model.is_data_parallel self.is_model_parallel = mpu.get_tensor_model_parallel_world_size() > 1 self.is_pipe_parallel = mpu.get_pipeline_model_parallel_world_size() > 1 self.is_data_parallel = mpu.get_data_parallel_world_size() > 1 - #if self.is_model_parallel: - # raise NotImplementedError("Tensor parallelism is currently not supported for evaluation") if self.is_data_parallel: raise NotImplementedError("Data parallelism is currently not supported for evaluation") @@ -194,8 +179,6 @@ def get_tasks_args(parser): group.add_argument('--results_path', type=str, default = "./results.json", help='Path to where the results will be stored.') return parser -import pickle -import json def main(): initialize_megatron(extra_args_provider=get_tasks_args) From c1e8022560048740429465b01960a16fe7dcf49c Mon Sep 17 00:00:00 2001 From: Daniel Hesslow Date: Mon, 29 Nov 2021 16:17:24 +0100 Subject: [PATCH 06/69] Experimental Deepspeed support --- tasks/eval_harness/evaluate.py | 85 ++++++++++++++++++++++++++-------- 1 file changed, 66 insertions(+), 19 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index b02f4bc73..5a51adc51 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -1,4 +1,5 @@ +from logging import logMultiprocessing import os import sys sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), @@ -54,7 +55,7 @@ def __init__(self, model, tokenizer): self.is_model_parallel = mpu.get_tensor_model_parallel_world_size() > 1 self.is_pipe_parallel = mpu.get_pipeline_model_parallel_world_size() > 1 self.is_data_parallel = mpu.get_data_parallel_world_size() > 1 - + self.deepspeed = args.deepspeed if self.is_data_parallel: raise NotImplementedError("Data parallelism is currently not supported for evaluation") @@ -84,7 +85,12 @@ def _collate(x): # since in _collate we make sure length is descending, the longest is always the first one. padding_length = padding_length if padding_length is not None else inplen - + + if self.deepspeed: + # deepspeed doesn't like chaning seq length. + padding_length = self.max_length + + # pad to length inp = torch.cat([ inp, # [seq] @@ -96,7 +102,6 @@ def _collate(x): inplens.append(inplen) logits = self._model_call(torch.cat(inps, dim=0)) - res_len += len(chunk) if logits is not None: multi_logits = F.log_softmax(logits, dim=-1) # [batch, seq, vocab] @@ -147,27 +152,62 @@ def _model_call(self, inps): prefix_indices=None, loss_on_targets_only=self.args.loss_on_targets_only ) - - # Since the shape of the micro-batch will change - # We need set the correct shapes here - # So that latter pipeline stages knows which shapes to expect. - # Otherwise we will deadlock. args = get_args() args.micro_batch_size = len(inps) args.seq_length = len(inps[0]) - input_tensor = recv_forward() - - # Forward pass through the model. 
- unwrapped_model = unwrap_model(self.model, (torchDDP, LocalDDP, Float16Module)) - unwrapped_model.set_input_tensor(input_tensor) - output = self.model(inps, position_ids, attention_mask) + if args.deepspeed: + # This is quite hacky. + # I don't know if DS has a good inference api for pipelining. + # So we just manually roll our own here, same as with Megatron. + # We definitely need to verify that the order of the stages is guaranteed to be the same though. + + input_tensor = recv_forward() - send_forward(output) - if mpu.is_pipeline_last_stage(): - return gather_from_tensor_model_parallel_region(output) + if input_tensor == None: + input_tensor = (inps, position_ids, attention_mask) + + self.model.pipe_buffers["inputs"] = [input_tensor] + self.model.pipe_buffers["outputs"] = [None] + + # Run model + with torch.no_grad(): + self.model._exec_forward_pass(buffer_id=0) + + output = self.model.pipe_buffers["outputs"][0] + + send_forward(output) + + # Prevent model from saving any state, to prevent OOM + self.model.loss = None + self.model.total_loss = None + self.model.fwd_outputs = [] + self.model.pipe_buffers["outputs"] = [None] + + if mpu.is_pipeline_last_stage(): + return gather_from_tensor_model_parallel_region(output)[..., :self.tokenizer.vocab_size] + else: + #print("Fist stage: discarding output of shape", output.shape, "probably hidden?") + return None + else: - return None + # Since the shape of the micro-batch will change + # We need set the correct shapes here + # So that latter pipeline stages knows which shapes to expect. + # Otherwise we will deadlock. + + input_tensor = recv_forward() + + # Forward pass through the model. + unwrapped_model = unwrap_model(self.model, (torchDDP, LocalDDP, Float16Module)) + unwrapped_model.set_input_tensor(input_tensor) + output = self.model(inps, position_ids, attention_mask) + + send_forward(output) + if mpu.is_pipeline_last_stage(): + return gather_from_tensor_model_parallel_region(output)[..., :self.tokenizer.vocab_size] + else: + return None from megatron.initialize import initialize_megatron @@ -203,10 +243,17 @@ def main(): # Set up model and load checkpoint. 
model, _, _ = setup_model_and_optimizer(model_provider) - + assert len(model) == 1, "Above condition should have caught this" model = model[0] + + if args.deepspeed: + model.module.activation_checkpoint_interval = 0 + model._compute_loss = False + model.fwd_outputs = [] + + tokenizer = get_tokenizer() adaptor = EvalHarnessAdaptor(model, tokenizer) results = evaluator.evaluate(adaptor, task_dict, False, 0, None) From 0f8c8c097aaed43c0e6b13d01d0448c2824e1c61 Mon Sep 17 00:00:00 2001 From: Daniel Hesslow Date: Wed, 1 Dec 2021 23:47:47 +0100 Subject: [PATCH 07/69] Proper deepspeed integration, now working on combined tp and pp --- examples/run_evalharness.sh | 20 ++++-- tasks/eval_harness/evaluate.py | 108 ++++++++++++++------------------- 2 files changed, 62 insertions(+), 66 deletions(-) diff --git a/examples/run_evalharness.sh b/examples/run_evalharness.sh index 076a898d4..cfeb230ea 100644 --- a/examples/run_evalharness.sh +++ b/examples/run_evalharness.sh @@ -5,18 +5,27 @@ export HF_DATASETS_CACHE=$SCRATCH/cache/ -CHECKPOINT_PATH=checkpoints/gpt2_tensor +CHECKPOINT_PATH=checkpoints/gpt2_both_ds VOCAB_FILE=gpt2-vocab.json MERGE_FILE=gpt2-merges.txt DATA_PATH=my-gpt2_text_document +config_json="./ds_config.json" +ZERO_STAGE=1 +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${ZERO_STAGE} \ + --deepspeed-activation-checkpointing \ + " + GPT_ARGS=" \ --num-layers 12 \ --hidden-size 1024 \ --num-attention-heads 16 \ --seq-length 1024 \ --max-position-embeddings 1024 \ - --micro-batch-size 4 \ + --micro-batch-size 2 \ --global-batch-size 8 \ --lr 0.00015 \ --train-iters 500000 \ @@ -26,7 +35,7 @@ GPT_ARGS=" \ --merge-file $MERGE_FILE \ --lr-warmup-fraction .01 \ --fp16 \ - --pipeline-model-parallel-size 1\ + --pipeline-model-parallel-size 2\ --tensor-model-parallel-size 2\ " @@ -36,8 +45,9 @@ DATA_ARGS=" \ " -CMD="./tasks/eval_harness/evaluate.py $GPT_ARGS $DATA_ARGS" -N_GPUS=2 +CMD="./tasks/eval_harness/evaluate.py $DEEPSPEED_ARGS $GPT_ARGS $DATA_ARGS --task_list piqa" +N_GPUS=4 LAUNCHER="deepspeed --num_gpus $N_GPUS" + $LAUNCHER $CMD diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 5a51adc51..865b20990 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -1,4 +1,5 @@ +from functools import reduce from logging import logMultiprocessing import os import sys @@ -17,7 +18,8 @@ from lm_eval.tasks import ALL_TASKS from pretrain_gpt import model_provider - +import numpy as np + import torch from megatron import get_args from megatron import print_rank_0 @@ -34,6 +36,7 @@ from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP from megatron.model.distributed import DistributedDataParallel as LocalDDP from megatron.model.module import Float16Module +from deepspeed.runtime.pipe import schedule class EvalHarnessAdaptor(GPT2LM): def __init__(self, model, tokenizer): @@ -47,7 +50,9 @@ def __init__(self, model, tokenizer): self.max_length = args.max_position_embeddings self.max_gen_toks = 128 - self.batch_size = args.micro_batch_size + # For ds we split into mini batches and then micro batches to keep pipelining api happy. 
+ # With Megatron we just go to micro_batches directly + self.batch_size = args.micro_batch_size if (not args.deepspeed) else args.global_batch_size self.cache_hook = CacheHook(None) self.is_main = args.rank == 0 self.is_local_main = args.local_rank == 0 @@ -61,7 +66,7 @@ def __init__(self, model, tokenizer): self.is_last_stage = True if not self.is_pipe_parallel else mpu.is_pipeline_last_stage() # only the last stage of the pipeline model will receive the logits - def _loglikelihood_tokens(self, requests, disable_tqdm=False): + def _loglikelihood_tokens(self, requests, disable_tqdm=False): disable_tqdm = disable_tqdm if self.is_main else True res = [] res_len = 0 # storing the result length for later @@ -86,10 +91,10 @@ def _collate(x): # since in _collate we make sure length is descending, the longest is always the first one. padding_length = padding_length if padding_length is not None else inplen + # Deepspeed doesn't like it when we change the seq len + # The recent curriculum learning patches should fix this, but for now it's let's just do it the slow and easy way. if self.deepspeed: - # deepspeed doesn't like chaning seq length. padding_length = self.max_length - # pad to length inp = torch.cat([ @@ -122,79 +127,62 @@ def _collate(x): res.append(answer) - # broadcast results to all ranks - if self.is_pipe_parallel: - src_rank = mpu.get_pipeline_model_parallel_last_rank() - - if res: - logits_sums, max_equals = list(zip(*res)) - logits_sums = torch.FloatTensor(logits_sums).cuda() - max_equals = torch.LongTensor(max_equals).cuda() - else: - logits_sums = torch.zeros(res_len, dtype=torch.float32).cuda() - max_equals = torch.zeros(res_len, dtype=torch.int64).cuda() - torch.distributed.broadcast(tensor=logits_sums, src=src_rank) - torch.distributed.broadcast(tensor=max_equals, src=src_rank) - max_equals = [bool(i) for i in max_equals.tolist()] - logits_sums = logits_sums.tolist() - res = list(zip(logits_sums, max_equals)) - + if not mpu.is_pipeline_last_stage(): + # @HACK: To make the eval harness happy on threads that don't have access to the results. + # We just randomly generate some data. + res = [(np.random.rand(), np.random.rand()>0.5) for _ in requests] + return reord.get_original(res) + def get_batch_pipe(self, tokens): + """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" + args = get_args() + tokenizer = get_tokenizer() - def _model_call(self, inps): + # Get the masks and postition ids. attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( - inps, - self.tokenizer.eod, - self.args.reset_position_ids, - self.args.reset_attention_mask, - self.args.eod_mask_loss, + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, prefix_indices=None, - loss_on_targets_only=self.args.loss_on_targets_only - ) + loss_on_targets_only=False) + + return (tokens, position_ids, attention_mask), (tokens, loss_mask) + + def _model_call(self, inps): args = get_args() - args.micro_batch_size = len(inps) - args.seq_length = len(inps[0]) if args.deepspeed: - # This is quite hacky. - # I don't know if DS has a good inference api for pipelining. - # So we just manually roll our own here, same as with Megatron. - # We definitely need to verify that the order of the stages is guaranteed to be the same though. 
+ self.model.set_batch_fn(self.get_batch_pipe) - input_tensor = recv_forward() - - if input_tensor == None: - input_tensor = (inps, position_ids, attention_mask) - - self.model.pipe_buffers["inputs"] = [input_tensor] - self.model.pipe_buffers["outputs"] = [None] - - # Run model - with torch.no_grad(): - self.model._exec_forward_pass(buffer_id=0) - - output = self.model.pipe_buffers["outputs"][0] - - send_forward(output) + # round up to multiple of micro_batch_size + new_size = ((len(inps) + args.micro_batch_size-1) // args.micro_batch_size) * args.micro_batch_size + padded = F.pad(inps, (0, 0, 0, new_size), value = 1) - # Prevent model from saving any state, to prevent OOM - self.model.loss = None - self.model.total_loss = None - self.model.fwd_outputs = [] - self.model.pipe_buffers["outputs"] = [None] + # dummy data iterator for pipelining. + data_iterator = list((torch.stack(inp) for inp in utils.chunks(padded, args.micro_batch_size))) + self.model.micro_batches = len(data_iterator) + output = self.model.eval_batch(iter(data_iterator), compute_loss = False, reduce_output = None) + if mpu.is_pipeline_last_stage(): + output = torch.cat(output, 0)[:len(inps)] return gather_from_tensor_model_parallel_region(output)[..., :self.tokenizer.vocab_size] else: - #print("Fist stage: discarding output of shape", output.shape, "probably hidden?") return None else: + + _, position_ids, attention_mask = self.get_batch_pipe(inps)[0] + # Since the shape of the micro-batch will change # We need set the correct shapes here # So that latter pipeline stages knows which shapes to expect. # Otherwise we will deadlock. + args.micro_batch_size = len(inps) + args.seq_length = len(inps[0]) input_tensor = recv_forward() @@ -247,19 +235,17 @@ def main(): assert len(model) == 1, "Above condition should have caught this" model = model[0] - if args.deepspeed: model.module.activation_checkpoint_interval = 0 model._compute_loss = False model.fwd_outputs = [] - tokenizer = get_tokenizer() adaptor = EvalHarnessAdaptor(model, tokenizer) results = evaluator.evaluate(adaptor, task_dict, False, 0, None) - print_rank_0(json.dumps(results, indent=2)) - if args.rank==0: + if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: + print(json.dumps(results, indent=2)) with open(args.results_path, 'w') as outfile: json.dump(results, outfile, indent = 4) From 37e6962e8667e90fb59ab95225dc8b805e7c620b Mon Sep 17 00:00:00 2001 From: Daniel Hesslow Date: Sun, 5 Dec 2021 02:59:32 +0100 Subject: [PATCH 08/69] Update model loading and clean up code. --- examples/run_evalharness.sh | 71 ++--- tasks/eval_harness/evaluate.py | 251 ++++++++++++------ .../deepspeed_to_megatron.py | 2 +- 3 files changed, 189 insertions(+), 135 deletions(-) diff --git a/examples/run_evalharness.sh b/examples/run_evalharness.sh index cfeb230ea..04d9d3aac 100644 --- a/examples/run_evalharness.sh +++ b/examples/run_evalharness.sh @@ -1,53 +1,32 @@ -#!/bin/bash +CHECKPOINT_PATH=/gpfsssd/scratch/rech/bbv/utw68ny/checkpoints/final_step +CHECKPOINT_PATH=/gpfsscratch/rech/bbv/utw68ny/checkpoints/gpt2-350m-en/global_step37876 +#CHECKPOINT_PATH=/gpfsscratch/rech/bbv/utw68ny/checkpoints/tr3m-1B3-pile/global_step296023/ -# Example file to run the evaluation harness. 
-# - -export HF_DATASETS_CACHE=$SCRATCH/cache/ - -CHECKPOINT_PATH=checkpoints/gpt2_both_ds +PP_SIZE=1 +TP_SIZE=2 VOCAB_FILE=gpt2-vocab.json MERGE_FILE=gpt2-merges.txt -DATA_PATH=my-gpt2_text_document -config_json="./ds_config.json" -ZERO_STAGE=1 -DEEPSPEED_ARGS=" \ - --deepspeed \ - --deepspeed_config ${config_json} \ - --zero-stage ${ZERO_STAGE} \ - --deepspeed-activation-checkpointing \ +#dummy arguments to make megatron happy. +MEGATRON_REQUIRED_ARGS="\ + --num-layers -1\ + --hidden-size -1\ + --num-attention-heads -1\ + --seq-length -1 \ + --max-position-embeddings -1 +" + +CMD="./tasks/eval_harness/evaluate.py \ + --load $CHECKPOINT_PATH\ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE\ + --vocab-file $VOCAB_FILE\ + --merge-file $MERGE_FILE\ + --micro-batch-size 1\ + --task_list piqa\ + $MEGATRON_REQUIRED_ARGS\ " -GPT_ARGS=" \ - --num-layers 12 \ - --hidden-size 1024 \ - --num-attention-heads 16 \ - --seq-length 1024 \ - --max-position-embeddings 1024 \ - --micro-batch-size 2 \ - --global-batch-size 8 \ - --lr 0.00015 \ - --train-iters 500000 \ - --lr-decay-iters 320000 \ - --lr-decay-style cosine \ - --vocab-file $VOCAB_FILE \ - --merge-file $MERGE_FILE \ - --lr-warmup-fraction .01 \ - --fp16 \ - --pipeline-model-parallel-size 2\ - --tensor-model-parallel-size 2\ - " - -DATA_ARGS=" \ - --load $CHECKPOINT_PATH \ - --tokenizer-type GPT2BPETokenizer - " - - -CMD="./tasks/eval_harness/evaluate.py $DEEPSPEED_ARGS $GPT_ARGS $DATA_ARGS --task_list piqa" -N_GPUS=4 +N_GPUS=2 LAUNCHER="deepspeed --num_gpus $N_GPUS" - - -$LAUNCHER $CMD +$LAUNCHER $CMD \ No newline at end of file diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 865b20990..a9673a370 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -1,4 +1,3 @@ - from functools import reduce from logging import logMultiprocessing import os @@ -6,7 +5,7 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir,os.path.pardir))) -# Downloads the taks in the evaluation harness +# Downloads the tasks in the evaluation harness # This is particularly useful when running in environments where the GPU nodes # do not have internet access. This way we can pre-download them and use the cached data-set during evaluation. @@ -25,7 +24,7 @@ from megatron import print_rank_0 from megatron import get_tokenizer from megatron import mpu -from megatron.training import setup_model_and_optimizer +from megatron.training import setup_model_and_optimizer, get_model from megatron.mpu.mappings import gather_from_tensor_model_parallel_region from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model @@ -44,15 +43,14 @@ def __init__(self, model, tokenizer): self.args = args self.model = model self.tokenizer = tokenizer - self.tokenizer.encode = self.tokenizer.tokenize + #self.tokenizer.encode = self.tokenizer.tokenize self.VOCAB_SIZE = tokenizer.vocab_size self.EOT_TOKEN_ID = tokenizer.eod - self.max_length = args.max_position_embeddings - self.max_gen_toks = 128 + self.max_length = args.seq_length # For ds we split into mini batches and then micro batches to keep pipelining api happy. 
# With Megatron we just go to micro_batches directly - self.batch_size = args.micro_batch_size if (not args.deepspeed) else args.global_batch_size + self.batch_size = args.micro_batch_size if (not args.deepspeed) else np.min(args.global_batch_size, 32) self.cache_hook = CacheHook(None) self.is_main = args.rank == 0 self.is_local_main = args.local_rank == 0 @@ -66,6 +64,49 @@ def __init__(self, model, tokenizer): self.is_last_stage = True if not self.is_pipe_parallel else mpu.is_pipeline_last_stage() # only the last stage of the pipeline model will receive the logits + + def loglikelihood(self, requests): + new_reqs = [] + for context, continuation in requests: + if context == "": + # end of text as context + context_enc = [self.EOT_TOKEN_ID] + else: + context_enc = self.tokenizer_encode(context) + + continuation_enc = self.tokenizer_encode(continuation) + + new_reqs.append(((context, continuation), context_enc, continuation_enc)) + + return self._loglikelihood_tokens(new_reqs) + + def loglikelihood_rolling(self, requests): + # TODO: Implement caching once we've confirmed the perplexity implementation + # TODO: automatic batch size detection for vectorization + + loglikelihoods = [] + with torch.no_grad(): + for string, in tqdm(requests): + rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows( + token_list=self.tokenizer_encode(string), + prefix_token=self.EOT_TOKEN_ID, + max_seq_len=self.max_length, + context_len=1, + ))) + + rolling_token_windows = [(None,) + x for x in rolling_token_windows] + + # TODO: extract out this call so it only gets called once and also somehow figure out partial caching for that + string_nll = self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True) + + # discard is_greedy + string_nll = [x[0] for x in string_nll] + + string_nll = sum(string_nll) + loglikelihoods.append(string_nll) + + return loglikelihoods + def _loglikelihood_tokens(self, requests, disable_tqdm=False): disable_tqdm = disable_tqdm if self.is_main else True res = [] @@ -90,12 +131,7 @@ def _collate(x): # since in _collate we make sure length is descending, the longest is always the first one. padding_length = padding_length if padding_length is not None else inplen - - # Deepspeed doesn't like it when we change the seq len - # The recent curriculum learning patches should fix this, but for now it's let's just do it the slow and easy way. 
- if self.deepspeed: - padding_length = self.max_length - + padding_length = self.max_length # pad to length inp = torch.cat([ inp, # [seq] @@ -107,24 +143,26 @@ def _collate(x): inplens.append(inplen) logits = self._model_call(torch.cat(inps, dim=0)) + #print(logits) res_len += len(chunk) if logits is not None: - multi_logits = F.log_softmax(logits, dim=-1) # [batch, seq, vocab] - for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(chunk, multi_logits, inps, inplens, - contlens): + multi_logits = F.log_softmax(logits, dim=-1).cpu() # [batch, seq, vocab] + + + for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(chunk, multi_logits, inps, inplens, contlens): contlen = len(cont_toks) logits = logits[inplen - contlen:inplen].unsqueeze(0) # [1, seq, vocab] greedy_tokens = logits.argmax(dim=-1) # cont_toks :: [1, seq] - cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0).to(multi_logits.device) + cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0) max_equal = (greedy_tokens == cont_toks).all() + # last_token_slice = logits[:, -1, :].squeeze(0).tolist() + logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [1, seq] answer = (float(logits.sum()), bool(max_equal)) - # partial caching if cache_key is not None: self.cache_hook.add_partial("loglikelihood", cache_key, answer) - res.append(answer) if not mpu.is_pipeline_last_stage(): @@ -134,87 +172,133 @@ def _collate(x): return reord.get_original(res) - def get_batch_pipe(self, tokens): - """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" + def create_model_inputs(self, tokens): args = get_args() - tokenizer = get_tokenizer() - # Get the masks and postition ids. attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( tokens, - tokenizer.eod, + self.EOT_TOKEN_ID, args.reset_position_ids, args.reset_attention_mask, args.eod_mask_loss, prefix_indices=None, loss_on_targets_only=False) - return (tokens, position_ids, attention_mask), (tokens, loss_mask) + return (tokens, position_ids, attention_mask) def _model_call(self, inps): args = get_args() - - if args.deepspeed: - self.model.set_batch_fn(self.get_batch_pipe) - - # round up to multiple of micro_batch_size - new_size = ((len(inps) + args.micro_batch_size-1) // args.micro_batch_size) * args.micro_batch_size - padded = F.pad(inps, (0, 0, 0, new_size), value = 1) - - # dummy data iterator for pipelining. - data_iterator = list((torch.stack(inp) for inp in utils.chunks(padded, args.micro_batch_size))) - self.model.micro_batches = len(data_iterator) - - output = self.model.eval_batch(iter(data_iterator), compute_loss = False, reduce_output = None) - - if mpu.is_pipeline_last_stage(): - output = torch.cat(output, 0)[:len(inps)] - return gather_from_tensor_model_parallel_region(output)[..., :self.tokenizer.vocab_size] - else: - return None - + # Since the shape of the micro-batch will change + # We need set the correct shapes here + # So that latter pipeline stages knows which shapes to expect. + # Otherwise we will deadlock. + args.micro_batch_size = len(inps) + args.seq_length = len(inps[0]) + + input_tensor = recv_forward() + + # Forward pass through the model. 
+ unwrapped_model = unwrap_model(self.model, (torchDDP, LocalDDP, Float16Module)) + unwrapped_model.set_input_tensor(input_tensor) + output = self.model(*self.create_model_inputs(inps)) + + send_forward(output) + + if mpu.is_pipeline_last_stage(): + return gather_from_tensor_model_parallel_region(output)[..., :self.tokenizer.vocab_size] + else: + return None + + def tokenizer_encode(self, text): + """Tokenize text *without* adding special tokens.""" + # Splitting this into its own method in case we need to handle special cases for different tokenizers + from megatron.tokenizer.gpt2_tokenization import GPT2Tokenizer + if isinstance(self.tokenizer.tokenizer, GPT2Tokenizer): + return self.tokenizer.tokenizer.encode(text) else: + return self.tokenizer.tokenizer.encode(text, add_special_tokens=False) - _, position_ids, attention_mask = self.get_batch_pipe(inps)[0] - - # Since the shape of the micro-batch will change - # We need set the correct shapes here - # So that latter pipeline stages knows which shapes to expect. - # Otherwise we will deadlock. - args.micro_batch_size = len(inps) - args.seq_length = len(inps[0]) - - input_tensor = recv_forward() - - # Forward pass through the model. - unwrapped_model = unwrap_model(self.model, (torchDDP, LocalDDP, Float16Module)) - unwrapped_model.set_input_tensor(input_tensor) - output = self.model(inps, position_ids, attention_mask) - - send_forward(output) - if mpu.is_pipeline_last_stage(): - return gather_from_tensor_model_parallel_region(output)[..., :self.tokenizer.vocab_size] - else: - return None from megatron.initialize import initialize_megatron +import megatron + +from tools.convert_checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint +from tools.convert_checkpoint.deepspeed_to_megatron import _create_rank_checkpoint + +def override_args(args, override_args, skip_keys, skip_if_specified_keys): + for k, v in vars(override_args).items(): + if k in skip_keys: + continue + if k in skip_if_specified_keys and getattr(args, k) is not None: + continue + setattr(args, k, v) + + +# Note(Hesslow): +# The model loading is a bit convoluted. +# We want to parse out the model arguments from the checkpoint and use those to initialize megatron-ds. +# +# However megatron-ds expects its arguments on the command line. +# And at that point we don't know them. +# +# Instead we use Jasons way: we load the arguments form the checkpoint and then override _parse_args to return whatever args we want. +# +# If the checkpoint is old, some new arguments may have been introduced and the code will expect these arguments to exist. +# In order to support this we _first_ parse the arguments normally, and then override them with the arguments from the checkpoint. +# Keeping the default-value of newer arguments. +# +# We then use the megatron deepspeed converter to load the deepspeed checkpoints as if they we're megatron checkpoints. +def load_ds_checkpoint_and_setup_megatron(extra_args_provider): + # parse the megatorn args. But wait with initalizing megatron. + args = _parse_args(extra_args_provider) + + ds_checkpoint = DeepSpeedCheckpoint(args.load, + tp_degree=args.tensor_model_parallel_size, + pp_degree=args.pipeline_model_parallel_size) + + cp_args = ds_checkpoint.get_args() + + # Merge the current args with the checkpoint args. 
+ skip_keys = ['world_size', 'rank', 'local_rank','device_count', 'micro_batch_size', 'tensorboard_dir', 'deepspeed', 'deepspeed_config', + 'data_parallel_size', 'pipeline_model_parallel_size', 'tensor_model_parallel_size', 'load'] + + skip_if_specified = ['merge_file', 'vocab_file'] + + + override_args(args, cp_args, skip_keys, skip_if_specified) + + # stop megatron from reparsing the arguments. + megatron.global_vars._parse_args = lambda *_args, **kwarg: args + megatron.global_vars._GLOBAL_ARGS = args + + initialize_megatron() + + # Initializing megatron will update eg. tokenizer size. Override again. + override_args(args, cp_args, skip_keys, skip_if_specified) + + # Initialize megatron model using the parsed state dict. + sd = _create_rank_checkpoint(ds_checkpoint, None, mpu.get_tensor_model_parallel_rank(), mpu.get_pipeline_model_parallel_rank(), True) + + model = get_model(model_provider)[0] + model.load_state_dict(sd['model'], strict=True) + + torch.distributed.barrier() + return model -def get_tasks_args(parser): +def tasks_args(parser): """Provide extra arguments required for tasks.""" - group = parser.add_argument_group(title='eval harness') - group.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks.') - group.add_argument('--task_load_path', type=str, default = "./task_cache.pickle", help='Path to where the downloaded tasks are stored, or None if download is possible.') - group.add_argument('--results_path', type=str, default = "./results.json", help='Path to where the results will be stored.') + parser.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks.') + parser.add_argument('--task_load_path', type=str, default = "./task_cache.pickle", help='Path to where the downloaded tasks are stored, or None if download is possible.') + parser.add_argument('--results_path', type=str, default = "./results.json", help='Path to where the results will be stored.') return parser +from megatron.global_vars import _parse_args + def main(): - - initialize_megatron(extra_args_provider=get_tasks_args) + model = load_ds_checkpoint_and_setup_megatron(extra_args_provider=tasks_args) args = get_args() - if args.num_layers_per_virtual_pipeline_stage is not None: - print("Interleaved pipeline schedule is not yet supported for text generation.") - exit() + assert not args.deepspeed, "Running this script in deepspeed-mode is not supported. We run all models using Megatron." #load eval harness task dict if args.task_load_path != 'None': @@ -224,21 +308,13 @@ def main(): if args.task_list != 'all': task_list = args.task_list.split(',') task_dict = dict((k,task_dict[k]) for k in task_list) - else: task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') task_dict = tasks.get_task_dict(task_list) - # Set up model and load checkpoint. 
- model, _, _ = setup_model_and_optimizer(model_provider) - - assert len(model) == 1, "Above condition should have caught this" - model = model[0] - - if args.deepspeed: - model.module.activation_checkpoint_interval = 0 - model._compute_loss = False - model.fwd_outputs = [] + model.module.activation_checkpoint_interval = 0 + model._compute_loss = False + model.fwd_outputs = [] tokenizer = get_tokenizer() adaptor = EvalHarnessAdaptor(model, tokenizer) @@ -249,6 +325,5 @@ def main(): with open(args.results_path, 'w') as outfile: json.dump(results, outfile, indent = 4) - if __name__ == '__main__': main() \ No newline at end of file diff --git a/tools/convert_checkpoint/deepspeed_to_megatron.py b/tools/convert_checkpoint/deepspeed_to_megatron.py index 022759372..017036af4 100755 --- a/tools/convert_checkpoint/deepspeed_to_megatron.py +++ b/tools/convert_checkpoint/deepspeed_to_megatron.py @@ -4,7 +4,7 @@ import os import torch from collections import OrderedDict -from deepspeed_checkpoint import ARGS_KEY, DeepSpeedCheckpoint +from .deepspeed_checkpoint import ARGS_KEY, DeepSpeedCheckpoint MODEL_KEY = 'model' ARGS_KEY = 'args' From ee0a1a9090e409221067312e497370b13e24fd87 Mon Sep 17 00:00:00 2001 From: Daniel Hesslow Date: Sun, 5 Dec 2021 22:47:58 +0100 Subject: [PATCH 09/69] Add some options --- examples/run_evalharness.sh | 16 ++++++++------ tasks/eval_harness/evaluate.py | 40 +++++++++++++++++++++------------- 2 files changed, 34 insertions(+), 22 deletions(-) diff --git a/examples/run_evalharness.sh b/examples/run_evalharness.sh index 04d9d3aac..3f8344b11 100644 --- a/examples/run_evalharness.sh +++ b/examples/run_evalharness.sh @@ -1,9 +1,9 @@ -CHECKPOINT_PATH=/gpfsssd/scratch/rech/bbv/utw68ny/checkpoints/final_step -CHECKPOINT_PATH=/gpfsscratch/rech/bbv/utw68ny/checkpoints/gpt2-350m-en/global_step37876 -#CHECKPOINT_PATH=/gpfsscratch/rech/bbv/utw68ny/checkpoints/tr3m-1B3-pile/global_step296023/ +#CHECKPOINT_PATH=/gpfsscratch/rech/bbv/utw68ny/checkpoints/gpt2-350m-en/global_step37876 +#CHECKPOINT_PATH=checkpoints/gpt2_both_ds/global_step3000 +CHECKPOINT_PATH=/gpfsscratch/rech/bbv/utw68ny/checkpoints/tr3m-1B3-pile/global_step296023/ PP_SIZE=1 -TP_SIZE=2 +TP_SIZE=1 VOCAB_FILE=gpt2-vocab.json MERGE_FILE=gpt2-merges.txt @@ -22,11 +22,13 @@ CMD="./tasks/eval_harness/evaluate.py \ --pipeline-model-parallel-size $PP_SIZE\ --vocab-file $VOCAB_FILE\ --merge-file $MERGE_FILE\ - --micro-batch-size 1\ - --task_list piqa\ + --micro-batch-size 64\ + --adaptive_seq_len\ + --eval_fp32\ + --task_list hellaswag,mrpc,piqa\ $MEGATRON_REQUIRED_ARGS\ " -N_GPUS=2 +N_GPUS=1 LAUNCHER="deepspeed --num_gpus $N_GPUS" $LAUNCHER $CMD \ No newline at end of file diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index a9673a370..429e26e11 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -43,14 +43,13 @@ def __init__(self, model, tokenizer): self.args = args self.model = model self.tokenizer = tokenizer - #self.tokenizer.encode = self.tokenizer.tokenize self.VOCAB_SIZE = tokenizer.vocab_size self.EOT_TOKEN_ID = tokenizer.eod self.max_length = args.seq_length # For ds we split into mini batches and then micro batches to keep pipelining api happy. 
# With Megatron we just go to micro_batches directly - self.batch_size = args.micro_batch_size if (not args.deepspeed) else np.min(args.global_batch_size, 32) + self.batch_size = args.micro_batch_size self.cache_hook = CacheHook(None) self.is_main = args.rank == 0 self.is_local_main = args.local_rank == 0 @@ -58,7 +57,7 @@ def __init__(self, model, tokenizer): self.is_model_parallel = mpu.get_tensor_model_parallel_world_size() > 1 self.is_pipe_parallel = mpu.get_pipeline_model_parallel_world_size() > 1 self.is_data_parallel = mpu.get_data_parallel_world_size() > 1 - self.deepspeed = args.deepspeed + self.adaptive_seq_len = args.adaptive_seq_len if self.is_data_parallel: raise NotImplementedError("Data parallelism is currently not supported for evaluation") @@ -131,7 +130,8 @@ def _collate(x): # since in _collate we make sure length is descending, the longest is always the first one. padding_length = padding_length if padding_length is not None else inplen - padding_length = self.max_length + if not self.adaptive_seq_len: + padding_length = self.max_length # pad to length inp = torch.cat([ inp, # [seq] @@ -143,12 +143,10 @@ def _collate(x): inplens.append(inplen) logits = self._model_call(torch.cat(inps, dim=0)) - #print(logits) res_len += len(chunk) if logits is not None: multi_logits = F.log_softmax(logits, dim=-1).cpu() # [batch, seq, vocab] - for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(chunk, multi_logits, inps, inplens, contlens): contlen = len(cont_toks) logits = logits[inplen - contlen:inplen].unsqueeze(0) # [1, seq, vocab] @@ -164,7 +162,7 @@ def _collate(x): if cache_key is not None: self.cache_hook.add_partial("loglikelihood", cache_key, answer) res.append(answer) - + if not mpu.is_pipeline_last_stage(): # @HACK: To make the eval harness happy on threads that don't have access to the results. # We just randomly generate some data. @@ -192,9 +190,11 @@ def _model_call(self, inps): # We need set the correct shapes here # So that latter pipeline stages knows which shapes to expect. # Otherwise we will deadlock. + args.micro_batch_size = len(inps) args.seq_length = len(inps[0]) - + args.max_position_embeddings = args.seq_length + input_tensor = recv_forward() # Forward pass through the model. @@ -256,14 +256,17 @@ def load_ds_checkpoint_and_setup_megatron(extra_args_provider): tp_degree=args.tensor_model_parallel_size, pp_degree=args.pipeline_model_parallel_size) + cp_args = ds_checkpoint.get_args() - # Merge the current args with the checkpoint args. 
- skip_keys = ['world_size', 'rank', 'local_rank','device_count', 'micro_batch_size', 'tensorboard_dir', 'deepspeed', 'deepspeed_config', + skip_keys = ['world_size', 'rank', 'local_rank','device_count', 'micro_batch_size','global_batch_size', 'batch_size', 'tensorboard_dir', 'deepspeed', 'deepspeed_config', 'data_parallel_size', 'pipeline_model_parallel_size', 'tensor_model_parallel_size', 'load'] skip_if_specified = ['merge_file', 'vocab_file'] + if args.eval_fp32: + cp_args.fp16 = False + cp_args.bf16 = False override_args(args, cp_args, skip_keys, skip_if_specified) @@ -281,15 +284,22 @@ def load_ds_checkpoint_and_setup_megatron(extra_args_provider): model = get_model(model_provider)[0] model.load_state_dict(sd['model'], strict=True) - + torch.distributed.barrier() + if args.eval_fp32: + model = model.float() + return model def tasks_args(parser): """Provide extra arguments required for tasks.""" - parser.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks.') - parser.add_argument('--task_load_path', type=str, default = "./task_cache.pickle", help='Path to where the downloaded tasks are stored, or None if download is possible.') - parser.add_argument('--results_path', type=str, default = "./results.json", help='Path to where the results will be stored.') + group = parser.add_argument_group(title='Evaluation options') + group.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks.') + group.add_argument('--task_load_path', type=str, default = "./task_cache.pickle", help='Path to where the downloaded tasks are stored, or None if download is possible.') + group.add_argument('--results_path', type=str, default = "./results.json", help='Path to where the results will be stored.') + group.add_argument('--adaptive_seq_len', default = False, action='store_true', + help='Should the sequence length be adapted to the batch during evaluation, if in fp16 the results will be slightly different due to numerical errors but greatly speed up evaluation.') + group.add_argument('--eval_fp32', default = False, action='store_true', help='Should the evaluation run in fp32') return parser from megatron.global_vars import _parse_args @@ -311,7 +321,7 @@ def main(): else: task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') task_dict = tasks.get_task_dict(task_list) - + model.module.activation_checkpoint_interval = 0 model._compute_loss = False model.fwd_outputs = [] From cacc58f81555bdb337fb681a46cf6344cc021416 Mon Sep 17 00:00:00 2001 From: Daniel Hesslow Date: Wed, 8 Dec 2021 19:26:45 +0100 Subject: [PATCH 10/69] Fix pipelining + fp32 evaluaiton. 
--- tasks/eval_harness/evaluate.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 429e26e11..60c4fe223 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -267,6 +267,7 @@ def load_ds_checkpoint_and_setup_megatron(extra_args_provider): if args.eval_fp32: cp_args.fp16 = False cp_args.bf16 = False + cp_args.params_dtype = torch.float32 override_args(args, cp_args, skip_keys, skip_if_specified) @@ -285,10 +286,10 @@ def load_ds_checkpoint_and_setup_megatron(extra_args_provider): model = get_model(model_provider)[0] model.load_state_dict(sd['model'], strict=True) - torch.distributed.barrier() if args.eval_fp32: model = model.float() - + + torch.distributed.barrier() return model def tasks_args(parser): From 778f2514b2dd418da8483f6671f4a99813ca98fa Mon Sep 17 00:00:00 2001 From: DanielHesslow Date: Thu, 9 Dec 2021 17:02:48 +0100 Subject: [PATCH 11/69] Remove dummy paths in examples/run_evalharness.sh Co-authored-by: Thomas Wang <24695242+thomasw21@users.noreply.github.com> --- examples/run_evalharness.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/run_evalharness.sh b/examples/run_evalharness.sh index 3f8344b11..74e388478 100644 --- a/examples/run_evalharness.sh +++ b/examples/run_evalharness.sh @@ -1,5 +1,3 @@ -#CHECKPOINT_PATH=/gpfsscratch/rech/bbv/utw68ny/checkpoints/gpt2-350m-en/global_step37876 -#CHECKPOINT_PATH=checkpoints/gpt2_both_ds/global_step3000 CHECKPOINT_PATH=/gpfsscratch/rech/bbv/utw68ny/checkpoints/tr3m-1B3-pile/global_step296023/ PP_SIZE=1 From 3d90b18d010f6eb17b2789c935e81778fc26e0e1 Mon Sep 17 00:00:00 2001 From: Daniel Hesslow Date: Thu, 9 Dec 2021 23:16:58 +0100 Subject: [PATCH 12/69] Simplify offline loading with export HF_DATASETS_OFFLINE=1 --- examples/run_evalharness.sh | 2 ++ tasks/eval_harness/download.py | 11 ++--------- tasks/eval_harness/evaluate.py | 14 ++------------ 3 files changed, 6 insertions(+), 21 deletions(-) diff --git a/examples/run_evalharness.sh b/examples/run_evalharness.sh index 74e388478..3cb5aed05 100644 --- a/examples/run_evalharness.sh +++ b/examples/run_evalharness.sh @@ -5,6 +5,8 @@ TP_SIZE=1 VOCAB_FILE=gpt2-vocab.json MERGE_FILE=gpt2-merges.txt +export HF_DATASETS_OFFLINE=1 + #dummy arguments to make megatron happy. MEGATRON_REQUIRED_ARGS="\ --num-layers -1\ diff --git a/tasks/eval_harness/download.py b/tasks/eval_harness/download.py index 0656cd656..d2abcd83a 100644 --- a/tasks/eval_harness/download.py +++ b/tasks/eval_harness/download.py @@ -2,8 +2,7 @@ # This is particularly useful when running in environments where the GPU nodes # do not have internet access. This way we can pre-download them and use the cached data-set during evaluation. 
-from lm_eval.base import LM -from lm_eval import evaluator, tasks +from lm_eval import tasks from lm_eval.tasks import ALL_TASKS import argparse import os @@ -11,17 +10,11 @@ parser = argparse.ArgumentParser(description='Download evaluation harness', allow_abbrev=False) parser.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks to download.') -parser.add_argument('--save_path', type=str, default = "./task_cache.pickle", help='Path to where the downloaded data tasks will be stored.') args = parser.parse_args() -import pickle - def main(): task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') - task_dict = tasks.get_task_dict(task_list) - with open(args.save_path, 'wb') as file: - pickle.dump(task_dict, file, protocol=pickle.HIGHEST_PROTOCOL) - print(f"Tasks have been saved to {args.save_path}!") + tasks.get_task_dict(task_list) if __name__ == '__main__': main() diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 60c4fe223..be81fc38a 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -296,7 +296,6 @@ def tasks_args(parser): """Provide extra arguments required for tasks.""" group = parser.add_argument_group(title='Evaluation options') group.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks.') - group.add_argument('--task_load_path', type=str, default = "./task_cache.pickle", help='Path to where the downloaded tasks are stored, or None if download is possible.') group.add_argument('--results_path', type=str, default = "./results.json", help='Path to where the results will be stored.') group.add_argument('--adaptive_seq_len', default = False, action='store_true', help='Should the sequence length be adapted to the batch during evaluation, if in fp16 the results will be slightly different due to numerical errors but greatly speed up evaluation.') @@ -311,17 +310,8 @@ def main(): args = get_args() assert not args.deepspeed, "Running this script in deepspeed-mode is not supported. We run all models using Megatron." - #load eval harness task dict - if args.task_load_path != 'None': - with open(args.task_load_path, 'rb') as file: - task_dict = pickle.load(file) - - if args.task_list != 'all': - task_list = args.task_list.split(',') - task_dict = dict((k,task_dict[k]) for k in task_list) - else: - task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') - task_dict = tasks.get_task_dict(task_list) + task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') + task_dict = tasks.get_task_dict(task_list) model.module.activation_checkpoint_interval = 0 model._compute_loss = False From 2bb61ac7a5fb4f9c9f2426eaaca0cc7d57021358 Mon Sep 17 00:00:00 2001 From: DanielHesslow Date: Tue, 14 Dec 2021 17:02:19 +0100 Subject: [PATCH 13/69] Remove accidental copy-paste. --- tasks/eval_harness/evaluate.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index be81fc38a..7e78a41ed 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -5,10 +5,6 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir,os.path.pardir))) -# Downloads the tasks in the evaluation harness -# This is particularly useful when running in environments where the GPU nodes -# do not have internet access. 
This way we can pre-download them and use the cached data-set during evaluation. - from lm_eval.models.gpt2 import GPT2LM from lm_eval import evaluator, tasks, utils from lm_eval.base import CacheHook @@ -327,4 +323,4 @@ def main(): json.dump(results, outfile, indent = 4) if __name__ == '__main__': - main() \ No newline at end of file + main() From a362da38417a06886b36b7a2a49aa138a5218a8b Mon Sep 17 00:00:00 2001 From: Daniel Hesslow Date: Wed, 15 Dec 2021 20:59:35 +0100 Subject: [PATCH 14/69] Experimantel deepspeed evaluation-path --- tasks/eval_harness/evaluate.py | 79 +++++++++++++++++++++++----------- 1 file changed, 53 insertions(+), 26 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 7e78a41ed..1713ea714 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -158,7 +158,7 @@ def _collate(x): if cache_key is not None: self.cache_hook.add_partial("loglikelihood", cache_key, answer) res.append(answer) - + if not mpu.is_pipeline_last_stage(): # @HACK: To make the eval harness happy on threads that don't have access to the results. # We just randomly generate some data. @@ -178,28 +178,39 @@ def create_model_inputs(self, tokens): prefix_indices=None, loss_on_targets_only=False) - return (tokens, position_ids, attention_mask) + return (tokens, position_ids, attention_mask), (tokens, loss_mask) def _model_call(self, inps): args = get_args() - # Since the shape of the micro-batch will change - # We need set the correct shapes here - # So that latter pipeline stages knows which shapes to expect. - # Otherwise we will deadlock. - args.micro_batch_size = len(inps) - args.seq_length = len(inps[0]) - args.max_position_embeddings = args.seq_length - - input_tensor = recv_forward() + if args.deepspeed: + self.model.set_batch_fn(self.create_model_inputs) + # round up to multiple of micro_batch_size + new_size = ((len(inps) + args.micro_batch_size-1) // args.micro_batch_size) * args.micro_batch_size + padded = F.pad(inps, (0, 0, 0, new_size-len(inps)), value = 0) + # dummy data iterator for pipelining. + data_iterator = list((torch.stack(inp) for inp in utils.chunks(padded, args.micro_batch_size))) + self.model.micro_batches = len(data_iterator) + output = self.model.eval_batch(iter(data_iterator), compute_loss = False, reduce_output = None) + output = torch.cat(output, 0)[:len(inps)] + else: + # Since the shape of the micro-batch will change + # We need set the correct shapes here + # So that latter pipeline stages knows which shapes to expect. + # Otherwise we will deadlock. + + args.micro_batch_size = len(inps) + args.seq_length = len(inps[0]) + args.max_position_embeddings = args.seq_length + + input_tensor = recv_forward() - # Forward pass through the model. - unwrapped_model = unwrap_model(self.model, (torchDDP, LocalDDP, Float16Module)) - unwrapped_model.set_input_tensor(input_tensor) - output = self.model(*self.create_model_inputs(inps)) - - send_forward(output) - + # Forward pass through the model. 
+ unwrapped_model = unwrap_model(self.model, (torchDDP, LocalDDP, Float16Module)) + unwrapped_model.set_input_tensor(input_tensor) + output = self.model(*self.create_model_inputs(inps)[0]) + send_forward(output) + if mpu.is_pipeline_last_stage(): return gather_from_tensor_model_parallel_region(output)[..., :self.tokenizer.vocab_size] else: @@ -246,6 +257,9 @@ def override_args(args, override_args, skip_keys, skip_if_specified_keys): # We then use the megatron deepspeed converter to load the deepspeed checkpoints as if they we're megatron checkpoints. def load_ds_checkpoint_and_setup_megatron(extra_args_provider): # parse the megatorn args. But wait with initalizing megatron. + # avoid printing the arguments, since they will later be overridden. + _print_args = megatron.arguments._print_args + megatron.arguments._print_args = lambda *_args, **kwarg: None args = _parse_args(extra_args_provider) ds_checkpoint = DeepSpeedCheckpoint(args.load, @@ -256,7 +270,7 @@ def load_ds_checkpoint_and_setup_megatron(extra_args_provider): cp_args = ds_checkpoint.get_args() # Merge the current args with the checkpoint args. skip_keys = ['world_size', 'rank', 'local_rank','device_count', 'micro_batch_size','global_batch_size', 'batch_size', 'tensorboard_dir', 'deepspeed', 'deepspeed_config', - 'data_parallel_size', 'pipeline_model_parallel_size', 'tensor_model_parallel_size', 'load'] + 'data_parallel_size', 'pipeline_model_parallel_size', 'tensor_model_parallel_size', 'load', 'rampup_batch_size', 'iteration'] skip_if_specified = ['merge_file', 'vocab_file'] @@ -272,16 +286,27 @@ def load_ds_checkpoint_and_setup_megatron(extra_args_provider): megatron.global_vars._GLOBAL_ARGS = args initialize_megatron() + torch.distributed.barrier() # Initializing megatron will update eg. tokenizer size. Override again. override_args(args, cp_args, skip_keys, skip_if_specified) - # Initialize megatron model using the parsed state dict. - sd = _create_rank_checkpoint(ds_checkpoint, None, mpu.get_tensor_model_parallel_rank(), mpu.get_pipeline_model_parallel_rank(), True) - - model = get_model(model_provider)[0] - model.load_state_dict(sd['model'], strict=True) - + # print final arguments. + _print_args(args) + if args.deepspeed: + args.use_checkpoint_lr_scheduler = True # allow ds to override our lr schedule. + cp_path = args.load + args.load = None + model, _, _ = setup_model_and_optimizer(model_provider) + model = model[0] + _, _ = model.load_checkpoint(cp_path, tag = '.') + else: + model = get_model(model_provider)[0] + # Initialize megatron model using the parsed state dict. + sd = _create_rank_checkpoint(ds_checkpoint, None, mpu.get_tensor_model_parallel_rank(), mpu.get_pipeline_model_parallel_rank(), True) + + model.load_state_dict(sd['model'], strict=True) + if args.eval_fp32: model = model.float() @@ -304,7 +329,9 @@ def main(): model = load_ds_checkpoint_and_setup_megatron(extra_args_provider=tasks_args) args = get_args() - assert not args.deepspeed, "Running this script in deepspeed-mode is not supported. We run all models using Megatron." + if args.deepspeed and args.adaptive_seq_len: + print("Warning: Currently adaptive_seq_len is not supported with deepspeed. 
Turning off adaptive_seq_len") + args.adaptive_seq_len = False task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') task_dict = tasks.get_task_dict(task_list) From 9899be0b3fe5b71f2423af58dde3b23587b9917f Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 7 Jan 2022 17:55:39 -0800 Subject: [PATCH 15/69] make it work with deepspeed; add instructions --- examples/run_evalharness_deepspeed.md | 71 ++++++++++++++++++++++++ examples/run_evalharness_deepspeed.sh | 77 +++++++++++++++++++++++++++ megatron/arguments.py | 2 + megatron/training.py | 8 ++- 4 files changed, 156 insertions(+), 2 deletions(-) create mode 100644 examples/run_evalharness_deepspeed.md create mode 100644 examples/run_evalharness_deepspeed.sh diff --git a/examples/run_evalharness_deepspeed.md b/examples/run_evalharness_deepspeed.md new file mode 100644 index 000000000..89bd29f14 --- /dev/null +++ b/examples/run_evalharness_deepspeed.md @@ -0,0 +1,71 @@ +# How to run lm-eval on Megatron-DeepSpeed checkpoint using the original setup + +This particular setup uses the normal deepspeed checkpoint and requires no conversion to Megatron-LM. + +This doc assumes usage on JZ, so some peculiar requirements in places. Ignore these if you're not running this on JZ. + +## Prerequisites + +On login console with external network + +Get lm-eval harness (https://github.com/EleutherAI/lm-evaluation-harness) +``` +start-prod +pip install lm-eval==0.0.1 +``` +Note: currently @master doesn't work with this script, later may have to edit the hardcoded version above + + +then install datasets for the tasks: +``` +python ./tasks/eval_harness/download.py --task_list hellaswag,mrpc,piqa +``` + +Prepare the run script: + +``` +cp examples/run_evalharness_deepspeed.sh run_evalharness.sh +``` + +now edit `run_evalharness.sh` + +you have to replicate the same config as in the original slurm script but you want: + +``` +ZERO_STAGE=0 +``` +and add: +``` +export HF_DATASETS_OFFLINE=1 +``` +if you didn't have one already + +Adjust this to fit the GPU, probably ~12 for 32GB and 4-6 for 16GB +``` +EVAL_MICRO_BATCH_SIZE=12 +``` +Do not modify `MICRO_BATCH_SIZE` which is from the original slurm training script (should remain the same). + + +## Eval + +Currently it takes 8.5h to run on 32GB, so should probably still fit into 16GB over 20h, but will need a smaller --micro-batch-size + +``` +srun --account=six@gpu --constraint=v100-32g --nodes=1 --ntasks=1 --cpus-per-task=40 --gres=gpu:1 --hint=nomultithread --time=20:00:00 bash --rcfile $six_ALL_CCFRWORK/start-prod +``` + +``` +cd /gpfsssd/worksf/projects/rech/six/commun/code/eval/Megatron-DeepSpeed +PYTHONPATH=. sh ./run_evalharness.sh +``` + +## Short eval + +if you just want to quickly test that everything can run to the end, edit `tasks/eval_harness/evaluate.py`, e.g. 
to run only 10 batches: +``` +- results = evaluator.evaluate(adaptor, task_dict, False, 0, None) ++ results = evaluator.evaluate(adaptor, task_dict, False, 0, 10) +``` + +(XXX: could be a cmd line option so that code won't need to be modified) diff --git a/examples/run_evalharness_deepspeed.sh b/examples/run_evalharness_deepspeed.sh new file mode 100644 index 000000000..d27faa591 --- /dev/null +++ b/examples/run_evalharness_deepspeed.sh @@ -0,0 +1,77 @@ +CHECKPOINT_PATH=/gpfsdsstore/projects/rech/six/commun/checkpoints/tr3m-1B3-emb-norm-pile/global_step296023 +MEGATRON_DEEPSPEED_REPO=/gpfsssd/worksf/projects/rech/six/commun/code/eval/Megatron-DeepSpeed +export HF_DATASETS_OFFLINE=1 + +PP_SIZE=2 +TP_SIZE=1 +VOCAB_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-vocab.json +MERGE_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-merges.txt +SEQ_LEN=2048 + +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=8 + +#dummy arguments to make megatron happy. +MEGATRON_REQUIRED_ARGS=" \ + --num-layers -1 \ + --hidden-size -1 \ + --num-attention-heads -1 \ + --seq-length -1 \ + --max-position-embeddings -1 +" + + +ZERO_STAGE=0 + +config_json="./ds_config.json" + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=512 + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + +# --adaptive_seq_len \ +CMD="./tasks/eval_harness/evaluate.py \ + --load $CHECKPOINT_PATH \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --inference \ + --deepspeed \ + --deepspeed_config ds_config.json \ + --seq-length $SEQ_LEN \ + --eval_fp32 \ + --task_list hellaswag,mrpc,piqa \ + $MEGATRON_REQUIRED_ARGS \ + " + +N_GPUS=1 +LAUNCHER="deepspeed --num_gpus $N_GPUS" +$LAUNCHER $CMD + +# want datasets diff --git a/megatron/arguments.py b/megatron/arguments.py index 06330558e..0b4ab4b4f 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -550,6 +550,8 @@ def _add_training_args(parser): 'will be performed' ) group.add_argument('--skip-train-iteration-range', type=str, nargs='+', default=None, help='Iteration ranges to skip. The values are one or more dash-separated ranges. 
e.g., 101-200 251-300.') + group.add_argument('--inference', action='store_true', + help='Very basic inference mode: not allocating optim/lr - requires ZERO_STAGE=0') return parser diff --git a/megatron/training.py b/megatron/training.py index bc4223dc4..c8863b4ed 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -370,9 +370,13 @@ def setup_model_and_optimizer(model_provider_func): unwrapped_model = unwrap_model(model, (torchDDP, LocalDDP, Float16Module)) - optimizer = get_megatron_optimizer(unwrapped_model) + if args.inference: + optimizer = None + lr_scheduler = None + else: + optimizer = get_megatron_optimizer(unwrapped_model) + lr_scheduler = get_learning_rate_scheduler(optimizer) - lr_scheduler = get_learning_rate_scheduler(optimizer) if args.deepspeed: print_rank_0("DeepSpeed is enabled.") From 7ef5ba751a6ad600be569d3e2b90bc64a985b02f Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 7 Jan 2022 18:30:21 -0800 Subject: [PATCH 16/69] improve --- examples/run_evalharness_deepspeed.md | 4 ++-- examples/run_evalharness_deepspeed.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/run_evalharness_deepspeed.md b/examples/run_evalharness_deepspeed.md index 89bd29f14..666470ea3 100644 --- a/examples/run_evalharness_deepspeed.md +++ b/examples/run_evalharness_deepspeed.md @@ -40,7 +40,7 @@ export HF_DATASETS_OFFLINE=1 ``` if you didn't have one already -Adjust this to fit the GPU, probably ~12 for 32GB and 4-6 for 16GB +Adjust this to fit the GPU, probably ~12 for 32GB and 4-6 for 16GB for 1.3B model ``` EVAL_MICRO_BATCH_SIZE=12 ``` @@ -49,7 +49,7 @@ Do not modify `MICRO_BATCH_SIZE` which is from the original slurm training scrip ## Eval -Currently it takes 8.5h to run on 32GB, so should probably still fit into 16GB over 20h, but will need a smaller --micro-batch-size +Currently it takes 8.5h to run on 32GB for 1.3B model, so should probably still fit into 16GB over 20h, but will need a smaller --micro-batch-size ``` srun --account=six@gpu --constraint=v100-32g --nodes=1 --ntasks=1 --cpus-per-task=40 --gres=gpu:1 --hint=nomultithread --time=20:00:00 bash --rcfile $six_ALL_CCFRWORK/start-prod diff --git a/examples/run_evalharness_deepspeed.sh b/examples/run_evalharness_deepspeed.sh index d27faa591..166084dd9 100644 --- a/examples/run_evalharness_deepspeed.sh +++ b/examples/run_evalharness_deepspeed.sh @@ -10,7 +10,7 @@ SEQ_LEN=2048 # different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS # make as big as it can fit into gpu w/o OOM, but not too close to 100% -EVAL_MICRO_BATCH_SIZE=8 +EVAL_MICRO_BATCH_SIZE=12 #dummy arguments to make megatron happy. MEGATRON_REQUIRED_ARGS=" \ From 9527ad36bf873043f64bb581342acf1cb01e8bfd Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sat, 8 Jan 2022 09:13:08 -0800 Subject: [PATCH 17/69] make adaptive_seq_len work with deepspeed --- examples/run_evalharness_deepspeed.sh | 5 +- tasks/eval_harness/evaluate.py | 81 +++++++++++++++------------ 2 files changed, 47 insertions(+), 39 deletions(-) diff --git a/examples/run_evalharness_deepspeed.sh b/examples/run_evalharness_deepspeed.sh index 166084dd9..c63bea5e3 100644 --- a/examples/run_evalharness_deepspeed.sh +++ b/examples/run_evalharness_deepspeed.sh @@ -10,7 +10,7 @@ SEQ_LEN=2048 # different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS # make as big as it can fit into gpu w/o OOM, but not too close to 100% -EVAL_MICRO_BATCH_SIZE=12 +EVAL_MICRO_BATCH_SIZE=32 #dummy arguments to make megatron happy. 
MEGATRON_REQUIRED_ARGS=" \ @@ -51,7 +51,7 @@ cat < $config_json } EOT -# --adaptive_seq_len \ + CMD="./tasks/eval_harness/evaluate.py \ --load $CHECKPOINT_PATH \ --tensor-model-parallel-size $TP_SIZE \ @@ -65,6 +65,7 @@ CMD="./tasks/eval_harness/evaluate.py \ --deepspeed \ --deepspeed_config ds_config.json \ --seq-length $SEQ_LEN \ + --adaptive_seq_len \ --eval_fp32 \ --task_list hellaswag,mrpc,piqa \ $MEGATRON_REQUIRED_ARGS \ diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 1713ea714..d1422e8e6 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -9,12 +9,12 @@ from lm_eval import evaluator, tasks, utils from lm_eval.base import CacheHook from tqdm import tqdm -import torch.nn.functional as F +import torch.nn.functional as F from lm_eval.tasks import ALL_TASKS from pretrain_gpt import model_provider import numpy as np - + import torch from megatron import get_args from megatron import print_rank_0 @@ -44,7 +44,7 @@ def __init__(self, model, tokenizer): self.max_length = args.seq_length # For ds we split into mini batches and then micro batches to keep pipelining api happy. - # With Megatron we just go to micro_batches directly + # With Megatron we just go to micro_batches directly self.batch_size = args.micro_batch_size self.cache_hook = CacheHook(None) self.is_main = args.rank == 0 @@ -101,7 +101,7 @@ def loglikelihood_rolling(self, requests): loglikelihoods.append(string_nll) return loglikelihoods - + def _loglikelihood_tokens(self, requests, disable_tqdm=False): disable_tqdm = disable_tqdm if self.is_main else True res = [] @@ -135,6 +135,7 @@ def _collate(x): ], dim=0) inps.append(inp.unsqueeze(0)) + contlens.append(cont) inplens.append(inplen) @@ -161,9 +162,9 @@ def _collate(x): if not mpu.is_pipeline_last_stage(): # @HACK: To make the eval harness happy on threads that don't have access to the results. - # We just randomly generate some data. + # We just randomly generate some data. res = [(np.random.rand(), np.random.rand()>0.5) for _ in requests] - + return reord.get_original(res) def create_model_inputs(self, tokens): @@ -177,32 +178,36 @@ def create_model_inputs(self, tokens): args.eod_mask_loss, prefix_indices=None, loss_on_targets_only=False) - + return (tokens, position_ids, attention_mask), (tokens, loss_mask) def _model_call(self, inps): args = get_args() - + if args.deepspeed: self.model.set_batch_fn(self.create_model_inputs) # round up to multiple of micro_batch_size new_size = ((len(inps) + args.micro_batch_size-1) // args.micro_batch_size) * args.micro_batch_size - padded = F.pad(inps, (0, 0, 0, new_size-len(inps)), value = 0) + padded = F.pad(inps, (0, 0, 0, new_size-len(inps)), value = 0) # dummy data iterator for pipelining. data_iterator = list((torch.stack(inp) for inp in utils.chunks(padded, args.micro_batch_size))) self.model.micro_batches = len(data_iterator) output = self.model.eval_batch(iter(data_iterator), compute_loss = False, reduce_output = None) output = torch.cat(output, 0)[:len(inps)] + + # hack #2 for adaptive_seq_len to work as total_loss gets appended to and shapes aren't the same + if args.adaptive_seq_len: + self.model.total_loss = None else: # Since the shape of the micro-batch will change - # We need set the correct shapes here + # We need set the correct shapes here # So that latter pipeline stages knows which shapes to expect. - # Otherwise we will deadlock. - + # Otherwise we will deadlock. 
+ args.micro_batch_size = len(inps) args.seq_length = len(inps[0]) args.max_position_embeddings = args.seq_length - + input_tensor = recv_forward() # Forward pass through the model. @@ -210,12 +215,12 @@ def _model_call(self, inps): unwrapped_model.set_input_tensor(input_tensor) output = self.model(*self.create_model_inputs(inps)[0]) send_forward(output) - + if mpu.is_pipeline_last_stage(): return gather_from_tensor_model_parallel_region(output)[..., :self.tokenizer.vocab_size] else: return None - + def tokenizer_encode(self, text): """Tokenize text *without* adding special tokens.""" # Splitting this into its own method in case we need to handle special cases for different tokenizers @@ -232,7 +237,7 @@ def tokenizer_encode(self, text): from tools.convert_checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint from tools.convert_checkpoint.deepspeed_to_megatron import _create_rank_checkpoint -def override_args(args, override_args, skip_keys, skip_if_specified_keys): +def override_args(args, override_args, skip_keys, skip_if_specified_keys): for k, v in vars(override_args).items(): if k in skip_keys: continue @@ -244,15 +249,15 @@ def override_args(args, override_args, skip_keys, skip_if_specified_keys): # Note(Hesslow): # The model loading is a bit convoluted. # We want to parse out the model arguments from the checkpoint and use those to initialize megatron-ds. -# +# # However megatron-ds expects its arguments on the command line. # And at that point we don't know them. -# +# # Instead we use Jasons way: we load the arguments form the checkpoint and then override _parse_args to return whatever args we want. # # If the checkpoint is old, some new arguments may have been introduced and the code will expect these arguments to exist. # In order to support this we _first_ parse the arguments normally, and then override them with the arguments from the checkpoint. -# Keeping the default-value of newer arguments. +# Keeping the default-value of newer arguments. # # We then use the megatron deepspeed converter to load the deepspeed checkpoints as if they we're megatron checkpoints. def load_ds_checkpoint_and_setup_megatron(extra_args_provider): @@ -261,30 +266,30 @@ def load_ds_checkpoint_and_setup_megatron(extra_args_provider): _print_args = megatron.arguments._print_args megatron.arguments._print_args = lambda *_args, **kwarg: None args = _parse_args(extra_args_provider) - + ds_checkpoint = DeepSpeedCheckpoint(args.load, - tp_degree=args.tensor_model_parallel_size, + tp_degree=args.tensor_model_parallel_size, pp_degree=args.pipeline_model_parallel_size) - - + + cp_args = ds_checkpoint.get_args() # Merge the current args with the checkpoint args. - skip_keys = ['world_size', 'rank', 'local_rank','device_count', 'micro_batch_size','global_batch_size', 'batch_size', 'tensorboard_dir', 'deepspeed', 'deepspeed_config', + skip_keys = ['world_size', 'rank', 'local_rank','device_count', 'micro_batch_size','global_batch_size', 'batch_size', 'tensorboard_dir', 'deepspeed', 'deepspeed_config', 'data_parallel_size', 'pipeline_model_parallel_size', 'tensor_model_parallel_size', 'load', 'rampup_batch_size', 'iteration'] - + skip_if_specified = ['merge_file', 'vocab_file'] - + if args.eval_fp32: cp_args.fp16 = False cp_args.bf16 = False cp_args.params_dtype = torch.float32 - + override_args(args, cp_args, skip_keys, skip_if_specified) - + # stop megatron from reparsing the arguments. 
megatron.global_vars._parse_args = lambda *_args, **kwarg: args megatron.global_vars._GLOBAL_ARGS = args - + initialize_megatron() torch.distributed.barrier() @@ -300,13 +305,13 @@ def load_ds_checkpoint_and_setup_megatron(extra_args_provider): model, _, _ = setup_model_and_optimizer(model_provider) model = model[0] _, _ = model.load_checkpoint(cp_path, tag = '.') - else: + else: model = get_model(model_provider)[0] # Initialize megatron model using the parsed state dict. sd = _create_rank_checkpoint(ds_checkpoint, None, mpu.get_tensor_model_parallel_rank(), mpu.get_pipeline_model_parallel_rank(), True) model.load_state_dict(sd['model'], strict=True) - + if args.eval_fp32: model = model.float() @@ -318,7 +323,7 @@ def tasks_args(parser): group = parser.add_argument_group(title='Evaluation options') group.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks.') group.add_argument('--results_path', type=str, default = "./results.json", help='Path to where the results will be stored.') - group.add_argument('--adaptive_seq_len', default = False, action='store_true', + group.add_argument('--adaptive_seq_len', default = False, action='store_true', help='Should the sequence length be adapted to the batch during evaluation, if in fp16 the results will be slightly different due to numerical errors but greatly speed up evaluation.') group.add_argument('--eval_fp32', default = False, action='store_true', help='Should the evaluation run in fp32') return parser @@ -330,20 +335,22 @@ def main(): args = get_args() if args.deepspeed and args.adaptive_seq_len: - print("Warning: Currently adaptive_seq_len is not supported with deepspeed. Turning off adaptive_seq_len") - args.adaptive_seq_len = False + # adaptive_seq_len hack #1: + # CL automatically enables reset_activation_shape() which allows us to change input shapes + # and it also reshapes the attenion scores in attention_mask_func + args.curriculum_learning = 1 task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') task_dict = tasks.get_task_dict(task_list) - + model.module.activation_checkpoint_interval = 0 model._compute_loss = False model.fwd_outputs = [] tokenizer = get_tokenizer() - adaptor = EvalHarnessAdaptor(model, tokenizer) + adaptor = EvalHarnessAdaptor(model, tokenizer) results = evaluator.evaluate(adaptor, task_dict, False, 0, None) - + if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: print(json.dumps(results, indent=2)) with open(args.results_path, 'w') as outfile: From d4dacbe963733c248dd4a618903b0fe3f97eff15 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sat, 8 Jan 2022 10:29:20 -0800 Subject: [PATCH 18/69] move to slurm --- examples/run_evalharness_deepspeed.md | 16 ++++++++--- ...eed.sh => run_evalharness_deepspeed.slurm} | 27 +++++++++++++++++-- 2 files changed, 37 insertions(+), 6 deletions(-) rename examples/{run_evalharness_deepspeed.sh => run_evalharness_deepspeed.slurm} (68%) diff --git a/examples/run_evalharness_deepspeed.md b/examples/run_evalharness_deepspeed.md index 666470ea3..c7c0ba1a0 100644 --- a/examples/run_evalharness_deepspeed.md +++ b/examples/run_evalharness_deepspeed.md @@ -15,19 +15,27 @@ pip install lm-eval==0.0.1 ``` Note: currently @master doesn't work with this script, later may have to edit the hardcoded version above +some symlinks due to lm-harness' issues with relative position of data +``` +mkdir data +ln -s data tasks/eval_harness/data +``` + +Also make sure `data` is not on one of the limited 
paritions like WORKSF. -then install datasets for the tasks: +Then install datasets for the tasks: ``` -python ./tasks/eval_harness/download.py --task_list hellaswag,mrpc,piqa +python ./tasks/eval_harness/download.py --task_list +arc_challenge,arc_easy,boolq,copa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,webqs,wic,winogrande,wnli,wsc ``` Prepare the run script: ``` -cp examples/run_evalharness_deepspeed.sh run_evalharness.sh +cp examples/run_evalharness_deepspeed.slurm run_evalharness.slurm ``` -now edit `run_evalharness.sh` +now edit `run_evalharness.slurm` you have to replicate the same config as in the original slurm script but you want: diff --git a/examples/run_evalharness_deepspeed.sh b/examples/run_evalharness_deepspeed.slurm similarity index 68% rename from examples/run_evalharness_deepspeed.sh rename to examples/run_evalharness_deepspeed.slurm index c63bea5e3..b0fd622f6 100644 --- a/examples/run_evalharness_deepspeed.sh +++ b/examples/run_evalharness_deepspeed.slurm @@ -1,7 +1,28 @@ +#!/bin/bash +#SBATCH --job-name=eval-harness-deepspeed +#SBATCH --constraint=v100-32g +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=40 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:1 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name +#SBATCH --account=six@gpu + + +set -x -e + +source $six_ALL_CCFRWORK/start-prod + +echo "START TIME: $(date)" + CHECKPOINT_PATH=/gpfsdsstore/projects/rech/six/commun/checkpoints/tr3m-1B3-emb-norm-pile/global_step296023 MEGATRON_DEEPSPEED_REPO=/gpfsssd/worksf/projects/rech/six/commun/code/eval/Megatron-DeepSpeed export HF_DATASETS_OFFLINE=1 +cd $MEGATRON_DEEPSPEED_REPO + PP_SIZE=2 TP_SIZE=1 VOCAB_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-vocab.json @@ -67,12 +88,14 @@ CMD="./tasks/eval_harness/evaluate.py \ --seq-length $SEQ_LEN \ --adaptive_seq_len \ --eval_fp32 \ - --task_list hellaswag,mrpc,piqa \ + --task_list arc_challenge,arc_easy,boolq,copa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,webqs,wic,winogrande,wnli,wsc \ $MEGATRON_REQUIRED_ARGS \ " N_GPUS=1 LAUNCHER="deepspeed --num_gpus $N_GPUS" -$LAUNCHER $CMD +echo $LAUNCHER $CMD + +$LAUNCHER $CMD | tee eval-harness.log # want datasets From 151e91abaf0e876723fc8142645d031915ec41f3 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sun, 9 Jan 2022 01:51:35 +0100 Subject: [PATCH 19/69] fixes --- examples/run_evalharness_deepspeed.slurm | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/run_evalharness_deepspeed.slurm b/examples/run_evalharness_deepspeed.slurm index b0fd622f6..cab9cd3af 100644 --- a/examples/run_evalharness_deepspeed.slurm +++ b/examples/run_evalharness_deepspeed.slurm @@ -31,7 +31,7 @@ SEQ_LEN=2048 # different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS # make as big as it can fit into gpu w/o OOM, but not too close to 100% -EVAL_MICRO_BATCH_SIZE=32 +EVAL_MICRO_BATCH_SIZE=12 #dummy arguments to make megatron happy. 
MEGATRON_REQUIRED_ARGS=" \ @@ -88,7 +88,7 @@ CMD="./tasks/eval_harness/evaluate.py \ --seq-length $SEQ_LEN \ --adaptive_seq_len \ --eval_fp32 \ - --task_list arc_challenge,arc_easy,boolq,copa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,webqs,wic,winogrande,wnli,wsc \ + --task_list arc_challenge,arc_easy,boolq,copa,hellaswag,lambada,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sst,webqs,wic,winogrande,wnli,wsc,triviaqa,sciq \ $MEGATRON_REQUIRED_ARGS \ " @@ -96,6 +96,8 @@ N_GPUS=1 LAUNCHER="deepspeed --num_gpus $N_GPUS" echo $LAUNCHER $CMD -$LAUNCHER $CMD | tee eval-harness.log +export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO + +$LAUNCHER $CMD 2>&1 | tee eval-harness.log # want datasets From 92123d0a1d59e098b6ada10d19cb886c1ec718e7 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sat, 8 Jan 2022 16:56:25 -0800 Subject: [PATCH 20/69] cleanup --- examples/run_evalharness_deepspeed.md | 17 ++++++++--------- examples/run_evalharness_deepspeed.slurm | 4 +--- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/examples/run_evalharness_deepspeed.md b/examples/run_evalharness_deepspeed.md index c7c0ba1a0..2d08eff5b 100644 --- a/examples/run_evalharness_deepspeed.md +++ b/examples/run_evalharness_deepspeed.md @@ -8,10 +8,10 @@ This doc assumes usage on JZ, so some peculiar requirements in places. Ignore th On login console with external network -Get lm-eval harness (https://github.com/EleutherAI/lm-evaluation-harness) +Get lm-eval harness (https://github.com/EleutherAI/lm-evaluation-harness) and `best-download==0.0.7` needed to download some tasks. ``` start-prod -pip install lm-eval==0.0.1 +pip install lm-eval==0.0.1 best-download==0.0.7 ``` Note: currently @master doesn't work with this script, later may have to edit the hardcoded version above @@ -26,7 +26,7 @@ Also make sure `data` is not on one of the limited paritions like WORKSF. Then install datasets for the tasks: ``` python ./tasks/eval_harness/download.py --task_list -arc_challenge,arc_easy,boolq,copa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,webqs,wic,winogrande,wnli,wsc +arc_challenge,arc_easy,boolq,copa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc ``` Prepare the run script: @@ -57,16 +57,15 @@ Do not modify `MICRO_BATCH_SIZE` which is from the original slurm training scrip ## Eval -Currently it takes 8.5h to run on 32GB for 1.3B model, so should probably still fit into 16GB over 20h, but will need a smaller --micro-batch-size +Currently it takes 2-3 hours to run on 32GB for 1.3B model, so it should easily fit into 16GB over 20h, but will need a smaller `--micro-batch-size`. +When ready, launch: ``` -srun --account=six@gpu --constraint=v100-32g --nodes=1 --ntasks=1 --cpus-per-task=40 --gres=gpu:1 --hint=nomultithread --time=20:00:00 bash --rcfile $six_ALL_CCFRWORK/start-prod +sbatch ./run_evalharness.slurm ``` -``` -cd /gpfsssd/worksf/projects/rech/six/commun/code/eval/Megatron-DeepSpeed -PYTHONPATH=. sh ./run_evalharness.sh -``` +Note that the original ETA at the start of the run can be 10x too longer than the actual outcome. For example it may suggest 18 hours but will complete in 2 hours. 
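Since the GPU nodes have no network access, it is worth confirming that the datasets really resolve from the local HF cache before the slurm job is submitted. A minimal sanity check, assuming the tasks were pre-downloaded with the script above and the same cache location is visible on the node (the task names below are only examples):

```
export HF_DATASETS_OFFLINE=1
# re-running the download script in offline mode should succeed purely from the cache;
# a network/download error here means the pre-download step needs to be repeated
python ./tasks/eval_harness/download.py --task_list hellaswag,mrpc,piqa
```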
+ ## Short eval diff --git a/examples/run_evalharness_deepspeed.slurm b/examples/run_evalharness_deepspeed.slurm index cab9cd3af..63022800f 100644 --- a/examples/run_evalharness_deepspeed.slurm +++ b/examples/run_evalharness_deepspeed.slurm @@ -98,6 +98,4 @@ echo $LAUNCHER $CMD export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO -$LAUNCHER $CMD 2>&1 | tee eval-harness.log - -# want datasets +$LAUNCHER $CMD 2>&1 | tee eval-harness.log From a6fab1f8e231084bffe35d7063099fb52c9cca55 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 10 Jan 2022 15:48:51 -0800 Subject: [PATCH 21/69] add instructions on how to import data into the spreadsheet --- examples/run_evalharness_deepspeed.md | 25 +++++++++++++ tasks/eval_harness/report-to-csv.py | 53 +++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100755 tasks/eval_harness/report-to-csv.py diff --git a/examples/run_evalharness_deepspeed.md b/examples/run_evalharness_deepspeed.md index 2d08eff5b..88b7fb212 100644 --- a/examples/run_evalharness_deepspeed.md +++ b/examples/run_evalharness_deepspeed.md @@ -76,3 +76,28 @@ if you just want to quickly test that everything can run to the end, edit `tasks ``` (XXX: could be a cmd line option so that code won't need to be modified) + + +## Import into spreadsheet + +https://docs.google.com/spreadsheets/d/1CI8Q9RCblLRzUOPJ6ViqBmo284-8ojluQ-CmaEuhuv0/edit?usp=sharing + +Note that the spreadsheet format is quite different, so use this script: +``` +./tasks/eval_harness/report-to-csv.py results.json +``` +to reformat the json results into csv while changing its shape to match the spreadsheet format + +Since some records might be missing or extraneous here is the best way to do it: + +1. copy the data from first 2 columns to some place under the main spreadsheet + +2. put the pointer to the 3rd column next to where the 2 first columns were copied. + +3. import `results.csv` using file-> import -> file -> + +Import location: Replace data at selected cell + +4. Now it should be easy to align the new records with the old ones - delete irrelevant records and Insert->Cells where data is missing until the first 2 columns match + +5. now create 2 cols in the main table on top and now it should be safe to Copy-n-Paste the 2-col data range, without the task/metrics columns into the newly created space. 
diff --git a/tasks/eval_harness/report-to-csv.py b/tasks/eval_harness/report-to-csv.py new file mode 100755 index 000000000..dbefb38c1 --- /dev/null +++ b/tasks/eval_harness/report-to-csv.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python + +# this script converts results.json: +# +# "results": { +# "arc_challenge": { +# "acc": 0.24232081911262798, +# "acc_stderr": 0.01252159329580012, +# "acc_norm": 0.2764505119453925, +# "acc_norm_stderr": 0.013069662474252425 +# }, +# +# into a format expected by a spreadsheet, which is: +# +# task metric value err +# arc_challenge acc xxx yyy +# arc_challenge acc_norm xxx yyy +# arc_challenge f1 xxx yyy +# +# usage: +# report-to-csv.py results.json + + +import sys +import json +import io +import csv + +results_file = sys.argv[1] + +csv_file = results_file.replace("json", "csv") + +print(f"Converting {results_file} to {csv_file}") + +with io.open(results_file, 'r', encoding='utf-8') as f: + results = json.load(f) + +with io.open(csv_file, 'w', encoding='utf-8') as f: + + writer = csv.writer(f) + writer.writerow(["task", "metric", "value", "err"]) + + for k,v in sorted(results["results"].items()): + if "acc" in v: + writer.writerow([k, "acc", v["acc"], v["acc_stderr"]]) + if "acc_norm" in v: + writer.writerow([k, "acc_norm", v["acc_norm"], v["acc_norm_stderr"]]) + if "ppl" in v: + writer.writerow([k, "ppl", v["ppl"], v["ppl_stderr"]]) + if "em" in v: + writer.writerow([k, "em", v["em"], v["em_stderr"] if "em_stderr" in v else ""]) + if "f1" in v: + writer.writerow([k, "f1", v["f1"], v["f1_stderr"] if "f1_stderr" in v else ""]) From dedf111ee2885a8d260245eb12210dd99bede1f2 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 10 Jan 2022 16:05:16 -0800 Subject: [PATCH 22/69] not tracking ppl/em --- tasks/eval_harness/report-to-csv.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tasks/eval_harness/report-to-csv.py b/tasks/eval_harness/report-to-csv.py index dbefb38c1..ed6a2c97a 100755 --- a/tasks/eval_harness/report-to-csv.py +++ b/tasks/eval_harness/report-to-csv.py @@ -45,9 +45,9 @@ writer.writerow([k, "acc", v["acc"], v["acc_stderr"]]) if "acc_norm" in v: writer.writerow([k, "acc_norm", v["acc_norm"], v["acc_norm_stderr"]]) - if "ppl" in v: - writer.writerow([k, "ppl", v["ppl"], v["ppl_stderr"]]) - if "em" in v: - writer.writerow([k, "em", v["em"], v["em_stderr"] if "em_stderr" in v else ""]) if "f1" in v: writer.writerow([k, "f1", v["f1"], v["f1_stderr"] if "f1_stderr" in v else ""]) + # if "ppl" in v: + # writer.writerow([k, "ppl", v["ppl"], v["ppl_stderr"]]) + # if "em" in v: + # writer.writerow([k, "em", v["em"], v["em_stderr"] if "em_stderr" in v else ""]) From a12af5c0f1a511075fdb029e13c49464f786e8d3 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 10 Jan 2022 16:13:13 -0800 Subject: [PATCH 23/69] add task version --- tasks/eval_harness/report-to-csv.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tasks/eval_harness/report-to-csv.py b/tasks/eval_harness/report-to-csv.py index ed6a2c97a..0c93f6466 100755 --- a/tasks/eval_harness/report-to-csv.py +++ b/tasks/eval_harness/report-to-csv.py @@ -38,16 +38,21 @@ with io.open(csv_file, 'w', encoding='utf-8') as f: writer = csv.writer(f) - writer.writerow(["task", "metric", "value", "err"]) + writer.writerow(["task", "metric", "value", "err", "version"]) + + versions = results["versions"] for k,v in sorted(results["results"].items()): if "acc" in v: - writer.writerow([k, "acc", v["acc"], v["acc_stderr"]]) + row = [k, "acc", v["acc"], 
v["acc_stderr"]] if "acc_norm" in v: - writer.writerow([k, "acc_norm", v["acc_norm"], v["acc_norm_stderr"]]) + row = [k, "acc_norm", v["acc_norm"], v["acc_norm_stderr"]] if "f1" in v: - writer.writerow([k, "f1", v["f1"], v["f1_stderr"] if "f1_stderr" in v else ""]) + row = [k, "f1", v["f1"], v["f1_stderr"] if "f1_stderr" in v else ""] # if "ppl" in v: - # writer.writerow([k, "ppl", v["ppl"], v["ppl_stderr"]]) + # row = [k, "ppl", v["ppl"], v["ppl_stderr"]] # if "em" in v: - # writer.writerow([k, "em", v["em"], v["em_stderr"] if "em_stderr" in v else ""]) + # row = [k, "em", v["em"], v["em_stderr"] if "em_stderr" in v else ""] + + row += [versions[k] if k in versions else -1] + writer.writerow(row) From dbecf81ca1f7ddd34df27f3601020831643f57d0 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 12 Jan 2022 15:04:24 -0800 Subject: [PATCH 24/69] make compatible with lm-eval@master --- tasks/eval_harness/evaluate.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index d1422e8e6..06656ef92 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -42,14 +42,16 @@ def __init__(self, model, tokenizer): self.VOCAB_SIZE = tokenizer.vocab_size self.EOT_TOKEN_ID = tokenizer.eod - self.max_length = args.seq_length + self._max_length = args.seq_length + # For ds we split into mini batches and then micro batches to keep pipelining api happy. # With Megatron we just go to micro_batches directly - self.batch_size = args.micro_batch_size + self._batch_size = args.micro_batch_size + self.cache_hook = CacheHook(None) self.is_main = args.rank == 0 self.is_local_main = args.local_rank == 0 - self.device = torch.cuda.current_device() + self._device = torch.cuda.current_device() self.is_model_parallel = mpu.get_tensor_model_parallel_world_size() > 1 self.is_pipe_parallel = mpu.get_pipeline_model_parallel_world_size() > 1 self.is_data_parallel = mpu.get_data_parallel_world_size() > 1 @@ -59,6 +61,18 @@ def __init__(self, model, tokenizer): self.is_last_stage = True if not self.is_pipe_parallel else mpu.is_pipeline_last_stage() # only the last stage of the pipeline model will receive the logits + @property + def max_length(self): + return self._max_length + + @property + def batch_size(self): + return self._batch_size + + @property + def device(self): + return self._device + def loglikelihood(self, requests): new_reqs = [] From c04e3d064c76cf8b6e1bef9098ae06d7aa32b79a Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 12 Jan 2022 19:06:58 -0800 Subject: [PATCH 25/69] switch to 16gb slurm; simplify; improve instructions --- examples/run_evalharness_deepspeed.md | 61 +++++++++++++++++------- examples/run_evalharness_deepspeed.slurm | 22 +++++---- 2 files changed, 57 insertions(+), 26 deletions(-) diff --git a/examples/run_evalharness_deepspeed.md b/examples/run_evalharness_deepspeed.md index 88b7fb212..a45d37d3f 100644 --- a/examples/run_evalharness_deepspeed.md +++ b/examples/run_evalharness_deepspeed.md @@ -6,21 +6,24 @@ This doc assumes usage on JZ, so some peculiar requirements in places. Ignore th ## Prerequisites +1. Install software + On login console with external network Get lm-eval harness (https://github.com/EleutherAI/lm-evaluation-harness) and `best-download==0.0.7` needed to download some tasks. 
 ```
 start-prod
-pip install lm-eval==0.0.1 best-download==0.0.7
+pip install best-download==0.0.7
+pip install git+https://github.com/EleutherAI/lm-evaluation-harness
 ```
-Note: currently @master doesn't work with this script, later may have to edit the hardcoded version above
+
+2. Pre-download needed datasets
 
 some symlinks due to lm-harness' issues with relative position of data
 ```
 mkdir data
 ln -s data tasks/eval_harness/data
 ```
-
 Also make sure `data` is not on one of the limited partitions like WORKSF.
 
 Then install datasets for the tasks:
@@ -29,40 +32,66 @@ python ./tasks/eval_harness/download.py --task_list
 arc_challenge,arc_easy,boolq,copa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc
 ```
 
-Prepare the run script:
+3. Prepare the slurm script
+
+Prepare the run script, replace `variant` with a unique identifier for the current eval so that multiple evals could run in parallel and not all log into the same `results.json` file, e.g. `tr9c-1B3-swiglu`.
 
 ```
-cp examples/run_evalharness_deepspeed.slurm run_evalharness.slurm
+cp examples/run_evalharness_deepspeed.slurm run_evalharness-variant.slurm
 ```
 
-now edit `run_evalharness.slurm`
+now edit `run_evalharness-variant.slurm`
+
 
-you have to replicate the same config as in the original slurm script but you want:
+1. Edit:
 ```
-ZERO_STAGE=0
+PP_SIZE=2
+TP_SIZE=1
 ```
-and add:
+to match the original slurm script. But this is only needed to convert the checkpoint. The actual eval will happen on a single GPU.
+
+
+2. Adjust the following to fit the chosen GPU. As of last check, for the 1.3B model the settings are one of:
 ```
-export HF_DATASETS_OFFLINE=1
+EVAL_MICRO_BATCH_SIZE=6 # 16GB GPU 1.3B model
+EVAL_MICRO_BATCH_SIZE=12 # 32GB GPU 1.3B model
 ```
-if you didn't have one already
 
-Adjust this to fit the GPU, probably ~12 for 32GB and 4-6 for 16GB for 1.3B model
+3. If not using a Deepspeed path, disable it by removing:
+
 ```
-EVAL_MICRO_BATCH_SIZE=12
+    --deepspeed \
+    --deepspeed_config ds_config.json \
 ```
-Do not modify `MICRO_BATCH_SIZE` which is from the original slurm training script (should remain the same).
+
+Currently if `TP>1` you can't use the deepspeed path.
+
+If you didn't disable it and the program crashed on checkpoint loading unable to find some key, disable deepspeed as explained above.
 
 
 ## Eval
 
-Currently it takes 2-3 hours to run on 32GB for 1.3B model, so it should easily fit into 16GB over 20h, but will need a smaller `--micro-batch-size`.
+Currently it takes 2-3 hours to run on 32GB for the 1.3B model, and 6-7h on a 16GB GPU, so a 20h slurm job should be enough.
 
 When ready, launch:
 ```
-sbatch ./run_evalharness.slurm
+sbatch ./run_evalharness-variant.slurm
+```
+
+To monitor progress:
+```
+tail -f $VARIANT-eval-harness.log
 ```
+where the variant is what you set `$VARIANT` to in the slurm script.
+
+The template is set up for a 16GB GPU since they are easier to come by. If you change to 32GB, adjust:
+```
+#SBATCH --constraint=v100-32g
+...
+EVAL_MICRO_BATCH_SIZE=12 # 32GB GPU 1.3B model
+```
+
 Note that the original ETA at the start of the run can be 10x longer than the actual outcome. For example it may suggest 18 hours but will complete in 2 hours.
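Putting the monitoring pieces together, a minimal check-in could look like the sketch below, assuming the `VARIANT` identifier from the slurm template; the file names come from the `--results_path` argument and the `tee` redirection in the script:

```
VARIANT=tr9c-1B3-swiglu              # assumption: the same identifier set in the slurm script
squeue -u $USER                      # is the eval job still queued or running?
tail -f $VARIANT-eval-harness.log    # follow the harness output
ls -l $VARIANT-results.json          # only written once the evaluation completes
```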
diff --git a/examples/run_evalharness_deepspeed.slurm b/examples/run_evalharness_deepspeed.slurm index 63022800f..6ce0c2eca 100644 --- a/examples/run_evalharness_deepspeed.slurm +++ b/examples/run_evalharness_deepspeed.slurm @@ -1,6 +1,6 @@ #!/bin/bash #SBATCH --job-name=eval-harness-deepspeed -#SBATCH --constraint=v100-32g +#SBATCH --constraint=v100-16g #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! #SBATCH --cpus-per-task=40 # number of cores per tasks @@ -17,6 +17,9 @@ source $six_ALL_CCFRWORK/start-prod echo "START TIME: $(date)" +# a unique identifier for the current eval so that multiple evals could run in parallel and not all log into the same "results.json" file. +VARIANT="tr9c-1B3-swiglu" + CHECKPOINT_PATH=/gpfsdsstore/projects/rech/six/commun/checkpoints/tr3m-1B3-emb-norm-pile/global_step296023 MEGATRON_DEEPSPEED_REPO=/gpfsssd/worksf/projects/rech/six/commun/code/eval/Megatron-DeepSpeed export HF_DATASETS_OFFLINE=1 @@ -31,7 +34,10 @@ SEQ_LEN=2048 # different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS # make as big as it can fit into gpu w/o OOM, but not too close to 100% -EVAL_MICRO_BATCH_SIZE=12 + +EVAL_MICRO_BATCH_SIZE=6 # 16GB GPU 1.3B model +#EVAL_MICRO_BATCH_SIZE=12 # 32GB GPU 1.3B model + #dummy arguments to make megatron happy. MEGATRON_REQUIRED_ARGS=" \ @@ -46,15 +52,11 @@ MEGATRON_REQUIRED_ARGS=" \ ZERO_STAGE=0 config_json="./ds_config.json" - -MICRO_BATCH_SIZE=1 -GLOBAL_BATCH_SIZE=512 - # Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() cat < $config_json { - "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, - "train_batch_size": $GLOBAL_BATCH_SIZE, + "train_micro_batch_size_per_gpu": 1 + "train_batch_size": 1 "gradient_clipping": 1.0, "zero_optimization": { "stage": $ZERO_STAGE @@ -72,9 +74,9 @@ cat < $config_json } EOT - CMD="./tasks/eval_harness/evaluate.py \ --load $CHECKPOINT_PATH \ + --results_path $VARIANT-results.json \ --tensor-model-parallel-size $TP_SIZE \ --pipeline-model-parallel-size $PP_SIZE \ --vocab-file $VOCAB_FILE \ @@ -98,4 +100,4 @@ echo $LAUNCHER $CMD export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO -$LAUNCHER $CMD 2>&1 | tee eval-harness.log +$LAUNCHER $CMD 2>&1 | tee $VARIANT-eval-harness.log From e6e4800feb4386b49a1aefbc8facfecd518995fd Mon Sep 17 00:00:00 2001 From: Daniel Hesslow Date: Thu, 13 Jan 2022 16:34:51 +0100 Subject: [PATCH 26/69] Deepspeed model loading hack --- tasks/eval_harness/evaluate.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 06656ef92..b99554407 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -207,7 +207,12 @@ def _model_call(self, inps): data_iterator = list((torch.stack(inp) for inp in utils.chunks(padded, args.micro_batch_size))) self.model.micro_batches = len(data_iterator) output = self.model.eval_batch(iter(data_iterator), compute_loss = False, reduce_output = None) - output = torch.cat(output, 0)[:len(inps)] + + + if output is not None: + output = torch.cat(output, 0)[:len(inps)] + else: + output = None # hack #2 for adaptive_seq_len to work as total_loss gets appended to and shapes aren't the same if args.adaptive_seq_len: @@ -289,7 +294,7 @@ def load_ds_checkpoint_and_setup_megatron(extra_args_provider): cp_args = ds_checkpoint.get_args() # Merge the current args with the checkpoint args. 
skip_keys = ['world_size', 'rank', 'local_rank','device_count', 'micro_batch_size','global_batch_size', 'batch_size', 'tensorboard_dir', 'deepspeed', 'deepspeed_config', - 'data_parallel_size', 'pipeline_model_parallel_size', 'tensor_model_parallel_size', 'load', 'rampup_batch_size', 'iteration'] + 'data_parallel_size', 'pipeline_model_parallel_size', 'tensor_model_parallel_size', 'load', 'rampup_batch_size', 'iteration', 'inference'] skip_if_specified = ['merge_file', 'vocab_file'] @@ -313,12 +318,25 @@ def load_ds_checkpoint_and_setup_megatron(extra_args_provider): # print final arguments. _print_args(args) if args.deepspeed: - args.use_checkpoint_lr_scheduler = True # allow ds to override our lr schedule. + + # Hack #3: + # Loading pipelined models in deepspeed with different TP than it was originally trained on fails + # due to a sanity check, that makes sure that all state_dicts that we merge contains attention layers. + # This, however, is not true for pipelining when we will merge the state_dict for the embeddings which + # which does not contain these attention-specific keys. + # + # Deepspeed does however manage to load the model if we just turn off this sanity check. + import deepspeed + deepspeed.runtime.state_dict_factory.MegatronSDLoader.sanity_check = lambda self, ckpt_file_name: None + + cp_path = args.load args.load = None model, _, _ = setup_model_and_optimizer(model_provider) model = model[0] - _, _ = model.load_checkpoint(cp_path, tag = '.') + model._config.zero_enabled = False + _, _ = model.load_checkpoint(cp_path, tag = '.', load_optimizer_states=False, load_lr_scheduler_states=False, load_module_only=True) + model._config.zero_enabled = True else: model = get_model(model_provider)[0] # Initialize megatron model using the parsed state dict. From 5e611bf1b72b97fff6133164b8ae249e906ce7f5 Mon Sep 17 00:00:00 2001 From: Daniel Hesslow Date: Thu, 13 Jan 2022 17:13:20 +0100 Subject: [PATCH 27/69] Restore correct zero state. --- tasks/eval_harness/evaluate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index b99554407..d7db9f335 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -334,9 +334,10 @@ def load_ds_checkpoint_and_setup_megatron(extra_args_provider): args.load = None model, _, _ = setup_model_and_optimizer(model_provider) model = model[0] + zero_enabled = model._config.zero_enabled model._config.zero_enabled = False _, _ = model.load_checkpoint(cp_path, tag = '.', load_optimizer_states=False, load_lr_scheduler_states=False, load_module_only=True) - model._config.zero_enabled = True + model._config.zero_enabled = zero_enabled else: model = get_model(model_provider)[0] # Initialize megatron model using the parsed state dict. 
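The two patches above use the same save/override/restore idiom around checkpoint loading (first monkey-patching `sanity_check`, then flipping `zero_enabled` and restoring it). A small sketch of that idiom as a reusable context manager; this is an illustration only, not part of the patch:

```python
# Sketch only: a context manager expressing the "flip an attribute, restore it afterwards"
# pattern used above for model._config.zero_enabled around load_checkpoint.
from contextlib import contextmanager

@contextmanager
def temporarily(obj, attr, value):
    """Set obj.attr to value for the duration of the block, then restore the old value."""
    saved = getattr(obj, attr)
    setattr(obj, attr, value)
    try:
        yield
    finally:
        setattr(obj, attr, saved)

# Hypothetical usage mirroring the patch (model is a DeepSpeed engine):
# with temporarily(model._config, "zero_enabled", False):
#     model.load_checkpoint(cp_path, tag=".",
#                           load_optimizer_states=False,
#                           load_lr_scheduler_states=False,
#                           load_module_only=True)
```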
From 7937eab0b93e2fd067cf6cc4ef6caa5e0fb4ad48 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 13 Jan 2022 21:14:53 -0800 Subject: [PATCH 28/69] fix conversion script --- tasks/eval_harness/report-to-csv.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tasks/eval_harness/report-to-csv.py b/tasks/eval_harness/report-to-csv.py index 0c93f6466..3124cc4dd 100755 --- a/tasks/eval_harness/report-to-csv.py +++ b/tasks/eval_harness/report-to-csv.py @@ -43,16 +43,16 @@ versions = results["versions"] for k,v in sorted(results["results"].items()): + if k not in versions: + versions[k] = -1 + if "acc" in v: - row = [k, "acc", v["acc"], v["acc_stderr"]] + writer.writerow([k, "acc", v["acc"], v["acc_stderr"], versions[k]]) if "acc_norm" in v: - row = [k, "acc_norm", v["acc_norm"], v["acc_norm_stderr"]] + writer.writerow([k, "acc_norm", v["acc_norm"], v["acc_norm_stderr"], versions[k]]) if "f1" in v: - row = [k, "f1", v["f1"], v["f1_stderr"] if "f1_stderr" in v else ""] + writer.writerow([k, "f1", v["f1"], v["f1_stderr"] if "f1_stderr" in v else "", versions[k]]) # if "ppl" in v: - # row = [k, "ppl", v["ppl"], v["ppl_stderr"]] + # writer.writerow([k, "ppl", v["ppl"], v["ppl_stderr"], versions[k]]) # if "em" in v: - # row = [k, "em", v["em"], v["em_stderr"] if "em_stderr" in v else ""] - - row += [versions[k] if k in versions else -1] - writer.writerow(row) + # writer.writerow([k, "em", v["em"], v["em_stderr"] if "em_stderr" in v else "", versions[k]]) From afd3814220b072bf48f7326f2cec4c3d437a016c Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 13 Jan 2022 21:15:08 -0800 Subject: [PATCH 29/69] simpler config --- examples/run_evalharness_deepspeed.slurm | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/examples/run_evalharness_deepspeed.slurm b/examples/run_evalharness_deepspeed.slurm index 6ce0c2eca..eb4859f1b 100644 --- a/examples/run_evalharness_deepspeed.slurm +++ b/examples/run_evalharness_deepspeed.slurm @@ -52,23 +52,12 @@ MEGATRON_REQUIRED_ARGS=" \ ZERO_STAGE=0 config_json="./ds_config.json" -# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() cat < $config_json { - "train_micro_batch_size_per_gpu": 1 - "train_batch_size": 1 - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": $ZERO_STAGE - }, - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 500, - "hysteresis": 2, - "min_loss_scale": 1, - "initial_scale_power": 12 - }, + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "zero_optimization": { "stage": $ZERO_STAGE }, + "fp16": { "enabled": true }, "steps_per_print": 2000, "wall_clock_breakdown": false } From 9c60079f84521994c05922d5de7447847e124998 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 18 Jan 2022 12:26:08 -0800 Subject: [PATCH 30/69] corrections --- examples/run_evalharness_deepspeed.md | 10 ++++++---- examples/run_evalharness_deepspeed.slurm | 4 +++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/examples/run_evalharness_deepspeed.md b/examples/run_evalharness_deepspeed.md index a45d37d3f..303e18596 100644 --- a/examples/run_evalharness_deepspeed.md +++ b/examples/run_evalharness_deepspeed.md @@ -43,13 +43,17 @@ cp examples/run_evalharness_deepspeed.slurm run_evalharness-variant.slurm now edit `run_evalharness-variant.slurm` +Note that the eval code knows to pull the original training args from the checkpoint, so we don't need to pass any of those. And we just need to setup the evaluation args. + 1. 
Edit: ``` -PP_SIZE=2 +PP_SIZE=1 TP_SIZE=1 ``` -to match the original slurm script. But this is only needed to convert the checkpoint. The actual eval will happen on a single gpu. +to match the eval topology. If the model fits into 1 gpu, then there is nothing to change. + +The eval script will automatically reshape the model if it was of a different topology. 2. Adjust the following to fit the chosen GPU. As of last check for 1.3B model the settings are one of: @@ -65,8 +69,6 @@ EVAL_MICRO_BATCH_SIZE=12 # 32GB GPU 1.3B model --deepspeed_config ds_config.json \ ``` -Currently if `TP>1` you can't use the deepspeed path. - If you didn't disable it and the program crashed on checkpoint loading unable to find some key, disable deepspeed as explained above. diff --git a/examples/run_evalharness_deepspeed.slurm b/examples/run_evalharness_deepspeed.slurm index eb4859f1b..7c18cdefa 100644 --- a/examples/run_evalharness_deepspeed.slurm +++ b/examples/run_evalharness_deepspeed.slurm @@ -26,8 +26,10 @@ export HF_DATASETS_OFFLINE=1 cd $MEGATRON_DEEPSPEED_REPO -PP_SIZE=2 +# eval topology +PP_SIZE=1 TP_SIZE=1 + VOCAB_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-vocab.json MERGE_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-merges.txt SEQ_LEN=2048 From d8611374cb31a1b78243cfd9154da1762a0fe83b Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 18 Jan 2022 13:02:50 -0800 Subject: [PATCH 31/69] add logiqa --- examples/run_evalharness_deepspeed.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_evalharness_deepspeed.slurm b/examples/run_evalharness_deepspeed.slurm index 7c18cdefa..c7ac276c3 100644 --- a/examples/run_evalharness_deepspeed.slurm +++ b/examples/run_evalharness_deepspeed.slurm @@ -81,7 +81,7 @@ CMD="./tasks/eval_harness/evaluate.py \ --seq-length $SEQ_LEN \ --adaptive_seq_len \ --eval_fp32 \ - --task_list arc_challenge,arc_easy,boolq,copa,hellaswag,lambada,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sst,webqs,wic,winogrande,wnli,wsc,triviaqa,sciq \ + --task_list arc_challenge,arc_easy,boolq,copa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sst,webqs,wic,winogrande,wnli,wsc,triviaqa,sciq \ $MEGATRON_REQUIRED_ARGS \ " From 71587901f6c0efc3cc7d908ac30ad1691607eeb2 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 19 Jan 2022 12:18:44 -0800 Subject: [PATCH 32/69] dealing with custom tokenizers --- examples/run_evalharness_deepspeed.md | 18 +++++++++++++++++- examples/run_evalharness_deepspeed.slurm | 4 ++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/examples/run_evalharness_deepspeed.md b/examples/run_evalharness_deepspeed.md index 303e18596..c80a725de 100644 --- a/examples/run_evalharness_deepspeed.md +++ b/examples/run_evalharness_deepspeed.md @@ -31,6 +31,20 @@ Then install datasets for the tasks: python ./tasks/eval_harness/download.py --task_list arc_challenge,arc_easy,boolq,copa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc ``` +and make sure that `export HF_DATASETS_OFFLINE=1` + +If there are things like custom tokenizers, pre-download those too, e.g.: + +``` +python -c "frransformers import AutoTokenizer; AutoTokenizer.from_pretrained('bigscience/oscar_13_languages_alpha_weight')" +``` +and make sure that `export TRANSFORMERS_OFFLINE=1` is in the script. 
+You know there is a custom tokenizer if the training script had something like: + +``` +--tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path bigscience/oscar_13_languages_alpha_weight \ +``` 3. Prepare the slurm script @@ -62,7 +76,9 @@ EVAL_MICRO_BATCH_SIZE=6 # 16GB GPU 1.3B model EVAL_MICRO_BATCH_SIZE=12 # 32GB GPU 1.3B model ``` -3. If not using a Deepspeed path, disable it by removing: +If you get OOM lower it further. + +3. If not using the Deepspeed path, disable it by removing: ``` --deepspeed \ diff --git a/examples/run_evalharness_deepspeed.slurm b/examples/run_evalharness_deepspeed.slurm index c7ac276c3..e58ed9608 100644 --- a/examples/run_evalharness_deepspeed.slurm +++ b/examples/run_evalharness_deepspeed.slurm @@ -22,7 +22,11 @@ VARIANT="tr9c-1B3-swiglu" CHECKPOINT_PATH=/gpfsdsstore/projects/rech/six/commun/checkpoints/tr3m-1B3-emb-norm-pile/global_step296023 MEGATRON_DEEPSPEED_REPO=/gpfsssd/worksf/projects/rech/six/commun/code/eval/Megatron-DeepSpeed + +# you want these 2 on JZ, and pre-download/cache any datasets/tokenizers/models +# but comment these out if you're running on a node with Internet access export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 cd $MEGATRON_DEEPSPEED_REPO From f0da71decc2c4580db05d7a396c74638fe390811 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 21 Jan 2022 15:49:27 -0800 Subject: [PATCH 33/69] fix --- examples/run_evalharness_deepspeed.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_evalharness_deepspeed.md b/examples/run_evalharness_deepspeed.md index c80a725de..7166d4522 100644 --- a/examples/run_evalharness_deepspeed.md +++ b/examples/run_evalharness_deepspeed.md @@ -36,7 +36,7 @@ and make sure that `export HF_DATASETS_OFFLINE=1` If there are things like custom tokenizers, pre-download those too, e.g.: ``` -python -c "frransformers import AutoTokenizer; AutoTokenizer.from_pretrained('bigscience/oscar_13_languages_alpha_weight')" +python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('bigscience/oscar_13_languages_alpha_weight')" ``` and make sure that `export TRANSFORMERS_OFFLINE=1` is in the script. You know there is a custom tokenizer if the training script had something like: From 1e06f41e416865b0694f0a9ecaed3698ccc21c1b Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 18 Feb 2022 15:13:02 -0800 Subject: [PATCH 34/69] Update examples/run_evalharness_deepspeed.md --- examples/run_evalharness_deepspeed.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_evalharness_deepspeed.md b/examples/run_evalharness_deepspeed.md index 7166d4522..aa9f3a4d8 100644 --- a/examples/run_evalharness_deepspeed.md +++ b/examples/run_evalharness_deepspeed.md @@ -22,7 +22,7 @@ pip install git+https://github.com/EleutherAI/lm-evaluation-harness some symlinks due to lm-harness' issues with relative position of data ``` mkdir data -ln -s data tasks/eval_harness/data +ln -s `pwd`/data tasks/eval_harness/data ``` Also make sure `data` is not on one of the limited paritions like WORKSF. 
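For the custom-tokenizer case described above, the pre-download can be scripted on a node with internet access and then checked against the local cache before the offline job is submitted. A short sketch, assuming `transformers` is installed and the login node is online:

```python
# Sketch: warm the Hugging Face cache for a custom tokenizer so the offline GPU job
# (TRANSFORMERS_OFFLINE=1) can load it without network access.
from transformers import AutoTokenizer

NAME = "bigscience/oscar_13_languages_alpha_weight"  # tokenizer named in the doc above

AutoTokenizer.from_pretrained(NAME)                                # download into the cache
tok = AutoTokenizer.from_pretrained(NAME, local_files_only=True)   # verify the cache hit
print("tokenizer cached; vocab size =", tok.vocab_size)
```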
From 9ac9fadbedd1bb67f11e56e3d982fe373580a3b8 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 26 Apr 2022 08:48:31 -0700 Subject: [PATCH 35/69] check that the checkpoint path is valid --- tasks/eval_harness/evaluate.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index d7db9f335..aa4b798ec 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -286,6 +286,9 @@ def load_ds_checkpoint_and_setup_megatron(extra_args_provider): megatron.arguments._print_args = lambda *_args, **kwarg: None args = _parse_args(extra_args_provider) + if not os.path.exists(args.load): + raise ValueError(f"checkpoint path {args.load} doesn't exit") + ds_checkpoint = DeepSpeedCheckpoint(args.load, tp_degree=args.tensor_model_parallel_size, pp_degree=args.pipeline_model_parallel_size) From 8ef9018472849a5f3e27e856d21a913fa917db3a Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 26 Apr 2022 11:18:07 -0700 Subject: [PATCH 36/69] skip --abort_on_unmet_fused_kernel_constraints during eval --- tasks/eval_harness/evaluate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index aa4b798ec..a3844f4c8 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -296,8 +296,8 @@ def load_ds_checkpoint_and_setup_megatron(extra_args_provider): cp_args = ds_checkpoint.get_args() # Merge the current args with the checkpoint args. - skip_keys = ['world_size', 'rank', 'local_rank','device_count', 'micro_batch_size','global_batch_size', 'batch_size', 'tensorboard_dir', 'deepspeed', 'deepspeed_config', - 'data_parallel_size', 'pipeline_model_parallel_size', 'tensor_model_parallel_size', 'load', 'rampup_batch_size', 'iteration', 'inference'] + skip_keys = ['world_size', 'rank', 'local_rank', 'device_count', 'micro_batch_size', 'global_batch_size', 'batch_size', 'tensorboard_dir', 'deepspeed', 'deepspeed_config', + 'data_parallel_size', 'pipeline_model_parallel_size', 'tensor_model_parallel_size', 'load', 'rampup_batch_size', 'iteration', 'inference', 'abort_on_unmet_fused_kernel_constraints'] skip_if_specified = ['merge_file', 'vocab_file'] From a798d69ff110d0b9e6c4c902bec4308e11571be7 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 26 Apr 2022 11:19:05 -0700 Subject: [PATCH 37/69] disable sanity check on layers-2%pp==0 --- .../deepspeed_checkpoint.py | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/tools/convert_checkpoint/deepspeed_checkpoint.py b/tools/convert_checkpoint/deepspeed_checkpoint.py index c38e0d550..52dff44f2 100644 --- a/tools/convert_checkpoint/deepspeed_checkpoint.py +++ b/tools/convert_checkpoint/deepspeed_checkpoint.py @@ -1,6 +1,6 @@ import os from typing import Dict -import torch +import torch ZERO_FILE_PREFIX = 'zero_pp_rank_' LAYER_FILE_PREFIX = 'layer_' @@ -37,7 +37,7 @@ def __init__(self, dir, tp_degree=None, pp_degree=None): self.tp_degree = self.original_tp_degree if tp_degree is None else tp_degree self.pp_degree = self.original_pp_degree if pp_degree is None else pp_degree self.global_state = {} - + self._sanity_check() self.pp_to_transformer_map = self._build_pp_transformer_map() self.transformer_file_map = self._build_transformer_file_map() @@ -83,7 +83,7 @@ def get_args(self): self.global_state[ARGS_KEY] = sd.get(ARGS_KEY, None) return self.global_state[ARGS_KEY] - + def get_transformer_state(self, tp_index: int, pp_index: int) -> list: assert 
tp_index < self.tp_degree @@ -93,7 +93,7 @@ def get_transformer_state(self, tp_index: int, pp_index: int) -> list: sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in fname_list] sd = self._merge_state_dicts(sd_list) t_list.append(sd) - return t_list + return t_list def get_final_norm_state(self, tp_index:int) -> Dict: assert tp_index in self.tp_to_final_norm_map.keys() @@ -133,29 +133,35 @@ def _build_transformer_file_map(self): if not map_key in file_map.keys(): file_map[map_key] = [] file_map[map_key].append(layer_file_partitions[tp_index]) - + return file_map - + def _sanity_check(self): assert len(self.mp_rank_files) % self.tp_degree == 0 assert len(self.zero_files) % (self.pp_degree * self.tp_degree) == 0 assert len(self.layer_keys) > 2 - assert (len(self.layer_keys) - 2) % self.pp_degree == 0 - + + # XXX: disable for now, since this fails when using: + # --pp-partition-method 'type:transformer|embedding' + # so if it can detect this flag somehow it then should validate: + # assert (len(self.layer_keys)) % self.pp_degree == 0 + # the original: + # assert (len(self.layer_keys) - 2) % self.pp_degree == 0 + def _get_files_with_prefix(self, all_files, prefix): file_list = [] for file_path in all_files: _, fname = os.path.split(file_path) if fname.startswith(prefix): file_list.append(file_path) - + return sorted(file_list) def validate_files(self): for file in self.file_list: if not os.path.isfile(file): print(f'Error: {file} is not existent') - + def _get_files(self, dir): file_list = [] for root, dirs, files in os.walk(dir): @@ -165,7 +171,7 @@ def _get_files(self, dir): def _get_layer_keys(self): key_set = set() - key_len = len(LAYER_FILE_PREFIX) + 2 + key_len = len(LAYER_FILE_PREFIX) + 2 for file_path in self.layer_files: _, fname = os.path.split(file_path) key_set.add(fname[:key_len]) From 5884dcf356669073b29bf929fbd57b76d4f217aa Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 26 Apr 2022 15:04:27 -0700 Subject: [PATCH 38/69] sort skip_keys --- tasks/eval_harness/evaluate.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index a3844f4c8..d8fc7c471 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -296,8 +296,26 @@ def load_ds_checkpoint_and_setup_megatron(extra_args_provider): cp_args = ds_checkpoint.get_args() # Merge the current args with the checkpoint args. 
- skip_keys = ['world_size', 'rank', 'local_rank', 'device_count', 'micro_batch_size', 'global_batch_size', 'batch_size', 'tensorboard_dir', 'deepspeed', 'deepspeed_config', - 'data_parallel_size', 'pipeline_model_parallel_size', 'tensor_model_parallel_size', 'load', 'rampup_batch_size', 'iteration', 'inference', 'abort_on_unmet_fused_kernel_constraints'] + skip_keys = [ + 'abort_on_unmet_fused_kernel_constraints', + 'batch_size', + 'data_parallel_size', + 'deepspeed', + 'deepspeed_config', + 'device_count', + 'global_batch_size', + 'inference', + 'iteration', + 'load', + 'local_rank', + 'micro_batch_size', + 'pipeline_model_parallel_size', + 'rampup_batch_size', + 'rank', + 'tensor_model_parallel_size', + 'tensorboard_dir', + 'world_size', + ] skip_if_specified = ['merge_file', 'vocab_file'] From 45bd9c65bed88d95cbd312e4d7eaefdb45239e0c Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 10 May 2022 19:17:32 -0700 Subject: [PATCH 39/69] make the default path unique to avoid overwrite --- tasks/eval_harness/evaluate.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index d8fc7c471..a948cd68f 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -2,6 +2,7 @@ from logging import logMultiprocessing import os import sys +import datetime sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir,os.path.pardir))) @@ -373,10 +374,13 @@ def load_ds_checkpoint_and_setup_megatron(extra_args_provider): return model def tasks_args(parser): + + results_path_default = f"results-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.json" + """Provide extra arguments required for tasks.""" group = parser.add_argument_group(title='Evaluation options') group.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks.') - group.add_argument('--results_path', type=str, default = "./results.json", help='Path to where the results will be stored.') + group.add_argument('--results_path', type=str, default = results_path_default, help='Path to where the results will be stored.') group.add_argument('--adaptive_seq_len', default = False, action='store_true', help='Should the sequence length be adapted to the batch during evaluation, if in fp16 the results will be slightly different due to numerical errors but greatly speed up evaluation.') group.add_argument('--eval_fp32', default = False, action='store_true', help='Should the evaluation run in fp32') From f75e23269b91f198491c9ccb63498073ff7e3cfa Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Fri, 13 May 2022 11:30:09 +0200 Subject: [PATCH 40/69] Add bootstrap_iters arg --- tasks/eval_harness/evaluate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index a948cd68f..e5fc60208 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -384,6 +384,7 @@ def tasks_args(parser): group.add_argument('--adaptive_seq_len', default = False, action='store_true', help='Should the sequence length be adapted to the batch during evaluation, if in fp16 the results will be slightly different due to numerical errors but greatly speed up evaluation.') group.add_argument('--eval_fp32', default = False, action='store_true', help='Should the evaluation run in fp32') + group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr 
estimation') return parser from megatron.global_vars import _parse_args @@ -407,7 +408,7 @@ def main(): tokenizer = get_tokenizer() adaptor = EvalHarnessAdaptor(model, tokenizer) - results = evaluator.evaluate(adaptor, task_dict, False, 0, None) + results = evaluator.evaluate(adaptor, task_dict, False, 0, None, bootstrap_iters=args.bootstrap_iters) if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: print(json.dumps(results, indent=2)) From 7bf75b94917dd873e6c0de7078797443d781a5c8 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Fri, 13 May 2022 21:34:48 +0200 Subject: [PATCH 41/69] Explain bootstrap_iters flag --- examples/run_evalharness_deepspeed.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/run_evalharness_deepspeed.md b/examples/run_evalharness_deepspeed.md index aa9f3a4d8..786ad1d22 100644 --- a/examples/run_evalharness_deepspeed.md +++ b/examples/run_evalharness_deepspeed.md @@ -87,6 +87,9 @@ If you get OOM lower it further. If you didn't disable it and the program crashed on checkpoint loading unable to find some key, disable deepspeed as explained above. +4. Additional flags + +- To reduce the amount of iterations for stderr estimation, use e.g. `--bootstrap_iters 2`. This saves 1-2 minutes per dataset. ## Eval From 3f18e7be331c4d940fcf4328f3b987989aef27dc Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Sat, 14 May 2022 07:55:31 +0200 Subject: [PATCH 42/69] Intermediate results flag --- examples/run_evalharness_deepspeed.md | 1 + tasks/eval_harness/evaluate.py | 20 +++++++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/examples/run_evalharness_deepspeed.md b/examples/run_evalharness_deepspeed.md index 786ad1d22..038a35004 100644 --- a/examples/run_evalharness_deepspeed.md +++ b/examples/run_evalharness_deepspeed.md @@ -90,6 +90,7 @@ If you didn't disable it and the program crashed on checkpoint loading unable to 4. Additional flags - To reduce the amount of iterations for stderr estimation, use e.g. `--bootstrap_iters 2`. This saves 1-2 minutes per dataset. +- To print intermediate results when running multiple tasks use `--intermed_results`. 
## Eval diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index e5fc60208..f09db07e0 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -384,6 +384,7 @@ def tasks_args(parser): group.add_argument('--adaptive_seq_len', default = False, action='store_true', help='Should the sequence length be adapted to the batch during evaluation, if in fp16 the results will be slightly different due to numerical errors but greatly speed up evaluation.') group.add_argument('--eval_fp32', default = False, action='store_true', help='Should the evaluation run in fp32') + group.add_argument('--intermed_results', default = False, action='store_true', help='Whether to print & write intermediate results for each task') group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr estimation') return parser @@ -408,12 +409,25 @@ def main(): tokenizer = get_tokenizer() adaptor = EvalHarnessAdaptor(model, tokenizer) - results = evaluator.evaluate(adaptor, task_dict, False, 0, None, bootstrap_iters=args.bootstrap_iters) + + if args.intermed_results: + global_results = {"results": {}, "versions": {}} + for task_name, task in task_dict.items(): + results = evaluator.evaluate(adaptor, {task_name: task}, False, 0, 10, bootstrap_iters=args.bootstrap_iters) + if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: + print(json.dumps(results, indent=2)) + results_path = args.results_path.replace(".json", f"_{task_name}.json") + with open(f"{results_path}", 'w') as outfile: + json.dump(results, outfile, indent = 4) + global_results["results"] = {**global_results["results"], **results["results"]} + global_results["versions"] = {**global_results["versions"], **results["versions"]} + else: + global_results = evaluator.evaluate(adaptor, task_dict, False, 0, 10, bootstrap_iters=args.bootstrap_iters) if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: - print(json.dumps(results, indent=2)) + print(json.dumps(global_results, indent=2)) with open(args.results_path, 'w') as outfile: - json.dump(results, outfile, indent = 4) + json.dump(global_results, outfile, indent = 4) if __name__ == '__main__': main() From 213317f17ea656f1857d225c31a264eea740344d Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Sun, 15 May 2022 13:43:55 +0200 Subject: [PATCH 43/69] Add backup file --- tasks/eval_harness/evaluate.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index f09db07e0..8f5e4a84d 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -412,22 +412,27 @@ def main(): if args.intermed_results: global_results = {"results": {}, "versions": {}} + timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + iteration_id = args.load.split("/")[-1].replace("/", "") + results_path = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}.json") + # Backup file in case of interruption during writing + results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_backup.json") for task_name, task in task_dict.items(): results = evaluator.evaluate(adaptor, {task_name: task}, False, 0, 10, bootstrap_iters=args.bootstrap_iters) - if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: - print(json.dumps(results, indent=2)) - results_path = args.results_path.replace(".json", 
f"_{task_name}.json") - with open(f"{results_path}", 'w') as outfile: - json.dump(results, outfile, indent = 4) global_results["results"] = {**global_results["results"], **results["results"]} global_results["versions"] = {**global_results["versions"], **results["versions"]} + if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: + print(json.dumps(results, indent=2)) + with open(results_path, 'w') as outfile: + json.dump(global_results, outfile, indent=4) + with open(results_path_backup, 'w') as outfile: + json.dump(global_results, outfile, indent=4) else: global_results = evaluator.evaluate(adaptor, task_dict, False, 0, 10, bootstrap_iters=args.bootstrap_iters) - - if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: - print(json.dumps(global_results, indent=2)) - with open(args.results_path, 'w') as outfile: - json.dump(global_results, outfile, indent = 4) + if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: + print(json.dumps(global_results, indent=2)) + with open(args.results_path, 'w') as outfile: + json.dump(global_results, outfile, indent=4) if __name__ == '__main__': main() From 1c11b107991bf66fd1600f7dd6e2162f916e769c Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Sun, 15 May 2022 13:51:12 +0200 Subject: [PATCH 44/69] Add arg to reduce bubble for pipeline parallel --- examples/run_evalharness_deepspeed.md | 2 ++ tasks/eval_harness/evaluate.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/run_evalharness_deepspeed.md b/examples/run_evalharness_deepspeed.md index 038a35004..d80860a98 100644 --- a/examples/run_evalharness_deepspeed.md +++ b/examples/run_evalharness_deepspeed.md @@ -91,6 +91,8 @@ If you didn't disable it and the program crashed on checkpoint loading unable to - To reduce the amount of iterations for stderr estimation, use e.g. `--bootstrap_iters 2`. This saves 1-2 minutes per dataset. - To print intermediate results when running multiple tasks use `--intermed_results`. +- To reduce the bubble when setting PP use the flag `--micro_bs_multiplier`. Reducing `--micro-batch-size` may be needed when increasing the multiplier. + - Running the 176B model with `--micro_bs_multiplier 8` & `--micro-batch-size 4` produced the fastest results for PiQA on 1 node in 2min18s. ## Eval diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 8f5e4a84d..98b05402d 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -47,7 +47,7 @@ def __init__(self, model, tokenizer): # For ds we split into mini batches and then micro batches to keep pipelining api happy. 
# With Megatron we just go to micro_batches directly - self._batch_size = args.micro_batch_size + self._batch_size = args.micro_batch_size * args.micro_bs_multiplier self.cache_hook = CacheHook(None) self.is_main = args.rank == 0 @@ -386,6 +386,7 @@ def tasks_args(parser): group.add_argument('--eval_fp32', default = False, action='store_true', help='Should the evaluation run in fp32') group.add_argument('--intermed_results', default = False, action='store_true', help='Whether to print & write intermediate results for each task') group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr estimation') + group.add_argument('--micro_bs_multiplier', type=int, default=1, help='Increase the global batch size to remove bubble when pipeline parallel') return parser from megatron.global_vars import _parse_args From f3307058a19577382d475388c874bb8de66a9788 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Sun, 15 May 2022 13:55:44 +0200 Subject: [PATCH 45/69] Fix adaptive_seq_len via resetting activation shape --- tasks/eval_harness/evaluate.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 98b05402d..e0a582daf 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -207,6 +207,12 @@ def _model_call(self, inps): # dummy data iterator for pipelining. data_iterator = list((torch.stack(inp) for inp in utils.chunks(padded, args.micro_batch_size))) self.model.micro_batches = len(data_iterator) + + if self.adaptive_seq_len: + # Allow different shapes than the default seq_len to be communicated across pipes + # Without this Deepspeed will hang when trying to receive activations + self.model.reset_activation_shape() + output = self.model.eval_batch(iter(data_iterator), compute_loss = False, reduce_output = None) @@ -419,7 +425,7 @@ def main(): # Backup file in case of interruption during writing results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_backup.json") for task_name, task in task_dict.items(): - results = evaluator.evaluate(adaptor, {task_name: task}, False, 0, 10, bootstrap_iters=args.bootstrap_iters) + results = evaluator.evaluate(adaptor, {task_name: task}, False, 0, None, bootstrap_iters=args.bootstrap_iters) global_results["results"] = {**global_results["results"], **results["results"]} global_results["versions"] = {**global_results["versions"], **results["versions"]} if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: @@ -429,7 +435,7 @@ def main(): with open(results_path_backup, 'w') as outfile: json.dump(global_results, outfile, indent=4) else: - global_results = evaluator.evaluate(adaptor, task_dict, False, 0, 10, bootstrap_iters=args.bootstrap_iters) + global_results = evaluator.evaluate(adaptor, task_dict, False, 0, None, bootstrap_iters=args.bootstrap_iters) if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: print(json.dumps(global_results, indent=2)) with open(args.results_path, 'w') as outfile: From 50820352c6ff677e5856493f3c84653e285d1b1a Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Sun, 15 May 2022 14:04:44 +0200 Subject: [PATCH 46/69] Extract args.load prior to load_ds_checkpoint_and_setup_megatron --- tasks/eval_harness/evaluate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index e0a582daf..6d76e48ae 100644 --- 
a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -398,6 +398,7 @@ def tasks_args(parser): from megatron.global_vars import _parse_args def main(): + load_path = tasks_args.load model = load_ds_checkpoint_and_setup_megatron(extra_args_provider=tasks_args) args = get_args() @@ -420,7 +421,7 @@ def main(): if args.intermed_results: global_results = {"results": {}, "versions": {}} timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') - iteration_id = args.load.split("/")[-1].replace("/", "") + iteration_id = load_path.split("/")[-1].replace("/", "") results_path = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}.json") # Backup file in case of interruption during writing results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_backup.json") From db203cce9b42ca5e3441edec4849ba2134d80788 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Sun, 15 May 2022 14:28:28 +0200 Subject: [PATCH 47/69] Parse args prior to loading function to get load_path --- tasks/eval_harness/evaluate.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 6d76e48ae..68dd649fd 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -286,12 +286,9 @@ def override_args(args, override_args, skip_keys, skip_if_specified_keys): # Keeping the default-value of newer arguments. # # We then use the megatron deepspeed converter to load the deepspeed checkpoints as if they we're megatron checkpoints. -def load_ds_checkpoint_and_setup_megatron(extra_args_provider): - # parse the megatorn args. But wait with initalizing megatron. - # avoid printing the arguments, since they will later be overridden. +def load_ds_checkpoint_and_setup_megatron(args): _print_args = megatron.arguments._print_args megatron.arguments._print_args = lambda *_args, **kwarg: None - args = _parse_args(extra_args_provider) if not os.path.exists(args.load): raise ValueError(f"checkpoint path {args.load} doesn't exit") @@ -398,8 +395,11 @@ def tasks_args(parser): from megatron.global_vars import _parse_args def main(): - load_path = tasks_args.load - model = load_ds_checkpoint_and_setup_megatron(extra_args_provider=tasks_args) + # parse the megatron args. But wait with initalizing megatron. + # avoid printing the arguments, since they will later be overridden. + args = _parse_args(tasks_args) + load_path = args.load + model = load_ds_checkpoint_and_setup_megatron(args) args = get_args() if args.deepspeed and args.adaptive_seq_len: From 1d6c630a1782699f82e93e475899a50a14891667 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Mon, 16 May 2022 11:28:59 +0200 Subject: [PATCH 48/69] Add run_evalharness-tr11-176b-ml slurm script --- examples/run_evalharness_deepspeed.md | 2 +- examples/run_evalharness_tr11-176b-ml.slurm | 121 ++++++++++++++++++++ 2 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 examples/run_evalharness_tr11-176b-ml.slurm diff --git a/examples/run_evalharness_deepspeed.md b/examples/run_evalharness_deepspeed.md index d80860a98..695d9d0aa 100644 --- a/examples/run_evalharness_deepspeed.md +++ b/examples/run_evalharness_deepspeed.md @@ -92,7 +92,7 @@ If you didn't disable it and the program crashed on checkpoint loading unable to - To reduce the amount of iterations for stderr estimation, use e.g. `--bootstrap_iters 2`. This saves 1-2 minutes per dataset. 
- To print intermediate results when running multiple tasks use `--intermed_results`. - To reduce the bubble when setting PP use the flag `--micro_bs_multiplier`. Reducing `--micro-batch-size` may be needed when increasing the multiplier. - - Running the 176B model with `--micro_bs_multiplier 8` & `--micro-batch-size 4` produced the fastest results for PiQA on 1 node in 2min18s. + - Running the 176B model with PP=8, `--micro_bs_multiplier 8` & `--micro-batch-size 4` produced the fastest results for PiQA on 1 node in 2min18s. ## Eval diff --git a/examples/run_evalharness_tr11-176b-ml.slurm b/examples/run_evalharness_tr11-176b-ml.slurm new file mode 100644 index 000000000..6d4849461 --- /dev/null +++ b/examples/run_evalharness_tr11-176b-ml.slurm @@ -0,0 +1,121 @@ +#!/bin/bash +#SBATCH --job-name=run_evalharness-tr11-176b-ml +#SBATCH --partition=gpu_p5 +#SBATCH --constraint=a100 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=64 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name +#SBATCH --account=six@a100 + + +set -x -e + +source $six_ALL_CCFRWORK/start-py38-pt111 + +echo "START TIME: $(date)" + +# a unique identifier for the current eval ideally correspnding to the modelname +VARIANT="tr11-176b-ml" + + +CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step50000 +MEGATRON_DEEPSPEED_REPO=/gpfsssd/worksf/projects/rech/six/commun/code/eval/Megatron-DeepSpeed +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics + +cd $MEGATRON_DEEPSPEED_REPO + +TOKENIZER_NAME_OR_PATH=bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles + +PP_SIZE=8 +TP_SIZE=1 +SEQ_LEN=2048 + +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 + +#dummy arguments to make megatron happy. 
+MEGATRON_REQUIRED_ARGS=" \ + --num-layers -1 \ + --hidden-size -1 \ + --num-attention-heads -1 \ + --seq-length -1 \ + --max-position-embeddings -1 \ +" + + +ZERO_STAGE=0 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + + +CMD="./tasks/eval_harness/evaluate.py \ + --load $CHECKPOINT_PATH \ + --results_path $VARIANT-results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --bf16 \ + --inference \ + --seq-length $SEQ_LEN \ + --task_list arc_challenge,arc_easy,boolq,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc \ + --deepspeed \ + --deepspeed_config ds_config.json \ + --bootstrap_iters 2 \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier 4 \ + $MEGATRON_REQUIRED_ARGS \ + " + +GPUS_PER_NODE=8 +NNODES=$SLURM_NNODES +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +export CUDA_LAUNCH_BLOCKING=1 + +echo $LAUNCHER $CMD + +export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO + +$LAUNCHER $CMD 2>&1 | tee $VARIANT-eval-harness.log From 72447451777d6ad67d7cb7ed0f8e2ed0e678cee9 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Wed, 29 Jun 2022 13:06:23 +0200 Subject: [PATCH 49/69] Add bseval_harness fork compatibility --- examples/benchmark.slurm | 121 +++++ .../run_bsevalharness_tr11-176b-ml.slurm | 121 +++++ examples/{ => evalharness}/run_evalharness.sh | 0 .../run_evalharness_deepspeed.md | 2 + .../run_evalharness_deepspeed.slurm | 0 .../run_evalharness_tr11-176b-ml.slurm | 0 tasks/eval_harness/evaluate_bsevalharness.py | 462 ++++++++++++++++++ 7 files changed, 706 insertions(+) create mode 100644 examples/benchmark.slurm create mode 100644 examples/evalharness/run_bsevalharness_tr11-176b-ml.slurm rename examples/{ => evalharness}/run_evalharness.sh (100%) rename examples/{ => evalharness}/run_evalharness_deepspeed.md (96%) rename examples/{ => evalharness}/run_evalharness_deepspeed.slurm (100%) rename examples/{ => evalharness}/run_evalharness_tr11-176b-ml.slurm (100%) create mode 100644 tasks/eval_harness/evaluate_bsevalharness.py diff --git a/examples/benchmark.slurm b/examples/benchmark.slurm new file mode 100644 index 000000000..0609aedea --- /dev/null +++ b/examples/benchmark.slurm @@ -0,0 +1,121 @@ +#!/bin/bash +#SBATCH --job-name=run_evalharness-tr11-176b-ml +#SBATCH --partition=gpu_p5 +#SBATCH --constraint=a100 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
+#SBATCH --cpus-per-task=64 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name +#SBATCH --account=six@a100 + + +set -x -e + +source $six_ALL_CCFRWORK/start-py38-pt111 + +echo "START TIME: $(date)" + +# a unique identifier for the current eval ideally correspnding to the modelname +VARIANT="tr11-176b-ml" + + +CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step85000 +MEGATRON_DEEPSPEED_REPO=/gpfsssd/worksf/projects/rech/six/commun/code/eval/Megatron-DeepSpeed +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics + +cd $MEGATRON_DEEPSPEED_REPO + +TOKENIZER_NAME_OR_PATH=bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles + +PP_SIZE=8 +TP_SIZE=1 +SEQ_LEN=2048 + +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 + +#dummy arguments to make megatron happy. +MEGATRON_REQUIRED_ARGS=" \ + --num-layers -1 \ + --hidden-size -1 \ + --num-attention-heads -1 \ + --seq-length -1 \ + --max-position-embeddings -1 \ +" + + +ZERO_STAGE=0 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + + +CMD="./tasks/eval_harness/benchmark.py \ + --load $CHECKPOINT_PATH \ + --results_path $VARIANT-results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --bf16 \ + --inference \ + --seq-length $SEQ_LEN \ + --task_list arc_challenge,arc_easy,boolq,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc \ + --deepspeed \ + --deepspeed_config ds_config.json \ + --bootstrap_iters 2 \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier 4 \ + $MEGATRON_REQUIRED_ARGS \ + " + +GPUS_PER_NODE=8 +NNODES=$SLURM_NNODES +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +export CUDA_LAUNCH_BLOCKING=1 + +echo $LAUNCHER $CMD + +export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO + +$LAUNCHER $CMD 2>&1 | tee $VARIANT-eval-harness.log diff --git a/examples/evalharness/run_bsevalharness_tr11-176b-ml.slurm b/examples/evalharness/run_bsevalharness_tr11-176b-ml.slurm new file mode 100644 index 000000000..722259d49 --- /dev/null +++ 
b/examples/evalharness/run_bsevalharness_tr11-176b-ml.slurm @@ -0,0 +1,121 @@ +#!/bin/bash +#SBATCH --job-name=run_bsevalharness-tr11-176b-ml +#SBATCH --partition=gpu_p5 +#SBATCH --constraint=a100 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=64 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name +#SBATCH --account=six@a100 +#SBATCH --reservation=hug + + +set -x -e + +source $six_ALL_CCFRWORK/start-muennighofflmeval + +echo "START TIME: $(date)" + +# a unique identifier for the current eval ideally correspnding to the modelname +VARIANT="tr11-176b-ml-bsevalharness" + + +CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step90000 +MEGATRON_DEEPSPEED_REPO=$six_ALL_CCFRSCRATCH/commun/experiments/muennighoff/Megatron-DeepSpeed +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics + +cd $MEGATRON_DEEPSPEED_REPO + +TOKENIZER_NAME_OR_PATH=bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles + +PP_SIZE=8 +TP_SIZE=1 +SEQ_LEN=2048 + +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 + +#dummy arguments to make megatron happy. +MEGATRON_REQUIRED_ARGS=" \ + --num-layers -1 \ + --hidden-size -1 \ + --num-attention-heads -1 \ + --seq-length -1 \ + --max-position-embeddings -1 \ +" + + +ZERO_STAGE=0 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + + +CMD="./tasks/eval_harness/evaluate_bsevalharness.py \ + --load $CHECKPOINT_PATH \ + --results_path $VARIANT-results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --bf16 \ + --inference \ + --seq-length $SEQ_LEN \ + --task_list tydiqa_primary \ + --deepspeed \ + --deepspeed_config ds_config.json \ + --bootstrap_iters 2 \ + --adaptive_seq_len \ + --micro_bs_multiplier 4 \ + $MEGATRON_REQUIRED_ARGS \ + " + +GPUS_PER_NODE=8 +NNODES=$SLURM_NNODES +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +export CUDA_LAUNCH_BLOCKING=1 + +echo $LAUNCHER $CMD + +export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO + +$LAUNCHER $CMD 2>&1 | tee $VARIANT-eval-harness.log diff --git a/examples/run_evalharness.sh b/examples/evalharness/run_evalharness.sh similarity index 
100% rename from examples/run_evalharness.sh rename to examples/evalharness/run_evalharness.sh diff --git a/examples/run_evalharness_deepspeed.md b/examples/evalharness/run_evalharness_deepspeed.md similarity index 96% rename from examples/run_evalharness_deepspeed.md rename to examples/evalharness/run_evalharness_deepspeed.md index 695d9d0aa..bfdf9f601 100644 --- a/examples/run_evalharness_deepspeed.md +++ b/examples/evalharness/run_evalharness_deepspeed.md @@ -59,6 +59,8 @@ now edit `run_evalharness-variant.slurm` Note that the eval code knows to pull the original training args from the checkpoint, so we don't need to pass any of those. And we just need to setup the evaluation args. +Note that for the bigscience lm-eval-harness fork (https://github.com/bigscience-workshop/lm-evaluation-harness), the corresponding scripts are `evaluate_bsevalharness.py` & `run_bsevalharness_tr11-176b-ml.slurm`. + 1. Edit: ``` diff --git a/examples/run_evalharness_deepspeed.slurm b/examples/evalharness/run_evalharness_deepspeed.slurm similarity index 100% rename from examples/run_evalharness_deepspeed.slurm rename to examples/evalharness/run_evalharness_deepspeed.slurm diff --git a/examples/run_evalharness_tr11-176b-ml.slurm b/examples/evalharness/run_evalharness_tr11-176b-ml.slurm similarity index 100% rename from examples/run_evalharness_tr11-176b-ml.slurm rename to examples/evalharness/run_evalharness_tr11-176b-ml.slurm diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py new file mode 100644 index 000000000..595e70e66 --- /dev/null +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -0,0 +1,462 @@ +""" +An evaluate function compatible with https://github.com/bigscience-workshop/lm-evaluation-harness +at commit 2d968c60fc8bd808e5e475ca300781f774d234c1 + +Env Setup: +git clone https://github.com/bigscience-workshop/lm-evaluation-harness +cd lm-evaluation-harness +pip install "promptsource @ git+https://github.com/bigscience-workshop/promptsource@eval-hackathon" +pip install -e ".[dev]" +& then: https://github.com/bigscience-workshop/bigscience/blob/12f06bd39221f2e3788524ea86139ac1ac2b1b1a/jz/envs/README.md#creating-production-conda-env + +Usage: + +""" + +from functools import reduce +from logging import logMultiprocessing +import os +import sys +import datetime +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), + os.path.pardir,os.path.pardir))) + +# from lm_eval.models.gpt2 import GPT2LM +from lm_eval import evaluator, tasks +from lm_eval.api import utils +from lm_eval.api.model import CacheHook +from tqdm import tqdm +import torch.nn.functional as F + +from lm_eval.tasks import ALL_TASKS +from pretrain_gpt import model_provider +import numpy as np + +import torch +from megatron import get_args +from megatron import print_rank_0 +from megatron import get_tokenizer +from megatron import mpu +from megatron.training import setup_model_and_optimizer, get_model +from megatron.mpu.mappings import gather_from_tensor_model_parallel_region + +from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model +from megatron.p2p_communication import recv_forward, send_forward +import pickle +import json + +from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP +from megatron.model.distributed import DistributedDataParallel as LocalDDP +from megatron.model.module import Float16Module +from deepspeed.runtime.pipe import schedule + +class EvalHarnessAdaptor: + def __init__(self, model, tokenizer): + args = 
get_args() + self.args = args + self.model = model + self.tokenizer = tokenizer + self.VOCAB_SIZE = tokenizer.vocab_size + self.EOT_TOKEN_ID = tokenizer.eod + + self._max_length = args.seq_length + + # For ds we split into mini batches and then micro batches to keep pipelining api happy. + # With Megatron we just go to micro_batches directly + self._batch_size = args.micro_batch_size * args.micro_bs_multiplier + + self.cache_hook = CacheHook(None) + self.is_main = args.rank == 0 + self.is_local_main = args.local_rank == 0 + self._device = torch.cuda.current_device() + self.is_model_parallel = mpu.get_tensor_model_parallel_world_size() > 1 + self.is_pipe_parallel = mpu.get_pipeline_model_parallel_world_size() > 1 + self.is_data_parallel = mpu.get_data_parallel_world_size() > 1 + self.adaptive_seq_len = args.adaptive_seq_len + if self.is_data_parallel: + raise NotImplementedError("Data parallelism is currently not supported for evaluation") + + self.is_last_stage = True if not self.is_pipe_parallel else mpu.is_pipeline_last_stage() # only the last stage of the pipeline model will receive the logits + + @property + def max_length(self): + return self._max_length + + @property + def batch_size(self): + return self._batch_size + + @property + def device(self): + return self._device + + + def loglikelihood(self, requests): + new_reqs = [] + for context, continuation in requests: + if context == "": + # end of text as context + context_enc = [self.EOT_TOKEN_ID] + else: + context_enc = self.tokenizer_encode(context) + + continuation_enc = self.tokenizer_encode(continuation) + + new_reqs.append(((context, continuation), context_enc, continuation_enc)) + + return self._loglikelihood_tokens(new_reqs) + + def loglikelihood_rolling(self, requests): + # TODO: Implement caching once we've confirmed the perplexity implementation + # TODO: automatic batch size detection for vectorization + + loglikelihoods = [] + with torch.no_grad(): + for string, in tqdm(requests): + rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows( + token_list=self.tokenizer_encode(string), + prefix_token=self.EOT_TOKEN_ID, + max_seq_len=self.max_length, + context_len=1, + ))) + + rolling_token_windows = [(None,) + x for x in rolling_token_windows] + + # TODO: extract out this call so it only gets called once and also somehow figure out partial caching for that + string_nll = self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True) + + # discard is_greedy + string_nll = [x[0] for x in string_nll] + + string_nll = sum(string_nll) + loglikelihoods.append(string_nll) + + return loglikelihoods + + def _loglikelihood_tokens(self, requests, disable_tqdm=False): + disable_tqdm = disable_tqdm if self.is_main else True + res = [] + res_len = 0 # storing the result length for later + self.model.eval() + with torch.no_grad(): + def _collate(x): + toks = x[1] + x[2] + return (-len(toks), tuple(toks)) + + reord = utils.Reorderer(requests, _collate) + for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size): + inps, contlens, inplens, padding_length = [], [], [], None + for _, context_enc, continuation_enc in chunk: + # when too long to fit in context, truncate from the left + inp = torch.tensor( + (context_enc + continuation_enc)[-(self.max_length + 1):][:-1] + , dtype=torch.long).to(self.device) + inplen, = inp.shape + + cont = continuation_enc + + # since in _collate we make sure length is descending, the longest is always the first one. 
+ padding_length = padding_length if padding_length is not None else inplen + if not self.adaptive_seq_len: + padding_length = self.max_length + # pad to length + inp = torch.cat([ + inp, # [seq] + torch.zeros(padding_length - inplen, dtype=torch.long).to(inp.device) # [padding_length - seq] + ], dim=0) + + inps.append(inp.unsqueeze(0)) + + contlens.append(cont) + inplens.append(inplen) + + logits = self._model_call(torch.cat(inps, dim=0)) + res_len += len(chunk) + if logits is not None: + multi_logits = F.log_softmax(logits, dim=-1).cpu() # [batch, seq, vocab] + + for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(chunk, multi_logits, inps, inplens, contlens): + contlen = len(cont_toks) + logits = logits[inplen - contlen:inplen].unsqueeze(0) # [1, seq, vocab] + greedy_tokens = logits.argmax(dim=-1) + # cont_toks :: [1, seq] + cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0) + max_equal = (greedy_tokens == cont_toks).all() + # last_token_slice = logits[:, -1, :].squeeze(0).tolist() + + logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [1, seq] + answer = (float(logits.sum()), bool(max_equal)) + # partial caching + if cache_key is not None: + self.cache_hook.add_partial("loglikelihood", cache_key, answer) + res.append(answer) + + if not mpu.is_pipeline_last_stage(): + # @HACK: To make the eval harness happy on threads that don't have access to the results. + # We just randomly generate some data. + res = [(np.random.rand(), np.random.rand()>0.5) for _ in requests] + + return reord.get_original(res) + + def create_model_inputs(self, tokens): + args = get_args() + + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + self.EOT_TOKEN_ID, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + prefix_indices=None, + loss_on_targets_only=False) + + return (tokens, position_ids, attention_mask), (tokens, loss_mask) + + def _model_call(self, inps): + args = get_args() + + if args.deepspeed: + self.model.set_batch_fn(self.create_model_inputs) + # round up to multiple of micro_batch_size + new_size = ((len(inps) + args.micro_batch_size-1) // args.micro_batch_size) * args.micro_batch_size + padded = F.pad(inps, (0, 0, 0, new_size-len(inps)), value = 0) + # dummy data iterator for pipelining. + data_iterator = list((torch.stack(inp) for inp in utils.chunks(padded, args.micro_batch_size))) + self.model.micro_batches = len(data_iterator) + + if self.adaptive_seq_len: + # Allow different shapes than the default seq_len to be communicated across pipes + # Without this Deepspeed will hang when trying to receive activations + self.model.reset_activation_shape() + + output = self.model.eval_batch(iter(data_iterator), compute_loss = False, reduce_output = None) + + + if output is not None: + output = torch.cat(output, 0)[:len(inps)] + else: + output = None + + # hack #2 for adaptive_seq_len to work as total_loss gets appended to and shapes aren't the same + if args.adaptive_seq_len: + self.model.total_loss = None + else: + # Since the shape of the micro-batch will change + # We need set the correct shapes here + # So that latter pipeline stages knows which shapes to expect. + # Otherwise we will deadlock. + + args.micro_batch_size = len(inps) + args.seq_length = len(inps[0]) + args.max_position_embeddings = args.seq_length + + input_tensor = recv_forward() + + # Forward pass through the model. 
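# --- Illustrative aside (editor's sketch, not part of the patch): the rounding and
# padding done by the DeepSpeed branch above, shown on a dummy [batch, seq] tensor.
# The real code builds micro batches with the harness' utils.chunks; split() is used
# here only to keep the sketch self-contained.
import torch
import torch.nn.functional as F

micro_batch_size = 4
inps = torch.ones(6, 10, dtype=torch.long)  # 6 requests of 10 tokens each (dummy values)

# round the batch up to a multiple of micro_batch_size so every pipeline micro batch is full
new_size = ((len(inps) + micro_batch_size - 1) // micro_batch_size) * micro_batch_size
padded = F.pad(inps, (0, 0, 0, new_size - len(inps)), value=0)   # pads the batch dim only
micro_batches = list(padded.split(micro_batch_size, dim=0))

assert padded.shape == (8, 10) and len(micro_batches) == 2
# After eval_batch, only the first len(inps) rows of the concatenated output are kept,
# so the padded rows never influence the reported results.
# --- end of aside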
+ unwrapped_model = unwrap_model(self.model, (torchDDP, LocalDDP, Float16Module)) + unwrapped_model.set_input_tensor(input_tensor) + output = self.model(*self.create_model_inputs(inps)[0]) + send_forward(output) + + if mpu.is_pipeline_last_stage(): + return gather_from_tensor_model_parallel_region(output)[..., :self.tokenizer.vocab_size] + else: + return None + + def tokenizer_encode(self, text): + """Tokenize text *without* adding special tokens.""" + # Splitting this into its own method in case we need to handle special cases for different tokenizers + from megatron.tokenizer.gpt2_tokenization import GPT2Tokenizer + if isinstance(self.tokenizer.tokenizer, GPT2Tokenizer): + return self.tokenizer.tokenizer.encode(text) + else: + return self.tokenizer.tokenizer.encode(text, add_special_tokens=False) + + +from megatron.initialize import initialize_megatron +import megatron + +from tools.convert_checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint +from tools.convert_checkpoint.deepspeed_to_megatron import _create_rank_checkpoint + +def override_args(args, override_args, skip_keys, skip_if_specified_keys): + for k, v in vars(override_args).items(): + if k in skip_keys: + continue + if k in skip_if_specified_keys and getattr(args, k) is not None: + continue + setattr(args, k, v) + + +# Note(Hesslow): +# The model loading is a bit convoluted. +# We want to parse out the model arguments from the checkpoint and use those to initialize megatron-ds. +# +# However megatron-ds expects its arguments on the command line. +# And at that point we don't know them. +# +# Instead we use Jasons way: we load the arguments form the checkpoint and then override _parse_args to return whatever args we want. +# +# If the checkpoint is old, some new arguments may have been introduced and the code will expect these arguments to exist. +# In order to support this we _first_ parse the arguments normally, and then override them with the arguments from the checkpoint. +# Keeping the default-value of newer arguments. +# +# We then use the megatron deepspeed converter to load the deepspeed checkpoints as if they we're megatron checkpoints. +def load_ds_checkpoint_and_setup_megatron(args): + _print_args = megatron.arguments._print_args + megatron.arguments._print_args = lambda *_args, **kwarg: None + + if not os.path.exists(args.load): + raise ValueError(f"checkpoint path {args.load} doesn't exit") + + ds_checkpoint = DeepSpeedCheckpoint(args.load, + tp_degree=args.tensor_model_parallel_size, + pp_degree=args.pipeline_model_parallel_size) + + + cp_args = ds_checkpoint.get_args() + # Merge the current args with the checkpoint args. + skip_keys = [ + 'abort_on_unmet_fused_kernel_constraints', + 'batch_size', + 'data_parallel_size', + 'deepspeed', + 'deepspeed_config', + 'device_count', + 'global_batch_size', + 'inference', + 'iteration', + 'load', + 'local_rank', + 'micro_batch_size', + 'pipeline_model_parallel_size', + 'rampup_batch_size', + 'rank', + 'tensor_model_parallel_size', + 'tensorboard_dir', + 'world_size', + ] + + skip_if_specified = ['merge_file', 'vocab_file'] + + if args.eval_fp32: + cp_args.fp16 = False + cp_args.bf16 = False + cp_args.params_dtype = torch.float32 + + override_args(args, cp_args, skip_keys, skip_if_specified) + + # stop megatron from reparsing the arguments. + megatron.global_vars._parse_args = lambda *_args, **kwarg: args + megatron.global_vars._GLOBAL_ARGS = args + + initialize_megatron() + torch.distributed.barrier() + + # Initializing megatron will update eg. tokenizer size. 
Override again. + override_args(args, cp_args, skip_keys, skip_if_specified) + + # print final arguments. + _print_args(args) + if args.deepspeed: + + # Hack #3: + # Loading pipelined models in deepspeed with different TP than it was originally trained on fails + # due to a sanity check, that makes sure that all state_dicts that we merge contains attention layers. + # This, however, is not true for pipelining when we will merge the state_dict for the embeddings which + # which does not contain these attention-specific keys. + # + # Deepspeed does however manage to load the model if we just turn off this sanity check. + import deepspeed + deepspeed.runtime.state_dict_factory.MegatronSDLoader.sanity_check = lambda self, ckpt_file_name: None + + + cp_path = args.load + args.load = None + model, _, _ = setup_model_and_optimizer(model_provider) + model = model[0] + zero_enabled = model._config.zero_enabled + model._config.zero_enabled = False + _, _ = model.load_checkpoint(cp_path, tag = '.', load_optimizer_states=False, load_lr_scheduler_states=False, load_module_only=True) + model._config.zero_enabled = zero_enabled + else: + model = get_model(model_provider)[0] + # Initialize megatron model using the parsed state dict. + sd = _create_rank_checkpoint(ds_checkpoint, None, mpu.get_tensor_model_parallel_rank(), mpu.get_pipeline_model_parallel_rank(), True) + + model.load_state_dict(sd['model'], strict=True) + + if args.eval_fp32: + model = model.float() + + torch.distributed.barrier() + return model + +def tasks_args(parser): + + results_path_default = f"results-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.json" + + """Provide extra arguments required for tasks.""" + group = parser.add_argument_group(title='Evaluation options') + group.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks.') + group.add_argument('--results_path', type=str, default = results_path_default, help='Path to where the results will be stored.') + group.add_argument('--adaptive_seq_len', default = False, action='store_true', + help='Should the sequence length be adapted to the batch during evaluation, if in fp16 the results will be slightly different due to numerical errors but greatly speed up evaluation.') + group.add_argument('--eval_fp32', default = False, action='store_true', help='Should the evaluation run in fp32') + group.add_argument('--intermed_results', default = False, action='store_true', help='Whether to print & write intermediate results for each task') + group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr estimation') + group.add_argument('--micro_bs_multiplier', type=int, default=1, help='Increase the global batch size to remove bubble when pipeline parallel') + return parser + +from megatron.global_vars import _parse_args + +def main(): + # parse the megatron args. But wait with initalizing megatron. + # avoid printing the arguments, since they will later be overridden. 
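# --- Illustrative aside (editor's sketch, not part of the patch): what the
# checkpoint-args merge in load_ds_checkpoint_and_setup_megatron amounts to.
# merge() mirrors override_args() defined above; the Namespace values are made up.
from argparse import Namespace

def merge(args, ckpt_args, skip_keys, skip_if_specified):
    for k, v in vars(ckpt_args).items():
        if k in skip_keys:
            continue
        if k in skip_if_specified and getattr(args, k) is not None:
            continue
        setattr(args, k, v)

cli = Namespace(load="new/ckpt", seq_length=None, vocab_file="my-vocab.json", new_arg=1)
ckpt = Namespace(load="old/ckpt", seq_length=2048, vocab_file="gpt2-vocab.json")

merge(cli, ckpt, skip_keys=["load"], skip_if_specified=["vocab_file"])
assert cli.load == "new/ckpt"              # skip_keys: the value passed now wins
assert cli.vocab_file == "my-vocab.json"   # explicitly specified, so not overridden
assert cli.seq_length == 2048              # everything else comes from the checkpoint
assert cli.new_arg == 1                    # args the old checkpoint lacks keep their default
# --- end of aside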
+ args = _parse_args(tasks_args) + load_path = args.load + model = load_ds_checkpoint_and_setup_megatron(args) + + args = get_args() + if args.deepspeed and args.adaptive_seq_len: + # adaptive_seq_len hack #1: + # CL automatically enables reset_activation_shape() which allows us to change input shapes + # and it also reshapes the attenion scores in attention_mask_func + args.curriculum_learning = 1 + + task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') + task_dict = tasks.get_task_dict_promptsource(task_list) + + model.module.activation_checkpoint_interval = 0 + model._compute_loss = False + model.fwd_outputs = [] + + tokenizer = get_tokenizer() + adaptor = EvalHarnessAdaptor(model, tokenizer) + + if args.intermed_results: + global_results = {"results": {}, "versions": {}} + timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + iteration_id = load_path.split("/")[-1].replace("/", "") + results_path = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}.json") + # Backup file in case of interruption during writing + results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_backup.json") + for task_name, task in task_dict.items(): + results = evaluator.evaluate(lm=adaptor, task_dict={task_name: task}, bootstrap_iters=args.bootstrap_iters) + global_results["results"] = {**global_results["results"], **results["results"]} + global_results["versions"] = {**global_results["versions"], **results["versions"]} + if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: + print(json.dumps(results, indent=2)) + with open(results_path, 'w') as outfile: + json.dump(global_results, outfile, indent=4) + with open(results_path_backup, 'w') as outfile: + json.dump(global_results, outfile, indent=4) + else: + global_results = evaluator.evaluate(lm=adaptor, task_dict=task_dict, bootstrap_iters=args.bootstrap_iters) + if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: + print(json.dumps(global_results, indent=2)) + with open(args.results_path, 'w') as outfile: + json.dump(global_results, outfile, indent=4) + +if __name__ == '__main__': + main() From 6fd4646b20699a838d7fa04e3255496e72b150f9 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Wed, 29 Jun 2022 13:07:54 +0200 Subject: [PATCH 50/69] Remove superfluous script --- examples/benchmark.slurm | 121 --------------------------------------- 1 file changed, 121 deletions(-) delete mode 100644 examples/benchmark.slurm diff --git a/examples/benchmark.slurm b/examples/benchmark.slurm deleted file mode 100644 index 0609aedea..000000000 --- a/examples/benchmark.slurm +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=run_evalharness-tr11-176b-ml -#SBATCH --partition=gpu_p5 -#SBATCH --constraint=a100 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
-#SBATCH --cpus-per-task=64 # number of cores per tasks -#SBATCH --hint=nomultithread # we get physical cores not logical -#SBATCH --gres=gpu:8 # number of gpus -#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=%x-%j.out # output file name -#SBATCH --account=six@a100 - - -set -x -e - -source $six_ALL_CCFRWORK/start-py38-pt111 - -echo "START TIME: $(date)" - -# a unique identifier for the current eval ideally correspnding to the modelname -VARIANT="tr11-176b-ml" - - -CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step85000 -MEGATRON_DEEPSPEED_REPO=/gpfsssd/worksf/projects/rech/six/commun/code/eval/Megatron-DeepSpeed -export HF_DATASETS_OFFLINE=1 -export TRANSFORMERS_OFFLINE=1 - -export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models -export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets -export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules -export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics - -cd $MEGATRON_DEEPSPEED_REPO - -TOKENIZER_NAME_OR_PATH=bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles - -PP_SIZE=8 -TP_SIZE=1 -SEQ_LEN=2048 - -# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS -# make as big as it can fit into gpu w/o OOM, but not too close to 100% -EVAL_MICRO_BATCH_SIZE=1 - -#dummy arguments to make megatron happy. -MEGATRON_REQUIRED_ARGS=" \ - --num-layers -1 \ - --hidden-size -1 \ - --num-attention-heads -1 \ - --seq-length -1 \ - --max-position-embeddings -1 \ -" - - -ZERO_STAGE=0 - -config_json="./ds_config.json" - -# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() -cat < $config_json -{ - "train_micro_batch_size_per_gpu": 1, - "train_batch_size": 1, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": $ZERO_STAGE - }, - "bf16": { - "enabled": true - }, - "steps_per_print": 2000, - "wall_clock_breakdown": false -} -EOT - - -CMD="./tasks/eval_harness/benchmark.py \ - --load $CHECKPOINT_PATH \ - --results_path $VARIANT-results.json \ - --tensor-model-parallel-size $TP_SIZE \ - --pipeline-model-parallel-size $PP_SIZE \ - --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ - --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ - --no-load-optim \ - --no-load-rng \ - --bf16 \ - --inference \ - --seq-length $SEQ_LEN \ - --task_list arc_challenge,arc_easy,boolq,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc \ - --deepspeed \ - --deepspeed_config ds_config.json \ - --bootstrap_iters 2 \ - --intermed_results \ - --adaptive_seq_len \ - --micro_bs_multiplier 4 \ - $MEGATRON_REQUIRED_ARGS \ - " - -GPUS_PER_NODE=8 -NNODES=$SLURM_NNODES -MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) -MASTER_PORT=6000 -export LAUNCHER="python -u -m torch.distributed.run \ - --nproc_per_node $GPUS_PER_NODE \ - --nnodes $NNODES \ - --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - " - -export CUDA_LAUNCH_BLOCKING=1 - -echo $LAUNCHER $CMD - -export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO - -$LAUNCHER $CMD 2>&1 | tee $VARIANT-eval-harness.log From 0214bb77ac8f7e057b9f4326302d9bae48ed8d3a Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Wed, 29 Jun 2022 13:11:43 +0200 Subject: [PATCH 51/69] Remove duplicates --- examples/run_evalharness.sh | 34 ----- examples/run_evalharness_deepspeed.md | 156 
-------------------- examples/run_evalharness_deepspeed.slurm | 98 ------------ examples/run_evalharness_tr11-176b-ml.slurm | 121 --------------- 4 files changed, 409 deletions(-) delete mode 100644 examples/run_evalharness.sh delete mode 100644 examples/run_evalharness_deepspeed.md delete mode 100644 examples/run_evalharness_deepspeed.slurm delete mode 100644 examples/run_evalharness_tr11-176b-ml.slurm diff --git a/examples/run_evalharness.sh b/examples/run_evalharness.sh deleted file mode 100644 index 3cb5aed05..000000000 --- a/examples/run_evalharness.sh +++ /dev/null @@ -1,34 +0,0 @@ -CHECKPOINT_PATH=/gpfsscratch/rech/bbv/utw68ny/checkpoints/tr3m-1B3-pile/global_step296023/ - -PP_SIZE=1 -TP_SIZE=1 -VOCAB_FILE=gpt2-vocab.json -MERGE_FILE=gpt2-merges.txt - -export HF_DATASETS_OFFLINE=1 - -#dummy arguments to make megatron happy. -MEGATRON_REQUIRED_ARGS="\ - --num-layers -1\ - --hidden-size -1\ - --num-attention-heads -1\ - --seq-length -1 \ - --max-position-embeddings -1 -" - -CMD="./tasks/eval_harness/evaluate.py \ - --load $CHECKPOINT_PATH\ - --tensor-model-parallel-size $TP_SIZE \ - --pipeline-model-parallel-size $PP_SIZE\ - --vocab-file $VOCAB_FILE\ - --merge-file $MERGE_FILE\ - --micro-batch-size 64\ - --adaptive_seq_len\ - --eval_fp32\ - --task_list hellaswag,mrpc,piqa\ - $MEGATRON_REQUIRED_ARGS\ - " - -N_GPUS=1 -LAUNCHER="deepspeed --num_gpus $N_GPUS" -$LAUNCHER $CMD \ No newline at end of file diff --git a/examples/run_evalharness_deepspeed.md b/examples/run_evalharness_deepspeed.md deleted file mode 100644 index 695d9d0aa..000000000 --- a/examples/run_evalharness_deepspeed.md +++ /dev/null @@ -1,156 +0,0 @@ -# How to run lm-eval on Megatron-DeepSpeed checkpoint using the original setup - -This particular setup uses the normal deepspeed checkpoint and requires no conversion to Megatron-LM. - -This doc assumes usage on JZ, so some peculiar requirements in places. Ignore these if you're not running this on JZ. - -## Prerequisites - -1. Install software - -On login console with external network - -Get lm-eval harness (https://github.com/EleutherAI/lm-evaluation-harness) and `best-download==0.0.7` needed to download some tasks. -``` -start-prod -pip install best-download==0.0.7 -pip install git+https://github.com/EleutherAI/lm-evaluation-harness -``` - -2. Pre-download needed datasets - -some symlinks due to lm-harness' issues with relative position of data -``` -mkdir data -ln -s `pwd`/data tasks/eval_harness/data -``` -Also make sure `data` is not on one of the limited paritions like WORKSF. - -Then install datasets for the tasks: -``` -python ./tasks/eval_harness/download.py --task_list -arc_challenge,arc_easy,boolq,copa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc -``` -and make sure that `export HF_DATASETS_OFFLINE=1` - -If there are things like custom tokenizers, pre-download those too, e.g.: - -``` -python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('bigscience/oscar_13_languages_alpha_weight')" -``` -and make sure that `export TRANSFORMERS_OFFLINE=1` is in the script. -You know there is a custom tokenizer if the training script had something like: - -``` ---tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path bigscience/oscar_13_languages_alpha_weight \ -``` - -3. 
Prepare the slurm script - -Prepare the run script, replace `variant` with a unique identifier for the current eval so that multiple evals could run in parallel and not all log into the same `results.json` file. so, e.g., `tr9c-1B3-swiglu` - -``` -cp examples/run_evalharness_deepspeed.slurm run_evalharness-variant.slurm -``` - -now edit `run_evalharness-variant.slurm` - - -Note that the eval code knows to pull the original training args from the checkpoint, so we don't need to pass any of those. And we just need to setup the evaluation args. - -1. Edit: - -``` -PP_SIZE=1 -TP_SIZE=1 -``` -to match the eval topology. If the model fits into 1 gpu, then there is nothing to change. - -The eval script will automatically reshape the model if it was of a different topology. - - -2. Adjust the following to fit the chosen GPU. As of last check for 1.3B model the settings are one of: -``` -EVAL_MICRO_BATCH_SIZE=6 # 16GB GPU 1.3B model -EVAL_MICRO_BATCH_SIZE=12 # 32GB GPU 1.3B model -``` - -If you get OOM lower it further. - -3. If not using the Deepspeed path, disable it by removing: - -``` - --deepspeed \ - --deepspeed_config ds_config.json \ -``` - -If you didn't disable it and the program crashed on checkpoint loading unable to find some key, disable deepspeed as explained above. - -4. Additional flags - -- To reduce the amount of iterations for stderr estimation, use e.g. `--bootstrap_iters 2`. This saves 1-2 minutes per dataset. -- To print intermediate results when running multiple tasks use `--intermed_results`. -- To reduce the bubble when setting PP use the flag `--micro_bs_multiplier`. Reducing `--micro-batch-size` may be needed when increasing the multiplier. - - Running the 176B model with PP=8, `--micro_bs_multiplier 8` & `--micro-batch-size 4` produced the fastest results for PiQA on 1 node in 2min18s. - -## Eval - -Currently it takes 2-3 hours to run on 32GB for 1.3B model, 6-7h for 16GB GPU, so a 20h slurm job should be enough. - -When ready, launch: -``` -sbatch ./run_evalharness-variant.slurm -``` - -To monitor progress: -``` -tail -f tail -f $VARIANT-eval-harness.log -``` -where the variant is what you set `$VARIANT` to in the slurm script. - -The template is set up for 16GB gpu since they are easier to get by. If you change to 32GB, adjust: -``` -#SBATCH --constraint=v100-32g -... -EVAL_MICRO_BATCH_SIZE=12 # 32GB GPU 1.3B model -``` - - -Note that the original ETA at the start of the run can be 10x too longer than the actual outcome. For example it may suggest 18 hours but will complete in 2 hours. - - -## Short eval - -if you just want to quickly test that everything can run to the end, edit `tasks/eval_harness/evaluate.py`, e.g. to run only 10 batches: -``` -- results = evaluator.evaluate(adaptor, task_dict, False, 0, None) -+ results = evaluator.evaluate(adaptor, task_dict, False, 0, 10) -``` - -(XXX: could be a cmd line option so that code won't need to be modified) - - -## Import into spreadsheet - -https://docs.google.com/spreadsheets/d/1CI8Q9RCblLRzUOPJ6ViqBmo284-8ojluQ-CmaEuhuv0/edit?usp=sharing - -Note that the spreadsheet format is quite different, so use this script: -``` -./tasks/eval_harness/report-to-csv.py results.json -``` -to reformat the json results into csv while changing its shape to match the spreadsheet format - -Since some records might be missing or extraneous here is the best way to do it: - -1. copy the data from first 2 columns to some place under the main spreadsheet - -2. put the pointer to the 3rd column next to where the 2 first columns were copied. 
- -3. import `results.csv` using file-> import -> file -> - -Import location: Replace data at selected cell - -4. Now it should be easy to align the new records with the old ones - delete irrelevant records and Insert->Cells where data is missing until the first 2 columns match - -5. now create 2 cols in the main table on top and now it should be safe to Copy-n-Paste the 2-col data range, without the task/metrics columns into the newly created space. diff --git a/examples/run_evalharness_deepspeed.slurm b/examples/run_evalharness_deepspeed.slurm deleted file mode 100644 index e58ed9608..000000000 --- a/examples/run_evalharness_deepspeed.slurm +++ /dev/null @@ -1,98 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=eval-harness-deepspeed -#SBATCH --constraint=v100-16g -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! -#SBATCH --cpus-per-task=40 # number of cores per tasks -#SBATCH --hint=nomultithread # we get physical cores not logical -#SBATCH --gres=gpu:1 # number of gpus -#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=%x-%j.out # output file name -#SBATCH --account=six@gpu - - -set -x -e - -source $six_ALL_CCFRWORK/start-prod - -echo "START TIME: $(date)" - -# a unique identifier for the current eval so that multiple evals could run in parallel and not all log into the same "results.json" file. -VARIANT="tr9c-1B3-swiglu" - -CHECKPOINT_PATH=/gpfsdsstore/projects/rech/six/commun/checkpoints/tr3m-1B3-emb-norm-pile/global_step296023 -MEGATRON_DEEPSPEED_REPO=/gpfsssd/worksf/projects/rech/six/commun/code/eval/Megatron-DeepSpeed - -# you want these 2 on JZ, and pre-download/cache any datasets/tokenizers/models -# but comment these out if you're running on a node with Internet access -export HF_DATASETS_OFFLINE=1 -export TRANSFORMERS_OFFLINE=1 - -cd $MEGATRON_DEEPSPEED_REPO - -# eval topology -PP_SIZE=1 -TP_SIZE=1 - -VOCAB_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-vocab.json -MERGE_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-merges.txt -SEQ_LEN=2048 - -# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS -# make as big as it can fit into gpu w/o OOM, but not too close to 100% - -EVAL_MICRO_BATCH_SIZE=6 # 16GB GPU 1.3B model -#EVAL_MICRO_BATCH_SIZE=12 # 32GB GPU 1.3B model - - -#dummy arguments to make megatron happy. 
-MEGATRON_REQUIRED_ARGS=" \ - --num-layers -1 \ - --hidden-size -1 \ - --num-attention-heads -1 \ - --seq-length -1 \ - --max-position-embeddings -1 -" - - -ZERO_STAGE=0 - -config_json="./ds_config.json" -cat < $config_json -{ - "train_micro_batch_size_per_gpu": 1, - "train_batch_size": 1, - "zero_optimization": { "stage": $ZERO_STAGE }, - "fp16": { "enabled": true }, - "steps_per_print": 2000, - "wall_clock_breakdown": false -} -EOT - -CMD="./tasks/eval_harness/evaluate.py \ - --load $CHECKPOINT_PATH \ - --results_path $VARIANT-results.json \ - --tensor-model-parallel-size $TP_SIZE \ - --pipeline-model-parallel-size $PP_SIZE \ - --vocab-file $VOCAB_FILE \ - --merge-file $MERGE_FILE \ - --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ - --no-load-optim \ - --no-load-rng \ - --inference \ - --deepspeed \ - --deepspeed_config ds_config.json \ - --seq-length $SEQ_LEN \ - --adaptive_seq_len \ - --eval_fp32 \ - --task_list arc_challenge,arc_easy,boolq,copa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sst,webqs,wic,winogrande,wnli,wsc,triviaqa,sciq \ - $MEGATRON_REQUIRED_ARGS \ - " - -N_GPUS=1 -LAUNCHER="deepspeed --num_gpus $N_GPUS" -echo $LAUNCHER $CMD - -export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO - -$LAUNCHER $CMD 2>&1 | tee $VARIANT-eval-harness.log diff --git a/examples/run_evalharness_tr11-176b-ml.slurm b/examples/run_evalharness_tr11-176b-ml.slurm deleted file mode 100644 index 6d4849461..000000000 --- a/examples/run_evalharness_tr11-176b-ml.slurm +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=run_evalharness-tr11-176b-ml -#SBATCH --partition=gpu_p5 -#SBATCH --constraint=a100 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! -#SBATCH --cpus-per-task=64 # number of cores per tasks -#SBATCH --hint=nomultithread # we get physical cores not logical -#SBATCH --gres=gpu:8 # number of gpus -#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=%x-%j.out # output file name -#SBATCH --account=six@a100 - - -set -x -e - -source $six_ALL_CCFRWORK/start-py38-pt111 - -echo "START TIME: $(date)" - -# a unique identifier for the current eval ideally correspnding to the modelname -VARIANT="tr11-176b-ml" - - -CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step50000 -MEGATRON_DEEPSPEED_REPO=/gpfsssd/worksf/projects/rech/six/commun/code/eval/Megatron-DeepSpeed -export HF_DATASETS_OFFLINE=1 -export TRANSFORMERS_OFFLINE=1 - -export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models -export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets -export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules -export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics - -cd $MEGATRON_DEEPSPEED_REPO - -TOKENIZER_NAME_OR_PATH=bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles - -PP_SIZE=8 -TP_SIZE=1 -SEQ_LEN=2048 - -# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS -# make as big as it can fit into gpu w/o OOM, but not too close to 100% -EVAL_MICRO_BATCH_SIZE=1 - -#dummy arguments to make megatron happy. 
-MEGATRON_REQUIRED_ARGS=" \ - --num-layers -1 \ - --hidden-size -1 \ - --num-attention-heads -1 \ - --seq-length -1 \ - --max-position-embeddings -1 \ -" - - -ZERO_STAGE=0 - -config_json="./ds_config.json" - -# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() -cat < $config_json -{ - "train_micro_batch_size_per_gpu": 1, - "train_batch_size": 1, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": $ZERO_STAGE - }, - "bf16": { - "enabled": true - }, - "steps_per_print": 2000, - "wall_clock_breakdown": false -} -EOT - - -CMD="./tasks/eval_harness/evaluate.py \ - --load $CHECKPOINT_PATH \ - --results_path $VARIANT-results.json \ - --tensor-model-parallel-size $TP_SIZE \ - --pipeline-model-parallel-size $PP_SIZE \ - --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ - --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ - --no-load-optim \ - --no-load-rng \ - --bf16 \ - --inference \ - --seq-length $SEQ_LEN \ - --task_list arc_challenge,arc_easy,boolq,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc \ - --deepspeed \ - --deepspeed_config ds_config.json \ - --bootstrap_iters 2 \ - --intermed_results \ - --adaptive_seq_len \ - --micro_bs_multiplier 4 \ - $MEGATRON_REQUIRED_ARGS \ - " - -GPUS_PER_NODE=8 -NNODES=$SLURM_NNODES -MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) -MASTER_PORT=6000 -export LAUNCHER="python -u -m torch.distributed.run \ - --nproc_per_node $GPUS_PER_NODE \ - --nnodes $NNODES \ - --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --tee 3 \ - " - -export CUDA_LAUNCH_BLOCKING=1 - -echo $LAUNCHER $CMD - -export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO - -$LAUNCHER $CMD 2>&1 | tee $VARIANT-eval-harness.log From 2ce9ff6753c0a1375809ef7ccf11b91ad823aec4 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Wed, 29 Jun 2022 13:13:01 +0200 Subject: [PATCH 52/69] Remove superfluous string --- tasks/eval_harness/evaluate_bsevalharness.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index 595e70e66..1206b7080 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -8,9 +8,6 @@ pip install "promptsource @ git+https://github.com/bigscience-workshop/promptsource@eval-hackathon" pip install -e ".[dev]" & then: https://github.com/bigscience-workshop/bigscience/blob/12f06bd39221f2e3788524ea86139ac1ac2b1b1a/jz/envs/README.md#creating-production-conda-env - -Usage: - """ from functools import reduce From 1fa06184ad81137e1db1c6be6dabe8644a1de988 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Thu, 30 Jun 2022 19:26:11 +0200 Subject: [PATCH 53/69] Add emission & example file --- .../run_bsevalharness_tr11-176b-ml.slurm | 6 +- .../run_evalharness_tr11-176b-ml.slurm | 1 - tasks/eval_harness/evaluate_bsevalharness.py | 78 ++++++++++++------- 3 files changed, 55 insertions(+), 30 deletions(-) diff --git a/examples/evalharness/run_bsevalharness_tr11-176b-ml.slurm b/examples/evalharness/run_bsevalharness_tr11-176b-ml.slurm index 722259d49..b96514c6c 100644 --- a/examples/evalharness/run_bsevalharness_tr11-176b-ml.slurm +++ b/examples/evalharness/run_bsevalharness_tr11-176b-ml.slurm @@ -24,7 +24,7 @@ VARIANT="tr11-176b-ml-bsevalharness" 
CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step90000 -MEGATRON_DEEPSPEED_REPO=$six_ALL_CCFRSCRATCH/commun/experiments/muennighoff/Megatron-DeepSpeed +MEGATRON_DEEPSPEED_REPO=$six_ALL_CCFRSCRATCH/commun/experiments/muennighoff/megdsbslmeval/Megatron-DeepSpeed export HF_DATASETS_OFFLINE=1 export TRANSFORMERS_OFFLINE=1 @@ -90,10 +90,10 @@ CMD="./tasks/eval_harness/evaluate_bsevalharness.py \ --bf16 \ --inference \ --seq-length $SEQ_LEN \ - --task_list tydiqa_primary \ + --task_list wnli \ --deepspeed \ --deepspeed_config ds_config.json \ - --bootstrap_iters 2 \ + --intermed_results \ --adaptive_seq_len \ --micro_bs_multiplier 4 \ $MEGATRON_REQUIRED_ARGS \ diff --git a/examples/evalharness/run_evalharness_tr11-176b-ml.slurm b/examples/evalharness/run_evalharness_tr11-176b-ml.slurm index 6d4849461..83fb0ff9f 100644 --- a/examples/evalharness/run_evalharness_tr11-176b-ml.slurm +++ b/examples/evalharness/run_evalharness_tr11-176b-ml.slurm @@ -92,7 +92,6 @@ CMD="./tasks/eval_harness/evaluate.py \ --task_list arc_challenge,arc_easy,boolq,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc \ --deepspeed \ --deepspeed_config ds_config.json \ - --bootstrap_iters 2 \ --intermed_results \ --adaptive_seq_len \ --micro_bs_multiplier 4 \ diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index 1206b7080..115ecde40 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -10,15 +10,14 @@ & then: https://github.com/bigscience-workshop/bigscience/blob/12f06bd39221f2e3788524ea86139ac1ac2b1b1a/jz/envs/README.md#creating-production-conda-env """ -from functools import reduce -from logging import logMultiprocessing +import logging import os import sys import datetime sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir,os.path.pardir))) -# from lm_eval.models.gpt2 import GPT2LM +from codecarbon import OfflineEmissionsTracker from lm_eval import evaluator, tasks from lm_eval.api import utils from lm_eval.api.model import CacheHook @@ -39,7 +38,6 @@ from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model from megatron.p2p_communication import recv_forward, send_forward -import pickle import json from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP @@ -47,6 +45,20 @@ from megatron.model.module import Float16Module from deepspeed.runtime.pipe import schedule + +def setup_example_logger(output_path): + """ + Sets up a logger that will save each example and prediction. 
+ Copied from https://github.com/bigscience-workshop/lm-evaluation-harness/blob/2d968c60fc8bd808e5e475ca300781f774d234c1/main.py#L74 + """ + example_logger = logging.getLogger("examples") + filename = f"./examples-{output_path}.jsonl" + formatter = logging.Formatter("%(message)s") + handler = logging.FileHandler(filename) + handler.setFormatter(formatter) + example_logger.addHandler(handler) + example_logger.setLevel(logging.INFO) + class EvalHarnessAdaptor: def __init__(self, model, tokenizer): args = get_args() @@ -430,30 +442,44 @@ def main(): tokenizer = get_tokenizer() adaptor = EvalHarnessAdaptor(model, tokenizer) - - if args.intermed_results: - global_results = {"results": {}, "versions": {}} - timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') - iteration_id = load_path.split("/")[-1].replace("/", "") - results_path = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}.json") - # Backup file in case of interruption during writing - results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_backup.json") - for task_name, task in task_dict.items(): - results = evaluator.evaluate(lm=adaptor, task_dict={task_name: task}, bootstrap_iters=args.bootstrap_iters) - global_results["results"] = {**global_results["results"], **results["results"]} - global_results["versions"] = {**global_results["versions"], **results["versions"]} + + def add_config(results): + results["config"] = { + "adaptive_seq_len": args.adaptive_seq_len, + "num_fewshot": 0, + "bootstrap_iters": args.bootstrap_iters, + } + return results + + with OfflineEmissionsTracker(country_iso_code="FRA", log_level="error"): + if args.intermed_results: + global_results = {"results": [], "versions": {}, "table_results": {}} + timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + iteration_id = load_path.split("/")[-1].replace("/", "") + results_path = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}.json") + # Backup file in case of interruption during writing + results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_backup.json") + examples_path = results_path.replace(".json", "_examples") + setup_example_logger(examples_path) + for task_name, task in task_dict.items(): + results = evaluator.evaluate(lm=adaptor, task_dict={task_name: task}, bootstrap_iters=args.bootstrap_iters) + global_results["results"].extend(results["results"]) + global_results["versions"] = {**global_results["versions"], **results["versions"]} + global_results["table_results"] = {**global_results["table_results"], **results["table_results"]} + global_results = add_config(global_results) + if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: + print(json.dumps(results, indent=2)) + with open(results_path, 'w') as outfile: + json.dump(global_results, outfile, indent=4) + with open(results_path_backup, 'w') as outfile: + json.dump(global_results, outfile, indent=4) + else: + global_results = evaluator.evaluate(lm=adaptor, task_dict=task_dict, bootstrap_iters=args.bootstrap_iters) + global_results = add_config(global_results) if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: - print(json.dumps(results, indent=2)) - with open(results_path, 'w') as outfile: - json.dump(global_results, outfile, indent=4) - with open(results_path_backup, 'w') as outfile: + print(json.dumps(global_results, indent=2)) + with open(args.results_path, 'w') as outfile: 
json.dump(global_results, outfile, indent=4) - else: - global_results = evaluator.evaluate(lm=adaptor, task_dict=task_dict, bootstrap_iters=args.bootstrap_iters) - if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: - print(json.dumps(global_results, indent=2)) - with open(args.results_path, 'w') as outfile: - json.dump(global_results, outfile, indent=4) if __name__ == '__main__': main() From 9af3e02e15d95c7048de135af8f7ddb344d72457 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 5 Jul 2022 10:38:51 +0200 Subject: [PATCH 54/69] Add downloading --- .../run_bsevalharness_tr11-350M-ml.slurm | 122 ++++++++++++++++++ tasks/eval_harness/download.py | 2 - tasks/eval_harness/download_bsevalharness.py | 21 +++ 3 files changed, 143 insertions(+), 2 deletions(-) create mode 100644 examples/evalharness/run_bsevalharness_tr11-350M-ml.slurm create mode 100644 tasks/eval_harness/download_bsevalharness.py diff --git a/examples/evalharness/run_bsevalharness_tr11-350M-ml.slurm b/examples/evalharness/run_bsevalharness_tr11-350M-ml.slurm new file mode 100644 index 000000000..bbdd047d8 --- /dev/null +++ b/examples/evalharness/run_bsevalharness_tr11-350M-ml.slurm @@ -0,0 +1,122 @@ +#!/bin/bash +#SBATCH --job-name=run_bsevalharness-tr11-350M-ml +#SBATCH --partition=gpu_p13 +#SBATCH --constraint=v100 +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=20 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:2 # number of gpus +#SBATCH --time 5:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name +#SBATCH --account=six@v100 +#SBATCH -C v100-32g + + +set -x -e + +source $six_ALL_CCFRWORK/start-muennighofflmeval + +echo "START TIME: $(date)" + +# a unique identifier for the current eval ideally correspnding to the modelname +VARIANT="tr11-350M-ml-bsevalharness" + + +CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11e-350M-ml/checkpoints/main/659500 + +MEGATRON_DEEPSPEED_REPO=$six_ALL_CCFRSCRATCH/commun/experiments/muennighoff/megdsbslmeval/Megatron-DeepSpeed +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics + +cd $MEGATRON_DEEPSPEED_REPO + +TOKENIZER_NAME_OR_PATH=bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles + +PP_SIZE=1 +TP_SIZE=1 +SEQ_LEN=2048 + +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 + +#dummy arguments to make megatron happy. 
+MEGATRON_REQUIRED_ARGS=" \ + --num-layers -1 \ + --hidden-size -1 \ + --num-attention-heads -1 \ + --seq-length -1 \ + --max-position-embeddings -1 \ +" + + +ZERO_STAGE=0 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + + +CMD="./tasks/eval_harness/evaluate_bsevalharness.py \ + --load $CHECKPOINT_PATH \ + --results_path $VARIANT-results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --bf16 \ + --inference \ + --seq-length $SEQ_LEN \ + --task_list wnli \ + --deepspeed \ + --deepspeed_config ds_config.json \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier 4 \ + $MEGATRON_REQUIRED_ARGS \ + " + +GPUS_PER_NODE=2 +NNODES=$SLURM_NNODES +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +export CUDA_LAUNCH_BLOCKING=1 + +echo $LAUNCHER $CMD + +export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO + +$LAUNCHER $CMD 2>&1 | tee $VARIANT-eval-harness.log diff --git a/tasks/eval_harness/download.py b/tasks/eval_harness/download.py index d2abcd83a..81ed018e6 100644 --- a/tasks/eval_harness/download.py +++ b/tasks/eval_harness/download.py @@ -19,5 +19,3 @@ def main(): if __name__ == '__main__': main() - - \ No newline at end of file diff --git a/tasks/eval_harness/download_bsevalharness.py b/tasks/eval_harness/download_bsevalharness.py new file mode 100644 index 000000000..5f313516c --- /dev/null +++ b/tasks/eval_harness/download_bsevalharness.py @@ -0,0 +1,21 @@ +# Downloads the specified taks in the evaluation harness +# This is particularly useful when running in environments where the GPU nodes +# do not have internet access. This way we can pre-download them and use the cached data-set during evaluation. 
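# --- Illustrative aside (editor's sketch, not part of the patch): the intended offline
# workflow for the downloader added here. The task names are only examples.
# 1) on a login node with internet access, pre-download / cache the datasets, e.g.:
#      python tasks/eval_harness/download_bsevalharness.py --task_list wnli,boolq
# 2) on the GPU node, force the HF libraries to rely on the local cache only
#    (as the slurm scripts in this series do):
import os
os.environ["HF_DATASETS_OFFLINE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"
# --- end of aside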
+ +from lm_eval import tasks +from lm_eval.tasks import ALL_TASKS +import argparse +import os + + +parser = argparse.ArgumentParser(description='Download evaluation harness', allow_abbrev=False) +parser.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks to download.') +args = parser.parse_args() + +def main(): + task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') + tasks.get_task_dict_promptsource(task_list) + +if __name__ == '__main__': + main() + From f75af1f96d77e913fd729830374864fdb2817fff Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 5 Jul 2022 12:43:25 +0200 Subject: [PATCH 55/69] Offload to CPU earlier & increase number of bs in pipleine parallelism --- examples/evalharness/run_bsevalharness_tr11-176b-ml.slurm | 2 +- examples/evalharness/run_evalharness_tr11-176b-ml.slurm | 2 +- tasks/eval_harness/evaluate.py | 4 ++-- tasks/eval_harness/evaluate_bsevalharness.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/evalharness/run_bsevalharness_tr11-176b-ml.slurm b/examples/evalharness/run_bsevalharness_tr11-176b-ml.slurm index b96514c6c..7754b7db8 100644 --- a/examples/evalharness/run_bsevalharness_tr11-176b-ml.slurm +++ b/examples/evalharness/run_bsevalharness_tr11-176b-ml.slurm @@ -95,7 +95,7 @@ CMD="./tasks/eval_harness/evaluate_bsevalharness.py \ --deepspeed_config ds_config.json \ --intermed_results \ --adaptive_seq_len \ - --micro_bs_multiplier 4 \ + --micro_bs_multiplier 16 \ $MEGATRON_REQUIRED_ARGS \ " diff --git a/examples/evalharness/run_evalharness_tr11-176b-ml.slurm b/examples/evalharness/run_evalharness_tr11-176b-ml.slurm index 83fb0ff9f..cfa01ea3c 100644 --- a/examples/evalharness/run_evalharness_tr11-176b-ml.slurm +++ b/examples/evalharness/run_evalharness_tr11-176b-ml.slurm @@ -94,7 +94,7 @@ CMD="./tasks/eval_harness/evaluate.py \ --deepspeed_config ds_config.json \ --intermed_results \ --adaptive_seq_len \ - --micro_bs_multiplier 4 \ + --micro_bs_multiplier 16 \ $MEGATRON_REQUIRED_ARGS \ " diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 68dd649fd..e5277aaf0 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -157,7 +157,7 @@ def _collate(x): logits = self._model_call(torch.cat(inps, dim=0)) res_len += len(chunk) if logits is not None: - multi_logits = F.log_softmax(logits, dim=-1).cpu() # [batch, seq, vocab] + multi_logits = F.log_softmax(logits, dim=-1) # [batch, seq, vocab] for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(chunk, multi_logits, inps, inplens, contlens): contlen = len(cont_toks) @@ -217,7 +217,7 @@ def _model_call(self, inps): if output is not None: - output = torch.cat(output, 0)[:len(inps)] + output = torch.cat([o.cpu() for o in output], 0)[:len(inps)] else: output = None diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index 115ecde40..0f6131ddf 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -182,7 +182,7 @@ def _collate(x): logits = self._model_call(torch.cat(inps, dim=0)) res_len += len(chunk) if logits is not None: - multi_logits = F.log_softmax(logits, dim=-1).cpu() # [batch, seq, vocab] + multi_logits = F.log_softmax(logits, dim=-1) # [batch, seq, vocab] for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(chunk, multi_logits, inps, inplens, contlens): contlen = len(cont_toks) @@ -242,7 +242,7 @@ def _model_call(self, 
inps): if output is not None: - output = torch.cat(output, 0)[:len(inps)] + output = torch.cat([o.cpu() for o in output], 0)[:len(inps)] else: output = None From 9cf7ffd914fc0a25aa83074e98e67787cc8d0065 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 5 Jul 2022 14:31:25 +0200 Subject: [PATCH 56/69] Add offload arg --- tasks/eval_harness/evaluate.py | 11 +++++++++-- tasks/eval_harness/evaluate_bsevalharness.py | 14 +++++++++----- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index e5277aaf0..8c362f1a7 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -157,7 +157,10 @@ def _collate(x): logits = self._model_call(torch.cat(inps, dim=0)) res_len += len(chunk) if logits is not None: - multi_logits = F.log_softmax(logits, dim=-1) # [batch, seq, vocab] + if self.args.offloadearly: + multi_logits = logits + else: + multi_logits = F.log_softmax(logits, dim=-1).cpu() # [batch, seq, vocab] for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(chunk, multi_logits, inps, inplens, contlens): contlen = len(cont_toks) @@ -217,7 +220,10 @@ def _model_call(self, inps): if output is not None: - output = torch.cat([o.cpu() for o in output], 0)[:len(inps)] + if self.args.offloadearly: + output = torch.cat([F.log_softmax(o, dim=-1).cpu() for o in output[:len(inps)]], 0) + else: + output = torch.cat(output, 0)[:len(inps)] else: output = None @@ -390,6 +396,7 @@ def tasks_args(parser): group.add_argument('--intermed_results', default = False, action='store_true', help='Whether to print & write intermediate results for each task') group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr estimation') group.add_argument('--micro_bs_multiplier', type=int, default=1, help='Increase the global batch size to remove bubble when pipeline parallel') + group.add_argument('--offloadearly', default = False, action='store_true', help='Offloads logits to CPU earlier to allow using a higher micro_bs_multiplier - Speeds up eval by up to 1.5x for 176B') return parser from megatron.global_vars import _parse_args diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index 0f6131ddf..3e1c77c97 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -182,7 +182,10 @@ def _collate(x): logits = self._model_call(torch.cat(inps, dim=0)) res_len += len(chunk) if logits is not None: - multi_logits = F.log_softmax(logits, dim=-1) # [batch, seq, vocab] + if self.args.offloadearly: + multi_logits = logits + else: + multi_logits = F.log_softmax(logits, dim=-1).cpu() # [batch, seq, vocab] for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(chunk, multi_logits, inps, inplens, contlens): contlen = len(cont_toks) @@ -240,11 +243,11 @@ def _model_call(self, inps): output = self.model.eval_batch(iter(data_iterator), compute_loss = False, reduce_output = None) - if output is not None: - output = torch.cat([o.cpu() for o in output], 0)[:len(inps)] - else: - output = None + if self.args.offloadearly: + output = torch.cat([F.log_softmax(o, dim=-1).cpu() for o in output[:len(inps)]], 0) + else: + output = torch.cat(output, 0)[:len(inps)] # hack #2 for adaptive_seq_len to work as total_loss gets appended to and shapes aren't the same if args.adaptive_seq_len: @@ -415,6 +418,7 @@ def tasks_args(parser): group.add_argument('--intermed_results', default = False, 
action='store_true', help='Whether to print & write intermediate results for each task') group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr estimation') group.add_argument('--micro_bs_multiplier', type=int, default=1, help='Increase the global batch size to remove bubble when pipeline parallel') + group.add_argument('--offloadearly', default = False, action='store_true', help='Offloads logits to CPU earlier to allow using a higher micro_bs_multiplier - Speeds up eval by up to 1.5x for 176B') return parser from megatron.global_vars import _parse_args From 40cf8690ad1fa78e6bfb1829d678ec2c113246f0 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 5 Jul 2022 14:32:22 +0200 Subject: [PATCH 57/69] add offload arg to slurm scripts --- examples/evalharness/run_bsevalharness_tr11-176b-ml.slurm | 1 + examples/evalharness/run_evalharness_tr11-176b-ml.slurm | 1 + 2 files changed, 2 insertions(+) diff --git a/examples/evalharness/run_bsevalharness_tr11-176b-ml.slurm b/examples/evalharness/run_bsevalharness_tr11-176b-ml.slurm index 7754b7db8..c5710fc15 100644 --- a/examples/evalharness/run_bsevalharness_tr11-176b-ml.slurm +++ b/examples/evalharness/run_bsevalharness_tr11-176b-ml.slurm @@ -96,6 +96,7 @@ CMD="./tasks/eval_harness/evaluate_bsevalharness.py \ --intermed_results \ --adaptive_seq_len \ --micro_bs_multiplier 16 \ + --offloadearly \ $MEGATRON_REQUIRED_ARGS \ " diff --git a/examples/evalharness/run_evalharness_tr11-176b-ml.slurm b/examples/evalharness/run_evalharness_tr11-176b-ml.slurm index cfa01ea3c..f453ecdd8 100644 --- a/examples/evalharness/run_evalharness_tr11-176b-ml.slurm +++ b/examples/evalharness/run_evalharness_tr11-176b-ml.slurm @@ -95,6 +95,7 @@ CMD="./tasks/eval_harness/evaluate.py \ --intermed_results \ --adaptive_seq_len \ --micro_bs_multiplier 16 \ + --offloadearly \ $MEGATRON_REQUIRED_ARGS \ " From 93134669f48985e3c374fe8bfc7c35f6312e4700 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Wed, 6 Jul 2022 15:23:10 +0200 Subject: [PATCH 58/69] Fix setup_example_logger --- tasks/eval_harness/evaluate_bsevalharness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index 3e1c77c97..b0f8cc787 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -52,7 +52,7 @@ def setup_example_logger(output_path): Copied from https://github.com/bigscience-workshop/lm-evaluation-harness/blob/2d968c60fc8bd808e5e475ca300781f774d234c1/main.py#L74 """ example_logger = logging.getLogger("examples") - filename = f"./examples-{output_path}.jsonl" + filename = f"{output_path}.jsonl" formatter = logging.Formatter("%(message)s") handler = logging.FileHandler(filename) handler.setFormatter(formatter) From d0b2efaac782bcfb944c9b453e4b66118b89cae9 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 7 Jul 2022 00:03:39 +0200 Subject: [PATCH 59/69] Add torch barrier --- tasks/eval_harness/evaluate_bsevalharness.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index b0f8cc787..2d79e5e6f 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -180,6 +180,7 @@ def _collate(x): inplens.append(inplen) logits = self._model_call(torch.cat(inps, dim=0)) + 
torch.distributed.barrier() res_len += len(chunk) if logits is not None: if self.args.offloadearly: From 01dc62ac74c81d810a62b0557160f677c02afdb3 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 7 Jul 2022 00:06:00 +0200 Subject: [PATCH 60/69] Add torch barrier --- tasks/eval_harness/evaluate_bsevalharness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index 2d79e5e6f..3946baedd 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -180,7 +180,7 @@ def _collate(x): inplens.append(inplen) logits = self._model_call(torch.cat(inps, dim=0)) - torch.distributed.barrier() + torch.distributed.barrier() res_len += len(chunk) if logits is not None: if self.args.offloadearly: From c193ffc3233402139d7512da0a0175ba176346bf Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 7 Jul 2022 02:26:08 +0200 Subject: [PATCH 61/69] Improvement --- tasks/eval_harness/evaluate_bsevalharness.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index 3946baedd..dfde0795d 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -12,6 +12,7 @@ import logging import os +import random import sys import datetime sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), @@ -153,7 +154,13 @@ def _collate(x): return (-len(toks), tuple(toks)) reord = utils.Reorderer(requests, _collate) - for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size): + + if mpu.is_pipeline_last_stage(): + chunks = utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size) + else: + chunks = utils.chunks(reord.get_reordered(), self.batch_size) + + for chunk in chunks: inps, contlens, inplens, padding_length = [], [], [], None for _, context_enc, continuation_enc in chunk: # when too long to fit in context, truncate from the left @@ -420,6 +427,7 @@ def tasks_args(parser): group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr estimation') group.add_argument('--micro_bs_multiplier', type=int, default=1, help='Increase the global batch size to remove bubble when pipeline parallel') group.add_argument('--offloadearly', default = False, action='store_true', help='Offloads logits to CPU earlier to allow using a higher micro_bs_multiplier - Speeds up eval by up to 1.5x for 176B') + group.add_argument('--seed', default=42, type=int, help='Random state seed') return parser from megatron.global_vars import _parse_args @@ -456,6 +464,8 @@ def add_config(results): } return results + + random.seed(args.seed) with OfflineEmissionsTracker(country_iso_code="FRA", log_level="error"): if args.intermed_results: global_results = {"results": [], "versions": {}, "table_results": {}} @@ -467,7 +477,7 @@ def add_config(results): examples_path = results_path.replace(".json", "_examples") setup_example_logger(examples_path) for task_name, task in task_dict.items(): - results = evaluator.evaluate(lm=adaptor, task_dict={task_name: task}, bootstrap_iters=args.bootstrap_iters) + results = evaluator.evaluate(lm=adaptor, task_dict={task_name: task}, bootstrap_iters=args.bootstrap_iters, 
rng=np.random.seed(args.seed)) global_results["results"].extend(results["results"]) global_results["versions"] = {**global_results["versions"], **results["versions"]} global_results["table_results"] = {**global_results["table_results"], **results["table_results"]} @@ -479,7 +489,7 @@ def add_config(results): with open(results_path_backup, 'w') as outfile: json.dump(global_results, outfile, indent=4) else: - global_results = evaluator.evaluate(lm=adaptor, task_dict=task_dict, bootstrap_iters=args.bootstrap_iters) + global_results = evaluator.evaluate(lm=adaptor, task_dict=task_dict, bootstrap_iters=args.bootstrap_iters, rng=np.random.seed(args.seed)) global_results = add_config(global_results) if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: print(json.dumps(global_results, indent=2)) From 79cb56936b83f623c9959322b6d6a8c367bbae5c Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 7 Jul 2022 02:32:17 +0200 Subject: [PATCH 62/69] Be very careful of random states --- tasks/eval_harness/evaluate_bsevalharness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index dfde0795d..c23ef6591 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -214,7 +214,7 @@ def _collate(x): if not mpu.is_pipeline_last_stage(): # @HACK: To make the eval harness happy on threads that don't have access to the results. # We just randomly generate some data. - res = [(np.random.rand(), np.random.rand()>0.5) for _ in requests] + res = [(0.5, True) for _ in requests] return reord.get_original(res) From bd31b62c6e4056c58fe1d4951c1b68242c49607e Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 7 Jul 2022 02:54:05 +0200 Subject: [PATCH 63/69] Woops --- tasks/eval_harness/evaluate_bsevalharness.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index c23ef6591..0082cf575 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -427,7 +427,6 @@ def tasks_args(parser): group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr estimation') group.add_argument('--micro_bs_multiplier', type=int, default=1, help='Increase the global batch size to remove bubble when pipeline parallel') group.add_argument('--offloadearly', default = False, action='store_true', help='Offloads logits to CPU earlier to allow using a higher micro_bs_multiplier - Speeds up eval by up to 1.5x for 176B') - group.add_argument('--seed', default=42, type=int, help='Random state seed') return parser from megatron.global_vars import _parse_args @@ -477,7 +476,7 @@ def add_config(results): examples_path = results_path.replace(".json", "_examples") setup_example_logger(examples_path) for task_name, task in task_dict.items(): - results = evaluator.evaluate(lm=adaptor, task_dict={task_name: task}, bootstrap_iters=args.bootstrap_iters, rng=np.random.seed(args.seed)) + results = evaluator.evaluate(lm=adaptor, task_dict={task_name: task}, bootstrap_iters=args.bootstrap_iters, rng=np.random.default_rng(args.seed)) global_results["results"].extend(results["results"]) global_results["versions"] = {**global_results["versions"], **results["versions"]} 
global_results["table_results"] = {**global_results["table_results"], **results["table_results"]} @@ -489,7 +488,7 @@ def add_config(results): with open(results_path_backup, 'w') as outfile: json.dump(global_results, outfile, indent=4) else: - global_results = evaluator.evaluate(lm=adaptor, task_dict=task_dict, bootstrap_iters=args.bootstrap_iters, rng=np.random.seed(args.seed)) + global_results = evaluator.evaluate(lm=adaptor, task_dict=task_dict, bootstrap_iters=args.bootstrap_iters, rng=np.random.default_rng(args.seed)) global_results = add_config(global_results) if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: print(json.dumps(global_results, indent=2)) From c6f76025b205e6b5a08bcb119bf2680f68d1329f Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 7 Jul 2022 03:08:08 +0200 Subject: [PATCH 64/69] This is already done correctly --- tasks/eval_harness/evaluate_bsevalharness.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index 0082cf575..6112e9409 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -155,12 +155,7 @@ def _collate(x): reord = utils.Reorderer(requests, _collate) - if mpu.is_pipeline_last_stage(): - chunks = utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size) - else: - chunks = utils.chunks(reord.get_reordered(), self.batch_size) - - for chunk in chunks: + for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size): inps, contlens, inplens, padding_length = [], [], [], None for _, context_enc, continuation_enc in chunk: # when too long to fit in context, truncate from the left From 6105fe4a67ebb7b81595a60106ee16de7b7400df Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 7 Jul 2022 03:19:17 +0200 Subject: [PATCH 65/69] Filter out generative tasks --- tasks/eval_harness/evaluate_bsevalharness.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index 6112e9409..4ea5a1979 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -442,6 +442,10 @@ def main(): task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') task_dict = tasks.get_task_dict_promptsource(task_list) + task_dict = {task_name: task for task_name, task in task_dict.items() if task.need_greedy_until is False} + if len(task_dict) == 0: + print_rank_0("Early stopping as `greedy_until` is not implemented yet.") + return model.module.activation_checkpoint_interval = 0 model._compute_loss = False From 43936d9f7a391079261ca9e1767c7df5acb6db8f Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 7 Jul 2022 10:29:31 +0200 Subject: [PATCH 66/69] There's no BOS for bloom --- tasks/eval_harness/evaluate_bsevalharness.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index 4ea5a1979..0edc99aa6 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -123,9 +123,12 @@ def loglikelihood_rolling(self, requests): loglikelihoods = [] with torch.no_grad(): for string, in 
tqdm(requests): + tokens = self.tokenizer_encode(string) + prefix_token = tokens[0] + token_list = tokens[1:] rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows( - token_list=self.tokenizer_encode(string), - prefix_token=self.EOT_TOKEN_ID, + token_list=token_list, + prefix_token=prefix_token, max_seq_len=self.max_length, context_len=1, ))) From 280f1dcd5dde2fd17fca85451b58d01308f0b6e1 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 7 Jul 2022 11:44:59 +0200 Subject: [PATCH 67/69] Remove codecarbon --- tasks/eval_harness/evaluate_bsevalharness.py | 52 ++++++++++---------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/tasks/eval_harness/evaluate_bsevalharness.py b/tasks/eval_harness/evaluate_bsevalharness.py index 0edc99aa6..bf429e7d5 100644 --- a/tasks/eval_harness/evaluate_bsevalharness.py +++ b/tasks/eval_harness/evaluate_bsevalharness.py @@ -18,7 +18,6 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir,os.path.pardir))) -from codecarbon import OfflineEmissionsTracker from lm_eval import evaluator, tasks from lm_eval.api import utils from lm_eval.api.model import CacheHook @@ -467,35 +466,34 @@ def add_config(results): random.seed(args.seed) - with OfflineEmissionsTracker(country_iso_code="FRA", log_level="error"): - if args.intermed_results: - global_results = {"results": [], "versions": {}, "table_results": {}} - timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') - iteration_id = load_path.split("/")[-1].replace("/", "") - results_path = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}.json") - # Backup file in case of interruption during writing - results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_backup.json") - examples_path = results_path.replace(".json", "_examples") - setup_example_logger(examples_path) - for task_name, task in task_dict.items(): - results = evaluator.evaluate(lm=adaptor, task_dict={task_name: task}, bootstrap_iters=args.bootstrap_iters, rng=np.random.default_rng(args.seed)) - global_results["results"].extend(results["results"]) - global_results["versions"] = {**global_results["versions"], **results["versions"]} - global_results["table_results"] = {**global_results["table_results"], **results["table_results"]} - global_results = add_config(global_results) - if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: - print(json.dumps(results, indent=2)) - with open(results_path, 'w') as outfile: - json.dump(global_results, outfile, indent=4) - with open(results_path_backup, 'w') as outfile: - json.dump(global_results, outfile, indent=4) - else: - global_results = evaluator.evaluate(lm=adaptor, task_dict=task_dict, bootstrap_iters=args.bootstrap_iters, rng=np.random.default_rng(args.seed)) + if args.intermed_results: + global_results = {"results": [], "versions": {}, "table_results": {}} + timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + iteration_id = load_path.split("/")[-1].replace("/", "") + results_path = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}.json") + # Backup file in case of interruption during writing + results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_backup.json") + examples_path = results_path.replace(".json", "_examples") + setup_example_logger(examples_path) + for task_name, task in task_dict.items(): + 
results = evaluator.evaluate(lm=adaptor, task_dict={task_name: task}, bootstrap_iters=args.bootstrap_iters, rng=np.random.default_rng(args.seed)) + global_results["results"].extend(results["results"]) + global_results["versions"] = {**global_results["versions"], **results["versions"]} + global_results["table_results"] = {**global_results["table_results"], **results["table_results"]} global_results = add_config(global_results) if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: - print(json.dumps(global_results, indent=2)) - with open(args.results_path, 'w') as outfile: + print(json.dumps(results, indent=2)) + with open(results_path, 'w') as outfile: + json.dump(global_results, outfile, indent=4) + with open(results_path_backup, 'w') as outfile: json.dump(global_results, outfile, indent=4) + else: + global_results = evaluator.evaluate(lm=adaptor, task_dict=task_dict, bootstrap_iters=args.bootstrap_iters, rng=np.random.default_rng(args.seed)) + global_results = add_config(global_results) + if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: + print(json.dumps(global_results, indent=2)) + with open(args.results_path, 'w') as outfile: + json.dump(global_results, outfile, indent=4) if __name__ == '__main__': main() From 9a2277cee0f7a949020fbd823f5710f75485bd4b Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Sat, 16 Jul 2022 15:38:24 +0200 Subject: [PATCH 68/69] Add small model scripts --- .../run_bsevalharness_tr11b-1b3-ml.slurm | 122 ++++++++++++++++++ .../run_bsevalharness_tr11c-2b5-ml.slurm | 121 +++++++++++++++++ .../run_bsevalharness_tr11e-350m-ml.slurm | 120 +++++++++++++++++ .../run_evalharness_tr11b-1b3-ml.slurm | 120 +++++++++++++++++ .../run_evalharness_tr11c-2b5-ml.slurm | 120 +++++++++++++++++ ...rm => run_evalharness_tr11e-350m-ml.slurm} | 34 +++-- 6 files changed, 618 insertions(+), 19 deletions(-) create mode 100644 examples/evalharness/run_bsevalharness_tr11b-1b3-ml.slurm create mode 100644 examples/evalharness/run_bsevalharness_tr11c-2b5-ml.slurm create mode 100644 examples/evalharness/run_bsevalharness_tr11e-350m-ml.slurm create mode 100644 examples/evalharness/run_evalharness_tr11b-1b3-ml.slurm create mode 100644 examples/evalharness/run_evalharness_tr11c-2b5-ml.slurm rename examples/evalharness/{run_bsevalharness_tr11-350M-ml.slurm => run_evalharness_tr11e-350m-ml.slurm} (79%) diff --git a/examples/evalharness/run_bsevalharness_tr11b-1b3-ml.slurm b/examples/evalharness/run_bsevalharness_tr11b-1b3-ml.slurm new file mode 100644 index 000000000..988ba0b81 --- /dev/null +++ b/examples/evalharness/run_bsevalharness_tr11b-1b3-ml.slurm @@ -0,0 +1,122 @@ +#!/bin/bash +#SBATCH --job-name=run_bsevalharness-tr11b-1b3-ml +#SBATCH --partition=gpu_p5 +#SBATCH --constraint=a100 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
+#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:1 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name +#SBATCH --account=six@a100 +#SBATCH --reservation=hug + + +set -x -e + +source $six_ALL_CCFRWORK/start-muennighofflmeval + +echo "START TIME: $(date)" + +# a unique identifier for the current eval ideally correspnding to the modelname +VARIANT="tr11b-1b3-ml-bsevalharness" + + +CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11b-1B3-ml/checkpoints/main/global_step340500 +MEGATRON_DEEPSPEED_REPO=$six_ALL_CCFRSCRATCH/commun/experiments/muennighoff/megdsbslmeval/Megatron-DeepSpeed +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasetseval +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics +export TOKENIZERS_PARALLELISM=false + +cd $MEGATRON_DEEPSPEED_REPO + +TOKENIZER_NAME_OR_PATH=bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles + +PP_SIZE=1 +TP_SIZE=1 +SEQ_LEN=2048 + +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 + +#dummy arguments to make megatron happy. +MEGATRON_REQUIRED_ARGS=" \ + --num-layers -1 \ + --hidden-size -1 \ + --num-attention-heads -1 \ + --seq-length -1 \ + --max-position-embeddings -1 \ +" + + +ZERO_STAGE=0 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": false + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + + +CMD="./tasks/eval_harness/evaluate_bsevalharness.py \ + --load $CHECKPOINT_PATH \ + --results_path $VARIANT-results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --inference \ + --seq-length $SEQ_LEN \ + --task_list 
axb,axg,boolq,cb,cola,copa,crows_pairs_english,crows_pairs_french,diabla,e2e_nlg_cleaned,mnli,mnli_mismatched,multirc,piaf,qqp,rte,sst,tydiqa_primary,tydiqa_secondary,wic,wsc,wnli,wino_bias_type1_anti,wino_bias_type1_pro,wino_bias_type2_anti,wino_bias_type2_pro,xquad_ar,xquad_en,gsarti/flores_101_afr,gsarti/flores_101_amh,gsarti/flores_101_ara,gsarti/flores_101_hye,gsarti/flores_101_asm,gsarti/flores_101_ast,gsarti/flores_101_azj,gsarti/flores_101_bel,gsarti/flores_101_ben,gsarti/flores_101_bos,gsarti/flores_101_bul,gsarti/flores_101_mya,gsarti/flores_101_cat,gsarti/flores_101_ceb,gsarti/flores_101_zho_simpl,gsarti/flores_101_zho_trad,gsarti/flores_101_hrv,gsarti/flores_101_ces,gsarti/flores_101_dan,gsarti/flores_101_nld,gsarti/flores_101_eng,gsarti/flores_101_est,gsarti/flores_101_tgl,gsarti/flores_101_fin,gsarti/flores_101_fra,gsarti/flores_101_ful,gsarti/flores_101_glg,gsarti/flores_101_lug,gsarti/flores_101_kat,gsarti/flores_101_deu,gsarti/flores_101_ell,gsarti/flores_101_guj,gsarti/flores_101_hau,gsarti/flores_101_heb,gsarti/flores_101_hin,gsarti/flores_101_hun,gsarti/flores_101_isl,gsarti/flores_101_ibo,gsarti/flores_101_ind,gsarti/flores_101_gle,gsarti/flores_101_ita,gsarti/flores_101_jpn,gsarti/flores_101_jav,gsarti/flores_101_kea,gsarti/flores_101_kam,gsarti/flores_101_kan,gsarti/flores_101_kaz,gsarti/flores_101_khm,gsarti/flores_101_kor,gsarti/flores_101_kir,gsarti/flores_101_lao,gsarti/flores_101_lav,gsarti/flores_101_lin,gsarti/flores_101_lit,gsarti/flores_101_luo,gsarti/flores_101_ltz,gsarti/flores_101_mkd,gsarti/flores_101_msa,gsarti/flores_101_mal,gsarti/flores_101_mlt,gsarti/flores_101_mri,gsarti/flores_101_mar,gsarti/flores_101_mon,gsarti/flores_101_npi,gsarti/flores_101_nso,gsarti/flores_101_nob,gsarti/flores_101_nya,gsarti/flores_101_oci,gsarti/flores_101_ory,gsarti/flores_101_orm,gsarti/flores_101_pus,gsarti/flores_101_fas,gsarti/flores_101_pol,gsarti/flores_101_por,gsarti/flores_101_pan,gsarti/flores_101_ron,gsarti/flores_101_rus,gsarti/flores_101_srp,gsarti/flores_101_sna,gsarti/flores_101_snd,gsarti/flores_101_slk,gsarti/flores_101_slv,gsarti/flores_101_som,gsarti/flores_101_ckb,gsarti/flores_101_spa,gsarti/flores_101_swh,gsarti/flores_101_swe,gsarti/flores_101_tgk,gsarti/flores_101_tam,gsarti/flores_101_tel,gsarti/flores_101_tha,gsarti/flores_101_tur,gsarti/flores_101_ukr,gsarti/flores_101_umb,gsarti/flores_101_urd,gsarti/flores_101_uzb,gsarti/flores_101_vie,gsarti/flores_101_cym,gsarti/flores_101_wol,gsarti/flores_101_xho,gsarti/flores_101_yor,gsarti/flores_101_zul \ + --eval_fp32 \ + --deepspeed \ + --deepspeed_config ds_config.json \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier 8 \ + $MEGATRON_REQUIRED_ARGS \ + " + +GPUS_PER_NODE=1 +NNODES=$SLURM_NNODES +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +export CUDA_LAUNCH_BLOCKING=1 + +echo $LAUNCHER $CMD + +export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO + +$LAUNCHER $CMD 2>&1 | tee $VARIANT-eval-harness.log diff --git a/examples/evalharness/run_bsevalharness_tr11c-2b5-ml.slurm b/examples/evalharness/run_bsevalharness_tr11c-2b5-ml.slurm new file mode 100644 index 000000000..6a012442b --- /dev/null +++ b/examples/evalharness/run_bsevalharness_tr11c-2b5-ml.slurm @@ -0,0 +1,121 @@ +#!/bin/bash +#SBATCH 
--job-name=run_bsevalharness-tr11c-2b5-ml +#SBATCH --partition=gpu_p5 +#SBATCH --constraint=a100 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:1 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name +#SBATCH --account=six@a100 +#SBATCH --reservation=hug + + +set -x -e + +source $six_ALL_CCFRWORK/start-muennighofflmeval + +echo "START TIME: $(date)" + +# a unique identifier for the current eval ideally correspnding to the modelname +VARIANT="tr11c-2b5-ml-bsevalharness" + + +CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11c-2B5-ml/checkpoints/main/global_step337250 +MEGATRON_DEEPSPEED_REPO=$six_ALL_CCFRSCRATCH/commun/experiments/muennighoff/megdsbslmeval/Megatron-DeepSpeed +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasetseval +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics +export TOKENIZERS_PARALLELISM=false + +cd $MEGATRON_DEEPSPEED_REPO + +TOKENIZER_NAME_OR_PATH=bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles + +PP_SIZE=1 +TP_SIZE=1 +SEQ_LEN=2048 + +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 + +#dummy arguments to make megatron happy. +MEGATRON_REQUIRED_ARGS=" \ + --num-layers -1 \ + --hidden-size -1 \ + --num-attention-heads -1 \ + --seq-length -1 \ + --max-position-embeddings -1 \ +" + + +ZERO_STAGE=0 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": false + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + +CMD="./tasks/eval_harness/evaluate_bsevalharness.py \ + --load $CHECKPOINT_PATH \ + --results_path $VARIANT-results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --inference \ + --seq-length $SEQ_LEN \ + --task_list 
axb,axg,boolq,cb,cola,copa,crows_pairs_english,crows_pairs_french,diabla,e2e_nlg_cleaned,mnli,mnli_mismatched,multirc,piaf,qqp,rte,sst,tydiqa_primary,tydiqa_secondary,wic,wsc,wnli,wino_bias_type1_anti,wino_bias_type1_pro,wino_bias_type2_anti,wino_bias_type2_pro,xquad_ar,xquad_en,gsarti/flores_101_afr,gsarti/flores_101_amh,gsarti/flores_101_ara,gsarti/flores_101_hye,gsarti/flores_101_asm,gsarti/flores_101_ast,gsarti/flores_101_azj,gsarti/flores_101_bel,gsarti/flores_101_ben,gsarti/flores_101_bos,gsarti/flores_101_bul,gsarti/flores_101_mya,gsarti/flores_101_cat,gsarti/flores_101_ceb,gsarti/flores_101_zho_simpl,gsarti/flores_101_zho_trad,gsarti/flores_101_hrv,gsarti/flores_101_ces,gsarti/flores_101_dan,gsarti/flores_101_nld,gsarti/flores_101_eng,gsarti/flores_101_est,gsarti/flores_101_tgl,gsarti/flores_101_fin,gsarti/flores_101_fra,gsarti/flores_101_ful,gsarti/flores_101_glg,gsarti/flores_101_lug,gsarti/flores_101_kat,gsarti/flores_101_deu,gsarti/flores_101_ell,gsarti/flores_101_guj,gsarti/flores_101_hau,gsarti/flores_101_heb,gsarti/flores_101_hin,gsarti/flores_101_hun,gsarti/flores_101_isl,gsarti/flores_101_ibo,gsarti/flores_101_ind,gsarti/flores_101_gle,gsarti/flores_101_ita,gsarti/flores_101_jpn,gsarti/flores_101_jav,gsarti/flores_101_kea,gsarti/flores_101_kam,gsarti/flores_101_kan,gsarti/flores_101_kaz,gsarti/flores_101_khm,gsarti/flores_101_kor,gsarti/flores_101_kir,gsarti/flores_101_lao,gsarti/flores_101_lav,gsarti/flores_101_lin,gsarti/flores_101_lit,gsarti/flores_101_luo,gsarti/flores_101_ltz,gsarti/flores_101_mkd,gsarti/flores_101_msa,gsarti/flores_101_mal,gsarti/flores_101_mlt,gsarti/flores_101_mri,gsarti/flores_101_mar,gsarti/flores_101_mon,gsarti/flores_101_npi,gsarti/flores_101_nso,gsarti/flores_101_nob,gsarti/flores_101_nya,gsarti/flores_101_oci,gsarti/flores_101_ory,gsarti/flores_101_orm,gsarti/flores_101_pus,gsarti/flores_101_fas,gsarti/flores_101_pol,gsarti/flores_101_por,gsarti/flores_101_pan,gsarti/flores_101_ron,gsarti/flores_101_rus,gsarti/flores_101_srp,gsarti/flores_101_sna,gsarti/flores_101_snd,gsarti/flores_101_slk,gsarti/flores_101_slv,gsarti/flores_101_som,gsarti/flores_101_ckb,gsarti/flores_101_spa,gsarti/flores_101_swh,gsarti/flores_101_swe,gsarti/flores_101_tgk,gsarti/flores_101_tam,gsarti/flores_101_tel,gsarti/flores_101_tha,gsarti/flores_101_tur,gsarti/flores_101_ukr,gsarti/flores_101_umb,gsarti/flores_101_urd,gsarti/flores_101_uzb,gsarti/flores_101_vie,gsarti/flores_101_cym,gsarti/flores_101_wol,gsarti/flores_101_xho,gsarti/flores_101_yor,gsarti/flores_101_zul \ + --eval_fp32 \ + --deepspeed \ + --deepspeed_config ds_config.json \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier 8 \ + $MEGATRON_REQUIRED_ARGS \ + " + +GPUS_PER_NODE=1 +NNODES=$SLURM_NNODES +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +export CUDA_LAUNCH_BLOCKING=1 + +echo $LAUNCHER $CMD + +export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO + +$LAUNCHER $CMD 2>&1 | tee $VARIANT-eval-harness.log diff --git a/examples/evalharness/run_bsevalharness_tr11e-350m-ml.slurm b/examples/evalharness/run_bsevalharness_tr11e-350m-ml.slurm new file mode 100644 index 000000000..a53144a41 --- /dev/null +++ b/examples/evalharness/run_bsevalharness_tr11e-350m-ml.slurm @@ -0,0 +1,120 @@ +#!/bin/bash +#SBATCH 
--job-name=run_bsevalharness-tr11e-350m-ml +#SBATCH --constraint=v100-32g +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=10 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:1 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name +#SBATCH --account=six@v100 + + +set -x -e + +source $six_ALL_CCFRWORK/start-muennighofflmeval + +echo "START TIME: $(date)" + +# a unique identifier for the current eval ideally correspnding to the modelname +VARIANT="tr11e-350m-ml-bsevalharness" + + +CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11e-350M-ml/checkpoints/main/global_step659500 +MEGATRON_DEEPSPEED_REPO=$six_ALL_CCFRSCRATCH/commun/experiments/muennighoff/megdsbslmeval/Megatron-DeepSpeed +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasetseval +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics +export TOKENIZERS_PARALLELISM=false + +cd $MEGATRON_DEEPSPEED_REPO + +TOKENIZER_NAME_OR_PATH=bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles + +PP_SIZE=1 +TP_SIZE=1 +SEQ_LEN=2048 + +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 + +#dummy arguments to make megatron happy. +MEGATRON_REQUIRED_ARGS=" \ + --num-layers -1 \ + --hidden-size -1 \ + --num-attention-heads -1 \ + --seq-length -1 \ + --max-position-embeddings -1 \ +" + + +ZERO_STAGE=0 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": false + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + + +CMD="./tasks/eval_harness/evaluate_bsevalharness.py \ + --load $CHECKPOINT_PATH \ + --results_path $VARIANT-results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --inference \ + --seq-length $SEQ_LEN \ + --task_list 
axb,axg,boolq,cb,cola,copa,crows_pairs_english,crows_pairs_french,diabla,e2e_nlg_cleaned,mnli,mnli_mismatched,multirc,piaf,qqp,rte,sst,tydiqa_primary,tydiqa_secondary,wic,wsc,wnli,wino_bias_type1_anti,wino_bias_type1_pro,wino_bias_type2_anti,wino_bias_type2_pro,xquad_ar,xquad_en,gsarti/flores_101_afr,gsarti/flores_101_amh,gsarti/flores_101_ara,gsarti/flores_101_hye,gsarti/flores_101_asm,gsarti/flores_101_ast,gsarti/flores_101_azj,gsarti/flores_101_bel,gsarti/flores_101_ben,gsarti/flores_101_bos,gsarti/flores_101_bul,gsarti/flores_101_mya,gsarti/flores_101_cat,gsarti/flores_101_ceb,gsarti/flores_101_zho_simpl,gsarti/flores_101_zho_trad,gsarti/flores_101_hrv,gsarti/flores_101_ces,gsarti/flores_101_dan,gsarti/flores_101_nld,gsarti/flores_101_eng,gsarti/flores_101_est,gsarti/flores_101_tgl,gsarti/flores_101_fin,gsarti/flores_101_fra,gsarti/flores_101_ful,gsarti/flores_101_glg,gsarti/flores_101_lug,gsarti/flores_101_kat,gsarti/flores_101_deu,gsarti/flores_101_ell,gsarti/flores_101_guj,gsarti/flores_101_hau,gsarti/flores_101_heb,gsarti/flores_101_hin,gsarti/flores_101_hun,gsarti/flores_101_isl,gsarti/flores_101_ibo,gsarti/flores_101_ind,gsarti/flores_101_gle,gsarti/flores_101_ita,gsarti/flores_101_jpn,gsarti/flores_101_jav,gsarti/flores_101_kea,gsarti/flores_101_kam,gsarti/flores_101_kan,gsarti/flores_101_kaz,gsarti/flores_101_khm,gsarti/flores_101_kor,gsarti/flores_101_kir,gsarti/flores_101_lao,gsarti/flores_101_lav,gsarti/flores_101_lin,gsarti/flores_101_lit,gsarti/flores_101_luo,gsarti/flores_101_ltz,gsarti/flores_101_mkd,gsarti/flores_101_msa,gsarti/flores_101_mal,gsarti/flores_101_mlt,gsarti/flores_101_mri,gsarti/flores_101_mar,gsarti/flores_101_mon,gsarti/flores_101_npi,gsarti/flores_101_nso,gsarti/flores_101_nob,gsarti/flores_101_nya,gsarti/flores_101_oci,gsarti/flores_101_ory,gsarti/flores_101_orm,gsarti/flores_101_pus,gsarti/flores_101_fas,gsarti/flores_101_pol,gsarti/flores_101_por,gsarti/flores_101_pan,gsarti/flores_101_ron,gsarti/flores_101_rus,gsarti/flores_101_srp,gsarti/flores_101_sna,gsarti/flores_101_snd,gsarti/flores_101_slk,gsarti/flores_101_slv,gsarti/flores_101_som,gsarti/flores_101_ckb,gsarti/flores_101_spa,gsarti/flores_101_swh,gsarti/flores_101_swe,gsarti/flores_101_tgk,gsarti/flores_101_tam,gsarti/flores_101_tel,gsarti/flores_101_tha,gsarti/flores_101_tur,gsarti/flores_101_ukr,gsarti/flores_101_umb,gsarti/flores_101_urd,gsarti/flores_101_uzb,gsarti/flores_101_vie,gsarti/flores_101_cym,gsarti/flores_101_wol,gsarti/flores_101_xho,gsarti/flores_101_yor,gsarti/flores_101_zul \ + --eval_fp32 \ + --deepspeed \ + --deepspeed_config ds_config.json \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier 4 \ + $MEGATRON_REQUIRED_ARGS \ + " + +GPUS_PER_NODE=1 +NNODES=$SLURM_NNODES +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6002 +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +export CUDA_LAUNCH_BLOCKING=1 + +echo $LAUNCHER $CMD + +export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO + +$LAUNCHER $CMD 2>&1 | tee $VARIANT-eval-harness.log diff --git a/examples/evalharness/run_evalharness_tr11b-1b3-ml.slurm b/examples/evalharness/run_evalharness_tr11b-1b3-ml.slurm new file mode 100644 index 000000000..a086ca362 --- /dev/null +++ b/examples/evalharness/run_evalharness_tr11b-1b3-ml.slurm @@ -0,0 +1,120 @@ +#!/bin/bash +#SBATCH --job-name=run_evalharness-tr11b-2b5-ml 
+#SBATCH --partition=gpu_p5 +#SBATCH --constraint=a100 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:1 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name +#SBATCH --account=six@a100 +#SBATCH --reservation=hug + +set -x -e + +source $six_ALL_CCFRWORK/start-py38-pt111 + +echo "START TIME: $(date)" + +# a unique identifier for the current eval ideally correspnding to the modelname +VARIANT="tr11b-1b3-ml-evalharness" + + +CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11b-1B3-ml/checkpoints/main/global_step340500 +MEGATRON_DEEPSPEED_REPO=$six_ALL_CCFRSCRATCH/commun/experiments/muennighoff/megdsbslmeval/Megatron-DeepSpeed +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics + +cd $MEGATRON_DEEPSPEED_REPO + +TOKENIZER_NAME_OR_PATH=bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles + +PP_SIZE=1 +TP_SIZE=1 +SEQ_LEN=2048 + +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 + +#dummy arguments to make megatron happy. +MEGATRON_REQUIRED_ARGS=" \ + --num-layers -1 \ + --hidden-size -1 \ + --num-attention-heads -1 \ + --seq-length -1 \ + --max-position-embeddings -1 \ +" + + +ZERO_STAGE=0 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": false + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + + +CMD="./tasks/eval_harness/evaluate.py \ + --load $CHECKPOINT_PATH \ + --results_path $VARIANT-results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --eval_fp32 \ + --inference \ + --seq-length $SEQ_LEN \ + --task_list arc_challenge,arc_easy,boolq,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc \ + --deepspeed \ + --deepspeed_config ds_config.json \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier 8 \ + $MEGATRON_REQUIRED_ARGS \ + " + +GPUS_PER_NODE=1 +NNODES=$SLURM_NNODES +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +export CUDA_LAUNCH_BLOCKING=1 + +echo $LAUNCHER $CMD + +export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO + +$LAUNCHER $CMD 2>&1 | tee $VARIANT-eval-harness.log diff --git a/examples/evalharness/run_evalharness_tr11c-2b5-ml.slurm 
b/examples/evalharness/run_evalharness_tr11c-2b5-ml.slurm new file mode 100644 index 000000000..fa8757caf --- /dev/null +++ b/examples/evalharness/run_evalharness_tr11c-2b5-ml.slurm @@ -0,0 +1,120 @@ +#!/bin/bash +#SBATCH --job-name=run_evalharness-tr11b-2b5-ml +#SBATCH --partition=gpu_p5 +#SBATCH --constraint=a100 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:1 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name +#SBATCH --account=six@a100 +#SBATCH --reservation=hug + +set -x -e + +source $six_ALL_CCFRWORK/start-py38-pt111 + +echo "START TIME: $(date)" + +# a unique identifier for the current eval ideally correspnding to the modelname +VARIANT="tr11b-2b5-ml-evalharness" + + +CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11c-2B5-ml/checkpoints/main/global_step337250 +MEGATRON_DEEPSPEED_REPO=$six_ALL_CCFRSCRATCH/commun/experiments/muennighoff/megdsbslmeval/Megatron-DeepSpeed +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models +export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets +export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules +export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics + +cd $MEGATRON_DEEPSPEED_REPO + +TOKENIZER_NAME_OR_PATH=bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles + +PP_SIZE=1 +TP_SIZE=1 +SEQ_LEN=2048 + +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 + +#dummy arguments to make megatron happy. 
+MEGATRON_REQUIRED_ARGS=" \ + --num-layers -1 \ + --hidden-size -1 \ + --num-attention-heads -1 \ + --seq-length -1 \ + --max-position-embeddings -1 \ +" + + +ZERO_STAGE=0 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": false + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + + +CMD="./tasks/eval_harness/evaluate.py \ + --load $CHECKPOINT_PATH \ + --results_path $VARIANT-results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --eval_fp32 \ + --inference \ + --seq-length $SEQ_LEN \ + --task_list arc_challenge,arc_easy,boolq,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc \ + --deepspeed \ + --deepspeed_config ds_config.json \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier 8 \ + $MEGATRON_REQUIRED_ARGS \ + " + +GPUS_PER_NODE=1 +NNODES=$SLURM_NNODES +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +export CUDA_LAUNCH_BLOCKING=1 + +echo $LAUNCHER $CMD + +export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO + +$LAUNCHER $CMD 2>&1 | tee $VARIANT-eval-harness.log diff --git a/examples/evalharness/run_bsevalharness_tr11-350M-ml.slurm b/examples/evalharness/run_evalharness_tr11e-350m-ml.slurm similarity index 79% rename from examples/evalharness/run_bsevalharness_tr11-350M-ml.slurm rename to examples/evalharness/run_evalharness_tr11e-350m-ml.slurm index bbdd047d8..a58e8cf1e 100644 --- a/examples/evalharness/run_bsevalharness_tr11-350M-ml.slurm +++ b/examples/evalharness/run_evalharness_tr11e-350m-ml.slurm @@ -1,30 +1,26 @@ #!/bin/bash -#SBATCH --job-name=run_bsevalharness-tr11-350M-ml -#SBATCH --partition=gpu_p13 -#SBATCH --constraint=v100 -#SBATCH --nodes=2 +#SBATCH --job-name=run_evalharness-tr11e-350m-ml +#SBATCH --constraint=v100-32g +#SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
-#SBATCH --cpus-per-task=20 # number of cores per tasks +#SBATCH --cpus-per-task=10 # number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical -#SBATCH --gres=gpu:2 # number of gpus -#SBATCH --time 5:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --gres=gpu:1 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) #SBATCH --output=%x-%j.out # output file name #SBATCH --account=six@v100 -#SBATCH -C v100-32g - set -x -e -source $six_ALL_CCFRWORK/start-muennighofflmeval +source $six_ALL_CCFRWORK/start-py38-pt111 echo "START TIME: $(date)" # a unique identifier for the current eval ideally correspnding to the modelname -VARIANT="tr11-350M-ml-bsevalharness" - +VARIANT="tr11e-350m-ml-evalharness" -CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11e-350M-ml/checkpoints/main/659500 +CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11e-350M-ml/checkpoints/main/global_step659500 MEGATRON_DEEPSPEED_REPO=$six_ALL_CCFRSCRATCH/commun/experiments/muennighoff/megdsbslmeval/Megatron-DeepSpeed export HF_DATASETS_OFFLINE=1 export TRANSFORMERS_OFFLINE=1 @@ -70,7 +66,7 @@ cat < $config_json "stage": $ZERO_STAGE }, "bf16": { - "enabled": true + "enabled": false }, "steps_per_print": 2000, "wall_clock_breakdown": false @@ -78,7 +74,7 @@ cat < $config_json EOT -CMD="./tasks/eval_harness/evaluate_bsevalharness.py \ +CMD="./tasks/eval_harness/evaluate.py \ --load $CHECKPOINT_PATH \ --results_path $VARIANT-results.json \ --tensor-model-parallel-size $TP_SIZE \ @@ -88,19 +84,19 @@ CMD="./tasks/eval_harness/evaluate_bsevalharness.py \ --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ --no-load-optim \ --no-load-rng \ - --bf16 \ + --eval_fp32 \ --inference \ --seq-length $SEQ_LEN \ - --task_list wnli \ + --task_list arc_challenge,arc_easy,boolq,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc \ --deepspeed \ --deepspeed_config ds_config.json \ --intermed_results \ --adaptive_seq_len \ - --micro_bs_multiplier 4 \ + --micro_bs_multiplier 8 \ $MEGATRON_REQUIRED_ARGS \ " -GPUS_PER_NODE=2 +GPUS_PER_NODE=1 NNODES=$SLURM_NNODES MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) MASTER_PORT=6000 From 472045e66a6f9716e2507ee59b687bb514f38f89 Mon Sep 17 00:00:00 2001 From: Niklas Muennighoff Date: Wed, 17 Aug 2022 09:03:50 +0200 Subject: [PATCH 69/69] merge main (#331) * Reshape deepspeed checkpoint (#239) * Reshape deepspeed checkpoint * add checkpoint tests * Validate input folder * Tests for tp/pp reshape * remove debug folders * fix test_checkpoint_reshaping_empty_dir * Fix unit tests * Remove deepspeed checkpoint utils * Use DS 3D reshaping utils * convert to bf16 * wip universal chkpt * rename * rename * wip on fragments dealing * cleanup * Loading universal checkpoint with reshaping * all gpu1<->2 reshapes work * param attrs * make the tests adaptable to the number of available gpus * WIP * WIP * WIP * WIP * Debug functions * args should be required, don't create another latest file * Parallelize shard extraction * close+join pool; add tqdm; comment out noise * rename * parameterize * Parallel slice merging * Cleanup * allow inspection on a machine w/o gpus * test against the right DS branch * DS size was merged Co-authored-by: Stas Bekman * BLOOM Inference via DeepSpeed-Inference, Accelerate and DeepSpeed-ZeRO (#308) * hardcode the dtype depending on the model * change the mp based on the world_size * remove hardcoded world_size * 
add bigscience/bigscience-small-testing * fixes * add zero-inference script * fixes * fix * working script * renames * fixes * fix for offline use * add benchmark * add benchmark * update * cleanup * update * msecs * cleanup * improve * fix benchmark, add warmup * update * fix; thanks Michael Wyatt * clarify * add bloom batch-inference script * removed the names :-) * fold the bs functionality from the other script * fix * restore do_sample * dump generate args * fix * fix * support any batchsize * div by bs * mul by bs * add cpu_offload; sync scripts * wip * improvements * fixes * fixes * add accelerate script * fix * wip * wip * stats * add OnDevice and remove zero-inference (#316) * wip * rework generate + benchmark * figure out the memory map dynamically * bug fix * fix ds-zero-inference wrt device * bug fix * update * update * fix Co-authored-by: Reza Yazdani Co-authored-by: Jeff Rasley Co-authored-by: Olatunji Ruwase Co-authored-by: Stas Bekman Co-authored-by: Stas Bekman Co-authored-by: Reza Yazdani Co-authored-by: Jeff Rasley --- megatron/arguments.py | 2 + megatron/checkpointing.py | 17 +- megatron/training.py | 68 +++- megatron/utils.py | 76 ++++ run_bf16.sh | 65 ++-- run_universal_bf16.sh | 180 ++++++++++ scripts/inference/README.md | 194 ++++++++++ .../inference/bloom-accelerate-inference.py | 186 ++++++++++ scripts/inference/bloom-ds-inference.py | 299 ++++++++++++++++ scripts/inference/bloom-ds-zero-inference.py | 211 +++++++++++ scripts/inference/bloom-inference.py | 153 -------- tests/ds_config_bf16.json | 14 + tests/test_checkpoints.py | 298 ++++++++++++++++ .../deepspeed_checkpoint.py | 195 ---------- .../deepspeed_to_deepspeed.py | 183 ++++++++++ .../deepspeed_to_megatron.py | 81 +++-- .../deepspeed_to_transformers.py | 53 ++- tools/convert_checkpoint/ds_to_universal.py | 336 ++++++++++++++++++ .../convert_checkpoint/inspect_checkpoint.py | 15 +- .../inspect_deepspeed_checkpoint.py | 59 ++- 20 files changed, 2239 insertions(+), 446 deletions(-) create mode 100755 run_universal_bf16.sh create mode 100644 scripts/inference/bloom-accelerate-inference.py create mode 100644 scripts/inference/bloom-ds-inference.py create mode 100644 scripts/inference/bloom-ds-zero-inference.py delete mode 100644 scripts/inference/bloom-inference.py create mode 100644 tests/ds_config_bf16.json create mode 100644 tests/test_checkpoints.py delete mode 100644 tools/convert_checkpoint/deepspeed_checkpoint.py create mode 100644 tools/convert_checkpoint/deepspeed_to_deepspeed.py create mode 100755 tools/convert_checkpoint/ds_to_universal.py diff --git a/megatron/arguments.py b/megatron/arguments.py index cf48d0213..c18235a78 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -641,6 +641,8 @@ def _add_learning_rate_args(parser): '(learning rate, warmup iterations, minimum learning ' 'rate, maximum number of iterations, and decay style ' 'from checkpoint and ignore input arguments.') + group.add_argument('--universal-checkpoint', action='store_true', + help='Loading a universal format checkpoint.') return parser diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index d9a30f468..dacbec7dc 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -27,7 +27,8 @@ mpu, print_rank_0, update_num_microbatches, - utils) + utils, + get_tokenizer) from megatron.enums import PositionEmbeddingType _CHECKPOINT_VERSION = None @@ -131,6 +132,7 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler): state_dict['checkpoint_version'] = 3.0 state_dict['iteration'] 
= iteration state_dict['tokens'] = args.consumed_train_tokens + state_dict['checkpoint_info'] = _checkpoint_info() # DeepSpeed saves the model/optimizer/scheduler if not args.deepspeed: @@ -361,7 +363,8 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True assert args.consumed_valid_samples == 0 if 'args' in state_dict: checkpoint_args = state_dict['args'] - check_checkpoint_args(checkpoint_args) + if not args.universal_checkpoint: + check_checkpoint_args(checkpoint_args) args.consumed_train_samples = getattr(checkpoint_args, 'consumed_train_samples', 0) update_num_microbatches(consumed_samples=args.consumed_train_samples) @@ -468,3 +471,13 @@ def load_biencoder_checkpoint(model, only_query_model=False, print(' successfully loaded {}'.format(checkpoint_name)) return model + + +def _checkpoint_info(): + args = get_args() + tokenizer = get_tokenizer() + + return { + "padded_vocab_size": args.padded_vocab_size, + "original_vocab_size": tokenizer.vocab_size, + } \ No newline at end of file diff --git a/megatron/training.py b/megatron/training.py index 2d45ca808..bd00bc77e 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -367,6 +367,32 @@ def get_learning_rate_scheduler(optimizer): return lr_scheduler +def sync_hp_to_lp(optimizer): + + optimizer.update_lp_params() + + # for n,p in model.named_parameters(): + # print(n) + + # if p._hp_mapping is not None: + # #print(f'rank {rank} fixing hp for input_layernorm') + # #p._hp_mapping.update_hp() + + # hp = p._hp_mapping.hp_fragment + + + + # torch.distributed.all_reduce(hp, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) + + # # 3. optim states + # for key in ['exp_avg', 'exp_avg_sq']: + # optim_state_fragment = p._hp_mapping.get_optim_state_fragment(key) + # #print(f'rank {rank} before reduce optim state fragment {key} = {optim_state_fragment}') + # torch.distributed.all_reduce(optim_state_fragment, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) + # #print(f'rank {rank} after reduce optim state fragment {key} = {optim_state_fragment}') + + + def setup_model_and_optimizer(model_provider_func): """Setup model and optimizer.""" args = get_args() @@ -386,12 +412,21 @@ def setup_model_and_optimizer(model_provider_func): if args.deepspeed: print_rank_0("DeepSpeed is enabled.") - pp = mpu.get_pipeline_model_parallel_world_size() + #pp = mpu.get_pipeline_model_parallel_world_size() + + import json + import io + with io.open(args.deepspeed_config, "r", encoding="utf-8") as f: + config = json.load(f) + if args.universal_checkpoint: + config["checkpoint"] = {"load_universal": True} + model, optimizer, _, lr_scheduler = deepspeed.initialize( model=model[0], optimizer=optimizer, + lr_scheduler=lr_scheduler, + config=config, args=args, - lr_scheduler=lr_scheduler ) assert model.fp16_enabled() == args.fp16, "megatron fp16 config does not match deepspeed" @@ -416,8 +451,37 @@ def setup_model_and_optimizer(model_provider_func): torch.distributed.barrier() timers('load-checkpoint').stop() timers.log(['load-checkpoint']) + + + # hp -> lp + if args.deepspeed and args.universal_checkpoint: + sync_hp_to_lp(optimizer) + + else: args.iteration = 0 + + from .utils import dump_weights + dump_weights(f'{args.universal_checkpoint=}', args.iteration, model, optimizer) + + # tp_rank = mpu.get_tensor_model_parallel_rank() + # pp_rank = mpu.get_pipeline_model_parallel_rank() + # dp_rank = mpu.get_data_parallel_rank() + # for n,p in model[0].named_parameters(): + # if 
'word_embeddings.weight' not in n: + # continue + # if tp_rank == 0 and pp_rank == 0: + # print(f"{tp_rank=}{pp_rank=}{dp_rank=} bf16 {n=} {p[:10]=}") + # if p._hp_mapping is not None: + # hp = p._hp_mapping.hp_fragment + # print(f'{tp_rank=}{pp_rank=}{dp_rank=} fp32 {n=} {hp[:10]=}') + + # if tp_rank == 0 and pp_rank == mpu.get_pipeline_model_parallel_world_size() - 1: + # print(f"{tp_rank=}{pp_rank=}{dp_rank=} bf16 {n=} {p[:10]=}") + # if p._hp_mapping is not None: + # hp = p._hp_mapping.hp_fragment + # print(f'{tp_rank=}{pp_rank=}{dp_rank=} fp32 {n=} {hp[:10]=}') + # We only support local DDP with multiple micro-batches. if len(model) > 1 or mpu.get_pipeline_model_parallel_world_size() > 1: diff --git a/megatron/utils.py b/megatron/utils.py index 6f3a0fa41..893f58dd2 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -461,3 +461,79 @@ def found_kill_switch(): return True else: return False + +def get_fingerprint_header(): + return f"{'min':^13} {'max':^13} {'mean':^13} {'l2 norm':^12} metadata" + +def get_fingerprint(p): + return f"{p.min():13.6e} {p.max():13.6e} {p.mean():13.6e} {p.norm():12.6e}" + + +def dump_weights(preamble, iteration, model, optimizer, tensor=None): + tp_rank = mpu.get_tensor_model_parallel_rank() + pp_rank = mpu.get_pipeline_model_parallel_rank() + dp_rank = mpu.get_data_parallel_rank() + dp_size = mpu.get_data_parallel_world_size() + fn = f"debug-bf16-{iteration}-pp{pp_rank}-tp{tp_rank}-dp{dp_rank}-{preamble}.txt" + + # only care for first and last pp stages and dp0 tp0 + #if not (mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage()): + # return + + #if not (tp_rank == 0 and dp_rank == 0): + # return + + if tensor is not None: + orig_tensor = tensor + if hasattr(tensor, "_hp_param"): + numel = tensor._hp_param.numel() # // dp_size + tensor = tensor.flatten().narrow(0, 0, numel) + + #print(fn) + with open(fn, "w") as fh: + fh.write(f"{get_fingerprint_header()}\n") + + if tensor is not None: + fh.write(f"{get_fingerprint(tensor)} tensor {tensor.shape}\n") + else: + for n, p in model[0].named_parameters(): + fh.write(f"{get_fingerprint(p)} {n} {p.shape}\n") + + + return + + + # until we figure out how to dump the actual fp32 values don't do this + fn = f"debug-fp32-{iteration}-pp{pp_rank}-tp{tp_rank}-dp{dp_rank}-{preamble}.txt" + with open(fn, "w") as fh: + fh.write(f"{get_fingerprint_header()}\n") + if tensor is not None: + tensor = orig_tensor + if hasattr(tensor, "_hp_param"): + fh.write(f"{get_fingerprint(tensor._hp_param)} tensor {tensor._hp_param.shape}\n") + #fh.write(f"{get_fingerprint(tensor._hp_grad)} tensor grad\n") + else: + fh.write(f"{get_fingerprint(tensor)} tensor {tensor.shape}\n") + #fh.write(f"{get_fingerprint(tensor.grad)} tensor grad\n") + + else: + if hasattr(model[0].module.tied_modules, "embed"): + p = model[0].module.tied_modules.embed.word_embeddings.weight._hp_param + fh.write(f"{get_fingerprint(p)} module.tied_modules.embed.word_embeddings.weight._hp_param {p.shape}\n") + + # for i, param_group in enumerate(optimizer.param_groups): + # fh.write(f"{get_fingerprint(optimizer.fp32_groups_flat_partition[i])} group={i}\n") + #fh.write(f"{i}={optimizer.fp32_groups_flat_partition[i]}\n") + # if mpu.is_pipeline_first_stage(): + # x = optimizer.fp32_groups_flat_partition[0] + # fh.write(f"fp32={x[:402432]}\n") + # if mpu.is_pipeline_last_stage()): + # x = optimizer.fp32_groups_flat_partition[1] + # fh.write(f"fp32={x[-402432:]}\n") + + # import os + # import socket + # hostname = socket.gethostname() + # pid = os.getpid() + # 
global_rank = torch.distributed.get_rank() + #fn = f"debug-{iteration}-pp{pp_rank}-tp{tp_rank}-dp{dp_rank}-global{global_rank}-{preamble}-{pid}.txt" \ No newline at end of file diff --git a/run_bf16.sh b/run_bf16.sh index fd3a48398..fc884d4af 100755 --- a/run_bf16.sh +++ b/run_bf16.sh @@ -12,7 +12,12 @@ DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` #DATASET_3="" #DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}" -BASE_DATA_PATH=/data/Megatron-LM/data +#BASE_DATA_PATH=tests/data/gpt2 +#DATASET=${BASE_DATA_PATH}/meg-gpt2-openwebtext_text_document +#VOCAB_PATH=${BASE_DATA_PATH}/gpt2-tiny-vocab.json +#MERGE_PATH=${BASE_DATA_PATH}/gpt2-tiny-merges.txt + +BASE_DATA_PATH=/vc_data/Megatron-LM/data DATASET=${BASE_DATA_PATH}/indexed_datasets/megatron VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt @@ -20,40 +25,45 @@ MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt script_path=$(realpath $0) script_dir=$(dirname $script_path) -#CONFIG_JSON="$script_dir/ds_config.json" -CONFIG_JSON="/tmp/ds_config.json" +CONFIG_JSON="$script_dir/ds_config.json" +#CONFIG_JSON="/tmp/ds_config.json" USE_DEEPSPEED=1 ZERO_STAGE=0 - -# Debug #TP=4 #PP=4 -#LAYERS=8 -#HIDDEN=512 -#SEQ=1024 -#GLOBAL_BATCH=128 -#WORKER_STR="-i worker-0" - -TP=1 -PP=1 -DP=2 +# Debug +DEBUG_MODE=0 +if [[ $DEBUG_MODE == 1 ]]; then + LAYERS=4 + HIDDEN=512 + SEQ=512 + EXIT_INTERVAL=3 +else + HIDDEN=1024 + LAYERS=24 + SEQ=1024 + EXIT_INTERVAL=10 +fi + +TP=2 +PP=2 +DP=4 WORLD_SIZE=$((TP*PP*DP)) -HIDDEN=1024 -LAYERS=24 -SEQ=1024 -GLOBAL_BATCH=1 -WORKER_STR="" +GLOBAL_BATCH=4 MICRO_BATCH=1 +TRAIN_ITERS=100000 +CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP} +LOAD_CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP} LR=6.0e-4 MIN_LR=6.0e-5 DTYPE="bf16" -EXP_DIR=${HOME}/experiments/results/bf16 -LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_fix3" +EXP_DIR=${HOME}/experiments/results/ckpt_reshape +LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_cont" mkdir -p $LOG_DIR while [[ $# -gt 0 ]] @@ -89,7 +99,7 @@ options=" \ --max-position-embeddings $SEQ \ --micro-batch-size $MICRO_BATCH \ --global-batch-size $GLOBAL_BATCH \ - --train-iters 1000 \ + --train-iters $TRAIN_ITERS \ --lr $LR \ --min-lr $MIN_LR \ --lr-decay-style cosine \ @@ -99,7 +109,7 @@ options=" \ --data-path ${DATASET} \ --vocab-file ${VOCAB_PATH} \ --merge-file ${MERGE_PATH} \ - --save-interval 10000 \ + --save-interval 1000 \ --split 98,2,0 \ --clip-grad 1.0 \ --weight-decay 0.1 \ @@ -108,7 +118,12 @@ options=" \ --init-method-std 0.006 \ --${DTYPE} \ --checkpoint-activations \ - --exit-interval 10000 \ + --exit-interval ${EXIT_INTERVAL} \ + --save ${CHECKPOINT_PATH} \ + --load ${LOAD_CHECKPOINT_PATH} \ + --position-embedding-type alibi \ + --override-lr-scheduler \ + --embed-layernorm \ --tensorboard-dir $LOG_DIR " @@ -151,7 +166,7 @@ cat < $CONFIG_JSON } EOT -WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE" +#WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE" #WORKER_STR="-i worker-0:0,1,2,3" #run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}" #run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py $@ ${options}" diff --git a/run_universal_bf16.sh b/run_universal_bf16.sh new file mode 100755 index 000000000..7a60c34c1 --- /dev/null +++ b/run_universal_bf16.sh @@ -0,0 +1,180 @@ 
+#!/bin/bash + + +DIR=`pwd` +DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` +#mkdir -p $DIR/logs +#mkdir -p /tmp/logs + + +#DATASET_1="" +#DATASET_2="" +#DATASET_3="" +#DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}" + +#BASE_DATA_PATH=tests/data/gpt2 +#DATASET=${BASE_DATA_PATH}/meg-gpt2-openwebtext_text_document +#VOCAB_PATH=${BASE_DATA_PATH}/gpt2-tiny-vocab.json +#MERGE_PATH=${BASE_DATA_PATH}/gpt2-tiny-merges.txt + +BASE_DATA_PATH=/vc_data/Megatron-LM/data +DATASET=${BASE_DATA_PATH}/indexed_datasets/megatron +VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json +MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt + + +script_path=$(realpath $0) +script_dir=$(dirname $script_path) +CONFIG_JSON="$script_dir/ds_config.json" +#CONFIG_JSON="/tmp/ds_config.json" + +USE_DEEPSPEED=1 +ZERO_STAGE=0 + +#TP=4 +#PP=4 + +# Debug +DEBUG_MODE=0 +if [[ $DEBUG_MODE == 1 ]]; then + LAYERS=4 + HIDDEN=512 + SEQ=512 + EXIT_INTERVAL=3 +else + HIDDEN=1024 + LAYERS=24 + SEQ=1024 + EXIT_INTERVAL=10 +fi + +TP=2 +PP=2 +DP=4 +WORLD_SIZE=$((TP*PP*DP)) +GLOBAL_BATCH=4 + +MICRO_BATCH=1 +TRAIN_ITERS=100000 +CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP} +LOAD_CHECKPOINT_PATH=checkpoints/gpt2/tp2_pp2_dp4 + +LR=6.0e-4 +MIN_LR=6.0e-5 +DTYPE="bf16" +EXP_DIR=${HOME}/experiments/results/ckpt_reshape +LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_uni" +mkdir -p $LOG_DIR + +while [[ $# -gt 0 ]] +do +key="$1" +case $key in + --no-deepspeed) + USE_DEEPSPEED=0; + shift + ;; + -z|--zero-stage) + ZERO_STAGE=$2; + shift + ;; + *) + echo "Unknown argument(s)" + usage + exit 1 + shift + ;; +esac +done + + +options=" \ + --tensor-model-parallel-size $TP \ + --pipeline-model-parallel-size $PP \ + --num-layers $LAYERS \ + --hidden-size $HIDDEN \ + --num-attention-heads 32 \ + --seq-length $SEQ \ + --loss-scale 12 \ + --max-position-embeddings $SEQ \ + --micro-batch-size $MICRO_BATCH \ + --global-batch-size $GLOBAL_BATCH \ + --train-iters $TRAIN_ITERS \ + --lr $LR \ + --min-lr $MIN_LR \ + --lr-decay-style cosine \ + --log-interval 1 \ + --eval-iters 40 \ + --eval-interval 10 \ + --data-path ${DATASET} \ + --vocab-file ${VOCAB_PATH} \ + --merge-file ${MERGE_PATH} \ + --save-interval 1000 \ + --split 98,2,0 \ + --clip-grad 1.0 \ + --weight-decay 0.1 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --init-method-std 0.006 \ + --${DTYPE} \ + --checkpoint-activations \ + --exit-interval ${EXIT_INTERVAL} \ + --save ${CHECKPOINT_PATH} \ + --load ${LOAD_CHECKPOINT_PATH} \ + --universal-checkpoint \ + --position-embedding-type alibi \ + --override-lr-scheduler \ + --embed-layernorm \ + --tensorboard-dir $LOG_DIR + " + + +if [[ ${USE_DEEPSPEED} -eq 1 ]]; then + echo "Using DeepSpeed" + options="${options} \ + --deepspeed \ + --deepspeed_config=${CONFIG_JSON} \ + --zero-stage=${ZERO_STAGE} \ + --deepspeed-activation-checkpointing \ + " +fi + + +cat < $CONFIG_JSON +{ + "train_batch_size" : $GLOBAL_BATCH, + "train_micro_batch_size_per_gpu": $MICRO_BATCH, + "steps_per_print": 1, + + "zero_optimization": { + "stage": $ZERO_STAGE + }, + + "bf16": { + "enabled": true + }, + + "fp16": { + "enabled": false, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + + "wall_clock_breakdown" : true +} +EOT + +#WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE" +#WORKER_STR="-i worker-0:0,1,2,3" +#run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}" 
+#run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py $@ ${options}" +run_cmd="deepspeed --master_port 29700 $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}" + + +echo ${run_cmd} +eval ${run_cmd} + +set +x diff --git a/scripts/inference/README.md b/scripts/inference/README.md index 1a958c28b..44e98f9fb 100644 --- a/scripts/inference/README.md +++ b/scripts/inference/README.md @@ -1 +1,195 @@ # Inference scripts for BLOOM + +## BLOOM Inference solutions + +Here are some stats on JeanZay's 8x80GB A100 node w/ 512GB of CPU memory: + +All benchmarks are doing greedy generation of 100 token outputs: +``` +Generate args {'min_length': 100, 'max_length': 100, 'do_sample': False} +``` +The inputs are just a few tokens. + +Throughput in msecs: + +| project \ bs | 1 | 8 | 16 | 32 | 64 | 128 | +| :----------- | :---- | :---- | :---- | :---- | :---- | :--- | +| accelerate | 230.38 | 31.78 | 17.84 | 10.89 | oom | omm | +| ds-inference | 40.57 | 5.23 | | | 2.77 | 0.66 | +| ds-zero | 283 | 34.88 | oom | oom | oom | oom | + + +Start to ready to generate in secs: + +| project \ bs | 1 | 8 | 16 | 32 | 64 | 128 | +| :----------- | :--- | :--- | :--- | :--- | :--- | :--- | +| accelerate | 121 | 120 | 113 | 118 | | | +| ds-inference | 662 | 673 | | | 685 | 654 | +| ds-zero | 462 | 463 | | | | | +| | | | | | | | + + +DS-Inference load time (start to ready to generate) will become much faster soon. Once we stop relying on ds-zero to instantiate the model on gpu. The plan is to pre-shard the weights TP-wise for 8x and 16x gpus and load them directly on each gpu. Will probably be under 1min. + + +## Deepspeed-Inference + +Tensor-Parallelism and efficient fused CUDA kernels: +https://www.deepspeed.ai/tutorials/inference-tutorial/ + +### Setup + +``` +git clone https://github.com/microsoft/DeepSpeed +cd DeepSpeed +pip install . +``` + +### Run + +``` +deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom +``` + +Performance on a single node of 8x80GB A100 w/ 512GB CPU RAM (JeanZay) - just a batch of 1 (would be more efficient to run a larger batch) + +Adding `--benchmark` to activate the benchmarks + + +BS=1 +``` +$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-ds-inference_bs=1.txt +[...] + +``` + +While processing memory per process: + +- GPU: ~50GB +- CPU: ~10GB + + +BS=8 +``` +$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 8 --benchmark 2>&1 | tee bloom-ds-inference_bs=8.txt +[...] +*** Performance stats: +Throughput per token including tokenize: 5.23 msecs +Start to ready to generate: 683.397 secs +Tokenize and generate 800 (bs=8) tokens: 4.241 secs +Start to finish: 687.638 secs +``` + +BS=64 + +``` +$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 64 --benchmark 2>&1 | tee bloom-ds-inference_bs=64.txt + + + + +``` + +BS=128 + +``` +$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 128 --benchmark 2>&1 | tee bloom-ds-inference_bs=128.txt + + + + +``` + +## Deepspeed ZeRO-Inference + +https://www.deepspeed.ai/tutorials/zero/ + +### Setup + +``` +pip install deepspeed +``` + + +### Run + +Note that the script currently runs the same inputs on all GPUs, but you can run a different stream on each GPU, and get `n_gpu` times faster throughput. You can't do that with Deepspeed-Inference. 
+
+
+BS=1
+
+```
+$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=1.txt
+[...]
+*** Performance stats:
+Throughput per token including tokenize: 282.93 msecs
+Start to ready to generate: 501.871 secs
+Tokenize and generate 800 (bs=1) tokens: 226.188 secs
+Start to finish: 728.060 secs
+```
+
+
+BS=8
+
+```
+$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 8 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=8.txt
+[...]
+
+*** Performance stats:
+Throughput per token including tokenize: 34.57 msecs
+Start to ready to generate: 482.132 secs
+Tokenize and generate 6400 (bs=8) tokens: 221.236 secs
+Start to finish: 703.368 secs
+```
+
+BS=16 and higher OOMs
+
+```
+$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 16 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=16.txt
+[...]
+OOM
+
+```
+
+
+
+## HF Accelerate
+
+https://github.com/huggingface/accelerate
+
+### Setup
+
+```
+pip install transformers
+```
+
+
+
+### Run
+
+
+
+
+BS=1
+```
+$ python scripts/inference/bloom-accelerate-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=1.txt
+[...]
+
+
+```
+
+BS=8
+```
+$ python scripts/inference/bloom-accelerate-inference.py --name bigscience/bloom --batch_size 8 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=8.txt
+[...]
+
+
+```
+
+BS=16
+```
+$ python scripts/inference/bloom-accelerate-inference.py --name bigscience/bloom --batch_size 16 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=16.txt
+[...]
+
+
+```
diff --git a/scripts/inference/bloom-accelerate-inference.py b/scripts/inference/bloom-accelerate-inference.py
new file mode 100644
index 000000000..415b2f765
--- /dev/null
+++ b/scripts/inference/bloom-accelerate-inference.py
@@ -0,0 +1,186 @@
+import argparse
+import time
+import os
+import gc
+import torch
+import math
+from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--local_rank", required=False, type=int, help="used by dist launchers")
+    parser.add_argument("--name", type=str, help="Name path", required=True)
+    parser.add_argument("--batch_size", default=1, type=int, help="batch size")
+    parser.add_argument("--benchmark", action="store_true", help="additionally run benchmark")
+    parser.add_argument("--greedy", action="store_true")
+    parser.add_argument("--top-k", type=int, default=0)
+    parser.add_argument("--top-p", type=float, default=0.)
+ + return parser.parse_args() + +def get_max_memory_per_gpu_dict(dtype, model_name): + """ try to generate the memory map based on what we know about the model and the available hardware """ + + # figure out the memory map - the minimum per gpu required to load the model + n_gpus = torch.cuda.device_count() + + if model_name == "bigscience/bloom" and n_gpus == 8 and torch.cuda.get_device_properties(0).total_memory > 79*2**30: + # hand crafted optimized memory map for 8x80 setup over BLOOM + # this works with bs=40 + return {0: '0GIB', 1: '51GIB', 2: '51GIB', 3: '51GIB', 4: '51GIB', 5: '51GIB', 6: '51GIB', 7: '51GIB'} + + try: + # model_params calculation, as we don't have a model yet to do: + #model_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) + + config = AutoConfig.from_pretrained(model_name) + h = config.n_embed + l = config.n_layer + v = config.vocab_size + # from https://github.com/bigscience-workshop/bigscience/tree/6917a3b5fefcf439d3485ca184b4d9f6ab605150/math#model-sizing + model_params = l*(12*h**2 + 13*h) + v*h + 4*h + except: + print(f"The model {model_name} has a broken config file. Please notify the owner") + raise + + bytes = torch.finfo(dtype).bits / 8 + param_memory_total_in_bytes = model_params * bytes + # add 5% since weight sizes aren't the same and some GPU may need more memory + param_memory_per_gpu_in_bytes = int(param_memory_total_in_bytes / n_gpus * 1.05) + print(f"Estimating {param_memory_per_gpu_in_bytes/2**30:0.2f}GB per gpu for weights") + + # check the real available memory + # load cuda kernels first and only measure the real free memory after loading (shorter by ~2GB) + torch.ones(1).cuda() + max_memory_per_gpu_in_bytes = torch.cuda.mem_get_info(0)[0] + if max_memory_per_gpu_in_bytes < param_memory_per_gpu_in_bytes: + raise ValueError(f"Unable to generate the memory map automatically as the needed estimated memory per gpu ({param_memory_per_gpu_in_bytes/2**30:0.2f}GB) is bigger than the available per gpu memory ({max_memory_per_gpu_in_bytes/2**30:0.2f}GB)") + + return {i: param_memory_per_gpu_in_bytes for i in range(torch.cuda.device_count())} + +t_start = time.time() + +num_tokens = 100 + +args = get_args() + +local_rank = int(os.getenv('LOCAL_RANK', '0')) +world_size = int(os.getenv('WORLD_SIZE', '1')) + +rank = local_rank + +model_name = args.name +if rank == 0: + print(f"Loading model {model_name}") + + +tokenizer = AutoTokenizer.from_pretrained(model_name) + +# XXX: can't automatically derive dtype via config's `from_pretrained` +dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16 + +#print(get_max_memory_per_gpu_dict()) + + +model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map="auto", + max_memory=get_max_memory_per_gpu_dict(dtype, model_name), + torch_dtype=dtype, +) + + +if args.benchmark: + t_ready = time.time() + + + +### Generate + +if rank == 0: + print(f"*** Starting to generate {num_tokens} tokens with bs={args.batch_size}") + +input_sentences = [ + "DeepSpeed is a machine learning framework", + "He is working on", + "He has a", + "He got all", + "Everyone is happy and I can", + "The new movie that got Oscar this year", + "In the far far distance from our galaxy,", + "Peace is the only way" +] + +if args.batch_size > len(input_sentences): + # dynamically extend to support larger bs by repetition + input_sentences *= math.ceil(args.batch_size / len(input_sentences)) + +generate_kwargs = dict(max_new_tokens=num_tokens, 
do_sample=False) +#generate_kwargs = dict(max_new_tokens=num_tokens, use_cache=False, do_sample=False) +#generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=False) + +if rank == 0: + print(f"Generate args {generate_kwargs}") +inputs = input_sentences[:args.batch_size] +def generate(): + """ returns a list of zipped inputs, outputs and number of new tokens """ + + input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True) + for t in input_tokens: + if torch.is_tensor(input_tokens[t]): + input_tokens[t] = input_tokens[t].to("cuda:0") + + outputs = model.generate(**input_tokens, **generate_kwargs) + + input_tokens_lengths = [x.shape[0] for x in input_tokens.input_ids] + output_tokens_lengths = [x.shape[0] for x in outputs] + + total_new_tokens = [o-i for i,o in zip(input_tokens_lengths, output_tokens_lengths)] + outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) + + return zip(inputs, outputs, total_new_tokens) + +# warmup is a must if measuring speed as it's when all the optimizations are performed +# e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs +_ = generate() + +t_generate_start = time.time() +generated = generate() +t_generate_span = time.time() - t_generate_start +if rank == 0: + for i,o,_ in generated: + print(f"{'-'*60}\nin={i}\nout={o}\n") + + +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + +### Benchmark + +if args.benchmark: + if rank == 0: + print(f"*** Running benchmark") + + # warm up + for i in range(1): + _ = generate() + torch.cuda.synchronize() + + # benchmark + t0 = time.time() + cycles = 5 + total_new_tokens_generated = 0 + for i in range(cycles): + generated = generate() + total_new_tokens_generated += sum(new_tokens for _,_,new_tokens in generated) + torch.cuda.synchronize() + if rank == 0: + througput = (time.time() - t0)/(total_new_tokens_generated) + print(f""" +*** Performance stats: +Throughput per token including tokenize: {througput*1000:.2f} msecs +Start to ready to generate: {t_ready - t_start:.3f} secs +Tokenize and generate {total_new_tokens_generated} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs +Start to finish: {t_ready - t_start + t_generate_span:.3f} secs +""") diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py new file mode 100644 index 000000000..c21dfeb96 --- /dev/null +++ b/scripts/inference/bloom-ds-inference.py @@ -0,0 +1,299 @@ +# usage: +# deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom +# +# to run benchmarks: +# deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom --benchmark +# + + +# This is going to improve, but at the moment, the process is a bit cumbersome - we first use +# 1. use Deepspeed-ZeRO to instantiate the model on GPUs, w/o loading the checkpoints, +# 2. free the allocated storage +# 3. start Deepspeed-Inference and only now load the checkpoint +# 4. run generate +# Done. 
+# + + +from argparse import ArgumentParser +from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig +from transformers.deepspeed import HfDeepSpeedConfig +from transformers.models.bloom.modeling_bloom import BloomBlock as BloomBlock +import deepspeed +import gc +import glob +import io +import json +import math +import os +import sys +import time +import torch +import torch.distributed as dist + +t_start = time.time() + +num_tokens = 100 + +parser = ArgumentParser() + +parser.add_argument("--name", required=True, type=str, help="model_name") +parser.add_argument("--local_rank", required=False, type=int, help="used by dist launchers") +parser.add_argument("--batch_size", default=1, type=int, help="batch size") +parser.add_argument("--benchmark", action="store_true", help="additionally run benchmark") +args = parser.parse_args() + +local_rank = int(os.getenv('LOCAL_RANK', '0')) +world_size = int(os.getenv('WORLD_SIZE', '1')) + +deepspeed.init_distributed('nccl') +rank = dist.get_rank() + + +### Model loading and instantiating on GPUs + +def get_checkpoint_files(pretrained_model_name_or_path): + # XXX: I just hacked this one together to automatically handle the fetching of the model file or + # shards into cache and returning the cached entries - note that I removed most arguments + + from transformers.utils import WEIGHTS_NAME, WEIGHTS_INDEX_NAME, cached_path, hf_bucket_url, is_offline_mode + from transformers.utils.hub import EntryNotFoundError + from transformers.modeling_utils import get_checkpoint_shard_files + + cache_dir = None + is_sharded = False + + # XXX: preparation for revision branches if needed + revision = None + #revision = "sharded" + + # this supports nodes with no network (so you need to pre-cache the model and the tokenizer with + # python -c "from transformers import AutoModel; AutoModel.from_pretrained('bigscience/bloom')" + if is_offline_mode(): + print("Offline mode: forcing local_files_only=True") + local_files_only = True + else: + local_files_only = False + + filename = WEIGHTS_NAME + archive_file = hf_bucket_url(pretrained_model_name_or_path, filename=filename, revision=revision) + + try: + resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, local_files_only=local_files_only,) + return [resolved_archive_file] + + except (EntryNotFoundError, FileNotFoundError): + if filename == WEIGHTS_NAME: + # Maybe the checkpoint is sharded, we try to grab the index name in this case. + archive_file = hf_bucket_url( + pretrained_model_name_or_path, + filename=WEIGHTS_INDEX_NAME, + revision=revision, + ) + resolved_archive_file = cached_path( + archive_file, + cache_dir=cache_dir, + local_files_only=local_files_only, + ) + is_sharded = True + + if is_sharded: + # resolved_archive_file becomes a list of files that point to the different checkpoint shards in this case. 
+ resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + pretrained_model_name_or_path, + resolved_archive_file, + cache_dir=cache_dir, + revision=revision + ) + + return resolved_archive_file + +model_name = args.name + +#print(get_checkpoint_files(model_name)) + +if rank == 0: + print(f"*** Loading the model {model_name}") + +tokenizer = AutoTokenizer.from_pretrained(model_name) +config = AutoConfig.from_pretrained(model_name) + +# XXX: can't automatically derive dtype via config's `from_pretrained` +#dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16 + + +# use one of these args to `init_inference` +# 1. injection_policy is the slower version, but it's plain pytorch so it'll always work +# 2. replace_with_kernel_inject is the faster one (fast fused kernels) +kernel_inject = True +#kernel_inject = False + +if kernel_inject: + # XXX: for now ds-inference only works with fp16 + dtype = torch.float16 +else: + dtype = torch.bfloat16 + +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + deepspeed.runtime.utils.see_memory_usage('pre-from-pretrained', force=True) + +# Construct model with fake meta tensors, later will be replaced during ds-inference ckpt load +with deepspeed.OnDevice(dtype=dtype, device='meta'): + model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16) + +if args.benchmark: + deepspeed.runtime.utils.see_memory_usage('post-from-pretrained', force=True) + +model = model.eval() + + +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + deepspeed.runtime.utils.see_memory_usage('post-init-ds-zero-init', force=True) + +### Deepspeed-Inference Loading + +checkpoints_json = "checkpoints.json" +def write_checkponts_json(): + + with io.open(checkpoints_json, 'w', encoding='utf-8') as f: + + #checkpoint_dir = "/gpfsscratch/rech/six/commun/uan68tv-model-conversion/bloom" + #checkpoint_files = glob.glob(f"{checkpoint_dir}/*bin") + checkpoint_files = get_checkpoint_files(model_name) + + #print("Checkpoint files:", checkpoint_files) + + data = { + "type": "BLOOM-176B", + "checkpoints": checkpoint_files, + "version": 1.0 + } + json.dump(data, f) + +if rank == 0: + write_checkponts_json() +dist.barrier() + +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + deepspeed.runtime.utils.see_memory_usage('pre-ds-inference-init', force=True) + +if kernel_inject: + kwargs = dict(replace_with_kernel_inject=True) +else: + kwargs = dict(injection_policy={BloomBlock: ('self_attention.dense', 'mlp.dense_4h_to_h')}) + +#checkpoints_json=None +model = deepspeed.init_inference(model, + mp_size=world_size, + dtype=torch.half, + checkpoint=checkpoints_json, + **kwargs, + ) + +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + deepspeed.runtime.utils.see_memory_usage('post-ds-inference-init', force=True) + + +model = model.module + +if args.benchmark: + t_ready = time.time() + + +### Generate + +if rank == 0: + print(f"*** Starting to generate {num_tokens} tokens with bs={args.batch_size}") + +input_sentences = [ + "DeepSpeed is a machine learning framework", + "He is working on", + "He has a", + "He got all", + "Everyone is happy and I can", + "The new movie that got Oscar this year", + "In the far far distance from our galaxy,", + "Peace is the only way" +] + +if args.batch_size > len(input_sentences): + # dynamically extend to support larger bs by repetition + input_sentences *= math.ceil(args.batch_size / len(input_sentences)) + +generate_kwargs = 
dict(max_new_tokens=num_tokens, do_sample=False) + +if rank == 0: + print(f"Generate args {generate_kwargs}") +inputs = input_sentences[:args.batch_size] +def generate(): + """ returns a list of zipped inputs, outputs and number of new tokens """ + + input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True) + for t in input_tokens: + if torch.is_tensor(input_tokens[t]): + input_tokens[t] = input_tokens[t].to(torch.cuda.current_device()) + + outputs = model.generate(**input_tokens, **generate_kwargs) + + input_tokens_lengths = [x.shape[0] for x in input_tokens.input_ids] + output_tokens_lengths = [x.shape[0] for x in outputs] + + total_new_tokens = [o-i for i,o in zip(input_tokens_lengths, output_tokens_lengths)] + outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) + + return zip(inputs, outputs, total_new_tokens) + + +# warmup is a must if measuring speed as it's when all the optimizations are performed +# e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs +_ = generate() + +t_generate_start = time.time() +generated = generate() +t_generate_span = time.time() - t_generate_start +if rank == 0: + for i,o,_ in generated: + print(f"{'-'*60}\nin={i}\nout={o}\n") + +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + deepspeed.runtime.utils.see_memory_usage('end-of-run', force=True) + +### Benchmark + +# benchmark it! +if args.benchmark: + if rank == 0: + print(f"*** Running benchmark") + + # warm up + for i in range(1): + _ = generate() + torch.cuda.synchronize() + + # benchmark + t0 = time.time() + cycles = 5 + total_new_tokens_generated = 0 + for i in range(cycles): + generated = generate() + total_new_tokens_generated += sum(new_tokens for _,_,new_tokens in generated) + torch.cuda.synchronize() + if rank == 0: + througput = (time.time() - t0)/(total_new_tokens_generated) + print(f""" +*** Performance stats: +Throughput per token including tokenize: {througput*1000:.2f} msecs +Start to ready to generate: {t_ready - t_start:.3f} secs +Tokenize and generate {total_new_tokens_generated} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs +Start to finish: {t_ready - t_start + t_generate_span:.3f} secs +""") diff --git a/scripts/inference/bloom-ds-zero-inference.py b/scripts/inference/bloom-ds-zero-inference.py new file mode 100644 index 000000000..043b4967f --- /dev/null +++ b/scripts/inference/bloom-ds-zero-inference.py @@ -0,0 +1,211 @@ +# usage: +# deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom +# +# to run benchmarks: +# deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom --benchmark +# + + +# This is going to improve, but at the moment, the process is a bit cumbersome - we first use +# 1. use Deepspeed-ZeRO to instantiate the model on GPUs, w/o loading the checkpoints, +# 2. free the allocated storage +# 3. start Deepspeed-Inference and only now load the checkpoint +# 4. run generate +# Done. 
+# + + +from argparse import ArgumentParser +from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig +from transformers.deepspeed import HfDeepSpeedConfig +from transformers.models.bloom.modeling_bloom import BloomBlock as BloomBlock +import deepspeed +import gc +import glob +import io +import json +import math +import os +import sys +import time +import torch +import torch.distributed as dist + +t_start = time.time() + +num_tokens = 100 + +parser = ArgumentParser() + +parser.add_argument("--name", required=True, type=str, help="model_name") +parser.add_argument("--local_rank", required=False, type=int, help="used by dist launchers") +parser.add_argument("--batch_size", default=1, type=int, help="batch size") +parser.add_argument("--benchmark", action="store_true", help="additionally run benchmark") +parser.add_argument("--cpu_offload", action="store_true", help="whether to activate CPU offload") +args = parser.parse_args() + +local_rank = int(os.getenv('LOCAL_RANK', '0')) +world_size = int(os.getenv('WORLD_SIZE', '1')) + + +### Model loading and instantiating on GPU (via ZeRO) + +model_name = args.name + +if local_rank == 0: + print(f"*** Loading the model {model_name}") + +tokenizer = AutoTokenizer.from_pretrained(model_name) +config = AutoConfig.from_pretrained(model_name) + +# XXX: can't automatically derive dtype via config's `from_pretrained` +dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16 + +model_hidden_size = config.hidden_size +train_batch_size = 1 * world_size + +ds_config = { + "fp16": { + "enabled": dtype == torch.float16, + }, + "bf16": { + "enabled": dtype == torch.bfloat16, + }, + "zero_optimization": { + "stage": 3, + "overlap_comm": True, + "contiguous_gradients": True, + "reduce_bucket_size": model_hidden_size * model_hidden_size, + "stage3_prefetch_bucket_size": 0.9 * model_hidden_size * model_hidden_size, + "stage3_param_persistence_threshold": 0 + }, + "steps_per_print": 2000, + "train_batch_size": train_batch_size, + "train_micro_batch_size_per_gpu": 1, + "wall_clock_breakdown": False +} + +if args.cpu_offload: + ds_config["zero_optimization"]["offload_param"] = dict(device="cpu", pin_memory=True) + +dschf = HfDeepSpeedConfig(ds_config) # this tells from_pretrained to instantiate directly on gpus + +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + deepspeed.runtime.utils.see_memory_usage('pre-from-pretrained', force=True) + +model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16) + +if args.benchmark: + deepspeed.runtime.utils.see_memory_usage('post-from-pretrained', force=True) + +model = model.eval() + +rank = dist.get_rank() + +if rank == 0: + print(ds_config) + +ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0] +ds_engine.module.eval() +model = ds_engine.module + +if args.benchmark: + t_ready = time.time() + + +### Generate + +if rank == 0: + print(f"*** Starting to generate {num_tokens} tokens with bs={args.batch_size}") + +input_sentences = [ + "DeepSpeed is a machine learning framework", + "He is working on", + "He has a", + "He got all", + "Everyone is happy and I can", + "The new movie that got Oscar this year", + "In the far far distance from our galaxy,", + "Peace is the only way" +] + +if args.batch_size > len(input_sentences): + # dynamically extend to support larger bs by repetition + input_sentences *= math.ceil(args.batch_size / len(input_sentences)) + +generate_kwargs = 
dict(max_new_tokens=num_tokens, do_sample=False) + +if rank == 0: + print(f"Generate args {generate_kwargs}") +inputs = input_sentences[:args.batch_size] +def generate(): + """ returns a list of zipped inputs, outputs and number of new tokens """ + + input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True) + for t in input_tokens: + if torch.is_tensor(input_tokens[t]): + input_tokens[t] = input_tokens[t].to(torch.cuda.current_device()) + + outputs = model.generate(**input_tokens, **generate_kwargs) + + input_tokens_lengths = [x.shape[0] for x in input_tokens.input_ids] + output_tokens_lengths = [x.shape[0] for x in outputs] + + total_new_tokens = [o-i for i,o in zip(input_tokens_lengths, output_tokens_lengths)] + outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) + + return zip(inputs, outputs, total_new_tokens) + +# XXX: this is currently doing world_size streams on world_size gpus, so we can feed it different inputs on each! and hence the time can be divided by world_size + +# warmup is a must if measuring speed as it's when all the optimizations are performed +# e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs +_ = generate() + +t_generate_start = time.time() +pairs = generate() +t_generate_span = time.time() - t_generate_start +if rank == 0: + for i,o,_ in pairs: + print(f"{'-'*60}\nin={i}\nout={o}\n") + + +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + deepspeed.runtime.utils.see_memory_usage('end-of-run', force=True) + +### Benchmark + +if args.benchmark: + if rank == 0: + print(f"*** Running benchmark") + + # warm up + for i in range(1): + _ = generate() + torch.cuda.synchronize() + + # benchmark + t0 = time.time() + cycles = 5 + total_new_tokens_generated = 0 + for i in range(cycles): + generated = generate() + total_new_tokens_generated += sum(new_tokens for _,_,new_tokens in generated) + + torch.cuda.synchronize() + if rank == 0: + # note that we actually generate world_size unique streams (though the benchmark feeds the same inputs) + total_new_tokens_generated *= world_size + througput = (time.time() - t0)/(total_new_tokens_generated) + print(f""" +*** Performance stats: +Throughput per token including tokenize: {througput*1000:.2f} msecs +Start to ready to generate: {t_ready - t_start:.3f} secs +Tokenize and generate {total_new_tokens_generated} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs +Start to finish: {t_ready - t_start + t_generate_span:.3f} secs +""") + diff --git a/scripts/inference/bloom-inference.py b/scripts/inference/bloom-inference.py deleted file mode 100644 index 17da46795..000000000 --- a/scripts/inference/bloom-inference.py +++ /dev/null @@ -1,153 +0,0 @@ - -# usage: -# deepspeed --num_gpus 1 bloom-inference.py --name bigscience/bloom-350m -# - -#import glob -from argparse import ArgumentParser -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig -from transformers.deepspeed import HfDeepSpeedConfig -from transformers.models.bloom.modeling_bloom import BloomBlock as BloomBlock -import deepspeed -import io -import json -import os -import torch -import torch.distributed as dist - -parser = ArgumentParser() - -parser.add_argument("--name", required=True, type=str) -parser.add_argument("--local_rank", required=False, type=int) -parser.add_argument("--deepspeed", action="store_true") -args = parser.parse_args() - -local_rank = int(os.getenv('LOCAL_RANK', '0')) -world_size = int(os.getenv('WORLD_SIZE', '1')) - -def 
get_checkpoint_files(pretrained_model_name_or_path): - # XXX: I just hacked this one together to automatically handle the fetching of the model file or - # shards into cache and returning the cached entries - note that I removed most arguments - - from transformers.utils import WEIGHTS_NAME, WEIGHTS_INDEX_NAME, cached_path, hf_bucket_url - - cache_dir = None - is_sharded = False - filename = WEIGHTS_NAME - archive_file = hf_bucket_url(pretrained_model_name_or_path, filename=filename) - - try: - resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) - return [resolved_archive_file] - - except EntryNotFoundError: - if filename == WEIGHTS_NAME: - # Maybe the checkpoint is sharded, we try to grab the index name in this case. - archive_file = hf_bucket_url( - pretrained_model_name_or_path, - filename=WEIGHTS_INDEX_NAME, - ) - resolved_archive_file = cached_path( - archive_file, - cache_dir=cache_dir, - ) - is_sharded = True - - if is_sharded: - # resolved_archive_file becomes a list of files that point to the different checkpoint shards in this case. - resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( - pretrained_model_name_or_path, - resolved_archive_file, - cache_dir=cache_dir, - ) - - return resolved_archive_file - - -model_name = args.name - -tokenizer = AutoTokenizer.from_pretrained(model_name) -config = AutoConfig.from_pretrained(model_name) -model_hidden_size = config.hidden_size -train_batch_size = 1 * world_size -model = AutoModelForCausalLM.from_config(config) - -ds_config = { - "fp16": { - "enabled": model.dtype == torch.float16, - }, - "bf16": { - "enabled": model.dtype == torch.bfloat16, - }, - "zero_optimization": { - "stage": 3, - "offload_param": { - "device": "cpu", - "pin_memory": True - }, - "overlap_comm": True, - "contiguous_gradients": True, - "reduce_bucket_size": model_hidden_size * model_hidden_size, - "stage3_prefetch_bucket_size": 0.9 * model_hidden_size * model_hidden_size, - "stage3_param_persistence_threshold": 0 - }, - "steps_per_print": 2000, - "train_batch_size": train_batch_size, - "train_micro_batch_size_per_gpu": 1, - "wall_clock_breakdown": False -} - -dschf = HfDeepSpeedConfig(ds_config) - -model = model.eval() -ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0] -ds_engine.module.eval() -model = ds_engine.module - - - -checkpoints_json = "checkpoints.json" -with io.open(checkpoints_json, 'w', encoding='utf-8') as f: - - #checkpoint_files = glob.glob(f"args.checkpoint_dir/*bin") - checkpoint_files = get_checkpoint_files(model_name) - - print("Checkpoint files:", checkpoint_files) - - data = { - "type": "BLOOM-176B", - "checkpoints": checkpoint_files, - "version": 1.0 - } - json.dump(data, f) - - -model = deepspeed.init_inference(model, - mp_size=1, - dtype=torch.half, - checkpoint=checkpoints_json, - #injection_policy={BloomBlock: ('self_attention.dense', 'mlp.dense_4h_to_h')} - replace_with_kernel_inject=True - ) -model = model.module - -text_in = 'DeepSpeed is' - -tokens = tokenizer(text_in, return_tensors="pt") - -for t in tokens: - if torch.is_tensor(tokens[t]): - tokens[t] = tokens[t].to(torch.cuda.current_device()) - -with torch.no_grad(): - gen_tokens = model.generate( - **tokens, - min_length=50, - max_length=50, - do_sample=False, - ) - - -text_out = tokenizer.batch_decode(gen_tokens)[0] - -print(f"in={text_in}\nout={text_out}") diff --git a/tests/ds_config_bf16.json b/tests/ds_config_bf16.json new file mode 100644 index 000000000..6afd1f6b2 --- /dev/null +++ b/tests/ds_config_bf16.json @@ 
-0,0 +1,14 @@ +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 16, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 0 + }, + "bf16": { + "enabled": true + }, + "zero_allow_untested_optimizer": true, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} diff --git a/tests/test_checkpoints.py b/tests/test_checkpoints.py new file mode 100644 index 000000000..fdc41e014 --- /dev/null +++ b/tests/test_checkpoints.py @@ -0,0 +1,298 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import io +import os +import pytest +from pathlib import Path + +from parameterized import parameterized +from megatron.testing_utils import ( + CaptureStdout, + TestCasePlus, + execute_subprocess_async, + get_gpu_count, + require_deepspeed, + require_torch_gpu, + require_torch_multi_gpu, + set_seed +) + +set_seed(42) + + +def parameterized_custom_name_func(func, param_num, param): + # customize the test name generator function as we want both params to appear in the sub-test + # name, as by default it shows only the first param + param_based_name = parameterized.to_safe_name("_to_".join(str(x) for x in param.args)) + return f"{func.__name__}_{param_based_name}" + +params = [ + # TP_PP_DP + ["1_1_1", "1_1_1"], + ["2_1_1", "1_1_1"], + ["1_2_1", "1_1_1"], + ["1_1_2", "1_1_1"], + + ["2_1_1", "2_1_1"], + ["1_1_1", "2_1_1"], + ["1_1_1", "1_2_1"], + ["1_1_1", "1_1_2"], + + ["1_1_2", "1_1_2"], + ["1_1_2", "2_1_1"], + ["1_1_2", "1_2_1"], + + ["1_2_1", "1_2_1"], + ["1_2_1", "2_1_1"], + ["1_2_1", "1_1_2"], + + ["2_1_1", "2_1_1"], + ["2_1_1", "1_2_1"], + ["2_1_1", "1_1_2"], + + ["2_2_2", "1_1_1"], + ["2_2_2", "2_2_2"], + ["1_1_1", "2_2_2"], + + ["1_1_8", "2_2_2"], + +] + +def get_launcher(num_gpus): + # 1. 
explicitly set --num_nodes=1 just in case these tests end up run on a multi-node setup + # - it won't be able to handle that + return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split() + +@require_deepspeed +@require_torch_gpu +class MegDSTestCheckpoints(TestCasePlus): + """ """ + + def setUp(self): + super().setUp() + + # at times magatron fails to build kernels and doesn't remove the lock file, which makes + # subsequent runs hang - so make sure there is no lock when starting the testing + meg_lock_file_path = self.repo_root_dir_str + "/megatron/fused_kernels/build/lock" + if os.path.exists(meg_lock_file_path): + os.unlink(meg_lock_file_path) + + def get_config(self, output_dir, tp_size, pp_size, dp_size): + data_dir = f"{self.data_dir}/gpt2" + + num_gpus = pp_size * tp_size * dp_size + print(f"Using {num_gpus} GPUs") + + n_samples = 300 # about 56 iterations + + exit_interval = 20 # some samples in the first half and then some more in the 2nd half after resume + seq_len = 128 + + # XXX: for now while testing shapes make it really short and fast + exit_interval = 1 + seq_len = 8 + + + # common/shared configs + + ds_args = f""" + --deepspeed + --deepspeed_config {self.test_file_dir_str}/ds_config_bf16.json + --zero-stage 0 + --deepspeed-activation-checkpointing + """.split() + + args = f""" + --tensor-model-parallel-size {tp_size} + --pipeline-model-parallel-size {pp_size} + --distributed-backend nccl + + --log-interval 1 + --save-interval 1 + --eval-interval 10 + --eval-iters 1 + --checkpoint-activations + --partition-activations + --exit-interval {exit_interval} + + --merge-file {data_dir}/gpt2-tiny-merges.txt + --vocab-file {data_dir}/gpt2-tiny-vocab.json + --save {output_dir}/checkpoints + --load {output_dir}/checkpoints + --data-path {data_dir}/meg-gpt2-openwebtext_text_document + --tensorboard-dir {output_dir}/tensorboard + --tensorboard-queue-size 5 + --log-timers-to-tensorboard + --log-batch-size-to-tensorboard + --log-validation-ppl-to-tensorboard + + --num-layers 2 + --hidden-size 8 + --num-attention-heads 2 + --seq-length {seq_len} + --max-position-embeddings 8 + --micro-batch-size 1 + --global-batch-size 16 + --train-samples {n_samples} + + --embed-layernorm + --position-embedding-type alibi + + --optimizer adam + --adam-beta1 0.9 + --adam-beta2 0.95 + --adam-eps 1e-8 + --lr 1e-4 + --lr-warmup-samples 5 + --lr-decay-samples 6 + --clip-grad 1.0 + --weight-decay 1e-1 + --bf16 + + --log-level debug + --log-level-replica info + """.split() + + + # XXX: fails to handle: + #--embed-layernorm + # +# stderr: RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +# stderr: size mismatch for norm.weight: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]). +# stderr: size mismatch for norm.bias: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]). + + return args, ds_args, num_gpus + + + def train_checkpoint(self, output_dir, tp_size=1, pp_size=1, dp_size=1): + src_dir = self.src_dir + script = [f"{src_dir}/pretrain_gpt.py"] + + args, ds_args, num_gpus = self.get_config(output_dir, tp_size, pp_size, dp_size) + launcher = get_launcher(num_gpus) + cmd = launcher + script + args + ds_args + # keep for quick debug + #print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die + + # 1. 
test training from scratch (no checkpoint) + with CaptureStdout() as cs: + execute_subprocess_async(cmd, env=self.get_env()) + + # test deepspeed is running + self.assertIn("DeepSpeed info", cs.out) + + # test reports + self.assertIn("consumed samples", cs.out) + + # test there should be no checkpoint this round + self.assertIn(f"Unable to find latest file at {output_dir}/checkpoints/latest", cs.out) + + # test checkpoint saving + self.assertIn("successfully saved checkpoint at iteration", cs.out) + + def convert_checkpoint_to_universal(self, output_dir, step): + cmd = f""" + python tools/convert_checkpoint/ds_to_universal.py + --input_folder {output_dir}/checkpoints/global_step{step} + --output_folder {output_dir}/checkpoints/global_step{step}_universal + """.split() + # keep for quick debug + # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die + + with CaptureStdout() as cs: + execute_subprocess_async(cmd, env=self.get_env()) + + self.assertIn("Convert DeepSpeed Checkpoint to Universal Checkpoint", cs.out) + + def resume_from_checkpoint(self, output_dir, tp_size=1, pp_size=1, dp_size=1): + src_dir = self.src_dir + script = [f"{src_dir}/pretrain_gpt.py"] + + args, ds_args, num_gpus = self.get_config(output_dir, tp_size, pp_size, dp_size) + launcher = get_launcher(num_gpus) + cmd = launcher + script + args + ds_args + # keep for quick debug + # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die + + with CaptureStdout() as cs: + execute_subprocess_async(cmd, env=self.get_env()) + + # test checkpoint loading + self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out) + + # test reports + self.assertIn("consumed samples", cs.out) + + # test checkpoint saving + self.assertIn("successfully saved checkpoint at iteration", cs.out) + + def resume_from_universal_checkpoint(self, output_dir, tp_size=1, pp_size=1, dp_size=1): + src_dir = self.src_dir + script = [f"{src_dir}/pretrain_gpt.py"] + + args, ds_args, num_gpus = self.get_config(output_dir, tp_size, pp_size, dp_size) + launcher = get_launcher(num_gpus) + cmd = launcher + script + args + ds_args + ["--universal-checkpoint"] + # keep for quick debug + #print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die + + with CaptureStdout() as cs: + execute_subprocess_async(cmd, env=self.get_env()) + + # test checkpoint loading + self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out) + + # test reports + self.assertIn("consumed samples", cs.out) + + # test checkpoint saving + self.assertIn("successfully saved checkpoint at iteration", cs.out) + + + @require_torch_multi_gpu + @parameterized.expand(params, name_func=parameterized_custom_name_func) + def test_checkpoint_reshaping_main(self, src, tgt): + # this test needs at least 2 gpus - if there are more gpus it will do more extensive testing + + tp_size_src, pp_size_src, dp_size_src = list(map(int, src.split('_'))) + tp_size_tgt, pp_size_tgt, dp_size_tgt = list(map(int, tgt.split('_'))) + + n_gpus = get_gpu_count() + n_gpus_src = tp_size_src * pp_size_src * dp_size_src + n_gpus_tgt = tp_size_tgt * pp_size_tgt * dp_size_tgt + + if n_gpus_src > n_gpus: + pytest.skip(f"the test requires {n_gpus_src} gpus for source topology but have only {n_gpus}") + if n_gpus_tgt > n_gpus: + pytest.skip(f"the test requires {n_gpus_tgt} gpus for target topology but have only {n_gpus}") + + output_dir = self.get_auto_remove_tmp_dir("./xxx", after=False) + + # 1. 
train with initial topology defined in the first arg of params + self.train_checkpoint(output_dir, tp_size=tp_size_src , pp_size=pp_size_src , dp_size=dp_size_src ) + + # 2. convert checkpoint to universal checkpoint (topology ) + self.convert_checkpoint_to_universal(output_dir=output_dir, step=1) + + # 3. check we can resume training from a reshaped checkpoint to the target topology - the last arg of params + self.resume_from_universal_checkpoint(output_dir, tp_size=tp_size_tgt, pp_size=pp_size_tgt, dp_size=dp_size_tgt) + + + @require_torch_multi_gpu + def test_checkpoint_reshaping_empty_dir(self): + + output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False) + with self.assertRaises(RuntimeError) as context: + self.convert_checkpoint_to_universal(output_dir=output_dir, step=1) diff --git a/tools/convert_checkpoint/deepspeed_checkpoint.py b/tools/convert_checkpoint/deepspeed_checkpoint.py deleted file mode 100644 index 52dff44f2..000000000 --- a/tools/convert_checkpoint/deepspeed_checkpoint.py +++ /dev/null @@ -1,195 +0,0 @@ -import os -from typing import Dict -import torch - -ZERO_FILE_PREFIX = 'zero_pp_rank_' -LAYER_FILE_PREFIX = 'layer_' -MP_RANK_FILE_PREFIX = 'mp_rank_' -EMBEDDING_LAYER_INDEX = 0 -FINAL_LAYER_NORM_INDEX = -1 -ARGS_KEY = 'args' -ITERATION_KEY = 'iteration' -SEQUENTIAL_LAYERS = [ - 'input_layernorm.weight', 'input_layernorm.bias', - 'self_attention.dense.bias', - 'post_attention_layernorm.weight', 'post_attention_layernorm.bias', - 'mlp.dense_4h_to_h.bias', - 'position_embeddings.weight' -] - -LAYER_CONCAT_DIM = { - 'self_attention.dense.weight': 1, - 'mlp.dense_4h_to_h.weight': 1 -} - -class DeepSpeedCheckpoint(object): - def __init__(self, dir, tp_degree=None, pp_degree=None): - self.dir = dir - self.file_list = self._get_files(dir) - self.zero_files = self._get_files_with_prefix(self.file_list, ZERO_FILE_PREFIX) - self.layer_files = self._get_files_with_prefix(self.file_list, LAYER_FILE_PREFIX) - self.mp_rank_files = self._get_files_with_prefix(self.file_list, MP_RANK_FILE_PREFIX) - self.layer_keys = self._get_layer_keys() - self.layer_count = len(self.layer_keys) - self.original_tp_degree = len(self._get_files_with_prefix(self.layer_files, f'{LAYER_FILE_PREFIX}01')) - self.original_pp_degree = len(self.mp_rank_files) // self.original_tp_degree - self.dp_degree = len(self.zero_files) // (self.original_pp_degree * self.original_tp_degree) - self.tp_degree = self.original_tp_degree if tp_degree is None else tp_degree - self.pp_degree = self.original_pp_degree if pp_degree is None else pp_degree - self.global_state = {} - - self._sanity_check() - self.pp_to_transformer_map = self._build_pp_transformer_map() - self.transformer_file_map = self._build_transformer_file_map() - self.tp_to_embedding_map = self._build_tp_other_layer_map(EMBEDDING_LAYER_INDEX) - self.tp_to_final_norm_map = self._build_tp_other_layer_map(FINAL_LAYER_NORM_INDEX) - self._build_global_state() - - - - def show_tp_embedding_map(self): - self._dump_mapping(self.tp_to_embedding_map, 'tp_to_embedding_layers') - - def show_tp_final_norm_map(self): - self._dump_mapping(self.tp_to_final_norm_map, 'tp_to_final_norm_layers') - - def show_pp_tranformer_map(self): - self._dump_mapping(self.pp_to_transformer_map, 'pp_to_tranformer_layers') - - def show_transformer_file_map(self): - self._dump_mapping(self.transformer_file_map, 'rank_to_tranformer_files') - - def _build_global_state(self): - sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu')) - self.global_state[ITERATION_KEY] = 
sd.get(ITERATION_KEY, 0) - self.global_state[ARGS_KEY] = sd.get(ARGS_KEY, None) - - def get_iteration(self): - if not ITERATION_KEY in self.global_state: - sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu')) - self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0) - - return self.global_state[ITERATION_KEY] - - def get_embedding_state(self, tp_index: int) -> Dict: - assert tp_index in self.tp_to_embedding_map.keys() - sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in self.tp_to_embedding_map[tp_index]] - sd = self._merge_state_dicts(sd_list) - return sd - - def get_args(self): - if not ARGS_KEY in self.global_state: - sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu')) - self.global_state[ARGS_KEY] = sd.get(ARGS_KEY, None) - - return self.global_state[ARGS_KEY] - - - def get_transformer_state(self, tp_index: int, pp_index: int) -> list: - assert tp_index < self.tp_degree - assert pp_index < self.pp_degree - t_list = [] - for fname_list in self.transformer_file_map[(tp_index, pp_index)]: - sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in fname_list] - sd = self._merge_state_dicts(sd_list) - t_list.append(sd) - return t_list - - def get_final_norm_state(self, tp_index:int) -> Dict: - assert tp_index in self.tp_to_final_norm_map.keys() - sd = torch.load(self.tp_to_final_norm_map[tp_index][0], map_location=torch.device('cpu')) - return sd - - def _build_tp_other_layer_map(self, layer_index:int): - assert layer_index < len(self.layer_files) - layer_files = self._get_files_with_prefix(self.layer_files, self.layer_keys[layer_index]) - layer_file_partitions = self._partition_data(layer_files, self.tp_degree) - data_map = {i:flist for i, flist in enumerate(layer_file_partitions)} - return data_map - - def _build_pp_transformer_map(self): - data_map = {} - transformer_layers = self.layer_keys[1:-1] - layers_per_pp = len(transformer_layers) // self.pp_degree - data_map = {i:transformer_layers[i*layers_per_pp:(i+1)*layers_per_pp] for i in range(0, self.pp_degree)} - return data_map - - def _dump_mapping(self, data_map, map_tag = None): - if map_tag is not None: - print(f'Dump mapping: {map_tag}') - for k, v in data_map.items(): - print(f'{k} = {v}') - - def _build_transformer_file_map(self): - transformer_layer_keys = self.layer_keys[1:-1] - file_map = {} - layers_per_pp = len(transformer_layer_keys) // self.pp_degree - for key_index, layer_key in enumerate(transformer_layer_keys): - pp_index = key_index // layers_per_pp - layer_files = self._get_files_with_prefix(self.layer_files, layer_key) - layer_file_partitions = self._partition_data(layer_files, self.tp_degree) - for tp_index in range(self.tp_degree): - map_key = (tp_index, pp_index) - if not map_key in file_map.keys(): - file_map[map_key] = [] - file_map[map_key].append(layer_file_partitions[tp_index]) - - return file_map - - def _sanity_check(self): - assert len(self.mp_rank_files) % self.tp_degree == 0 - assert len(self.zero_files) % (self.pp_degree * self.tp_degree) == 0 - assert len(self.layer_keys) > 2 - - # XXX: disable for now, since this fails when using: - # --pp-partition-method 'type:transformer|embedding' - # so if it can detect this flag somehow it then should validate: - # assert (len(self.layer_keys)) % self.pp_degree == 0 - # the original: - # assert (len(self.layer_keys) - 2) % self.pp_degree == 0 - - def _get_files_with_prefix(self, all_files, prefix): - file_list = [] - for file_path in all_files: - _, fname = 
os.path.split(file_path) - if fname.startswith(prefix): - file_list.append(file_path) - - return sorted(file_list) - - def validate_files(self): - for file in self.file_list: - if not os.path.isfile(file): - print(f'Error: {file} is not existent') - - def _get_files(self, dir): - file_list = [] - for root, dirs, files in os.walk(dir): - for file in files: - file_list.append(os.path.join(root, file)) - return file_list - - def _get_layer_keys(self): - key_set = set() - key_len = len(LAYER_FILE_PREFIX) + 2 - for file_path in self.layer_files: - _, fname = os.path.split(file_path) - key_set.add(fname[:key_len]) - return sorted(list(key_set)) - - def _partition_data(self, data_list, num_partitions): - num_elems = len(data_list) - assert num_elems % num_partitions == 0 - partition_size = num_elems // num_partitions - partitions_list = [data_list[i:i+partition_size] for i in range(0, num_elems, partition_size)] - return partitions_list - - def _merge_state_dicts(self, sd_list): - merged_sd = {} - for key in sd_list[0].keys(): - if not key in SEQUENTIAL_LAYERS: - cat_dim = LAYER_CONCAT_DIM.get(key, 0) - merged_sd[key] = torch.cat([sd[key] for sd in sd_list], dim=cat_dim) - else: - merged_sd[key] = sd_list[0][key] - return merged_sd diff --git a/tools/convert_checkpoint/deepspeed_to_deepspeed.py b/tools/convert_checkpoint/deepspeed_to_deepspeed.py new file mode 100644 index 000000000..8d484e88d --- /dev/null +++ b/tools/convert_checkpoint/deepspeed_to_deepspeed.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python +import sys +import argparse +import os +import torch + +from pathlib import Path + +# insert megatron's root dir into sys.path +root_repo_path = str(Path(__file__).resolve().parents[2]) +if root_repo_path not in sys.path: + sys.path.insert(0, root_repo_path) + +from megatron.tokenizer.tokenizer import _vocab_size_with_padding +from deepspeed.checkpoint.deepspeed_checkpoint import ( + ARGS_KEY, + CHECKPOINT_INFO_KEY, +) + +from deepspeed.checkpoint import ( + DeepSpeedCheckpoint, + get_model_ckpt_name_for_rank, + get_zero_ckpt_name_for_rank, + get_layer_ckpt_name_for_rank +) + +CHECKPOINT_FILE_SUFFIX = '_model_states.pt' +MP_WORLD_SIZE ='mp_world_size' +WORD_EMBEDDINGS_KEY = 'word_embeddings.weight' +ORIGINAL_VOCAB_SIZE = 'original_vocab_size' +PADDED_VOCAB_SIZE = 'padded_vocab_size' + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--input_folder', + default=None, + type=str, + help='Input DeepSpeed Checkpoint folder') + parser.add_argument('--output_folder', + default=None, + type=str, + help='Output Megatron checkpoint folder') + parser.add_argument('--target_tp', + default=None, + type=int, + help='Target TP degree') + parser.add_argument('--target_pp', + default=None, + type=int, + help='Target PP degree') + parser.add_argument('--target_dp', + default=None, + type=int, + help='Target DP degree') + args = parser.parse_args() + print(f'args = {args}') + return args + + + +def _save_checkpoint(file_path, chkpt_sd): + dir, _ = os.path.split(file_path) + os.makedirs(dir, exist_ok=True) + torch.save(chkpt_sd, file_path) + + +def _create_transformer_layer_checkpoint(ds_checkpoint, base_folder, tp_index, pp_index): + sd_list = ds_checkpoint.get_transformer_state(tp_index, pp_index) + layer_id_list = ds_checkpoint.get_pp_transformer_map(pp_index) + assert len(sd_list) == len(layer_id_list) + for sd, layer_id in zip(sd_list, layer_id_list): + ckpt_path = get_layer_ckpt_name_for_rank( + base_folder=base_folder, + layer_id=layer_id, + tp_rank=tp_index) + 
_save_checkpoint(ckpt_path, sd) + + +def _strip_vocab_padding(ds_checkpoint, padded_vocab_tensor): + target_args = ds_checkpoint.get_args() + checkpoint_info = ds_checkpoint.get_checkpoint_info() + target_args.tensor_model_parallel_size = ds_checkpoint.tp_degree + target_args.padded_vocab_size = _vocab_size_with_padding(checkpoint_info[ORIGINAL_VOCAB_SIZE], target_args) + assert target_args.padded_vocab_size <= padded_vocab_tensor.numel() + checkpoint_info[PADDED_VOCAB_SIZE] = target_args.padded_vocab_size + unpadded_vocab_tensor = torch.narrow(padded_vocab_tensor, 0, 0, target_args.padded_vocab_size) + return unpadded_vocab_tensor.clone() + + +def _create_embedding_layer_checkpoint(ds_checkpoint, base_folder, tp_index): + sd = ds_checkpoint.get_embedding_state(tp_index) + if ds_checkpoint.is_change_tp_degree(): + sd[WORD_EMBEDDINGS_KEY] = _strip_vocab_padding(ds_checkpoint, sd[WORD_EMBEDDINGS_KEY]) + layer_id = ds_checkpoint.get_embedding_layer_id() + ckpt_path = get_layer_ckpt_name_for_rank( + base_folder=base_folder, + tp_rank=tp_index, + layer_id=layer_id) + _save_checkpoint(ckpt_path, sd) + + +def _create_final_norm_layer_checkpoint(ds_checkpoint, base_folder, tp_index): + sd = ds_checkpoint.get_final_norm_state(tp_index) + layer_id = ds_checkpoint.get_final_norm_layer_id() + ckpt_path = get_layer_ckpt_name_for_rank( + base_folder=base_folder, + tp_rank=tp_index, + layer_id=layer_id) + _save_checkpoint(ckpt_path, sd) + + +def _create_2d_parallel_checkpoint(ds_checkpoint, base_folder, tp_index, + pp_index): + sd = ds_checkpoint.get_2d_parallel_state(tp_index=tp_index, + pp_index=pp_index) + sd[MP_WORLD_SIZE] = ds_checkpoint.tp_degree + file_id = pp_index * ds_checkpoint.tp_degree + tp_index + ckpt_path = get_model_ckpt_name_for_rank(base_folder, f'{file_id:02d}') + + # Adjust specific fields + sd[ARGS_KEY] = ds_checkpoint.get_args() + sd[ARGS_KEY].tensor_model_parallel_size = ds_checkpoint.tp_degree + sd[ARGS_KEY].pipeline_model_parallel_size = ds_checkpoint.pp_degree + sd[CHECKPOINT_INFO_KEY][PADDED_VOCAB_SIZE] = sd[ARGS_KEY].padded_vocab_size + _save_checkpoint(ckpt_path, sd) + + +def _create_zero_checkpoint(ds_checkpoint, base_folder, dp_index, pp_index, tp_index): + _2d_rank = (pp_index * ds_checkpoint.tp_degree) + tp_index + sd = ds_checkpoint.get_zero_checkpoint_state( + pp_index=pp_index, + tp_index=tp_index, + dp_index=dp_index) + + ckpt_path = get_zero_ckpt_name_for_rank(base_folder=base_folder, + dp_rank=dp_index, + mp_rank=_2d_rank) + _save_checkpoint(ckpt_path, sd) + + +def _create_latest_file(base_folder, file_name, latest_tag): + file_path = os.path.join(base_folder, file_name) + os.makedirs(base_folder, exist_ok=True) + with open(file_path, 'w') as f: + f.write(str(latest_tag)) + + +def main(): + print(f'Convert DeepSpeed Checkpoint to DeepSpeed Checkpoint') + + args = parse_arguments() + print( + f'Converting DeepSpeed checkpoint in {args.input_folder} to DeepSpeed checkpoint in {args.output_folder}' + ) + + ds_checkpoint = DeepSpeedCheckpoint( + args.input_folder, + args.target_tp, + args.target_pp, + args.target_dp) + iteration = ds_checkpoint.get_iteration() + latest_tag = f'global_step{iteration}' + _create_latest_file(args.output_folder, + 'latest_checkpointed_iteration.txt', iteration) + _create_latest_file(args.output_folder, 'latest', latest_tag) + base_folder = os.path.join(args.output_folder, latest_tag) + + for i in range(ds_checkpoint.tp_degree): + _create_embedding_layer_checkpoint(ds_checkpoint, base_folder, i) + 
_create_final_norm_layer_checkpoint(ds_checkpoint, base_folder, i) + + for j in range(ds_checkpoint.pp_degree): + _create_transformer_layer_checkpoint(ds_checkpoint, base_folder, i, j) + _create_2d_parallel_checkpoint(ds_checkpoint, base_folder, i, j) + + for i in range(ds_checkpoint.dp_degree): + for j in range(ds_checkpoint.pp_degree): + for k in range(ds_checkpoint.tp_degree): + _create_zero_checkpoint(ds_checkpoint, base_folder, i, j, k) + + +if __name__ == "__main__": + main() diff --git a/tools/convert_checkpoint/deepspeed_to_megatron.py b/tools/convert_checkpoint/deepspeed_to_megatron.py index 017036af4..74e5ca7c9 100755 --- a/tools/convert_checkpoint/deepspeed_to_megatron.py +++ b/tools/convert_checkpoint/deepspeed_to_megatron.py @@ -13,18 +13,34 @@ ENCODER_KEY = 'encoder' WORD_EMBEDDINGS_FOR_HEAD_KEY = 'word_embeddings_for_head' WORD_EMBEDDINGS_KEY = 'word_embeddings' -FINAL_LAYER_NORM_KEY ='final_layernorm' +FINAL_LAYER_NORM_KEY = 'final_layernorm' CHECKPOINT_VERSION_KEY = 'checkpoint_version' CHECKPOINT_VERSION_VALUE = 3.0 ITERATION_KEY = 'iteration' + def parse_arguments(): parser = argparse.ArgumentParser() - parser.add_argument('--input_folder', default=None, type=str, help='Input DeepSpeed Checkpoint folder') - parser.add_argument('--output_folder', default=None, type=str, help='Output Megatron checkpoint folder') - parser.add_argument('--target_tp', default=1, type=int, help='Target TP degree') - parser.add_argument('--target_pp', default=1, type=int, help='Target PP degree') - parser.add_argument('--for_release', action='store_true', help='Convert for release purpose, reset some (progress) counters.') + parser.add_argument('--input_folder', + default=None, + type=str, + help='Input DeepSpeed Checkpoint folder') + parser.add_argument('--output_folder', + default=None, + type=str, + help='Output Megatron checkpoint folder') + parser.add_argument('--target_tp', + default=1, + type=int, + help='Target TP degree') + parser.add_argument('--target_pp', + default=1, + type=int, + help='Target PP degree') + parser.add_argument( + '--for_release', + action='store_true', + help='Convert for release purpose, reset some (progress) counters.') args = parser.parse_args() print(f'args = {args}') return args @@ -39,6 +55,7 @@ def _convert_ds_transformer_state(sd_list): return new_sd + def _create_checkpoint_paths(base_folder, iteration, tp_degree, pp_degree): path_list = [] iter_folder = f'iter_{iteration:07d}' @@ -47,18 +64,18 @@ def _create_checkpoint_paths(base_folder, iteration, tp_degree, pp_degree): for j in range(0, pp_degree): rank_folder = f'mp_rank_{i:02d}' if pp_degree == 1 else f'mp_rank_{i:02d}_{j:03d}' ckpt_path = os.path.join(rank_folder, 'model_optim_rng.pt') - path_list[i].append(os.path.join(base_folder, iter_folder, ckpt_path)) + path_list[i].append( + os.path.join(base_folder, iter_folder, ckpt_path)) return path_list def _create_megatron_dict(): - language_model_dict = { - EMBEDDING_KEY: {}, - ENCODER_KEY: {} - } + language_model_dict = {EMBEDDING_KEY: {}, ENCODER_KEY: {}} megatron_dict = { - MODEL_KEY: {LANGUGAGE_MODEL_KEY: language_model_dict}, + MODEL_KEY: { + LANGUGAGE_MODEL_KEY: language_model_dict + }, CHECKPOINT_VERSION_KEY: CHECKPOINT_VERSION_VALUE } return megatron_dict @@ -78,7 +95,11 @@ def _renest_sd(sd): return new_sd -def _create_rank_checkpoint(ds_checkpoint, checkpoint_path, tp_index, pp_index, for_release=False): +def _create_rank_checkpoint(ds_checkpoint, + checkpoint_path, + tp_index, + pp_index, + for_release=False): meg_encoder_sd = OrderedDict() 
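    # The dicts below collect, for this (tp_index, pp_index) rank, the transformer ("encoder")
    # weights, the input word embeddings (first pipeline stage only) and the word-embedding copy
    # used for the output head (last stage only) before they are re-nested into the Megatron
    # checkpoint layout.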
meg_embedding_sd = OrderedDict() meg_embedding_for_head_sd = OrderedDict() @@ -92,7 +113,7 @@ def _create_rank_checkpoint(ds_checkpoint, checkpoint_path, tp_index, pp_index, if pp_index == 0: meg_embedding_sd.update(nested_embedding_sd) - if pp_index == ds_checkpoint.pp_degree -1: + if pp_index == ds_checkpoint.pp_degree - 1: for key, value in embedding_sd.items(): if key.startswith(WORD_EMBEDDINGS_KEY): fields = key.split('.') @@ -101,7 +122,10 @@ def _create_rank_checkpoint(ds_checkpoint, checkpoint_path, tp_index, pp_index, meg_embedding_for_head_sd[new_key] = value final_norm_sd = ds_checkpoint.get_final_norm_state(tp_index) - new_final_norm_sd = {f'{FINAL_LAYER_NORM_KEY}.{key}': value for key, value in final_norm_sd.items()} + new_final_norm_sd = { + f'{FINAL_LAYER_NORM_KEY}.{key}': value + for key, value in final_norm_sd.items() + } meg_encoder_sd.update(new_final_norm_sd) checkpoint_sd = _create_megatron_dict() @@ -109,15 +133,19 @@ def _create_rank_checkpoint(ds_checkpoint, checkpoint_path, tp_index, pp_index, iteration = ds_checkpoint.get_iteration() checkpoint_sd[ITERATION_KEY] = iteration if pp_index == 0: - checkpoint_sd[MODEL_KEY][LANGUGAGE_MODEL_KEY][EMBEDDING_KEY] = meg_embedding_sd + checkpoint_sd[MODEL_KEY][LANGUGAGE_MODEL_KEY][ + EMBEDDING_KEY] = meg_embedding_sd checkpoint_sd[MODEL_KEY][LANGUGAGE_MODEL_KEY][ENCODER_KEY] = meg_encoder_sd - if pp_index == ds_checkpoint.pp_degree -1: - checkpoint_sd[MODEL_KEY][WORD_EMBEDDINGS_FOR_HEAD_KEY] = meg_embedding_for_head_sd + if pp_index == ds_checkpoint.pp_degree - 1: + checkpoint_sd[MODEL_KEY][ + WORD_EMBEDDINGS_FOR_HEAD_KEY] = meg_embedding_for_head_sd checkpoint_sd[ARGS_KEY] = ds_checkpoint.get_args() # Adjust specific fields - checkpoint_sd[ARGS_KEY].tensor_model_parallel_size = ds_checkpoint.tp_degree - checkpoint_sd[ARGS_KEY].pipeline_model_parallel_size = ds_checkpoint.pp_degree + checkpoint_sd[ + ARGS_KEY].tensor_model_parallel_size = ds_checkpoint.tp_degree + checkpoint_sd[ + ARGS_KEY].pipeline_model_parallel_size = ds_checkpoint.pp_degree if for_release: checkpoint_sd[ARGS_KEY].consumed_train_samples = 0 checkpoint_sd[ARGS_KEY].consumed_valid_samples = 0 @@ -131,20 +159,27 @@ def _create_latest_file(base_folder, iteration): with open(file_path, 'w') as f: f.write(str(iteration)) + def main(): print(f'Convert DeepSpeed Checkpoint to Megatron Checkpoint') args = parse_arguments() - print(f'Converting DeepSpeed checkpoint in {args.input_folder} to Megatron checkpoint in {args.output_folder}') + print( + f'Converting DeepSpeed checkpoint in {args.input_folder} to Megatron checkpoint in {args.output_folder}' + ) - ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp, args.target_pp) + ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp, + args.target_pp) iteration = ds_checkpoint.get_iteration() _create_latest_file(args.output_folder, iteration) - checkpoint_paths = _create_checkpoint_paths(args.output_folder, iteration, ds_checkpoint.tp_degree, ds_checkpoint.pp_degree) + checkpoint_paths = _create_checkpoint_paths(args.output_folder, iteration, + ds_checkpoint.tp_degree, + ds_checkpoint.pp_degree) for i in range(0, ds_checkpoint.tp_degree): for j in range(0, ds_checkpoint.pp_degree): sd = _create_rank_checkpoint(ds_checkpoint, i, j, args.for_release) _save_checkpoint(checkpoint_paths[i][j], sd) + if __name__ == "__main__": main() diff --git a/tools/convert_checkpoint/deepspeed_to_transformers.py b/tools/convert_checkpoint/deepspeed_to_transformers.py index 667695026..015f63a94 100755 --- 
a/tools/convert_checkpoint/deepspeed_to_transformers.py +++ b/tools/convert_checkpoint/deepspeed_to_transformers.py @@ -3,31 +3,37 @@ import os import torch import json - -from deepspeed_checkpoint import DeepSpeedCheckpoint +import sys +from pathlib import Path + +# insert megatron's root dir into sys.path +root_repo_path = str(Path(__file__).resolve().parents[2]) +if root_repo_path not in sys.path: + sys.path.insert(0, root_repo_path) + +from deepspeed.checkpoint import DeepSpeedCheckpoint from deepspeed_to_megatron import _create_rank_checkpoint, parse_arguments # the import was tested to work with this version # https://github.com/huggingface/transformers/commit/0af901e83 if it diverges we may consider # copying that version here instead -from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint import ( - convert_megatron_checkpoint, -) -from transformers import GPT2Config, AutoTokenizer +from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint import convert_megatron_checkpoint +from transformers import GPT2Config def main(): + # this first part comes mainly from deepspeed_to_megatron.main args = parse_arguments() print( - f"Converting DeepSpeed checkpoint in {args.input_folder} to HF Transformers checkpoint in {args.output_folder}" + f'Converting DeepSpeed checkpoint in {args.input_folder} to HF Transformers checkpoint in {args.output_folder}' ) - ds_checkpoint = DeepSpeedCheckpoint( - args.input_folder, args.target_tp, args.target_pp - ) - ds_args = ds_checkpoint.get_args() - input_state_dict = _create_rank_checkpoint(ds_checkpoint, 0, 0, args.for_release) + ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp, + args.target_pp) + iteration = ds_checkpoint.get_iteration() + input_state_dict = _create_rank_checkpoint(ds_checkpoint, 0, 0, + args.for_release) # the 2nd part comes from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint.main # Spell out all parameters in case the defaults change. @@ -59,13 +65,14 @@ def main(): # Convert. print("Converting to HF Checkpoint") - output_state_dict = convert_megatron_checkpoint(args, input_state_dict, config) + output_state_dict = convert_megatron_checkpoint(args, input_state_dict, + config) basename = args.output_folder os.makedirs(basename, exist_ok=True) # Print the structure of converted state dict. - # if args.print_checkpoint_structure: + #if args.print_checkpoint_structure: # recursive_print(None, output_state_dict) # Store the config to file. 
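Since the hunk below drops the automatic tokenizer export from this converter, the tokenizer files now have to be added to the Hugging Face output folder by hand. A minimal sketch of one way to do that — the folder path is a placeholder and the GPT2 vocabulary is an assumption; PretrainedFromHF-style runs would pass their own tokenizer name:

# Run once after deepspeed_to_transformers.py has finished writing the HF checkpoint.
from transformers import AutoTokenizer

output_folder = "path/to/hf_checkpoint"            # the converter's --output_folder
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # assumes the run used GPT2BPETokenizer
tokenizer.save_pretrained(output_folder)           # adds vocab.json, merges.txt, tokenizer_config.json
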
@@ -73,20 +80,6 @@ def main(): output_config = config.to_dict() output_config["architectures"] = ["GPT2LMHeadModel"] output_config["model_type"] = "gpt2" - - # Add tokenizer class info to config.json - # see https://github.com/huggingface/transformers/issues/13906) - tokenizer_type = ds_args.tokenizer_type - if tokenizer_type == "GPT2BPETokenizer": - tokenizer_model_name = "gpt2" - elif tokenizer_type == "PretrainedFromHF": - tokenizer_model_name = ds_args.tokenizer_name_or_path - else: - raise ValueError(f"Unrecognized tokenizer_type {tokenizer_type}") - tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_name) - tokenizer_class = type(tokenizer).__name__ - output_config["tokenizer_class"] = tokenizer_class - print(f'Saving config to "{output_config_file}"') with open(output_config_file, "w") as f: json.dump(output_config, f) @@ -96,9 +89,7 @@ def main(): print(f'Saving checkpoint to "{output_checkpoint_file}"') torch.save(output_state_dict, output_checkpoint_file) - # Save tokenizer based on args - print(f"Adding {tokenizer_class} tokenizer files") - tokenizer.save_pretrained(basename) + print("Now add tokenizer files and upload to the hub") if __name__ == "__main__": diff --git a/tools/convert_checkpoint/ds_to_universal.py b/tools/convert_checkpoint/ds_to_universal.py new file mode 100755 index 000000000..9a5dd1154 --- /dev/null +++ b/tools/convert_checkpoint/ds_to_universal.py @@ -0,0 +1,336 @@ +#!/usr/bin/env python + +from collections import OrderedDict +from copy import deepcopy +from email.policy import default +from functools import partial +from pathlib import Path +from pprint import pprint +import argparse +import glob +import itertools +import logging +import multiprocessing +import os +import re +import shutil +import sys +import torch +import tqdm + +# insert megatron's root dir into sys.path +root_repo_path = str(Path(__file__).resolve().parents[2]) +if root_repo_path not in sys.path: + sys.path.insert(0, root_repo_path) + + +from deepspeed.checkpoint import DeepSpeedCheckpoint + +MODEL_KEY = 'model' +ARGS_KEY = 'args' +LANGUGAGE_MODEL_KEY = 'language_model' +EMBEDDING_KEY = 'embedding' +ENCODER_KEY = 'encoder' +WORD_EMBEDDINGS_FOR_HEAD_KEY = 'word_embeddings_for_head' +WORD_EMBEDDINGS_KEY = 'word_embeddings' +FINAL_LAYER_NORM_KEY = 'final_layernorm' +CHECKPOINT_VERSION_KEY = 'checkpoint_version' +CHECKPOINT_VERSION_VALUE = 3.0 +ITERATION_KEY = 'iteration' + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--input_folder', + type=str, + help='Input DeepSpeed Checkpoint folder') + parser.add_argument('--output_folder', + type=str, + help='Output Megatron checkpoint folder') + parser.add_argument('--target_tp', + default=1, + type=int, + help='Target TP degree') + parser.add_argument('--target_pp', + default=1, + type=int, + help='Target PP degree') + parser.add_argument('--num_extract_workers', + default=4, + type=int, + help='How many parallel processes to extract zero shards') + parser.add_argument('--num_merge_workers', + default=2, + type=int, + help='How many parallel processes to merge tp slices (more memory intensive, use much fewer than --num_extract_workers))') + parser.add_argument( + '--for_release', + action='store_true', + help='Convert for release purpose, reset some (progress) counters.') + args = parser.parse_args() + print(f'args = {args}') + return args + + +def _convert_ds_transformer_state(sd_list): + new_sd = OrderedDict() + for i, sd in enumerate(sd_list): + for key, value in sd.items(): + new_key = 
f'layers.{i}.{key}' + new_sd[new_key] = value + + return new_sd + + +def _create_checkpoint_paths(base_folder, iteration, tp_degree, pp_degree): + path_list = [] + iter_folder = f'iter_{iteration:07d}' + for i in range(0, tp_degree): + path_list.append([]) + for j in range(0, pp_degree): + rank_folder = f'mp_rank_{i:02d}' if pp_degree == 1 else f'mp_rank_{i:02d}_{j:03d}' + ckpt_path = os.path.join(rank_folder, 'model_optim_rng.pt') + path_list[i].append( + os.path.join(base_folder, iter_folder, ckpt_path)) + + return path_list + + +def _create_megatron_dict(): + language_model_dict = {EMBEDDING_KEY: {}, ENCODER_KEY: {}} + megatron_dict = { + MODEL_KEY: { + LANGUGAGE_MODEL_KEY: language_model_dict + }, + CHECKPOINT_VERSION_KEY: CHECKPOINT_VERSION_VALUE + } + return megatron_dict + + +def _save_checkpoint(file_path, chkpt_sd): + dir, _ = os.path.split(file_path) + os.makedirs(dir, exist_ok=True) + torch.save(chkpt_sd, file_path) + + + +def extract_zero_shards(dir, slice_shapes, ds_checkpoint, indices_3D): + pp_index, tp_index, dp_index = indices_3D + sd = ds_checkpoint.get_zero_checkpoint_state( + pp_index=pp_index, + tp_index=tp_index, + dp_index=dp_index) + + #pprint(f"Processing {dp_index=} {pp_index=}, {tp_index=}") + + optim_sd = sd["optimizer_state_dict"] + param_slice_mappings = optim_sd["param_slice_mappings"] + + # dict + state_groups = optim_sd["base_optimizer_state"]["state"] + # list + fp32_groups = optim_sd["single_partition_of_fp32_groups"] + param_groups_cnt = len(state_groups) + + for param_group_id in range(param_groups_cnt): + + flat_state = dict( + exp_avg=state_groups[param_group_id]["exp_avg"], + exp_avg_sq=state_groups[param_group_id]["exp_avg_sq"], + fp32=fp32_groups[param_group_id], + ) + + for name,fragment_mapping in param_slice_mappings[param_group_id].items(): + if "word_embeddings.weight" in name and pp_index > 0: + # Skip tied weights that are replicated in first and last pp stages + continue + + #print(f"{param_group_id} {name} => {fragment_mapping.start}:{fragment_mapping.numel}") + for state_key in flat_state.keys(): + dump_param_fragment(dir, tp_index, dp_index, state_key, flat_state[state_key], name, fragment_mapping.start, fragment_mapping.numel) + + + + +cnt = 0 +def dump_param_fragment(dir, tp_index, dp_index, state_name, state_flat_tensor, param_name, offset, numel): + + global cnt # temp hack + + param_base_path = os.path.join(dir, param_name, str(tp_index)) + os.makedirs(param_base_path, exist_ok=True) + + cnt += 1 + counter = f"{dp_index:0>2d}" + + path = os.path.join(param_base_path, f"{state_name}.{counter}") + + #print(f"{param_name}: {offset}: {numel} => {path}") + + t = state_flat_tensor.narrow(0, offset, numel) + _save_checkpoint(path, t) + + +def _merge_zero_shards(param_base_path, state, tp_degree, slice_shape): + slices = [] + for tp_index in range(tp_degree): + prefix_path = os.path.join(param_base_path, str(tp_index), f"{state}") + paths = sorted(list(glob.glob(f"{prefix_path}.0*"))) + #print(paths) + shards = [torch.load(p) for p in paths] + slice = torch.cat(shards, dim=0).reshape(slice_shape) + slices.append(slice) + + return slices + + +ORIGINAL_VOCAB_SIZE = 'original_vocab_size' +def _strip_vocab_padding(ds_checkpoint, padded_vocab_tensor): + checkpoint_info = ds_checkpoint.get_checkpoint_info() + padding_tensor = padded_vocab_tensor.narrow(0, checkpoint_info[ORIGINAL_VOCAB_SIZE], padded_vocab_tensor.shape[0]-checkpoint_info[ORIGINAL_VOCAB_SIZE]) + #print(f'{padded_vocab_tensor[checkpoint_info[ORIGINAL_VOCAB_SIZE]-3:,:]=}') + return 
padded_vocab_tensor.narrow(0, 0, checkpoint_info[ORIGINAL_VOCAB_SIZE]) + + +WEIGHTS_TO_AVERAGE_PATTERNS = [ + r"tied_modules.embed.word_embeddings.norm.weight", + r"tied_modules.embed.word_embeddings.norm.bias", + r"\d+.input_layernorm.weight", + r"\d+.input_layernorm.bias", + r"\d+.post_attention_layernorm.weight", + r"\d+.post_attention_layernorm.bias", + r"\d+.self_attention.dense.bias", + r"\d+.mlp.dense_4h_to_h.bias", + r"\d+.weight", + r"\d+.bias", +] + +WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN = [ + "dense_4h_to_h.weight", + "self_attention.dense.weight", +] + + +def _get_vocab_divisibility_padding_tensor(ds_checkpoint, padded_vocab_tensor): + checkpoint_info = ds_checkpoint.get_checkpoint_info() + if padded_vocab_tensor.shape[0] > checkpoint_info[ORIGINAL_VOCAB_SIZE]: + return padded_vocab_tensor[-1] + else: + return torch.zeros(padded_vocab_tensor.shape[1]) + +def merge_tp_slices(ds_checkpoint, dir, slice_dir, tp_degree, name_and_shape): + name, shape = name_and_shape + slice_base_path = os.path.join(slice_dir, name) + param_base_path = os.path.join(dir, name) + + for state in ("fp32", "exp_avg", "exp_avg_sq"): + slices = _merge_zero_shards(slice_base_path, state, tp_degree, shape) + final_path = os.path.join(param_base_path, f"{state}.pt") + + #print(f"Expected shape: {shape}") + #print(f"Fragment sizes:", list(frag.shape for frag in slices)) + ckpt_dict = {} + if any(re.match(pattern, name) for pattern in WEIGHTS_TO_AVERAGE_PATTERNS): + param = sum(slices) / len(slices) + else: + cat_dim = 1 if any(text in name for text in WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN) else 0 + #print(f"CAT DIM: {cat_dim}") + param = torch.cat(slices, dim=cat_dim) + ckpt_dict['cat_dim'] = cat_dim + + if "word_embeddings.weight" in name: + #print(f"Before {param.shape=}") + # strip padding + #param = _strip_vocab_padding(ds_checkpoint, param) + ckpt_dict['vocab_divisibility_padding_tensor'] = _get_vocab_divisibility_padding_tensor(ds_checkpoint, param) + #print(f"After {param.shape=}") + + #print(f"Final shape: {param.shape}") + ckpt_dict['param'] = param + _save_checkpoint(final_path, ckpt_dict) + + + + + + +def _get_chunks(l, n): + for i in range(0, len(l), n): + yield l[i:i + n] + + +def _do_parallel_work(do_work, work_chunks, num_workers): + pool = multiprocessing.Pool(num_workers) + for batch in tqdm.tqdm(work_chunks): + pool.map(do_work, batch) + pool.close() + pool.join() + +def _extract_zero_shard_files(args, ds_checkpoint, slice_shapes, temp_dir): + _3d_range_list = list(itertools.product(range(ds_checkpoint.pp_degree), range(ds_checkpoint.tp_degree), range(ds_checkpoint.dp_degree))) + #pprint(_3d_range_list) + work_chunks = list(_get_chunks(_3d_range_list, args.num_extract_workers)) + #pprint(work_chunks) + + do_work = partial(extract_zero_shards, temp_dir, slice_shapes, ds_checkpoint) + _do_parallel_work(do_work, work_chunks, args.num_extract_workers) + + + +def _merge_tp_slice_files(args, ds_checkpoint, slice_shapes, temp_dir): + work_chunks = list(_get_chunks(list(slice_shapes.items()), args.num_merge_workers)) + #pprint(work_chunks) + zero_output_folder = os.path.join(args.output_folder, "zero") + do_work = partial(merge_tp_slices, ds_checkpoint, zero_output_folder, temp_dir, ds_checkpoint.tp_degree) + _do_parallel_work(do_work, work_chunks, args.num_merge_workers) + + + +def main(): + print(f'Convert DeepSpeed Checkpoint to Universal Checkpoint') + + args = parse_arguments() + print( + f'Converting DeepSpeed checkpoint in {args.input_folder} to Universal checkpoint in {args.output_folder}' + ) 
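+    # Rough shape of the output (assuming default paths): the extraction phase writes per-DP ZeRO
+    # fragments under <output_folder>/tmp/<param_name>/<tp_index>/, and the merge phase then turns
+    # those into <output_folder>/zero/<param_name>/{fp32,exp_avg,exp_avg_sq}.pt, each holding the
+    # TP-merged tensor (concatenated, or averaged for the layernorm/bias patterns listed above).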
+ + ds_checkpoint = DeepSpeedCheckpoint(args.input_folder)#, 1, 2) # args.target_tp, args.target_pp) + + iteration = ds_checkpoint.get_iteration() + #_create_latest_file(args.output_folder, iteration) + checkpoint_paths = _create_checkpoint_paths(args.output_folder, iteration, + ds_checkpoint.tp_degree, + ds_checkpoint.pp_degree) + + slice_shapes = [] + for mp_rank_file in ds_checkpoint.mp_rank_files: + mp_sd = torch.load(mp_rank_file, map_location=torch.device('cpu')) + slice_shapes += mp_sd["param_shapes"] + + # fix back to normal flat dict, merge duplicates for tp>1 + slice_shapes = dict((k,v) for d in slice_shapes for k,v in d.items() ) + temp_dir = os.path.join(args.output_folder, 'tmp') + + print('*** 1. Extracting ZeRO fragments') + _extract_zero_shard_files(args, ds_checkpoint, slice_shapes, temp_dir) + + print('*** 2. Merging slices') + _merge_tp_slice_files(args, ds_checkpoint, slice_shapes, temp_dir) + + shutil.rmtree(temp_dir, ignore_errors=True) + + # Copy mp* files into output folder + for f in glob.glob(os.path.join(args.input_folder, 'mp*')): + shutil.copy2(f, args.output_folder) + + # Update latest to output folder + checkpoint_root_folder, step_folder = os.path.split(args.output_folder) + latest_file = os.path.join(checkpoint_root_folder, 'latest_universal') + with open(latest_file, "w") as f: + f.write(step_folder) + + print('*** Done!') + + +if __name__ == "__main__": + main() diff --git a/tools/convert_checkpoint/inspect_checkpoint.py b/tools/convert_checkpoint/inspect_checkpoint.py index 5ee955bb4..bee217f5c 100644 --- a/tools/convert_checkpoint/inspect_checkpoint.py +++ b/tools/convert_checkpoint/inspect_checkpoint.py @@ -1,13 +1,19 @@ -import torch import sys +import torch import os from collections import OrderedDict +from pathlib import Path + +# insert megatron's root dir into sys.path +root_repo_path = str(Path(__file__).resolve().parents[2]) +if root_repo_path not in sys.path: + sys.path.insert(0, root_repo_path) def dump_data(datum, name_list=[]): if type(datum) in (dict, OrderedDict): for k, v in datum.items(): - dump_data(v, name_list+[str(k)]) + dump_data(v, name_list + [str(k)]) elif type(datum) in (list, tuple): for v in datum: dump_data(v, name_list) @@ -15,10 +21,11 @@ def dump_data(datum, name_list=[]): prefix = '.'.join(name_list) print(f'[tensor] {prefix} = {datum.shape}') else: - #pass + #pass prefix = '.'.join(name_list) print(f'[other] {prefix} = {datum}') + def main(): if len(sys.argv) < 2: print(f'Usage: {sys.argv[0]} ') @@ -30,7 +37,7 @@ def main(): exit(1) print(f'loading checkpoint file: {ckpt_file}') - sd = torch.load(ckpt_file) + sd = torch.load(ckpt_file, map_location=torch.device('cpu')) dump_data(sd) quit() diff --git a/tools/convert_checkpoint/inspect_deepspeed_checkpoint.py b/tools/convert_checkpoint/inspect_deepspeed_checkpoint.py index 3125f7d9a..09fa60991 100644 --- a/tools/convert_checkpoint/inspect_deepspeed_checkpoint.py +++ b/tools/convert_checkpoint/inspect_deepspeed_checkpoint.py @@ -1,19 +1,39 @@ +import sys +from pathlib import Path + +# insert megatron's root dir into sys.path +root_repo_path = str(Path(__file__).resolve().parents[2]) +if root_repo_path not in sys.path: + sys.path.insert(0, root_repo_path) + import argparse -from deepspeed_checkpoint import DeepSpeedCheckpoint + +from deepspeed.checkpoint import DeepSpeedCheckpoint + def list_files(file_list, tag): print(f'Listing files: {tag}') for i, file in enumerate(file_list): print(f'{i+1}: {file}') + def parse_arguments(): parser = argparse.ArgumentParser() - 
parser.add_argument('--folder', default=None, type=str, help='DeepSpeed Checkpoint folder') - parser.add_argument('--target_tp', default=None, type=int, help='Target TP degree') - parser.add_argument('--target_pp', default=None, type=int, help='Target PP degree') + parser.add_argument('--folder', + default=None, + type=str, + help='DeepSpeed Checkpoint folder') + parser.add_argument('--target_tp', + default=None, + type=int, + help='Target TP degree') + parser.add_argument('--target_pp', + default=None, + type=int, + help='Target PP degree') args = parser.parse_args() print(f'args = {args}') - return args + return args def show_input_files(ds_checkpoint): @@ -22,38 +42,52 @@ def show_input_files(ds_checkpoint): list_files(ds_checkpoint.layer_files, 'layer') list_files(ds_checkpoint.mp_rank_files, 'mp rank') + def show_simple_state(ds_checkpoint): print(f'layer keys = {ds_checkpoint.layer_keys}') print(f'layer count = {ds_checkpoint.layer_count}') - print(f'tp_degree_count = {ds_checkpoint.tp_degree}') - print(f'pp_degree_count = {ds_checkpoint.pp_degree}') + print( + f'tp_degree_count = {ds_checkpoint.original_tp_degree} ------> {ds_checkpoint.tp_degree}' + ) + print( + f'pp_degree_count = {ds_checkpoint.original_pp_degree} ------> {ds_checkpoint.pp_degree}' + ) print(f'dp_degree_count = {ds_checkpoint.dp_degree}') + ds_checkpoint.old_2d_map.print_data('old 2d map ==>') + ds_checkpoint.new_2d_map.print_data('new 2d map ==>') + def show_mappings(ds_checkpoint): ds_checkpoint.show_pp_tranformer_map() ds_checkpoint.show_transformer_file_map() ds_checkpoint.show_tp_embedding_map() ds_checkpoint.show_tp_final_norm_map() + ds_checkpoint.show_2d_mapping() + def show_state_summary(tag, sd): - summary = {k:v.shape for k,v in sd.items()} + summary = {k: v.shape for k, v in sd.items()} print(f'{tag} = {summary}') + def show_embedding_states(ds_checkpoint): for i in range(0, ds_checkpoint.tp_degree): sd = ds_checkpoint.get_embedding_state(i) show_state_summary(f'embedding[{i}]', sd) + def show_final_norm_states(ds_checkpoint): for i in range(0, ds_checkpoint.tp_degree): sd = ds_checkpoint.get_final_norm_state(i) show_state_summary(f'final_norm[{i}]', sd) + def show_transformer_states(ds_checkpoint): for i in range(0, ds_checkpoint.tp_degree): for j in range(0, ds_checkpoint.pp_degree): - state_list = ds_checkpoint.get_transformer_state(tp_index=i, pp_index=j) + state_list = ds_checkpoint.get_transformer_state(tp_index=i, + pp_index=j) print(f'tp_pp_rank[{i},{j}] = ') for k, sd in enumerate(state_list): show_state_summary(f' block[{k}]', sd) @@ -64,9 +98,11 @@ def main(): print(f'Inspecting DeepSpeed Checkpoint') args = parse_arguments() - ds_checkpoint = DeepSpeedCheckpoint(args.folder, args.target_tp, args.target_pp) + ds_checkpoint = DeepSpeedCheckpoint(args.folder, args.target_tp, + args.target_pp) ds_checkpoint.validate_files() - + + show_simple_state(ds_checkpoint) show_input_files(ds_checkpoint) show_simple_state(ds_checkpoint) show_mappings(ds_checkpoint) @@ -76,5 +112,6 @@ def main(): checkpoint_args = ds_checkpoint.get_args() print(f'checkpoint args = {checkpoint_args}') + if __name__ == "__main__": main()
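
A quick, illustrative way to sanity-check what ds_to_universal.py wrote is to load a few of the merged slices from its zero/ sub-folder; the directory below is a placeholder for whatever --output_folder was used, and the dict keys match what merge_tp_slices saves above.

# Minimal sketch: print the shape of the first few merged fp32 slices.
import glob
import os
import torch

universal_dir = "checkpoints/gpt2_universal/global_step1000/zero"  # hypothetical output folder
for path in sorted(glob.glob(os.path.join(universal_dir, "*", "fp32.pt")))[:5]:
    name = os.path.basename(os.path.dirname(path))        # parameter name is the folder name
    ckpt = torch.load(path, map_location="cpu")            # {'param': tensor, optional 'cat_dim', ...}
    print(f"{name}: shape={tuple(ckpt['param'].shape)} cat_dim={ckpt.get('cat_dim')}")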