From 290be18ce4bc5ac22a08f8bb6ee7e14966e407a8 Mon Sep 17 00:00:00 2001 From: sYc Date: Sun, 30 Apr 2023 10:44:44 +0800 Subject: [PATCH 01/43] Added ROUGE-L evaluation. Added ROUGE-L evaluation. --- src/lmflow/pipeline/evaluator.py | 439 ++++++++++++++++--------------- 1 file changed, 234 insertions(+), 205 deletions(-) diff --git a/src/lmflow/pipeline/evaluator.py b/src/lmflow/pipeline/evaluator.py index 217355b38..1c6d4c0cb 100644 --- a/src/lmflow/pipeline/evaluator.py +++ b/src/lmflow/pipeline/evaluator.py @@ -1,15 +1,18 @@ -"""The Evaluator class simplifies the process of running evaluation on a language model provided by a HFDecoderModel instance imported from the lmflow package. The class constructor takes three dictionaries as arguments: model_args containing arguments related to the language model, data_args containing arguments related to the data used for evaluation, and evaluator_args containing other arguments for the evaluation process. +"""The Evaluator class simplifies the process of running evaluation on a language model provided by a HFDecoderModel instance imported from the LMFlow-main package. The class constructor takes three dictionaries as arguments: model_args containing arguments related to the language model, data_args containing arguments related to the data used for evaluation, and evaluator_args containing other arguments for the evaluation process. The class has two methods: create_dataloader() that loads the data from the test file, creates a data loader, and returns it with the size of the data, and evaluate(model) that generates output text given input text. It uses the create_dataloader() method to load the data, iterates over the data in mini-batches, and encodes the input text with the encode() method of the HFDecoderModel class. Then, it generates output text using the evaluate() method of the HFDecoderModel class, decodes the generated output text using the decode() method of the HFDecoderModel class, and writes the output to a file in the output directory. The method also logs some information to the console and Weights and Biases if the use_wandb argument is True. 
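+
+A minimal usage sketch (here `model` is an HFDecoderModel instance and `dataset` is a Dataset object; how they are constructed is assumed to follow a driver script such as examples/evaluate.py and is not shown here):
+
+    evaluator = Evaluator(model_args, data_args, evaluator_args)
+    evaluator.evaluate(model, dataset, metric="rouge-l")  # or "accuracy", "ppl", "nll"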
""" import os +# import deepspeed import torch import wandb -import deepspeed +#import deepspeed import sys import numpy as np import datetime import json +from rouge_score import rouge_scorer +from multiprocessing import Pool # TODO: remove later from transformers import AutoConfig import torch.distributed as dist @@ -42,7 +45,11 @@ def __init__(self, model_args, data_args, evaluator_args): self.data_args = data_args self.evaluator_args = evaluator_args self.model_args = model_args - + print("--------Begin Evaluator Arguments----------") + print(f"model_args : {self.model_args}") + print(f"data_args : {self.data_args}") + print(f"evaluator_args : {self.evaluator_args}") + print("--------End Evaluator Arguments----------") # logger if(self.evaluator_args.use_wandb == True): wandb.init(project="lmflow_evaluation") @@ -62,7 +69,7 @@ def __init__(self, model_args, data_args, evaluator_args): print(f"model_hidden_size = {self.model_hidden_size}") # batch size has to be divisible by world_size, but can be bigger than world_size - train_batch_size = self.evaluator_args.inference_batch_size_per_device * self.world_size + train_batch_size = 1 * self.world_size self.evaluator_args.minibatch_size = train_batch_size self.block_size = evaluator_args.evaluate_block_size # dataloader, data_size = create_dataloader(args) # load dataset @@ -86,12 +93,26 @@ def create_dataloader(self, dataset: Dataset): self.evaluator_args.minibatch_size, self.evaluator_args.random_shuffle ) - print(f"Successfully create dataloader with size {len(dataloader)},batch_size {self.evaluator_args.minibatch_size}.") - + print(f"Successfully create dataloader with size {len(dataloader)}.") return dataloader, dataset_size # TODO: Split for better unittest + def _calculate_rouge_l(self, predicted_answer, groundtruth, scorer: rouge_scorer.RougeScorer, answer_type=None): + case_insensitive_types = [ + "strategyqa", + "coin_flip", + "pubmedqa", + "binary_choice", + "medmcqa", + "usmle", + ] + if answer_type in case_insensitive_types: + rouge_score = scorer.score(groundtruth.lower(), predicted_answer.lower())["rougeL"].fmeasure + else: + rouge_score = scorer.score(groundtruth, predicted_answer)["rougeL"].fmeasure + return rouge_score + def _match(self, predicted_answer, groundtruth, answer_type=None): case_insensitive_types = [ @@ -109,13 +130,7 @@ def _match(self, predicted_answer, groundtruth, answer_type=None): return False - def evaluate( - self, - model, - dataset: Dataset, - metric = "accuracy", - verbose=True, - ): + def evaluate(self, model, dataset: Dataset, metric = "accuracy"): """ Perform Evaluation for a model @@ -129,125 +144,117 @@ def evaluate( """ if metric in ["acc", "accuracy"]: - acc = self._evaluate_acc(model, dataset, verbose=verbose) - print(f"Evaluating final accuracy: {acc}") - return acc - elif metric in ["ppl", "perplexity"]: - ppl = self._evaluate_ppl(model, dataset, verbose=verbose) - print(f"Evaluating final perplexity: {ppl}") - return ppl - elif metric in ["nll", "neg_log_likelihood"]: - nll = self._evaluate_nll(model, dataset, verbose=verbose) - print(f"Evaluating final negative log likelihood: {nll}") - return nll - else: - raise NotImplementedError(f"metric {metric} is not supported") - - - def _evaluate_acc(self, model, dataset, verbose=True): - dataloader, data_size = self.create_dataloader(dataset) - - if not dist.is_initialized() or dist.get_rank() == 0: - if not os.path.exists(self.evaluator_args.output_dir): - os.makedirs(self.evaluator_args.output_dir) - output_writer = 
open(f"{self.evaluator_args.output_dir}/evaluation.json", "w") + dataloader, data_size = self.create_dataloader(dataset) # data_size = number of mini-batches - acc_list = [] - total = 0 - for batch_index, batch in enumerate(dataloader): - if batch_index * self.world_size >= self.data_args.max_eval_samples: - break - if batch_index * self.world_size >= self.data_args.max_eval_samples: - break - if self.local_rank*self.evaluator_args.inference_batch_size_per_device >= len(batch): - current_batch = batch[:self.evaluator_args.inference_batch_size_per_device] - else: - current_batch = batch[self.local_rank*self.evaluator_args.inference_batch_size_per_device:(self.local_rank+1)*self.evaluator_args.inference_batch_size_per_device] - prompt_structure = self.evaluator_args.prompt_structure - input = [prompt_structure.format(input=i['input']) for i in current_batch] - output = [i['output'] for i in current_batch] - input_idx = [i['input_idx'] for i in current_batch] - batch_input = model.encode(input, return_tensors="pt",padding=True).to(device=self.local_rank) - inputs = batch_input['input_ids'] - mask = batch_input['attention_mask'] - outputs = model.inference(inputs, max_new_tokens=100,attention_mask=mask,temperature=0.0) - text_out = model.decode(outputs, skip_special_tokens=True) - # # only return the generation, trucating the input - decoded_input = model.decode(inputs, skip_special_tokens=True,) - prompt_length = [len(i) for i in decoded_input] - text_out = [text_out[i][prompt_length[i]:] for i in range(len(text_out))] - answer_type = self.evaluator_args.answer_type - pred_answer = [] - for i in text_out: - pred_answer.append(answer_extraction( - i, + if not dist.is_initialized() or dist.get_rank() == 0: + if not os.path.exists(self.evaluator_args.output_dir): + os.makedirs(self.evaluator_args.output_dir) + output_writer = open(f"{self.evaluator_args.output_dir}/evaluation.json", "w") + + acc_list = [] + total = 0 + # ds_engine = deepspeed.initialize(model=model.get_model(), config_params=self.ds_config)[0] + # ds_engine.module.eval() + for batch_index, batch in enumerate(dataloader): + if batch_index * self.world_size >= self.data_args.max_eval_samples: + break + if self.local_rank >= len(batch): + current_batch = batch[0] + else: + # the batch in current process + current_batch = batch[self.local_rank] + + prompt_structure = self.evaluator_args.prompt_structure + input = prompt_structure.format(input=current_batch['input']) + output = current_batch['output'] + input_idx = current_batch['input_idx'] + + inputs = model.encode(input, return_tensors="pt").to(device=self.local_rank) + + + # with torch.no_grad(): + # outputs = ds_engine.module.generate(inputs, synced_gpus=True, pad_token_id=model.get_tokenizer().eos_token_id, min_length=5, max_length=100,temperature=0.0, do_sample=False) + outputs = model.inference(inputs, max_new_tokens=100, temperature=0.0) + text_out = model.decode(outputs[0], skip_special_tokens=True) + + # # only return the generation, truncating the input + prompt_length = len(model.decode(inputs[0], skip_special_tokens=True,)) + text_out = text_out[prompt_length:] + answer_type = self.evaluator_args.answer_type + pred_answer = answer_extraction( + text_out, answer_type=answer_type, - )) - if verbose: + ) print(f"batch_index{batch_index} rank{self.local_rank}:\n question={input}\n prediction={text_out}\n") print(f"predicted answer: {pred_answer} \n") print(f"groundtruth answer: {output} \n") - if self.local_rank >= len(batch): # for last batch, the padding examples are 
ignored and donot contribute to the accuracy - correct_ = 0 - total_ = 1 - total -= 1 - else: - correct_ = 0 - total_ = 0 - for i in range(len(pred_answer)): - total_ += 1 - if self._match(pred_answer[i], output[i], answer_type): - correct_ += 1 - - # collect accuracy from all gpus - all_process = torch.tensor([correct_, total_], dtype=torch.float32, device=self.local_rank) - dist.all_reduce(all_process, dist.ReduceOp.SUM, async_op=False) - correct_, total_ = all_process.tolist() - avg = correct_ / total_ - acc_list.append(avg) - total += total_ - - # collect predictions from all gpus - output_dict = {"question": input, - "prediction": text_out, - "pred_answer": pred_answer, - "answer": output} - all_process_list = [{}] * self.world_size - - dist.gather_object(output_dict, all_process_list if dist.get_rank() == 0 else None, dst=0) - if not dist.is_initialized() or dist.get_rank() == 0: + if self.local_rank >= len(batch): # for last batch, the padding examples are ignored and donot contribute to the accuracy + correct_ = 0 + total_ = 0 + else: + correct_ = 0 + total_ = 1 + if self._match(pred_answer, output, answer_type): + correct_ = 1 + + # collect accuracy from all gpus + all_process = torch.tensor([correct_, total_], dtype=torch.float32, device=self.local_rank) + dist.all_reduce(all_process, dist.ReduceOp.SUM, async_op=False) + correct_, total_ = all_process.tolist() + avg = correct_ / total_ + acc_list.append(avg) + total += total_ + + # collect predictions from all gpus + output_dict = {"question": input, + "prediction": text_out, + "pred_answer": pred_answer, + "answer": output} + all_process_list = [{}] * self.world_size + + dist.gather_object(output_dict, all_process_list if dist.get_rank() == 0 else None, dst=0) # 只收集process 0?? + if not dist.is_initialized() or dist.get_rank() == 0: + current_accuracy = np.mean(acc_list) + print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "{}/ {} has been finished, current accuracy = {}".format(int(total), data_size, current_accuracy)) + + if(self.evaluator_args.use_wandb == True): + wandb.log({"Accuracy": current_accuracy}) + + for index, output in enumerate(all_process_list): + output_json = json.dumps(output) + output_writer.write(output_json + '\n') + + if not dist.is_initialized() or dist.get_rank() == 0: # 此刻已经处理完dataset current_accuracy = np.mean(acc_list) - print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "{}/ {} has been finished, current accuracy = {}".format(int(total), data_size, current_accuracy)) - - if(self.evaluator_args.use_wandb == True): - wandb.log({"Accuracy": current_accuracy}) - - for index, output in enumerate(all_process_list): - output_json = json.dumps(output) - output_writer.write(output_json + '\n') - - if not dist.is_initialized() or dist.get_rank() == 0: - current_accuracy = np.mean(acc_list) - print("Final accuracy = ", current_accuracy) - output_writer.close() - return current_accuracy + print("Final accuracy = ", current_accuracy) + output_writer.close() + elif metric in ["ppl", "perplexity"]: + ppl = self._evaluate_ppl(model, dataset) + print(f"Evaluating final ppl: {ppl}") + elif metric in ["nll", "neg_log_likelihood"]: + neg_log_likelihood = self._evaluate_neg_log_likelihood(model, dataset) + print(f"Evaluating final negative log likelihood: {neg_log_likelihood}") + elif metric in ["rl", "rouge-l", "ROUGE-L"]: + rl = self._evaluate_rouge-l(model, dataset) + print(f"Evaluating final ROUGE-L: {rl}") + else: + raise NotImplementedError(f"{metric} is not implemented or not match with our 
defined metrics") - def _evaluate_ppl(self, model, dataset: Dataset, verbose=True): + def _evaluate_ppl(self, model, dataset: Dataset): data_dict = dataset.to_dict() if data_dict['type'] == 'text2text': raise NotImplementedError("ppl evaluation is currently not supported for text2text dataset, please use text_only dataset.") texts = [ instance["text"] for instance in data_dict["instances"] ] - encodings = model.get_tokenizer()("\n\n".join(texts), return_tensors="pt") + encodings = model.get_tokenizer()("\n\n".join(texts), return_tensors="pt") # seems no need for rouge-L # Define some constant try: max_length = min(model.get_backend_model().config.n_positions, model.get_max_length()) except: max_length = min(1024, model.get_max_length()) - if verbose: - print(f"The maximum sequence length : {max_length}") + print(f"The maximum sequence length : {max_length}") seq_len = encodings.input_ids.size(1) nlls = [] @@ -268,20 +275,98 @@ def _evaluate_ppl(self, model, dataset: Dataset, verbose=True): nlls.append(neg_log_likelihood) prev_end_loc = end_loc - if verbose: - print(f"Evaluating PPL: {int(begin_loc/self.block_size) + 1} / {int(seq_len/self.block_size)} Complete, current ppl : {torch.exp(torch.stack(nlls).mean())}") + print(f"Evaluating PPL: {int(begin_loc/self.block_size) + 1} / {int(seq_len/self.block_size)} Complete, current ppl : {torch.exp(torch.stack(nlls).mean())}") if end_loc == seq_len: break ppl = torch.exp(torch.stack(nlls).mean()) return ppl + # Added for ROUGE-L evaluation + def _evaluate_rouge_l(self, model, dataset: Dataset): # alpaca dataset: [{instruction: "...", input: "...", output: "...", text: "..."}, ....] + dataloader, data_size = self.create_dataloader(dataset) + + if not dist.is_initialized() or dist.get_rank() == 0: + if not os.path.exists(self.evaluator_args.output_dir): + os.makedirs(self.evaluator_args.output_dir) + output_writer = open(f"{self.evaluator_args.output_dir}/evaluation.json", "w") + + rl_list = [] + total = 0 + for batch_index, batch in enumerate(dataloader): + if batch_index * self.world_size >= self.data_args.max_eval_samples: + break + if self.local_rank >= len(batch): + current_batch = batch[0] + else: + # the batch in current process + current_batch = batch[self.local_rank] + + prompt_structure = self.evaluator_args.prompt_structure + input = prompt_structure.format(input=current_batch['input']) + output = current_batch['output'] + input_idx = current_batch['input_idx'] + + inputs = model.encode(input, return_tensors="pt").to(device=self.local_rank) + outputs = model.inference(inputs, max_new_tokens=100, temperature=0.0) + text_out = model.decode(outputs[0], skip_special_tokens=True) + + # only return the generation, truncating the input + prompt_length = len(model.decode(inputs[0], skip_special_tokens=True, )) + text_out = text_out[prompt_length:] + answer_type = self.evaluator_args.answer_type + pred_answer = answer_extraction( + text_out, + answer_type=answer_type, + ) + print(f"batch_index{batch_index} rank{self.local_rank}:\n question={input}\n prediction={text_out}\n") + print(f"predicted answer: {pred_answer} \n") + print(f"groundtruth answer: {output} \n") + + scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False) # stemmer: stem the words to their root form + + if self.local_rank >= len( + batch): # for last batch, the padding examples are ignored and do not contribute to the ROUGE-L + rl_ = 0 + total_ = 0 + else: + rl_ = max(0, self._calculate_rouge_l(pred_answer, output, scorer, answer_type)) + total_ = 1 + + # collect 
rouge-l from all gpus + all_process = torch.tensor([rl_, total_], dtype=torch.float32, device=self.local_rank) + dist.all_reduce(all_process, dist.ReduceOp.SUM, async_op=False) + rl_sum, total_ = all_process.tolist() + avg = rl_sum / total_ + rl_list.append(avg) + total += total_ + + # collect predictions from all gpus + output_dict = {"question": input, + "prediction": text_out, + "pred_answer": pred_answer, + "answer": output} + all_process_list = [{}] * self.world_size + + dist.gather_object(output_dict, all_process_list if dist.get_rank() == 0 else None, dst=0) # 只收集process 0?? + if not dist.is_initialized() or dist.get_rank() == 0: + current_rouge_l = np.mean(rl_list) + print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "{}/ {} has been finished, current ROUGE-L = {}".format(int(total), data_size, current_rouge_l)) + + if (self.evaluator_args.use_wandb == True): + wandb.log({"rougeL_fmeasure": current_rouge_l}) + + for index, output in enumerate(all_process_list): + output_json = json.dumps(output) + output_writer.write(output_json + '\n') + + if not dist.is_initialized() or dist.get_rank() == 0: # 此刻已经处理完dataset + current_accuracy = np.mean(rl_list) + print("Final ROUGE-L = ", current_rouge_l) + output_writer.close() + - def _evaluate_nll( - self, - model, - dataset: Dataset, - verbose=True, - ): + def _evaluate_neg_log_likelihood(self, model, dataset: Dataset): """ Evaluates negative log likelihood of the model over a dataset. @@ -296,29 +381,14 @@ def _evaluate_nll( A float which represents the negative log likelihood. """ data_dict = dataset.to_dict() - - # Handles prompt structure - if dataset.get_type() == "text2text": - prompt = self.evaluator_args.prompt_structure - data_dict["instances"] = [ - { - "input": prompt.format(input=instance["input"]), - "output": instance["output"] - } - for instance in data_dict["instances"] - ] - - dataset = dataset.from_dict(data_dict) - tokenized_dataset = model.tokenize(dataset, add_special_tokens=False) - tokenized_dataset = tokenized_dataset.get_backend_dataset() - encoding_list = [ - { - "input_ids": torch.tensor([input_ids]), - "labels": torch.tensor([labels]), - } - for input_ids, labels in zip(tokenized_dataset["input_ids"], - tokenized_dataset["labels"]) - ] + if data_dict['type'] == 'text2text': + raise NotImplementedError( + "negative log likelihood evaluation is currently not supported" + " for text2text dataset, please use text_only dataset." 
+ ) + texts = [ instance["text"] for instance in data_dict["instances"] ] + encoding_list = [ model.get_tokenizer()(text, return_tensors="pt") + for text in texts ] # Gets context window length try: @@ -328,10 +398,9 @@ def _evaluate_nll( max_length = min(1024, model.get_max_length()) nlls = [] - full_nlls = [] - num_samples = len(encoding_list) + num_samples = len(texts) for sample_idx, encodings in enumerate(encoding_list): - seq_len = encodings["input_ids"].size(1) + seq_len = encodings.input_ids.size(1) prev_end_loc = 0 for begin_loc in range(0, seq_len, self.block_size): @@ -339,68 +408,28 @@ def _evaluate_nll( # may be different from block_size on last loop trg_len = end_loc - prev_end_loc - input_ids = encodings["input_ids"][:, begin_loc:end_loc] + input_ids = encodings.input_ids[:, begin_loc:end_loc] input_ids = input_ids.to(device=self.local_rank) - labels = encodings["labels"][:, begin_loc:end_loc] - target_ids = labels.clone() - full_target_ids = input_ids.clone() - - def get_nll(label_ids, nll_list): - label_ids[:, :-trg_len] = -100 - label_ids = label_ids.to(device=self.local_rank) - - # Valid labels are from 0 to `vocab_size` - num_valid_labels = torch.count_nonzero(label_ids >= 0) - if label_ids[0, 0] != -100: - num_valid_labels -= 1 - - if not torch.all(label_ids == -100): - with torch.no_grad(): - outputs = model.get_backend_model()( - input_ids, labels=label_ids - ) - # loss is calculated using CrossEntropyLoss which - # sums over valid labels N.B. the model only - # calculates loss over trg_len - 1 labels, because - # it internally shifts the labels to the left by 1. - neg_log_likelihood = outputs.loss * num_valid_labels - else: - neg_log_likelihood = torch.zeros([]).to( - device=self.local_rank - ) - - nll_list.append(neg_log_likelihood) - - get_nll(target_ids, nlls) - get_nll(full_target_ids, full_nlls) - - current_output_nll = torch.stack(nlls).sum() / (sample_idx + 1) - current_full_nll = torch.stack(full_nlls).sum() / (sample_idx + 1) + target_ids = input_ids.clone() + target_ids[:, :-trg_len] = -100 - prev_end_loc = end_loc - if verbose: - if dataset.get_type() == "text_only": - print( - f"Evaluating negative log likelihood:" - f" {sample_idx + 1} / {num_samples} Complete," - f" current nll: {current_full_nll}" - ) - elif dataset.get_type() == "text2text": - print( - f"Evaluating negative log likelihood:" - f" {sample_idx + 1} / {num_samples} Complete," - f" current full nll / input nll / output nll:" - f" {current_full_nll} /" - f" {current_full_nll - current_output_nll} /" - f" {current_output_nll}" - ) - else: - raise NotImplementedError( - "f{dataset.get_type()} typed datasets are not" - " supported" - ) + with torch.no_grad(): + outputs = model.get_backend_model()(input_ids, + labels=target_ids) + # loss is calculated using CrossEntropyLoss which averages + # over valid labels N.B. the model only calculates loss + # over trg_len - 1 labels, because it internally shifts the + # labels to the left by 1. 
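+                    # In other words, outputs.loss is the mean cross-entropy per
+                    # predicted token within this block, not a summed total.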
+ neg_log_likelihood = outputs.loss + nlls.append(neg_log_likelihood) + prev_end_loc = end_loc + print( + f"Evaluating negative log likelihood:" + f" {sample_idx + 1} / {num_samples} Complete, current nll:" + f" {torch.stack(nlls).sum() / (sample_idx + 1)}" + ) if end_loc == seq_len: break From 56ae992ce54d3d4ff878ed3988c3d327068e8b32 Mon Sep 17 00:00:00 2001 From: sYc Date: Thu, 4 May 2023 22:57:21 +0800 Subject: [PATCH 02/43] Evaluation with ROUGE-L A script similar to run_evaluation_with_lora.sh --- scripts/run_evaluation_with_rougel.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 scripts/run_evaluation_with_rougel.sh diff --git a/scripts/run_evaluation_with_rougel.sh b/scripts/run_evaluation_with_rougel.sh new file mode 100644 index 000000000..264238e5a --- /dev/null +++ b/scripts/run_evaluation_with_rougel.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# --model_name_or_path specifies the original huggingface model +# --lora_model_path specifies the model difference introduced by finetuning, +# i.e. the one saved by ./scripts/run_finetune_with_lora.sh +CUDA_VISIBLE_DEVICES=0 \ + deepspeed examples/evaluate.py \ + --answer_type text \ + --model_name_or_path facebook/galactica-1.3b \ + --lora_model_path output_models/finetune_with_lora \ + --dataset_path data/alpaca/test \ + --prompt_structure "Input: {input}" \ + --deepspeed examples/ds_config.json \ + --metric rouge-l \ No newline at end of file From 2aa8e022c66938ee5e0ab2d35a96b01fdcb9221b Mon Sep 17 00:00:00 2001 From: sYc Date: Wed, 10 May 2023 23:06:48 +0800 Subject: [PATCH 03/43] typo fixed . --- src/lmflow/pipeline/evaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lmflow/pipeline/evaluator.py b/src/lmflow/pipeline/evaluator.py index 1c6d4c0cb..8501befe9 100644 --- a/src/lmflow/pipeline/evaluator.py +++ b/src/lmflow/pipeline/evaluator.py @@ -236,7 +236,7 @@ def evaluate(self, model, dataset: Dataset, metric = "accuracy"): neg_log_likelihood = self._evaluate_neg_log_likelihood(model, dataset) print(f"Evaluating final negative log likelihood: {neg_log_likelihood}") elif metric in ["rl", "rouge-l", "ROUGE-L"]: - rl = self._evaluate_rouge-l(model, dataset) + rl = self._evaluate_rouge_l(model, dataset) print(f"Evaluating final ROUGE-L: {rl}") else: raise NotImplementedError(f"{metric} is not implemented or not match with our defined metrics") From 3e321b206c8ad8252a03aedd8992f8c961f8f9b5 Mon Sep 17 00:00:00 2001 From: sYc Date: Thu, 11 May 2023 21:55:59 +0800 Subject: [PATCH 04/43] Appended choices lists with ROUGE-L funtions changed line 473, added choices of "rl", "rouge-l", and "ROUGE-L" to apply the ROUGE-L metric. --- src/lmflow/args.py | 53 +++------------------------------------------- 1 file changed, 3 insertions(+), 50 deletions(-) diff --git a/src/lmflow/args.py b/src/lmflow/args.py index 00ec3ffa2..6ffc0469a 100644 --- a/src/lmflow/args.py +++ b/src/lmflow/args.py @@ -13,7 +13,7 @@ """ from dataclasses import dataclass, field -from typing import Optional, List +from typing import Optional from transformers.utils.versions import require_version @@ -99,10 +99,6 @@ class ModelArguments: default=None, metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, ) - arch_type: Optional[str] = field( - default="decoder_only", - metadata={"help": "The architecture type of the model. 
Currently supported decoder_only or encoder_decoder"} - ) config_overrides: Optional[str] = field( default=None, metadata={ @@ -171,10 +167,6 @@ class ModelArguments: default=32, metadata={"help": "Merging ratio between the fine-tuned model and the original. This is controlled by a parameter called alpha in the paper."}, ) - lora_target_modules: List[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name", - } - ) lora_dropout: float = field( default=0.1, metadata={"help": "The dropout rate in lora.linear."}, @@ -250,9 +242,6 @@ class DatasetArguments: dataset_path: Optional[str] = field( default=None, metadata={"help": "The path of the dataset to use."} ) - eval_dataset_path: Optional[str] = field( - default=None, metadata={"help": "The path of the eval dataset to use."} - ) dataset_name: Optional[str] = field( default="customized", metadata={"help": "Should be \"customized\""} ) @@ -313,16 +302,6 @@ class DatasetArguments: default=None, metadata={"help": "The number of processes to use for the preprocessing."}, ) - group_texts_batch_size: int = field( - default=1000, - metadata={ - "help": ( - "Number of samples that will be grouped together to go though" - " `group_texts` operation. See `--disable_group_texts` for" - " detailed explanation of this operation." - ) - } - ) disable_group_texts: bool = field( default=False, metadata={ @@ -491,20 +470,11 @@ class EvaluatorArguments: default="accuracy", metadata={ "help": "the metric the model will be evaluated on", - "choices": ["ppl", "perplexity", "acc", "accuracy", "nll", "neg_log_likelihood"], - }, - ) - inference_batch_size_per_device: Optional[int] = field( - default=1, - metadata={ - "help": ( - "every device will infer {inference_batch_size_per_device}" - " samples in parallel. The inferred results will be concatenaed" - " with inputs and attach a reward." 
- ), + "choices": ["ppl", "perplexity", "acc", "accuracy", "nll", "neg_log_likelihood", "rl", "rouge-l", "ROUGE-L"], }, ) + @dataclass class InferencerArguments: """ @@ -627,23 +597,6 @@ class RaftAlignerArguments(TrainingArguments): }, ) -@dataclass -class BenchmarkingArguments: - dataset_name: Optional[str] = field( - default=None, - metadata={ - "help": "benchmark dataset name provided by lmflow" - }, - ) - lm_evaluation_metric: Optional[str] = field( - default="accuracy", - metadata={ - "help": "the metric the model will be evaluated on", - "choices": ["acc", "acc_norm", "bleu", "chrf", "em", "f1", "ppl", \ - "ter", "r@1", "r@2", "mrr", "mc1", "mc2", "word_perplexity", \ - "byte_perplexity", "bits_per_byte"], - }, - ) PIPELINE_ARGUMENT_MAPPING = { "finetuner": FinetunerArguments, From 61b38a5bbfe12975aca847167724c6d94474a04e Mon Sep 17 00:00:00 2001 From: sYc Date: Thu, 11 May 2023 22:21:54 +0800 Subject: [PATCH 05/43] changed the model_path and deleted lora_model_path model: from facebook/galactica-1.3b to gpt2_large --- scripts/run_evaluation_with_rougel.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/run_evaluation_with_rougel.sh b/scripts/run_evaluation_with_rougel.sh index 264238e5a..75bbdfdde 100644 --- a/scripts/run_evaluation_with_rougel.sh +++ b/scripts/run_evaluation_with_rougel.sh @@ -6,8 +6,7 @@ CUDA_VISIBLE_DEVICES=0 \ deepspeed examples/evaluate.py \ --answer_type text \ - --model_name_or_path facebook/galactica-1.3b \ - --lora_model_path output_models/finetune_with_lora \ + --model_name_or_path gpt2-large \ --dataset_path data/alpaca/test \ --prompt_structure "Input: {input}" \ --deepspeed examples/ds_config.json \ From 6bc78caa6147794ece0aa98c0e5442838088716f Mon Sep 17 00:00:00 2001 From: sYc Date: Thu, 11 May 2023 22:40:09 +0800 Subject: [PATCH 06/43] imported deepspeed . --- src/lmflow/pipeline/evaluator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/lmflow/pipeline/evaluator.py b/src/lmflow/pipeline/evaluator.py index 8501befe9..3c703e391 100644 --- a/src/lmflow/pipeline/evaluator.py +++ b/src/lmflow/pipeline/evaluator.py @@ -3,10 +3,9 @@ The class has two methods: create_dataloader() that loads the data from the test file, creates a data loader, and returns it with the size of the data, and evaluate(model) that generates output text given input text. It uses the create_dataloader() method to load the data, iterates over the data in mini-batches, and encodes the input text with the encode() method of the HFDecoderModel class. Then, it generates output text using the evaluate() method of the HFDecoderModel class, decodes the generated output text using the decode() method of the HFDecoderModel class, and writes the output to a file in the output directory. The method also logs some information to the console and Weights and Biases if the use_wandb argument is True. """ import os -# import deepspeed +import deepspeed import torch import wandb -#import deepspeed import sys import numpy as np import datetime From 3a5c0fce4500efd19fc72550310ef1a40357be62 Mon Sep 17 00:00:00 2001 From: sYc Date: Fri, 12 May 2023 00:40:06 +0800 Subject: [PATCH 07/43] typo fixed ... 
--- src/lmflow/pipeline/evaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lmflow/pipeline/evaluator.py b/src/lmflow/pipeline/evaluator.py index 3c703e391..f725dbe64 100644 --- a/src/lmflow/pipeline/evaluator.py +++ b/src/lmflow/pipeline/evaluator.py @@ -360,7 +360,7 @@ def _evaluate_rouge_l(self, model, dataset: Dataset): # alpaca dataset: [{ output_writer.write(output_json + '\n') if not dist.is_initialized() or dist.get_rank() == 0: # 此刻已经处理完dataset - current_accuracy = np.mean(rl_list) + current_rouge_l = np.mean(rl_list) print("Final ROUGE-L = ", current_rouge_l) output_writer.close() From c88e18f35e0389c0bc6e3c8545e1441cbb8c6601 Mon Sep 17 00:00:00 2001 From: sYc Date: Fri, 12 May 2023 00:42:36 +0800 Subject: [PATCH 08/43] Add files via upload --- src/lmflow/pipeline/evaluator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lmflow/pipeline/evaluator.py b/src/lmflow/pipeline/evaluator.py index f725dbe64..e124557a2 100644 --- a/src/lmflow/pipeline/evaluator.py +++ b/src/lmflow/pipeline/evaluator.py @@ -363,6 +363,7 @@ def _evaluate_rouge_l(self, model, dataset: Dataset): # alpaca dataset: [{ current_rouge_l = np.mean(rl_list) print("Final ROUGE-L = ", current_rouge_l) output_writer.close() + return current_rouge_l def _evaluate_neg_log_likelihood(self, model, dataset: Dataset): From ee17f1abf2444f5d50c5bd9a8be4cb9b2e4ff2fc Mon Sep 17 00:00:00 2001 From: sYc Date: Mon, 5 Jun 2023 00:06:01 +0800 Subject: [PATCH 09/43] Simplify the evaluator.py Combine similar accuracy and ROUGE-L metrics into one metric --- src/lmflow/pipeline/evaluator.py | 88 +++++++++++++++++++++----------- 1 file changed, 59 insertions(+), 29 deletions(-) diff --git a/src/lmflow/pipeline/evaluator.py b/src/lmflow/pipeline/evaluator.py index e124557a2..9d79355b6 100644 --- a/src/lmflow/pipeline/evaluator.py +++ b/src/lmflow/pipeline/evaluator.py @@ -56,6 +56,7 @@ def __init__(self, model_args, data_args, evaluator_args): set_random_seed(self.evaluator_args.random_seed) self.local_rank = int(os.getenv("LOCAL_RANK", "0")) self.world_size = int(os.getenv("WORLD_SIZE", "1")) + print("self.world_size是:", self.world_size) torch.cuda.set_device(self.local_rank) # NOTE: cpu-only machine will have error deepspeed.init_distributed() @@ -89,7 +90,7 @@ def create_dataloader(self, dataset: Dataset): dataloader = batchlize( dataset_buf, - self.evaluator_args.minibatch_size, + self.evaluator_args.minibatch_size, # = self.world_size self.evaluator_args.random_shuffle ) print(f"Successfully create dataloader with size {len(dataloader)}.") @@ -142,7 +143,7 @@ def evaluate(self, model, dataset: Dataset, metric = "accuracy"): """ - if metric in ["acc", "accuracy"]: + if metric in ["acc", "accuracy", "rl", "rouge-l", "ROUGE-L"]: dataloader, data_size = self.create_dataloader(dataset) # data_size = number of mini-batches if not dist.is_initialized() or dist.get_rank() == 0: @@ -150,12 +151,12 @@ def evaluate(self, model, dataset: Dataset, metric = "accuracy"): os.makedirs(self.evaluator_args.output_dir) output_writer = open(f"{self.evaluator_args.output_dir}/evaluation.json", "w") - acc_list = [] + all_list = [] # list to replace both acc_list and rl_list total = 0 # ds_engine = deepspeed.initialize(model=model.get_model(), config_params=self.ds_config)[0] # ds_engine.module.eval() for batch_index, batch in enumerate(dataloader): - if batch_index * self.world_size >= self.data_args.max_eval_samples: + if batch_index * self.world_size >= self.data_args.max_eval_samples: break if self.local_rank >= 
len(batch): current_batch = batch[0] @@ -188,21 +189,39 @@ def evaluate(self, model, dataset: Dataset, metric = "accuracy"): print(f"predicted answer: {pred_answer} \n") print(f"groundtruth answer: {output} \n") - if self.local_rank >= len(batch): # for last batch, the padding examples are ignored and donot contribute to the accuracy - correct_ = 0 - total_ = 0 + if metric in ["acc", "accuracy"]: + if self.local_rank >= len(batch): # for last batch, the padding examples are ignored and do not contribute to the accuracy + correct_ = 0 + total_ = 0 + else: + correct_ = 0 + total_ = 1 + if self._match(pred_answer, output, answer_type): + correct_ = 1 + score = correct_ + + else: + scorer = rouge_scorer.RougeScorer(["rougeL"], + use_stemmer=False) # stemmer: stem the words to their root form + + if self.local_rank >= len( + batch): # for last batch, the padding examples are ignored and do not contribute to the ROUGE-L + rl_ = 0 + total_ = 0 + else: + rl_ = max(0, self._calculate_rouge_l(pred_answer, output, scorer, answer_type)) + total_ = 1 + score = rl_ + + # collect rouge-l from all gpus + all_process = torch.tensor([rl_, total_], dtype=torch.float32, device=self.local_rank) + if metric in ["acc", "accuracy"]: + dist.all_reduce(all_process, dist.ReduceOp.SUM, async_op=False) else: - correct_ = 0 - total_ = 1 - if self._match(pred_answer, output, answer_type): - correct_ = 1 - - # collect accuracy from all gpus - all_process = torch.tensor([correct_, total_], dtype=torch.float32, device=self.local_rank) - dist.all_reduce(all_process, dist.ReduceOp.SUM, async_op=False) - correct_, total_ = all_process.tolist() - avg = correct_ / total_ - acc_list.append(avg) + dist.all_reduce(all_process, dist.ReduceOp.MAX, async_op=False) # sum 还是 max好? + sum_or_max, total_ = all_process.tolist() + avg = sum_or_max / total_ + all_list.append(avg) total += total_ # collect predictions from all gpus @@ -214,29 +233,40 @@ def evaluate(self, model, dataset: Dataset, metric = "accuracy"): dist.gather_object(output_dict, all_process_list if dist.get_rank() == 0 else None, dst=0) # 只收集process 0?? 
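+                # gather_object collects output_dict from every rank into
+                # all_process_list on dst=0 only; the other ranks must pass None as
+                # the gather list, which is what the conditional in the call above
+                # does, so only rank 0 later writes the records to evaluation.json.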
if not dist.is_initialized() or dist.get_rank() == 0: - current_accuracy = np.mean(acc_list) - print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "{}/ {} has been finished, current accuracy = {}".format(int(total), data_size, current_accuracy)) - - if(self.evaluator_args.use_wandb == True): - wandb.log({"Accuracy": current_accuracy}) + if metric in ["acc", "accuracy"]: + current_accuracy = np.mean(all_list) + print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "{}/ {} has been finished, current accuracy = {}".format(int(total), data_size, current_accuracy)) + + if(self.evaluator_args.use_wandb == True): + wandb.log({"Accuracy": current_accuracy}) + + else: + current_rouge_l = np.mean(all_list) + print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "{}/ {} has been finished, current ROUGE-L = {}".format(int(total), data_size, current_rouge_l)) + + if (self.evaluator_args.use_wandb == True): + wandb.log({"rougeL_fmeasure": current_rouge_l}) for index, output in enumerate(all_process_list): output_json = json.dumps(output) output_writer.write(output_json + '\n') + if not dist.is_initialized() or dist.get_rank() == 0: # 此刻已经处理完dataset - current_accuracy = np.mean(acc_list) - print("Final accuracy = ", current_accuracy) + if metric in ["acc", "accuracy"]: + current_accuracy = np.mean(all_list) + print("Final accuracy = ", current_accuracy) + else: + current_rouge_l = np.mean(all_list) + print("Final ROUGE-L = ", current_rouge_l) output_writer.close() + elif metric in ["ppl", "perplexity"]: ppl = self._evaluate_ppl(model, dataset) print(f"Evaluating final ppl: {ppl}") elif metric in ["nll", "neg_log_likelihood"]: neg_log_likelihood = self._evaluate_neg_log_likelihood(model, dataset) print(f"Evaluating final negative log likelihood: {neg_log_likelihood}") - elif metric in ["rl", "rouge-l", "ROUGE-L"]: - rl = self._evaluate_rouge_l(model, dataset) - print(f"Evaluating final ROUGE-L: {rl}") else: raise NotImplementedError(f"{metric} is not implemented or not match with our defined metrics") @@ -333,7 +363,7 @@ def _evaluate_rouge_l(self, model, dataset: Dataset): # alpaca dataset: [{ # collect rouge-l from all gpus all_process = torch.tensor([rl_, total_], dtype=torch.float32, device=self.local_rank) - dist.all_reduce(all_process, dist.ReduceOp.SUM, async_op=False) + dist.all_reduce(all_process, dist.ReduceOp.MAX, async_op=False) # sum 还是 max好? 
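+            # The division below (rl_sum / total_) averages a summed score over a
+            # summed example count across ranks, which is what ReduceOp.SUM (used in
+            # the original line above) provides; with ReduceOp.MAX only the single
+            # largest value per tensor element survives the reduction.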
rl_sum, total_ = all_process.tolist() avg = rl_sum / total_ rl_list.append(avg) From 5971fa741fc5b1c9a71f607328d4f6d77d8d3ec0 Mon Sep 17 00:00:00 2001 From: sYc Date: Mon, 5 Jun 2023 13:49:08 +0800 Subject: [PATCH 10/43] for testing --- scripts/run_tmp.sh | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 scripts/run_tmp.sh diff --git a/scripts/run_tmp.sh b/scripts/run_tmp.sh new file mode 100644 index 000000000..f666b7ad4 --- /dev/null +++ b/scripts/run_tmp.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 \ + deepspeed examples/evaluate.py \ + --answer_type text \ + --model_name_or_path gpt2-large \ + --dataset_path data/alpaca/test \ + --deepspeed examples/ds_config.json \ + --metric accuracy \ No newline at end of file From 1ab1afe78b3d624214add6788bba699de3b9b597 Mon Sep 17 00:00:00 2001 From: sYc Date: Mon, 5 Jun 2023 15:42:53 +0800 Subject: [PATCH 11/43] the original evaluator file before June 4th --- src/lmflow/pipeline/evaluator_tmp.py | 437 +++++++++++++++++++++++++++ 1 file changed, 437 insertions(+) create mode 100644 src/lmflow/pipeline/evaluator_tmp.py diff --git a/src/lmflow/pipeline/evaluator_tmp.py b/src/lmflow/pipeline/evaluator_tmp.py new file mode 100644 index 000000000..b8b2e0a78 --- /dev/null +++ b/src/lmflow/pipeline/evaluator_tmp.py @@ -0,0 +1,437 @@ +"""The Evaluator class simplifies the process of running evaluation on a language model provided by a HFDecoderModel instance imported from the LMFlow-main package. The class constructor takes three dictionaries as arguments: model_args containing arguments related to the language model, data_args containing arguments related to the data used for evaluation, and evaluator_args containing other arguments for the evaluation process. + +The class has two methods: create_dataloader() that loads the data from the test file, creates a data loader, and returns it with the size of the data, and evaluate(model) that generates output text given input text. It uses the create_dataloader() method to load the data, iterates over the data in mini-batches, and encodes the input text with the encode() method of the HFDecoderModel class. Then, it generates output text using the evaluate() method of the HFDecoderModel class, decodes the generated output text using the decode() method of the HFDecoderModel class, and writes the output to a file in the output directory. The method also logs some information to the console and Weights and Biases if the use_wandb argument is True. +""" +import os +import deepspeed +import torch +import wandb +import sys +import numpy as np +import datetime +import json +from rouge_score import rouge_scorer +from multiprocessing import Pool +# TODO: remove later +from transformers import AutoConfig +import torch.distributed as dist + +from lmflow.datasets.dataset import Dataset +from lmflow.pipeline.base_pipeline import BasePipeline +from lmflow.models.hf_decoder_model import HFDecoderModel +from lmflow.utils.data_utils import set_random_seed, batchlize, answer_extraction +os.environ["TOKENIZERS_PARALLELISM"] = "false" # To avoid warnings about parallelism in tokenizers + +class Evaluator(BasePipeline): + """ + Initializes the `Evaluator` class with given arguments. + + Parameters + ------------ + model_args : ModelArguments object. + Contains the arguments required to load the model. + + data_args : DatasetArguments object. + Contains the arguments required to load the dataset. + + evaluator_args : EvaluatorArguments object. + Contains the arguments required to perform evaluation. 
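+
+    The evaluator_args fields used by this class include output_dir, use_wandb,
+    random_seed, random_shuffle, minibatch_size, evaluate_block_size,
+    prompt_structure and answer_type (see EvaluatorArguments in lmflow.args).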
+ + + """ + def __init__(self, model_args, data_args, evaluator_args): + # our method + self.data_args = data_args + self.evaluator_args = evaluator_args + self.model_args = model_args + print("--------Begin Evaluator Arguments----------") + print(f"model_args : {self.model_args}") + print(f"data_args : {self.data_args}") + print(f"evaluator_args : {self.evaluator_args}") + print("--------End Evaluator Arguments----------") + # logger + if(self.evaluator_args.use_wandb == True): + wandb.init(project="lmflow_evaluation") + # random seed + set_random_seed(self.evaluator_args.random_seed) + self.local_rank = int(os.getenv("LOCAL_RANK", "0")) + self.world_size = int(os.getenv("WORLD_SIZE", "1")) + torch.cuda.set_device(self.local_rank) # NOTE: cpu-only machine will have error + deepspeed.init_distributed() + + self.config = AutoConfig.from_pretrained(model_args.model_name_or_path) + try: + self.model_hidden_size = self.config.hidden_size + except: + print("Error in setting hidden size, use the default size 1024") + self.model_hidden_size = 1024 # gpt2 seems do not have hidden_size in config + + print(f"model_hidden_size = {self.model_hidden_size}") + # batch size has to be divisible by world_size, but can be bigger than world_size + train_batch_size = 1 * self.world_size + self.evaluator_args.minibatch_size = train_batch_size + self.block_size = evaluator_args.evaluate_block_size + # dataloader, data_size = create_dataloader(args) # load dataset + + + def create_dataloader(self, dataset: Dataset): + data_dict = dataset.to_dict() + inputs = [ instance["input"] for instance in data_dict["instances"] ] + outputs = [ instance["output"] for instance in data_dict["instances"] ] + dataset_size = len(outputs) + dataset_buf = [] + for idx in range(dataset_size): + dataset_buf.append({ + "input": inputs[idx], + "output": outputs[idx], + "input_idx": idx + }) + + dataloader = batchlize( + dataset_buf, + self.evaluator_args.minibatch_size, + self.evaluator_args.random_shuffle + ) + print(f"Successfully create dataloader with size {len(dataloader)}.") + return dataloader, dataset_size + + + # TODO: Split for better unittest + def _calculate_rouge_l(self, predicted_answer, groundtruth, scorer: rouge_scorer.RougeScorer, answer_type=None): + case_insensitive_types = [ + "strategyqa", + "coin_flip", + "pubmedqa", + "binary_choice", + "medmcqa", + "usmle", + ] + if answer_type in case_insensitive_types: + rouge_score = scorer.score(groundtruth.lower(), predicted_answer.lower())["rougeL"].fmeasure + else: + rouge_score = scorer.score(groundtruth, predicted_answer)["rougeL"].fmeasure + return rouge_score + + + def _match(self, predicted_answer, groundtruth, answer_type=None): + case_insensitive_types = [ + "strategyqa", + "coin_flip", + "pubmedqa", + "binary_choice", + "medmcqa", + "usmle", + ] + if answer_type in case_insensitive_types: + return predicted_answer.lower() == groundtruth.lower() + else: + return predicted_answer == groundtruth + return False + + + def evaluate(self, model, dataset: Dataset, metric = "accuracy"): + """ + Perform Evaluation for a model + + Parameters + ------------ + model : TunableModel object. + TunableModel to perform inference + + dataset : Dataset object. 
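+            Dataset to run the evaluation on.
+
+        metric : str, optional (default "accuracy").
+            One of "acc"/"accuracy", "ppl"/"perplexity",
+            "nll"/"neg_log_likelihood", or "rl"/"rouge-l"/"ROUGE-L".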
+ + + """ + if metric in ["acc", "accuracy"]: + dataloader, data_size = self.create_dataloader(dataset) # data_size = number of mini-batches + + if not dist.is_initialized() or dist.get_rank() == 0: + if not os.path.exists(self.evaluator_args.output_dir): + os.makedirs(self.evaluator_args.output_dir) + output_writer = open(f"{self.evaluator_args.output_dir}/evaluation.json", "w") + + acc_list = [] + total = 0 + # ds_engine = deepspeed.initialize(model=model.get_model(), config_params=self.ds_config)[0] + # ds_engine.module.eval() + for batch_index, batch in enumerate(dataloader): + if batch_index * self.world_size >= self.data_args.max_eval_samples: + break + if self.local_rank >= len(batch): + current_batch = batch[0] + else: + # the batch in current process + current_batch = batch[self.local_rank] + + prompt_structure = self.evaluator_args.prompt_structure + input = prompt_structure.format(input=current_batch['input']) + output = current_batch['output'] + input_idx = current_batch['input_idx'] + + inputs = model.encode(input, return_tensors="pt").to(device=self.local_rank) + + + # with torch.no_grad(): + # outputs = ds_engine.module.generate(inputs, synced_gpus=True, pad_token_id=model.get_tokenizer().eos_token_id, min_length=5, max_length=100,temperature=0.0, do_sample=False) + outputs = model.inference(inputs, max_new_tokens=100, temperature=0.0) + text_out = model.decode(outputs[0], skip_special_tokens=True) + + # # only return the generation, truncating the input + prompt_length = len(model.decode(inputs[0], skip_special_tokens=True,)) + text_out = text_out[prompt_length:] + answer_type = self.evaluator_args.answer_type + pred_answer = answer_extraction( + text_out, + answer_type=answer_type, + ) + print(f"batch_index{batch_index} rank{self.local_rank}:\n question={input}\n prediction={text_out}\n") + print(f"predicted answer: {pred_answer} \n") + print(f"groundtruth answer: {output} \n") + + if self.local_rank >= len(batch): # for last batch, the padding examples are ignored and do not contribute to the accuracy + correct_ = 0 + total_ = 0 + else: + correct_ = 0 + total_ = 1 + if self._match(pred_answer, output, answer_type): + correct_ = 1 + + # collect accuracy from all gpus + all_process = torch.tensor([correct_, total_], dtype=torch.float32, device=self.local_rank) + dist.all_reduce(all_process, dist.ReduceOp.SUM, async_op=False) + correct_, total_ = all_process.tolist() + avg = correct_ / total_ + acc_list.append(avg) + total += total_ + + # collect predictions from all gpus + output_dict = {"question": input, + "prediction": text_out, + "pred_answer": pred_answer, + "answer": output} + all_process_list = [{}] * self.world_size + + dist.gather_object(output_dict, all_process_list if dist.get_rank() == 0 else None, dst=0) # 只收集process 0?? 
+ if not dist.is_initialized() or dist.get_rank() == 0: + current_accuracy = np.mean(acc_list) + print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "{}/ {} has been finished, current accuracy = {}".format(int(total), data_size, current_accuracy)) + + if(self.evaluator_args.use_wandb == True): + wandb.log({"Accuracy": current_accuracy}) + + for index, output in enumerate(all_process_list): + output_json = json.dumps(output) + output_writer.write(output_json + '\n') + + if not dist.is_initialized() or dist.get_rank() == 0: # 此刻已经处理完dataset + current_accuracy = np.mean(acc_list) + print("Final accuracy = ", current_accuracy) + output_writer.close() + elif metric in ["ppl", "perplexity"]: + ppl = self._evaluate_ppl(model, dataset) + print(f"Evaluating final ppl: {ppl}") + elif metric in ["nll", "neg_log_likelihood"]: + neg_log_likelihood = self._evaluate_neg_log_likelihood(model, dataset) + print(f"Evaluating final negative log likelihood: {neg_log_likelihood}") + elif metric in ["rl", "rouge-l", "ROUGE-L"]: + rl = self._evaluate_rouge_l(model, dataset) + print(f"Evaluating final ROUGE-L: {rl}") + else: + raise NotImplementedError(f"{metric} is not implemented or not match with our defined metrics") + + + def _evaluate_ppl(self, model, dataset: Dataset): + data_dict = dataset.to_dict() + if data_dict['type'] == 'text2text': + raise NotImplementedError("ppl evaluation is currently not supported for text2text dataset, please use text_only dataset.") + texts = [ instance["text"] for instance in data_dict["instances"] ] + encodings = model.get_tokenizer()("\n\n".join(texts), return_tensors="pt") # seems no need for rouge-L + # Define some constant + try: + max_length = min(model.get_backend_model().config.n_positions, model.get_max_length()) + except: + max_length = min(1024, model.get_max_length()) + + print(f"The maximum sequence length : {max_length}") + seq_len = encodings.input_ids.size(1) + + nlls = [] + prev_end_loc = 0 + for begin_loc in range(0, seq_len, self.block_size): + end_loc = min(begin_loc + max_length, seq_len) + trg_len = end_loc - prev_end_loc # may be different from block_size on last loop + input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device=self.local_rank) + target_ids = input_ids.clone() + target_ids[:, :-trg_len] = -100 + + with torch.no_grad(): + outputs = model.get_backend_model()(input_ids, labels=target_ids) + # loss is calculated using CrossEntropyLoss which averages over valid labels + # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels + # to the left by 1. + neg_log_likelihood = outputs.loss + + nlls.append(neg_log_likelihood) + prev_end_loc = end_loc + print(f"Evaluating PPL: {int(begin_loc/self.block_size) + 1} / {int(seq_len/self.block_size)} Complete, current ppl : {torch.exp(torch.stack(nlls).mean())}") + if end_loc == seq_len: + break + ppl = torch.exp(torch.stack(nlls).mean()) + return ppl + + # Added for ROUGE-L evaluation + def _evaluate_rouge_l(self, model, dataset: Dataset): # alpaca dataset: [{instruction: "...", input: "...", output: "...", text: "..."}, ....] 
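+        # ROUGE-L here is the F-measure of the longest-common-subsequence overlap
+        # between the predicted answer and the groundtruth answer, computed per
+        # example by _calculate_rouge_l using rouge_score.rouge_scorer.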
+ dataloader, data_size = self.create_dataloader(dataset) + + if not dist.is_initialized() or dist.get_rank() == 0: + if not os.path.exists(self.evaluator_args.output_dir): + os.makedirs(self.evaluator_args.output_dir) + output_writer = open(f"{self.evaluator_args.output_dir}/evaluation.json", "w") + + rl_list = [] + total = 0 + for batch_index, batch in enumerate(dataloader): + if batch_index * self.world_size >= self.data_args.max_eval_samples: + break + if self.local_rank >= len(batch): + current_batch = batch[0] + else: + # the batch in current process + current_batch = batch[self.local_rank] + + prompt_structure = self.evaluator_args.prompt_structure + input = prompt_structure.format(input=current_batch['input']) + output = current_batch['output'] + input_idx = current_batch['input_idx'] + + inputs = model.encode(input, return_tensors="pt").to(device=self.local_rank) + outputs = model.inference(inputs, max_new_tokens=100, temperature=0.0) + text_out = model.decode(outputs[0], skip_special_tokens=True) + + # only return the generation, truncating the input + prompt_length = len(model.decode(inputs[0], skip_special_tokens=True, )) + text_out = text_out[prompt_length:] + answer_type = self.evaluator_args.answer_type + pred_answer = answer_extraction( + text_out, + answer_type=answer_type, + ) + print(f"batch_index{batch_index} rank{self.local_rank}:\n question={input}\n prediction={text_out}\n") + print(f"predicted answer: {pred_answer} \n") + print(f"groundtruth answer: {output} \n") + + scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False) # stemmer: stem the words to their root form + + if self.local_rank >= len( + batch): # for last batch, the padding examples are ignored and do not contribute to the ROUGE-L + rl_ = 0 + total_ = 0 + else: + rl_ = max(0, self._calculate_rouge_l(pred_answer, output, scorer, answer_type)) + total_ = 1 + + # collect rouge-l from all gpus + all_process = torch.tensor([rl_, total_], dtype=torch.float32, device=self.local_rank) + dist.all_reduce(all_process, dist.ReduceOp.SUM, async_op=False) + rl_sum, total_ = all_process.tolist() + avg = rl_sum / total_ + rl_list.append(avg) + total += total_ + + # collect predictions from all gpus + output_dict = {"question": input, + "prediction": text_out, + "pred_answer": pred_answer, + "answer": output} + all_process_list = [{}] * self.world_size + + dist.gather_object(output_dict, all_process_list if dist.get_rank() == 0 else None, dst=0) # 只收集process 0?? + if not dist.is_initialized() or dist.get_rank() == 0: + current_rouge_l = np.mean(rl_list) + print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "{}/ {} has been finished, current ROUGE-L = {}".format(int(total), data_size, current_rouge_l)) + + if (self.evaluator_args.use_wandb == True): + wandb.log({"rougeL_fmeasure": current_rouge_l}) + + for index, output in enumerate(all_process_list): + output_json = json.dumps(output) + output_writer.write(output_json + '\n') + + if not dist.is_initialized() or dist.get_rank() == 0: # 此刻已经处理完dataset + current_rouge_l = np.mean(rl_list) + print("Final ROUGE-L = ", current_rouge_l) + output_writer.close() + return current_rouge_l + + + def _evaluate_neg_log_likelihood(self, model, dataset: Dataset): + """ + Evaluates negative log likelihood of the model over a dataset. + + NLL = -1/N sum_{i=1}^N sum_{j=1}^|w_i| ln(p(w_{i,j}|context_window)), + + where N is the number of data samples, w_{i,j} is the j-th token in + i-th sample. 
Here "context_window" = p(w_{i,start}, w_{i,start+1}, ..., + p_{i,j-1} with start = max(0, j - window_length + 1). "window_length" + is normally the maximum length accepted by the model. + + Returns: + A float which represents the negative log likelihood. + """ + data_dict = dataset.to_dict() + if data_dict['type'] == 'text2text': + raise NotImplementedError( + "negative log likelihood evaluation is currently not supported" + " for text2text dataset, please use text_only dataset." + ) + texts = [ instance["text"] for instance in data_dict["instances"] ] + encoding_list = [ model.get_tokenizer()(text, return_tensors="pt") + for text in texts ] + + # Gets context window length + try: + max_length = min(model.get_backend_model().config.n_positions, + model.get_max_length()) + except: + max_length = min(1024, model.get_max_length()) + + nlls = [] + num_samples = len(texts) + for sample_idx, encodings in enumerate(encoding_list): + seq_len = encodings.input_ids.size(1) + + prev_end_loc = 0 + for begin_loc in range(0, seq_len, self.block_size): + end_loc = min(begin_loc + max_length, seq_len) + + # may be different from block_size on last loop + trg_len = end_loc - prev_end_loc + input_ids = encodings.input_ids[:, begin_loc:end_loc] + input_ids = input_ids.to(device=self.local_rank) + + target_ids = input_ids.clone() + target_ids[:, :-trg_len] = -100 + + with torch.no_grad(): + outputs = model.get_backend_model()(input_ids, + labels=target_ids) + # loss is calculated using CrossEntropyLoss which averages + # over valid labels N.B. the model only calculates loss + # over trg_len - 1 labels, because it internally shifts the + # labels to the left by 1. + neg_log_likelihood = outputs.loss + + nlls.append(neg_log_likelihood) + prev_end_loc = end_loc + print( + f"Evaluating negative log likelihood:" + f" {sample_idx + 1} / {num_samples} Complete, current nll:" + f" {torch.stack(nlls).sum() / (sample_idx + 1)}" + ) + if end_loc == seq_len: + break + + mean_nll = torch.stack(nlls).sum() / num_samples + return mean_nll From 64c3acfa8923af0ff4b5bce63e6af6a25fb6942d Mon Sep 17 00:00:00 2001 From: sYc Date: Mon, 5 Jun 2023 15:44:24 +0800 Subject: [PATCH 12/43] the original version before June 4th --- src/lmflow/pipeline/evaluator.py | 88 +++++++++++--------------------- 1 file changed, 29 insertions(+), 59 deletions(-) diff --git a/src/lmflow/pipeline/evaluator.py b/src/lmflow/pipeline/evaluator.py index 9d79355b6..b8b2e0a78 100644 --- a/src/lmflow/pipeline/evaluator.py +++ b/src/lmflow/pipeline/evaluator.py @@ -56,7 +56,6 @@ def __init__(self, model_args, data_args, evaluator_args): set_random_seed(self.evaluator_args.random_seed) self.local_rank = int(os.getenv("LOCAL_RANK", "0")) self.world_size = int(os.getenv("WORLD_SIZE", "1")) - print("self.world_size是:", self.world_size) torch.cuda.set_device(self.local_rank) # NOTE: cpu-only machine will have error deepspeed.init_distributed() @@ -90,7 +89,7 @@ def create_dataloader(self, dataset: Dataset): dataloader = batchlize( dataset_buf, - self.evaluator_args.minibatch_size, # = self.world_size + self.evaluator_args.minibatch_size, self.evaluator_args.random_shuffle ) print(f"Successfully create dataloader with size {len(dataloader)}.") @@ -143,7 +142,7 @@ def evaluate(self, model, dataset: Dataset, metric = "accuracy"): """ - if metric in ["acc", "accuracy", "rl", "rouge-l", "ROUGE-L"]: + if metric in ["acc", "accuracy"]: dataloader, data_size = self.create_dataloader(dataset) # data_size = number of mini-batches if not dist.is_initialized() or 
dist.get_rank() == 0: @@ -151,12 +150,12 @@ def evaluate(self, model, dataset: Dataset, metric = "accuracy"): os.makedirs(self.evaluator_args.output_dir) output_writer = open(f"{self.evaluator_args.output_dir}/evaluation.json", "w") - all_list = [] # list to replace both acc_list and rl_list + acc_list = [] total = 0 # ds_engine = deepspeed.initialize(model=model.get_model(), config_params=self.ds_config)[0] # ds_engine.module.eval() for batch_index, batch in enumerate(dataloader): - if batch_index * self.world_size >= self.data_args.max_eval_samples: + if batch_index * self.world_size >= self.data_args.max_eval_samples: break if self.local_rank >= len(batch): current_batch = batch[0] @@ -189,39 +188,21 @@ def evaluate(self, model, dataset: Dataset, metric = "accuracy"): print(f"predicted answer: {pred_answer} \n") print(f"groundtruth answer: {output} \n") - if metric in ["acc", "accuracy"]: - if self.local_rank >= len(batch): # for last batch, the padding examples are ignored and do not contribute to the accuracy - correct_ = 0 - total_ = 0 - else: - correct_ = 0 - total_ = 1 - if self._match(pred_answer, output, answer_type): - correct_ = 1 - score = correct_ - - else: - scorer = rouge_scorer.RougeScorer(["rougeL"], - use_stemmer=False) # stemmer: stem the words to their root form - - if self.local_rank >= len( - batch): # for last batch, the padding examples are ignored and do not contribute to the ROUGE-L - rl_ = 0 - total_ = 0 - else: - rl_ = max(0, self._calculate_rouge_l(pred_answer, output, scorer, answer_type)) - total_ = 1 - score = rl_ - - # collect rouge-l from all gpus - all_process = torch.tensor([rl_, total_], dtype=torch.float32, device=self.local_rank) - if metric in ["acc", "accuracy"]: - dist.all_reduce(all_process, dist.ReduceOp.SUM, async_op=False) + if self.local_rank >= len(batch): # for last batch, the padding examples are ignored and do not contribute to the accuracy + correct_ = 0 + total_ = 0 else: - dist.all_reduce(all_process, dist.ReduceOp.MAX, async_op=False) # sum 还是 max好? - sum_or_max, total_ = all_process.tolist() - avg = sum_or_max / total_ - all_list.append(avg) + correct_ = 0 + total_ = 1 + if self._match(pred_answer, output, answer_type): + correct_ = 1 + + # collect accuracy from all gpus + all_process = torch.tensor([correct_, total_], dtype=torch.float32, device=self.local_rank) + dist.all_reduce(all_process, dist.ReduceOp.SUM, async_op=False) + correct_, total_ = all_process.tolist() + avg = correct_ / total_ + acc_list.append(avg) total += total_ # collect predictions from all gpus @@ -233,40 +214,29 @@ def evaluate(self, model, dataset: Dataset, metric = "accuracy"): dist.gather_object(output_dict, all_process_list if dist.get_rank() == 0 else None, dst=0) # 只收集process 0?? 
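# A standalone, single-process sketch of the [correct_, total_] reduction restored
# above: every rank contributes a (score, count) pair, the pairs are summed
# element-wise (what ReduceOp.SUM produces on each rank), and dividing the two sums
# gives the batch-level average. Padded ranks contribute (0, 0) and therefore do not
# distort the result. Plain tensors stand in for the collective here.
import torch

per_rank = [
    torch.tensor([1.0, 1.0]),  # rank 0: correct prediction
    torch.tensor([0.0, 1.0]),  # rank 1: wrong prediction
    torch.tensor([0.0, 0.0]),  # rank 2: padding example on a short last batch
]
reduced = torch.stack(per_rank).sum(dim=0)  # -> tensor([1., 2.])
correct_sum, total_sum = reduced.tolist()
print(correct_sum / total_sum)              # 0.5, the accuracy over the real examples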
if not dist.is_initialized() or dist.get_rank() == 0: - if metric in ["acc", "accuracy"]: - current_accuracy = np.mean(all_list) - print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "{}/ {} has been finished, current accuracy = {}".format(int(total), data_size, current_accuracy)) - - if(self.evaluator_args.use_wandb == True): - wandb.log({"Accuracy": current_accuracy}) - - else: - current_rouge_l = np.mean(all_list) - print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "{}/ {} has been finished, current ROUGE-L = {}".format(int(total), data_size, current_rouge_l)) - - if (self.evaluator_args.use_wandb == True): - wandb.log({"rougeL_fmeasure": current_rouge_l}) + current_accuracy = np.mean(acc_list) + print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "{}/ {} has been finished, current accuracy = {}".format(int(total), data_size, current_accuracy)) + + if(self.evaluator_args.use_wandb == True): + wandb.log({"Accuracy": current_accuracy}) for index, output in enumerate(all_process_list): output_json = json.dumps(output) output_writer.write(output_json + '\n') - if not dist.is_initialized() or dist.get_rank() == 0: # 此刻已经处理完dataset - if metric in ["acc", "accuracy"]: - current_accuracy = np.mean(all_list) - print("Final accuracy = ", current_accuracy) - else: - current_rouge_l = np.mean(all_list) - print("Final ROUGE-L = ", current_rouge_l) + current_accuracy = np.mean(acc_list) + print("Final accuracy = ", current_accuracy) output_writer.close() - elif metric in ["ppl", "perplexity"]: ppl = self._evaluate_ppl(model, dataset) print(f"Evaluating final ppl: {ppl}") elif metric in ["nll", "neg_log_likelihood"]: neg_log_likelihood = self._evaluate_neg_log_likelihood(model, dataset) print(f"Evaluating final negative log likelihood: {neg_log_likelihood}") + elif metric in ["rl", "rouge-l", "ROUGE-L"]: + rl = self._evaluate_rouge_l(model, dataset) + print(f"Evaluating final ROUGE-L: {rl}") else: raise NotImplementedError(f"{metric} is not implemented or not match with our defined metrics") @@ -363,7 +333,7 @@ def _evaluate_rouge_l(self, model, dataset: Dataset): # alpaca dataset: [{ # collect rouge-l from all gpus all_process = torch.tensor([rl_, total_], dtype=torch.float32, device=self.local_rank) - dist.all_reduce(all_process, dist.ReduceOp.MAX, async_op=False) # sum 还是 max好? + dist.all_reduce(all_process, dist.ReduceOp.SUM, async_op=False) rl_sum, total_ = all_process.tolist() avg = rl_sum / total_ rl_list.append(avg) From dcac238b3d721cf51471648eacfaa23c8647324d Mon Sep 17 00:00:00 2001 From: sYc Date: Tue, 6 Jun 2023 16:51:57 +0800 Subject: [PATCH 13/43] Newest version of the team --- src/lmflow/args.py | 148 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 135 insertions(+), 13 deletions(-) diff --git a/src/lmflow/args.py b/src/lmflow/args.py index 6ffc0469a..2dcc7d8f7 100644 --- a/src/lmflow/args.py +++ b/src/lmflow/args.py @@ -13,7 +13,7 @@ """ from dataclasses import dataclass, field -from typing import Optional +from typing import Optional, List from transformers.utils.versions import require_version @@ -75,6 +75,8 @@ class ModelArguments: use_ram_optimized_load : bool a boolean indicating whether to use disk mapping when memory is not enough. + use_int8 : bool + a boolean indicating whether to load int8 quantization for inference. 
""" model_name_or_path: Optional[str] = field( @@ -99,6 +101,10 @@ class ModelArguments: default=None, metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, ) + arch_type: Optional[str] = field( + default="decoder_only", + metadata={"help": "The architecture type of the model. Currently supported decoder_only or encoder_decoder"} + ) config_overrides: Optional[str] = field( default=None, metadata={ @@ -167,6 +173,10 @@ class ModelArguments: default=32, metadata={"help": "Merging ratio between the fine-tuned model and the original. This is controlled by a parameter called alpha in the paper."}, ) + lora_target_modules: List[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name", + } + ) lora_dropout: float = field( default=0.1, metadata={"help": "The dropout rate in lora.linear."}, @@ -179,6 +189,19 @@ class ModelArguments: default=True, metadata={"help": "Whether use disk mapping when memory is not enough."} ) + use_flash_attention: bool = field( + default=False, + metadata={ + "help": ( + "whether use flash attention layer to reduce GPU memory with" + " higher time cost." + ) + } + ) + use_int8: bool = field( + default=False, + metadata={"help": "whether to load int8 quantization for inference"} + ) def __post_init__(self): if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): @@ -302,6 +325,16 @@ class DatasetArguments: default=None, metadata={"help": "The number of processes to use for the preprocessing."}, ) + group_texts_batch_size: int = field( + default=1000, + metadata={ + "help": ( + "Number of samples that will be grouped together to go though" + " `group_texts` operation. See `--disable_group_texts` for" + " detailed explanation of this operation." + ) + } + ) disable_group_texts: bool = field( default=False, metadata={ @@ -344,7 +377,9 @@ class FinetunerArguments(TrainingArguments): """ Adapt transformers.TrainingArguments """ - pass + eval_dataset_path: Optional[str] = field( + default=None, metadata={"help": "The path of the eval dataset to use."} + ) @dataclass @@ -370,6 +405,12 @@ class EvaluatorArguments: deepspeed : Enable deepspeed and pass the path to deepspeed json config file (e.g. ds_config.json) or an already loaded json file as a dict + + temperature : float + An argument of model.generate in huggingface to control the diversity of generation. + + repetition_penalty : float + An argument of model.generate in huggingface to penalize repetitions. """ local_rank: int = field( default=-1, @@ -470,11 +511,38 @@ class EvaluatorArguments: default="accuracy", metadata={ "help": "the metric the model will be evaluated on", - "choices": ["ppl", "perplexity", "acc", "accuracy", "nll", "neg_log_likelihood", "rl", "rouge-l", "ROUGE-L"], + "choices": ["ppl", "perplexity", "acc", "accuracy", "nll", "neg_log_likelihood"], }, ) - - + inference_batch_size_per_device: Optional[int] = field( + default=1, + metadata={ + "help": ( + "every device will infer {inference_batch_size_per_device}" + " samples in parallel. The inferred results will be concatenaed" + " with inputs and attach a reward." 
+ ), + }, + ) + use_accelerator_for_evaluator: bool = field( + default=False, metadata={"help": "Whether to use Huggingface Accelerator instead of Deepspeed"}, + ) + + temperature: float = field( + default=0, + metadata={"help": "Temperature during inference."}, + ) + + repetition_penalty: float = field( + default=1, + metadata={"help": "Repetition_penalty during inference."}, + ) + + max_new_tokens: int = field( + default=100, + metadata={"help": "Maximum length during inference."}, + ) + @dataclass class InferencerArguments: """ @@ -491,7 +559,12 @@ class InferencerArguments: loaded json file as a dict mixed_precision : str, choice from ["bf16","fp16"]. mixed precision mode, whether to use bf16 or fp16 - + + temperature : float + An argument of model.generate in huggingface to control the diversity of generation. + + repetition_penalty : float + An argument of model.generate in huggingface to penalize repetitions. """ device: str = field( default="gpu", @@ -503,8 +576,24 @@ class InferencerArguments: local_rank: int = field( default=-1, metadata={"help": "For distributed training: local_rank" - } + }, + ) + + temperature: float = field( + default=0.0, + metadata={"help": "Temperature during inference."}, ) + + repetition_penalty: float = field( + default=1, + metadata={"help": "Repetition_penalty during inference."}, + ) + + max_new_tokens: int = field( + default=100, + metadata={"help": "Maximum length during inference."}, + ) + random_seed: Optional[int] = field( default=1, metadata={ @@ -531,6 +620,13 @@ class InferencerArguments: "choices": ["bf16","fp16"], }, ) + do_sample: Optional[bool] = field( + default=False, + metadata={ + "help": "whether turn on true random sampling during inference." + }, + ) + @dataclass @@ -545,7 +641,7 @@ class RaftAlignerArguments(TrainingArguments): } ) output_min_length: Optional[int] = field( - default=16, + default=64, metadata={ "help": ( "minimum length of the output token sequence generated from" @@ -554,7 +650,7 @@ class RaftAlignerArguments(TrainingArguments): }, ) output_max_length: Optional[int] = field( - default=48, + default=128, metadata={ "help": ( "maximum length of the output token sequence generated from" @@ -569,15 +665,14 @@ class RaftAlignerArguments(TrainingArguments): }, ) raft_batch_size: Optional[int] = field( - default=320, + default=1024, metadata={ "help": ( - "only select {raft_batch_size} samples each time to" - " generate rewards and be ranked for STF training." + "only select {raft_batch_size} samples each time for STF training." 
) }, ) - top_reward_percentage: Optional[int] = field( + top_reward_percentage: Optional[float] = field( default=0.2, metadata={ "help": ( @@ -596,7 +691,34 @@ class RaftAlignerArguments(TrainingArguments): ), }, ) + collection_strategy: Optional[str] = field( + default="top", + metadata={ + "help": ( + "{collection_strategy} is either top or local" + " top means that we rank the samples globally regardless of the prompts" + " local means that we only rank the samples with the same prompt" + ), + }, + ) +@dataclass +class BenchmarkingArguments: + dataset_name: Optional[str] = field( + default=None, + metadata={ + "help": "benchmark dataset name provided by lmflow" + }, + ) + lm_evaluation_metric: Optional[str] = field( + default="accuracy", + metadata={ + "help": "the metric the model will be evaluated on", + "choices": ["acc", "acc_norm", "bleu", "chrf", "em", "f1", "ppl", \ + "ter", "r@1", "r@2", "mrr", "mc1", "mc2", "word_perplexity", \ + "byte_perplexity", "bits_per_byte"], + }, + ) PIPELINE_ARGUMENT_MAPPING = { "finetuner": FinetunerArguments, From bdcc9f87cb8b581369be292420b24d5c1797d52b Mon Sep 17 00:00:00 2001 From: sYc Date: Wed, 7 Jun 2023 16:57:19 +0800 Subject: [PATCH 14/43] Update args.py --- src/lmflow/args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lmflow/args.py b/src/lmflow/args.py index 2dcc7d8f7..8dbc06c37 100644 --- a/src/lmflow/args.py +++ b/src/lmflow/args.py @@ -511,7 +511,7 @@ class EvaluatorArguments: default="accuracy", metadata={ "help": "the metric the model will be evaluated on", - "choices": ["ppl", "perplexity", "acc", "accuracy", "nll", "neg_log_likelihood"], + "choices": ["ppl", "perplexity", "acc", "accuracy", "rl", "rouge-l", "nll", "neg_log_likelihood"], }, ) inference_batch_size_per_device: Optional[int] = field( From 5b6fb415636ad46574137443e7791156ce6caee8 Mon Sep 17 00:00:00 2001 From: sYc Date: Wed, 7 Jun 2023 17:05:09 +0800 Subject: [PATCH 15/43] Update run_evaluation_with_rougel.sh --- scripts/run_evaluation_with_rougel.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/run_evaluation_with_rougel.sh b/scripts/run_evaluation_with_rougel.sh index 75bbdfdde..90f5f25d7 100644 --- a/scripts/run_evaluation_with_rougel.sh +++ b/scripts/run_evaluation_with_rougel.sh @@ -8,6 +8,6 @@ CUDA_VISIBLE_DEVICES=0 \ --answer_type text \ --model_name_or_path gpt2-large \ --dataset_path data/alpaca/test \ - --prompt_structure "Input: {input}" \ --deepspeed examples/ds_config.json \ - --metric rouge-l \ No newline at end of file + --inference_batch_size_per_device 1 \ + --metric rouge-l From 545c8086dee66b549832fcc666a8cc78c04549bb Mon Sep 17 00:00:00 2001 From: sYc Date: Thu, 8 Jun 2023 11:37:10 +0800 Subject: [PATCH 16/43] a test case for ROUGE-L metric in evaluation --- scripts/rougel_test_case.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 scripts/rougel_test_case.sh diff --git a/scripts/rougel_test_case.sh b/scripts/rougel_test_case.sh new file mode 100644 index 000000000..0da258416 --- /dev/null +++ b/scripts/rougel_test_case.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +# 比较self-instruct和LMFlow对相同instructions的ROUGE-L得分是否相同 + +CUDA_VISIBLE_DEVICES=0 \ + deepspeed examples/test_rougel.py \ + --answer_type text \ + --model_name_or_path gpt2-large \ + --dataset_path data/alpaca/test \ + --deepspeed examples/ds_config.json \ + --inference_batch_size_per_device 1 \ + --metric rouge-l \ No newline at end of file From 72500dbfd9049dacfb81b2c919aeb14c8fe6f5a3 Mon Sep 17 00:00:00 2001 From: sYc 
Date: Thu, 8 Jun 2023 11:38:52 +0800 Subject: [PATCH 17/43] test case for ROUGE-L --- src/lmflow/pipeline/test_rougel.py | 282 +++++++++++++++++++++++++++++ 1 file changed, 282 insertions(+) create mode 100644 src/lmflow/pipeline/test_rougel.py diff --git a/src/lmflow/pipeline/test_rougel.py b/src/lmflow/pipeline/test_rougel.py new file mode 100644 index 000000000..c950960de --- /dev/null +++ b/src/lmflow/pipeline/test_rougel.py @@ -0,0 +1,282 @@ +import os +import deepspeed +import torch +import wandb +import sys +import numpy as np +import datetime +import json +from rouge_score import rouge_scorer +from multiprocessing import Pool +from functools import partial +# TODO: remove later +from transformers import AutoConfig +import torch.distributed as dist + +from lmflow.datasets.dataset import Dataset +from lmflow.pipeline.base_pipeline import BasePipeline +from lmflow.models.hf_decoder_model import HFDecoderModel +from lmflow.utils.data_utils import set_random_seed, batchlize, answer_extraction, load_data + +os.environ["TOKENIZERS_PARALLELISM"] = "false" # To avoid warnings about parallelism in tokenizers + + +class Evaluator(BasePipeline): + """ + Initializes the `Evaluator` class with given arguments. + + Parameters + ------------ + model_args : ModelArguments object. + Contains the arguments required to load the model. + + data_args : DatasetArguments object. + Contains the arguments required to load the dataset. + + evaluator_args : EvaluatorArguments object. + Contains the arguments required to perform evaluation. + + + """ + + def __init__(self, model_args, data_args, evaluator_args): + # our method + self.data_args = data_args + self.evaluator_args = evaluator_args + self.model_args = model_args + print("--------Begin Evaluator Arguments----------") + print(f"model_args : {self.model_args}") + print(f"data_args : {self.data_args}") + print(f"evaluator_args : {self.evaluator_args}") + print("--------End Evaluator Arguments----------") + # logger + if (self.evaluator_args.use_wandb == True): + wandb.init(project="lmflow_evaluation") + # random seed + set_random_seed(self.evaluator_args.random_seed) + self.local_rank = int(os.getenv("LOCAL_RANK", "0")) + self.world_size = int(os.getenv("WORLD_SIZE", "1")) + print("\nself.world_size是:", self.world_size, "\n") + torch.cuda.set_device(self.local_rank) # NOTE: cpu-only machine will have error + deepspeed.init_distributed() + + self.config = AutoConfig.from_pretrained(model_args.model_name_or_path) + try: + self.model_hidden_size = self.config.hidden_size + except: + print("Error in setting hidden size, use the default size 1024") + self.model_hidden_size = 1024 # gpt2 seems do not have hidden_size in config + + print(f"model_hidden_size = {self.model_hidden_size}") + # batch size has to be divisible by world_size, but can be bigger than world_size + train_batch_size = 1 * self.world_size + self.evaluator_args.minibatch_size = train_batch_size + self.block_size = evaluator_args.evaluate_block_size + # dataloader, data_size = create_dataloader(args) # load dataset + + # First use the method in self-instruct to get the ROUGE-L scores for the dataset, then use the method in LMFlow and compare the two scores, + # The metric is tested to be valid if all scores are the same. 
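# For reference, a minimal standalone example of the rouge_score interface this file
# relies on: RougeScorer.score takes the reference text first and the prediction
# second, and returns precision / recall / F-measure per requested metric; the
# evaluation code keeps only the rougeL F-measure.
from rouge_score import rouge_scorer

example_scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)
example_scores = example_scorer.score(
    "The capital of France is Paris.",   # reference / groundtruth
    "Paris is the capital of France.",   # prediction
)
print(example_scores["rougeL"].precision,
      example_scores["rougeL"].recall,
      example_scores["rougeL"].fmeasure)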
+ def get_rougel_score_list(self, predicted_data: str): + scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False) + + dataset = load_data(predicted_data) + data_dict = dataset.to_dict() + inputs = [instance["input"] for instance in data_dict["instances"]] + outputs = [instance["output"] for instance in data_dict["instances"]] + dataset_size = len(outputs) + + dataset_buf = [] + for idx in range(dataset_size): + dataset_buf.append({ + "input": inputs[idx], + "output": outputs[idx], + "input_idx": idx + }) + + dataloader = batchlize( # 相当于每minibatch_size大小切一段,dataloader = [[{}, {}, ... ], [{}, {}, ... ], ... ] + dataset_buf, + self.evaluator_args.minibatch_size, # = self.world_size + self.evaluator_args.random_shuffle + ) + print(f"Successfully create dataloader with size {len(dataloader)}.") + + score_list = [] # store the maximum ROUGE-L score in each batch + + for batch in dataloader: + input_ = [data["input"] for data in batch] + output_ = [data["output"] for data in batch] + with Pool(4) as p: # 4 processes + rouge_scores = p.map(partial(scorer.score, input_), [output_]) + rouge_scores = [score["rougeL"].fmeasure for score in rouge_scores] # score["rougeL"].fmeasure 是对应的pair的得分 + max_rl_score = max(rouge_scores) + score_list.append(max_rl_score) + + return score_list + + + def create_dataloader(self, dataset: Dataset): + data_dict = dataset.to_dict() + inputs = [instance["input"] for instance in data_dict["instances"]] + outputs = [instance["output"] for instance in data_dict["instances"]] + dataset_size = len(outputs) + dataset_buf = [] + for idx in range(dataset_size): + dataset_buf.append({ + "input": inputs[idx], + "output": outputs[idx], + "input_idx": idx + }) + + dataloader = batchlize( + dataset_buf, + self.evaluator_args.minibatch_size, # = self.world_size + self.evaluator_args.random_shuffle + ) + print(f"Successfully create dataloader with size {len(dataloader)}.") + return dataloader, dataset_size + + # TODO: Split for better unittest + def _calculate_rouge_l(self, predicted_answer, groundtruth, scorer: rouge_scorer.RougeScorer, answer_type=None): + case_insensitive_types = [ + "strategyqa", + "coin_flip", + "pubmedqa", + "binary_choice", + "medmcqa", + "usmle", + ] + if answer_type in case_insensitive_types: + rouge_score = scorer.score(groundtruth.lower(), predicted_answer.lower())["rougeL"].fmeasure + else: + rouge_score = scorer.score(groundtruth, predicted_answer)["rougeL"].fmeasure + return rouge_score + + def evaluate(self, model, dataset: Dataset, metric="rougel"): + """ + Perform Evaluation for a model + + Parameters + ------------ + model : TunableModel object. + TunableModel to perform inference + + dataset : Dataset object. 
+ + + """ + if metric in ["rl", "rouge-l", "ROUGE-L"]: + dataloader, data_size = self.create_dataloader(dataset) # data_size = number of mini-batches + + if not dist.is_initialized() or dist.get_rank() == 0: + if not os.path.exists(self.evaluator_args.output_dir): + os.makedirs(self.evaluator_args.output_dir) + output_writer = open(f"{self.evaluator_args.output_dir}/evaluation.json", "w") + + pred_score_list = [] # list to record the ROUGE-L scores of all batches from LMFlow method + + # ds_engine = deepspeed.initialize(model=model.get_model(), config_params=self.ds_config)[0] + # ds_engine.module.eval() + for batch_index, batch in enumerate(dataloader): + if batch_index * self.world_size >= self.data_args.max_eval_samples: + break + if self.local_rank >= len(batch): + current_batch = batch[0] + else: + # the batch in current process + current_batch = batch[self.local_rank] + + prompt_structure = self.evaluator_args.prompt_structure + input = prompt_structure.format(input=current_batch['input']) + output = current_batch['output'] + input_idx = current_batch['input_idx'] + + inputs = model.encode(input, return_tensors="pt").to(device=self.local_rank) + + # with torch.no_grad(): + # outputs = ds_engine.module.generate(inputs, synced_gpus=True, pad_token_id=model.get_tokenizer().eos_token_id, min_length=5, max_length=100,temperature=0.0, do_sample=False) + outputs = model.inference(inputs, max_new_tokens=100, temperature=0.0) + text_out = model.decode(outputs[0], skip_special_tokens=True) + + # # only return the generation, truncating the input + prompt_length = len(model.decode(inputs[0], skip_special_tokens=True, )) + text_out = text_out[prompt_length:] + answer_type = self.evaluator_args.answer_type + pred_answer = answer_extraction( + text_out, + answer_type=answer_type, + ) + print( + f"batch_index{batch_index} rank{self.local_rank}:\n question={input}\n prediction={text_out}\n") + print(f"predicted answer: {pred_answer} \n") + print(f"groundtruth answer: {output} \n") + + + scorer = rouge_scorer.RougeScorer(["rougeL"], + use_stemmer=False) # stemmer: stem the words to their root form + + if self.local_rank >= len( + batch): # for last batch, the padding examples are ignored and do not contribute to the ROUGE-L + rl_ = 0 + total_ = 0 + else: + rl_ = max(0, self._calculate_rouge_l(pred_answer, output, scorer, answer_type)) + total_ = 1 + score = rl_ + + # collect rouge-l from all gpus + all_process = torch.tensor([rl_, total_], dtype=torch.float32, device=self.local_rank) + dist.all_reduce(all_process, dist.ReduceOp.MAX, async_op=False) # sum 还是 max好? + max_, total_ = all_process.tolist() + print("max_: ", max_) + print("total_: ", total_) + # avg = max_ / total_ + avg = max_ + pred_score_list.append(avg) + # total += total_ + + # collect predictions from all gpus + output_dict = {"question": input, + "prediction": text_out, + "pred_answer": pred_answer, + "answer": output} + all_process_list = [{}] * self.world_size + + dist.gather_object(output_dict, all_process_list if dist.get_rank() == 0 else None, + dst=0) # 只收集process 0?? 
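# A single-process (world_size = 1) sketch of the gather pattern above, assuming the
# standard torch.distributed API: every rank passes its own result dict, but only the
# destination rank (dst=0) supplies a list and ends up holding one entry per rank; the
# other ranks pass None and keep nothing, so the gathered predictions live on process 0
# only. The gloo backend and the port below are arbitrary choices for illustration.
import os
import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29501")
dist.init_process_group("gloo", rank=0, world_size=1)

local_output = {"question": "q0", "pred_answer": "A", "answer": "A"}
gathered = [None] * dist.get_world_size() if dist.get_rank() == 0 else None
dist.gather_object(local_output, gathered, dst=0)
if dist.get_rank() == 0:
    print(gathered)  # [{'question': 'q0', 'pred_answer': 'A', 'answer': 'A'}]
dist.destroy_process_group()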
+ print("all_process_list: ", all_process_list) + if not dist.is_initialized() or dist.get_rank() == 0: + current_rouge_l = np.mean(pred_score_list) + print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "{}/ {} has been finished, current ROUGE-L = {}".format(int(pred_score_list), data_size, + current_rouge_l)) + + if (self.evaluator_args.use_wandb == True): + wandb.log({"rougeL_fmeasure": current_rouge_l}) + + for index, output in enumerate(all_process_list): + output_json = json.dumps(output) + output_writer.write(output_json + '\n') + + if not dist.is_initialized() or dist.get_rank() == 0: # 此刻已经处理完dataset + current_rouge_l = np.mean(pred_score_list) + print("Final ROUGE-L = ", current_rouge_l) + output_writer.close() + + else: + raise NotImplementedError(f"{metric} is not implemented or not match with our defined metrics") + + # load the dataset with predicted answers and apply the self-instruct method to get the answer score list. + ans_score_list = self.get_rougel_score_list(f"{self.evaluator_args.output_dir}/evaluation.json") + + # Start compare the two ROUGE-L scores lists we get + matched = True + for pred, ans in zip(pred_score_list, ans_score_list): + print("LMFlow ROUGE-L: ", pred, " -- self-instruct ROUGE-L: ", ans) + if pred != ans: + matched = False + print("scores not matched!") + return + print("scores matched. Tested to be valid.") + + + From 5851f47762f45d3ae509efef4133c3b248dda21f Mon Sep 17 00:00:00 2001 From: sYc Date: Thu, 8 Jun 2023 11:53:24 +0800 Subject: [PATCH 18/43] update --- src/lmflow/pipeline/evaluator.py | 92 +++++++++++++++++++++----------- 1 file changed, 61 insertions(+), 31 deletions(-) diff --git a/src/lmflow/pipeline/evaluator.py b/src/lmflow/pipeline/evaluator.py index b8b2e0a78..7b1bec54c 100644 --- a/src/lmflow/pipeline/evaluator.py +++ b/src/lmflow/pipeline/evaluator.py @@ -56,6 +56,7 @@ def __init__(self, model_args, data_args, evaluator_args): set_random_seed(self.evaluator_args.random_seed) self.local_rank = int(os.getenv("LOCAL_RANK", "0")) self.world_size = int(os.getenv("WORLD_SIZE", "1")) + print("self.world_size是:", self.world_size) torch.cuda.set_device(self.local_rank) # NOTE: cpu-only machine will have error deepspeed.init_distributed() @@ -79,7 +80,7 @@ def create_dataloader(self, dataset: Dataset): inputs = [ instance["input"] for instance in data_dict["instances"] ] outputs = [ instance["output"] for instance in data_dict["instances"] ] dataset_size = len(outputs) - dataset_buf = [] + dataset_buf = [] # [{input: in1, output:out1, input_idx: 1}, { ... } , ... ] for idx in range(dataset_size): dataset_buf.append({ "input": inputs[idx], @@ -87,9 +88,9 @@ def create_dataloader(self, dataset: Dataset): "input_idx": idx }) - dataloader = batchlize( + dataloader = batchlize( # 相当于每minibatch_size大小切一段,dataloader = [[{}, {}, ... ], [{}, {}, ... ], ... 
] dataset_buf, - self.evaluator_args.minibatch_size, + self.evaluator_args.minibatch_size, # = self.world_size self.evaluator_args.random_shuffle ) print(f"Successfully create dataloader with size {len(dataloader)}.") @@ -142,7 +143,7 @@ def evaluate(self, model, dataset: Dataset, metric = "accuracy"): """ - if metric in ["acc", "accuracy"]: + if metric in ["acc", "accuracy", "rl", "rouge-l", "ROUGE-L"]: dataloader, data_size = self.create_dataloader(dataset) # data_size = number of mini-batches if not dist.is_initialized() or dist.get_rank() == 0: @@ -150,12 +151,12 @@ def evaluate(self, model, dataset: Dataset, metric = "accuracy"): os.makedirs(self.evaluator_args.output_dir) output_writer = open(f"{self.evaluator_args.output_dir}/evaluation.json", "w") - acc_list = [] + all_list = [] # list to replace both acc_list and rl_list total = 0 # ds_engine = deepspeed.initialize(model=model.get_model(), config_params=self.ds_config)[0] # ds_engine.module.eval() for batch_index, batch in enumerate(dataloader): - if batch_index * self.world_size >= self.data_args.max_eval_samples: + if batch_index * self.world_size >= self.data_args.max_eval_samples: break if self.local_rank >= len(batch): current_batch = batch[0] @@ -188,21 +189,39 @@ def evaluate(self, model, dataset: Dataset, metric = "accuracy"): print(f"predicted answer: {pred_answer} \n") print(f"groundtruth answer: {output} \n") - if self.local_rank >= len(batch): # for last batch, the padding examples are ignored and do not contribute to the accuracy - correct_ = 0 - total_ = 0 + if metric in ["acc", "accuracy"]: + if self.local_rank >= len(batch): # for last batch, the padding examples are ignored and do not contribute to the accuracy + correct_ = 0 + total_ = 0 + else: + correct_ = 0 + total_ = 1 + if self._match(pred_answer, output, answer_type): + correct_ = 1 + score = correct_ + + else: + scorer = rouge_scorer.RougeScorer(["rougeL"], + use_stemmer=False) # stemmer: stem the words to their root form + + if self.local_rank >= len( + batch): # for last batch, the padding examples are ignored and do not contribute to the ROUGE-L + rl_ = 0 + total_ = 0 + else: + rl_ = max(0, self._calculate_rouge_l(pred_answer, output, scorer, answer_type)) + total_ = 1 + score = rl_ + + # collect rouge-l from all gpus + all_process = torch.tensor([rl_, total_], dtype=torch.float32, device=self.local_rank) + if metric in ["acc", "accuracy"]: + dist.all_reduce(all_process, dist.ReduceOp.SUM, async_op=False) else: - correct_ = 0 - total_ = 1 - if self._match(pred_answer, output, answer_type): - correct_ = 1 - - # collect accuracy from all gpus - all_process = torch.tensor([correct_, total_], dtype=torch.float32, device=self.local_rank) - dist.all_reduce(all_process, dist.ReduceOp.SUM, async_op=False) - correct_, total_ = all_process.tolist() - avg = correct_ / total_ - acc_list.append(avg) + dist.all_reduce(all_process, dist.ReduceOp.MAX, async_op=False) # sum 还是 max好? + sum_or_max, total_ = all_process.tolist() + avg = sum_or_max / total_ + all_list.append(avg) total += total_ # collect predictions from all gpus @@ -214,29 +233,40 @@ def evaluate(self, model, dataset: Dataset, metric = "accuracy"): dist.gather_object(output_dict, all_process_list if dist.get_rank() == 0 else None, dst=0) # 只收集process 0?? 
if not dist.is_initialized() or dist.get_rank() == 0: - current_accuracy = np.mean(acc_list) - print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "{}/ {} has been finished, current accuracy = {}".format(int(total), data_size, current_accuracy)) - - if(self.evaluator_args.use_wandb == True): - wandb.log({"Accuracy": current_accuracy}) + if metric in ["acc", "accuracy"]: + current_accuracy = np.mean(all_list) + print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "{}/ {} has been finished, current accuracy = {}".format(int(total), data_size, current_accuracy)) + + if(self.evaluator_args.use_wandb == True): + wandb.log({"Accuracy": current_accuracy}) + + else: + current_rouge_l = np.mean(all_list) + print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "{}/ {} has been finished, current ROUGE-L = {}".format(int(total), data_size, current_rouge_l)) + + if (self.evaluator_args.use_wandb == True): + wandb.log({"rougeL_fmeasure": current_rouge_l}) for index, output in enumerate(all_process_list): output_json = json.dumps(output) output_writer.write(output_json + '\n') + if not dist.is_initialized() or dist.get_rank() == 0: # 此刻已经处理完dataset - current_accuracy = np.mean(acc_list) - print("Final accuracy = ", current_accuracy) + if metric in ["acc", "accuracy"]: + current_accuracy = np.mean(all_list) + print("Final accuracy = ", current_accuracy) + else: + current_rouge_l = np.mean(all_list) + print("Final ROUGE-L = ", current_rouge_l) output_writer.close() + elif metric in ["ppl", "perplexity"]: ppl = self._evaluate_ppl(model, dataset) print(f"Evaluating final ppl: {ppl}") elif metric in ["nll", "neg_log_likelihood"]: neg_log_likelihood = self._evaluate_neg_log_likelihood(model, dataset) print(f"Evaluating final negative log likelihood: {neg_log_likelihood}") - elif metric in ["rl", "rouge-l", "ROUGE-L"]: - rl = self._evaluate_rouge_l(model, dataset) - print(f"Evaluating final ROUGE-L: {rl}") else: raise NotImplementedError(f"{metric} is not implemented or not match with our defined metrics") @@ -333,7 +363,7 @@ def _evaluate_rouge_l(self, model, dataset: Dataset): # alpaca dataset: [{ # collect rouge-l from all gpus all_process = torch.tensor([rl_, total_], dtype=torch.float32, device=self.local_rank) - dist.all_reduce(all_process, dist.ReduceOp.SUM, async_op=False) + dist.all_reduce(all_process, dist.ReduceOp.MAX, async_op=False) # sum 还是 max好? 
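# The two reduction choices answer different questions, and the division that follows
# (avg = rl_sum / total_) assumes SUM semantics. A plain-tensor sketch standing in for
# the all_reduce result across three ranks, with the last rank holding a padding example:
import torch

per_rank = torch.tensor([[0.40, 1.0],    # rank 0: [ROUGE-L, count]
                         [0.80, 1.0],    # rank 1
                         [0.00, 0.0]])   # rank 2: padding example
sum_reduced = per_rank.sum(dim=0)         # ReduceOp.SUM -> [1.2, 2.0]
max_reduced = per_rank.max(dim=0).values  # ReduceOp.MAX -> [0.8, 1.0]
print(sum_reduced[0] / sum_reduced[1])    # 0.6: mean ROUGE-L over the real examples
print(max_reduced[0] / max_reduced[1])    # 0.8: best single rank, not an average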
rl_sum, total_ = all_process.tolist() avg = rl_sum / total_ rl_list.append(avg) From 9c5fd69cb41aa694fbe32340af83194b60c2e53f Mon Sep 17 00:00:00 2001 From: sYc Date: Thu, 8 Jun 2023 11:57:45 +0800 Subject: [PATCH 19/43] test ROUGE-L metric --- examples/test_rougel.py | 282 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 282 insertions(+) create mode 100644 examples/test_rougel.py diff --git a/examples/test_rougel.py b/examples/test_rougel.py new file mode 100644 index 000000000..c950960de --- /dev/null +++ b/examples/test_rougel.py @@ -0,0 +1,282 @@ +import os +import deepspeed +import torch +import wandb +import sys +import numpy as np +import datetime +import json +from rouge_score import rouge_scorer +from multiprocessing import Pool +from functools import partial +# TODO: remove later +from transformers import AutoConfig +import torch.distributed as dist + +from lmflow.datasets.dataset import Dataset +from lmflow.pipeline.base_pipeline import BasePipeline +from lmflow.models.hf_decoder_model import HFDecoderModel +from lmflow.utils.data_utils import set_random_seed, batchlize, answer_extraction, load_data + +os.environ["TOKENIZERS_PARALLELISM"] = "false" # To avoid warnings about parallelism in tokenizers + + +class Evaluator(BasePipeline): + """ + Initializes the `Evaluator` class with given arguments. + + Parameters + ------------ + model_args : ModelArguments object. + Contains the arguments required to load the model. + + data_args : DatasetArguments object. + Contains the arguments required to load the dataset. + + evaluator_args : EvaluatorArguments object. + Contains the arguments required to perform evaluation. + + + """ + + def __init__(self, model_args, data_args, evaluator_args): + # our method + self.data_args = data_args + self.evaluator_args = evaluator_args + self.model_args = model_args + print("--------Begin Evaluator Arguments----------") + print(f"model_args : {self.model_args}") + print(f"data_args : {self.data_args}") + print(f"evaluator_args : {self.evaluator_args}") + print("--------End Evaluator Arguments----------") + # logger + if (self.evaluator_args.use_wandb == True): + wandb.init(project="lmflow_evaluation") + # random seed + set_random_seed(self.evaluator_args.random_seed) + self.local_rank = int(os.getenv("LOCAL_RANK", "0")) + self.world_size = int(os.getenv("WORLD_SIZE", "1")) + print("\nself.world_size是:", self.world_size, "\n") + torch.cuda.set_device(self.local_rank) # NOTE: cpu-only machine will have error + deepspeed.init_distributed() + + self.config = AutoConfig.from_pretrained(model_args.model_name_or_path) + try: + self.model_hidden_size = self.config.hidden_size + except: + print("Error in setting hidden size, use the default size 1024") + self.model_hidden_size = 1024 # gpt2 seems do not have hidden_size in config + + print(f"model_hidden_size = {self.model_hidden_size}") + # batch size has to be divisible by world_size, but can be bigger than world_size + train_batch_size = 1 * self.world_size + self.evaluator_args.minibatch_size = train_batch_size + self.block_size = evaluator_args.evaluate_block_size + # dataloader, data_size = create_dataloader(args) # load dataset + + # First use the method in self-instruct to get the ROUGE-L scores for the dataset, then use the method in LMFlow and compare the two scores, + # The metric is tested to be valid if all scores are the same. 
+ def get_rougel_score_list(self, predicted_data: str): + scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False) + + dataset = load_data(predicted_data) + data_dict = dataset.to_dict() + inputs = [instance["input"] for instance in data_dict["instances"]] + outputs = [instance["output"] for instance in data_dict["instances"]] + dataset_size = len(outputs) + + dataset_buf = [] + for idx in range(dataset_size): + dataset_buf.append({ + "input": inputs[idx], + "output": outputs[idx], + "input_idx": idx + }) + + dataloader = batchlize( # 相当于每minibatch_size大小切一段,dataloader = [[{}, {}, ... ], [{}, {}, ... ], ... ] + dataset_buf, + self.evaluator_args.minibatch_size, # = self.world_size + self.evaluator_args.random_shuffle + ) + print(f"Successfully create dataloader with size {len(dataloader)}.") + + score_list = [] # store the maximum ROUGE-L score in each batch + + for batch in dataloader: + input_ = [data["input"] for data in batch] + output_ = [data["output"] for data in batch] + with Pool(4) as p: # 4 processes + rouge_scores = p.map(partial(scorer.score, input_), [output_]) + rouge_scores = [score["rougeL"].fmeasure for score in rouge_scores] # score["rougeL"].fmeasure 是对应的pair的得分 + max_rl_score = max(rouge_scores) + score_list.append(max_rl_score) + + return score_list + + + def create_dataloader(self, dataset: Dataset): + data_dict = dataset.to_dict() + inputs = [instance["input"] for instance in data_dict["instances"]] + outputs = [instance["output"] for instance in data_dict["instances"]] + dataset_size = len(outputs) + dataset_buf = [] + for idx in range(dataset_size): + dataset_buf.append({ + "input": inputs[idx], + "output": outputs[idx], + "input_idx": idx + }) + + dataloader = batchlize( + dataset_buf, + self.evaluator_args.minibatch_size, # = self.world_size + self.evaluator_args.random_shuffle + ) + print(f"Successfully create dataloader with size {len(dataloader)}.") + return dataloader, dataset_size + + # TODO: Split for better unittest + def _calculate_rouge_l(self, predicted_answer, groundtruth, scorer: rouge_scorer.RougeScorer, answer_type=None): + case_insensitive_types = [ + "strategyqa", + "coin_flip", + "pubmedqa", + "binary_choice", + "medmcqa", + "usmle", + ] + if answer_type in case_insensitive_types: + rouge_score = scorer.score(groundtruth.lower(), predicted_answer.lower())["rougeL"].fmeasure + else: + rouge_score = scorer.score(groundtruth, predicted_answer)["rougeL"].fmeasure + return rouge_score + + def evaluate(self, model, dataset: Dataset, metric="rougel"): + """ + Perform Evaluation for a model + + Parameters + ------------ + model : TunableModel object. + TunableModel to perform inference + + dataset : Dataset object. 
+ + + """ + if metric in ["rl", "rouge-l", "ROUGE-L"]: + dataloader, data_size = self.create_dataloader(dataset) # data_size = number of mini-batches + + if not dist.is_initialized() or dist.get_rank() == 0: + if not os.path.exists(self.evaluator_args.output_dir): + os.makedirs(self.evaluator_args.output_dir) + output_writer = open(f"{self.evaluator_args.output_dir}/evaluation.json", "w") + + pred_score_list = [] # list to record the ROUGE-L scores of all batches from LMFlow method + + # ds_engine = deepspeed.initialize(model=model.get_model(), config_params=self.ds_config)[0] + # ds_engine.module.eval() + for batch_index, batch in enumerate(dataloader): + if batch_index * self.world_size >= self.data_args.max_eval_samples: + break + if self.local_rank >= len(batch): + current_batch = batch[0] + else: + # the batch in current process + current_batch = batch[self.local_rank] + + prompt_structure = self.evaluator_args.prompt_structure + input = prompt_structure.format(input=current_batch['input']) + output = current_batch['output'] + input_idx = current_batch['input_idx'] + + inputs = model.encode(input, return_tensors="pt").to(device=self.local_rank) + + # with torch.no_grad(): + # outputs = ds_engine.module.generate(inputs, synced_gpus=True, pad_token_id=model.get_tokenizer().eos_token_id, min_length=5, max_length=100,temperature=0.0, do_sample=False) + outputs = model.inference(inputs, max_new_tokens=100, temperature=0.0) + text_out = model.decode(outputs[0], skip_special_tokens=True) + + # # only return the generation, truncating the input + prompt_length = len(model.decode(inputs[0], skip_special_tokens=True, )) + text_out = text_out[prompt_length:] + answer_type = self.evaluator_args.answer_type + pred_answer = answer_extraction( + text_out, + answer_type=answer_type, + ) + print( + f"batch_index{batch_index} rank{self.local_rank}:\n question={input}\n prediction={text_out}\n") + print(f"predicted answer: {pred_answer} \n") + print(f"groundtruth answer: {output} \n") + + + scorer = rouge_scorer.RougeScorer(["rougeL"], + use_stemmer=False) # stemmer: stem the words to their root form + + if self.local_rank >= len( + batch): # for last batch, the padding examples are ignored and do not contribute to the ROUGE-L + rl_ = 0 + total_ = 0 + else: + rl_ = max(0, self._calculate_rouge_l(pred_answer, output, scorer, answer_type)) + total_ = 1 + score = rl_ + + # collect rouge-l from all gpus + all_process = torch.tensor([rl_, total_], dtype=torch.float32, device=self.local_rank) + dist.all_reduce(all_process, dist.ReduceOp.MAX, async_op=False) # sum 还是 max好? + max_, total_ = all_process.tolist() + print("max_: ", max_) + print("total_: ", total_) + # avg = max_ / total_ + avg = max_ + pred_score_list.append(avg) + # total += total_ + + # collect predictions from all gpus + output_dict = {"question": input, + "prediction": text_out, + "pred_answer": pred_answer, + "answer": output} + all_process_list = [{}] * self.world_size + + dist.gather_object(output_dict, all_process_list if dist.get_rank() == 0 else None, + dst=0) # 只收集process 0?? 
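# Sketch of the prompt-truncation step used earlier in this loop: the decoded
# generation still contains the prompt, so only the characters after the decoded
# prompt are kept (character-based, not token-based, slicing). Toy strings below;
# no model call is involved.
decoded_prompt = "Input: What is the capital of France?"
decoded_output = decoded_prompt + " The capital of France is Paris."
generation_only = decoded_output[len(decoded_prompt):]
print(repr(generation_only))  # ' The capital of France is Paris.'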
+ print("all_process_list: ", all_process_list) + if not dist.is_initialized() or dist.get_rank() == 0: + current_rouge_l = np.mean(pred_score_list) + print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "{}/ {} has been finished, current ROUGE-L = {}".format(int(pred_score_list), data_size, + current_rouge_l)) + + if (self.evaluator_args.use_wandb == True): + wandb.log({"rougeL_fmeasure": current_rouge_l}) + + for index, output in enumerate(all_process_list): + output_json = json.dumps(output) + output_writer.write(output_json + '\n') + + if not dist.is_initialized() or dist.get_rank() == 0: # 此刻已经处理完dataset + current_rouge_l = np.mean(pred_score_list) + print("Final ROUGE-L = ", current_rouge_l) + output_writer.close() + + else: + raise NotImplementedError(f"{metric} is not implemented or not match with our defined metrics") + + # load the dataset with predicted answers and apply the self-instruct method to get the answer score list. + ans_score_list = self.get_rougel_score_list(f"{self.evaluator_args.output_dir}/evaluation.json") + + # Start compare the two ROUGE-L scores lists we get + matched = True + for pred, ans in zip(pred_score_list, ans_score_list): + print("LMFlow ROUGE-L: ", pred, " -- self-instruct ROUGE-L: ", ans) + if pred != ans: + matched = False + print("scores not matched!") + return + print("scores matched. Tested to be valid.") + + + From 4861bcccc6de5c06483fd1fbd7062dcbb4863876 Mon Sep 17 00:00:00 2001 From: sYc Date: Thu, 8 Jun 2023 12:17:49 +0800 Subject: [PATCH 20/43] Added the function of evaluate.py --- examples/test_rougel.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/examples/test_rougel.py b/examples/test_rougel.py index c950960de..f054ff18d 100644 --- a/examples/test_rougel.py +++ b/examples/test_rougel.py @@ -11,17 +11,40 @@ from functools import partial # TODO: remove later from transformers import AutoConfig +from lmflow.pipeline.auto_pipeline import AutoPipeline import torch.distributed as dist - +from transformers import HfArgumentParser from lmflow.datasets.dataset import Dataset from lmflow.pipeline.base_pipeline import BasePipeline -from lmflow.models.hf_decoder_model import HFDecoderModel from lmflow.utils.data_utils import set_random_seed, batchlize, answer_extraction, load_data +from lmflow.models.auto_model import AutoModel +from lmflow.args import ModelArguments, DatasetArguments, AutoArguments os.environ["TOKENIZERS_PARALLELISM"] = "false" # To avoid warnings about parallelism in tokenizers +# copied from evaluate.py, with small changes. +pipeline_name = "test_rougel" +PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) + +parser = HfArgumentParser((ModelArguments, DatasetArguments, PipelineArguments)) +model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses() + +with open (pipeline_args.deepspeed, "r") as f: + ds_config = json.load(f) + +model = AutoModel.get_model(model_args, tune_strategy='none', ds_config=ds_config) +dataset = Dataset(data_args) + +evaluator = AutoPipeline.get_pipeline( + pipeline_name=pipeline_name, + model_args=model_args, + data_args=data_args, + pipeline_args=pipeline_args, +) +evaluator.evaluate(model=model, dataset=dataset, metric=pipeline_args.metric) -class Evaluator(BasePipeline): +#copied from evaluator.py +class Test_rougel(BasePipeline): """ Initializes the `Evaluator` class with given arguments. 
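The driver block added above follows the same pattern as evaluate.py: HfArgumentParser parses one dataclass per component and hands the resulting objects to the pipeline. A minimal sketch of that parsing step, using a toy stand-in dataclass rather than the real ModelArguments/DatasetArguments/TesterArguments:

from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser


@dataclass
class ToyPipelineArguments:          # stand-in for the real argument dataclasses
    metric: Optional[str] = field(default="accuracy")
    output_dir: Optional[str] = field(default="./output_dir")


parser = HfArgumentParser(ToyPipelineArguments)
(toy_args,) = parser.parse_args_into_dataclasses(args=["--metric", "rouge-l"])
print(toy_args.metric, toy_args.output_dir)  # rouge-l ./output_dir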
From f0304f54eee1c7d53a1d6181f19e34cdcdafb840 Mon Sep 17 00:00:00 2001 From: sYc Date: Thu, 8 Jun 2023 12:18:26 +0800 Subject: [PATCH 21/43] updated --- src/lmflow/args.py | 256 +++++++++++++++++++++++---------------------- 1 file changed, 131 insertions(+), 125 deletions(-) diff --git a/src/lmflow/args.py b/src/lmflow/args.py index 8dbc06c37..24d7cf9b7 100644 --- a/src/lmflow/args.py +++ b/src/lmflow/args.py @@ -13,7 +13,7 @@ """ from dataclasses import dataclass, field -from typing import Optional, List +from typing import Optional from transformers.utils.versions import require_version @@ -75,8 +75,6 @@ class ModelArguments: use_ram_optimized_load : bool a boolean indicating whether to use disk mapping when memory is not enough. - use_int8 : bool - a boolean indicating whether to load int8 quantization for inference. """ model_name_or_path: Optional[str] = field( @@ -101,10 +99,6 @@ class ModelArguments: default=None, metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, ) - arch_type: Optional[str] = field( - default="decoder_only", - metadata={"help": "The architecture type of the model. Currently supported decoder_only or encoder_decoder"} - ) config_overrides: Optional[str] = field( default=None, metadata={ @@ -173,10 +167,6 @@ class ModelArguments: default=32, metadata={"help": "Merging ratio between the fine-tuned model and the original. This is controlled by a parameter called alpha in the paper."}, ) - lora_target_modules: List[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name", - } - ) lora_dropout: float = field( default=0.1, metadata={"help": "The dropout rate in lora.linear."}, @@ -189,19 +179,6 @@ class ModelArguments: default=True, metadata={"help": "Whether use disk mapping when memory is not enough."} ) - use_flash_attention: bool = field( - default=False, - metadata={ - "help": ( - "whether use flash attention layer to reduce GPU memory with" - " higher time cost." - ) - } - ) - use_int8: bool = field( - default=False, - metadata={"help": "whether to load int8 quantization for inference"} - ) def __post_init__(self): if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): @@ -325,16 +302,6 @@ class DatasetArguments: default=None, metadata={"help": "The number of processes to use for the preprocessing."}, ) - group_texts_batch_size: int = field( - default=1000, - metadata={ - "help": ( - "Number of samples that will be grouped together to go though" - " `group_texts` operation. See `--disable_group_texts` for" - " detailed explanation of this operation." - ) - } - ) disable_group_texts: bool = field( default=False, metadata={ @@ -377,16 +344,14 @@ class FinetunerArguments(TrainingArguments): """ Adapt transformers.TrainingArguments """ - eval_dataset_path: Optional[str] = field( - default=None, metadata={"help": "The path of the eval dataset to use."} - ) + pass @dataclass class EvaluatorArguments: """ Define a class EvaluatorArguments using the dataclass decorator. The class contains several optional - parameters that can be used to configure a evaluator. + parameters that can be used to configure an evaluator. local_rank : str For distributed training: local_rank @@ -405,12 +370,6 @@ class EvaluatorArguments: deepspeed : Enable deepspeed and pass the path to deepspeed json config file (e.g. 
ds_config.json) or an already loaded json file as a dict - - temperature : float - An argument of model.generate in huggingface to control the diversity of generation. - - repetition_penalty : float - An argument of model.generate in huggingface to penalize repetitions. """ local_rank: int = field( default=-1, @@ -511,38 +470,11 @@ class EvaluatorArguments: default="accuracy", metadata={ "help": "the metric the model will be evaluated on", - "choices": ["ppl", "perplexity", "acc", "accuracy", "rl", "rouge-l", "nll", "neg_log_likelihood"], - }, - ) - inference_batch_size_per_device: Optional[int] = field( - default=1, - metadata={ - "help": ( - "every device will infer {inference_batch_size_per_device}" - " samples in parallel. The inferred results will be concatenaed" - " with inputs and attach a reward." - ), + "choices": ["ppl", "perplexity", "acc", "accuracy", "nll", "neg_log_likelihood", "rl", "rouge-l", "ROUGE-L"], }, ) - use_accelerator_for_evaluator: bool = field( - default=False, metadata={"help": "Whether to use Huggingface Accelerator instead of Deepspeed"}, - ) - - temperature: float = field( - default=0, - metadata={"help": "Temperature during inference."}, - ) - - repetition_penalty: float = field( - default=1, - metadata={"help": "Repetition_penalty during inference."}, - ) - - max_new_tokens: int = field( - default=100, - metadata={"help": "Maximum length during inference."}, - ) - + + @dataclass class InferencerArguments: """ @@ -559,12 +491,7 @@ class InferencerArguments: loaded json file as a dict mixed_precision : str, choice from ["bf16","fp16"]. mixed precision mode, whether to use bf16 or fp16 - - temperature : float - An argument of model.generate in huggingface to control the diversity of generation. - - repetition_penalty : float - An argument of model.generate in huggingface to penalize repetitions. + """ device: str = field( default="gpu", @@ -576,24 +503,8 @@ class InferencerArguments: local_rank: int = field( default=-1, metadata={"help": "For distributed training: local_rank" - }, - ) - - temperature: float = field( - default=0.0, - metadata={"help": "Temperature during inference."}, - ) - - repetition_penalty: float = field( - default=1, - metadata={"help": "Repetition_penalty during inference."}, - ) - - max_new_tokens: int = field( - default=100, - metadata={"help": "Maximum length during inference."}, + } ) - random_seed: Optional[int] = field( default=1, metadata={ @@ -620,13 +531,6 @@ class InferencerArguments: "choices": ["bf16","fp16"], }, ) - do_sample: Optional[bool] = field( - default=False, - metadata={ - "help": "whether turn on true random sampling during inference." - }, - ) - @dataclass @@ -641,7 +545,7 @@ class RaftAlignerArguments(TrainingArguments): } ) output_min_length: Optional[int] = field( - default=64, + default=16, metadata={ "help": ( "minimum length of the output token sequence generated from" @@ -650,7 +554,7 @@ class RaftAlignerArguments(TrainingArguments): }, ) output_max_length: Optional[int] = field( - default=128, + default=48, metadata={ "help": ( "maximum length of the output token sequence generated from" @@ -665,14 +569,15 @@ class RaftAlignerArguments(TrainingArguments): }, ) raft_batch_size: Optional[int] = field( - default=1024, + default=320, metadata={ "help": ( - "only select {raft_batch_size} samples each time for STF training." + "only select {raft_batch_size} samples each time to" + " generate rewards and be ranked for STF training." 
) }, ) - top_reward_percentage: Optional[float] = field( + top_reward_percentage: Optional[int] = field( default=0.2, metadata={ "help": ( @@ -686,37 +591,137 @@ class RaftAlignerArguments(TrainingArguments): metadata={ "help": ( "every device will infer {inference_batch_size_per_device}" - " samples in parallel. The inferred results will be concatenaed" + " samples in parallel. The inferred results will be concatenated" " with inputs and attach a reward." ), }, ) - collection_strategy: Optional[str] = field( - default="top", + +@dataclass +class TesterArguments: + """ + Define a class TesterArguments using the dataclass decorator. The class contains several optional + parameters that can be used to configure an tester for ROUGE-L. + + local_rank : str + For distributed training: local_rank + + random_shuffle : bool + + use_wandb : bool + + random_seed : int, default = 1 + + output_dir : str, default = './output_dir', + + mixed_precision : str, choice from ["bf16","fp16"]. + mixed precision mode, whether to use bf16 or fp16 + + deepspeed : + Enable deepspeed and pass the path to deepspeed json config file (e.g. ds_config.json) or an already + loaded json file as a dict + """ + local_rank: int = field( + default=-1, + metadata={"help": "For distributed training: local_rank" + } + ) + + random_shuffle: Optional[bool] = field( + default=False, + metadata={"help": "" + } + ) + + use_wandb: Optional[bool] = field( + default=False, + metadata={ + "help": ( + "When this flag is True, wandb will be enabled" + ) + }, + ) + random_seed: Optional[int] = field( + default=1, + metadata={ + "help": ( + "used to set random seed" + ) + }, + ) + output_dir: Optional[str] = field( + default="./output_dir", + metadata={"help": "Output path for the inferenced results"}, + ) + mixed_precision: Optional[str] = field( + default="bf16", metadata={ "help": ( - "{collection_strategy} is either top or local" - " top means that we rank the samples globally regardless of the prompts" - " local means that we only rank the samples with the same prompt" + "mixed precision mode, whether to use bf16 or fp16" ), + "choices": ["bf16", "fp16"], }, ) - -@dataclass -class BenchmarkingArguments: - dataset_name: Optional[str] = field( + deepspeed: Optional[str] = field( default=None, metadata={ - "help": "benchmark dataset name provided by lmflow" + "help": ( + "Enable deepspeed and pass the path to deepspeed json config file (e.g. ds_config.json) or an already" + " loaded json file as a dict" + ) }, ) - lm_evaluation_metric: Optional[str] = field( - default="accuracy", + answer_type: Optional[str] = field( + default="text", + metadata={ + "help": ( + 'Question type for answer extraction from the decoder output.' + ' Supported types: \n' + ' 1) "multiple_choice", e.g. A, B, C, D, ...\n' + ' 2) "binary_choice", e.g. yes, no, maybe\n' + ' 3) "math", e.g. 1.0, -3.52\n' + ' 4) "text", e.g. "I think that it is okay"\n' + ' 5) Special treatment for several datasets\n' + ' - "gsm8k"\n' + ' - "svamp"\n' + ' - "asdiv"\n' + ' - "addsub"\n' + ' - "singleeq"\n' + ' - "multiarith"\n' + ' - "aqua"\n' + ' - "csqa"\n' + ' - "strategyqa"\n' + ' - "pubmedqa"\n' + ' - "medmcqa"\n' + ' - "usmle"\n' + ) + }, + ) + prompt_structure: Optional[str] = field( + default="{input}", + metadata={ + "help": ( + 'Prompt structure to facilitate prompt engineering during' + ' inference. The model will receive' + ' `prompt_structure.format(input=input)` as its input.' 
+ ) + }, + ) + evaluate_block_size: Optional[int] = field( + default=512, + metadata={ + "help": ( + "the model will have at least block_size tokens for context when calculating the conditional likelihood of any one token" + " (provided there are block_size preceding tokens available to condition on)" + ) + }, + ) + metric: Optional[str] = field( + default="rougel", metadata={ "help": "the metric the model will be evaluated on", - "choices": ["acc", "acc_norm", "bleu", "chrf", "em", "f1", "ppl", \ - "ter", "r@1", "r@2", "mrr", "mc1", "mc2", "word_perplexity", \ - "byte_perplexity", "bits_per_byte"], + "choices": ["ppl", "perplexity", "acc", "accuracy", "nll", "neg_log_likelihood", "rl", "rouge-l", + "ROUGE-L"], }, ) @@ -725,6 +730,7 @@ class BenchmarkingArguments: "evaluator": EvaluatorArguments, "inferencer": InferencerArguments, "raft_aligner": RaftAlignerArguments, + "test_rougel": TesterArguments, } From 7fc95e73f70eb6b2facfec42cd1ed1d268daccee Mon Sep 17 00:00:00 2001 From: sYc Date: Thu, 8 Jun 2023 12:19:25 +0800 Subject: [PATCH 22/43] Add files via upload --- src/lmflow/pipeline/auto_pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lmflow/pipeline/auto_pipeline.py b/src/lmflow/pipeline/auto_pipeline.py index 0a1c3fcdb..da27bd556 100644 --- a/src/lmflow/pipeline/auto_pipeline.py +++ b/src/lmflow/pipeline/auto_pipeline.py @@ -14,6 +14,7 @@ "finetuner": Finetuner, "inferencer": Inferencer, "raft_aligner": RaftAligner, + "test_rougel": Test_rougel, } From dd5005e7e60ed23c4d77692240b19d6d20b57920 Mon Sep 17 00:00:00 2001 From: sYc Date: Thu, 8 Jun 2023 12:29:45 +0800 Subject: [PATCH 23/43] import evaluate, not AutoPipeline --- examples/test_rougel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/test_rougel.py b/examples/test_rougel.py index f054ff18d..25ef53c50 100644 --- a/examples/test_rougel.py +++ b/examples/test_rougel.py @@ -11,7 +11,8 @@ from functools import partial # TODO: remove later from transformers import AutoConfig -from lmflow.pipeline.auto_pipeline import AutoPipeline +# from lmflow.pipeline.auto_pipeline import AutoPipeline +import evaluate import torch.distributed as dist from transformers import HfArgumentParser from lmflow.datasets.dataset import Dataset From 621059a25c207941d634128aa8fbd18a08da4bdc Mon Sep 17 00:00:00 2001 From: sYc Date: Thu, 8 Jun 2023 14:16:49 +0800 Subject: [PATCH 24/43] import Test_rougel --- src/lmflow/pipeline/auto_pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lmflow/pipeline/auto_pipeline.py b/src/lmflow/pipeline/auto_pipeline.py index da27bd556..4f1179985 100644 --- a/src/lmflow/pipeline/auto_pipeline.py +++ b/src/lmflow/pipeline/auto_pipeline.py @@ -7,6 +7,7 @@ from lmflow.pipeline.finetuner import Finetuner from lmflow.pipeline.inferencer import Inferencer from lmflow.pipeline.raft_aligner import RaftAligner +from lmflow.examples.test_rougel import Test_rougel PIPELINE_MAPPING = { From f74cb7585709081965cc8703fd7d3f94226dbd8c Mon Sep 17 00:00:00 2001 From: sYc Date: Thu, 8 Jun 2023 14:19:36 +0800 Subject: [PATCH 25/43] Add files via upload --- src/lmflow/pipeline/test_rougel.py | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/src/lmflow/pipeline/test_rougel.py b/src/lmflow/pipeline/test_rougel.py index c950960de..25ef53c50 100644 --- a/src/lmflow/pipeline/test_rougel.py +++ b/src/lmflow/pipeline/test_rougel.py @@ -11,17 +11,41 @@ from functools import partial # TODO: remove later from transformers import AutoConfig +# from 
lmflow.pipeline.auto_pipeline import AutoPipeline +import evaluate import torch.distributed as dist - +from transformers import HfArgumentParser from lmflow.datasets.dataset import Dataset from lmflow.pipeline.base_pipeline import BasePipeline -from lmflow.models.hf_decoder_model import HFDecoderModel from lmflow.utils.data_utils import set_random_seed, batchlize, answer_extraction, load_data +from lmflow.models.auto_model import AutoModel +from lmflow.args import ModelArguments, DatasetArguments, AutoArguments os.environ["TOKENIZERS_PARALLELISM"] = "false" # To avoid warnings about parallelism in tokenizers +# copied from evaluate.py, with small changes. +pipeline_name = "test_rougel" +PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) + +parser = HfArgumentParser((ModelArguments, DatasetArguments, PipelineArguments)) +model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses() + +with open (pipeline_args.deepspeed, "r") as f: + ds_config = json.load(f) + +model = AutoModel.get_model(model_args, tune_strategy='none', ds_config=ds_config) +dataset = Dataset(data_args) + +evaluator = AutoPipeline.get_pipeline( + pipeline_name=pipeline_name, + model_args=model_args, + data_args=data_args, + pipeline_args=pipeline_args, +) +evaluator.evaluate(model=model, dataset=dataset, metric=pipeline_args.metric) -class Evaluator(BasePipeline): +#copied from evaluator.py +class Test_rougel(BasePipeline): """ Initializes the `Evaluator` class with given arguments. From b1a615d8628fa469cedeac8b2659e8e120ce1b93 Mon Sep 17 00:00:00 2001 From: sYc Date: Thu, 8 Jun 2023 14:23:01 +0800 Subject: [PATCH 26/43] import Test_rougel --- src/lmflow/pipeline/auto_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lmflow/pipeline/auto_pipeline.py b/src/lmflow/pipeline/auto_pipeline.py index 4f1179985..5067600b5 100644 --- a/src/lmflow/pipeline/auto_pipeline.py +++ b/src/lmflow/pipeline/auto_pipeline.py @@ -7,7 +7,7 @@ from lmflow.pipeline.finetuner import Finetuner from lmflow.pipeline.inferencer import Inferencer from lmflow.pipeline.raft_aligner import RaftAligner -from lmflow.examples.test_rougel import Test_rougel +from lmflow.pipeline.test_rougel import Test_rougel PIPELINE_MAPPING = { From 19fd37e16ac9103df7afa600b47952189eff87d8 Mon Sep 17 00:00:00 2001 From: sYc Date: Thu, 8 Jun 2023 14:26:07 +0800 Subject: [PATCH 27/43] Add inference_batch_size_per_device --- src/lmflow/args.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/lmflow/args.py b/src/lmflow/args.py index 24d7cf9b7..db351aa2d 100644 --- a/src/lmflow/args.py +++ b/src/lmflow/args.py @@ -724,6 +724,16 @@ class TesterArguments: "ROUGE-L"], }, ) + inference_batch_size_per_device: Optional[int] = field( + default=1, + metadata={ + "help": ( + "every device will infer {inference_batch_size_per_device}" + " samples in parallel. The inferred results will be concatenated" + " with inputs and attach a reward." 
+            ),
+        },
+    )
 
 PIPELINE_ARGUMENT_MAPPING = {
     "finetuner": FinetunerArguments,

From ff58720578eed12cc9f8decc37325c5b17db35c5 Mon Sep 17 00:00:00 2001
From: sYc
Date: Thu, 8 Jun 2023 14:38:39 +0800
Subject: [PATCH 28/43] test for the validity of ROUGE-L metric

---
 examples/test.py | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 examples/test.py

diff --git a/examples/test.py b/examples/test.py
new file mode 100644
index 000000000..ab9d284c6
--- /dev/null
+++ b/examples/test.py
@@ -0,0 +1,45 @@
+import os
+import deepspeed
+import torch
+import wandb
+import sys
+import numpy as np
+import datetime
+import json
+from rouge_score import rouge_scorer
+from multiprocessing import Pool
+from functools import partial
+# TODO: remove later
+from transformers import AutoConfig
+from lmflow.pipeline.auto_pipeline import AutoPipeline
+
+import torch.distributed as dist
+from transformers import HfArgumentParser
+from lmflow.datasets.dataset import Dataset
+from lmflow.pipeline.base_pipeline import BasePipeline
+from lmflow.utils.data_utils import set_random_seed, batchlize, answer_extraction, load_data
+from lmflow.models.auto_model import AutoModel
+from lmflow.args import ModelArguments, DatasetArguments, AutoArguments
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false" # To avoid warnings about parallelism in tokenizers
+
+# copied from evaluate.py, with small changes.
+pipeline_name = "test_rougel"
+PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name)
+
+parser = HfArgumentParser((ModelArguments, DatasetArguments, PipelineArguments))
+model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses()
+
+with open (pipeline_args.deepspeed, "r") as f:
+    ds_config = json.load(f)
+
+model = AutoModel.get_model(model_args, tune_strategy='none', ds_config=ds_config)
+dataset = Dataset(data_args)
+
+evaluator = AutoPipeline.get_pipeline(
+    pipeline_name=pipeline_name,
+    model_args=model_args,
+    data_args=data_args,
+    pipeline_args=pipeline_args,
+)
+evaluator.evaluate(model=model, dataset=dataset, metric=pipeline_args.metric)

From f46e99b9df74705cbdaca02264b1239f0c23d465 Mon Sep 17 00:00:00 2001
From: sYc
Date: Thu, 8 Jun 2023 14:39:43 +0800
Subject: [PATCH 29/43] split test.py and test_rougel.py

---
 scripts/rougel_test_case.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/rougel_test_case.sh b/scripts/rougel_test_case.sh
index 0da258416..6074489e8 100644
--- a/scripts/rougel_test_case.sh
+++ b/scripts/rougel_test_case.sh
@@ -3,7 +3,7 @@
 # Check whether self-instruct and LMFlow give the same ROUGE-L scores for the same instructions
 
 CUDA_VISIBLE_DEVICES=0 \
-    deepspeed examples/test_rougel.py \
+    deepspeed examples/test.py \
     --answer_type text \
     --model_name_or_path gpt2-large \
     --dataset_path data/alpaca/test \
     --deepspeed examples/ds_config.json \
     --inference_batch_size_per_device 1 \
     --metric rouge-l

From c3f09e68f044c9c464ac7f51d9147ac800dad8df Mon Sep 17 00:00:00 2001
From: sYc
Date: Thu, 8 Jun 2023 14:40:24 +0800
Subject: [PATCH 30/43] deleted the test.py part from test_rougel.py

---
 src/lmflow/pipeline/test_rougel.py | 26 +------------------------
 1 file changed, 1 insertion(+), 25 deletions(-)

diff --git a/src/lmflow/pipeline/test_rougel.py b/src/lmflow/pipeline/test_rougel.py
index 25ef53c50..6ede17dd0 100644
--- a/src/lmflow/pipeline/test_rougel.py
+++ b/src/lmflow/pipeline/test_rougel.py
@@ -12,7 +12,7 @@
 # TODO: remove later
 from transformers import AutoConfig
 # from lmflow.pipeline.auto_pipeline import AutoPipeline
-import evaluate
+
 import torch.distributed as dist
 from transformers import HfArgumentParser
 from 
lmflow.datasets.dataset import Dataset @@ -21,29 +21,6 @@ from lmflow.models.auto_model import AutoModel from lmflow.args import ModelArguments, DatasetArguments, AutoArguments -os.environ["TOKENIZERS_PARALLELISM"] = "false" # To avoid warnings about parallelism in tokenizers - -# copied from evaluate.py, with small changes. -pipeline_name = "test_rougel" -PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) - -parser = HfArgumentParser((ModelArguments, DatasetArguments, PipelineArguments)) -model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses() - -with open (pipeline_args.deepspeed, "r") as f: - ds_config = json.load(f) - -model = AutoModel.get_model(model_args, tune_strategy='none', ds_config=ds_config) -dataset = Dataset(data_args) - -evaluator = AutoPipeline.get_pipeline( - pipeline_name=pipeline_name, - model_args=model_args, - data_args=data_args, - pipeline_args=pipeline_args, -) -evaluator.evaluate(model=model, dataset=dataset, metric=pipeline_args.metric) - #copied from evaluator.py class Test_rougel(BasePipeline): """ @@ -303,4 +280,3 @@ def evaluate(self, model, dataset: Dataset, metric="rougel"): print("scores matched. Tested to be valid.") - From 782ebcfc78217c44662630617a959cd37c347d91 Mon Sep 17 00:00:00 2001 From: sYc Date: Thu, 8 Jun 2023 14:49:39 +0800 Subject: [PATCH 31/43] Added inference_batch_size_per_device --- src/lmflow/args.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/lmflow/args.py b/src/lmflow/args.py index db351aa2d..76c0429c5 100644 --- a/src/lmflow/args.py +++ b/src/lmflow/args.py @@ -473,6 +473,16 @@ class EvaluatorArguments: "choices": ["ppl", "perplexity", "acc", "accuracy", "nll", "neg_log_likelihood", "rl", "rouge-l", "ROUGE-L"], }, ) + inference_batch_size_per_device: Optional[int] = field( + default=1, + metadata={ + "help": ( + "every device will infer {inference_batch_size_per_device}" + " samples in parallel. The inferred results will be concatenated" + " with inputs and attach a reward." + ), + }, + ) @dataclass From 91e61cf091226fa3661f5a3131cbcfa66b77893c Mon Sep 17 00:00:00 2001 From: sYc Date: Thu, 8 Jun 2023 15:08:33 +0800 Subject: [PATCH 32/43] copied the function of Auto_pipeline inside --- examples/test.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/examples/test.py b/examples/test.py index ab9d284c6..fd93e9685 100644 --- a/examples/test.py +++ b/examples/test.py @@ -11,7 +11,8 @@ from functools import partial # TODO: remove later from transformers import AutoConfig -from lmflow.pipeline.auto_pipeline import AutoPipeline + +# from lmflow.pipeline.auto_pipeline import AutoPipeline import torch.distributed as dist from transformers import HfArgumentParser @@ -20,9 +21,29 @@ from lmflow.utils.data_utils import set_random_seed, batchlize, answer_extraction, load_data from lmflow.models.auto_model import AutoModel from lmflow.args import ModelArguments, DatasetArguments, AutoArguments +from lmflow.pipeline.test_rougel import Test_rougel os.environ["TOKENIZERS_PARALLELISM"] = "false" # To avoid warnings about parallelism in tokenizers + +def get_pipeline(self, + pipeline_name, + model_args, + data_args, + pipeline_args, + *args, + **kwargs + ): + + pipeline = Test_rougel( + model_args, + data_args, + pipeline_args, + *args, + **kwargs + ) + return pipeline + # copied from evaluate.py, with small changes. 
pipeline_name = "test_rougel"
 PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name)
 
@@ -36,7 +57,7 @@
 model = AutoModel.get_model(model_args, tune_strategy='none', ds_config=ds_config)
 dataset = Dataset(data_args)
 
-evaluator = AutoPipeline.get_pipeline(
+evaluator = get_pipeline(
     pipeline_name=pipeline_name,
     model_args=model_args,
     data_args=data_args,

From 4c51c352f22dd82f691620dce42051801a11f5ac Mon Sep 17 00:00:00 2001
From: sYc
Date: Thu, 8 Jun 2023 15:13:39 +0800
Subject: [PATCH 33/43] Update test.py

---
 examples/test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/test.py b/examples/test.py
index fd93e9685..7f7673999 100644
--- a/examples/test.py
+++ b/examples/test.py
@@ -26,7 +26,7 @@
 os.environ["TOKENIZERS_PARALLELISM"] = "false" # To avoid warnings about parallelism in tokenizers
 
 
-def get_pipeline(self,
+def get_pipeline(
         pipeline_name,
         model_args,
         data_args,

From 73c1d2a8cfc549acd53205347f66cd54a990810a Mon Sep 17 00:00:00 2001
From: sYc
Date: Thu, 8 Jun 2023 15:20:55 +0800
Subject: [PATCH 34/43] tiny updates

---
 src/lmflow/pipeline/evaluator.py   | 5 ++++-
 src/lmflow/pipeline/test_rougel.py | 6 +++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/lmflow/pipeline/evaluator.py b/src/lmflow/pipeline/evaluator.py
index 7b1bec54c..fb574755a 100644
--- a/src/lmflow/pipeline/evaluator.py
+++ b/src/lmflow/pipeline/evaluator.py
@@ -220,7 +220,10 @@ def evaluate(self, model, dataset: Dataset, metric = "accuracy"):
             else:
                 dist.all_reduce(all_process, dist.ReduceOp.MAX, async_op=False) # sum or max, which is better?
             sum_or_max, total_ = all_process.tolist()
-            avg = sum_or_max / total_
+            if metric in ["acc", "accuracy"]:
+                avg = sum_or_max / total_
+            else:
+                avg = sum_or_max
             all_list.append(avg)
             total += total_
 
diff --git a/src/lmflow/pipeline/test_rougel.py b/src/lmflow/pipeline/test_rougel.py
index 6ede17dd0..e046dee9d 100644
--- a/src/lmflow/pipeline/test_rougel.py
+++ b/src/lmflow/pipeline/test_rougel.py
@@ -174,7 +174,7 @@ def evaluate(self, model, dataset: Dataset, metric="rougel"):
             output_writer = open(f"{self.evaluator_args.output_dir}/evaluation.json", "w")
 
         pred_score_list = []  # list to record the ROUGE-L scores of all batches from LMFlow method
-
+        total = 0
         # ds_engine = deepspeed.initialize(model=model.get_model(), config_params=self.ds_config)[0]
         # ds_engine.module.eval()
         for batch_index, batch in enumerate(dataloader):
@@ -233,7 +233,7 @@ def evaluate(self, model, dataset: Dataset, metric="rougel"):
             # avg = max_ / total_
             avg = max_
             pred_score_list.append(avg)
-            # total += total_
+            total += total_
 
             # collect predictions from all gpus
             output_dict = {"question": input,
@@ -248,7 +248,7 @@ def evaluate(self, model, dataset: Dataset, metric="rougel"):
             if not dist.is_initialized() or dist.get_rank() == 0:
                 current_rouge_l = np.mean(pred_score_list)
                 print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
-                      "{}/ {} has been finished, current ROUGE-L = {}".format(int(pred_score_list), data_size,
+                      "{}/ {} has been finished, current ROUGE-L = {}".format(int(total), data_size,
                                                                               current_rouge_l))
 
                 if (self.evaluator_args.use_wandb == True):

From ddc294d9d90a142a9d7b00a22d6da4f84037ae64 Mon Sep 17 00:00:00 2001
From: sYc
Date: Thu, 8 Jun 2023 20:12:55 +0800
Subject: [PATCH 35/43] fix issues of JSON dataset format

---
 src/lmflow/pipeline/test_rougel.py | 42 +++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 15 deletions(-)

diff --git a/src/lmflow/pipeline/test_rougel.py b/src/lmflow/pipeline/test_rougel.py
index 
e046dee9d..f6fe0ad68 100644
--- a/src/lmflow/pipeline/test_rougel.py
+++ b/src/lmflow/pipeline/test_rougel.py
@@ -77,29 +77,38 @@ def __init__(self, model_args, data_args, evaluator_args):
 
     # First use the method in self-instruct to get the ROUGE-L scores for the dataset, then use the method in LMFlow and compare the two scores,
     # The metric is tested to be valid if all scores are the same.
-    def get_rougel_score_list(self, predicted_data: str):
+    def get_rougel_score_list(self, predicted_data_path: str):
         scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)
 
-        dataset = load_data(predicted_data)
-        data_dict = dataset.to_dict()
-        inputs = [instance["input"] for instance in data_dict["instances"]]
-        outputs = [instance["output"] for instance in data_dict["instances"]]
-        dataset_size = len(outputs)
-
-        dataset_buf = []
+        # Open the input file
+        with open(predicted_data_path, encoding='utf-8') as f:
+            # Read in the contents of the file
+            contents = f.read()
+            # Split the contents of the file into individual JSON objects
+            objects = contents.strip().split('\n')
+            # Add square brackets to the beginning and end of the list of objects
+            json_array = '[' + ','.join(objects) + ']'
+            # Convert the JSON array to a Python object
+            json_data = json.loads(json_array)
+
+        pred_answers = [instance["pred_answer"] for instance in json_data]
+        answers = [instance["answer"] for instance in json_data]
+        dataset_size = len(answers)
+
+        pred_dataset_buf = []
         for idx in range(dataset_size):
-            dataset_buf.append({
-                "input": inputs[idx],
-                "output": outputs[idx],
-                "input_idx": idx
+            pred_dataset_buf.append({
+                "input": pred_answers[idx],
+                "output": answers[idx],
+                "idx": idx
             })
 
         dataloader = batchlize( # split into chunks of minibatch_size, dataloader = [[{}, {}, ... ], [{}, {}, ... ], ... 
]
-            dataset_buf,
+            pred_dataset_buf,
             self.evaluator_args.minibatch_size, # = self.world_size
             self.evaluator_args.random_shuffle
         )
-        print(f"Successfully create dataloader with size {len(dataloader)}.")
+        print(f"Successfully create dataloader of predictions and answers with size {len(dataloader)}.")
 
         score_list = [] # store the maximum ROUGE-L score in each batch
 
@@ -127,6 +136,8 @@ def create_dataloader(self, dataset: Dataset):
                 "output": outputs[idx],
                 "input_idx": idx
             })
+            if idx == 12: # to be removed later
+                break
 
         dataloader = batchlize(
             dataset_buf,
@@ -171,7 +182,7 @@ def evaluate(self, model, dataset: Dataset, metric="rougel"):
         if not dist.is_initialized() or dist.get_rank() == 0:
             if not os.path.exists(self.evaluator_args.output_dir):
                 os.makedirs(self.evaluator_args.output_dir)
-            output_writer = open(f"{self.evaluator_args.output_dir}/evaluation.json", "w")
+            output_writer = open(f"{self.evaluator_args.output_dir}/evaluation.json", "w") # ./output_dir/evaluation.json
 
         pred_score_list = []  # list to record the ROUGE-L scores of all batches from LMFlow method
         total = 0
@@ -262,6 +273,7 @@ def evaluate(self, model, dataset: Dataset, metric="rougel"):
             current_rouge_l = np.mean(pred_score_list)
             print("Final ROUGE-L = ", current_rouge_l)
             output_writer.close()
+            print("__________________________\n__________________________")
 
         else:
             raise NotImplementedError(f"{metric} is not implemented or not match with our defined metrics")

From f088ab6c3e2320cbb623f45ff68adaa3821c7e79 Mon Sep 17 00:00:00 2001
From: sYc
Date: Thu, 8 Jun 2023 20:40:51 +0800
Subject: [PATCH 36/43] fix the Pool issue

---
 src/lmflow/pipeline/test_rougel.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/lmflow/pipeline/test_rougel.py b/src/lmflow/pipeline/test_rougel.py
index f6fe0ad68..9f0821c85 100644
--- a/src/lmflow/pipeline/test_rougel.py
+++ b/src/lmflow/pipeline/test_rougel.py
@@ -116,10 +116,13 @@ def get_rougel_score_list(self, predicted_data_path: str):
             input_ = [data["input"] for data in batch]
             output_ = [data["output"] for data in batch]
             with Pool(4) as p: # 4 processes
-                rouge_scores = p.map(partial(scorer.score, input_), [output_])
-                rouge_scores = [score["rougeL"].fmeasure for score in rouge_scores] # score["rougeL"].fmeasure is the score of the corresponding pair
-                max_rl_score = max(rouge_scores)
-                score_list.append(max_rl_score)
+                scores = []
+                for idx in range(4):
+                    rouge_scores = p.map(partial(scorer.score, input_[idx]), [output_[idx]]) # 1-to-1, so the Pool does not help much
+                    rouge_scores = [score["rougeL"].fmeasure for score in rouge_scores] # score["rougeL"].fmeasure is the score of the corresponding pair
+                    max_rl_score = max(rouge_scores)
+                    scores.append(max_rl_score)
+                score_list.append(max(scores))
 
         return score_list
 

From b6f703fbdd69d5eee883135a96c2ecb6c00358d7 Mon Sep 17 00:00:00 2001
From: sYc
Date: Thu, 8 Jun 2023 20:45:10 +0800
Subject: [PATCH 37/43] fix Pool issue

---
 src/lmflow/pipeline/test_rougel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lmflow/pipeline/test_rougel.py b/src/lmflow/pipeline/test_rougel.py
index 9f0821c85..9be98c697 100644
--- a/src/lmflow/pipeline/test_rougel.py
+++ b/src/lmflow/pipeline/test_rougel.py
@@ -117,7 +117,7 @@ def get_rougel_score_list(self, predicted_data_path: str):
             output_ = [data["output"] for data in batch]
             with Pool(4) as p: # 4 processes
                 scores = []
-                for idx in range(4):
+                for idx in range(len(input_)):
                     rouge_scores = p.map(partial(scorer.score, input_[idx]), [output_[idx]]) # 1-to-1, so the Pool does not help much
                     rouge_scores = [score["rougeL"].fmeasure for score in rouge_scores] # 
score["rougeL"].fmeasure is the score of the corresponding pair
                     max_rl_score = max(rouge_scores)
 

From 70cb3c67f5ead9f5f2476dc8d43643056bc2f5e0 Mon Sep 17 00:00:00 2001
From: sYc
Date: Thu, 8 Jun 2023 20:51:21 +0800
Subject: [PATCH 38/43] fix weird precision error in comparison of scores

---
 src/lmflow/pipeline/test_rougel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lmflow/pipeline/test_rougel.py b/src/lmflow/pipeline/test_rougel.py
index 9be98c697..1e9ca5f60 100644
--- a/src/lmflow/pipeline/test_rougel.py
+++ b/src/lmflow/pipeline/test_rougel.py
@@ -288,7 +288,7 @@ def evaluate(self, model, dataset: Dataset, metric="rougel"):
             matched = True
             for pred, ans in zip(pred_score_list, ans_score_list):
                 print("LMFlow ROUGE-L: ", pred, " -- self-instruct ROUGE-L: ", ans)
-                if pred != ans:
+                if round(pred, 5) != round(ans, 5):
                     matched = False
                     print("scores not matched!")
                     return

From 0a291675f14e04a4f23d24a2f66ee766cb657b9d Mon Sep 17 00:00:00 2001
From: sYc
Date: Sat, 17 Jun 2023 20:36:50 -0700
Subject: [PATCH 39/43] similar to evaluate.py

---
 src/lmflow/pipeline/test_rougel.py | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/src/lmflow/pipeline/test_rougel.py b/src/lmflow/pipeline/test_rougel.py
index 1e9ca5f60..2b8a5f832 100644
--- a/src/lmflow/pipeline/test_rougel.py
+++ b/src/lmflow/pipeline/test_rougel.py
@@ -24,7 +24,7 @@
 #copied from evaluator.py
 class Test_rougel(BasePipeline):
     """
-    Initializes the `Evaluator` class with given arguments.
+    Initializes the `Test_rougel` class with given arguments.
 
     Parameters
     ------------
@@ -80,15 +80,11 @@ def __init__(self, model_args, data_args, evaluator_args):
     def get_rougel_score_list(self, predicted_data_path: str):
         scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)
 
-        # Open the input file
+
         with open(predicted_data_path, encoding='utf-8') as f:
-            # Read in the contents of the file
             contents = f.read()
-            # Split the contents of the file into individual JSON objects
             objects = contents.strip().split('\n')
-            # Add square brackets to the beginning and end of the list of objects
             json_array = '[' + ','.join(objects) + ']'
-            # Convert the JSON array to a Python object
             json_data = json.loads(json_array)
 
         pred_answers = [instance["pred_answer"] for instance in json_data]
@@ -103,7 +99,7 @@ def get_rougel_score_list(self, predicted_data_path: str):
                 "idx": idx
             })
 
-        dataloader = batchlize( # split into chunks of minibatch_size, dataloader = [[{}, {}, ... ], [{}, {}, ... ], ... ]
+        dataloader = batchlize(
             pred_dataset_buf,
             self.evaluator_args.minibatch_size, # = self.world_size
             self.evaluator_args.random_shuffle
@@ -118,8 +114,8 @@ def get_rougel_score_list(self, predicted_data_path: str):
             with Pool(4) as p: # 4 processes
                 scores = []
                 for idx in range(len(input_)):
-                    rouge_scores = p.map(partial(scorer.score, input_[idx]), [output_[idx]]) # 1-to-1, so the Pool does not help much
-                    rouge_scores = [score["rougeL"].fmeasure for score in rouge_scores] # score["rougeL"].fmeasure is the score of the corresponding pair
+                    rouge_scores = p.map(partial(scorer.score, input_[idx]), [output_[idx]])
+                    rouge_scores = [score["rougeL"].fmeasure for score in rouge_scores]
                     max_rl_score = max(rouge_scores)
                     scores.append(max_rl_score)
                     score_list.append(max(scores))
@@ -240,7 +236,7 @@ def evaluate(self, model, dataset: Dataset, metric="rougel"):
 
         # collect rouge-l from all gpus
         all_process = torch.tensor([rl_, total_], dtype=torch.float32, device=self.local_rank)
-        dist.all_reduce(all_process, dist.ReduceOp.MAX, async_op=False) # sum or max, which is better? 
+        dist.all_reduce(all_process, dist.ReduceOp.MAX, async_op=False)
         max_, total_ = all_process.tolist()
         print("max_: ", max_)
         print("total_: ", total_)
@@ -257,7 +253,7 @@ def evaluate(self, model, dataset: Dataset, metric="rougel"):
             all_process_list = [{}] * self.world_size
             dist.gather_object(output_dict, all_process_list if dist.get_rank() == 0 else None,
-                               dst=0) # only gather on process 0??
+                               dst=0)
             print("all_process_list: ", all_process_list)
         if not dist.is_initialized() or dist.get_rank() == 0:
             current_rouge_l = np.mean(pred_score_list)
@@ -272,7 +268,7 @@ def evaluate(self, model, dataset: Dataset, metric="rougel"):
             output_json = json.dumps(output)
             output_writer.write(output_json + '\n')
 
-        if not dist.is_initialized() or dist.get_rank() == 0: # the whole dataset has been processed at this point
+        if not dist.is_initialized() or dist.get_rank() == 0:
             current_rouge_l = np.mean(pred_score_list)
             print("Final ROUGE-L = ", current_rouge_l)
             output_writer.close()

From 727c82d867ceb35ceb6a4fcc0fbcff0f30794b63 Mon Sep 17 00:00:00 2001
From: sYc
Date: Sat, 17 Jun 2023 20:37:52 -0700
Subject: [PATCH 40/43] Similar to evaluator.py

---
 src/lmflow/pipeline/test_rougel.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/lmflow/pipeline/test_rougel.py b/src/lmflow/pipeline/test_rougel.py
index 2b8a5f832..e01f57085 100644
--- a/src/lmflow/pipeline/test_rougel.py
+++ b/src/lmflow/pipeline/test_rougel.py
@@ -9,7 +9,6 @@
 from rouge_score import rouge_scorer
 from multiprocessing import Pool
 from functools import partial
-# TODO: remove later
 from transformers import AutoConfig
 # from lmflow.pipeline.auto_pipeline import AutoPipeline
 

From 48a392bde6c4730a92df1227be2d6d22fcdcc7da Mon Sep 17 00:00:00 2001
From: sYc
Date: Sat, 17 Jun 2023 20:39:21 -0700
Subject: [PATCH 41/43] Update test.py

Similar to evaluate.py
---
 examples/test.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/test.py b/examples/test.py
index 7f7673999..0007b8dd2 100644
--- a/examples/test.py
+++ b/examples/test.py
@@ -9,7 +9,6 @@
 from rouge_score import rouge_scorer
 from multiprocessing import Pool
 from functools import partial
-# TODO: remove later
 from transformers import AutoConfig
 
 # from lmflow.pipeline.auto_pipeline import AutoPipeline

From eefad15adc1f0e9c8ae1aff1b7a1501c496ededd Mon Sep 17 00:00:00 2001
From: sYc
Date: Sat, 17 Jun 2023 20:40:57 -0700
Subject: [PATCH 42/43] Update test_rougel.py

---
 examples/test_rougel.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/test_rougel.py b/examples/test_rougel.py
index 25ef53c50..2f56d5746 100644
--- a/examples/test_rougel.py
+++ b/examples/test_rougel.py
@@ -249,7 +249,7 @@ def evaluate(self, model, dataset: Dataset, metric="rougel"):
 
         # collect rouge-l from all gpus
         all_process = torch.tensor([rl_, total_], dtype=torch.float32, device=self.local_rank)
-        dist.all_reduce(all_process, dist.ReduceOp.MAX, async_op=False) # sum or max, which is better?
+        dist.all_reduce(all_process, dist.ReduceOp.MAX, async_op=False)
         max_, total_ = all_process.tolist()
         print("max_: ", max_)
         print("total_: ", total_)
@@ -266,7 +266,7 @@ def evaluate(self, model, dataset: Dataset, metric="rougel"):
             all_process_list = [{}] * self.world_size
             dist.gather_object(output_dict, all_process_list if dist.get_rank() == 0 else None,
-                               dst=0) # only gather on process 0?? 
+                               dst=0)
             print("all_process_list: ", all_process_list)
         if not dist.is_initialized() or dist.get_rank() == 0:
             current_rouge_l = np.mean(pred_score_list)
@@ -281,7 +281,7 @@ def evaluate(self, model, dataset: Dataset, metric="rougel"):
             output_json = json.dumps(output)
             output_writer.write(output_json + '\n')
 
-        if not dist.is_initialized() or dist.get_rank() == 0: # the whole dataset has been processed at this point
+        if not dist.is_initialized() or dist.get_rank() == 0:
             current_rouge_l = np.mean(pred_score_list)
             print("Final ROUGE-L = ", current_rouge_l)
             output_writer.close()

From b5abd56695a75d158f7065cf9c87e90a68a486cb Mon Sep 17 00:00:00 2001
From: sYc
Date: Sat, 17 Jun 2023 20:43:29 -0700
Subject: [PATCH 43/43] Update rougel_test_case.sh

---
 scripts/rougel_test_case.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/rougel_test_case.sh b/scripts/rougel_test_case.sh
index 6074489e8..7d38696ac 100644
--- a/scripts/rougel_test_case.sh
+++ b/scripts/rougel_test_case.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 
 # Check whether self-instruct and LMFlow give the same ROUGE-L scores for the same instructions
+# A test case for the ROUGE-L metric used in evaluation.
+# Compares the ROUGE-L scores from self-instruct and LMFlow. Valid if all scores in each pair are the same.
 
 CUDA_VISIBLE_DEVICES=0 \
     deepspeed examples/test.py \
     --answer_type text \
     --model_name_or_path gpt2-large \
     --dataset_path data/alpaca/test \
     --deepspeed examples/ds_config.json \
     --inference_batch_size_per_device 1 \
-    --metric rouge-l
\ No newline at end of file
+    --metric rouge-l
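
For reference, a minimal sketch of the per-pair ROUGE-L scoring and the rounded comparison that this test case exercises. It assumes only the rouge_score package already used by the series; the sample strings are illustrative and are not taken from the LMFlow datasets.

    from rouge_score import rouge_scorer

    # Same scorer configuration as the pipeline: ROUGE-L only, no stemming.
    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)

    # Hypothetical ground-truth / prediction pair, for illustration only.
    groundtruth = "The watermelon seeds pass through your digestive system."
    predicted_answer = "The seeds pass through your digestive system."

    # score(target, prediction) returns {"rougeL": Score(precision, recall, fmeasure)};
    # both code paths in this series keep only the f-measure.
    lmflow_score = scorer.score(groundtruth, predicted_answer)["rougeL"].fmeasure
    self_instruct_score = scorer.score(groundtruth, predicted_answer)["rougeL"].fmeasure

    # As in PATCH 38, the two paths are compared after rounding to 5 decimal
    # places, so tiny floating-point differences do not fail the test.
    if round(lmflow_score, 5) == round(self_instruct_score, 5):
        print("scores matched. Tested to be valid.")
    else:
        print("scores not matched!")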