diff --git a/src/lmflow/args.py b/src/lmflow/args.py
index 3f160ee07..a495ce071 100644
--- a/src/lmflow/args.py
+++ b/src/lmflow/args.py
@@ -561,7 +561,7 @@ class DatasetArguments:
     validation_split_percentage: Optional[int] = field(
         default=5,
         metadata={
-            "help": "The percentage of the train set used as validation set in case there's no validation split"
+            "help": "The percentage of the train set used as validation set in case there's no eval dataset."
         },
     )
     preprocessing_num_workers: Optional[int] = field(
diff --git a/src/lmflow/datasets/dataset.py b/src/lmflow/datasets/dataset.py
index 1ce9f3a4d..45b2bad2c 100644
--- a/src/lmflow/datasets/dataset.py
+++ b/src/lmflow/datasets/dataset.py
@@ -265,15 +265,16 @@ def from_dict(self, dict_obj: dict, *args, **kwargs):
 
     @classmethod
-    def create_from_dict(cls, dict_obj, *args, **kwargs):
+    def create_from_dict(cls, dict_obj, dataset_args: Optional[DatasetArguments] = None, backend: str = "huggingface"):
         r"""
         Returns
         --------
 
         Returns a Dataset object given a dict.
         """
-        empty_data_args = DatasetArguments(dataset_path=None)
-        dataset = Dataset(empty_data_args)
+        if dataset_args is None:
+            dataset_args = DatasetArguments(dataset_path=None)
+        dataset = Dataset(dataset_args, backend=backend)
 
         return dataset.from_dict(dict_obj)
@@ -467,14 +468,12 @@ def sample(self, n: int, seed: int=42):
         if self.backend == "huggingface":
             sampled_dataset = self.backend_dataset.shuffle(seed=seed).select(range(n))
             output_dataset = self.create_from_dict(
-                {
-                    "type": self.get_type(),
-                    "instances": [
-                        {
-                            col_name: sampled_dataset[col_name][i] for col_name in sampled_dataset.column_names
-                        } for i in range(n)
-                    ]
-                }
+                dict_obj={
+                    "type": self.get_type(),
+                    "instances": [data_point for data_point in tqdm(sampled_dataset, desc="Train Dataset")]
+                },
+                dataset_args=self.data_args,
+                backend=self.backend,
             )
             return output_dataset
         else:
@@ -506,24 +505,20 @@ def train_test_split(self, test_size: float=0.2, shuffle: bool=True, seed: int=42):
                 test_size=test_size, shuffle=shuffle, seed=seed
             )
             train_dataset = self.create_from_dict(
-                {
-                    "type": self.get_type(),
-                    "instances": [
-                        {
-                            col_name: splited["train"][col_name][i] for col_name in splited["train"].column_names
-                        } for i in range(len(splited["train"]))
-                    ]
-                }
+                dict_obj={
+                    "type": self.get_type(),
+                    "instances": [data_point for data_point in tqdm(splited["train"], desc="Train Dataset")]
+                },
+                dataset_args=self.data_args,
+                backend=self.backend,
             )
             test_dataset = self.create_from_dict(
-                {
-                    "type": self.get_type(),
-                    "instances": [
-                        {
-                            col_name: splited["test"][col_name][i] for col_name in splited["test"].column_names
-                        } for i in range(len(splited["test"]))
-                    ]
-                }
+                dict_obj={
+                    "type": self.get_type(),
+                    "instances": [data_point for data_point in tqdm(splited["test"], desc="Test Dataset")]
+                },
+                dataset_args=self.data_args,
+                backend=self.backend,
            )
             return train_dataset, test_dataset
         else:
diff --git a/src/lmflow/pipeline/finetuner.py b/src/lmflow/pipeline/finetuner.py
index e4958aca7..a1e2af91e 100644
--- a/src/lmflow/pipeline/finetuner.py
+++ b/src/lmflow/pipeline/finetuner.py
@@ -35,6 +35,7 @@
 import lmflow.optim.optimizers as optim
 from lmflow.args import OptimizerNames, DatasetArguments, ModelArguments, FinetunerArguments
 from lmflow.datasets.dataset import Dataset
+from lmflow.models.hf_decoder_model import HFDecoderModel
 from lmflow.pipeline.base_tuner import BaseTuner
 from lmflow.pipeline.utils.peft_trainer import PeftTrainer, PeftSavingCallback
 
@@ -415,10 +416,28 @@ def create_optimizer(self):
                     self.optimizer = smp.DistributedOptimizer(self.optimizer)
 
         return CustomizedOptimTrainer
+
+    def __tokenize_dataset(
+        self,
+        model: "HFDecoderModel",
+        dataset: "Dataset",
+    ) -> "Dataset":
+        # Tokenization and text grouping must be done in the main process
+        with self.finetuner_args.main_process_first(desc="dataset map tokenization"):
+            tokenized_dataset = model.tokenize(dataset)
+            if self.data_args.disable_group_texts:
+                lm_dataset = tokenized_dataset
+            else:
+                lm_dataset = self.group_text(
+                    tokenized_dataset,
+                    model_max_length=model.get_max_length(),
+                )
+
+        return lm_dataset
 
     def tune(self,
              model,
-             dataset,
+             dataset: "Dataset",
             transform_dataset_in_place=True,
             data_collator=None):
        """
@@ -439,57 +458,38 @@ def tune(self,
         if not transform_dataset_in_place:
             dataset = copy.deepcopy(dataset)
 
-        # Tokenization and text grouping must be done in the main process
+        train_dataset = None
+        eval_dataset = None
+
         if dataset.backend == "custom_multi_modal":
             dataset.backend_dataset.register_tokenizer(
                 model.tokenizer, model.image_processor)
-            lm_dataset = dataset
+            train_dataset = dataset.get_backend_dataset()
         else:
-            with finetuner_args.main_process_first(desc="dataset map tokenization"):
-                tokenized_dataset = model.tokenize(dataset)
-                if data_args.disable_group_texts:
-                    lm_dataset = tokenized_dataset
-                else:
-                    lm_dataset = self.group_text(
-                        tokenized_dataset,
-                        model_max_length=model.get_max_length(),
+            if finetuner_args.do_eval:
+                if finetuner_args.eval_dataset_path is None:
+                    assert data_args.validation_split_percentage != 0, (
+                        "You've set `do_eval=True`. If you don't provide an evaluation dataset using"
+                        " `eval_dataset_path`, please set `validation_split_percentage` to a non-zero"
+                        " value."
                     )
-
-        train_dataset = lm_dataset.get_backend_dataset()
-        logger.info(f"Number of train samples: {len(train_dataset)}")
-
-        if finetuner_args.do_eval:
-            eval_dataset_args = deepcopy(data_args)
-            eval_dataset_args.dataset_path = finetuner_args.eval_dataset_path
-            eval_dataset = Dataset(eval_dataset_args)
-            with finetuner_args.main_process_first(desc="dataset map tokenization"):
-                tokenized_dataset = model.tokenize(eval_dataset)
-                if data_args.disable_group_texts:
-                    lm_dataset = tokenized_dataset
-                else:
-                    lm_dataset = self.group_text(
-                        tokenized_dataset,
-                        model_max_length=model.get_max_length(),
+                    train_dataset_raw, eval_dataset_raw = dataset.train_test_split(
+                        test_size=data_args.validation_split_percentage / 100,
+                        shuffle=True,
+                        seed=finetuner_args.seed,
                     )
-            eval_dataset = lm_dataset.get_backend_dataset()
-            logger.info(f"Number of eval samples: {len(eval_dataset)}")
-
-        def preprocess_logits_for_metrics(logits, labels):
-            if isinstance(logits, tuple):
-                # Depending on the model and config, logits may contain extra tensors,
-                # like past_key_values, but logits always come first
-                logits = logits[0]
-            return logits.argmax(dim=-1)
-
-        metric = evaluate.load("accuracy")
-
-        def compute_metrics(eval_preds):
-            preds, labels = eval_preds
-            # preds have the same shape as the labels, after the argmax(-1) has been calculated
-            # by preprocess_logits_for_metrics but we need to shift the labels
-            labels = labels[:, 1:].reshape(-1)
-            preds = preds[:, :-1].reshape(-1)
-            return metric.compute(predictions=preds, references=labels)
+                    train_dataset = self.__tokenize_dataset(model, train_dataset_raw).get_backend_dataset()
+                    eval_dataset = self.__tokenize_dataset(model, eval_dataset_raw).get_backend_dataset()
+                else:
+                    eval_dataset_args = deepcopy(data_args)
+                    eval_dataset_args.dataset_path = finetuner_args.eval_dataset_path
+                    eval_dataset_raw = Dataset(eval_dataset_args)
+                    eval_dataset = self.__tokenize_dataset(model, eval_dataset_raw).get_backend_dataset()
+                logger.info(f"Number of eval samples: {len(eval_dataset)}")
+
+            else:
+                train_dataset = self.__tokenize_dataset(model, dataset).get_backend_dataset()
+            logger.info(f"Number of train samples: {len(train_dataset)}")
 
         if finetuner_args.do_train:
             if data_args.max_train_samples is not None:
@@ -583,8 +583,6 @@ def switch_active_layers(self):
             tokenizer=model.get_tokenizer(),
             # Data collator will default to DataCollatorWithPadding, so we change it.
             data_collator=data_collator,
-            compute_metrics=compute_metrics if training_args.do_eval else None,
-            preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval else None,
             callbacks=trainer_callbacks
         )
         # Training
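
Usage note (not part of the patch): a minimal sketch of how the extended create_from_dict signature and train_test_split are expected to be used together after this change. The text_only payload below is illustrative sample data, not taken from the repository, and the call pattern assumes the defaults shown in the diff.

    from lmflow.args import DatasetArguments
    from lmflow.datasets.dataset import Dataset

    # Build a small in-memory dataset; dataset_args and backend are now forwarded
    # to the new Dataset instead of always falling back to empty DatasetArguments.
    data_args = DatasetArguments(dataset_path=None)
    dataset = Dataset.create_from_dict(
        dict_obj={
            "type": "text_only",
            "instances": [{"text": f"example {i}"} for i in range(10)],
        },
        dataset_args=data_args,
        backend="huggingface",
    )

    # The split returns Dataset objects that keep the original data_args and
    # backend, which is what Finetuner.tune relies on when do_eval is set
    # without an eval_dataset_path.
    train_ds, eval_ds = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)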