diff --git a/ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml b/ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml
index e291ab67a..891d56714 100644
--- a/ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml
+++ b/ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml
@@ -8,22 +8,22 @@ Global:
 
 Model:
   vocab_size: 50304
-  hidden_size: 2048
-  num_layers: 24
-  num_attention_heads: 16
+  hidden_size: 512
+  num_layers: 1
+  num_attention_heads: 8
   ffn_hidden_size:
-  hidden_dropout_prob: 0.1
-  attention_probs_dropout_prob: 0.1
+  hidden_dropout_prob: 0
+  attention_probs_dropout_prob: 0
   max_position_embeddings: 1024
   type_vocab_size: 16
   initializer_range: 0.02
-  use_recompute: True
+  use_recompute: False
   recompute_granularity:
 
 
 Distributed:
-  dp_degree: 8
-  mp_degree: 1
+  dp_degree: 1
+  mp_degree: 2
   pp_degree: 1
   sharding:
     sharding_degree: 1
diff --git a/ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml b/ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml
index 33857b512..a2fc3e428 100644
--- a/ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml
+++ b/ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml
@@ -8,12 +8,12 @@ Global:
 
 Model:
   vocab_size: 50304
-  hidden_size: 1024
-  num_layers: 24
+  hidden_size: 2048
+  num_layers: 1
   num_attention_heads: 16
-  ffn_hidden_size: 4096
-  hidden_dropout_prob: 0.1
-  attention_probs_dropout_prob: 0.1
+  ffn_hidden_size:
+  hidden_dropout_prob: 0
+  attention_probs_dropout_prob: 0
   max_position_embeddings: 1024
   type_vocab_size: 16
   initializer_range: 0.02
diff --git a/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml b/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml
index 742ad8030..e0400cd1a 100644
--- a/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml
+++ b/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml
@@ -16,7 +16,7 @@ Engine:
   eval_iters: 10
   test_iters:
   mix_precision:
-    use_pure_fp16: True
+    use_pure_fp16: False
     scale_loss: 32768.0
     custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"]
     custom_white_list: ["lookup_table", "lookup_table_v2"]
diff --git a/ppfleetx/core/engine/eager_engine.py b/ppfleetx/core/engine/eager_engine.py
index 0a461d411..33f4fb4e8 100644
--- a/ppfleetx/core/engine/eager_engine.py
+++ b/ppfleetx/core/engine/eager_engine.py
@@ -259,7 +259,8 @@ def _train_one_epoch(self,
         # Note(GuoxiaWang): Do not use len(train_data_loader()),
         # it will cause a memory leak.
         total_train_batch = len(train_data_loader)
-        total_eval_batch = len(valid_data_loader) if valid_data_loader is not None else 0
+        total_eval_batch = len(
+            valid_data_loader) if valid_data_loader is not None else 0
         for step, batch in enumerate(train_data_loader):
 
             if epoch_index == self._load_recovery['epoch']:
@@ -287,44 +288,6 @@ def _train_one_epoch(self,
                     train_start = time.time()
                     train_losses = []
 
-            if self._run_mode == 'step' and not skip_first:
-                if step % self._eval_freq == 0:
-                    self._module.model.eval()
-
-                    eval_losses = []
-                    eval_start = time.time()
-
-                    for eval_step, batch in enumerate(valid_data_loader):
-                        loss = self._evaluate_impl(batch)
-                        eval_losses.append(loss)
-
-                        if eval_step >= self._eval_iters - 1:
-                            break
-
-                    paddle.device.cuda.synchronize()
-                    eval_cost = time.time() - eval_start
-                    eval_loss = sum(eval_losses) / len(eval_losses)
-
-                    log_dict = {
-                        'loss': eval_loss.numpy()[0],
-                        'epoch': epoch_index,
-                        'batch': eval_step,
-                        'total_batch': total_eval_batch,
-                        'eval_cost': eval_cost / self._logging_freq,
-                    }
-                    self._module.validation_step_end(log_dict)
-
-                    self._module.model.train()
-
-                if self._save_steps > 0 and step % self._save_steps == 0:
-                    self.save(epoch=epoch_index, step=step)
-            else:
-                skip_first = False
-
-            if self._run_mode == 'step' and step >= self._max_steps:
-                logger.info("The training process is complete.")
-                return
-
             if self.profiler:
                 self.profiler.step()
 
@@ -432,6 +395,7 @@ def _optim_update_params(self):
                         p.bw_storage.scale_(1.0 / self._dp_group.nranks)
                         dist.all_reduce(p.bw_storage, group=self._dp_group)
 
+        # print(">>>", self._optimizer._parameter_list)
         if self._use_pure_fp16:
             self._scaler.step(self._optimizer)
             self._scaler.update()
diff --git a/projects/gpt/pretrain_gpt_1.3B_dp8.sh b/projects/gpt/pretrain_gpt_1.3B_dp8.sh
index 87a8b042e..be44a366b 100644
--- a/projects/gpt/pretrain_gpt_1.3B_dp8.sh
+++ b/projects/gpt/pretrain_gpt_1.3B_dp8.sh
@@ -18,6 +18,6 @@ log_dir=log_hybrid
 rm -rf $log_dir
 
 # 1.3B+dp8 run_pretrain
-python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \
+python -m paddle.distributed.launch --log_dir $log_dir --devices "6,7" \
     ./tools/train.py \
     -c ./ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml