16 changes: 8 additions & 8 deletions ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml
@@ -8,22 +8,22 @@ Global:

 Model:
   vocab_size: 50304
-  hidden_size: 2048
-  num_layers: 24
-  num_attention_heads: 16
+  hidden_size: 512
+  num_layers: 1
+  num_attention_heads: 8
   ffn_hidden_size:
-  hidden_dropout_prob: 0.1
-  attention_probs_dropout_prob: 0.1
+  hidden_dropout_prob: 0
+  attention_probs_dropout_prob: 0
   max_position_embeddings: 1024
   type_vocab_size: 16
   initializer_range: 0.02
-  use_recompute: True
+  use_recompute: False
   recompute_granularity:


 Distributed:
-  dp_degree: 8
-  mp_degree: 1
+  dp_degree: 1
+  mp_degree: 2
   pp_degree: 1
   sharding:
     sharding_degree: 1
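For reference, the Distributed block above (dp_degree: 1, mp_degree: 2, pp_degree: 1) corresponds to PaddlePaddle's hybrid-parallel setup via paddle.distributed.fleet, and the product of the degrees must match the number of launched devices (here 2 GPUs). A minimal sketch under that assumption; variable names are illustrative and not taken from this PR:

import paddle.distributed.fleet as fleet

# Mirror the Distributed section of the YAML: 1-way data parallel,
# 2-way tensor (model) parallel, no pipeline parallelism.
strategy = fleet.DistributedStrategy()
strategy.hybrid_configs = {
    "dp_degree": 1,
    "mp_degree": 2,
    "pp_degree": 1,
}
fleet.init(is_collective=True, strategy=strategy)

# The hybrid communicate group exposes the per-dimension process groups.
hcg = fleet.get_hybrid_communicate_group()
print(hcg.get_model_parallel_world_size())  # expected: 2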
10 changes: 5 additions & 5 deletions ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml
@@ -8,12 +8,12 @@ Global:

 Model:
   vocab_size: 50304
-  hidden_size: 1024
-  num_layers: 24
+  hidden_size: 2048
+  num_layers: 1
   num_attention_heads: 16
-  ffn_hidden_size: 4096
-  hidden_dropout_prob: 0.1
-  attention_probs_dropout_prob: 0.1
+  ffn_hidden_size:
+  hidden_dropout_prob: 0
+  attention_probs_dropout_prob: 0
   max_position_embeddings: 1024
   type_vocab_size: 16
   initializer_range: 0.02
2 changes: 1 addition & 1 deletion ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml
@@ -16,7 +16,7 @@ Engine:
   eval_iters: 10
   test_iters:
   mix_precision:
-    use_pure_fp16: True
+    use_pure_fp16: False
     scale_loss: 32768.0
     custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"]
     custom_white_list: ["lookup_table", "lookup_table_v2"]
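For context, use_pure_fp16 together with scale_loss and the custom black/white lists corresponds to Paddle's AMP "O2" (pure fp16) mode. A minimal sketch of that pattern with a toy layer; the model and data are illustrative, while the 32768.0 loss scale and the op lists are taken from the config above:

import paddle

model = paddle.nn.Linear(16, 16)
optimizer = paddle.optimizer.AdamW(parameters=model.parameters())

# "Pure fp16" (AMP level O2): parameters are cast to fp16 up front,
# while ops in the black list stay in fp32.
model, optimizer = paddle.amp.decorate(models=model, optimizers=optimizer, level='O2')
scaler = paddle.amp.GradScaler(init_loss_scaling=32768.0)  # matches scale_loss

x = paddle.randn([4, 16])
with paddle.amp.auto_cast(
        custom_black_list=["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"],
        custom_white_list=["lookup_table", "lookup_table_v2"],
        level='O2'):
    loss = model(x).mean()

scaler.scale(loss).backward()  # scale the loss to avoid fp16 gradient underflow
scaler.step(optimizer)         # unscales gradients, then runs optimizer.step()
scaler.update()                # adjusts the loss scale for the next iteration
optimizer.clear_grad()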
42 changes: 3 additions & 39 deletions ppfleetx/core/engine/eager_engine.py
@@ -259,7 +259,8 @@ def _train_one_epoch(self,
         # Note(GuoxiaWang): Do not use len(train_data_loader()),
         # it will cause a memory leak.
         total_train_batch = len(train_data_loader)
-        total_eval_batch = len(valid_data_loader) if valid_data_loader is not None else 0
+        total_eval_batch = len(
+            valid_data_loader) if valid_data_loader is not None else 0
         for step, batch in enumerate(train_data_loader):

             if epoch_index == self._load_recovery['epoch']:
@@ -287,44 +288,6 @@ def _train_one_epoch(self,
                 train_start = time.time()
                 train_losses = []

-            if self._run_mode == 'step' and not skip_first:
-                if step % self._eval_freq == 0:
-                    self._module.model.eval()
-
-                    eval_losses = []
-                    eval_start = time.time()
-
-                    for eval_step, batch in enumerate(valid_data_loader):
-                        loss = self._evaluate_impl(batch)
-                        eval_losses.append(loss)
-
-                        if eval_step >= self._eval_iters - 1:
-                            break
-
-                    paddle.device.cuda.synchronize()
-                    eval_cost = time.time() - eval_start
-                    eval_loss = sum(eval_losses) / len(eval_losses)
-
-                    log_dict = {
-                        'loss': eval_loss.numpy()[0],
-                        'epoch': epoch_index,
-                        'batch': eval_step,
-                        'total_batch': total_eval_batch,
-                        'eval_cost': eval_cost / self._logging_freq,
-                    }
-                    self._module.validation_step_end(log_dict)
-
-                    self._module.model.train()
-
-                    if self._save_steps > 0 and step % self._save_steps == 0:
-                        self.save(epoch=epoch_index, step=step)
-                else:
-                    skip_first = False
-
-            if self._run_mode == 'step' and step >= self._max_steps:
-                logger.info("The training process is complete.")
-                return
-
             if self.profiler:
                 self.profiler.step()

@@ -432,6 +395,7 @@ def _optim_update_params(self):
                         p.bw_storage.scale_(1.0 / self._dp_group.nranks)
                         dist.all_reduce(p.bw_storage, group=self._dp_group)

+        # print(">>>", self._optimizer._parameter_list)
         if self._use_pure_fp16:
             self._scaler.step(self._optimizer)
             self._scaler.update()
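As a side note on the _optim_update_params hunk above: scaling bw_storage by 1/nranks and then all-reducing implements gradient averaging over the data-parallel group for sharding stage 3. A rough sketch of the same averaging pattern on ordinary dygraph gradients; bw_storage is an internal sharding buffer, so the hypothetical helper below uses p.grad instead:

import paddle.distributed as dist

def average_grads_over_dp_group(parameters, dp_group):
    # Pre-scale each gradient by 1/nranks, then sum-reduce across ranks,
    # which is equivalent to averaging the gradients over the group.
    for p in parameters:
        if p.grad is not None:
            p.grad.scale_(1.0 / dp_group.nranks)
            dist.all_reduce(p.grad, group=dp_group)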
2 changes: 1 addition & 1 deletion projects/gpt/pretrain_gpt_1.3B_dp8.sh
@@ -18,6 +18,6 @@ log_dir=log_hybrid
 rm -rf $log_dir

 # 1.3B+dp8 run_pretrain
-python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \
+python -m paddle.distributed.launch --log_dir $log_dir --devices "6,7" \
     ./tools/train.py \
     -c ./ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml