7 changes: 7 additions & 0 deletions llm/auto_parallel/gpt-3/run_pretrain_auto.py
@@ -38,13 +38,16 @@
CosineAnnealingWithWarmupDecay,
GPTConfig,
GPTForCausalLMAuto,
GPTForCausalLMNet,
GPTPretrainingCriterionAuto,
GPTPretrainingCriterionNet,
LinearAnnealingWithWarmupDecay,
)
from paddlenlp.utils.log import logger

MODEL_CLASSES = {
"gpt": (GPTConfig, GPTForCausalLMAuto, GPTPretrainingCriterionAuto),
"gpt_network": (GPTConfig, GPTForCausalLMNet, GPTPretrainingCriterionNet),
}

from paddlenlp.data.causal_dataset import (
@@ -104,6 +107,10 @@ class PreTrainingArguments(AutoTrainingArguments):
default=False,
metadata={"help": "Weather to run benchmark by autotuner. True for from_scratch and pad_max_length."},
)
use_intermediate_api: bool = field(
default=False,
metadata={"help": "Weather to use auto_parallel intermediate api"},
)

def __post_init__(self):
super().__post_init__()
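For context, the pretraining scripts resolve these registry entries through the --model_type argument. The sketch below is a hedged reconstruction of that lookup pattern, not the exact script code: the surrounding variable names and the from_pretrained call are assumptions, and the role noted for the *Net class is inferred from the new use_intermediate_api flag.

```python
# Hypothetical sketch of how MODEL_CLASSES is consumed in run_pretrain_auto.py.
# "gpt"         -> GPTForCausalLMAuto (model with manual auto-parallel annotations)
# "gpt_network" -> GPTForCausalLMNet  (plain network, presumably parallelized via the
#                                      intermediate API when use_intermediate_api is true)
config_class, model_class, criterion_class = MODEL_CLASSES[model_args.model_type]

config = config_class.from_pretrained(model_args.model_name_or_path)  # assumed loading path
model = model_class.from_config(config, dtype="float32")              # mirrors the llama script below
criterion = criterion_class(config)
```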
101 changes: 101 additions & 0 deletions llm/auto_parallel/llama/llama_with_api.sh
@@ -0,0 +1,101 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -x

unset PADDLE_ELASTIC_JOB_ID
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS
unset FLAGS_START_PORT
unset PADDLE_ELASTIC_TIMEOUT

export NNODES=1
export PADDLE_TRAINERS_NUM=1

export GLOG_v=0

export FLAGS_cudnn_deterministic=1
export FLAGS_embedding_deterministic=1
export FLAGS_max_inplace_grad_add=65536
export FLAGS_enable_auto_parallel_align_mode=1
export FLAGS_enable_pir_api=1

task_name="llama_auto"
rm -rf output
rm -rf log

export SOT_LOG_LEVEL=4
export PYTHONPATH=../../../:$PYTHONPATH

#ulimit -c unlimited

python -u -m paddle.distributed.launch \
--gpus "0,1,2,3,4,5,6,7" \
--log_dir "log" \
./run_pretrain_auto.py \
--model_name_or_path "facebook/llama-7b" \
--tokenizer_name_or_path "facebook/llama-7b" \
--input_dir "./data" \
--output_dir "./output" \
--split 949,50,1 \
--to_static false \
--pipeline_parallel_degree 2 \
--sharding_parallel_degree 2 \
--tensor_parallel_degree 2 \
--virtual_pp_degree 1 \
--pipeline_schedule_mode "VPP" \
--weight_decay 0.01 \
--warmup_ratio 0.01 \
--max_grad_norm 1.0 \
--learning_rate 3e-05 \
--min_learning_rate 3e-06 \
--max_steps 10 \
--logging_steps 1 \
--eval_steps 10000 \
--save_steps 1000 \
--continue_training 0 \
--do_train true \
--do_eval false \
--do_predict false \
--disable_tqdm true \
--save_total_limit 2 \
--device gpu \
--model_type "llama_network" \
--use_intermediate_api true \
--dataloader_num_workers 4 \
--distributed_dataloader 0 \
--enable_auto_parallel 1 \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 32 \
--per_device_eval_batch_size 1 \
--recompute false \
--recompute_use_reentrant true \
--skip_profile_timer true \
--recompute_granularity full \
--pp_recompute_interval 0 \
--bf16 true \
--fp16_opt_level "O2" \
--amp_master_grad true \
--fuse_attention_ffn false \
--fuse_attention_qkv true \
--use_flash_attention true \
--use_fused_rope true \
--use_fused_rms_norm false \
--max_seq_length 4096 \
--sequence_parallel true \
--sharding "stage1" \
--sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap" \
--tensor_parallel_config "enable_mp_async_allreduce" \
--num_hidden_layers 4 \
--auto_parallel_resume_form_hybrid_parallel true \
13 changes: 9 additions & 4 deletions llm/auto_parallel/llama/run_pretrain_auto.py
@@ -41,12 +41,15 @@
LinearAnnealingWithWarmupDecay,
LlamaConfig,
LlamaForCausalLM3DAuto,
LlamaForCausalLM3DNet,
LlamaPretrainingCriterion3DAuto,
LlamaPretrainingCriterion3DNet,
)
from paddlenlp.utils.log import logger

MODEL_CLASSES = {
"llama": (LlamaConfig, LlamaForCausalLM3DAuto, LlamaPretrainingCriterion3DAuto),
"llama_network": (LlamaConfig, LlamaForCausalLM3DNet, LlamaPretrainingCriterion3DNet),
}


@@ -107,6 +110,10 @@ class PreTrainingArguments(AutoTrainingArguments):
default=False,
metadata={"help": "Weather to run benchmark by autotuner. True for from_scratch and pad_max_length."},
)
use_intermediate_api: bool = field(
default=False,
metadata={"help": "Weather to use auto_parallel intermediate api"},
)

def __post_init__(self):
super().__post_init__()
@@ -551,6 +558,7 @@ def main():
config.use_recompute = training_args.recompute
config.tensor_parallel_degree = training_args.tensor_parallel_degree
config.tensor_parallel_rank = training_args.tensor_parallel_rank
config.sharding_parallel_degree = training_args.sharding_parallel_degree

if training_args.strategy.pipeline.enable and config.virtual_pp_degree > 1:
pipeline = training_args.strategy.pipeline
@@ -571,10 +579,6 @@
model = model_class.from_config(config, dtype="float32")
criterion = criterion_class(config)

for param in model.parameters():
assert not param._is_initialized()
param.initialize()

if training_args.recompute:

def fn(layer):
@@ -628,6 +632,7 @@ def fn(layer):
eval_dataset=eval_dataset if training_args.do_eval else None,
optimizers=(None, lr_scheduler),
tokenizer=tokenizer,
model_args=model_args,
)

checkpoint = None
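Only the tail of the trainer construction is visible in this hunk; a hedged reconstruction of the full call is sketched below. The trainer class name and the arguments marked as assumed are placeholders, not taken from the diff.

```python
trainer = AutoPretrainingTrainer(      # placeholder class name
    model=model,                       # assumed; not shown in this hunk
    criterion=criterion,               # assumed; not shown in this hunk
    args=training_args,                # assumed; not shown in this hunk
    train_dataset=train_dataset if training_args.do_train else None,  # assumed
    eval_dataset=eval_dataset if training_args.do_eval else None,
    optimizers=(None, lr_scheduler),
    tokenizer=tokenizer,
    model_args=model_args,             # newly forwarded so the trainer can read model-level options
)
```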
99 changes: 99 additions & 0 deletions llm/auto_parallel/qwen/run_intermediate_api.sh
@@ -0,0 +1,99 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# just for debug

set -x
unset PADDLE_ELASTIC_JOB_ID
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS
unset FLAGS_START_PORT
unset PADDLE_ELASTIC_TIMEOUT

export NNODES=1
export PADDLE_TRAINERS_NUM=1
export FLAGS_call_stack_level=3
export FLAGS_use_cuda_managed_memory=true

task_name="llama_auto"
rm -rf output/$task_name/
rm -rf "output/$task_name""_log"

export SOT_LOG_LEVEL=4
export PYTHONPATH=../../../:$PYTHONPATH


rm -rf ./log/auto_3d_auto

export FLAGS_embedding_deterministic=1
export FLAGS_cudnn_deterministic=1
export FLAGS_max_inplace_grad_add=65536
export NVIDIA_TF32_OVERRIDE=0
export FLAGS_enable_pir_in_executor=1
export FLAGS_enable_pir_api=1


python -u -m paddle.distributed.launch \
--gpus "4,5" \
--log_dir "log/auto_3d_auto" \
run_pretrain_3D_auto.py \
--model_name_or_path "qwen/qwen-14b" \
--tokenizer_name_or_path "qwen/qwen-14b" \
--model_type "qwen_network" \
--use_intermediate_api true \
--input_dir "../data" \
--output_dir "./checkpoints/qwen_pretrain_ckpts" \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 32 \
--per_device_eval_batch_size 16 \
--sharding "stage1" \
--sharding_parallel_degree 1 \
--tensor_parallel_degree 2 \
--pipeline_parallel_degree 1 \
--virtual_pp_degree 1 \
--use_flash_attention false \
--use_fused_rms_norm false \
--use_fused_rope false \
--max_seq_length 4096 \
--learning_rate 3e-05 \
--min_learning_rate 3e-06 \
--scale_loss 1024 \
--warmup_steps 30 \
--logging_steps 1 \
--max_steps 10000 \
--save_steps 1000 \
--eval_steps 10000 \
--weight_decay 0.01 \
--bf16 true \
--fp16_opt_level "O2" \
--amp_master_grad true \
--warmup_ratio 0.01 \
--max_grad_norm 0.0 \
--dataloader_num_workers 4 \
--continue_training 0 \
--do_train true \
--do_eval false \
--do_predict false \
--disable_tqdm true \
--recompute false \
--recompute_granularity "core_attn" \
--recompute_use_reentrant true \
--distributed_dataloader 0 \
--save_total_limit 2 \
--enable_auto_parallel 1 \
--to_static 1 \
--num_hidden_layers 1 \
--attention_probs_dropout_prob 0 \
--hidden_dropout_prob 0 \
--auto_parallel_resume_form_hybrid_parallel true \
15 changes: 14 additions & 1 deletion llm/auto_parallel/qwen/run_pretrain_3D_auto.py
@@ -40,12 +40,15 @@
LinearAnnealingWithWarmupDecay,
QWenConfig,
QWenForCausalLM3DAuto,
QWenForCausalLM3DNet,
QWenPretrainingCriterionAuto,
QWenPretrainingCriterionNet,
)
from paddlenlp.utils.log import logger

MODEL_CLASSES = {
"qwen": (QWenConfig, QWenForCausalLM3DAuto, QWenPretrainingCriterionAuto),
"qwen_network": (QWenConfig, QWenForCausalLM3DNet, QWenPretrainingCriterionNet),
}

from paddlenlp.data.causal_dataset import (
@@ -113,6 +116,10 @@ class PreTrainingArguments(AutoTrainingArguments):
default=False,
metadata={"help": "whether use lazy init for model parameters"},
)
use_intermediate_api: bool = field(
default=False,
metadata={"help": "Weather to use auto_parallel intermediate api"},
)

def __post_init__(self):
super().__post_init__()
@@ -258,6 +265,8 @@ class ModelArguments:
default=False,
metadata={"help": "recompute_use_reentrant"},
)
hidden_dropout_prob: float = field(default=0.1, metadata={"help": "The hidden dropout prob."})
attention_probs_dropout_prob: float = field(default=0.1, metadata={"help": "The attention probs dropout prob."})


def create_pretrained_dataset(
@@ -559,7 +568,11 @@ def main():
# Create the learning_rate scheduler and optimizer
if training_args.decay_steps is None:
training_args.decay_steps = training_args.max_steps
warmup_steps = training_args.warmup_ratio * training_args.max_steps

if training_args.warmup_steps > 0:
warmup_steps = training_args.warmup_steps
else:
warmup_steps = training_args.warmup_ratio * training_args.max_steps

lr_scheduler = None
if training_args.lr_scheduler_type.value == "cosine":
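As a self-contained illustration of the new warmup precedence (an explicit --warmup_steps wins over the ratio-derived value): with the settings from run_intermediate_api.sh above, the ratio alone would give 0.01 * 10000 = 100 warmup steps, but the explicit 30 is used instead. The function name below is illustrative, not part of the script.

```python
# Standalone sketch of the precedence logic added above.
def resolve_warmup_steps(warmup_steps: int, warmup_ratio: float, max_steps: int) -> float:
    if warmup_steps > 0:
        return warmup_steps
    return warmup_ratio * max_steps

assert resolve_warmup_steps(30, 0.01, 10000) == 30    # explicit value takes precedence
assert resolve_warmup_steps(0, 0.01, 10000) == 100    # falls back to warmup_ratio * max_steps
```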