22 commits
f789ca3
feat(dsv3): trainer part migration
hushenwei2000 Aug 25, 2025
608bde9
add pre_train entrance
chen2016013 Aug 26, 2025
dc4f910
change deepseekv2 model
chen2016013 Aug 26, 2025
a040746
change deepseekv2 model 2
chen2016013 Aug 26, 2025
d57a21b
feat(dsv3): download support bos
hushenwei2000 Aug 27, 2025
626ef5a
Merge branch 'develop' into merge_dsv3
hushenwei2000 Aug 27, 2025
11318b9
feat(dsv3): download support bos
hushenwei2000 Aug 27, 2025
1ed58e9
change deepseekv2 model 3
chen2016013 Aug 27, 2025
92d409a
Update __init__.py
chen2016013 Aug 27, 2025
be2b827
Update config.json
chen2016013 Aug 27, 2025
35b3693
Update train_gpu.sh
chen2016013 Aug 27, 2025
6a72aed
feat(dsv3): fix code
hushenwei2000 Aug 27, 2025
edb5cd1
Merge commit 'refs/pull/2486/head' of https://github.com/PaddlePaddle…
chen2016013 Aug 27, 2025
bee704f
Merge commit 'refs/pull/2487/head' of https://github.com/PaddlePaddle…
chen2016013 Aug 27, 2025
c6e19b4
Merge commit 'refs/pull/2483/head' of https://github.com/PaddlePaddle…
chen2016013 Aug 27, 2025
0459f1d
merge all changes
chen2016013 Aug 27, 2025
101d592
fix bug
chen2016013 Aug 28, 2025
7b65ab6
Update paddleformers/transformers/deepseek_v2/modeling_fast.py
chen2016013 Aug 28, 2025
84cf440
Update paddleformers/examples/deepseek_v3/run_pretrain.py
chen2016013 Aug 28, 2025
4f60f2d
fix bug
chen2016013 Aug 28, 2025
5e6075a
Merge branch 'test0827_merge' of https://github.com/chen2016013/Paddl…
chen2016013 Aug 28, 2025
f87fe03
Update train_gpu.sh
chen2016013 Aug 28, 2025
76 changes: 76 additions & 0 deletions paddleformers/examples/deepseek_v3/config/config.json
@@ -0,0 +1,76 @@
{
"architectures": [
"DeepseekV3ForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"auto_map": {
"AutoConfig": "configuration_deepseek.DeepseekV3Config",
"AutoModel": "modeling_deepseek.DeepseekV3Model",
"AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
},
"aux_loss_alpha": 0.001,
"bos_token_id": 0,
"eos_token_id": 1,
"ep_size": 1,
"first_k_dense_replace": 3,
"hidden_act": "silu",
"hidden_size": 7168,
"initializer_range": 0.02,
"intermediate_size": 18432,
"kv_lora_rank": 512,
"max_position_embeddings": 163840,
"model_type": "deepseek_v3",
"moe_intermediate_size": 2048,
"moe_layer_freq": 1,
"n_group": 8,
"n_routed_experts": 8,
"n_shared_experts": 1,
"norm_topk_prob": true,
"num_attention_heads": 128,
"num_experts_per_tok": 8,
"num_hidden_layers": 15,
"num_key_value_heads": 128,
"num_nextn_predict_layers": 1,
"pretraining_tp": 1,
"q_lora_rank": 1536,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"rms_norm_eps": 1e-06,
"rope_scaling": {
"beta_fast": 32,
"beta_slow": 1,
"factor": 40,
"mscale": 1.0,
"mscale_all_dim": 1.0,
"original_max_position_embeddings": 4096,
"type": "yarn"
},
"rope_theta": 10000,
"routed_scaling_factor": 2.5,
"scoring_func": "sigmoid",
"seq_aux": true,
"tie_word_embeddings": false,
"topk_group": 4,
"topk_method": "noaux_tc",
"dtype": "bfloat16",
"transformers_version": "4.33.1",
"use_cache": true,
"v_head_dim": 128,
"vocab_size": 129280,
"using_flex_token": true,
"using_fake_gate": true,
"use_fused_rms_norm": true,
"fuse_attention_ffn": true,
"use_fused_rope": true,
"token_drop_steps": 0,
"recompute_fwd_gate_up": true,
"adaptive_remained_O1_recompute_ratio": 0.3,
"using_post_norm_recompute": true,
"is_split_group_gemm": false,
"use_dualpipev": true,
"send_mtp_embed": true,
"offline_quant_expert_weight": false,
"clear_origin_weight_when_offline_quant": false
}
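For orientation: this is a scaled-down debug config (15 hidden layers, 8 routed experts) rather than the full DeepSeek-V3 geometry (61 layers, 256 routed experts), while the MLA head dims and YaRN scaling keep their production values. A minimal sanity check of the arithmetic, plain Python run from the repo root (the script is illustrative, not part of the PR):

import json

with open("paddleformers/examples/deepseek_v3/config/config.json") as f:
    cfg = json.load(f)

# MLA: each query/key head concatenates a non-positional part and a RoPE part.
qk_head_dim = cfg["qk_nope_head_dim"] + cfg["qk_rope_head_dim"]
assert qk_head_dim == 192  # 128 + 64

# YaRN stretches the original 4096-token window by `factor`.
rs = cfg["rope_scaling"]
assert rs["original_max_position_embeddings"] * rs["factor"] == cfg["max_position_embeddings"]  # 4096 * 40 = 163840

print(f"q/k head dim {qk_head_dim}, v head dim {cfg['v_head_dim']}")
print(f"{cfg['n_routed_experts']} routed experts, {cfg['num_experts_per_tok']} active per token")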

53 changes: 53 additions & 0 deletions paddleformers/examples/deepseek_v3/config/pretrain_argument.json
@@ -0,0 +1,53 @@
{
"model_name_or_path": "./config/",
"tokenizer_name_or_path": "deepseek-ai/DeepSeek-V3",
"input_dir": "./data",
"output_dir": "./checkpoints/pretrain_ckpts",
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 24,
"per_device_eval_batch_size": 1,
"tensor_parallel_degree": 1,
"pipeline_parallel_degree": 4,
"pipeline_parallel_config": "use_dualpipev",
"sharding_parallel_degree": 2,
"sharding_parallel_config": "split_param",
"sharding_comm_buffer_size_MB": 2048,
"expert_parallel_degree": 2,
"sharding": "stage1",
"virtual_pp_degree": 1,
"sequence_parallel": 0,
"use_flash_attention": true,
"max_seq_length": 4097,
"learning_rate": 3e-05,
"min_learning_rate": 3e-06,
"warmup_steps": 30,
"logging_steps": 1,
"max_steps": 200,
"save_steps": 5000,
"eval_steps": 1000,
"weight_decay": 0.01,
"bf16": true,
"fp16_opt_level": "O2",
"warmup_ratio": 0.01,
"max_grad_norm": 1.0,
"amp_master_grad": 1,
"dataloader_num_workers": 8,
"continue_training": 0,
"do_train": true,
"do_eval": true,
"do_predict": false,
"disable_tqdm": true,
"recompute": false,
"distributed_dataloader": 1,
"unified_checkpoint": true,
"save_total_limit": 2,
"skip_profile_timer": false,
"use_fused_rms_norm": true,
"fuse_attention_ffn": true,
"use_fused_rope": true,
"save_sharded_model": false,
"load_sharded_model": false,
"use_expert_parallel": true,
"unified_checkpoint_config": "skip_save_model_weight",
"offload_optim": true
}
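The distributed layout these arguments imply: tensor_parallel_degree x pipeline_parallel_degree x sharding_parallel_degree = 1 x 4 x 2 = 8 ranks, with stage1 sharding (split_param) doubling as the data-parallel axis, so each step consumes 1 x 24 x 2 = 48 sequences of 4097 tokens. A small sketch of that arithmetic (plain Python; treating the sharding degree as the data-parallel degree is my reading of this config, not something the PR states):

import json

with open("paddleformers/examples/deepseek_v3/config/pretrain_argument.json") as f:
    args = json.load(f)

tp = args["tensor_parallel_degree"]    # 1
pp = args["pipeline_parallel_degree"]  # 4
sd = args["sharding_parallel_degree"]  # 2, stage1 sharding acting as data parallel

gpus = tp * pp * sd
global_batch = args["per_device_train_batch_size"] * args["gradient_accumulation_steps"] * sd

print(f"ranks needed: {gpus}")  # 8
print(f"global batch: {global_batch} sequences of {args['max_seq_length']} tokens")  # 48 x 4097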
21 changes: 21 additions & 0 deletions paddleformers/examples/deepseek_v3/run.sh
@@ -0,0 +1,21 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Download the llama model data
# mkdir -p data
# wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin
# wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx

rm -rf output
nohup sh script/train_gpu.sh config/pretrain_argument.json > run.log 2>&1 &
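run.sh assumes the .bin/.idx pair from the commented wget lines above already sits under ./data (the input_dir in pretrain_argument.json). A hypothetical pre-flight check before launching (plain Python; the file names are taken from the wget lines, the check itself is not part of the PR):

import os

for name in ("llama_openwebtext_100k.bin", "llama_openwebtext_100k.idx"):
    path = os.path.join("./data", name)
    assert os.path.exists(path), f"missing {path}; see the commented wget lines in run.sh"
print("pretraining data found; safe to launch train_gpu.sh")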