
Commit f48aa0c

rrutmann committed

Fix: Adapt configs to latest changes
1 parent 693c57d commit f48aa0c

File tree

9 files changed: +99 −17 lines changed

config_files/training/config_example_coca.yaml

Lines changed: 3 additions & 2 deletions
@@ -25,21 +25,22 @@ settings:
     gradient_accumulation_steps: 1
     local_train_micro_batch_size: 1
     sequence_length: 256
+    dp_degree: ${settings.cuda_env.world_size}
   training_target:
     num_target_tokens:
       component_key: number_conversion
       variant_key: num_tokens_from_num_steps
       config:
         num_steps: ${settings.training_target.num_target_steps}
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree: ${settings.cuda_env.world_size}
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         sequence_length: ${settings.step_profile.sequence_length}
         gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
     num_target_steps: # for the batch progress subscriber
       component_key: number_conversion
       variant_key: num_steps_from_num_samples
       config:
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree: ${settings.cuda_env.world_size}
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         global_num_samples: ${settings.coca_example_settings.train_num_samples}
         gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
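
Note: the change in all nine files is the same rename, num_ranks → dp_degree, in the number_conversion configs. The distinction matters once model parallelism enters: only data-parallel ranks consume distinct batches, so the target arithmetic must scale with the data-parallel degree rather than the full world size. A minimal sketch of what num_tokens_from_num_steps plausibly computes (the function signature is an assumption for illustration, not the library's actual API):

def num_tokens_from_num_steps(num_steps: int, dp_degree: int, local_micro_batch_size: int,
                              sequence_length: int, gradient_accumulation_steps: int) -> int:
    # One optimizer step consumes one global batch: each data-parallel rank
    # contributes gradient_accumulation_steps micro-batches of
    # local_micro_batch_size * sequence_length tokens.
    return (num_steps * dp_degree * local_micro_batch_size
            * sequence_length * gradient_accumulation_steps)

The num_steps_from_num_samples variant above presumably follows the same pattern, with samples in place of tokens.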

config_files/training/config_lorem_ipsum_long_fsdp1.yaml

Lines changed: 3 additions & 2 deletions
@@ -26,21 +26,22 @@ settings:
     gradient_accumulation_steps: 2
     local_train_micro_batch_size: 1
     sequence_length: 256
+    dp_degree: ${settings.cuda_env.world_size}
   training_target:
     num_target_tokens:
       component_key: number_conversion
       variant_key: num_tokens_from_packed_mem_map_dataset_continuous
       config:
         dataset_path: ${settings.paths.train_dataset_path}
         sequence_length: ${settings.step_profile.sequence_length}
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree: ${settings.cuda_env.world_size}
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
     num_target_steps: # for the batch progress subscriber
       component_key: number_conversion
       variant_key: num_steps_from_num_tokens
       config:
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree: ${settings.cuda_env.world_size}
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         global_num_tokens: ${settings.training_target.num_target_tokens}
         sequence_length: ${settings.step_profile.sequence_length}
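
The num_steps_from_num_tokens variant used here is plausibly the inverse of the sketch above, under the same assumptions:

def num_steps_from_num_tokens(global_num_tokens: int, dp_degree: int,
                              local_micro_batch_size: int, sequence_length: int,
                              gradient_accumulation_steps: int) -> int:
    # Tokens consumed per optimizer step across all data-parallel ranks.
    tokens_per_step = (dp_degree * local_micro_batch_size
                       * sequence_length * gradient_accumulation_steps)
    return global_num_tokens // tokens_per_step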

config_files/training/config_lorem_ipsum_long_fsdp1_warmstart.yaml

Lines changed: 3 additions & 2 deletions
@@ -26,21 +26,22 @@ settings:
     gradient_accumulation_steps: 2
     local_train_micro_batch_size: 1
     sequence_length: 256
+    dp_degree: ${settings.cuda_env.world_size}
   training_target:
     num_target_tokens:
       component_key: number_conversion
       variant_key: num_tokens_from_packed_mem_map_dataset_continuous
       config:
         dataset_path: ${settings.paths.train_dataset_path}
         sequence_length: ${settings.step_profile.sequence_length}
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree: ${settings.cuda_env.world_size}
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
     num_target_steps: # for the batch progress subscriber
       component_key: number_conversion
       variant_key: num_steps_from_num_tokens
       config:
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree: ${settings.cuda_env.world_size}
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         global_num_tokens: ${settings.training_target.num_target_tokens}
         sequence_length: ${settings.step_profile.sequence_length}
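
For num_tokens_from_packed_mem_map_dataset_continuous, a plausible reading is that the dataset's total token count is rounded down to whole optimizer steps. The sketch below takes that total as an argument instead of reading the packed file, since the on-disk format is not shown in this diff:

def num_tokens_from_packed_dataset(total_tokens: int, sequence_length: int, dp_degree: int,
                                   local_micro_batch_size: int,
                                   gradient_accumulation_steps: int) -> int:
    # Full sequence_length samples available in the packed dataset ...
    samples = total_tokens // sequence_length
    samples_per_step = dp_degree * local_micro_batch_size * gradient_accumulation_steps
    # ... truncated to whole optimizer steps, then converted back to tokens.
    return (samples // samples_per_step) * samples_per_step * sequence_length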

config_files/training/config_lorem_ipsum_long_fsdp2_pp.yaml

Lines changed: 18 additions & 2 deletions
@@ -26,21 +26,28 @@ settings:
     gradient_accumulation_steps: 1
     local_train_micro_batch_size: 4
     sequence_length: 256
+    dp_degree:
+      instance_key: dp_degree
+      pass_type: BY_REFERENCE
   training_target:
     num_target_tokens:
       component_key: number_conversion
       variant_key: num_tokens_from_packed_mem_map_dataset_continuous
       config:
         dataset_path: ${settings.paths.train_dataset_path}
         sequence_length: ${settings.step_profile.sequence_length}
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree:
+          instance_key: dp_degree
+          pass_type: BY_REFERENCE
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
     num_target_steps: # for the batch progress subscriber
       component_key: number_conversion
       variant_key: num_steps_from_num_tokens
       config:
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree:
+          instance_key: dp_degree
+          pass_type: BY_REFERENCE
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         global_num_tokens: ${settings.training_target.num_target_tokens}
         sequence_length: ${settings.step_profile.sequence_length}
@@ -176,6 +183,15 @@ device_mesh:
     data_parallel_shard_degree: -1
     world_size: ${settings.cuda_env.world_size}

+dp_degree:
+  component_key: number_conversion
+  variant_key: parallel_degree
+  config: # get the parallel degree from the device mesh
+    device_mesh:
+      instance_key: device_mesh
+      pass_type: BY_REFERENCE
+    parallelism_methods: [dp_shard, dp_replicate]
+
 app_state:
   component_key: app_state
   variant_key: raw
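
In the FSDP2 configs, dp_degree can no longer be read off settings.cuda_env.world_size, because ranks are also split across pipeline (and possibly tensor) dimensions. A new top-level dp_degree component therefore derives the degree from the device mesh, and the settings and number_conversion blocks pull it in by reference (instance_key / pass_type: BY_REFERENCE). A sketch of what the parallel_degree conversion presumably computes; the dict-based mesh representation is an assumption for illustration:

from math import prod

def parallel_degree(mesh_dims: dict[str, int], parallelism_methods: list[str]) -> int:
    # The combined degree of the requested methods is the product of their
    # mesh sizes, e.g. dp_shard * dp_replicate for the data-parallel degree.
    return prod(mesh_dims[m] for m in parallelism_methods)

# Illustration: 8 ranks split into 4-way sharding and 2-way tensor parallelism.
assert parallel_degree({"dp_shard": 4, "dp_replicate": 1, "tp": 2, "pp": 1},
                       ["dp_shard", "dp_replicate"]) == 4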

config_files/training/config_lorem_ipsum_long_fsdp2_pp_tp.yaml

Lines changed: 15 additions & 1 deletion
@@ -26,13 +26,18 @@ settings:
     gradient_accumulation_steps: 1
     local_train_micro_batch_size: 4
     sequence_length: 16
+    dp_degree:
+      instance_key: dp_degree
+      pass_type: BY_REFERENCE
   training_target:
     num_target_tokens:
       component_key: number_conversion
       variant_key: num_tokens_from_num_steps
       config:
         sequence_length: ${settings.step_profile.sequence_length}
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree:
+          instance_key: dp_degree
+          pass_type: BY_REFERENCE
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
         num_steps: ${settings.training_target.num_target_steps}
@@ -174,6 +179,15 @@ device_mesh:
     data_parallel_shard_degree: -1
     world_size: ${settings.cuda_env.world_size}

+dp_degree:
+  component_key: number_conversion
+  variant_key: parallel_degree
+  config: # get the parallel degree from the device mesh
+    device_mesh:
+      instance_key: device_mesh
+      pass_type: BY_REFERENCE
+    parallelism_methods: [dp_shard, dp_replicate]
+
 app_state:
   component_key: app_state
   variant_key: raw
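
Worked through with assumed numbers for this pp+tp config: with a world size of 8, pipeline degree 2, and tensor degree 2 (illustrative values, not taken from the file), data_parallel_shard_degree: -1 leaves dp_degree = 8 / (2 * 2) = 2, so with the step profile above:

# dp_degree * local_micro_batch_size * sequence_length * gradient_accumulation_steps
tokens_per_step = 2 * 4 * 16 * 1
print(tokens_per_step)  # 128 tokens per optimizer step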

config_files/training/config_lorem_ipsum_long_fsdp2_warmstart.yaml

Lines changed: 18 additions & 2 deletions
@@ -26,21 +26,28 @@ settings:
     gradient_accumulation_steps: 2
     local_train_micro_batch_size: 1
     sequence_length: 256
+    dp_degree:
+      instance_key: dp_degree
+      pass_type: BY_REFERENCE
   training_target:
     num_target_tokens:
       component_key: number_conversion
       variant_key: num_tokens_from_packed_mem_map_dataset_continuous
       config:
         dataset_path: ${settings.paths.train_dataset_path}
         sequence_length: ${settings.step_profile.sequence_length}
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree:
+          instance_key: dp_degree
+          pass_type: BY_REFERENCE
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
     num_target_steps: # for the batch progress subscriber
       component_key: number_conversion
       variant_key: num_steps_from_num_tokens
       config:
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree:
+          instance_key: dp_degree
+          pass_type: BY_REFERENCE
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         global_num_tokens: ${settings.training_target.num_target_tokens}
         sequence_length: ${settings.step_profile.sequence_length}
@@ -200,6 +207,15 @@ device_mesh:
     data_parallel_shard_degree: 4
     world_size: ${settings.cuda_env.world_size}

+dp_degree:
+  component_key: number_conversion
+  variant_key: parallel_degree
+  config: # get the parallel degree from the device mesh
+    device_mesh:
+      instance_key: device_mesh
+      pass_type: BY_REFERENCE
+    parallelism_methods: [dp_shard, dp_replicate]
+
 app_state:
   component_key: app_state
   variant_key: dcp

tests/end2end_tests/gpt2_train_num_steps_7_pp_tp.yaml

Lines changed: 18 additions & 2 deletions
@@ -25,21 +25,28 @@ settings:
     gradient_accumulation_steps: 1
     local_train_micro_batch_size: 2
     sequence_length: 256
+    dp_degree:
+      instance_key: dp_degree
+      pass_type: BY_REFERENCE
   training_target:
     num_target_tokens:
       component_key: number_conversion
       variant_key: num_tokens_from_packed_mem_map_dataset_continuous
       config:
         dataset_path: ${settings.paths.train_dataset_path}
         sequence_length: ${settings.step_profile.sequence_length}
-        num_ranks: 2 # FIXME: adapt to dp_parallel_degree
+        dp_degree:
+          instance_key: dp_degree
+          pass_type: BY_REFERENCE
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
     num_target_steps: # for the batch progress subscriber
       component_key: number_conversion
       variant_key: num_steps_from_num_tokens
       config:
-        num_ranks: 2 # FIXME: adapt to dp_parallel_degree
+        dp_degree:
+          instance_key: dp_degree
+          pass_type: BY_REFERENCE
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         global_num_tokens: ${settings.training_target.num_target_tokens}
         sequence_length: ${settings.step_profile.sequence_length}
@@ -137,6 +144,15 @@ device_mesh:
     data_parallel_shard_degree: -1
     world_size: ${settings.cuda_env.world_size}

+dp_degree:
+  component_key: number_conversion
+  variant_key: parallel_degree
+  config: # get the parallel degree from the device mesh
+    device_mesh:
+      instance_key: device_mesh
+      pass_type: BY_REFERENCE
+    parallelism_methods: [dp_shard, dp_replicate]
+
 app_state:
   component_key: app_state
   variant_key: raw

tutorials/library_usage/config_lorem_ipsum.yaml

Lines changed: 18 additions & 2 deletions
@@ -27,13 +27,18 @@ settings:
     gradient_accumulation_steps: 1
     local_train_micro_batch_size: 2
     sequence_length: 256
+    dp_degree:
+      instance_key: dp_degree
+      pass_type: BY_REFERENCE
   training_target:
     num_target_tokens:
       component_key: number_conversion
       variant_key: num_tokens_from_num_steps
       config:
         num_steps: ${settings.training_target.num_target_steps}
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree:
+          instance_key: dp_degree
+          pass_type: BY_REFERENCE
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         sequence_length: ${settings.step_profile.sequence_length}
         gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
@@ -42,7 +47,9 @@ settings:
       variant_key: num_steps_from_raw_dataset_index
       config:
         raw_index_path: ${settings.paths.index_path}
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree:
+          instance_key: dp_degree
+          pass_type: BY_REFERENCE
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
   training_progress:
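
num_steps_from_raw_dataset_index presumably counts the samples recorded in the raw index file and divides by the global batch size. A sketch under that assumption; treating the index as a pickled list with one entry per sample is a guess about the format:

import pickle

def num_steps_from_raw_dataset_index(raw_index_path: str, dp_degree: int,
                                     local_micro_batch_size: int,
                                     gradient_accumulation_steps: int) -> int:
    with open(raw_index_path, "rb") as f:
        index = pickle.load(f)  # assumed: one entry per raw sample
    samples_per_step = dp_degree * local_micro_batch_size * gradient_accumulation_steps
    return len(index) // samples_per_step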
@@ -228,6 +235,15 @@ device_mesh:
     data_parallel_shard_degree: ${settings.cuda_env.world_size} # i.e., fully sharded
     world_size: ${settings.cuda_env.world_size}

+dp_degree:
+  component_key: number_conversion
+  variant_key: parallel_degree
+  config: # get the parallel degree from the device mesh
+    device_mesh:
+      instance_key: device_mesh
+      pass_type: BY_REFERENCE
+    parallelism_methods: [dp_shard, dp_replicate]
+
 app_state:
   component_key: app_state
   variant_key: raw

tutorials/modalities_in_15_mins/configs/pretraining_config.yaml

Lines changed: 3 additions & 2 deletions
@@ -25,21 +25,22 @@ settings:
     gradient_accumulation_steps: 1
     local_train_micro_batch_size: 64
     sequence_length: 256
+    dp_degree: ${settings.cuda_env.world_size}
   training_target:
     num_target_tokens:
       component_key: number_conversion
       variant_key: num_tokens_from_packed_mem_map_dataset_continuous
       config:
         dataset_path: ${settings.paths.train_dataset_path}
         sequence_length: ${settings.step_profile.sequence_length}
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree: ${settings.cuda_env.world_size}
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
     num_target_steps: # for the batch progress subscriber
       component_key: number_conversion
       variant_key: num_steps_from_num_tokens
       config:
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree: ${settings.cuda_env.world_size}
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         global_num_tokens: ${settings.training_target.num_target_tokens}
         sequence_length: ${settings.step_profile.sequence_length}
