Commit 2c963e3

Author: rrutmann
Merge remote-tracking branch 'origin/main' into pipeline_parallelism_fix
2 parents: c90950c + 84663cf

24 files changed: +267 -112 lines

config_files/training/config_lorem_ipsum_long_fsdp2.yaml

Lines changed: 31 additions & 18 deletions
@@ -18,29 +18,36 @@ settings:
     checkpointing_interval_in_steps: 32
     evaluation_interval_in_steps: 32
   consistency_enforcement:
-    enforce_tokens_per_step_consistency: true
+    enforce_tokens_per_step_consistency: false
     enforce_last_step_logged: false
     enforce_last_step_evaluated: false
     enforce_last_step_checkpointed: false
   step_profile:
     gradient_accumulation_steps: 1
     local_train_micro_batch_size: 1
     sequence_length: 256
+    dp_degree:
+      instance_key: dp_degree
+      pass_type: BY_REFERENCE
   training_target:
     num_target_tokens:
       component_key: number_conversion
       variant_key: num_tokens_from_packed_mem_map_dataset_continuous
       config:
         dataset_path: ${settings.paths.train_dataset_path}
         sequence_length: ${settings.step_profile.sequence_length}
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree:
+          instance_key: dp_degree
+          pass_type: BY_REFERENCE
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
     num_target_steps: # for the batch progress subscriber
       component_key: number_conversion
       variant_key: num_steps_from_num_tokens
       config:
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree:
+          instance_key: dp_degree
+          pass_type: BY_REFERENCE
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         global_num_tokens: ${settings.training_target.num_target_tokens}
         sequence_length: ${settings.step_profile.sequence_length}

@@ -172,9 +179,18 @@ device_mesh:
   config:
     device_type: cuda
     data_parallel_replicate_degree: 1
-    data_parallel_shard_degree: ${settings.cuda_env.world_size} # i.e., fully sharded
+    data_parallel_shard_degree: -1
     world_size: ${settings.cuda_env.world_size}
 
+dp_degree:
+  component_key: number_conversion
+  variant_key: parallel_degree
+  config: # get the parallel degree from the device mesh
+    device_mesh:
+      instance_key: device_mesh
+      pass_type: BY_REFERENCE
+    parallelism_methods: [dp_shard, dp_replicate]
+
 app_state:
   component_key: app_state
   variant_key: raw

@@ -326,17 +342,14 @@ evaluation_subscriber:
     directory: wandb_storage
     config_file_path: ${settings.config_file_path}
 
-# mfu_calculator:
-#   component_key: mfu_calculator
-#   variant_key: gpt2
-#   config:
-#     n_layer: ${model_raw.config.n_layer}
-#     sequence_length: ${settings.step_profile.sequence_length}
-#     n_embd: ${model_raw.config.n_embd}
-#     world_size: ${settings.cuda_env.world_size}
-#     raw_model:
-#       instance_key: model_raw
-#       pass_type: BY_REFERENCE
-#     wrapped_model:
-#       instance_key: initialized_model
-#       pass_type: BY_REFERENCE
+mfu_calculator:
+  component_key: mfu_calculator
+  variant_key: gpt2
+  config:
+    n_layer: ${model_raw.config.n_layer}
+    sequence_length: ${settings.step_profile.sequence_length}
+    n_embd: ${model_raw.config.n_embd}
+    world_size: ${settings.cuda_env.world_size}
+    wrapped_model:
+      instance_key: initialized_model
+      pass_type: BY_REFERENCE
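Note on the config change above: the `num_steps_from_num_tokens` and `num_tokens_from_packed_mem_map_dataset_continuous` conversions now receive `dp_degree` (resolved from the device mesh) instead of the raw `world_size`. A minimal Python sketch of the step-count arithmetic, assuming the conversion divides the token budget by the tokens consumed per optimizer step; the numbers are illustrative and not taken from this commit:

# Assumed semantics of num_steps_from_num_tokens, with illustrative values.
global_num_tokens = 1_048_576          # hypothetical training target
dp_degree = 8                          # resolved from the device_mesh component at runtime
local_micro_batch_size = 1             # from settings.step_profile
sequence_length = 256
gradient_accumulation_steps = 1

tokens_per_step = dp_degree * local_micro_batch_size * sequence_length * gradient_accumulation_steps
num_target_steps = global_num_tokens // tokens_per_step
print(tokens_per_step, num_target_steps)  # 2048 512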

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 [project]
 name = "modalities"
 version = "0.3.2"
+requires-python = ">=3.10,<3.13"
 description = "Modalities, a PyTorch-native framework for distributed and reproducible foundation model training."
 readme = "README.md"
 dependencies = [

src/modalities/__main__.py

Lines changed: 22 additions & 5 deletions
@@ -112,8 +112,7 @@ def _format_exception_as_json(e: Exception, environment: dict[str, Any]) -> str:
         "hostname": socket.gethostname(),
     }
     error_log_folder = (
-        error_log_folder.parent
-        / f"{error_log_folder.stem}_{environment['hostname']}_{environment['local_rank']}.log"
+        error_log_folder / f"error_logs_{environment['hostname']}_{environment['local_rank']}.log"
     )
     error_log_folder.parent.mkdir(parents=True, exist_ok=True)
     with open(error_log_folder, "w", encoding="utf-8") as f:

@@ -623,6 +622,13 @@ def prepare_sweep_configs(sweep_config_path: Path, output_dir: Path, world_sizes
     required=True,
     help="Path to the root directory of the experiment containing config files.",
 )
+@click.option(
+    "--world_size",
+    type=int,
+    required=False,
+    default=None,
+    help="Number of ranks (world size) to filter the configs for.",
+)
 @click.option(
     "--file_list_path",
     type=click.Path(path_type=Path),

@@ -635,6 +641,12 @@ def prepare_sweep_configs(sweep_config_path: Path, output_dir: Path, world_sizes
     required=True,
     help="Expected number of steps in evaluation_results.jsonl",
 )
+@click.option(
+    "--create_new_folders_if_partially_done",
+    is_flag=True,
+    default=False,
+    help="Create new experiment folders for remaining configs if some runs already exist.",
+)
 @click.option(
     "--skip_exception_types",
     type=str,

@@ -647,6 +659,8 @@ def CMD_entry_point_list_remaining_runs(
     exp_root: Path,
     file_list_path: Path,
     expected_steps: int,
+    create_new_folders_if_partially_done: bool,
+    world_size: int | None = None,
     skip_exception_types: str = "",
 ):
     """

@@ -655,12 +669,15 @@ def CMD_entry_point_list_remaining_runs(
     skip_exception_types_list = skip_exception_types.split(",") if skip_exception_types != "" else []
     file_list_dict = get_updated_sweep_status(
         exp_root=exp_root,
+        world_size=world_size,
         expected_steps=expected_steps,
         skip_exception_types=skip_exception_types_list,
+        create_new_folders_if_partially_done=create_new_folders_if_partially_done,
     )
-    with file_list_path.open("w", encoding="utf-8") as f:
-        for cfg in file_list_dict[SweepSets.UPDATED_CONFIGS.value]:
-            f.write(f"{cfg}\n")
+    if SweepSets.UPDATED_CONFIGS.value in file_list_dict:
+        with file_list_path.open("w", encoding="utf-8") as f:
+            for cfg in file_list_dict[SweepSets.UPDATED_CONFIGS.value]:
+                f.write(f"{cfg}\n")
 
 
 if __name__ == "__main__":

src/modalities/config/config.py

Lines changed: 5 additions & 0 deletions
@@ -483,6 +483,11 @@ class GPT2MFUCalculatorConfig(BaseModel):
     wrapped_model: PydanticFSDP1ModuleType | PydanticFSDP2ModuleType
 
 
+class ParallelDegreeConfig(BaseModel):
+    device_mesh: PydanticDeviceMeshIFType
+    parallelism_methods: list[ParallelismDegrees]
+
+
 def load_app_config_dict(
     config_file_path: Path,
     experiment_id: Optional[str] = None,

src/modalities/config/instantiation_models.py

Lines changed: 2 additions & 9 deletions
@@ -35,13 +35,7 @@ class StepProfile(BaseModel):
     gradient_accumulation_steps: Annotated[int, Field(strict=True, ge=1)]
     local_train_micro_batch_size: Annotated[int, Field(strict=True, ge=1)]
     sequence_length: Annotated[int, Field(strict=True, ge=1)]
-
-
-class MeshDefinition(BaseModel):
-    dp_degree: Annotated[int, Field(strict=True, gt=0)]
-    tp_degree: Annotated[int, Field(strict=True, gt=0)] = 1
-    pp_degree: Annotated[int, Field(strict=True, gt=0)] = 1
-    cp_degree: Annotated[int, Field(strict=True, gt=0)] = 1
+    dp_degree: Annotated[int, Field(strict=True, ge=1)]
 
 
 class ConsistencyEnforcement(BaseModel):

@@ -101,7 +95,6 @@ class DCPWarmstartCheckpointPaths(BaseModel):
     intervals: Intervals
     consistency_enforcement: ConsistencyEnforcement
     step_profile: StepProfile
-    mesh_definition: MeshDefinition
     training_target: TrainingTarget
     training_progress: TrainingProgress
     warmstart_checkpoint_paths: Optional[WarmstartCheckpointPaths | DCPWarmstartCheckpointPaths] = None

@@ -116,7 +109,7 @@ def _check_tokens_per_step_conistency(self) -> "TrainingComponentsInstantiationM
             self.step_profile.local_train_micro_batch_size
             * self.step_profile.sequence_length
             * self.step_profile.gradient_accumulation_steps
-            * self.mesh_definition.dp_degree
+            * self.step_profile.dp_degree
         )
         if required_num_tokens_per_step != step_profile_num_tokens_per_step:
             warning_message = (
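With `dp_degree` folded into `StepProfile`, the consistency check multiplies four values that now live in one place. A worked example with the values from the lorem-ipsum config above and an assumed data-parallel degree of 8 (illustrative numbers, not part of this commit):

# Assumed values illustrating the tokens-per-step product checked above.
local_train_micro_batch_size = 1
sequence_length = 256
gradient_accumulation_steps = 1
dp_degree = 8  # e.g., 8 GPUs, fully sharded, no TP/PP

step_profile_num_tokens_per_step = (
    local_train_micro_batch_size
    * sequence_length
    * gradient_accumulation_steps
    * dp_degree
)
print(step_profile_num_tokens_per_step)  # 2048 tokens processed per optimizer step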

src/modalities/main.py

Lines changed: 6 additions & 6 deletions
@@ -13,7 +13,6 @@
 from modalities.config.config import load_app_config_dict
 from modalities.config.instantiation_models import TrainingComponentsInstantiationModel, TrainingReportGenerator
 from modalities.evaluator import Evaluator
-from modalities.exceptions import RunningEnvError
 from modalities.gym import Gym
 from modalities.logging_broker.message_broker import MessageBroker
 from modalities.logging_broker.messages import MessageTypes, ProgressUpdate

@@ -110,14 +109,14 @@ def run(self, components: TrainingComponentsInstantiationModel):
         if experiment_path.is_dir():
             present_files = list(experiment_path.iterdir())
             if len(present_files) == 1 and expected_config_file_path not in present_files:
-                raise RunningEnvError(
+                logger.warning(
                     f"The experiment folder {experiment_path} is non-empty and "
                     f"contains a file {present_files[0].name} that "
                     f"is not the config file. Please ensure that the config file is the only file present "
-                    "in the experiment folder."
+                    "in the experiment folder to alleviate side-effects."
                 )
             elif len(present_files) > 1:
-                raise RunningEnvError(
+                logger.warning(
                     f"The experiment folder {experiment_path} is non-empty and "
                     f"contains multiple files: {present_files}. "
                     f"Please ensure that the config file is the only file present."

@@ -145,8 +144,9 @@ def run(self, components: TrainingComponentsInstantiationModel):
             components.settings.step_profile.local_train_micro_batch_size
             * components.settings.step_profile.sequence_length
             * components.settings.step_profile.gradient_accumulation_steps
-            * components.settings.mesh_definition.dp_degree
+            * components.settings.step_profile.dp_degree
         )
+
         trainer = Trainer(
             global_rank=components.settings.cuda_env.global_rank,
             progress_publisher=progress_publisher,

@@ -158,7 +158,7 @@ def run(self, components: TrainingComponentsInstantiationModel):
             gradient_acc_steps=components.settings.step_profile.gradient_accumulation_steps,
             gradient_clipper=components.gradient_clipper,
             global_num_tokens_per_train_step=global_num_tokens_per_train_step,
-            mesh_definition=components.settings.mesh_definition,
+            dp_degree=components.settings.step_profile.dp_degree,
             mfu_calculator=components.mfu_calculator,
         )

src/modalities/registry/components.py

Lines changed: 3 additions & 1 deletion
@@ -51,6 +51,7 @@
     OneCycleLRSchedulerConfig,
     PackedMemMapDatasetContinuousConfig,
     PackedMemMapDatasetMegatronConfig,
+    ParallelDegreeConfig,
     PreTrainedHFTokenizerConfig,
     PreTrainedSPTokenizerConfig,
     RawAppStateConfig,

@@ -101,7 +102,7 @@
 )
 from modalities.optimizers.lr_schedulers import DummyLRScheduler
 from modalities.optimizers.optimizer_factory import OptimizerFactory
-from modalities.running_env.fsdp.device_mesh import DeviceMeshConfig, get_device_mesh
+from modalities.running_env.fsdp.device_mesh import DeviceMeshConfig, get_device_mesh, get_parallel_degree
 from modalities.tokenization.tokenizer_wrapper import PreTrainedHFTokenizer, PreTrainedSPTokenizer
 from modalities.training.gradient_clipping.fsdp_gradient_clipper import (
     FSDP1GradientClipper,

@@ -191,6 +192,7 @@ class ComponentEntity:
     ComponentEntity("stages_generator", "gpt2_stages_generator", GPT2LLMStagesGenerator, GPT2LLMStagesGeneratorConfig),
     # Device mesh
     ComponentEntity("device_mesh", "default", get_device_mesh, DeviceMeshConfig),
+    ComponentEntity("number_conversion", "parallel_degree", get_parallel_degree, ParallelDegreeConfig),
     # weight initializers
     ComponentEntity(
         "model_initialization",

src/modalities/running_env/fsdp/device_mesh.py

Lines changed: 15 additions & 12 deletions
@@ -1,4 +1,5 @@
 from enum import Enum
+from math import prod
 from typing import Annotated, Optional
 
 from pydantic import BaseModel, Field, model_validator

@@ -129,24 +130,26 @@ def get_device_mesh(
     return device_mesh
 
 
-def get_num_parallel_ranks(device_mesh: DeviceMesh, parallelism_method: ParallelismDegrees) -> int:
-    """Gets the number of parallel ranks from the device mesh for a specific parallelism method.
-
+def get_parallel_degree(device_mesh: DeviceMesh, parallelism_methods: list[ParallelismDegrees]) -> int:
+    """Gets the number of parallel ranks (i.e., the parallelism degree)
+    from the device mesh for a specific parallelism method.
     Args:
         device_mesh (DeviceMesh): The device mesh.
-        parallelism_method (ParallelismDegrees): The parallelism method.
-
+        parallelism_methods (list[ParallelismDegrees]): The parallelism methods.
     Returns:
         int: The number of parallel ranks for the specified parallelism method.
     """
-    if parallelism_method.value not in device_mesh.mesh_dim_names:
-        return 1
-    else:
-        return device_mesh.size(device_mesh.mesh_dim_names.index(parallelism_method.value))
-
-
+    if device_mesh.mesh_dim_names is None:
+        raise ValueError("device_mesh.mesh_dim_names is None")
+
+    return prod(
+        device_mesh.size(device_mesh.mesh_dim_names.index(method.value))
+        for method in parallelism_methods
+        if method.value in device_mesh.mesh_dim_names
+    )
+
 def get_mesh_for_parallelism_method(device_mesh: DeviceMesh | None, parallelism_method: ParallelismDegrees):
     if device_mesh is not None and parallelism_method.value in device_mesh.mesh_dim_names:
         return device_mesh[parallelism_method.value]
     else:
-        return None
+        return None