Modalities
diff --git a/config_files/training/config_lorem_ipsum_long_fsdp2.yaml b/config_files/training/config_lorem_ipsum_long_fsdp2.yaml
Lines changed: 31 additions & 18 deletions
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 1 addition & 0 deletions
diff --git a/src/modalities/__main__.py b/src/modalities/__main__.py
Lines changed: 22 additions & 5 deletions
diff --git a/src/modalities/config/config.py b/src/modalities/config/config.py
Lines changed: 5 additions & 0 deletions
diff --git a/src/modalities/config/instantiation_models.py b/src/modalities/config/instantiation_models.py
Lines changed: 2 additions & 9 deletions
diff --git a/src/modalities/main.py b/src/modalities/main.py
Lines changed: 6 additions & 6 deletions
diff --git a/src/modalities/registry/components.py b/src/modalities/registry/components.py
Lines changed: 3 additions & 1 deletion
diff --git a/src/modalities/running_env/fsdp/device_mesh.py b/src/modalities/running_env/fsdp/device_mesh.py
Lines changed: 15 additions & 12 deletions
@@ -18,29 +18,36 @@ settings:
     checkpointing_interval_in_steps: 32
     evaluation_interval_in_steps: 32
   consistency_enforcement:
-    enforce_tokens_per_step_consistency: true
+    enforce_tokens_per_step_consistency: false
     enforce_last_step_logged: false
     enforce_last_step_evaluated: false
     enforce_last_step_checkpointed: false
   step_profile:
     gradient_accumulation_steps: 1
     local_train_micro_batch_size: 1
     sequence_length: 256
+    dp_degree:
+      instance_key: dp_degree
+      pass_type: BY_REFERENCE
   training_target:
     num_target_tokens:
       component_key: number_conversion
       variant_key: num_tokens_from_packed_mem_map_dataset_continuous
       config:
         dataset_path: ${settings.paths.train_dataset_path}
         sequence_length: ${settings.step_profile.sequence_length}
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree:
+          instance_key: dp_degree
+          pass_type: BY_REFERENCE
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
     num_target_steps:  # for the batch progress subscriber
       component_key: number_conversion
       variant_key: num_steps_from_num_tokens
       config:
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree:
+          instance_key: dp_degree
+          pass_type: BY_REFERENCE
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         global_num_tokens: ${settings.training_target.num_target_tokens}
         sequence_length: ${settings.step_profile.sequence_length}
@@ -172,9 +179,18 @@ device_mesh:
   config:
     device_type: cuda
     data_parallel_replicate_degree: 1
-    data_parallel_shard_degree: ${settings.cuda_env.world_size} # i.e., fully sharded
+    data_parallel_shard_degree: -1
     world_size: ${settings.cuda_env.world_size}
 
+dp_degree:
+  component_key: number_conversion
+  variant_key: parallel_degree
+  config: # dp degree = product of the dp_shard and dp_replicate dimension sizes of the device mesh
+    device_mesh:
+      instance_key: device_mesh
+      pass_type: BY_REFERENCE
+    parallelism_methods: [dp_shard, dp_replicate]
+
 app_state:
   component_key: app_state
   variant_key: raw
@@ -326,17 +342,14 @@ evaluation_subscriber:
     directory: wandb_storage
     config_file_path: ${settings.config_file_path}
 
-# mfu_calculator:
-#   component_key: mfu_calculator
-#   variant_key: gpt2
-#   config:
-#     n_layer: ${model_raw.config.n_layer}
-#     sequence_length: ${settings.step_profile.sequence_length}
-#     n_embd: ${model_raw.config.n_embd}
-#     world_size: ${settings.cuda_env.world_size}
-#     raw_model:
-#       instance_key: model_raw
-#       pass_type: BY_REFERENCE
-#     wrapped_model:
-#       instance_key: initialized_model
-#       pass_type: BY_REFERENCE
+mfu_calculator:
+  component_key: mfu_calculator
+  variant_key: gpt2
+  config:
+    n_layer: ${model_raw.config.n_layer}
+    sequence_length: ${settings.step_profile.sequence_length}
+    n_embd: ${model_raw.config.n_embd}
+    world_size: ${settings.cuda_env.world_size}
+    wrapped_model:
+      instance_key: initialized_model
+      pass_type: BY_REFERENCE
@@ -1,6 +1,7 @@
 [project]
 name = "modalities"
 version = "0.3.2"
+requires-python = ">=3.10,<3.13"
 description = "Modalities, a PyTorch-native framework for distributed and reproducible foundation model training."
 readme = "README.md"
 dependencies = [
 
@@ -112,8 +112,7 @@ def _format_exception_as_json(e: Exception, environment: dict[str, Any]) -> str:
                 "hostname": socket.gethostname(),
             }
             error_log_folder = (
-                error_log_folder.parent
-                / f"{error_log_folder.stem}_{environment['hostname']}_{environment['local_rank']}.log"
+                error_log_folder / f"error_logs_{environment['hostname']}_{environment['local_rank']}.log"
             )
             error_log_folder.parent.mkdir(parents=True, exist_ok=True)
             with open(error_log_folder, "w", encoding="utf-8") as f:
@@ -623,6 +622,13 @@ def prepare_sweep_configs(sweep_config_path: Path, output_dir: Path, world_sizes
     required=True,
     help="Path to the root directory of the experiment containing config files.",
 )
+@click.option(
+    "--world_size",
+    type=int,
+    required=False,
+    default=None,
+    help="Number of ranks (world size) to filter the configs for.",
+)
 @click.option(
     "--file_list_path",
     type=click.Path(path_type=Path),
@@ -635,6 +641,12 @@ def prepare_sweep_configs(sweep_config_path: Path, output_dir: Path, world_sizes
     required=True,
     help="Expected number of steps in evaluation_results.jsonl",
 )
+@click.option(
+    "--create_new_folders_if_partially_done",
+    is_flag=True,
+    default=False,
+    help="Create new experiment folders for remaining configs if some runs already exist.",
+)
 @click.option(
     "--skip_exception_types",
     type=str,
@@ -647,6 +659,8 @@ def CMD_entry_point_list_remaining_runs(
     exp_root: Path,
     file_list_path: Path,
     expected_steps: int,
+    create_new_folders_if_partially_done: bool,
+    world_size: int | None = None,
     skip_exception_types: str = "",
 ):
     """
@@ -655,12 +669,15 @@ def CMD_entry_point_list_remaining_runs(
     skip_exception_types_list = skip_exception_types.split(",") if skip_exception_types != "" else []
     file_list_dict = get_updated_sweep_status(
         exp_root=exp_root,
+        world_size=world_size,
         expected_steps=expected_steps,
         skip_exception_types=skip_exception_types_list,
+        create_new_folders_if_partially_done=create_new_folders_if_partially_done,
     )
-    with file_list_path.open("w", encoding="utf-8") as f:
-        for cfg in file_list_dict[SweepSets.UPDATED_CONFIGS.value]:
-            f.write(f"{cfg}\n")
+    if SweepSets.UPDATED_CONFIGS.value in file_list_dict:
+        with file_list_path.open("w", encoding="utf-8") as f:
+            for cfg in file_list_dict[SweepSets.UPDATED_CONFIGS.value]:
+                f.write(f"{cfg}\n")
 
 
 if __name__ == "__main__":
 
@@ -483,6 +483,11 @@ class GPT2MFUCalculatorConfig(BaseModel):
     wrapped_model: PydanticFSDP1ModuleType | PydanticFSDP2ModuleType
 
 
+class ParallelDegreeConfig(BaseModel):
+    device_mesh: PydanticDeviceMeshIFType
+    parallelism_methods: list[ParallelismDegrees]
+
+
 def load_app_config_dict(
     config_file_path: Path,
     experiment_id: Optional[str] = None,
 
@@ -35,13 +35,7 @@ class StepProfile(BaseModel):
     gradient_accumulation_steps: Annotated[int, Field(strict=True, ge=1)]
     local_train_micro_batch_size: Annotated[int, Field(strict=True, ge=1)]
     sequence_length: Annotated[int, Field(strict=True, ge=1)]
-
-
-class MeshDefinition(BaseModel):
-    dp_degree: Annotated[int, Field(strict=True, gt=0)]
-    tp_degree: Annotated[int, Field(strict=True, gt=0)] = 1
-    pp_degree: Annotated[int, Field(strict=True, gt=0)] = 1
-    cp_degree: Annotated[int, Field(strict=True, gt=0)] = 1
+    dp_degree: Annotated[int, Field(strict=True, ge=1)]
 
 
 class ConsistencyEnforcement(BaseModel):
@@ -101,7 +95,6 @@ class DCPWarmstartCheckpointPaths(BaseModel):
         intervals: Intervals
         consistency_enforcement: ConsistencyEnforcement
         step_profile: StepProfile
-        mesh_definition: MeshDefinition
         training_target: TrainingTarget
         training_progress: TrainingProgress
         warmstart_checkpoint_paths: Optional[WarmstartCheckpointPaths | DCPWarmstartCheckpointPaths] = None
@@ -116,7 +109,7 @@ def _check_tokens_per_step_conistency(self) -> "TrainingComponentsInstantiationM
                 self.step_profile.local_train_micro_batch_size
                 * self.step_profile.sequence_length
                 * self.step_profile.gradient_accumulation_steps
-                * self.mesh_definition.dp_degree
+                * self.step_profile.dp_degree
             )
             if required_num_tokens_per_step != step_profile_num_tokens_per_step:
                 warning_message = (
 
@@ -13,7 +13,6 @@
 from modalities.config.config import load_app_config_dict
 from modalities.config.instantiation_models import TrainingComponentsInstantiationModel, TrainingReportGenerator
 from modalities.evaluator import Evaluator
-from modalities.exceptions import RunningEnvError
 from modalities.gym import Gym
 from modalities.logging_broker.message_broker import MessageBroker
 from modalities.logging_broker.messages import MessageTypes, ProgressUpdate
@@ -110,14 +109,14 @@ def run(self, components: TrainingComponentsInstantiationModel):
         if experiment_path.is_dir():
             present_files = list(experiment_path.iterdir())
             if len(present_files) == 1 and expected_config_file_path not in present_files:
-                raise RunningEnvError(
+                logger.warning(
                     f"The experiment folder {experiment_path} is non-empty and "
                     f"contains a file {present_files[0].name} that "
                     f"is not the config file. Please ensure that the config file is the only file present "
-                    "in the experiment folder."
+                    "in the experiment folder to alleviate side-effects."
                 )
             elif len(present_files) > 1:
-                raise RunningEnvError(
+                logger.warning(
                     f"The experiment folder {experiment_path} is non-empty and "
                     f"contains multiple files: {present_files}. "
                     f"Please ensure that the config file is the only file present."
@@ -145,8 +144,9 @@ def run(self, components: TrainingComponentsInstantiationModel):
             components.settings.step_profile.local_train_micro_batch_size
             * components.settings.step_profile.sequence_length
             * components.settings.step_profile.gradient_accumulation_steps
-            * components.settings.mesh_definition.dp_degree
+            * components.settings.step_profile.dp_degree
         )
+
         trainer = Trainer(
             global_rank=components.settings.cuda_env.global_rank,
             progress_publisher=progress_publisher,
@@ -158,7 +158,7 @@ def run(self, components: TrainingComponentsInstantiationModel):
             gradient_acc_steps=components.settings.step_profile.gradient_accumulation_steps,
             gradient_clipper=components.gradient_clipper,
             global_num_tokens_per_train_step=global_num_tokens_per_train_step,
-            mesh_definition=components.settings.mesh_definition,
+            dp_degree=components.settings.step_profile.dp_degree,
             mfu_calculator=components.mfu_calculator,
         )
 
 
@@ -51,6 +51,7 @@
     OneCycleLRSchedulerConfig,
     PackedMemMapDatasetContinuousConfig,
     PackedMemMapDatasetMegatronConfig,
+    ParallelDegreeConfig,
     PreTrainedHFTokenizerConfig,
     PreTrainedSPTokenizerConfig,
     RawAppStateConfig,
@@ -101,7 +102,7 @@
 )
 from modalities.optimizers.lr_schedulers import DummyLRScheduler
 from modalities.optimizers.optimizer_factory import OptimizerFactory
-from modalities.running_env.fsdp.device_mesh import DeviceMeshConfig, get_device_mesh
+from modalities.running_env.fsdp.device_mesh import DeviceMeshConfig, get_device_mesh, get_parallel_degree
 from modalities.tokenization.tokenizer_wrapper import PreTrainedHFTokenizer, PreTrainedSPTokenizer
 from modalities.training.gradient_clipping.fsdp_gradient_clipper import (
     FSDP1GradientClipper,
@@ -191,6 +192,7 @@ class ComponentEntity:
     ComponentEntity("stages_generator", "gpt2_stages_generator", GPT2LLMStagesGenerator, GPT2LLMStagesGeneratorConfig),
     # Device mesh
     ComponentEntity("device_mesh", "default", get_device_mesh, DeviceMeshConfig),
+    ComponentEntity("number_conversion", "parallel_degree", get_parallel_degree, ParallelDegreeConfig),
     # weight initializers
     ComponentEntity(
         "model_initialization",
 
@@ -1,4 +1,5 @@
 from enum import Enum
+from math import prod
 from typing import Annotated, Optional
 
 from pydantic import BaseModel, Field, model_validator
@@ -129,24 +130,26 @@ def get_device_mesh(
     return device_mesh
 
 
-def get_num_parallel_ranks(device_mesh: DeviceMesh, parallelism_method: ParallelismDegrees) -> int:
-    """Gets the number of parallel ranks from the device mesh for a specific parallelism method.
-
+def get_parallel_degree(device_mesh: DeviceMesh, parallelism_methods: list[ParallelismDegrees]) -> int:
+    """Gets the combined parallelism degree, i.e., the product of the device mesh
+    dimension sizes of the given parallelism methods (methods absent from the mesh are skipped).
     Args:
         device_mesh (DeviceMesh): The device mesh.
-        parallelism_method (ParallelismDegrees): The parallelism method.
-
+        parallelism_methods (list[ParallelismDegrees]): The parallelism methods whose degrees are multiplied.
     Returns:
         int: The number of parallel ranks for the specified parallelism method.
     """
-    if parallelism_method.value not in device_mesh.mesh_dim_names:
-        return 1
-    else:
-        return device_mesh.size(device_mesh.mesh_dim_names.index(parallelism_method.value))
-
-
+    if device_mesh.mesh_dim_names is None:
+        raise ValueError("device_mesh.mesh_dim_names is None")
+
+    return prod(
+        device_mesh.size(device_mesh.mesh_dim_names.index(method.value))
+        for method in parallelism_methods
+        if method.value in device_mesh.mesh_dim_names
+    )
+    
 def get_mesh_for_parallelism_method(device_mesh: DeviceMesh | None, parallelism_method: ParallelismDegrees):
     if device_mesh is not None and parallelism_method.value in device_mesh.mesh_dim_names:
         return device_mesh[parallelism_method.value]
     else:
-        return None
+        return None