
Commit

Update base for Update on "[BE] replace the extra DeviceMesh _flatten with mesh access"


**Summary**
pytorch/pytorch#138945 fixes DeviceMesh access on flattened meshes that are constructed from more than two meshes. Refer to the fix PR for details if interested.

In #592 we avoided this issue by calling `_flatten` instead of accessing the flattened mesh directly. Now that the fix has been merged in PyTorch, we want to switch back to mesh access, which is more straightforward.
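
As a minimal sketch of the two access patterns (the mesh dimension names, the `(2, 2, 2)` shape, and the 8-rank launch are illustrative assumptions, not the exact torchtitan setup; requires a PyTorch build that includes the fix):

```python
# Illustrative only: dim names and shape are hypothetical.
# Run with 8 ranks, e.g. `torchrun --nproc_per_node=8 example.py`.
from torch.distributed.device_mesh import init_device_mesh

mesh = init_device_mesh(
    "cuda", (2, 2, 2), mesh_dim_names=("dp_replicate", "dp_shard", "cp")
)

# Flatten three mesh dims into one; this registers "dp_cp" on the root mesh.
mesh["dp_replicate", "dp_shard", "cp"]._flatten(mesh_dim_name="dp_cp")

# Workaround from #592: call _flatten again wherever the flattened mesh is
# needed (repeated calls hand back the already-created flattened mesh).
dp_cp_mesh = mesh["dp_replicate", "dp_shard", "cp"]._flatten(mesh_dim_name="dp_cp")

# Pattern this stack returns to, now that pytorch/pytorch#138945 is merged:
# slice the flattened mesh by name.
dp_cp_mesh = mesh["dp_cp"]
```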


[ghstack-poisoned]
XilunWu committed Oct 31, 2024
2 parents 53d0f69 + 2a785e9 commit 03d27ce
Showing 2 changed files with 0 additions and 37 deletions.
torchtitan/parallelisms/parallelize_llama.py (7 changes: 0 additions & 7 deletions)
@@ -34,7 +34,6 @@
 from torchtitan.config_manager import JobConfig, TORCH_DTYPE_MAP
 from torchtitan.logging import logger
 from torchtitan.parallelisms.parallel_dims import ParallelDims
-from torchtitan.parallelisms.utils import check_strided_sharding_enabled


 def parallelize_llama(
@@ -330,12 +329,6 @@ def apply_fsdp(
     if cpu_offload:
         fsdp_config["offload_policy"] = CPUOffloadPolicy()

-    # TODO: remove this check once PyTorch 2.5 is released. We can safely assume
-    # that users won't use a nightly build which is older than 20240809 by then.
-    if tp_enabled:
-        # check if strided sharding is enabled, which is necessary for 2D/3D DCP
-        check_strided_sharding_enabled()
-
     for layer_id, transformer_block in model.layers.items():
         if pp_enabled:
             # For PP, do not reshard after forward to avoid per-microbatch
torchtitan/parallelisms/utils.py (30 changes: 0 additions & 30 deletions)

This file was deleted.
