
Commit

Merge branch 'xccl-bak' into xccl-bak2
Chao1Han committed Oct 17, 2024
2 parents edba8aa + a062f9f commit ae90994
Showing 4 changed files with 6 additions and 28 deletions.
4 changes: 2 additions & 2 deletions torch/csrc/distributed/c10d/ProcessGroup.hpp
@@ -51,8 +51,8 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder {
     NCCL = 2,
     UCC = 3,
     MPI = 4,
-    CUSTOM = 5,
-    XCCL = 6,
+    XCCL = 5,
+    CUSTOM = 6,
   };
 
   static std::string backendTypeToString(const BackendType& type) {
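The net effect of this hunk is that XCCL takes over the value formerly held by CUSTOM, and CUSTOM moves to the end of the enum. A minimal Python mirror of the reordered values, for illustration only (this is not the actual torch binding; the UNDEFINED and GLOO entries are assumed from the upstream enum):

from enum import IntEnum

class BackendType(IntEnum):
    # Illustrative mirror of c10d::ProcessGroup::BackendType after this change.
    UNDEFINED = 0  # assumed from upstream
    GLOO = 1       # assumed from upstream
    NCCL = 2
    UCC = 3
    MPI = 4
    XCCL = 5       # was 6; now takes the slot previously held by CUSTOM
    CUSTOM = 6     # was 5; stays the last entry

assert BackendType.XCCL == 5 and BackendType.CUSTOM == 6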
19 changes: 0 additions & 19 deletions torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp
@@ -1,7 +1,6 @@
 #ifdef USE_C10D_XCCL
 
 #include <comm/XPUGuard.h>
-#include <torch/csrc/distributed/c10d/ParamCommsUtils.hpp>
 #include <torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp>
 #include <fstream>
 #include <map>
@@ -254,24 +253,6 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::allreduce(
   auto tensor = tensors.back();
   checkXPUTensor(tensor);
 
-  RECORD_PARAM_COMMS_DATA(
-      // static_cast<int>(
-      //     this->getSequenceNumberForGroup() + 1), // seq + 1 to match collective
-      1,
-      std::make_tuple(pg_uid_, pg_desc_), // PG name tuple
-      tensors, // inputTensors
-      tensors, // outputTensors
-      rank_, // rank
-      "allreduce", // collective name
-      tensor.numel(), // inNelems
-      tensor.numel(), // outNelems
-      tensor.scalar_type(), // dType
-      std::vector<int64_t>(), // inSplitSizes
-      std::vector<int64_t>(), // outSplitSizes
-      0, // globalRankStart
-      1, // globalRankStride
-      this->getSize()); // worldSize
-
   return collective(
       tensor,
       tensor,
3 changes: 2 additions & 1 deletion torch/csrc/distributed/c10d/ProcessGroupXCCL.hpp
@@ -138,6 +138,8 @@ class TORCH_API ProcessGroupXCCL : public Backend {
       std::vector<at::Tensor>& tensors,
       const AllreduceOptions& opts = AllreduceOptions()) override;
 
+  void setSequenceNumberForGroup() override {}
+
  protected:
   std::unordered_map<std::string, at::xpu::XPUStream> xcclStreamsMap_;
   std::unordered_map<std::string, at::xpu::XPUEvent> xcclEventsMap_;
@@ -151,7 +153,6 @@ class TORCH_API ProcessGroupXCCL : public Backend {
   ccl::shared_ptr_class<ccl::kvs> kvs;
 
   ccl::shared_ptr_class<ccl::kvs> get_kvs(int rank, c10d::Store& store) {
-    // todo: why do we need the mutex here?
     std::lock_guard<std::mutex> lock(kvs_mutex);
     if (kvs)
       return kvs;
8 changes: 2 additions & 6 deletions torch/distributed/distributed_c10d.py
@@ -1675,13 +1675,9 @@ def _new_process_group_helper(
             "created, please use a different group name"
         )
 
-    if device_id is not None and (
-        device_id.index is None
-        or (device_id.type != "cuda" and device_id.type != "xpu")
-    ):
+    if device_id is not None and device_id.index is None:
         raise ValueError(
-            "init_process_group device_id parameter must be a cuda device with an "
-            "id, e.g. cuda:0, xpu, not just cuda or xpu or cpu"
+            "init_process_group device_id parameter must be a device with an index"
         )
 
     # Note: _new_process_group_helper is only called from init_process_group, which always provides a timeout value
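For context, the relaxed check accepts any device type as long as it carries an explicit index, whereas the previous check also required the type to be cuda or xpu. A minimal standalone sketch of the new behavior (check_device_id is a hypothetical helper mirroring the validation above, not the library code):

import torch

def check_device_id(device_id):
    # Mirrors the simplified validation: only index-less devices are rejected.
    if device_id is not None and device_id.index is None:
        raise ValueError(
            "init_process_group device_id parameter must be a device with an index"
        )

check_device_id(torch.device("cuda:0"))  # accepted
check_device_id(torch.device("xpu:0"))   # accepted (type no longer restricted)
try:
    check_device_id(torch.device("cuda"))  # no index -> ValueError
except ValueError as err:
    print(err)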
