Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

internal review #6

Open
wants to merge 33 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
652da01
Xccl process group for Pytorch
Chao1Han Aug 29, 2024
0cb0016
Merge remote-tracking branch 'upstream/main' into xccl-bak
Chao1Han Sep 20, 2024
a71d69a
Align latest
Chao1Han Sep 20, 2024
af6f03c
hidden env
Chao1Han Sep 24, 2024
88bea25
refine findccl code
Chao1Han Sep 29, 2024
f6ea934
Add comments for build xccl
Chao1Han Sep 30, 2024
1226e3b
refine workxccl
Chao1Han Sep 30, 2024
d62e0be
refine timeout
Chao1Han Sep 30, 2024
714de2a
rm head
Chao1Han Sep 30, 2024
0923781
update
Chao1Han Sep 30, 2024
31d092d
minor fix
Chao1Han Oct 9, 2024
cbea299
rm duplicate code and refine cmake
Chao1Han Oct 9, 2024
ef261c6
update cmake
Chao1Han Oct 10, 2024
6c648cd
hidden xccl specific
Chao1Han Sep 24, 2024
e621fe6
fix ci fail
Chao1Han Oct 11, 2024
3f225d9
rm vir fun and modify tensor check
Chao1Han Oct 12, 2024
1138a4a
Merge branch 'xccl-bak' into xccl-bak2
Chao1Han Oct 12, 2024
8e5e78a
refine collective, getcomm
Chao1Han Oct 12, 2024
1267963
accept comments
Chao1Han Oct 12, 2024
3d55b85
rm attr
Chao1Han Oct 12, 2024
f69059a
add default ccl root dir
Chao1Han Oct 12, 2024
bed720c
update
Chao1Han Oct 12, 2024
fd44abe
update
Chao1Han Oct 12, 2024
d12b922
code refine
zhangxiaoli73 Oct 13, 2024
b57e812
minor fix
Chao1Han Oct 14, 2024
5968f0f
update
Chao1Han Oct 15, 2024
edba8aa
update
Chao1Han Oct 16, 2024
56a5e7f
Refine specific code
Chao1Han Oct 17, 2024
a062f9f
accept comments
Chao1Han Oct 17, 2024
ae90994
Merge branch 'xccl-bak' into xccl-bak2
Chao1Han Oct 17, 2024
ab04fc0
rm header and refine profilehead
Chao1Han Oct 17, 2024
4ee49fb
add get_device_count
Chao1Han Oct 17, 2024
1a2c9c2
add backendSupportsSequenceNumbers
Chao1Han Oct 18, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,8 @@ option(USE_NATIVE_ARCH "Use -march=native" OFF)
# Optional accelerator / communication backends. Each cmake_dependent_option
# is only user-configurable while every entry of its condition list holds;
# otherwise it is forced to the trailing fallback value (OFF for all of these).
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
"USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
# XCCL defaults to ON only for XPU builds on non-Apple UNIX hosts.
cmake_dependent_option(USE_XCCL "Use XCCL" ON
"USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL"
Expand Down Expand Up @@ -353,6 +355,8 @@ cmake_dependent_option(USE_C10D_GLOO "USE C10D GLOO" ON
"USE_DISTRIBUTED;USE_GLOO" OFF)
# c10d backend switches: each one defaults to ON only when USE_DISTRIBUTED and
# the matching communication-library option are both enabled, and is forced
# OFF otherwise.
cmake_dependent_option(USE_C10D_NCCL "USE C10D NCCL" ON
"USE_DISTRIBUTED;USE_NCCL" OFF)
cmake_dependent_option(USE_C10D_XCCL "USE C10D XCCL" ON
"USE_DISTRIBUTED;USE_XCCL" OFF)
cmake_dependent_option(USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI"
OFF)
cmake_dependent_option(
Expand Down
4 changes: 4 additions & 0 deletions build_variables.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -703,6 +703,10 @@ libtorch_cuda_sources = libtorch_cuda_core_sources + libtorch_cuda_distributed_s
"torch/csrc/cuda/nccl.cpp",
]

# Extra sources for the XPU distributed (XCCL) process group.
# caffe2/CMakeLists.txt appends this list to the XPU source set when USE_XCCL
# is enabled.
libtorch_xpu_distributed_extra_sources = [
"torch/csrc/distributed/c10d/ProcessGroupXCCL.cpp",
]

torch_cpp_srcs = [
"torch/csrc/api/src/cuda.cpp", # this just forwards stuff, no real CUDA
"torch/csrc/api/src/data/datasets/mnist.cpp",
Expand Down
17 changes: 17 additions & 0 deletions caffe2/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1013,6 +1013,13 @@ elseif(USE_CUDA)
endif()

if(USE_XPU)
# When both the SYCL runtime and the oneCCL runtime are installed on the
# system, a build with USE_XPU=ON, USE_XCCL=ON and USE_C10D_XCCL=ON compiles
# the XCCL backend into libtorch_xpu.
# Set USE_XCCL=OFF manually to leave the XCCL backend out of the build.
if(USE_XCCL)
append_filelist("libtorch_xpu_distributed_extra_sources" Caffe2_XPU_SRCS)
endif()
add_library(torch_xpu ${Caffe2_XPU_SRCS})
torch_compile_options(torch_xpu) # see cmake/public/utils.cmake
target_compile_definitions(torch_xpu PRIVATE USE_XPU)
Expand Down Expand Up @@ -1078,6 +1085,10 @@ if(USE_XPU)
include_directories(SYSTEM ${ATen_XPU_INCLUDE_DIRS})

endif()
# Link torch_xpu against the torch::xccl imported target and define USE_XCCL
# for torch_xpu's own sources only (PRIVATE: not propagated to dependents).
if(USE_XCCL)
target_link_libraries(torch_xpu PRIVATE torch::xccl)
target_compile_definitions(torch_xpu PRIVATE USE_XCCL)
endif()
endif()

if(NOT MSVC AND USE_XNNPACK)
Expand Down Expand Up @@ -1363,6 +1374,12 @@ if(USE_DISTRIBUTED)
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
endif()
endif()
if(USE_XPU AND USE_C10D_XCCL)
  # These oneCCL feature macros are scoped to the single translation unit
  # that implements the XCCL process group.
  set_property(
    SOURCE ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupXCCL.cpp
    PROPERTY COMPILE_DEFINITIONS CCL_ENABLE_ZE CCL_ENABLE_SYCL)
  # PUBLIC: torch_xpu itself and everything linking against it see
  # USE_C10D_XCCL.
  target_compile_definitions(torch_xpu PUBLIC USE_C10D_XCCL)
endif()
if(USE_MPI AND USE_C10D_MPI)
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set_source_files_properties(
Expand Down
1 change: 1 addition & 0 deletions caffe2/core/macros.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
{"USE_CUDNN", "${USE_CUDNN}"}, \
{"CUDNN_VERSION", "${CUDNN_VERSION}"}, \
{"USE_NCCL", "${USE_NCCL}"}, \
{"USE_XCCL", "${USE_XCCL}"}, \
{"USE_MPI", "${USE_MPI}"}, \
{"USE_GFLAGS", "${USE_GFLAGS}"}, \
{"USE_GLOG", "${USE_GLOG}"}, \
Expand Down
18 changes: 18 additions & 0 deletions cmake/Dependencies.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1151,6 +1151,24 @@ if(USE_CUDA)
include_directories(SYSTEM ${CUB_INCLUDE_DIRS})
endif()

# ---[ XCCL
# Validate a USE_XCCL request. XCCL is only kept enabled for XPU builds on
# Linux where oneCCL can actually be located; in every other case the option
# is switched off and the reason is reported, so the configuration summary
# never changes silently.
if(USE_XCCL)
  if(NOT USE_XPU)
    message(WARNING
      "Not using XPU, so disabling USE_XCCL. Suppress this warning with "
      "-DUSE_XCCL=OFF.")
    caffe2_update_option(USE_XCCL OFF)
  elseif(NOT CMAKE_SYSTEM_NAME STREQUAL "Linux")
    message(WARNING "USE_XCCL is currently only supported under Linux.")
    caffe2_update_option(USE_XCCL OFF)
  else()
    # xccl.cmake runs the XCCL find module and, on success, defines the
    # torch::xccl imported target.
    include(${CMAKE_CURRENT_LIST_DIR}/External/xccl.cmake)
    if(NOT XCCL_FOUND)
      # Mirror the sibling branches: tell the user why the backend was dropped
      # instead of turning it off without any diagnostic.
      message(WARNING
        "oneCCL was not found, so disabling USE_XCCL. Suppress this warning "
        "with -DUSE_XCCL=OFF.")
      caffe2_update_option(USE_XCCL OFF)
    endif()
  endif()
endif()

if(USE_DISTRIBUTED AND USE_TENSORPIPE)
if(MSVC)
message(WARNING "Tensorpipe cannot be used on Windows.")
Expand Down
15 changes: 15 additions & 0 deletions cmake/External/xccl.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Defines the imported interface target torch::xccl when oneCCL is available.
# After inclusion, XCCL_FOUND tells the includer whether the target exists.
if(NOT __XCCL_INCLUDED)
  set(__XCCL_INCLUDED TRUE)

  # XCCL_ROOT, XCCL_LIBRARY_DIR, XCCL_INCLUDE_DIR are handled by FindXCCL.cmake.
  # Deliberately not REQUIRED: a missing oneCCL is not a configure error. The
  # includer inspects XCCL_FOUND and turns USE_XCCL off when it is false.
  find_package(XCCL)
  if(XCCL_FOUND AND NOT TARGET torch::xccl)
    add_library(torch::xccl INTERFACE IMPORTED)
    # Quoted so that the (possibly multi-entry) include list is passed through
    # as one property value.
    set_target_properties(torch::xccl PROPERTIES
      INTERFACE_INCLUDE_DIRECTORIES "${XCCL_INCLUDE_DIR}"
      INTERFACE_LINK_LIBRARIES "${XCCL_LIBRARY}")
  endif()
endif()
69 changes: 69 additions & 0 deletions cmake/Modules/FindXCCL.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Locates a oneCCL installation (called XCCL in this build).
#
# The installation root is chosen in this order:
#   1. XCCL_ROOT, if the caller already set it to an existing path.
#   2. /opt/intel/oneapi/ccl/latest, if that path exists.
#   3. The CCL_ROOT environment variable.
#
# This will define the following variables:
#  XCCL_FOUND       : True if the system has the XCCL library.
#  XCCL_INCLUDE_DIR : Include directories needed to use XCCL
#                     (<root>/include and <root>/include/oneapi).
#  XCCL_LIBRARY_DIR : The path to the XCCL library.
#  XCCL_LIBRARY     : XCCL library fullname.
#
# When nothing usable is found the module sets XCCL_FOUND to False, stores the
# reason in XCCL_NOT_FOUND_MESSAGE and returns early, leaving the decision on
# how to react to the caller.

include(FindPackageHandleStandardArgs)

set(_xccl_default_root "/opt/intel/oneapi/ccl/latest")
if(EXISTS "${XCCL_ROOT}")
  # A root supplied by the caller wins over the built-in default.
  message(STATUS "Using OneCCL from XCCL_ROOT: ${XCCL_ROOT}")
elseif(EXISTS "${_xccl_default_root}")
  set(XCCL_ROOT "${_xccl_default_root}")
else()
  message(STATUS "Default OneCCL not found, using current environment OneCCL")
  set(XCCL_ROOT "$ENV{CCL_ROOT}")
endif()
unset(_xccl_default_root)

if("${XCCL_ROOT}" STREQUAL "")
  set(XCCL_FOUND False)
  set(XCCL_REASON_FAILURE "OneCCL library not found!!")
  set(XCCL_NOT_FOUND_MESSAGE "${XCCL_REASON_FAILURE}")
  return()
endif()

# Find include path from binary.
find_file(
  XCCL_INCLUDE_DIR
  NAMES include
  HINTS "${XCCL_ROOT}"
  NO_DEFAULT_PATH
)

# Find include/oneapi path from include path.
find_file(
  XCCL_INCLUDE_ONEAPI_DIR
  NAMES oneapi
  HINTS "${XCCL_ROOT}/include/"
  NO_DEFAULT_PATH
)

# Find library directory from binary.
find_file(
  XCCL_LIBRARY_DIR
  NAMES lib
  HINTS "${XCCL_ROOT}"
  NO_DEFAULT_PATH
)

# Find XCCL library fullname.
find_library(
  XCCL_LIBRARY
  NAMES ccl
  HINTS "${XCCL_LIBRARY_DIR}"
  NO_DEFAULT_PATH
)

# Every piece is checked on its own, including the oneapi sub-directory, so a
# partial installation is rejected explicitly rather than through a -NOTFOUND
# sentinel that happened to be appended to the include list.
if((NOT XCCL_INCLUDE_DIR) OR (NOT XCCL_INCLUDE_ONEAPI_DIR)
    OR (NOT XCCL_LIBRARY_DIR) OR (NOT XCCL_LIBRARY))
  set(XCCL_FOUND False)
  set(XCCL_REASON_FAILURE "OneCCL library not found!!")
  set(XCCL_NOT_FOUND_MESSAGE "${XCCL_REASON_FAILURE}")
  return()
endif()

# Expose both <root>/include and <root>/include/oneapi to consumers.
# De-duplicate so that running the module again in the same scope does not
# grow the list.
list(APPEND XCCL_INCLUDE_DIR "${XCCL_INCLUDE_ONEAPI_DIR}")
list(REMOVE_DUPLICATES XCCL_INCLUDE_DIR)

find_package_handle_standard_args(
  XCCL
  FOUND_VAR XCCL_FOUND
  REQUIRED_VARS XCCL_INCLUDE_DIR XCCL_LIBRARY_DIR XCCL_LIBRARY
  REASON_FAILURE_MESSAGE "${XCCL_REASON_FAILURE}"
)
6 changes: 6 additions & 0 deletions cmake/Summary.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,12 @@ function(caffe2_print_configuration_summary)
message(STATUS " USE_SYSTEM_UCC : ${USE_SYSTEM_UCC}")
endif()
message(STATUS " USE_ITT : ${USE_ITT}")
# XCCL status. The c10d switch and the discovered include path / library are
# printed only when XCCL is enabled.
message(STATUS " USE_XCCL : ${USE_XCCL}")
if(${USE_XCCL})
message(STATUS " USE_C10D_XCCL : ${USE_C10D_XCCL}")
message(STATUS " XCCL include path : ${XCCL_INCLUDE_DIR}")
message(STATUS " XCCL library : ${XCCL_LIBRARY}")
endif()
message(STATUS " USE_NCCL : ${USE_NCCL}")
if(${USE_NCCL})
message(STATUS " USE_SYSTEM_NCCL : ${USE_SYSTEM_NCCL}")
Expand Down
4 changes: 4 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,6 +645,10 @@ def run(self):
report("-- Building NCCL library")
else:
report("-- Not using NCCL")
if cmake_cache_vars["USE_XCCL"]:
report("-- Building XCCL library")
else:
report("-- Not using XCCL")
if cmake_cache_vars["USE_DISTRIBUTED"]:
if IS_WINDOWS:
report("-- Building without distributed package")
Expand Down
12 changes: 10 additions & 2 deletions test/distributed/test_c10d_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,13 @@ def gpus_for_rank(world_size):
On a single node, all visible GPUs are evenly
divided to subsets, each process only uses a subset.
"""
visible_devices = list(range(torch.cuda.device_count()))
gpus_per_process = torch.cuda.device_count() // world_size
device_count = (
torch.xpu.device_count()
if torch.xpu.is_available()
else torch.cuda.device_count()
)
visible_devices = list(range(device_count))
gpus_per_process = device_count // world_size
gpus_for_rank = []
for rank in range(world_size):
gpus_for_rank.append(
Expand Down Expand Up @@ -1831,6 +1836,9 @@ def test_init_process_group_for_all_backends(self):
elif backend == dist.Backend.UCC:
if not dist.is_ucc_available():
continue
elif backend == dist.Backend.XCCL:
if not dist.is_xccl_available():
continue
# Multi-threaded PG is defined as a pure python class.
# Its pg.name() does not going through Pybind, so its backend name
# is still "threaded" instead of "custom".
Expand Down
Loading
Loading