
Cooperative groups #2307

Draft: MichaelVarvarin wants to merge 21 commits into develop from cooperative-groups

Changes from 9 commits (21 commits total)

Commits
c13f61c
Add CreateTaskCooperativeKernel, grid sync and HelloWorldGridSyncExam…
MichaelVarvarin Jul 1, 2024
ecbdcb1
Add comment about issue with grid sync on CUDA Clang
MichaelVarvarin Jul 26, 2024
dd0681f
Add cooperative kernel launch and grid sync support for HIP
MichaelVarvarin Jul 26, 2024
e92cee1
Add m_cooperativeLaunch device prop and runtime check for CG support …
MichaelVarvarin Jul 29, 2024
e423026
Clean errors in previous commit
MichaelVarvarin Aug 1, 2024
8fcd8ac
Clean formatting
MichaelVarvarin Aug 2, 2024
94f07a9
Add getMaxActiveBlocks to get the maximum allowed block count for lau…
MichaelVarvarin Aug 7, 2024
c6c12fd
Rename maxActiveBlocks trait
MichaelVarvarin Aug 10, 2024
4ad8bae
Fix issues from bad rebase
MichaelVarvarin Aug 12, 2024
a892019
Add cooperative kernel launch, grid sync and getMaxActiveBlocks for A…
MichaelVarvarin Aug 12, 2024
f7efa76
Clean formatting
MichaelVarvarin Aug 13, 2024
d76e397
Correct the comment
MichaelVarvarin Aug 13, 2024
93b704c
Add cooperative kernel launch, grid sync and getMaxActiveBlocks for O…
MichaelVarvarin Aug 20, 2024
d09ee84
Clean formatting
MichaelVarvarin Aug 20, 2024
7fdbb60
Update comments
MichaelVarvarin Aug 20, 2024
47d0a1c
Add include gridSync OMP to alpaka.hpp
MichaelVarvarin Aug 27, 2024
0051222
Add cooperative kernel launch, grid sync and getMaxActiveBlocks for s…
MichaelVarvarin Aug 27, 2024
beee9db
Clean warnings for CPU accelerators
MichaelVarvarin Sep 9, 2024
4db26da
Clean warnings for the HIP accelerator
MichaelVarvarin Sep 9, 2024
7b3e194
Merge branch 'develop' into cooperative-groups
MichaelVarvarin Sep 10, 2024
25b0e22
Merge branch 'alpaka-group:develop' into cooperative-groups
MichaelVarvarin Sep 23, 2024
1 change: 1 addition & 0 deletions example/CMakeLists.txt
@@ -24,6 +24,7 @@ add_subdirectory("counterBasedRng/")
add_subdirectory("heatEquation/")
add_subdirectory("helloWorld/")
add_subdirectory("helloWorldLambda/")
add_subdirectory("helloWorldGridSync/")
add_subdirectory("kernelSpecialization/")
add_subdirectory("ls/")
add_subdirectory("matrixMulWithMdspan/")
47 changes: 47 additions & 0 deletions example/helloWorldGridSync/CMakeLists.txt
@@ -0,0 +1,47 @@
#
# Copyright 2024 Mykhailo Varvarin
# SPDX-License-Identifier: ISC
#

################################################################################
# Required CMake version.

cmake_minimum_required(VERSION 3.22)

set_property(GLOBAL PROPERTY USE_FOLDERS ON)

################################################################################
# Project.

set(_TARGET_NAME helloWorldGridSync)

project(${_TARGET_NAME} LANGUAGES CXX)

#-------------------------------------------------------------------------------
# Find alpaka.

if(NOT TARGET alpaka::alpaka)
option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)

if(alpaka_USE_SOURCE_TREE)
# Don't build the examples recursively
set(alpaka_BUILD_EXAMPLES OFF)
add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
else()
find_package(alpaka REQUIRED)
endif()
endif()

#-------------------------------------------------------------------------------
# Add executable.

alpaka_add_executable(
${_TARGET_NAME}
src/helloWorldGridSync.cpp)
target_link_libraries(
${_TARGET_NAME}
PUBLIC alpaka::alpaka)

set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)

add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
102 changes: 102 additions & 0 deletions example/helloWorldGridSync/src/helloWorldGridSync.cpp
@@ -0,0 +1,102 @@
/* Copyright 2024 Mykhailo Varvarin
* SPDX-License-Identifier: MPL-2.0
*/

#include <alpaka/alpaka.hpp>

#include <cstdint>
#include <iostream>

//! Hello world kernel, utilizing grid synchronization.
//! Prints hello world from a thread, performs a grid sync,
//! and prints the sum of the indices of this thread and the opposite thread (the sums have to be the same).
Review comment (Contributor): [nit] Could you explain what the opposite thread is here?

Reply (Author): The thread that has the same distance from the end of the grid dimension as this one has from the start. So, if the IDs range from 0 to 9, the pairs are 0 and 9, 1 and 8, 2 and 7, and so on. Their sum is constant, so we can check whether the grid sync was performed successfully.

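To illustrate the invariant from the reply, a minimal sketch (illustrative only, not part of this PR; assumes <cassert> and <cstdint>):

// For a 1D grid of n threads, thread i pairs with thread n - 1 - i,
// so every pair sums to the same constant n - 1.
constexpr uint32_t n = 10u;
for(uint32_t i = 0u; i < n; ++i)
    assert(i + (n - 1u - i) == n - 1u); // holds for every i
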
//! Prints an error if the sum is incorrect.
struct HelloWorldKernel
{
template<typename Acc>
ALPAKA_FN_ACC void operator()(Acc const& acc, uint32_t* data) const
{
// Get index of the current thread in the grid and the total number of threads.
uint32_t gridThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
uint32_t gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];

printf("Hello, World from alpaka thread %u!\n", gridThreadIdx);

// Write the index of the thread to array.
data[gridThreadIdx] = gridThreadIdx;

// Perform grid synchronization.
alpaka::syncGridThreads(acc);

// Get the index of the opposite thread.
uint32_t gridThreadIdxOpposite = data[gridThreadExtent - gridThreadIdx - 1];

// Sum them.
uint32_t sum = gridThreadIdx + gridThreadIdxOpposite;

// Get the expected sum.
uint32_t expectedSum = gridThreadExtent - 1;

// Print the result and signify an error if the grid synchronization fails.
printf(
"After grid sync, this thread is %u, thread on the opposite side is %u. Their sum is %u, expected: %u.%s",
gridThreadIdx,
gridThreadIdxOpposite,
sum,
expectedSum,
sum == expectedSum ? "\n" : " ERROR: the sum is incorrect.\n");
}
};

auto main() -> int
{
// Define dimensionality and type of indices to be used in kernels
using Dim = alpaka::DimInt<1>;
using Idx = uint32_t;

// Define alpaka accelerator type, which corresponds to the underlying programming model
using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;

// Select the first device available on a system, for the chosen accelerator
auto const platformAcc = alpaka::Platform<Acc>{};
auto const devAcc = getDevByIdx(platformAcc, 0u);

// Define type for a queue with requested properties: Blocking.
using Queue = alpaka::Queue<Acc, alpaka::Blocking>;
// Create a queue for the device.
auto queue = Queue{devAcc};

// Define kernel execution configuration of blocks,
// threads per block, and elements per thread.
Idx blocksPerGrid = 10;
Idx threadsPerBlock = 1;
Idx elementsPerThread = 1;

using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
auto workDiv = WorkDiv{blocksPerGrid, threadsPerBlock, elementsPerThread};

// Allocate memory on the device.
alpaka::Vec<Dim, Idx> bufferExtent{blocksPerGrid * threadsPerBlock};
auto deviceMemory = alpaka::allocBuf<uint32_t, Idx>(devAcc, bufferExtent);

// Instantiate the kernel object.
HelloWorldKernel helloWorldKernel;

int maxBlocks = alpaka::getMaxActiveBlocks<Acc>(
devAcc,
helloWorldKernel,
threadsPerBlock,
elementsPerThread,
getPtrNative(deviceMemory));
std::cout << "Maximum blocks for the kernel: " << maxBlocks << std::endl;

// Create a task to run the kernel.
// Note the cooperative kernel specification.
// Only cooperative kernels can perform grid synchronization.
auto taskRunKernel
= alpaka::createTaskCooperativeKernel<Acc>(workDiv, helloWorldKernel, getPtrNative(deviceMemory));

// Enqueue the kernel execution task.
alpaka::enqueue(queue, taskRunKernel);
return 0;
}
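
A note on the configuration above: a cooperative launch requires all blocks of the grid to be resident on the device simultaneously, otherwise syncGridThreads() could deadlock, which is why the example queries getMaxActiveBlocks first. A minimal sketch of clamping the requested grid size against that limit (illustrative, not part of this PR; assumes <algorithm> is included):

// Never request more blocks than can be co-resident on the device.
Idx const safeBlocksPerGrid = std::min(blocksPerGrid, static_cast<Idx>(maxBlocks));
auto const safeWorkDiv = WorkDiv{safeBlocksPerGrid, threadsPerBlock, elementsPerThread};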
4 changes: 3 additions & 1 deletion include/alpaka/acc/AccCpuOmp2Blocks.hpp
@@ -152,7 +152,9 @@ namespace alpaka
// m_sharedMemSizeBytes
static_cast<size_t>(AccCpuOmp2Blocks<TDim, TIdx>::staticAllocBytes()),
// m_globalMemSizeBytes
getMemBytes(dev)};
getMemBytes(dev),
// m_cooperativeLaunch
false};
}
};

4 changes: 3 additions & 1 deletion include/alpaka/acc/AccCpuOmp2Threads.hpp
@@ -162,7 +162,9 @@ namespace alpaka
// m_sharedMemSizeBytes
memBytes,
// m_globalMemSizeBytes
memBytes};
memBytes,
// m_cooperativeLaunch
false};
}
};

4 changes: 3 additions & 1 deletion include/alpaka/acc/AccCpuSerial.hpp
@@ -145,7 +145,9 @@ namespace alpaka
// m_sharedMemSizeBytes
static_cast<size_t>(AccCpuSerial<TDim, TIdx>::staticAllocBytes()),
// m_globalMemSizeBytes
getMemBytes(dev)};
getMemBytes(dev),
// m_cooperativeLaunch
false};
}
};

4 changes: 3 additions & 1 deletion include/alpaka/acc/AccCpuTbbBlocks.hpp
@@ -146,7 +146,9 @@ namespace alpaka
// m_sharedMemSizeBytes
static_cast<size_t>(AccCpuTbbBlocks<TDim, TIdx>::staticAllocBytes()),
// m_globalMemSizeBytes
getMemBytes(dev)};
getMemBytes(dev),
// m_cooperativeLaunch
false};
}
};

4 changes: 3 additions & 1 deletion include/alpaka/acc/AccCpuThreads.hpp
@@ -170,7 +170,9 @@ namespace alpaka
// m_sharedMemSizeBytes
memBytes,
// m_globalMemSizeBytes
memBytes};
memBytes,
// m_cooperativeLaunch
false};
}
};

1 change: 1 addition & 0 deletions include/alpaka/acc/AccDevProps.hpp
@@ -30,5 +30,6 @@ namespace alpaka
TIdx m_threadElemCountMax; //!< The maximum number of elements in a thread.
size_t m_sharedMemSizeBytes; //!< The size of shared memory per block
size_t m_globalMemSizeBytes; //!< The size of global memory
bool m_cooperativeLaunch; //!< The support for launch of cooperative kernels
};
} // namespace alpaka
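
With this property in place, host code can check for cooperative-launch support at runtime before enqueuing a cooperative kernel. A minimal sketch using alpaka's existing getAccDevProps accessor (illustrative; Acc and devAcc are assumed to be defined as in the example above):

auto const props = alpaka::getAccDevProps<Acc>(devAcc);
if(!props.m_cooperativeLaunch)
{
    std::cerr << "This device does not support cooperative kernel launch.\n";
    return EXIT_FAILURE;
}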
4 changes: 3 additions & 1 deletion include/alpaka/acc/AccGenericSycl.hpp
@@ -153,7 +153,9 @@ namespace alpaka::trait
// m_sharedMemSizeBytes
device.template get_info<sycl::info::device::local_mem_size>(),
// m_globalMemSizeBytes
getMemBytes(dev)};
getMemBytes(dev),
// m_cooperativeLaunch
false};
}
};

52 changes: 49 additions & 3 deletions include/alpaka/acc/AccGpuUniformCudaHipRt.hpp
@@ -11,6 +11,7 @@
#include "alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp"
#include "alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp"
#include "alpaka/core/DemangleTypeNames.hpp"
#include "alpaka/grid/GridSyncGpuCudaHip.hpp"
#include "alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp"
#include "alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp"
#include "alpaka/intrinsic/IntrinsicUniformCudaHipBuiltIn.hpp"
@@ -40,7 +41,14 @@

namespace alpaka
{
template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
template<
typename TApi,
typename TAcc,
typename TDim,
typename TIdx,
typename TKernelFnObj,
bool TCooperative,
typename... TArgs>
class TaskKernelGpuUniformCudaHipRt;

//! The GPU CUDA accelerator.
@@ -59,6 +67,7 @@ namespace alpaka
, public BlockSharedMemDynUniformCudaHipBuiltIn
, public BlockSharedMemStUniformCudaHipBuiltIn
, public BlockSyncUniformCudaHipBuiltIn
, public GridSyncCudaHipBuiltIn
, public IntrinsicUniformCudaHipBuiltIn
, public MemFenceUniformCudaHipBuiltIn
# ifdef ALPAKA_DISABLE_VENDOR_RNG
@@ -161,6 +170,12 @@ namespace alpaka
TApi::deviceAttributeMaxSharedMemoryPerBlock,
dev.getNativeHandle()));

int cooperativeLaunch = {};
ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
&cooperativeLaunch,
TApi::deviceAttributeCooperativeLaunch,
dev.getNativeHandle()));

return {// m_multiProcessorCount
alpaka::core::clipCast<TIdx>(multiProcessorCount),
// m_gridBlockExtentMax
@@ -184,7 +199,9 @@
// m_sharedMemSizeBytes
static_cast<size_t>(sharedMemSizeBytes),
// m_globalMemSizeBytes
getMemBytes(dev)};
getMemBytes(dev),
// m_cooperativeLaunch
static_cast<bool>(cooperativeLaunch)};

# else
typename TApi::DeviceProp_t properties;
@@ -213,7 +230,9 @@
// m_sharedMemSizeBytes
static_cast<size_t>(properties.sharedMemPerBlock),
// m_globalMemSizeBytes
getMemBytes(dev)};
getMemBytes(dev),
// m_cooperativeLaunch
static_cast<bool>(properties.cooperativeLaunch)};
# endif
}
};
@@ -284,6 +303,33 @@ namespace alpaka
TDim,
TIdx,
TKernelFnObj,
false,
TArgs...>(workDiv, kernelFnObj, std::forward<TArgs>(args)...);
}
};

//! The GPU CUDA accelerator execution cooperative task type trait specialization.
template<
typename TApi,
typename TDim,
typename TIdx,
typename TWorkDiv,
typename TKernelFnObj,
typename... TArgs>
struct CreateTaskCooperativeKernel<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
{
ALPAKA_FN_HOST static auto createTaskCooperativeKernel(
TWorkDiv const& workDiv,
TKernelFnObj const& kernelFnObj,
TArgs&&... args)
{
return TaskKernelGpuUniformCudaHipRt<
TApi,
AccGpuUniformCudaHipRt<TApi, TDim, TIdx>,
TDim,
TIdx,
TKernelFnObj,
true,
TArgs...>(workDiv, kernelFnObj, std::forward<TArgs>(args)...);
}
};
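
Presumably this trait is invoked through a free function alongside the existing createTaskKernel, as used in the example above. A sketch of what such an entry point looks like (illustrative; the namespace qualification follows alpaka's createTaskKernel pattern and may differ in the actual PR):

template<typename TAcc, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
ALPAKA_FN_HOST auto createTaskCooperativeKernel(
    TWorkDiv const& workDiv,
    TKernelFnObj const& kernelFnObj,
    TArgs&&... args)
{
    return trait::CreateTaskCooperativeKernel<TAcc, TWorkDiv, TKernelFnObj, TArgs...>::
        createTaskCooperativeKernel(workDiv, kernelFnObj, std::forward<TArgs>(args)...);
}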
3 changes: 3 additions & 0 deletions include/alpaka/alpaka.hpp
@@ -33,6 +33,9 @@
#include "alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp"
#include "alpaka/atomic/Op.hpp"
#include "alpaka/atomic/Traits.hpp"
// grid
#include "alpaka/grid/GridSyncGpuCudaHip.hpp"
#include "alpaka/grid/Traits.hpp"
// block
// shared
// dynamic
22 changes: 22 additions & 0 deletions include/alpaka/core/ApiCudaRt.hpp
@@ -76,6 +76,7 @@ namespace alpaka
static constexpr DeviceAttr_t deviceAttributeMaxThreadsPerBlock = ::cudaDevAttrMaxThreadsPerBlock;
static constexpr DeviceAttr_t deviceAttributeMultiprocessorCount = ::cudaDevAttrMultiProcessorCount;
static constexpr DeviceAttr_t deviceAttributeWarpSize = ::cudaDevAttrWarpSize;
static constexpr DeviceAttr_t deviceAttributeCooperativeLaunch = ::cudaDevAttrCooperativeLaunch;

static constexpr Limit_t limitPrintfFifoSize = ::cudaLimitPrintfFifoSize;
static constexpr Limit_t limitMallocHeapSize = ::cudaLimitMallocHeapSize;
@@ -253,6 +254,17 @@ namespace alpaka
return ::cudaHostUnregister(ptr);
}

static inline Error_t launchCooperativeKernel(
void const* func,
dim3 gridDim,
dim3 blockDim,
void** args,
size_t sharedMem,
Stream_t stream)
{
return ::cudaLaunchCooperativeKernel(func, gridDim, blockDim, args, sharedMem, stream);
}

Review comment (Contributor) on lines +257 to +267:

Could you change this to be templated on the func argument?

Suggested change:

template<typename TFunc>
static inline Error_t launchCooperativeKernel(
    TFunc func,
    dim3 gridDim,
    dim3 blockDim,
    void** args,
    size_t sharedMem,
    Stream_t stream)
{
    return ::cudaLaunchCooperativeKernel(func, gridDim, blockDim, args, sharedMem, stream);
}

Same for the HIP implementation.
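
For context on the void** args parameter of this API: cudaLaunchCooperativeKernel does not take kernel arguments directly; the caller packs a pointer to each argument into an array. A minimal standalone CUDA sketch of calling the underlying runtime function (illustrative only; myKernel and launchMyKernel are hypothetical, not part of this PR):

__global__ void myKernel(int* data, int n); // hypothetical kernel

cudaError_t launchMyKernel(int* data, int n, cudaStream_t stream)
{
    void* args[] = {&data, &n}; // addresses of the arguments, in declaration order
    dim3 gridDim{10};
    dim3 blockDim{1};
    // All blocks must be able to run concurrently, or this call returns an error.
    return ::cudaLaunchCooperativeKernel(
        reinterpret_cast<void const*>(myKernel), gridDim, blockDim, args, 0 /*sharedMem*/, stream);
}
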

static inline Error_t launchHostFunc(Stream_t stream, HostFn_t fn, void* userData)
{
# if CUDART_VERSION >= 10000
@@ -395,6 +407,16 @@
{
return ::make_cudaExtent(w, h, d);
}

template<class T>
static inline Error_t occupancyMaxActiveBlocksPerMultiprocessor(
int* numBlocks,
T func,
int blockSize,
size_t dynamicSMemSize)
{
return ::cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize, dynamicSMemSize);
}
};

} // namespace alpaka
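
A note on how occupancyMaxActiveBlocksPerMultiprocessor feeds into cooperative launches: the device-wide block limit is the per-multiprocessor occupancy times the multiprocessor count, which is presumably what getMaxActiveBlocks computes from this wrapper. A plain-CUDA sketch (illustrative; myKernel, blockSize, and device are assumed names):

int numBlocksPerSm = 0;
int smCount = 0;
// Occupancy for the given kernel and block size, with no dynamic shared memory.
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, myKernel, blockSize, 0);
cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, device);
// Upper bound on the grid size of a cooperative launch on this device.
int maxCooperativeBlocks = numBlocksPerSm * smCount;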