
Cooperative groups #2307

Draft: MichaelVarvarin wants to merge 21 commits into develop from cooperative-groups

Changes from 9 commits (21 commits total)

Commits
c13f61c
Add CreateTaskCooperativeKernel, grid sync and HelloWorldGridSyncExam…
MichaelVarvarin Jul 1, 2024
ecbdcb1
Add comment about issue with grid sync on CUDA Clang
MichaelVarvarin Jul 26, 2024
dd0681f
Add cooperative kernel launch and grid sync support for HIP
MichaelVarvarin Jul 26, 2024
e92cee1
Add m_cooperativeLaunch device prop and runtime check for CG support …
MichaelVarvarin Jul 29, 2024
e423026
Clean errors in previous commit
MichaelVarvarin Aug 1, 2024
8fcd8ac
Clean formatting
MichaelVarvarin Aug 2, 2024
94f07a9
Add getMaxActiveBlocks to get the maximum allowed block count for lau…
MichaelVarvarin Aug 7, 2024
c6c12fd
Rename maxActiveBlocks trait
MichaelVarvarin Aug 10, 2024
4ad8bae
Fix issues from bad rebase
MichaelVarvarin Aug 12, 2024
a892019
Add cooperative kernel launch, grid sync and getMaxActiveBlocks for A…
MichaelVarvarin Aug 12, 2024
f7efa76
Clean formatting
MichaelVarvarin Aug 13, 2024
d76e397
Correct the comment
MichaelVarvarin Aug 13, 2024
93b704c
Add cooperative kernel launch, grid sync and getMaxActiveBlocks for O…
MichaelVarvarin Aug 20, 2024
d09ee84
Clean formatting
MichaelVarvarin Aug 20, 2024
7fdbb60
Update comments
MichaelVarvarin Aug 20, 2024
47d0a1c
Add include gridSync OMP to alpaka.hpp
MichaelVarvarin Aug 27, 2024
0051222
Add cooperative kernel launch, grid sync and getMaxActiveBlocks for s…
MichaelVarvarin Aug 27, 2024
beee9db
Clean warnings for CPU accelerators
MichaelVarvarin Sep 9, 2024
4db26da
Clean warnings for the HIP accelerator
MichaelVarvarin Sep 9, 2024
7b3e194
Merge branch 'develop' into cooperative-groups
MichaelVarvarin Sep 10, 2024
25b0e22
Merge branch 'alpaka-group:develop' into cooperative-groups
MichaelVarvarin Sep 23, 2024
1 change: 1 addition & 0 deletions example/CMakeLists.txt
@@ -24,6 +24,7 @@ add_subdirectory("counterBasedRng/")
add_subdirectory("heatEquation/")
add_subdirectory("helloWorld/")
add_subdirectory("helloWorldLambda/")
add_subdirectory("helloWorldGridSync/")
add_subdirectory("kernelSpecialization/")
add_subdirectory("ls/")
add_subdirectory("matrixMulWithMdspan/")
47 changes: 47 additions & 0 deletions example/helloWorldGridSync/CMakeLists.txt
@@ -0,0 +1,47 @@
#
# Copyright 2024 Mykhailo Varvarin
# SPDX-License-Identifier: ISC
#

################################################################################
# Required CMake version.

cmake_minimum_required(VERSION 3.22)

set_property(GLOBAL PROPERTY USE_FOLDERS ON)

################################################################################
# Project.

set(_TARGET_NAME helloWorldGridSync)

project(${_TARGET_NAME} LANGUAGES CXX)

#-------------------------------------------------------------------------------
# Find alpaka.

if(NOT TARGET alpaka::alpaka)
option(alpaka_USE_SOURCE_TREE "Use alpaka's source tree instead of an alpaka installation" OFF)

if(alpaka_USE_SOURCE_TREE)
# Don't build the examples recursively
set(alpaka_BUILD_EXAMPLES OFF)
add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../.." "${CMAKE_BINARY_DIR}/alpaka")
else()
find_package(alpaka REQUIRED)
endif()
endif()

#-------------------------------------------------------------------------------
# Add executable.

alpaka_add_executable(
${_TARGET_NAME}
src/helloWorldGridSync.cpp)
target_link_libraries(
${_TARGET_NAME}
PUBLIC alpaka::alpaka)

set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)

add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
102 changes: 102 additions & 0 deletions example/helloWorldGridSync/src/helloWorldGridSync.cpp
@@ -0,0 +1,102 @@
/* Copyright 2024 Mykhailo Varvarin
* SPDX-License-Identifier: MPL-2.0
*/

#include <alpaka/alpaka.hpp>

#include <cstdint>
#include <iostream>

//! Hello world kernel, utilizing grid synchronization.
//! Prints hello world from a thread, performs a grid sync,
//! and prints the sum of the indices of this thread and the opposite thread (the sums have to be the same).
Review comment (Contributor): [nit] Could you explain what the opposite thread is here?

Reply (Author): The thread that has the same distance from the end of the grid dimension as this one has from the start. So, if the IDs range from 0 to 9, the pairs are 0 and 9, 1 and 8, 2 and 7, and so on. Their sum is constant, so we can check whether the grid sync was performed successfully.

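To illustrate the invariant from the reply, a minimal sketch (illustrative only, not part of this PR; assumes <cassert> and <cstdint>):

// For a 1D grid of n threads, thread i pairs with thread n - 1 - i,
// so every pair sums to the same constant n - 1.
constexpr uint32_t n = 10u;
for(uint32_t i = 0u; i < n; ++i)
    assert(i + (n - 1u - i) == n - 1u); // holds for every i
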
//! Prints an error if the sum is incorrect.
struct HelloWorldKernel
{
template<typename Acc>
ALPAKA_FN_ACC void operator()(Acc const& acc, uint32_t* data) const
{
// Get index of the current thread in the grid and the total number of threads.
uint32_t gridThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
uint32_t gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];

printf("Hello, World from alpaka thread %u!\n", gridThreadIdx);

// Write the index of the thread to array.
data[gridThreadIdx] = gridThreadIdx;

// Perform grid synchronization.
alpaka::syncGridThreads(acc);

// Get the index of the opposite thread.
uint32_t gridThreadIdxOpposite = data[gridThreadExtent - gridThreadIdx - 1];

// Sum them.
uint32_t sum = gridThreadIdx + gridThreadIdxOpposite;

// Get the expected sum.
uint32_t expectedSum = gridThreadExtent - 1;

// Print the result and signify an error if the grid synchronization fails.
printf(
"After grid sync, this thread is %u, thread on the opposite side is %u. Their sum is %u, expected: %u.%s",
gridThreadIdx,
gridThreadIdxOpposite,
sum,
expectedSum,
sum == expectedSum ? "\n" : " ERROR: the sum is incorrect.\n");
}
};

auto main() -> int
{
// Define dimensionality and type of indices to be used in kernels
using Dim = alpaka::DimInt<1>;
using Idx = uint32_t;

// Define alpaka accelerator type, which corresponds to the underlying programming model
using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;

// Select the first device available on a system, for the chosen accelerator
auto const platformAcc = alpaka::Platform<Acc>{};
auto const devAcc = getDevByIdx(platformAcc, 0u);

// Define type for a queue with requested properties: Blocking.
using Queue = alpaka::Queue<Acc, alpaka::Blocking>;
// Create a queue for the device.
auto queue = Queue{devAcc};

// Define kernel execution configuration of blocks,
// threads per block, and elements per thread.
Idx blocksPerGrid = 10;
Idx threadsPerBlock = 1;
Idx elementsPerThread = 1;

using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
auto workDiv = WorkDiv{blocksPerGrid, threadsPerBlock, elementsPerThread};

// Allocate memory on the device.
alpaka::Vec<Dim, Idx> bufferExtent{blocksPerGrid * threadsPerBlock};
auto deviceMemory = alpaka::allocBuf<uint32_t, Idx>(devAcc, bufferExtent);

// Instantiate the kernel object.
HelloWorldKernel helloWorldKernel;

int maxBlocks = alpaka::getMaxActiveBlocks<Acc>(
devAcc,
helloWorldKernel,
threadsPerBlock,
elementsPerThread,
getPtrNative(deviceMemory));
std::cout << "Maximum blocks for the kernel: " << maxBlocks << std::endl;

// Create a task to run the kernel.
// Note the cooperative kernel specification.
// Only cooperative kernels can perform grid synchronization.
auto taskRunKernel
= alpaka::createTaskCooperativeKernel<Acc>(workDiv, helloWorldKernel, getPtrNative(deviceMemory));

// Enqueue the kernel execution task.
alpaka::enqueue(queue, taskRunKernel);
return 0;
}
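
A note on the configuration above: a cooperative launch requires all blocks of the grid to be resident on the device simultaneously, otherwise syncGridThreads() could deadlock, which is why the example queries getMaxActiveBlocks first. A minimal sketch of clamping the requested grid size against that limit (illustrative, not part of this PR; assumes <algorithm> is included):

// Never request more blocks than can be co-resident on the device.
Idx const safeBlocksPerGrid = std::min(blocksPerGrid, static_cast<Idx>(maxBlocks));
auto const safeWorkDiv = WorkDiv{safeBlocksPerGrid, threadsPerBlock, elementsPerThread};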
4 changes: 3 additions & 1 deletion include/alpaka/acc/AccCpuOmp2Blocks.hpp
@@ -152,7 +152,9 @@ namespace alpaka
// m_sharedMemSizeBytes
static_cast<size_t>(AccCpuOmp2Blocks<TDim, TIdx>::staticAllocBytes()),
// m_globalMemSizeBytes
getMemBytes(dev)};
getMemBytes(dev),
// m_cooperativeLaunch
false};
}
};

4 changes: 3 additions & 1 deletion include/alpaka/acc/AccCpuOmp2Threads.hpp
@@ -162,7 +162,9 @@ namespace alpaka
// m_sharedMemSizeBytes
memBytes,
// m_globalMemSizeBytes
memBytes};
memBytes,
// m_cooperativeLaunch
false};
}
};

4 changes: 3 additions & 1 deletion include/alpaka/acc/AccCpuSerial.hpp
@@ -145,7 +145,9 @@ namespace alpaka
// m_sharedMemSizeBytes
static_cast<size_t>(AccCpuSerial<TDim, TIdx>::staticAllocBytes()),
// m_globalMemSizeBytes
getMemBytes(dev)};
getMemBytes(dev),
// m_cooperativeLaunch
false};
}
};

4 changes: 3 additions & 1 deletion include/alpaka/acc/AccCpuTbbBlocks.hpp
@@ -146,7 +146,9 @@ namespace alpaka
// m_sharedMemSizeBytes
static_cast<size_t>(AccCpuTbbBlocks<TDim, TIdx>::staticAllocBytes()),
// m_globalMemSizeBytes
getMemBytes(dev)};
getMemBytes(dev),
// m_cooperativeLaunch
false};
}
};

4 changes: 3 additions & 1 deletion include/alpaka/acc/AccCpuThreads.hpp
@@ -170,7 +170,9 @@ namespace alpaka
// m_sharedMemSizeBytes
memBytes,
// m_globalMemSizeBytes
memBytes};
memBytes,
// m_cooperativeLaunch
false};
}
};

1 change: 1 addition & 0 deletions include/alpaka/acc/AccDevProps.hpp
@@ -30,5 +30,6 @@ namespace alpaka
TIdx m_threadElemCountMax; //!< The maximum number of elements in a thread.
size_t m_sharedMemSizeBytes; //!< The size of shared memory per block
size_t m_globalMemSizeBytes; //!< The size of global memory
bool m_cooperativeLaunch; //!< The support for launch of cooperative kernels
};
} // namespace alpaka
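
With this property in place, host code can check for cooperative-launch support at runtime before enqueuing a cooperative kernel. A minimal sketch using alpaka's existing getAccDevProps accessor (illustrative; Acc and devAcc are assumed to be defined as in the example above):

auto const props = alpaka::getAccDevProps<Acc>(devAcc);
if(!props.m_cooperativeLaunch)
{
    std::cerr << "This device does not support cooperative kernel launch.\n";
    return EXIT_FAILURE;
}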
4 changes: 3 additions & 1 deletion include/alpaka/acc/AccGenericSycl.hpp
@@ -153,7 +153,9 @@ namespace alpaka::trait
// m_sharedMemSizeBytes
device.template get_info<sycl::info::device::local_mem_size>(),
// m_globalMemSizeBytes
getMemBytes(dev)};
getMemBytes(dev),
// m_cooperativeLaunch
false};
}
};

52 changes: 49 additions & 3 deletions include/alpaka/acc/AccGpuUniformCudaHipRt.hpp
@@ -11,6 +11,7 @@
#include "alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp"
#include "alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp"
#include "alpaka/core/DemangleTypeNames.hpp"
#include "alpaka/grid/GridSyncGpuCudaHip.hpp"
#include "alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp"
#include "alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp"
#include "alpaka/intrinsic/IntrinsicUniformCudaHipBuiltIn.hpp"
@@ -40,7 +41,14 @@

namespace alpaka
{
template<typename TApi, typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
template<
typename TApi,
typename TAcc,
typename TDim,
typename TIdx,
typename TKernelFnObj,
bool TCooperative,
typename... TArgs>
class TaskKernelGpuUniformCudaHipRt;

//! The GPU CUDA accelerator.
@@ -59,6 +67,7 @@ namespace alpaka
, public BlockSharedMemDynUniformCudaHipBuiltIn
, public BlockSharedMemStUniformCudaHipBuiltIn
, public BlockSyncUniformCudaHipBuiltIn
, public GridSyncCudaHipBuiltIn
, public IntrinsicUniformCudaHipBuiltIn
, public MemFenceUniformCudaHipBuiltIn
# ifdef ALPAKA_DISABLE_VENDOR_RNG
@@ -161,6 +170,12 @@ namespace alpaka
TApi::deviceAttributeMaxSharedMemoryPerBlock,
dev.getNativeHandle()));

int cooperativeLaunch = {};
ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
&cooperativeLaunch,
TApi::deviceAttributeCooperativeLaunch,
dev.getNativeHandle()));

return {// m_multiProcessorCount
alpaka::core::clipCast<TIdx>(multiProcessorCount),
// m_gridBlockExtentMax
@@ -184,7 +199,9 @@
// m_sharedMemSizeBytes
static_cast<size_t>(sharedMemSizeBytes),
// m_globalMemSizeBytes
getMemBytes(dev)};
getMemBytes(dev),
// m_cooperativeLaunch
static_cast<bool>(cooperativeLaunch)};

# else
typename TApi::DeviceProp_t properties;
@@ -213,7 +230,9 @@
// m_sharedMemSizeBytes
static_cast<size_t>(properties.sharedMemPerBlock),
// m_globalMemSizeBytes
getMemBytes(dev)};
getMemBytes(dev),
// m_cooperativeLaunch
static_cast<bool>(properties.cooperativeLaunch)};
# endif
}
};
@@ -284,6 +303,33 @@ namespace alpaka
TDim,
TIdx,
TKernelFnObj,
false,
TArgs...>(workDiv, kernelFnObj, std::forward<TArgs>(args)...);
}
};

//! The GPU CUDA accelerator execution cooperative task type trait specialization.
template<
typename TApi,
typename TDim,
typename TIdx,
typename TWorkDiv,
typename TKernelFnObj,
typename... TArgs>
struct CreateTaskCooperativeKernel<AccGpuUniformCudaHipRt<TApi, TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
{
ALPAKA_FN_HOST static auto createTaskCooperativeKernel(
TWorkDiv const& workDiv,
TKernelFnObj const& kernelFnObj,
TArgs&&... args)
{
return TaskKernelGpuUniformCudaHipRt<
TApi,
AccGpuUniformCudaHipRt<TApi, TDim, TIdx>,
TDim,
TIdx,
TKernelFnObj,
true,
TArgs...>(workDiv, kernelFnObj, std::forward<TArgs>(args)...);
}
};
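
Presumably this trait is invoked through a free function alongside the existing createTaskKernel, as used in the example above. A sketch of what such an entry point looks like (illustrative; the namespace qualification follows alpaka's createTaskKernel pattern and may differ in the actual PR):

template<typename TAcc, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
ALPAKA_FN_HOST auto createTaskCooperativeKernel(
    TWorkDiv const& workDiv,
    TKernelFnObj const& kernelFnObj,
    TArgs&&... args)
{
    return trait::CreateTaskCooperativeKernel<TAcc, TWorkDiv, TKernelFnObj, TArgs...>::
        createTaskCooperativeKernel(workDiv, kernelFnObj, std::forward<TArgs>(args)...);
}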
3 changes: 3 additions & 0 deletions include/alpaka/alpaka.hpp
@@ -33,6 +33,9 @@
#include "alpaka/atomic/AtomicUniformCudaHipBuiltIn.hpp"
#include "alpaka/atomic/Op.hpp"
#include "alpaka/atomic/Traits.hpp"
// grid
#include "alpaka/grid/GridSyncGpuCudaHip.hpp"
#include "alpaka/grid/Traits.hpp"
// block
// shared
// dynamic
22 changes: 22 additions & 0 deletions include/alpaka/core/ApiCudaRt.hpp
@@ -76,6 +76,7 @@ namespace alpaka
static constexpr DeviceAttr_t deviceAttributeMaxThreadsPerBlock = ::cudaDevAttrMaxThreadsPerBlock;
static constexpr DeviceAttr_t deviceAttributeMultiprocessorCount = ::cudaDevAttrMultiProcessorCount;
static constexpr DeviceAttr_t deviceAttributeWarpSize = ::cudaDevAttrWarpSize;
static constexpr DeviceAttr_t deviceAttributeCooperativeLaunch = ::cudaDevAttrCooperativeLaunch;

static constexpr Limit_t limitPrintfFifoSize = ::cudaLimitPrintfFifoSize;
static constexpr Limit_t limitMallocHeapSize = ::cudaLimitMallocHeapSize;
@@ -253,6 +254,17 @@ namespace alpaka
return ::cudaHostUnregister(ptr);
}

static inline Error_t launchCooperativeKernel(
void const* func,
dim3 gridDim,
dim3 blockDim,
void** args,
size_t sharedMem,
Stream_t stream)
{
return ::cudaLaunchCooperativeKernel(func, gridDim, blockDim, args, sharedMem, stream);
}

Review comment (Contributor) on lines +257 to +267:

Could you change this to be templated on the func argument?

Suggested change:

template<typename TFunc>
static inline Error_t launchCooperativeKernel(
    TFunc func,
    dim3 gridDim,
    dim3 blockDim,
    void** args,
    size_t sharedMem,
    Stream_t stream)
{
    return ::cudaLaunchCooperativeKernel(func, gridDim, blockDim, args, sharedMem, stream);
}

Same for the HIP implementation.
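
For context on the void** args parameter of this API: cudaLaunchCooperativeKernel does not take kernel arguments directly; the caller packs a pointer to each argument into an array. A minimal standalone CUDA sketch of calling the underlying runtime function (illustrative only; myKernel and launchMyKernel are hypothetical, not part of this PR):

__global__ void myKernel(int* data, int n); // hypothetical kernel

cudaError_t launchMyKernel(int* data, int n, cudaStream_t stream)
{
    void* args[] = {&data, &n}; // addresses of the arguments, in declaration order
    dim3 gridDim{10};
    dim3 blockDim{1};
    // All blocks must be able to run concurrently, or this call returns an error.
    return ::cudaLaunchCooperativeKernel(
        reinterpret_cast<void const*>(myKernel), gridDim, blockDim, args, 0 /*sharedMem*/, stream);
}
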

static inline Error_t launchHostFunc(Stream_t stream, HostFn_t fn, void* userData)
{
# if CUDART_VERSION >= 10000
@@ -395,6 +407,16 @@
{
return ::make_cudaExtent(w, h, d);
}

template<class T>
static inline Error_t occupancyMaxActiveBlocksPerMultiprocessor(
int* numBlocks,
T func,
int blockSize,
size_t dynamicSMemSize)
{
return ::cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize, dynamicSMemSize);
}
};

} // namespace alpaka
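
A note on how occupancyMaxActiveBlocksPerMultiprocessor feeds into cooperative launches: the device-wide block limit is the per-multiprocessor occupancy times the multiprocessor count, which is presumably what getMaxActiveBlocks computes from this wrapper. A plain-CUDA sketch (illustrative; myKernel, blockSize, and device are assumed names):

int numBlocksPerSm = 0;
int smCount = 0;
// Occupancy for the given kernel and block size, with no dynamic shared memory.
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, myKernel, blockSize, 0);
cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, device);
// Upper bound on the grid size of a cooperative launch on this device.
int maxCooperativeBlocks = numBlocksPerSm * smCount;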