WIP Migrate C2H library to top-level.
[skip-docs][skip-vdc][skip-rapids]
alliepiper committed Oct 25, 2024
1 parent ca5dbf9 commit 3ce1468
Showing 124 changed files with 2,308 additions and 1,214 deletions.
39 changes: 39 additions & 0 deletions c2h/CMakeLists.txt
@@ -0,0 +1,39 @@
cmake_minimum_required(VERSION 3.21)

project(C2H LANGUAGES CXX CUDA)

cccl_get_catch2()

find_package(CCCL CONFIG REQUIRED
  NO_DEFAULT_PATH # Only check the explicit HINTS below:
  HINTS "${CCCL_SOURCE_DIR}/lib/cmake/cccl/"
)

find_package(CUDAToolkit)

set(curand_default OFF)
if (CUDA_curand_LIBRARY)
  set(curand_default ON)
endif()

option(C2H_ENABLE_CURAND "Use CUDA CURAND library in c2h." ${curand_default})

add_library(cccl.c2h STATIC generators.cu)
target_include_directories(cccl.c2h PUBLIC "${C2H_SOURCE_DIR}/include")
target_link_libraries(cccl.c2h PUBLIC
  CCCL::CCCL
  Catch2::Catch2
)

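# C2H_HAS_CURAND gates the curand-dependent paths in generators.cu
# (see the "#if C2H_HAS_CURAND" block in that file).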
if (C2H_ENABLE_CURAND)
  target_link_libraries(cccl.c2h PRIVATE CUDA::curand)
  target_compile_definitions(cccl.c2h PRIVATE C2H_HAS_CURAND=1)
else()
  target_compile_definitions(cccl.c2h PRIVATE C2H_HAS_CURAND=0)
endif()

add_library(cccl.c2h.main OBJECT
  catch2_runner.cpp
  catch2_runner_helper.cu
)
target_link_libraries(cccl.c2h.main PUBLIC cccl.c2h)
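
For orientation, cccl.c2h.main supplies the Catch2 main() (via catch2_runner.cpp, which defines CUB_CONFIG_MAIN below), so a test translation unit linked against it only declares test cases. A minimal hypothetical example, not part of this commit:

// example_test.cu -- hypothetical consumer of cccl.c2h.main; Catch2's
// main() comes from the object library, so no CUB_CONFIG_MAIN here.
#include <catch2/catch.hpp>

TEST_CASE("c2h smoke test", "[c2h]")
{
  REQUIRE(1 + 1 == 2);
}

On the CMake side, such a test would link the OBJECT library, e.g. target_link_libraries(my_test PRIVATE cccl.c2h.main), folding the runner's objects into the executable (my_test is an illustrative name).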
2 changes: 1 addition & 1 deletion cub/test/catch2_runner.cpp → c2h/catch2_runner.cpp
@@ -31,4 +31,4 @@
 
 #define CUB_CONFIG_MAIN
 #define CUB_EXCLUDE_CATCH2_HELPER_IMPL
-#include "catch2_main.cuh"
+#include <c2h/catch2_main.cuh>
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion cub/test/c2h/generators.cu → c2h/generators.cu
@@ -47,9 +47,9 @@
 #include <c2h/custom_type.cuh>
 #include <c2h/device_policy.cuh>
 #include <c2h/extended_types.cuh>
-#include <c2h/fill_striped.cuh>
 #include <c2h/generators.cuh>
 #include <c2h/vector.cuh>
+#include <fill_striped.cuh>
 
 #if C2H_HAS_CURAND
 # include <curand.h>
256 changes: 256 additions & 0 deletions c2h/include/c2h/bfloat16.cuh
@@ -0,0 +1,256 @@
/******************************************************************************
 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

/**
 * \file
 * Utilities for interacting with the opaque CUDA __nv_bfloat16 type
 */

#include <cuda_bf16.h>

#include <cub/util_type.cuh>

#include <cuda/std/type_traits>

#include <cstdint>
#include <iosfwd>

#ifdef __GNUC__
// There's a ton of type-punning going on in this file.
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif

/******************************************************************************
 * bfloat16_t
 ******************************************************************************/

/**
 * Host-based bfloat16 data type compatible and convertible with __nv_bfloat16
 */
struct bfloat16_t
{
  uint16_t __x;

  /// Constructor from __nv_bfloat16
  __host__ __device__ __forceinline__ explicit bfloat16_t(const __nv_bfloat16& other)
  {
    __x = reinterpret_cast<const uint16_t&>(other);
  }

  /// Constructor from integer
  __host__ __device__ __forceinline__ explicit bfloat16_t(int a)
  {
    *this = bfloat16_t(float(a));
  }

  /// Constructor from std::size_t
  __host__ __device__ __forceinline__ explicit bfloat16_t(std::size_t a)
  {
    *this = bfloat16_t(float(a));
  }

  /// Constructor from double
  __host__ __device__ __forceinline__ explicit bfloat16_t(double a)
  {
    *this = bfloat16_t(float(a));
  }

  /// Constructor from unsigned long long int
  template <typename T,
            typename = typename ::cuda::std::enable_if<
              ::cuda::std::is_same<T, unsigned long long int>::value
              && (!::cuda::std::is_same<std::size_t, unsigned long long int>::value)>::type>
  __host__ __device__ __forceinline__ explicit bfloat16_t(T a)
  {
    *this = bfloat16_t(float(a));
  }

  /// Default constructor
  bfloat16_t() = default;

  /// Constructor from float
  __host__ __device__ __forceinline__ explicit bfloat16_t(float a)
  {
    // Reference:
    // https://github.com/pytorch/pytorch/blob/44cc873fba5e5ffc4d4d4eef3bd370b653ce1ce1/c10/util/BFloat16.h#L51
    uint16_t ir;
    if (a != a)
    {
      // NaN compares unequal to itself; emit a canonical NaN payload.
      ir = UINT16_C(0x7FFF);
    }
    else
    {
      union
      {
        uint32_t U32;
        float F32;
      };

      F32 = a;
      // Bias the value so that truncating the low 16 bits rounds to
      // nearest, ties to even.
      uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF);
      ir = static_cast<uint16_t>((U32 + rounding_bias) >> 16);
    }
    this->__x = ir;
  }

  /// Cast to __nv_bfloat16
  __host__ __device__ __forceinline__ operator __nv_bfloat16() const
  {
    return reinterpret_cast<const __nv_bfloat16&>(__x);
  }

  /// Cast to float
  __host__ __device__ __forceinline__ operator float() const
  {
    float f = 0;
    uint32_t* p = reinterpret_cast<uint32_t*>(&f);
    *p = uint32_t(__x) << 16;
    return f;
  }

  /// Get raw storage
  __host__ __device__ __forceinline__ uint16_t raw() const
  {
    return this->__x;
  }

  /// Equality
  __host__ __device__ __forceinline__ friend bool operator==(const bfloat16_t& a, const bfloat16_t& b)
  {
    return (a.__x == b.__x);
  }

  /// Inequality
  __host__ __device__ __forceinline__ friend bool operator!=(const bfloat16_t& a, const bfloat16_t& b)
  {
    return (a.__x != b.__x);
  }

  /// Assignment by sum
  __host__ __device__ __forceinline__ bfloat16_t& operator+=(const bfloat16_t& rhs)
  {
    *this = bfloat16_t(float(*this) + float(rhs));
    return *this;
  }

  /// Multiply
  __host__ __device__ __forceinline__ bfloat16_t operator*(const bfloat16_t& other)
  {
    return bfloat16_t(float(*this) * float(other));
  }

  /// Add
  __host__ __device__ __forceinline__ bfloat16_t operator+(const bfloat16_t& other)
  {
    return bfloat16_t(float(*this) + float(other));
  }

  /// Less-than
  __host__ __device__ __forceinline__ bool operator<(const bfloat16_t& other) const
  {
    return float(*this) < float(other);
  }

  /// Less-than-equal
  __host__ __device__ __forceinline__ bool operator<=(const bfloat16_t& other) const
  {
    return float(*this) <= float(other);
  }

  /// Greater-than
  __host__ __device__ __forceinline__ bool operator>(const bfloat16_t& other) const
  {
    return float(*this) > float(other);
  }

  /// Greater-than-equal
  __host__ __device__ __forceinline__ bool operator>=(const bfloat16_t& other) const
  {
    return float(*this) >= float(other);
  }

  /// numeric_traits<bfloat16_t>::max
  __host__ __device__ __forceinline__ static bfloat16_t(max)()
  {
    uint16_t max_word = 0x7F7F;
    return reinterpret_cast<bfloat16_t&>(max_word);
  }

  /// numeric_traits<bfloat16_t>::lowest
  __host__ __device__ __forceinline__ static bfloat16_t lowest()
  {
    uint16_t lowest_word = 0xFF7F;
    return reinterpret_cast<bfloat16_t&>(lowest_word);
  }
};

/******************************************************************************
 * I/O stream overloads
 ******************************************************************************/

/// Insert formatted \p bfloat16_t into the output stream
inline std::ostream& operator<<(std::ostream& out, const bfloat16_t& x)
{
  out << (float) x;
  return out;
}

/// Insert formatted \p __nv_bfloat16 into the output stream
inline std::ostream& operator<<(std::ostream& out, const __nv_bfloat16& x)
{
  return out << bfloat16_t(x);
}

/******************************************************************************
 * Traits overloads
 ******************************************************************************/

template <>
struct CUB_NS_QUALIFIER::FpLimits<bfloat16_t>
{
  static __host__ __device__ __forceinline__ bfloat16_t Max()
  {
    return bfloat16_t::max();
  }

  static __host__ __device__ __forceinline__ bfloat16_t Lowest()
  {
    return bfloat16_t::lowest();
  }
};

template <>
struct CUB_NS_QUALIFIER::NumericTraits<bfloat16_t>
    : CUB_NS_QUALIFIER::BaseTraits<FLOATING_POINT, true, false, unsigned short, bfloat16_t>
{};

#ifdef __GNUC__
# pragma GCC diagnostic pop
#endif
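
A note on the float constructor above: adding 0x7FFF plus the parity of the lowest surviving bit before truncating to the top 16 bits implements round-to-nearest, ties-to-even. Below is a minimal standalone host-only sketch of that conversion; the helper name is ours and the constructor's NaN special case is omitted:

// float_to_bf16.cpp -- standalone illustration of the rounding used in
// bfloat16_t's float constructor; helper name is hypothetical.
#include <cstdint>
#include <cstdio>
#include <cstring>

static std::uint16_t float_to_bf16_bits(float f)
{
  std::uint32_t u;
  std::memcpy(&u, &f, sizeof(u)); // well-defined alternative to the union pun
  // Bias of 0x7FFF, plus 1 when the lowest surviving bit is set, rounds the
  // truncated result to nearest, ties to even. NaN handling omitted.
  const std::uint32_t rounding_bias = ((u >> 16) & 1u) + 0x7FFFu;
  return static_cast<std::uint16_t>((u + rounding_bias) >> 16);
}

int main()
{
  // 1.5f is exactly representable: raw bf16 bits 0x3FC0.
  std::printf("%04X\n", static_cast<unsigned>(float_to_bf16_bits(1.5f)));
  // 1 + 2^-9 lies below half a bf16 ulp above 1.0, so it rounds back to 1.0 (0x3F80).
  std::printf("%04X\n", static_cast<unsigned>(float_to_bf16_bits(1.001953125f)));
  return 0;
}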
@@ -41,7 +41,7 @@
 #include <catch2/catch.hpp>
 
 #if defined(CUB_CONFIG_MAIN)
-# include "catch2_runner_helper.h"
+# include <c2h/catch2_runner_helper.cuh>
 
 # if !defined(CUB_EXCLUDE_CATCH2_HELPER_IMPL)
 # include "catch2_runner_helper.inl"
File renamed without changes.