WIP Migrate C2H library to top-level.
[skip-docs][skip-vdc][skip-rapids]
alliepiper committed Oct 25, 2024
1 parent ca5dbf9 commit 3ce1468
Showing 124 changed files with 2,308 additions and 1,214 deletions.
39 changes: 39 additions & 0 deletions c2h/CMakeLists.txt
@@ -0,0 +1,39 @@
cmake_minimum_required(VERSION 3.21)

project(C2H LANGUAGES CXX CUDA)

cccl_get_catch2()

find_package(CCCL CONFIG REQUIRED
  NO_DEFAULT_PATH # Only check the explicit HINTS below:
  HINTS "${CCCL_SOURCE_DIR}/lib/cmake/cccl/"
)

find_package(CUDAToolkit)

set(curand_default OFF)
if (CUDA_curand_LIBRARY)
  set(curand_default ON)
endif()

option(C2H_ENABLE_CURAND "Use CUDA CURAND library in c2h." ${curand_default})

add_library(cccl.c2h STATIC generators.cu)
target_include_directories(cccl.c2h PUBLIC "${C2H_SOURCE_DIR}/include")
target_link_libraries(cccl.c2h PUBLIC
  CCCL::CCCL
  Catch2::Catch2
)

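# C2H_HAS_CURAND gates the curand-dependent paths in generators.cu
# (see the "#if C2H_HAS_CURAND" block in that file).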
if (C2H_ENABLE_CURAND)
  target_link_libraries(cccl.c2h PRIVATE CUDA::curand)
  target_compile_definitions(cccl.c2h PRIVATE C2H_HAS_CURAND=1)
else()
  target_compile_definitions(cccl.c2h PRIVATE C2H_HAS_CURAND=0)
endif()

add_library(cccl.c2h.main OBJECT
  catch2_runner.cpp
  catch2_runner_helper.cu
)
target_link_libraries(cccl.c2h.main PUBLIC cccl.c2h)
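
For orientation, cccl.c2h.main supplies the Catch2 main() (via catch2_runner.cpp, which defines CUB_CONFIG_MAIN below), so a test translation unit linked against it only declares test cases. A minimal hypothetical example, not part of this commit:

// example_test.cu -- hypothetical consumer of cccl.c2h.main; Catch2's
// main() comes from the object library, so no CUB_CONFIG_MAIN here.
#include <catch2/catch.hpp>

TEST_CASE("c2h smoke test", "[c2h]")
{
  REQUIRE(1 + 1 == 2);
}

On the CMake side, such a test would link the OBJECT library, e.g. target_link_libraries(my_test PRIVATE cccl.c2h.main), folding the runner's objects into the executable (my_test is an illustrative name).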
2 changes: 1 addition & 1 deletion cub/test/catch2_runner.cpp → c2h/catch2_runner.cpp
@@ -31,4 +31,4 @@
 
 #define CUB_CONFIG_MAIN
 #define CUB_EXCLUDE_CATCH2_HELPER_IMPL
-#include "catch2_main.cuh"
+#include <c2h/catch2_main.cuh>
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion cub/test/c2h/generators.cu → c2h/generators.cu
@@ -47,9 +47,9 @@
 #include <c2h/custom_type.cuh>
 #include <c2h/device_policy.cuh>
 #include <c2h/extended_types.cuh>
-#include <c2h/fill_striped.cuh>
 #include <c2h/generators.cuh>
 #include <c2h/vector.cuh>
+#include <fill_striped.cuh>
 
 #if C2H_HAS_CURAND
 # include <curand.h>
256 changes: 256 additions & 0 deletions c2h/include/c2h/bfloat16.cuh
@@ -0,0 +1,256 @@
/******************************************************************************
 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

/**
 * \file
 * Utilities for interacting with the opaque CUDA __nv_bfloat16 type
 */

#include <cuda_bf16.h>

#include <cub/util_type.cuh>

#include <cuda/std/type_traits>

#include <cstdint>
#include <iosfwd>

#ifdef __GNUC__
// There's a ton of type-punning going on in this file.
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif

/******************************************************************************
 * bfloat16_t
 ******************************************************************************/

/**
 * Host-based bfloat16 data type compatible and convertible with __nv_bfloat16
 */
struct bfloat16_t
{
  uint16_t __x;

  /// Constructor from __nv_bfloat16
  __host__ __device__ __forceinline__ explicit bfloat16_t(const __nv_bfloat16& other)
  {
    __x = reinterpret_cast<const uint16_t&>(other);
  }

  /// Constructor from integer
  __host__ __device__ __forceinline__ explicit bfloat16_t(int a)
  {
    *this = bfloat16_t(float(a));
  }

  /// Constructor from std::size_t
  __host__ __device__ __forceinline__ explicit bfloat16_t(std::size_t a)
  {
    *this = bfloat16_t(float(a));
  }

  /// Constructor from double
  __host__ __device__ __forceinline__ explicit bfloat16_t(double a)
  {
    *this = bfloat16_t(float(a));
  }

  /// Constructor from unsigned long long int
  template <typename T,
            typename = typename ::cuda::std::enable_if<
              ::cuda::std::is_same<T, unsigned long long int>::value
              && (!::cuda::std::is_same<std::size_t, unsigned long long int>::value)>::type>
  __host__ __device__ __forceinline__ explicit bfloat16_t(T a)
  {
    *this = bfloat16_t(float(a));
  }

  /// Default constructor
  bfloat16_t() = default;

  /// Constructor from float
  __host__ __device__ __forceinline__ explicit bfloat16_t(float a)
  {
    // Reference:
    // https://github.com/pytorch/pytorch/blob/44cc873fba5e5ffc4d4d4eef3bd370b653ce1ce1/c10/util/BFloat16.h#L51
    uint16_t ir;
    if (a != a)
    {
      // NaN compares unequal to itself; emit a canonical NaN payload.
      ir = UINT16_C(0x7FFF);
    }
    else
    {
      union
      {
        uint32_t U32;
        float F32;
      };

      F32 = a;
      // Bias the value so that truncating the low 16 bits rounds to
      // nearest, ties to even.
      uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF);
      ir = static_cast<uint16_t>((U32 + rounding_bias) >> 16);
    }
    this->__x = ir;
  }

  /// Cast to __nv_bfloat16
  __host__ __device__ __forceinline__ operator __nv_bfloat16() const
  {
    return reinterpret_cast<const __nv_bfloat16&>(__x);
  }

  /// Cast to float
  __host__ __device__ __forceinline__ operator float() const
  {
    float f = 0;
    uint32_t* p = reinterpret_cast<uint32_t*>(&f);
    *p = uint32_t(__x) << 16;
    return f;
  }

  /// Get raw storage
  __host__ __device__ __forceinline__ uint16_t raw() const
  {
    return this->__x;
  }

  /// Equality
  __host__ __device__ __forceinline__ friend bool operator==(const bfloat16_t& a, const bfloat16_t& b)
  {
    return (a.__x == b.__x);
  }

  /// Inequality
  __host__ __device__ __forceinline__ friend bool operator!=(const bfloat16_t& a, const bfloat16_t& b)
  {
    return (a.__x != b.__x);
  }

  /// Assignment by sum
  __host__ __device__ __forceinline__ bfloat16_t& operator+=(const bfloat16_t& rhs)
  {
    *this = bfloat16_t(float(*this) + float(rhs));
    return *this;
  }

  /// Multiply
  __host__ __device__ __forceinline__ bfloat16_t operator*(const bfloat16_t& other)
  {
    return bfloat16_t(float(*this) * float(other));
  }

  /// Add
  __host__ __device__ __forceinline__ bfloat16_t operator+(const bfloat16_t& other)
  {
    return bfloat16_t(float(*this) + float(other));
  }

  /// Less-than
  __host__ __device__ __forceinline__ bool operator<(const bfloat16_t& other) const
  {
    return float(*this) < float(other);
  }

  /// Less-than-equal
  __host__ __device__ __forceinline__ bool operator<=(const bfloat16_t& other) const
  {
    return float(*this) <= float(other);
  }

  /// Greater-than
  __host__ __device__ __forceinline__ bool operator>(const bfloat16_t& other) const
  {
    return float(*this) > float(other);
  }

  /// Greater-than-equal
  __host__ __device__ __forceinline__ bool operator>=(const bfloat16_t& other) const
  {
    return float(*this) >= float(other);
  }

  /// numeric_traits<bfloat16_t>::max
  __host__ __device__ __forceinline__ static bfloat16_t(max)()
  {
    uint16_t max_word = 0x7F7F;
    return reinterpret_cast<bfloat16_t&>(max_word);
  }

  /// numeric_traits<bfloat16_t>::lowest
  __host__ __device__ __forceinline__ static bfloat16_t lowest()
  {
    uint16_t lowest_word = 0xFF7F;
    return reinterpret_cast<bfloat16_t&>(lowest_word);
  }
};

/******************************************************************************
 * I/O stream overloads
 ******************************************************************************/

/// Insert formatted \p bfloat16_t into the output stream
inline std::ostream& operator<<(std::ostream& out, const bfloat16_t& x)
{
  out << (float) x;
  return out;
}

/// Insert formatted \p __nv_bfloat16 into the output stream
inline std::ostream& operator<<(std::ostream& out, const __nv_bfloat16& x)
{
  return out << bfloat16_t(x);
}

/******************************************************************************
 * Traits overloads
 ******************************************************************************/

template <>
struct CUB_NS_QUALIFIER::FpLimits<bfloat16_t>
{
  static __host__ __device__ __forceinline__ bfloat16_t Max()
  {
    return bfloat16_t::max();
  }

  static __host__ __device__ __forceinline__ bfloat16_t Lowest()
  {
    return bfloat16_t::lowest();
  }
};

template <>
struct CUB_NS_QUALIFIER::NumericTraits<bfloat16_t>
    : CUB_NS_QUALIFIER::BaseTraits<FLOATING_POINT, true, false, unsigned short, bfloat16_t>
{};

#ifdef __GNUC__
# pragma GCC diagnostic pop
#endif
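
A note on the float constructor above: adding 0x7FFF plus the parity of the lowest surviving bit before truncating to the top 16 bits implements round-to-nearest, ties-to-even. Below is a minimal standalone host-only sketch of that conversion; the helper name is ours and the constructor's NaN special case is omitted:

// float_to_bf16.cpp -- standalone illustration of the rounding used in
// bfloat16_t's float constructor; helper name is hypothetical.
#include <cstdint>
#include <cstdio>
#include <cstring>

static std::uint16_t float_to_bf16_bits(float f)
{
  std::uint32_t u;
  std::memcpy(&u, &f, sizeof(u)); // well-defined alternative to the union pun
  // Bias of 0x7FFF, plus 1 when the lowest surviving bit is set, rounds the
  // truncated result to nearest, ties to even. NaN handling omitted.
  const std::uint32_t rounding_bias = ((u >> 16) & 1u) + 0x7FFFu;
  return static_cast<std::uint16_t>((u + rounding_bias) >> 16);
}

int main()
{
  // 1.5f is exactly representable: raw bf16 bits 0x3FC0.
  std::printf("%04X\n", static_cast<unsigned>(float_to_bf16_bits(1.5f)));
  // 1 + 2^-9 lies below half a bf16 ulp above 1.0, so it rounds back to 1.0 (0x3F80).
  std::printf("%04X\n", static_cast<unsigned>(float_to_bf16_bits(1.001953125f)));
  return 0;
}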
@@ -41,7 +41,7 @@
 #include <catch2/catch.hpp>
 
 #if defined(CUB_CONFIG_MAIN)
-# include "catch2_runner_helper.h"
+# include <c2h/catch2_runner_helper.cuh>
 
 # if !defined(CUB_EXCLUDE_CATCH2_HELPER_IMPL)
 # include "catch2_runner_helper.inl"
File renamed without changes.