diff --git a/cpp/benchmarks/decimal/convert_floating.cpp b/cpp/benchmarks/decimal/convert_floating.cpp
index a367036c494..ac09c3400cb 100644
--- a/cpp/benchmarks/decimal/convert_floating.cpp
+++ b/cpp/benchmarks/decimal/convert_floating.cpp
@@ -32,8 +32,6 @@ void bench_cast_decimal(nvbench::state& state, nvbench::type_list<InputType, Out
 
   static constexpr bool is_double =
     std::is_same_v<InputType, double> || std::is_same_v<OutputType, double>;
-  static constexpr bool is_32bit =
-    std::is_same_v<InputType, numeric::decimal32> || std::is_same_v<OutputType, numeric::decimal32>;
   static constexpr bool is_128bit = std::is_same_v<InputType, numeric::decimal128> ||
                                     std::is_same_v<OutputType, numeric::decimal128>;
 
@@ -69,21 +67,6 @@ void bench_cast_decimal(nvbench::state& state, nvbench::type_list<InputType, Out
     return;
   }
 
-  // The current float <--> decimal conversion algorithm is limited
-  static constexpr bool is_64bit = !is_32bit && !is_128bit;
-  if (is_32bit && (exp_mode != 3)) {
-    state.skip("Decimal32 conversion only works up to scale factors of 10^9.");
-    return;
-  }
-  if (is_64bit && ((exp_mode < 2) || (exp_mode > 4))) {
-    state.skip("Decimal64 conversion only works up to scale factors of 10^18.");
-    return;
-  }
-  if (is_128bit && ((exp_mode == 0) || (exp_mode == 6))) {
-    state.skip("Decimal128 conversion only works up to scale factors of 10^38.");
-    return;
-  }
-
   // Type IDs
   auto const input_id  = cudf::type_to_id<InputType>();
   auto const output_id = cudf::type_to_id<OutputType>();
diff --git a/cpp/include/cudf/fixed_point/floating_conversion.hpp b/cpp/include/cudf/fixed_point/floating_conversion.hpp
index 2c3a5c5629d..c64ae8877d4 100644
--- a/cpp/include/cudf/fixed_point/floating_conversion.hpp
+++ b/cpp/include/cudf/fixed_point/floating_conversion.hpp
@@ -18,6 +18,7 @@
 
 #include <cudf/utilities/traits.hpp>
 
+#include <cuda/std/cmath>
 #include <cuda/std/limits>
 #include <cuda/std/type_traits>
 
@@ -34,6 +35,49 @@ namespace numeric {
 
 namespace detail {
 
+/**
+ * @brief Determine the number of significant bits in an integer
+ *
+ * @tparam T Type of input integer value. Must be either uint32_t, uint64_t, or __uint128_t
+ * @param value The integer whose bits are being counted
+ * @return The number of significant bits: the # of bits - # of leading zeroes
+ */
+template <typename T,
+          CUDF_ENABLE_IF(std::is_same_v<T, uint32_t> || std::is_same_v<T, uint64_t> ||
+                         std::is_same_v<T, __uint128_t>)>
+CUDF_HOST_DEVICE inline int count_significant_bits(T value)
+{
+#ifdef __CUDA_ARCH__
+  if constexpr (std::is_same_v<T, uint64_t>) {
+    return 64 - __clzll(static_cast<int64_t>(value));
+  } else if constexpr (std::is_same_v<T, uint32_t>) {
+    return 32 - __clz(static_cast<int32_t>(value));
+  } else if constexpr (std::is_same_v<T, __uint128_t>) {
+    // 128 bit type, must break up into high and low components
+    auto const high_bits = static_cast<int64_t>(value >> 64);
+    auto const low_bits  = static_cast<int64_t>(value);
+    return 128 - (__clzll(high_bits) + static_cast<int>(high_bits == 0) * __clzll(low_bits));
+  }
+#else
+  // Undefined behavior to call __builtin_clzll() with zero in gcc and clang
+  if (value == 0) { return 0; }
+
+  if constexpr (std::is_same_v<T, uint64_t>) {
+    return 64 - __builtin_clzll(value);
+  } else if constexpr (std::is_same_v<T, uint32_t>) {
+    return 32 - __builtin_clz(value);
+  } else if constexpr (std::is_same_v<T, __uint128_t>) {
+    // 128 bit type, must break up into high and low components
+    auto const high_bits = static_cast<uint64_t>(value >> 64);
+    if (high_bits == 0) {
+      return 64 - __builtin_clzll(static_cast<uint64_t>(value));
+    } else {
+      return 128 - __builtin_clzll(high_bits);
+    }
+  }
+#endif
+}
+
 /**
  * @brief Helper struct for getting and setting the components of a floating-point value
  *
@@ -62,27 +106,28 @@ struct floating_converter {
   // The low 23 / 52 bits (for float / double) are the mantissa.
   // The mantissa is normalized. There is an understood 1 bit to the left of the binary point.
   // The value of the mantissa is in the range [1, 2).
-  /// # mantissa bits (-1 for understood bit)
-  static constexpr int num_mantissa_bits = cuda::std::numeric_limits<FloatingType>::digits - 1;
+  /// # significand bits (includes understood bit)
+  static constexpr int num_significand_bits = cuda::std::numeric_limits<FloatingType>::digits;
+  /// # stored mantissa bits (-1 for understood bit)
+  static constexpr int num_stored_mantissa_bits = num_significand_bits - 1;
   /// The mask for the understood bit
-  static constexpr IntegralType understood_bit_mask = (IntegralType(1) << num_mantissa_bits);
+  static constexpr IntegralType understood_bit_mask = (IntegralType(1) << num_stored_mantissa_bits);
   /// The mask to select the mantissa
   static constexpr IntegralType mantissa_mask = understood_bit_mask - 1;
 
   // And in between are the bits used to store the biased power-of-2 exponent.
   /// # exponents bits (-1 for sign bit)
-  static constexpr int num_exponent_bits = num_floating_bits - num_mantissa_bits - 1;
+  static constexpr int num_exponent_bits = num_floating_bits - num_stored_mantissa_bits - 1;
   /// The mask for the exponents, unshifted
   static constexpr IntegralType unshifted_exponent_mask =
     (IntegralType(1) << num_exponent_bits) - 1;
   /// The mask to select the exponents
-  static constexpr IntegralType exponent_mask = unshifted_exponent_mask << num_mantissa_bits;
+  static constexpr IntegralType exponent_mask = unshifted_exponent_mask << num_stored_mantissa_bits;
 
   // To store positive and negative exponents as unsigned values, the stored value for
   // the power-of-2 is exponent + bias. The bias is 127 for floats and 1023 for doubles.
   /// 127 / 1023 for float / double
-  static constexpr IntegralType exponent_bias =
-    cuda::std::numeric_limits<FloatingType>::max_exponent - 1;
+  static constexpr int exponent_bias = cuda::std::numeric_limits<FloatingType>::max_exponent - 1;
 
   /**
    * @brief Reinterpret the bits of a floating-point value as an integer
@@ -113,15 +158,15 @@ struct floating_converter {
   }
 
   /**
-   * @brief Extracts the integral significand of a bit-casted floating-point number
+   * @brief Checks whether the bit-casted floating-point value is +/-0
    *
-   * @param integer_rep The bit-casted floating value to extract the exponent from
-   * @return The integral significand, bit-shifted to a (large) whole number
+   * @param integer_rep The bit-casted floating value to check if is +/-0
+   * @return True if is a zero, else false
    */
-  CUDF_HOST_DEVICE inline static IntegralType get_base2_value(IntegralType integer_rep)
+  CUDF_HOST_DEVICE inline static bool is_zero(IntegralType integer_rep)
   {
-    // Extract the significand, setting the high bit for the understood 1/2
-    return (integer_rep & mantissa_mask) | understood_bit_mask;
+    // It's a zero if every non-sign bit is zero
+    return ((integer_rep & ~sign_mask) == 0);
   }
 
   /**
@@ -137,40 +182,59 @@ struct floating_converter {
   }
 
   /**
-   * @brief Extracts the exponent of a bit-casted floating-point number
+   * @brief Extracts the significand and exponent of a bit-casted floating-point number,
+   * shifted for denormals.
    *
-   * @note This returns INT_MIN for +/-0, +/-inf, NaN's, and denormals
-   * For all of these cases, the decimal fixed_point number should be set to zero
+   * @note Zeros/inf/NaN not handled.
    *
    * @param integer_rep The bit-casted floating value to extract the exponent from
-   * @return The stored base-2 exponent, or INT_MIN for special values
+   * @return The stored base-2 exponent and significand, shifted for denormals
    */
-  CUDF_HOST_DEVICE inline static int get_exp2(IntegralType integer_rep)
+  CUDF_HOST_DEVICE inline static std::pair<IntegralType, int> get_significand_and_pow2(
+    IntegralType integer_rep)
   {
-    // First extract the exponent bits and handle its special values.
-    // To minimize branching, all of these special cases will return INT_MIN.
-    // For all of these cases, the decimal fixed_point number should be set to zero.
+    // Extract the significand
+    auto significand = (integer_rep & mantissa_mask);
+
+    // Extract the exponent bits.
     auto const exponent_bits = integer_rep & exponent_mask;
+
+    // Notes on special values of exponent_bits:
+    // bits = exponent_mask is +/-inf or NaN, but those are handled prior to input.
+    // bits = 0 is either a denormal (handled below) or a zero (handled earlier by caller).
+    int floating_pow2;
     if (exponent_bits == 0) {
-      // Because of the understood set-bit not stored in the mantissa, it is not possible
-      // to store the value zero directly. Instead both +/-0 and denormals are represented with
-      // the exponent bits set to zero.
-      // Thus it's fastest to just floor (generally unwanted) denormals to zero.
-      return INT_MIN;
-    } else if (exponent_bits == exponent_mask) {
-      //+/-inf and NaN values are stored with all of the exponent bits set.
-      // As none of these are representable by integers, we'll return the same value for all cases.
-      return INT_MIN;
+      // Denormal values are 2^(1 - exponent_bias) * Sum_i(B_i * 2^-i)
+      // Where i is the i-th mantissa bit (counting from the LEFT, starting at 1),
+      // and B_i is the value of that bit (0 or 1)
+      // So e.g. for the minimum denormal, only the lowest bit is set:
+      // FLT_TRUE_MIN = 2^(1 - 127) * 2^-23 = 2^-149
+      // DBL_TRUE_MIN = 2^(1 - 1023) * 2^-52 = 2^-1074
+      floating_pow2 = 1 - exponent_bias;
+
+      // Line-up denormal to same (understood) bit as normal numbers
+      // This is so bit-shifting starts at the same bit index
+      auto const lineup_shift = num_significand_bits - count_significant_bits(significand);
+      significand <<= lineup_shift;
+      floating_pow2 -= lineup_shift;
+    } else {
+      // Extract the exponent value: shift the bits down and subtract the bias.
+      auto const shifted_exponent_bits = exponent_bits >> num_stored_mantissa_bits;
+      floating_pow2                    = static_cast<int>(shifted_exponent_bits) - exponent_bias;
+
+      // Set the high bit for the understood 1/2
+      significand |= understood_bit_mask;
     }
 
-    // Extract the exponent value: shift the bits down and subtract the bias.
-    using SignedIntegralType                       = cuda::std::make_signed_t<IntegralType>;
-    SignedIntegralType const shifted_exponent_bits = exponent_bits >> num_mantissa_bits;
-    return shifted_exponent_bits - static_cast<SignedIntegralType>(exponent_bias);
+    // To convert the mantissa to an integer, we effectively applied #-mantissa-bits
+    // powers of 2 to convert the fractional value to an integer, so subtract them off here
+    int const pow2 = floating_pow2 - num_stored_mantissa_bits;
+
+    return {significand, pow2};
   }
 
   /**
-   * @brief Sets the sign bit of a positive floating-point number
+   * @brief Sets the sign bit of a floating-point number
    *
    * @param floating The floating-point value to set the sign of. Must be positive.
    * @param is_negative The sign bit to set for the floating-point number
@@ -192,83 +256,60 @@ struct floating_converter {
   /**
    * @brief Adds to the base-2 exponent of a floating-point number
    *
+   * @note The caller must guarantee that the input is a positive (> 0) whole number.
+   *
    * @param floating The floating value to add to the exponent of. Must be positive.
-   * @param exp2 The power-of-2 to add to the floating-point number
-   * @return The input floating-point value * 2^exp2
+   * @param pow2 The power-of-2 to add to the floating-point number
+   * @return The input floating-point value * 2^pow2
    */
-  CUDF_HOST_DEVICE inline static FloatingType add_exp2(FloatingType floating, int exp2)
+  CUDF_HOST_DEVICE inline static FloatingType add_pow2(FloatingType floating, int pow2)
   {
+    // Note that the input floating-point number is positive (& whole), so we don't have to
+    // worry about the sign here; the sign will be set later in set_is_negative()
+
     // Convert floating to integer
     auto integer_rep = bit_cast_to_integer(floating);
 
     // Extract the currently stored (biased) exponent
+    using SignedType   = std::make_signed_t<IntegralType>;
     auto exponent_bits = integer_rep & exponent_mask;
-    auto stored_exp2   = exponent_bits >> num_mantissa_bits;
+    auto stored_pow2   = static_cast<SignedType>(exponent_bits >> num_stored_mantissa_bits);
 
     // Add the additional power-of-2
-    stored_exp2 += exp2;
+    stored_pow2 += pow2;
 
     // Check for exponent over/under-flow.
-    // Note that the input floating-point number is always positive, so we don't have to
-    // worry about the sign here; the sign will be set later in set_is_negative()
-    if (stored_exp2 <= 0) {
-      return 0.0;
-    } else if (stored_exp2 >= unshifted_exponent_mask) {
+    if (stored_pow2 <= 0) {
+      // Denormal (zero handled prior to input)
+
+      // Early out if bit shift will zero it anyway.
+      // Note: We must handle this explicitly, as too-large a bit-shift is UB
+      auto const bit_shift = -stored_pow2 + 1;  //+1 due to understood bit set below
+      if (bit_shift > num_stored_mantissa_bits) { return 0.0; }
+
+      // Clear the exponent bits (zero means 2^-126/2^-1022 w/ no understood bit)
+      integer_rep &= (~exponent_mask);
+
+      // The input floating-point number has an "understood" bit that we need to set
+      // prior to bit-shifting. Set the understood bit.
+      integer_rep |= understood_bit_mask;
+
+      // Convert to denormal: bit shift off the low bits
+      integer_rep >>= bit_shift;
+    } else if (stored_pow2 >= static_cast<SignedType>(unshifted_exponent_mask)) {
+      // Overflow: Set infinity
       return cuda::std::numeric_limits<FloatingType>::infinity();
     } else {
-      // Clear existing exponent bits and set new ones
-      exponent_bits = stored_exp2 << num_mantissa_bits;
+      // Normal number: Clear existing exponent bits and set new ones
+      exponent_bits = static_cast<IntegralType>(stored_pow2) << num_stored_mantissa_bits;
       integer_rep &= (~exponent_mask);
       integer_rep |= exponent_bits;
-
-      // Convert back to float
-      return bit_cast_to_floating(integer_rep);
     }
-  }
-};
 
-/**
- * @brief Determine the number of significant bits in an integer
- *
- * @tparam T Type of input integer value. Must be either uint32_t, uint64_t, or __uint128_t
- * @param value The integer whose bits are being counted
- * @return The number of significant bits: the # of bits - # of leading zeroes
- */
-template <typename T,
-          CUDF_ENABLE_IF(std::is_same_v<T, uint32_t> || std::is_same_v<T, uint64_t> ||
-                         std::is_same_v<T, __uint128_t>)>
-CUDF_HOST_DEVICE inline int count_significant_bits(T value)
-{
-#ifdef __CUDA_ARCH__
-  if constexpr (std::is_same_v<T, uint64_t>) {
-    return 64 - __clzll(static_cast<int64_t>(value));
-  } else if constexpr (std::is_same_v<T, uint32_t>) {
-    return 32 - __clz(static_cast<int32_t>(value));
-  } else if constexpr (std::is_same_v<T, __uint128_t>) {
-    // 128 bit type, must break up into high and low components
-    auto const high_bits = static_cast<int64_t>(value >> 64);
-    auto const low_bits  = static_cast<int64_t>(value);
-    return 128 - (__clzll(high_bits) + static_cast<int>(high_bits == 0) * __clzll(low_bits));
-  }
-#else
-  // Undefined behavior to call __builtin_clzll() with zero in gcc and clang
-  if (value == 0) { return 0; }
-
-  if constexpr (std::is_same_v<T, uint64_t>) {
-    return 64 - __builtin_clzll(value);
-  } else if constexpr (std::is_same_v<T, uint32_t>) {
-    return 32 - __builtin_clz(value);
-  } else if constexpr (std::is_same_v<T, __uint128_t>) {
-    // 128 bit type, must break up into high and low components
-    auto const high_bits = static_cast<uint64_t>(value >> 64);
-    if (high_bits == 0) {
-      return 64 - __builtin_clzll(static_cast<uint64_t>(value));
-    } else {
-      return 128 - __builtin_clzll(high_bits);
-    }
+    // Convert back to float
+    return bit_cast_to_floating(integer_rep);
   }
-#endif
-}
+};
 
 /**
  * @brief Recursively calculate a signed large power of 10 (>= 10^19) that can only be stored in an
@@ -276,18 +317,18 @@ CUDF_HOST_DEVICE inline int count_significant_bits(T value)
  *
  * @note Intended to be run at compile time.
  *
- * @tparam Exp10 The power of 10 to calculate
- * @return Returns 10^Exp10
+ * @tparam Pow10 The power of 10 to calculate
+ * @return Returns 10^Pow10
  */
-template <int Exp10>
+template <int Pow10>
 constexpr __uint128_t large_power_of_10()
 {
   // Stop at 10^19 to speed up compilation; literals can be used for smaller powers of 10.
-  static_assert(Exp10 >= 19);
-  if constexpr (Exp10 == 19)
+  static_assert(Pow10 >= 19);
+  if constexpr (Pow10 == 19)
     return __uint128_t(10000000000000000000ULL);
   else
-    return large_power_of_10<Exp10 - 1>() * __uint128_t(10);
+    return large_power_of_10<Pow10 - 1>() * __uint128_t(10);
 }
 
 /**
@@ -295,11 +336,11 @@ constexpr __uint128_t large_power_of_10()
  *
  * @tparam T Type of value to be divided-from.
  * @param value The number to be divided-from.
- * @param exp10 The power-of-10 of the denominator, from 0 to 9 inclusive.
- * @return Returns value / 10^exp10
+ * @param pow10 The power-of-10 of the denominator, from 0 to 9 inclusive.
+ * @return Returns value / 10^pow10
  */
-template <typename T, typename cuda::std::enable_if_t<cuda::std::is_unsigned_v<T>>* = nullptr>
-CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int exp10)
+template <typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
+CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int pow10)
 {
   // Computing division this way is much faster than the alternatives.
   // Division is not implemented in GPU hardware, and the compiler will often implement it as a
@@ -309,7 +350,7 @@ CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int exp10)
 
   // Instead, if the compiler can see exactly what number it is dividing by, it can
   // produce much more optimal assembly, doing bit shifting, multiplies by a constant, etc.
-  // For the compiler to see the value though, array lookup (with exp10 as the index)
+  // For the compiler to see the value though, array lookup (with pow10 as the index)
   // is not sufficient: We have to use a switch statement. Although this introduces a branch,
   // it is still much faster than doing the divide any other way.
   // Perhaps an array can be used in C++23 with the assume attribute?
@@ -325,7 +366,7 @@ CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int exp10)
   // introduces too much pressure on the kernels that use this code, slowing down their benchmarks.
   // It also dramatically slows down the compile time.
 
-  switch (exp10) {
+  switch (pow10) {
     case 0: return value;
     case 1: return value / 10U;
     case 2: return value / 100U;
@@ -345,14 +386,14 @@ CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int exp10)
  *
  * @tparam T Type of value to be divided-from.
  * @param value The number to be divided-from.
- * @param exp10 The power-of-10 of the denominator, from 0 to 19 inclusive.
- * @return Returns value / 10^exp10
+ * @param pow10 The power-of-10 of the denominator, from 0 to 19 inclusive.
+ * @return Returns value / 10^pow10
  */
-template <typename T, typename cuda::std::enable_if_t<cuda::std::is_unsigned_v<T>>* = nullptr>
-CUDF_HOST_DEVICE inline T divide_power10_64bit(T value, int exp10)
+template <typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
+CUDF_HOST_DEVICE inline T divide_power10_64bit(T value, int pow10)
 {
   // See comments in divide_power10_32bit() for discussion.
-  switch (exp10) {
+  switch (pow10) {
     case 0: return value;
     case 1: return value / 10U;
     case 2: return value / 100U;
@@ -382,14 +423,14 @@ CUDF_HOST_DEVICE inline T divide_power10_64bit(T value, int exp10)
  *
  * @tparam T Type of value to be divided-from.
  * @param value The number to be divided-from.
- * @param exp10 The power-of-10 of the denominator, from 0 to 38 inclusive.
- * @return Returns value / 10^exp10.
+ * @param pow10 The power-of-10 of the denominator, from 0 to 38 inclusive.
+ * @return Returns value / 10^pow10.
  */
-template <typename T, typename cuda::std::enable_if_t<cuda::std::is_unsigned_v<T>>* = nullptr>
-CUDF_HOST_DEVICE inline constexpr T divide_power10_128bit(T value, int exp10)
+template <typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
+CUDF_HOST_DEVICE inline constexpr T divide_power10_128bit(T value, int pow10)
 {
   // See comments in divide_power10_32bit() for an introduction.
-  switch (exp10) {
+  switch (pow10) {
     case 0: return value;
     case 1: return value / 10U;
     case 2: return value / 100U;
@@ -438,14 +479,14 @@ CUDF_HOST_DEVICE inline constexpr T divide_power10_128bit(T value, int exp10)
  *
  * @tparam T Type of value to be multiplied.
  * @param value The number to be multiplied.
- * @param exp10 The power-of-10 of the multiplier, from 0 to 9 inclusive.
- * @return Returns value * 10^exp10
+ * @param pow10 The power-of-10 of the multiplier, from 0 to 9 inclusive.
+ * @return Returns value * 10^pow10
  */
-template <typename T, typename cuda::std::enable_if_t<cuda::std::is_unsigned_v<T>>* = nullptr>
-CUDF_HOST_DEVICE inline constexpr T multiply_power10_32bit(T value, int exp10)
+template <typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
+CUDF_HOST_DEVICE inline constexpr T multiply_power10_32bit(T value, int pow10)
 {
   // See comments in divide_power10_32bit() for discussion.
-  switch (exp10) {
+  switch (pow10) {
     case 0: return value;
     case 1: return value * 10U;
     case 2: return value * 100U;
@@ -465,14 +506,14 @@ CUDF_HOST_DEVICE inline constexpr T multiply_power10_32bit(T value, int exp10)
  *
  * @tparam T Type of value to be multiplied.
  * @param value The number to be multiplied.
- * @param exp10 The power-of-10 of the multiplier, from 0 to 19 inclusive.
- * @return Returns value * 10^exp10
+ * @param pow10 The power-of-10 of the multiplier, from 0 to 19 inclusive.
+ * @return Returns value * 10^pow10
  */
-template <typename T, typename cuda::std::enable_if_t<cuda::std::is_unsigned_v<T>>* = nullptr>
-CUDF_HOST_DEVICE inline constexpr T multiply_power10_64bit(T value, int exp10)
+template <typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
+CUDF_HOST_DEVICE inline constexpr T multiply_power10_64bit(T value, int pow10)
 {
   // See comments in divide_power10_32bit() for discussion.
-  switch (exp10) {
+  switch (pow10) {
     case 0: return value;
     case 1: return value * 10U;
     case 2: return value * 100U;
@@ -502,14 +543,14 @@ CUDF_HOST_DEVICE inline constexpr T multiply_power10_64bit(T value, int exp10)
  *
  * @tparam T Type of value to be multiplied.
  * @param value The number to be multiplied.
- * @param exp10 The power-of-10 of the multiplier, from 0 to 38 inclusive.
- * @return Returns value * 10^exp10.
+ * @param pow10 The power-of-10 of the multiplier, from 0 to 38 inclusive.
+ * @return Returns value * 10^pow10.
  */
-template <typename T, typename cuda::std::enable_if_t<cuda::std::is_unsigned_v<T>>* = nullptr>
-CUDF_HOST_DEVICE inline constexpr T multiply_power10_128bit(T value, int exp10)
+template <typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
+CUDF_HOST_DEVICE inline constexpr T multiply_power10_128bit(T value, int pow10)
 {
   // See comments in divide_power10_128bit() for discussion.
-  switch (exp10) {
+  switch (pow10) {
     case 0: return value;
     case 1: return value * 10U;
     case 2: return value * 100U;
@@ -556,59 +597,678 @@ CUDF_HOST_DEVICE inline constexpr T multiply_power10_128bit(T value, int exp10)
 /**
  * @brief Multiply an integer by a power of 10.
  *
- * @note Use this function if you have no a-priori knowledge of what exp10 might be.
+ * @note Use this function if you have no a-priori knowledge of what pow10 might be.
  * If you do, prefer calling the bit-size-specific versions
  *
  * @tparam Rep Representation type needed for integer exponentiation
  * @tparam T Integral type of value to be multiplied.
  * @param value The number to be multiplied.
- * @param exp10 The power-of-10 of the multiplier.
- * @return Returns value * 10^exp10
+ * @param pow10 The power-of-10 of the multiplier.
+ * @return Returns value * 10^pow10
  */
-template <typename Rep,
-          typename T,
-          typename cuda::std::enable_if_t<(cuda::std::is_unsigned_v<T>)>* = nullptr>
-CUDF_HOST_DEVICE inline constexpr T multiply_power10(T value, int exp10)
+template <typename Rep, typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
+CUDF_HOST_DEVICE inline constexpr T multiply_power10(T value, int pow10)
 {
-  // Use this function if you have no knowledge of what exp10 might be
+  // Use this function if you have no knowledge of what pow10 might be
   // If you do, prefer calling the bit-size-specific versions
   if constexpr (sizeof(Rep) <= 4) {
-    return multiply_power10_32bit(value, exp10);
+    return multiply_power10_32bit(value, pow10);
   } else if constexpr (sizeof(Rep) <= 8) {
-    return multiply_power10_64bit(value, exp10);
+    return multiply_power10_64bit(value, pow10);
   } else {
-    return multiply_power10_128bit(value, exp10);
+    return multiply_power10_128bit(value, pow10);
   }
 }
 
 /**
  * @brief Divide an integer by a power of 10.
  *
- * @note Use this function if you have no a-priori knowledge of what exp10 might be.
+ * @note Use this function if you have no a-priori knowledge of what pow10 might be.
  * If you do, prefer calling the bit-size-specific versions
  *
  * @tparam Rep Representation type needed for integer exponentiation
  * @tparam T Integral type of value to be divided-from.
  * @param value The number to be divided-from.
- * @param exp10 The power-of-10 of the denominator.
- * @return Returns value / 10^exp10
+ * @param pow10 The power-of-10 of the denominator.
+ * @return Returns value / 10^pow10
  */
-template <typename Rep,
-          typename T,
-          typename cuda::std::enable_if_t<(cuda::std::is_unsigned_v<T>)>* = nullptr>
-CUDF_HOST_DEVICE inline constexpr T divide_power10(T value, int exp10)
+template <typename Rep, typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
+CUDF_HOST_DEVICE inline constexpr T divide_power10(T value, int pow10)
 {
-  // Use this function if you have no knowledge of what exp10 might be
+  // Use this function if you have no knowledge of what pow10 might be
   // If you do, prefer calling the bit-size-specific versions
   if constexpr (sizeof(Rep) <= 4) {
-    return divide_power10_32bit(value, exp10);
+    return divide_power10_32bit(value, pow10);
   } else if constexpr (sizeof(Rep) <= 8) {
-    return divide_power10_64bit(value, exp10);
+    return divide_power10_64bit(value, pow10);
   } else {
-    return divide_power10_128bit(value, exp10);
+    return divide_power10_128bit(value, pow10);
   }
 }
 
+/**
+ * @brief Perform a bit-shift left, guarding against undefined behavior
+ *
+ * @tparam IntegerType Type of input unsigned integer value
+ * @param value The integer whose bits are being shifted
+ * @param bit_shift The number of bits to shift left
+ * @return The bit-shifted integer, except max value if UB would occur
+ */
+template <typename IntegerType, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<IntegerType>)>
+CUDF_HOST_DEVICE inline IntegerType guarded_left_shift(IntegerType value, int bit_shift)
+{
+  // Bit shifts larger than this are undefined behavior
+  constexpr int max_safe_bit_shift = cuda::std::numeric_limits<IntegerType>::digits - 1;
+  return (bit_shift <= max_safe_bit_shift) ? value << bit_shift
+                                           : cuda::std::numeric_limits<IntegerType>::max();
+}
+
+/**
+ * @brief Perform a bit-shift right, guarding against undefined behavior
+ *
+ * @tparam IntegerType Type of input unsigned integer value
+ * @param value The integer whose bits are being shifted
+ * @param bit_shift The number of bits to shift right
+ * @return The bit-shifted integer, which is zero on underflow
+ */
+template <typename IntegerType, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<IntegerType>)>
+CUDF_HOST_DEVICE inline IntegerType guarded_right_shift(IntegerType value, int bit_shift)
+{
+  // Bit shifts larger than this are undefined behavior
+  constexpr int max_safe_bit_shift = cuda::std::numeric_limits<IntegerType>::digits - 1;
+  return (bit_shift <= max_safe_bit_shift) ? value >> bit_shift : 0;
+}
+
+/**
+ * @brief Helper struct with common constants needed by the floating <--> decimal conversions
+ */
+template <typename FloatingType>
+struct shifting_constants {
+  /// Whether the type is double
+  static constexpr bool is_double = cuda::std::is_same_v<FloatingType, double>;
+
+  /// Integer type that can hold the value of the significand
+  using IntegerRep = std::conditional_t<is_double, uint64_t, uint32_t>;
+
+  /// Num bits needed to hold the significand
+  static constexpr auto num_significand_bits = cuda::std::numeric_limits<FloatingType>::digits;
+
+  /// Shift data back and forth in space of a type with 2x the starting bits, to give us enough room
+  using ShiftingRep = std::conditional_t<is_double, __uint128_t, uint64_t>;
+
+  // The significand of a float / double is 24 / 53 bits
+  // However, to uniquely represent each double / float as different #'s in decimal
+  // you need 17 / 9 digits (from std::numeric_limits<T>::max_digits10)
+  // To represent 10^17 / 10^9, you need 57 / 30 bits
+  // So we need to keep track of at least this # of bits during shifting to ensure no info is lost
+
+  // We will be alternately shifting our data back and forth by powers of 2 and 10 to convert
+  // between floating and decimal (see shifting functions for details).
+
+  // To iteratively shift back and forth, our 2's (bit-) and 10's (divide-/multiply-) shifts must
+  // be of nearly the same magnitude, or else we'll over-/under-flow our shifting integer
+
+  // 2^10 is approximately 10^3, so the largest shifts will have a 10/3 ratio
+  // The difference between 2^10 and 10^3 is 1024/1000: 2.4%
+  // So every time we shift by 10 bits and 3 decimal places, the 2s shift is an extra 2.4%
+
+  // This 2.4% error compounds each time we do an iteration.
+  // The min (normal) float is 2^-126.
+  // Min denormal: 2^-126 * 2^-23 (mantissa bits): 2^-149 = ~1.4E-45
+  // With our 10/3 shifting ratio, 149 (bit-shifts) * (3 / 10) = 44.7 (10s-shifts)
+  // 10^(-44.7) = 2E-45, which is off by ~1.4x from 1.4E-45
+
+  // Similarly, the min (normal) double is 2^-1022.
+  // Min denormal: 2^-1022 * 2^-52 (mantissa bits): 2^-1074 = 4.94E-324
+  // With our 10/3 shifting ratio, 1074 (bit-shifts) * (3 / 10) = 322.2 (10s-shifts)
+  // 10^(-322.2) = 6.4E-323, which is off by ~13.2x from 4.94E-324
+
+  // To account for this compounding error, we can either complicate our loop code (slow),
+  // or use extra bits (in the direction we're shifting the 2s!) to compensate:
+  // 4 extra bits for doubles (2^4 = 16 > 13.2x error), 1 extra for floats (2 > 1.4x error)
+  /// # buffer bits to account for shifting error
+  static constexpr int num_2s_shift_buffer_bits = is_double ? 4 : 1;
+
+  // How much room do we have for shifting?
+  // Float: 64-bit ShiftingRep - 31 (rep + buffer) = 33 bits. 2^33 = 8.6E9
+  // Double: 128-bit ShiftingRep - 61 (rep + buffer) = 67 bits. 2^67 = 1.5E20
+  // Thus for double / float we can shift up to 20 / 9 decimal places at once
+
+  // But, we need to stick to our 10-bits / 3-decimals shift ratio to not over/under-flow.
+  // To simplify our loop code, we'll keep to this ratio by instead shifting a max of
+  // 18 / 9 decimal places, for double / float (60 / 30 bits)
+  /// Max at-once decimal place shift
+  static constexpr int max_digits_shift = is_double ? 18 : 9;
+  /// Max at-once bit shift
+  static constexpr int max_bits_shift = max_digits_shift * 10 / 3;
+
+  // Pre-calculate 10^max_digits_shift. Note that 10^18 / 10^9 fits within IntegerRep
+  /// 10^max_digits_shift
+  static constexpr auto max_digits_shift_pow =
+    multiply_power10<IntegerRep>(IntegerRep(1), max_digits_shift);
+};
+
+/**
+ * @brief Add half a bit to integer rep of floating point if conversion causes truncation
+ *
+ * @note This fixes problems like 1.2 (value = 1.1999...) at scale -1 -> 11
+ *
+ * @tparam FloatingType Type of integer holding the floating-point significand
+ * @param floating The floating-point number to convert
+ * @param integer_rep The integer representation of the floating-point significand
+ * @param pow2 The power of 2 that needs to be applied to the significand
+ * @param pow10 The power of 10 that needs to be applied to the significand
+ * @return integer_rep, shifted 1 and ++'d if the conversion to decimal causes truncation
+ */
+template <typename FloatingType, CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
+CUDF_HOST_DEVICE cuda::std::pair<typename floating_converter<FloatingType>::IntegralType, int>
+add_half_if_truncates(FloatingType floating,
+                      typename floating_converter<FloatingType>::IntegralType integer_rep,
+                      int pow2,
+                      int pow10)
+{
+  // The user-supplied scale may truncate information, so we need to talk about rounding.
+  // We have chosen not to round, so we want 1.23456f with scale -4 to be decimal 12345
+
+  // But if we don't round at all, 1.2 (double) with scale -1 is 11 instead of 12!
+  // Why? Because 1.2 (double) is actually stored as 1.1999999... which we truncate to 1.1
+  // While correct (given our choice to truncate), this is surprising and undesirable.
+  // This problem happens because 1.2 is not perfectly representable in floating point,
+  // and the value 1.199999... happened to be closer to 1.2 than the next value (1.2000...1...)
+
+  // If the scale truncates information (we didn't choose to keep exactly 1.1999...), how
+  // do we make sure we store 1.2?  We'll add half an ulp! (unit in the last place)
+  // Then 1.1999... becomes 1.2000...1... which truncates to 1.2.
+  // And if it had been 1.2000...1..., adding half an ulp still truncates to 1.2
+
+  // Why 1/2 an ulp? Because that's all that is needed. The reason we have this problem in the
+  // first place is because the compiler rounded (e.g.) 1.2 to the nearest floating point number.
+  // The distance of this rounding is at most 1/2 ulp, otherwise we'd have rounded the other way.
+
+  // How do we add 1/2 an ulp? Just shift the bits left (updating pow2) and add 1.
+  // We'll always shift up so every input to the conversion algorithm is aligned the same way.
+
+  // If we add a full ulp we run into issues where we add too much and get the wrong result.
+  // This is because (e.g.) 2^23 = 8.4E6 which is not quite 7 digits of precision.
+  // So if we want 7 digits, that may "barely" truncate information; adding a 1 ulp is overkill.
+
+  // So when does the user-supplied scale truncate info?
+  // For powers > 0: When the 10s (scale) shift is larger than the corresponding bit-shift.
+  // For powers < 0: When the 10s shift is less than the corresponding bit-shift.
+
+  // Corresponding bit-shift:
+  // 2^10 is approximately 10^3, but this is off by 1.024%
+  // 1.024^30 is 2.03704, so this is high by one bit for every 30*3 = 90 powers of 10
+  // So 10^N = 2^(10*N/3 - N/90) = 2^(299*N/90)
+  // Do comparison without dividing, which loses information:
+  // Note: if shift is "equal," still truncates if pow2 < 0 (shifting UP by 2s, 2^10 > 10^3)
+  int const pow2_term  = 90 * pow2;
+  int const pow10_term = 299 * pow10;
+  bool const conversion_truncates =
+    (pow10_term > pow2_term) || ((pow2_term == pow10_term) && (pow2 < 0));
+
+  // However, don't add a half-bit if the input is a whole number!
+  // This is only for errors introduced by rounding decimal fractions!
+  bool const is_whole_number = (cuda::std::floor(floating) == floating);
+  bool const add_half_bit    = conversion_truncates && !is_whole_number;
+
+  // Add half a bit on truncation (shift to make room and update pow2)
+  integer_rep <<= 1;
+  --pow2;
+  integer_rep += static_cast<decltype(integer_rep)>(add_half_bit);
+
+  return {integer_rep, pow2};
+}
+
+/**
+ * @brief Perform base-2 -> base-10 fixed-point conversion for pow10 > 0
+ *
+ * @tparam Rep The type of the storage for the decimal value
+ * @tparam FloatingType The type of the original floating-point value we are converting from
+ * @param base2_value The base-2 fixed-point value we are converting from
+ * @param pow2 The number of powers of 2 to apply to convert from base-2
+ * @param pow10 The number of powers of 10 to apply to reach the desired scale factor
+ * @return Magnitude of the converted-to decimal integer
+ */
+template <typename Rep,
+          typename FloatingType,
+          CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
+CUDF_HOST_DEVICE inline cuda::std::make_unsigned_t<Rep> shift_to_decimal_pospow(
+  typename shifting_constants<FloatingType>::IntegerRep const base2_value, int pow2, int pow10)
+{
+  // To convert to decimal, we need to apply the input powers of 2 and 10
+  // The result will be (integer) base2_value * (2^pow2) / (10^pow10)
+  // Output type is ShiftingRep
+
+  // Here pow10 > 0 and pow2 > 0, so we need to shift left by 2s and divide by 10s.
+  // We'll iterate back and forth between them, shifting up by 2s
+  // and down by 10s until all of the powers have been applied.
+
+  // However the input base2_value type has virtually no spare room to shift our data
+  // without over- or under-flowing and losing precision.
+  // So we'll cast up to ShiftingRep: uint64 for float's, __uint128_t for double's
+  using Constants   = shifting_constants<FloatingType>;
+  using ShiftingRep = typename Constants::ShiftingRep;
+  auto shifting_rep = static_cast<ShiftingRep>(base2_value);
+
+  // We want to start with our significand bits at the top of the shifting range,
+  // so that we don't lose information we need on intermediary right-shifts.
+  // Note that since we're shifting 2s up, we need num_2s_shift_buffer_bits space on the high side,
+  // For all numbers this bit shift is a fixed distance, due to the understood 2^0 bit.
+  // Note that shift_from is +1 due to shift in add_half_if_truncates()
+  static constexpr int shift_up_to = sizeof(ShiftingRep) * 8 - Constants::num_2s_shift_buffer_bits;
+  static constexpr int shift_from  = Constants::num_significand_bits + 1;
+  static constexpr int max_init_shift = shift_up_to - shift_from;
+
+  // If our total bit shift is less than this, we don't need to iterate
+  using UnsignedRep = cuda::std::make_unsigned_t<Rep>;
+  if (pow2 <= max_init_shift) {
+    // Shift bits left, divide by 10s to apply the scale factor, and we're done.
+    shifting_rep = divide_power10<ShiftingRep>(shifting_rep << pow2, pow10);
+    // NOTE: Cast can overflow!
+    return static_cast<UnsignedRep>(shifting_rep);
+  }
+
+  // We need to iterate. Do the combined initial shift
+  shifting_rep <<= max_init_shift;
+  pow2 -= max_init_shift;
+
+  // Iterate, dividing by 10s and shifting up by 2s until we're almost done
+  while (pow10 > Constants::max_digits_shift) {
+    // More decimal places to shift than we have room: Divide the max number of 10s
+    shifting_rep /= Constants::max_digits_shift_pow;
+    pow10 -= Constants::max_digits_shift;
+
+    // If our remaining bit shift is less than the max, we're finished iterating
+    if (pow2 <= Constants::max_bits_shift) {
+      // Shift bits left, divide by 10s to apply the scale factor, and we're done.
+      shifting_rep = divide_power10<ShiftingRep>(shifting_rep << pow2, pow10);
+
+      // NOTE: Cast can overflow!
+      return static_cast<UnsignedRep>(shifting_rep);
+    }
+
+    // Shift the max number of bits left again
+    shifting_rep <<= Constants::max_bits_shift;
+    pow2 -= Constants::max_bits_shift;
+  }
+
+  // Last 10s-shift: Divide all remaining decimal places, shift all remaining bits, then bail
+  // Note: This divide result may not fit in the low half of the bit range
+  // But the divisor is less than the max-shift, and thus fits within 64 / 32 bits
+  if constexpr (Constants::is_double) {
+    shifting_rep = divide_power10_64bit(shifting_rep, pow10);
+  } else {
+    shifting_rep = divide_power10_32bit(shifting_rep, pow10);
+  }
+
+  // Final bit shift: Shift may be large, guard against UB
+  // NOTE: This can overflow (both cast and shift)!
+  return guarded_left_shift(static_cast<UnsignedRep>(shifting_rep), pow2);
+}
+
+/**
+ * @brief Perform base-2 -> base-10 fixed-point conversion for pow10 < 0
+ *
+ * @tparam Rep The type of the storage for the decimal value
+ * @tparam FloatingType The type of the original floating-point value we are converting from
+ * @param base2_value The base-2 fixed-point value we are converting from
+ * @param pow2 The number of powers of 2 to apply to convert from base-2
+ * @param pow10 The number of powers of 10 to apply to reach the desired scale factor
+ * @return Magnitude of the converted-to decimal integer
+ */
+template <typename Rep,
+          typename FloatingType,
+          CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
+CUDF_HOST_DEVICE inline cuda::std::make_unsigned_t<Rep> shift_to_decimal_negpow(
+  typename shifting_constants<FloatingType>::IntegerRep base2_value, int pow2, int pow10)
+{
+  // This is similar to shift_to_decimal_pospow(), except pow10 < 0 & pow2 < 0
+  // See comments in that function for details.
+  // Instead here we need to multiply by 10s and shift right by 2s
+
+  // ShiftingRep: uint64 for float's, __uint128_t for double's
+  using Constants   = shifting_constants<FloatingType>;
+  using ShiftingRep = typename Constants::ShiftingRep;
+  auto shifting_rep = static_cast<ShiftingRep>(base2_value);
+
+  // Convert to using positive values so we don't have keep negating
+  int pow10_mag = -pow10;
+  int pow2_mag  = -pow2;
+
+  // For performing final 10s-shift
+  using UnsignedRep        = cuda::std::make_unsigned_t<Rep>;
+  auto final_shifts_low10s = [&]() {
+    // Last 10s-shift: multiply all remaining decimal places, shift all remaining bits, then bail
+    // The multiplier is less than the max-shift, and thus fits within 64 / 32 bits
+    if constexpr (Constants::is_double) {
+      shifting_rep = multiply_power10_64bit(shifting_rep, pow10_mag);
+    } else {
+      shifting_rep = multiply_power10_32bit(shifting_rep, pow10_mag);
+    }
+
+    // Final bit shifting: Shift may be large, guard against UB
+    return static_cast<UnsignedRep>(guarded_right_shift(shifting_rep, pow2_mag));
+  };
+
+  // If our total decimal shift is less than the max, we don't need to iterate
+  if (pow10_mag <= Constants::max_digits_shift) { return final_shifts_low10s(); }
+
+  // We want to start by lining up our bits to the top of the shifting range,
+  // except our first operation is a multiply, so not quite that far
+  // We are bit-shifting down, so we need extra bits on the low-side, which this has.
+  // Note that shift_from is +1 due to shift in add_half_if_truncates()
+  static constexpr int shift_up_to        = sizeof(ShiftingRep) * 8 - Constants::max_bits_shift;
+  static constexpr int shift_from         = Constants::num_significand_bits + 1;
+  static constexpr int num_init_bit_shift = shift_up_to - shift_from;
+
+  // Perform initial shift
+  shifting_rep <<= num_init_bit_shift;
+  pow2_mag += num_init_bit_shift;
+
+  // Iterate, multiplying by 10s and shifting down by 2s until we're almost done
+  do {
+    // More decimal places to shift than we have room: Multiply the max number of 10s
+    shifting_rep *= Constants::max_digits_shift_pow;
+    pow10_mag -= Constants::max_digits_shift;
+
+    // If our remaining bit shift is less than the max, we're finished iterating
+    if (pow2_mag <= Constants::max_bits_shift) {
+      // Last bit-shift: Shift all remaining bits, apply the remaining scale, then bail
+      shifting_rep >>= pow2_mag;
+
+      // We need to convert to the output rep for the final scale-factor multiply, because if (e.g.)
+      // float -> dec128 and some large pow10_mag, it might overflow the 64bit shifting rep.
+      // It's not needed for pow10 > 0 because we're dividing by 10s there instead of multiplying.
+      // NOTE: This can overflow! (Both multiply and cast)
+      return multiply_power10<UnsignedRep>(static_cast<UnsignedRep>(shifting_rep), pow10_mag);
+    }
+
+    // More bits to shift than we have room: Shift the max number of 2s
+    shifting_rep >>= Constants::max_bits_shift;
+    pow2_mag -= Constants::max_bits_shift;
+  } while (pow10_mag > Constants::max_digits_shift);
+
+  // Do our final shifts
+  return final_shifts_low10s();
+}
+
+/**
+ * @brief Perform base-2 -> base-10 fixed-point conversion
+ *
+ * @tparam Rep The type of integer we are converting to, to store the decimal value
+ * @tparam FloatingType The type of floating-point object we are converting from
+ * @param base2_value The base-2 fixed-point value we are converting from
+ * @param pow2 The number of powers of 2 to apply to convert from base-2
+ * @param pow10 The number of powers of 10 to apply to reach the desired scale factor
+ * @return Integer representation of the floating-point value, given the desired scale
+ */
+template <typename Rep,
+          typename FloatingType,
+          CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
+CUDF_HOST_DEVICE inline cuda::std::make_unsigned_t<Rep> convert_floating_to_integral_shifting(
+  typename floating_converter<FloatingType>::IntegralType base2_value, int pow10, int pow2)
+{
+  // Apply the powers of 2 and 10 to convert to decimal.
+  // The result will be base2_value * (2^pow2) / (10^pow10)
+
+  // Note that while this code is branchy, the decimal scale factor is part of the
+  // column type itself, so every thread will take the same branches on pow10.
+  // Also data within a column tends to be similar, so they will often take the
+  // same branches on pow2 as well.
+
+  // NOTE: some returns here can overflow (e.g. ShiftingRep -> UnsignedRep)
+  using UnsignedRep = cuda::std::make_unsigned_t<Rep>;
+  if (pow10 == 0) {
+    // NOTE: Left Bit-shift can overflow! As can cast! (e.g. double -> decimal32)
+    // Bit shifts may be large, guard against UB
+    if (pow2 >= 0) {
+      return guarded_left_shift(static_cast<UnsignedRep>(base2_value), pow2);
+    } else {
+      return static_cast<UnsignedRep>(guarded_right_shift(base2_value, -pow2));
+    }
+  } else if (pow10 > 0) {
+    if (pow2 <= 0) {
+      // Power-2/10 shifts both downward: order doesn't matter, apply and bail.
+      // Guard against shift being undefined behavior
+      auto const shifted = guarded_right_shift(base2_value, -pow2);
+      return static_cast<UnsignedRep>(divide_power10<decltype(shifted)>(shifted, pow10));
+    }
+    return shift_to_decimal_pospow<Rep, FloatingType>(base2_value, pow2, pow10);
+  } else {  // pow10 < 0
+    if (pow2 >= 0) {
+      // Power-2/10 shifts both upward: order doesn't matter, apply and bail.
+      // NOTE: Either shift, multiply, or cast (e.g. double -> decimal32) can overflow!
+      auto const shifted = guarded_left_shift(static_cast<UnsignedRep>(base2_value), pow2);
+      return multiply_power10<UnsignedRep>(shifted, -pow10);
+    }
+    return shift_to_decimal_negpow<Rep, FloatingType>(base2_value, pow2, pow10);
+  }
+}
+
+/**
+ * @brief Perform floating-point -> integer decimal conversion
+ *
+ * @tparam Rep The type of integer we are converting to, to store the decimal value
+ * @tparam FloatingType The type of floating-point object we are converting from
+ * @param floating The floating point value to convert
+ * @param scale The desired base-10 scale factor: decimal value = returned value * 10^scale
+ * @return Integer representation of the floating-point value, given the desired scale
+ */
+template <typename Rep,
+          typename FloatingType,
+          CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
+CUDF_HOST_DEVICE inline Rep convert_floating_to_integral(FloatingType const& floating,
+                                                         scale_type const& scale)
+{
+  // Extract components of the floating point number
+  using converter        = floating_converter<FloatingType>;
+  auto const integer_rep = converter::bit_cast_to_integer(floating);
+  if (converter::is_zero(integer_rep)) { return 0; }
+
+  // Note that the significand here is an unsigned integer with sizeof(FloatingType)
+  auto const is_negative                  = converter::get_is_negative(integer_rep);
+  auto const [significand, floating_pow2] = converter::get_significand_and_pow2(integer_rep);
+
+  // Add half a bit if truncating to yield expected value, see function for discussion.
+  auto const pow10 = static_cast<int>(scale);
+  auto const [base2_value, pow2] =
+    add_half_if_truncates(floating, significand, floating_pow2, pow10);
+
+  // Apply the powers of 2 and 10 to convert to decimal.
+  auto const magnitude =
+    convert_floating_to_integral_shifting<Rep, FloatingType>(base2_value, pow10, pow2);
+
+  // Reapply the sign and return
+  // NOTE: Cast can overflow!
+  auto const signed_magnitude = static_cast<Rep>(magnitude);
+  return is_negative ? -signed_magnitude : signed_magnitude;
+}
+
+/**
+ * @brief Perform base-10 -> base-2 fixed-point conversion for pow10 > 0
+ *
+ * @tparam DecimalRep The decimal integer type we are converting from
+ * @tparam FloatingType The type of floating point object we are converting to
+ * @param decimal_rep The decimal integer to convert
+ * @param pow10 The number of powers of 10 to apply to undo the scale factor
+ * @return A pair of the base-2 value and the remaining powers of 2 to be applied
+ */
+template <typename FloatingType,
+          typename DecimalRep,
+          CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
+CUDF_HOST_DEVICE inline auto shift_to_binary_pospow(DecimalRep decimal_rep, int pow10)
+{
+  // This is the reverse of shift_to_decimal_pospow(), see that for more details.
+
+  // ShiftingRep: uint64 for float's, __uint128_t for double's
+  using Constants   = shifting_constants<FloatingType>;
+  using ShiftingRep = typename Constants::ShiftingRep;
+
+  // We want to start by lining up our bits to the top of the shifting range,
+  // except our first operation is a multiply, so not quite that far
+  // We are bit-shifting down, so we need extra bits on the low-side, which this has.
+  static constexpr int shift_up_to = sizeof(ShiftingRep) * 8 - Constants::max_bits_shift;
+  int const shift_from             = count_significant_bits(decimal_rep);
+  int const num_init_bit_shift     = shift_up_to - shift_from;
+  int pow2                         = -num_init_bit_shift;
+
+  // Perform the initial bit shift
+  ShiftingRep shifting_rep;
+  if constexpr (sizeof(ShiftingRep) < sizeof(DecimalRep)) {
+    // Shift within DecimalRep before dropping to the smaller ShiftingRep
+    decimal_rep  = (pow2 >= 0) ? (decimal_rep >> pow2) : (decimal_rep << -pow2);
+    shifting_rep = static_cast<ShiftingRep>(decimal_rep);
+  } else {
+    // Scale up to ShiftingRep before shifting
+    shifting_rep = static_cast<ShiftingRep>(decimal_rep);
+    shifting_rep = (pow2 >= 0) ? (shifting_rep >> pow2) : (shifting_rep << -pow2);
+  }
+
+  // Iterate, multiplying by 10s and shifting down by 2s until we're almost done
+  while (pow10 > Constants::max_digits_shift) {
+    // More decimal places to shift than we have room: Multiply the max number of 10s
+    shifting_rep *= Constants::max_digits_shift_pow;
+    pow10 -= Constants::max_digits_shift;
+
+    // Then make more room by bit shifting down by the max # of 2s
+    shifting_rep >>= Constants::max_bits_shift;
+    pow2 += Constants::max_bits_shift;
+  }
+
+  // Last 10s-shift: multiply all remaining decimal places
+  // The multiplier is less than the max-shift, and thus fits within 64 / 32 bits
+  if constexpr (Constants::is_double) {
+    shifting_rep = multiply_power10_64bit(shifting_rep, pow10);
+  } else {
+    shifting_rep = multiply_power10_32bit(shifting_rep, pow10);
+  }
+
+  // Our shifting_rep is now the integer mantissa, return it and the powers of 2
+  return std::pair{shifting_rep, pow2};
+}
+
+/**
+ * @brief Perform base-10 -> base-2 fixed-point conversion for pow10 < 0
+ *
+ * @tparam DecimalRep The decimal integer type we are converting from
+ * @tparam FloatingType The type of floating point object we are converting to
+ * @param decimal_rep The decimal integer to convert
+ * @param pow10 The number of powers of 10 to apply to undo the scale factor
+ * @return A pair of the base-2 value and the remaining powers of 2 to be applied
+ */
+template <typename FloatingType,
+          typename DecimalRep,
+          CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
+CUDF_HOST_DEVICE inline auto shift_to_binary_negpow(DecimalRep decimal_rep, int const pow10)
+{
+  // This is the reverse of shift_to_decimal_negpow(), see that for more details.
+
+  // ShiftingRep: uint64 for float's, __uint128_t for double's
+  using Constants   = shifting_constants<FloatingType>;
+  using ShiftingRep = typename Constants::ShiftingRep;
+
+  // We want to start with our significand bits at the top of the shifting range,
+  // so that we lose minimal information we need on intermediary right-shifts.
+  // Note that since we're shifting 2s up, we need num_2s_shift_buffer_bits space on the high side
+  static constexpr int shift_up_to = sizeof(ShiftingRep) * 8 - Constants::num_2s_shift_buffer_bits;
+  int const shift_from             = count_significant_bits(decimal_rep);
+  int const num_init_bit_shift     = shift_up_to - shift_from;
+  int pow2                         = -num_init_bit_shift;
+
+  // Perform the initial bit shift
+  ShiftingRep shifting_rep;
+  if constexpr (sizeof(ShiftingRep) < sizeof(DecimalRep)) {
+    // Shift within DecimalRep before dropping to the smaller ShiftingRep
+    decimal_rep  = (pow2 >= 0) ? (decimal_rep >> pow2) : (decimal_rep << -pow2);
+    shifting_rep = static_cast<ShiftingRep>(decimal_rep);
+  } else {
+    // Scale up to ShiftingRep before shifting
+    shifting_rep = static_cast<ShiftingRep>(decimal_rep);
+    shifting_rep = (pow2 >= 0) ? (shifting_rep >> pow2) : (shifting_rep << -pow2);
+  }
+
+  // Convert to using positive values upfront, simpler than doing later.
+  int pow10_mag = -pow10;
+
+  // Iterate, dividing by 10s and shifting up by 2s until we're almost done
+  while (pow10_mag > Constants::max_digits_shift) {
+    // More decimal places to shift than we have room: Divide the max number of 10s
+    shifting_rep /= Constants::max_digits_shift_pow;
+    pow10_mag -= Constants::max_digits_shift;
+
+    // Then make more room by bit shifting up by the max # of 2s
+    shifting_rep <<= Constants::max_bits_shift;
+    pow2 -= Constants::max_bits_shift;
+  }
+
+  // Last 10s-shift: Divdie all remaining decimal places.
+  // This divide result may not fit in the low half of the bit range
+  // But the divisor is less than the max-shift, and thus fits within 64 / 32 bits
+  if constexpr (Constants::is_double) {
+    shifting_rep = divide_power10_64bit(shifting_rep, pow10_mag);
+  } else {
+    shifting_rep = divide_power10_32bit(shifting_rep, pow10_mag);
+  }
+
+  // Our shifting_rep is now the integer mantissa, return it and the powers of 2
+  return std::pair{shifting_rep, pow2};
+}
+
+/**
+ * @brief Perform integer decimal -> floating-point conversion
+ *
+ * @tparam FloatingType The type of floating-point object we are converting to
+ * @tparam Rep The decimal integer type we are converting from
+ * @param value The decimal integer to convert
+ * @param scale The base-10 scale factor for the input integer
+ * @return Floating-point representation of the scaled integral value
+ */
+template <typename FloatingType,
+          typename Rep,
+          CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
+CUDF_HOST_DEVICE inline FloatingType convert_integral_to_floating(Rep const& value,
+                                                                  scale_type const& scale)
+{
+  // Check the sign of the input
+  bool const is_negative = (value < 0);
+
+  // Convert to unsigned for bit counting/shifting
+  using UnsignedType        = cuda::std::make_unsigned_t<Rep>;
+  auto const unsigned_value = [&]() -> UnsignedType {
+    // Must guard against minimum value, as we can't just negate it: not representable.
+    if (value == cuda::std::numeric_limits<Rep>::min()) { return static_cast<UnsignedType>(value); }
+
+    // No abs function for 128bit types, so have to do it manually.
+    if constexpr (cuda::std::is_same_v<Rep, __int128_t>) {
+      return static_cast<UnsignedType>(is_negative ? -value : value);
+    } else {
+      return cuda::std::abs(value);
+    }
+  }();
+
+  // Shift by powers of 2 and 10 to get our integer mantissa
+  auto const [mantissa, pow2] = [&]() {
+    auto const pow10 = static_cast<int32_t>(scale);
+    if (pow10 >= 0) {
+      return shift_to_binary_pospow<FloatingType>(unsigned_value, pow10);
+    } else {  // pow10 < 0
+      return shift_to_binary_negpow<FloatingType>(unsigned_value, pow10);
+    }
+  }();
+
+  // Zero has special exponent bits, just handle it here
+  if (mantissa == 0) { return FloatingType(0.0f); }
+
+  // Cast our integer mantissa to floating point
+  auto const floating = static_cast<FloatingType>(mantissa);  // IEEE-754 rounds to even
+
+  // Apply the sign and the remaining powers of 2
+  using converter      = floating_converter<FloatingType>;
+  auto const magnitude = converter::add_pow2(floating, pow2);
+  return converter::set_is_negative(magnitude, is_negative);
+}
+
 }  // namespace detail
 
 /** @} */  // end of group
diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp
index 74c8bc67d3a..8a515335351 100644
--- a/cpp/include/cudf/unary.hpp
+++ b/cpp/include/cudf/unary.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/fixed_point/fixed_point.hpp>
+#include <cudf/fixed_point/floating_conversion.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/traits.hpp>
@@ -50,14 +51,19 @@ namespace cudf {
  */
 template <typename Fixed,
           typename Floating,
-          typename cuda::std::enable_if_t<is_fixed_point<Fixed>() &&
-                                          cuda::std::is_floating_point_v<Floating>>* = nullptr>
+          CUDF_ENABLE_IF(cuda::std::is_floating_point_v<Floating>&& is_fixed_point<Fixed>())>
 CUDF_HOST_DEVICE Fixed convert_floating_to_fixed(Floating floating, numeric::scale_type scale)
 {
-  using Rep          = typename Fixed::rep;
-  auto const shifted = numeric::detail::shift<Rep, Fixed::rad>(floating, scale);
-  numeric::scaled_integer<Rep> scaled{static_cast<Rep>(shifted), scale};
-  return Fixed(scaled);
+  using Rep        = typename Fixed::rep;
+  auto const value = [&]() {
+    if constexpr (Fixed::rad == numeric::Radix::BASE_10) {
+      return numeric::detail::convert_floating_to_integral<Rep>(floating, scale);
+    } else {
+      return static_cast<Rep>(numeric::detail::shift<Rep, Fixed::rad>(floating, scale));
+    }
+  }();
+
+  return Fixed(numeric::scaled_integer<Rep>{value, scale});
 }
 
 /**
@@ -75,14 +81,17 @@ CUDF_HOST_DEVICE Fixed convert_floating_to_fixed(Floating floating, numeric::sca
  */
 template <typename Floating,
           typename Fixed,
-          typename cuda::std::enable_if_t<cuda::std::is_floating_point_v<Floating> &&
-                                          is_fixed_point<Fixed>()>* = nullptr>
+          CUDF_ENABLE_IF(cuda::std::is_floating_point_v<Floating>&& is_fixed_point<Fixed>())>
 CUDF_HOST_DEVICE Floating convert_fixed_to_floating(Fixed fixed)
 {
-  using Rep         = typename Fixed::rep;
-  auto const casted = static_cast<Floating>(fixed.value());
-  auto const scale  = numeric::scale_type{-fixed.scale()};
-  return numeric::detail::shift<Rep, Fixed::rad>(casted, scale);
+  using Rep = typename Fixed::rep;
+  if constexpr (Fixed::rad == numeric::Radix::BASE_10) {
+    return numeric::detail::convert_integral_to_floating<Floating>(fixed.value(), fixed.scale());
+  } else {
+    auto const casted = static_cast<Floating>(fixed.value());
+    auto const scale  = numeric::scale_type{-fixed.scale()};
+    return numeric::detail::shift<Rep, Fixed::rad>(casted, scale);
+  }
 }
 
 /**
@@ -95,7 +104,7 @@ CUDF_HOST_DEVICE Floating convert_fixed_to_floating(Fixed fixed)
  */
 template <typename Floating,
           typename Input,
-          typename cuda::std::enable_if_t<cuda::std::is_floating_point_v<Floating>>* = nullptr>
+          CUDF_ENABLE_IF(cuda::std::is_floating_point_v<Floating>)>
 CUDF_HOST_DEVICE Floating convert_to_floating(Input input)
 {
   if constexpr (is_fixed_point<Input>()) {
diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp
index ab7984d4b03..a222289216d 100644
--- a/cpp/tests/fixed_point/fixed_point_tests.cpp
+++ b/cpp/tests/fixed_point/fixed_point_tests.cpp
@@ -38,7 +38,7 @@ struct FixedPointTest : public cudf::test::BaseFixture {};
 template <typename T>
 struct FixedPointTestAllReps : public cudf::test::BaseFixture {};
 
-using RepresentationTypes = ::testing::Types<int32_t, int64_t>;
+using RepresentationTypes = ::testing::Types<int32_t, int64_t, __int128_t>;
 
 TYPED_TEST_SUITE(FixedPointTestAllReps, RepresentationTypes);
 
@@ -53,6 +53,7 @@ TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXConstruction)
   auto num4 = cudf::convert_floating_to_fixed<decimalXX>(1.234567, scale_type(-4));
   auto num5 = cudf::convert_floating_to_fixed<decimalXX>(1.234567, scale_type(-5));
   auto num6 = cudf::convert_floating_to_fixed<decimalXX>(1.234567, scale_type(-6));
+  auto num7 = cudf::convert_floating_to_fixed<decimalXX>(0.0, scale_type(-4));
 
   EXPECT_EQ(1, cudf::convert_fixed_to_floating<double>(num0));
   EXPECT_EQ(1.2, cudf::convert_fixed_to_floating<double>(num1));
@@ -61,6 +62,7 @@ TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXConstruction)
   EXPECT_EQ(1.2345, cudf::convert_fixed_to_floating<double>(num4));
   EXPECT_EQ(1.23456, cudf::convert_fixed_to_floating<double>(num5));
   EXPECT_EQ(1.234567, cudf::convert_fixed_to_floating<double>(num6));
+  EXPECT_EQ(0.0, cudf::convert_fixed_to_floating<double>(num7));
 }
 
 TYPED_TEST(FixedPointTestAllReps, SimpleNegativeDecimalXXConstruction)
@@ -74,6 +76,7 @@ TYPED_TEST(FixedPointTestAllReps, SimpleNegativeDecimalXXConstruction)
   auto num4 = cudf::convert_floating_to_fixed<decimalXX>(-1.234567, scale_type(-4));
   auto num5 = cudf::convert_floating_to_fixed<decimalXX>(-1.234567, scale_type(-5));
   auto num6 = cudf::convert_floating_to_fixed<decimalXX>(-1.234567, scale_type(-6));
+  auto num7 = cudf::convert_floating_to_fixed<decimalXX>(-0.0, scale_type(-4));
 
   EXPECT_EQ(-1, cudf::convert_fixed_to_floating<double>(num0));
   EXPECT_EQ(-1.2, cudf::convert_fixed_to_floating<double>(num1));
@@ -82,6 +85,7 @@ TYPED_TEST(FixedPointTestAllReps, SimpleNegativeDecimalXXConstruction)
   EXPECT_EQ(-1.2345, cudf::convert_fixed_to_floating<double>(num4));
   EXPECT_EQ(-1.23456, cudf::convert_fixed_to_floating<double>(num5));
   EXPECT_EQ(-1.234567, cudf::convert_fixed_to_floating<double>(num6));
+  EXPECT_EQ(-0.0, cudf::convert_fixed_to_floating<double>(num7));
 }
 
 TYPED_TEST(FixedPointTestAllReps, PaddedDecimalXXConstruction)
@@ -99,14 +103,10 @@ TYPED_TEST(FixedPointTestAllReps, PaddedDecimalXXConstruction)
 
   EXPECT_EQ(1.1, cudf::convert_fixed_to_floating<double>(a));
   EXPECT_EQ(1.01, cudf::convert_fixed_to_floating<double>(b));
-  EXPECT_EQ(1,
-            cudf::convert_fixed_to_floating<double>(
-              c));  // intentional (inherited problem from floating point)
+  EXPECT_EQ(1.001, cudf::convert_fixed_to_floating<double>(c));
   EXPECT_EQ(1.0001, cudf::convert_fixed_to_floating<double>(d));
   EXPECT_EQ(1.00001, cudf::convert_fixed_to_floating<double>(e));
-  EXPECT_EQ(1,
-            cudf::convert_fixed_to_floating<double>(
-              f));  // intentional (inherited problem from floating point)
+  EXPECT_EQ(1.000001, cudf::convert_fixed_to_floating<double>(f));
 
   EXPECT_TRUE(1.000123 - cudf::convert_fixed_to_floating<double>(x) <
               std::numeric_limits<double>::epsilon());
@@ -153,6 +153,119 @@ TYPED_TEST(FixedPointTestAllReps, MoreSimpleBinaryFPConstruction)
   EXPECT_EQ(2.0625, cudf::convert_fixed_to_floating<double>(num1));
 }
 
+TEST_F(FixedPointTest, PreciseFloatDecimal64Construction)
+{
+  // Need 9 decimal digits to uniquely represent all floats (numeric_limits::max_digits10()).
+  // Precise conversion: set the scale factor to 9 less than the order-of-magnitude.
+  // But with -9 scale factor decimal32 can overflow: use decimal64 instead.
+
+  // Positive Exponent
+  {
+    auto num0 = cudf::convert_floating_to_fixed<decimal64>(3.141593E7f, scale_type(-2));
+    auto num1 = cudf::convert_floating_to_fixed<decimal64>(3.141593E12f, scale_type(3));
+    auto num2 = cudf::convert_floating_to_fixed<decimal64>(3.141593E17f, scale_type(8));
+    auto num3 = cudf::convert_floating_to_fixed<decimal64>(3.141593E22f, scale_type(13));
+    auto num4 = cudf::convert_floating_to_fixed<decimal64>(3.141593E27f, scale_type(18));
+    auto num5 = cudf::convert_floating_to_fixed<decimal64>(3.141593E32f, scale_type(23));
+    auto num6 = cudf::convert_floating_to_fixed<decimal64>(3.141593E37f, scale_type(28));
+
+    EXPECT_EQ(3.141593E7f, cudf::convert_fixed_to_floating<float>(num0));
+    EXPECT_EQ(3.141593E12f, cudf::convert_fixed_to_floating<float>(num1));
+    EXPECT_EQ(3.141593E17f, cudf::convert_fixed_to_floating<float>(num2));
+    EXPECT_EQ(3.141593E22f, cudf::convert_fixed_to_floating<float>(num3));
+    EXPECT_EQ(3.141593E27f, cudf::convert_fixed_to_floating<float>(num4));
+    EXPECT_EQ(3.141593E32f, cudf::convert_fixed_to_floating<float>(num5));
+    EXPECT_EQ(3.141593E37f, cudf::convert_fixed_to_floating<float>(num6));
+  }
+
+  // Negative Exponent
+  {
+    auto num0 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-7f, scale_type(-16));
+    auto num1 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-12f, scale_type(-21));
+    auto num2 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-17f, scale_type(-26));
+    auto num3 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-22f, scale_type(-31));
+    auto num4 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-27f, scale_type(-36));
+    auto num5 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-32f, scale_type(-41));
+    auto num6 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-37f, scale_type(-47));
+
+    EXPECT_EQ(3.141593E-7f, cudf::convert_fixed_to_floating<float>(num0));
+    EXPECT_EQ(3.141593E-12f, cudf::convert_fixed_to_floating<float>(num1));
+    EXPECT_EQ(3.141593E-17f, cudf::convert_fixed_to_floating<float>(num2));
+    EXPECT_EQ(3.141593E-22f, cudf::convert_fixed_to_floating<float>(num3));
+    EXPECT_EQ(3.141593E-27f, cudf::convert_fixed_to_floating<float>(num4));
+    EXPECT_EQ(3.141593E-32f, cudf::convert_fixed_to_floating<float>(num5));
+    EXPECT_EQ(3.141593E-37f, cudf::convert_fixed_to_floating<float>(num6));
+
+    // Denormals
+    auto num7  = cudf::convert_floating_to_fixed<decimal64>(3.141593E-39f, scale_type(-48));
+    auto num8  = cudf::convert_floating_to_fixed<decimal64>(3.141593E-41f, scale_type(-50));
+    auto num9  = cudf::convert_floating_to_fixed<decimal64>(3.141593E-43f, scale_type(-52));
+    auto num10 = cudf::convert_floating_to_fixed<decimal64>(FLT_TRUE_MIN, scale_type(-54));
+
+    EXPECT_EQ(3.141593E-39f, cudf::convert_fixed_to_floating<float>(num7));
+    EXPECT_EQ(3.141593E-41f, cudf::convert_fixed_to_floating<float>(num8));
+    EXPECT_EQ(3.141593E-43f, cudf::convert_fixed_to_floating<float>(num9));
+    EXPECT_EQ(FLT_TRUE_MIN, cudf::convert_fixed_to_floating<float>(num10));
+  }
+}
+
+TEST_F(FixedPointTest, PreciseDoubleDecimal64Construction)
+{
+  // Need 17 decimal digits to uniquely represent all doubles (numeric_limits::max_digits10()).
+  // Precise conversion: set the scale factor to 17 less than the order-of-magnitude.
+
+  using decimal64 = fixed_point<int64_t, Radix::BASE_10>;
+
+  // Positive Exponent
+  {
+    auto num0 = cudf::convert_floating_to_fixed<decimal64>(3.141593E8, scale_type(-9));
+    auto num1 = cudf::convert_floating_to_fixed<decimal64>(3.141593E58, scale_type(41));
+    auto num2 = cudf::convert_floating_to_fixed<decimal64>(3.141593E108, scale_type(91));
+    auto num3 = cudf::convert_floating_to_fixed<decimal64>(3.141593E158, scale_type(141));
+    auto num4 = cudf::convert_floating_to_fixed<decimal64>(3.141593E208, scale_type(191));
+    auto num5 = cudf::convert_floating_to_fixed<decimal64>(3.141593E258, scale_type(241));
+    auto num6 = cudf::convert_floating_to_fixed<decimal64>(3.141593E307, scale_type(290));
+
+    EXPECT_EQ(3.141593E8, cudf::convert_fixed_to_floating<double>(num0));
+    EXPECT_EQ(3.141593E58, cudf::convert_fixed_to_floating<double>(num1));
+    EXPECT_EQ(3.141593E108, cudf::convert_fixed_to_floating<double>(num2));
+    EXPECT_EQ(3.141593E158, cudf::convert_fixed_to_floating<double>(num3));
+    EXPECT_EQ(3.141593E208, cudf::convert_fixed_to_floating<double>(num4));
+    EXPECT_EQ(3.141593E258, cudf::convert_fixed_to_floating<double>(num5));
+    EXPECT_EQ(3.141593E307, cudf::convert_fixed_to_floating<double>(num6));
+  }
+
+  // Negative Exponent
+  {
+    auto num0 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-8, scale_type(-25));
+    auto num1 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-58, scale_type(-75));
+    auto num2 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-108, scale_type(-125));
+    auto num3 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-158, scale_type(-175));
+    auto num4 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-208, scale_type(-225));
+    auto num5 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-258, scale_type(-275));
+    auto num6 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-308, scale_type(-325));
+
+    EXPECT_EQ(3.141593E-8, cudf::convert_fixed_to_floating<double>(num0));
+    EXPECT_EQ(3.141593E-58, cudf::convert_fixed_to_floating<double>(num1));
+    EXPECT_EQ(3.141593E-108, cudf::convert_fixed_to_floating<double>(num2));
+    EXPECT_EQ(3.141593E-158, cudf::convert_fixed_to_floating<double>(num3));
+    EXPECT_EQ(3.141593E-208, cudf::convert_fixed_to_floating<double>(num4));
+    EXPECT_EQ(3.141593E-258, cudf::convert_fixed_to_floating<double>(num5));
+    EXPECT_EQ(3.141593E-308, cudf::convert_fixed_to_floating<double>(num6));
+
+    // Denormals
+    auto num7  = cudf::convert_floating_to_fixed<decimal64>(3.141593E-309, scale_type(-326));
+    auto num8  = cudf::convert_floating_to_fixed<decimal64>(3.141593E-314, scale_type(-331));
+    auto num9  = cudf::convert_floating_to_fixed<decimal64>(3.141593E-319, scale_type(-336));
+    auto num10 = cudf::convert_floating_to_fixed<decimal64>(DBL_TRUE_MIN, scale_type(-341));
+
+    EXPECT_EQ(3.141593E-309, cudf::convert_fixed_to_floating<double>(num7));
+    EXPECT_EQ(3.141593E-314, cudf::convert_fixed_to_floating<double>(num8));
+    EXPECT_EQ(3.141593E-319, cudf::convert_fixed_to_floating<double>(num9));
+    EXPECT_EQ(DBL_TRUE_MIN, cudf::convert_fixed_to_floating<double>(num10));
+  }
+}
+
 TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXMath)
 {
   using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
@@ -442,8 +555,6 @@ void float_vector_test(ValueType const initial_value,
                        int32_t const scale,
                        Binop binop)
 {
-  using decimal32 = fixed_point<int32_t, Radix::BASE_10>;
-
   std::vector<decimal32> vec1(size);
   std::vector<ValueType> vec2(size);
 
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index 1d6a3b3304a..7136b162c13 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -3509,9 +3509,9 @@ void testCastFloatToDecimal() {
   @Test
   void testCastDoubleToDecimal() {
     testCastNumericToDecimalsAndBack(DType.FLOAT64, false, 0,
-        () -> ColumnVector.fromBoxedDoubles(1.0, 2.1, -3.23, null, 2.41281, (double) Long.MAX_VALUE),
-        () -> ColumnVector.fromBoxedDoubles(1.0, 2.0, -3.0, null, 2.0, (double) Long.MAX_VALUE),
-        new Long[]{1L, 2L, -3L, null, 2L, Long.MAX_VALUE}
+        () -> ColumnVector.fromBoxedDoubles(1.0, 2.1, -3.23, null, 2.41281, (double) Integer.MAX_VALUE),
+        () -> ColumnVector.fromBoxedDoubles(1.0, 2.0, -3.0, null, 2.0, (double) Integer.MAX_VALUE),
+        new Long[]{1L, 2L, -3L, null, 2L, (long) Integer.MAX_VALUE}
     );
     testCastNumericToDecimalsAndBack(DType.FLOAT64, false, -2,
         () -> ColumnVector.fromBoxedDoubles(1.0, 2.1, -3.23, null, 2.41281, -55.01999),
diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py
index c41a938f6ea..65f739bc74a 100644
--- a/python/cudf/cudf/tests/test_decimal.py
+++ b/python/cudf/cudf/tests/test_decimal.py
@@ -97,7 +97,7 @@ def test_typecast_from_float_to_decimal(request, data, from_dtype, to_dtype):
         pytest.mark.xfail(
             condition=version.parse(pa.__version__) >= version.parse("13.0.0")
             and from_dtype == np.dtype("float32")
-            and to_dtype.precision > 7,
+            and to_dtype.precision > 12,
             reason="https://github.com/rapidsai/cudf/issues/14169",
         )
     )