From 76cae874a6f75c741055e50ebb839620ea98c8a0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 1 Oct 2024 15:35:06 -1000 Subject: [PATCH 001/117] Add string.find_multiple APIs to pylibcudf (#16920) Redo at https://github.com/rapidsai/cudf/pull/16824 Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Matthew Murray (https://github.com/Matt711) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/16920 --- .../pylibcudf/strings/find_multiple.rst | 6 +++ .../api_docs/pylibcudf/strings/index.rst | 1 + .../cudf/cudf/_lib/strings/find_multiple.pyx | 27 ++++--------- .../libcudf/strings/find_multiple.pxd | 2 +- .../pylibcudf/strings/CMakeLists.txt | 1 + .../pylibcudf/pylibcudf/strings/__init__.pxd | 1 + .../pylibcudf/pylibcudf/strings/__init__.py | 1 + .../pylibcudf/strings/find_multiple.pxd | 6 +++ .../pylibcudf/strings/find_multiple.pyx | 39 +++++++++++++++++++ .../tests/test_string_find_multiple.py | 22 +++++++++++ 10 files changed, 85 insertions(+), 21 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst create mode 100644 python/pylibcudf/pylibcudf/strings/find_multiple.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/find_multiple.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst new file mode 100644 index 00000000000..8e86b33b1a0 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst @@ -0,0 +1,6 @@ +============= +find_multiple +============= + +.. automodule:: pylibcudf.strings.find_multiple + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 9b1a6b72a88..7e0d128cfb2 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -9,6 +9,7 @@ strings contains extract find + find_multiple findall regex_flags regex_program diff --git a/python/cudf/cudf/_lib/strings/find_multiple.pyx b/python/cudf/cudf/_lib/strings/find_multiple.pyx index 1358f8e3c2c..39e0013769f 100644 --- a/python/cudf/cudf/_lib/strings/find_multiple.pyx +++ b/python/cudf/cudf/_lib/strings/find_multiple.pyx @@ -1,18 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.find_multiple cimport ( - find_multiple as cpp_find_multiple, -) - from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def find_multiple(Column source_strings, Column target_strings): @@ -20,14 +13,8 @@ def find_multiple(Column source_strings, Column target_strings): Returns a column with character position values where each of the `target_strings` are found in each string of `source_strings`. 
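    For example (a minimal sketch mirroring the new pylibcudf test added
    later in this patch), searching ["abc", "def"] for the targets
    ["a", "c", "e"] yields one position list per row, with -1 marking a
    target that is not found:

        import pyarrow as pa
        import pylibcudf as plc
        col = plc.interop.from_arrow(pa.array(["abc", "def"]))
        tgt = plc.interop.from_arrow(pa.array(["a", "c", "e"]))
        plc.strings.find_multiple.find_multiple(col, tgt)
        # -> lists column [[0, 2, -1], [-1, -1, 1]]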
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view target_view = target_strings.view() - - with nogil: - c_result = move(cpp_find_multiple( - source_view, - target_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.find_multiple.find_multiple( + source_strings.to_pylibcudf(mode="read"), + target_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd index 0491644a10a..3d048c1f50b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd @@ -9,5 +9,5 @@ cdef extern from "cudf/strings/find_multiple.hpp" namespace "cudf::strings" \ nogil: cdef unique_ptr[column] find_multiple( - column_view source_strings, + column_view input, column_view targets) except + diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index 052a0cf3c56..71b1e29afcb 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -20,6 +20,7 @@ set(cython_sources contains.pyx extract.pyx find.pyx + find_multiple.pyx findall.pyx regex_flags.pyx regex_program.pyx diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index 142637ff577..e6e6bee2750 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -9,6 +9,7 @@ from . cimport ( convert, extract, find, + find_multiple, findall, regex_flags, regex_program, diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index decfadd63a4..7f121279969 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -9,6 +9,7 @@ convert, extract, find, + find_multiple, findall, regex_flags, regex_program, diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/strings/find_multiple.pxd new file mode 100644 index 00000000000..b7b3aefa336 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pxd @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column find_multiple(Column input, Column targets) diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx new file mode 100644 index 00000000000..413fc1cb79d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx @@ -0,0 +1,39 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport find_multiple as cpp_find_multiple + + +cpdef Column find_multiple(Column input, Column targets): + """ + Returns a lists column with character position values where each + of the target strings are found in each string. + + For details, see :cpp:func:`cudf::strings::find_multiple`. 
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+    targets : Column
+        Strings to search for in each string
+
+    Returns
+    -------
+    Column
+        Lists column with character position values
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_find_multiple.find_multiple(
+                input.view(),
+                targets.view()
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py
new file mode 100644
index 00000000000..d6b37a388f0
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py
@@ -0,0 +1,22 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+from utils import assert_column_eq
+
+
+def test_find_multiple():
+    arr = pa.array(["abc", "def"])
+    targets = pa.array(["a", "c", "e"])
+    result = plc.strings.find_multiple.find_multiple(
+        plc.interop.from_arrow(arr),
+        plc.interop.from_arrow(targets),
+    )
+    expected = pa.array(
+        [
+            [elem.find(target) for target in targets.to_pylist()]
+            for elem in arr.to_pylist()
+        ],
+        type=pa.list_(pa.int32()),
+    )
+    assert_column_eq(expected, result)

From 6c9064ad074351591f8a4ad757b4d4e32789b8e5 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Tue, 1 Oct 2024 18:42:10 -0700
Subject: [PATCH 002/117] Refactor the `cuda_memcpy` functions to make them more usable (#16945)

As we expanded the use of the `cuda_memcpy` functions, we realized that
they are not very ergonomic, as they require the caller to query
`is_device_accessible` and pass the correct `PAGEABLE`/`PINNED` enum based
on this. This PR aims to make the `cuda_memcpy` functions easier to use,
and the call site changes hopefully showcase this.

The new implementation takes spans as parameters and relies on the
`host_span::is_device_accessible` to enable copy strategies for pinned
memory. Host spans set this flag during construction; creating a host span
from a `cudf::detail::host_vector` will correctly propagate
`is_device_accessible`. Thus, callers can simply* call the `cuda_memcpy`
functions with their containers as parameters and rely on implicit
conversion to `host_span`/`device_span`.
Bonus - there's no way to mix up host and device memory pointers :+1:

Sharp edges:
* Conversion prevents template deduction, so calls that pass containers
as parameters need to specify the template parameter (see changes in
this PR).
* ~The API copies the `min(input.size(), output.size())` bytes, as this
is what we can do safely. This might cause surprises to users if they
unintentionally pass spans of different sizes.
We could instead throw in this case.~

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Bradley Dice (https://github.com/bdice)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/16945
---
 .../cudf/detail/utilities/cuda_memcpy.hpp    | 78 +++++++++++++++----
 .../detail/utilities/vector_factories.hpp    | 16 +---
 cpp/src/io/json/host_tree_algorithms.cu      | 13 +---
 cpp/src/io/utilities/hostdevice_vector.hpp   | 14 +---
 cpp/src/utilities/cuda_memcpy.cu             | 11 +--
 5 files changed, 76 insertions(+), 56 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
index 632d5a732ec..4f0c52c5954 100644
--- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
+++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
@@ -17,6 +17,7 @@
 #pragma once

 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/span.hpp>

 #include <rmm/cuda_stream_view.hpp>

@@ -25,33 +26,82 @@ namespace detail {

 enum class host_memory_kind : uint8_t { PINNED, PAGEABLE };

+void cuda_memcpy_async_impl(
+  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
+
 /**
- * @brief Asynchronously copies data between the host and device.
+ * @brief Asynchronously copies data from host to device memory.
  *
  * Implementation may use different strategies depending on the size and type of host data.
  *
- * @param dst Destination memory address
- * @param src Source memory address
- * @param size Number of bytes to copy
- * @param kind Type of host memory
+ * @param dst Destination device memory
+ * @param src Source host memory
  * @param stream CUDA stream used for the copy
  */
-void cuda_memcpy_async(
-  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
+template <typename T>
+void cuda_memcpy_async(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_view stream)
+{
+  CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async");
+  auto const is_pinned = src.is_device_accessible();
+  cuda_memcpy_async_impl(dst.data(),
+                         src.data(),
+                         src.size_bytes(),
+                         is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
+                         stream);
+}

 /**
- * @brief Synchronously copies data between the host and device.
+ * @brief Asynchronously copies data from device to host memory.
  *
  * Implementation may use different strategies depending on the size and type of host data.
  *
- * @param dst Destination memory address
- * @param src Source memory address
- * @param size Number of bytes to copy
- * @param kind Type of host memory
+ * @param dst Destination host memory
+ * @param src Source device memory
  * @param stream CUDA stream used for the copy
  */
-void cuda_memcpy(
-  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
+template <typename T>
+void cuda_memcpy_async(host_span<T> dst, device_span<T const> src, rmm::cuda_stream_view stream)
+{
+  CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async");
+  auto const is_pinned = dst.is_device_accessible();
+  cuda_memcpy_async_impl(dst.data(),
+                         src.data(),
+                         src.size_bytes(),
+                         is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
+                         stream);
+}
+
+/**
+ * @brief Synchronously copies data from host to device memory.
+ *
+ * Implementation may use different strategies depending on the size and type of host data.
+ *
+ * @param dst Destination device memory
+ * @param src Source host memory
+ * @param stream CUDA stream used for the copy
+ */
+template <typename T>
+void cuda_memcpy(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_view stream)
+{
+  cuda_memcpy_async(dst, src, stream);
+  stream.synchronize();
+}
+
+/**
+ * @brief Synchronously copies data from device to host memory.
+ *
+ * Implementation may use different strategies depending on the size and type of host data.
+ *
+ * @param dst Destination host memory
+ * @param src Source device memory
+ * @param stream CUDA stream used for the copy
+ */
+template <typename T>
+void cuda_memcpy(host_span<T> dst, device_span<T const> src, rmm::cuda_stream_view stream)
+{
+  cuda_memcpy_async(dst, src, stream);
+  stream.synchronize();
+}

 }  // namespace detail
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp
index 953ae5b9308..1f1e7a2db77 100644
--- a/cpp/include/cudf/detail/utilities/vector_factories.hpp
+++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp
@@ -101,12 +101,7 @@ rmm::device_uvector<T> make_device_uvector_async(host_span<T const> source_data,
                                                  rmm::device_async_resource_ref mr)
 {
   rmm::device_uvector<T> ret(source_data.size(), stream, mr);
-  auto const is_pinned = source_data.is_device_accessible();
-  cuda_memcpy_async(ret.data(),
-                    source_data.data(),
-                    source_data.size() * sizeof(T),
-                    is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
-                    stream);
+  cuda_memcpy_async<T>(ret, source_data, stream);
   return ret;
 }

@@ -405,13 +400,8 @@ host_vector<T> make_empty_host_vector(size_t capacity, rmm::cuda_stream_view str
 template <typename T>
 host_vector<T> make_host_vector_async(device_span<T const> v, rmm::cuda_stream_view stream)
 {
-  auto result          = make_host_vector<T>(v.size(), stream);
-  auto const is_pinned = result.get_allocator().is_device_accessible();
-  cuda_memcpy_async(result.data(),
-                    v.data(),
-                    v.size() * sizeof(T),
-                    is_pinned ?
host_memory_kind::PINNED : host_memory_kind::PAGEABLE, - stream); + auto result = make_host_vector(v.size(), stream); + cuda_memcpy_async(result, v, stream); return result; } diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index 5855f1b5a5f..f7e8134b68d 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -634,11 +634,8 @@ std::pair, hashmap_of_device_columns> build_tree is_mixed_type_column[this_col_id] == 1) column_categories[this_col_id] = NC_STR; } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); + cudf::detail::cuda_memcpy_async( + d_column_tree.node_categories, column_categories, stream); } // ignore all children of columns forced as string @@ -653,11 +650,7 @@ std::pair, hashmap_of_device_columns> build_tree forced_as_string_column[this_col_id]) column_categories[this_col_id] = NC_STR; } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories, column_categories, stream); // restore unique_col_ids order std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index aed745c42dd..634e6d78ebc 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -125,23 +125,17 @@ class hostdevice_vector { void host_to_device_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream); + cuda_memcpy_async(d_data, h_data, stream); } - void host_to_device_sync(rmm::cuda_stream_view stream) - { - cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream); - } + void host_to_device_sync(rmm::cuda_stream_view stream) { cuda_memcpy(d_data, h_data, stream); } void device_to_host_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream); + cuda_memcpy_async(h_data, d_data, stream); } - void device_to_host_sync(rmm::cuda_stream_view stream) - { - cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream); - } + void device_to_host_sync(rmm::cuda_stream_view stream) { cuda_memcpy(h_data, d_data, stream); } /** * @brief Converts a hostdevice_vector into a hostdevice_span. 
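// As a usage sketch of the new span-based copies (hypothetical buffer names;
// the explicit template argument is needed because the container-to-span
// conversion prevents template deduction):
//
//   auto h_buf = cudf::detail::make_host_vector<int>(100, stream);
//   rmm::device_uvector<int> d_buf(100, stream);
//   cudf::detail::cuda_memcpy_async<int>(d_buf, h_buf, stream);
//
// Whether h_buf is pinned is picked up through host_span::is_device_accessible,
// which the host_vector's allocator propagates into the span.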
diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index 0efb881eb3e..c0af27a1748 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -30,7 +30,7 @@ namespace cudf::detail { namespace { // Simple kernel to copy between device buffers -CUDF_KERNEL void copy_kernel(char const* src, char* dst, size_t n) +CUDF_KERNEL void copy_kernel(char const* __restrict__ src, char* __restrict__ dst, size_t n) { auto const idx = cudf::detail::grid_1d::global_thread_id(); if (idx < n) { dst[idx] = src[idx]; } @@ -61,7 +61,7 @@ void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_strea }; // namespace -void cuda_memcpy_async( +void cuda_memcpy_async_impl( void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) { if (kind == host_memory_kind::PINNED) { @@ -73,11 +73,4 @@ void cuda_memcpy_async( } } -void cuda_memcpy( - void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) -{ - cuda_memcpy_async(dst, src, size, kind, stream); - stream.synchronize(); -} - } // namespace cudf::detail From bac81cb8f4c61c9a81e30e79d03c323406bf657a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 1 Oct 2024 15:54:05 -1000 Subject: [PATCH 003/117] Add string.split APIs to pylibcudf (#16940) Contributes to https://github.com/rapidsai/cudf/issues/15162 Includes `split/split.pxd` and `split/partition.pxd` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/16940 --- .../api_docs/pylibcudf/strings/index.rst | 1 + .../api_docs/pylibcudf/strings/split.rst | 6 + .../cudf/_lib/strings/split/partition.pyx | 59 +--- python/cudf/cudf/_lib/strings/split/split.pyx | 217 +++--------- python/cudf/cudf/core/column/string.py | 12 +- .../libcudf/strings/split/partition.pxd | 4 +- .../pylibcudf/libcudf/strings/split/split.pxd | 24 +- .../pylibcudf/strings/CMakeLists.txt | 1 + .../pylibcudf/pylibcudf/strings/__init__.pxd | 2 + .../pylibcudf/pylibcudf/strings/__init__.py | 2 + .../pylibcudf/strings/split/CMakeLists.txt | 22 ++ .../pylibcudf/strings/split/__init__.pxd | 2 + .../pylibcudf/strings/split/__init__.py | 2 + .../pylibcudf/strings/split/partition.pxd | 10 + .../pylibcudf/strings/split/partition.pyx | 95 +++++ .../pylibcudf/strings/split/split.pxd | 24 ++ .../pylibcudf/strings/split/split.pyx | 326 ++++++++++++++++++ .../tests/test_string_split_partition.py | 43 +++ .../tests/test_string_split_split.py | 130 +++++++ 19 files changed, 750 insertions(+), 232 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst create mode 100644 python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt create mode 100644 python/pylibcudf/pylibcudf/strings/split/__init__.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/split/__init__.py create mode 100644 python/pylibcudf/pylibcudf/strings/split/partition.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/split/partition.pyx create mode 100644 python/pylibcudf/pylibcudf/strings/split/split.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/split/split.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_split_partition.py create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_split_split.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst 
b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 7e0d128cfb2..e73ea3370ec 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -16,4 +16,5 @@ strings repeat replace slice + split strip diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst new file mode 100644 index 00000000000..cba96e86f45 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst @@ -0,0 +1,6 @@ +===== +split +===== + +.. automodule:: pylibcudf.strings.split + :members: diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx index a81fb18e752..5319addc41c 100644 --- a/python/cudf/cudf/_lib/strings/split/partition.pyx +++ b/python/cudf/cudf/_lib/strings/split/partition.pyx @@ -1,21 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.split.partition cimport ( - partition as cpp_partition, - rpartition as cpp_rpartition, -) -from pylibcudf.libcudf.table.table cimport table - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport data_from_unique_ptr + +import pylibcudf as plc @acquire_spill_lock() @@ -25,25 +14,11 @@ def partition(Column source_strings, Returns data by splitting the `source_strings` column at the first occurrence of the specified `py_delimiter`. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_partition( - source_view, - scalar_str[0] - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.partition.partition( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -53,22 +28,8 @@ def rpartition(Column source_strings, Returns a Column by splitting the `source_strings` column at the last occurrence of the specified `py_delimiter`. 
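    For intuition, a short sketch matching the new pylibcudf tests:
    partitioning "def_g_h" on "_" produces the three columns
    ("def", "_", "g_h"), while rpartition splits at the last occurrence,
    producing ("def_g", "_", "h").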
""" - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rpartition( - source_view, - scalar_str[0] - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.partition.rpartition( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx index f481fea4c51..4ec6c7073d8 100644 --- a/python/cudf/cudf/_lib/strings/split/split.pyx +++ b/python/cudf/cudf/_lib/strings/split/split.pyx @@ -1,33 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program -from pylibcudf.libcudf.strings.split.split cimport ( - rsplit as cpp_rsplit, - rsplit_re as cpp_rsplit_re, - rsplit_record as cpp_rsplit_record, - rsplit_record_re as cpp_rsplit_record_re, - split as cpp_split, - split_re as cpp_split_re, - split_record as cpp_split_record, - split_record_re as cpp_split_record_re, -) -from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport data_from_unique_ptr + +import pylibcudf as plc @acquire_spill_lock() @@ -39,26 +18,12 @@ def split(Column source_strings, column around the specified `py_delimiter`. The split happens from beginning. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_split( - source_view, - scalar_str[0], - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.split( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -70,25 +35,12 @@ def split_record(Column source_strings, column around the specified `py_delimiter`. The split happens from beginning. 
""" - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_split_record( - source_view, - scalar_str[0], - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.split_record( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -100,26 +52,12 @@ def rsplit(Column source_strings, column around the specified `py_delimiter`. The split happens from the end. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rsplit( - source_view, - scalar_str[0], - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.rsplit( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -131,25 +69,12 @@ def rsplit_record(Column source_strings, column around the specified `py_delimiter`. The split happens from the end. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rsplit_record( - source_view, - scalar_str[0], - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.rsplit_record( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -160,24 +85,15 @@ def split_re(Column source_strings, Returns data by splitting the `source_strings` column around the delimiters identified by `pattern`. """ - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_split_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.split_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -189,24 +105,15 @@ def rsplit_re(Column source_strings, column around the delimiters identified by `pattern`. The delimiters are searched starting from the end of each string. 
""" - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_rsplit_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.rsplit_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -217,23 +124,15 @@ def split_record_re(Column source_strings, Returns a Column by splitting the `source_strings` column around the delimiters identified by `pattern`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_split_record_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.split_record_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -245,20 +144,12 @@ def rsplit_record_re(Column source_strings, column around the delimiters identified by `pattern`. The delimiters are searched starting from the end of each string. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_rsplit_record_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.rsplit_record_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 4463e3280df..da422db5eae 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -2546,9 +2546,9 @@ def split( result_table = {0: self._column.copy()} else: if regex is True: - data, _ = libstrings.split_re(self._column, pat, n) + data = libstrings.split_re(self._column, pat, n) else: - data, _ = libstrings.split( + data = libstrings.split( self._column, cudf.Scalar(pat, "str"), n ) if len(data) == 1 and data[0].null_count == len(self._column): @@ -2719,9 +2719,9 @@ def rsplit( result_table = {0: self._column.copy()} else: if regex is True: - data, _ = libstrings.rsplit_re(self._column, pat, n) + data = libstrings.rsplit_re(self._column, pat, n) else: - data, _ = libstrings.rsplit( + data = libstrings.rsplit( self._column, cudf.Scalar(pat, "str"), n ) if len(data) == 1 and data[0].null_count == len(self._column): @@ -2820,7 +2820,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.partition(self._column, cudf.Scalar(sep, "str"))[0], + libstrings.partition(self._column, cudf.Scalar(sep, "str")), expand=expand, ) @@ -2885,7 +2885,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.rpartition(self._column, cudf.Scalar(sep, "str"))[0], + libstrings.rpartition(self._column, cudf.Scalar(sep, "str")), expand=expand, ) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd index 4162e886a7d..4299cf62e99 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd @@ -12,9 +12,9 @@ cdef extern from "cudf/strings/split/partition.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] partition( - column_view source_strings, + column_view input, string_scalar delimiter) except + cdef unique_ptr[table] rpartition( - column_view source_strings, + column_view input, string_scalar delimiter) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd index 3046149aebb..a22a79fc7d7 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd @@ -14,22 +14,22 @@ cdef extern from "cudf/strings/split/split.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] split( - column_view source_strings, + column_view strings_column, string_scalar delimiter, size_type maxsplit) except + cdef unique_ptr[table] rsplit( - column_view source_strings, + 
column_view strings_column, string_scalar delimiter, size_type maxsplit) except + cdef unique_ptr[column] split_record( - column_view source_strings, + column_view strings, string_scalar delimiter, size_type maxsplit) except + cdef unique_ptr[column] rsplit_record( - column_view source_strings, + column_view strings, string_scalar delimiter, size_type maxsplit) except + @@ -38,21 +38,21 @@ cdef extern from "cudf/strings/split/split_re.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] split_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + cdef unique_ptr[table] rsplit_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + cdef unique_ptr[column] split_record_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + cdef unique_ptr[column] rsplit_record_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index 71b1e29afcb..d92f806efbe 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -40,3 +40,4 @@ rapids_cython_create_modules( ) add_subdirectory(convert) +add_subdirectory(split) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index e6e6bee2750..788e2c99ab1 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -15,6 +15,7 @@ from . cimport ( regex_program, replace, slice, + split, strip, translate, ) @@ -35,6 +36,7 @@ __all__ = [ "replace", "slice", "strip", + "split", "side_type", "translate", ] diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index 7f121279969..bcaeb073d0b 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -16,6 +16,7 @@ repeat, replace, slice, + split, strip, translate, ) @@ -36,6 +37,7 @@ "replace", "slice", "strip", + "split", "SideType", "translate", ] diff --git a/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt new file mode 100644 index 00000000000..8f544f6f537 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +set(cython_sources partition.pyx split.pyx) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf +) diff --git a/python/pylibcudf/pylibcudf/strings/split/__init__.pxd b/python/pylibcudf/pylibcudf/strings/split/__init__.pxd new file mode 100644 index 00000000000..72086e57d9f --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/__init__.pxd @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from . cimport partition, split diff --git a/python/pylibcudf/pylibcudf/strings/split/__init__.py b/python/pylibcudf/pylibcudf/strings/split/__init__.py new file mode 100644 index 00000000000..2033e5e275b --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from . import partition, split diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/strings/split/partition.pxd new file mode 100644 index 00000000000..c18257a4787 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + + +cpdef Table partition(Column input, Scalar delimiter=*) + +cpdef Table rpartition(Column input, Scalar delimiter=*) diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyx b/python/pylibcudf/pylibcudf/strings/split/partition.pyx new file mode 100644 index 00000000000..ecc959e65b0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyx @@ -0,0 +1,95 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.strings.split cimport partition as cpp_partition +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + +from cython.operator import dereference + + +cpdef Table partition(Column input, Scalar delimiter=None): + """ + Returns a set of 3 columns by splitting each string using the + specified delimiter. + + For details, see :cpp:func:`cudf::strings::partition`. + + Parameters + ---------- + input : Column + Strings instance for this operation + + delimiter : Scalar + UTF-8 encoded string indicating where to split each string. + + Returns + ------- + Table + New table of strings columns + """ + cdef unique_ptr[table] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + if delimiter is None: + delimiter = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + with nogil: + c_result = move( + cpp_partition.partition( + input.view(), + dereference(c_delimiter) + ) + ) + + return Table.from_libcudf(move(c_result)) + +cpdef Table rpartition(Column input, Scalar delimiter=None): + """ + Returns a set of 3 columns by splitting each string using the + specified delimiter starting from the end of each string. + + For details, see :cpp:func:`cudf::strings::rpartition`. 
+ + Parameters + ---------- + input : Column + Strings instance for this operation + + delimiter : Scalar + UTF-8 encoded string indicating where to split each string. + + Returns + ------- + Table + New strings columns + """ + cdef unique_ptr[table] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + if delimiter is None: + delimiter = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + with nogil: + c_result = move( + cpp_partition.rpartition( + input.view(), + dereference(c_delimiter) + ) + ) + + return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/strings/split/split.pxd new file mode 100644 index 00000000000..355a1874298 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/split.pxd @@ -0,0 +1,24 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.regex_program cimport RegexProgram +from pylibcudf.table cimport Table + + +cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit) + +cpdef Table rsplit(Column strings_column, Scalar delimiter, size_type maxsplit) + +cpdef Column split_record(Column strings, Scalar delimiter, size_type maxsplit) + +cpdef Column rsplit_record(Column strings, Scalar delimiter, size_type maxsplit) + +cpdef Table split_re(Column input, RegexProgram prog, size_type maxsplit) + +cpdef Table rsplit_re(Column input, RegexProgram prog, size_type maxsplit) + +cpdef Column split_record_re(Column input, RegexProgram prog, size_type maxsplit) + +cpdef Column rsplit_record_re(Column input, RegexProgram prog, size_type maxsplit) diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyx b/python/pylibcudf/pylibcudf/strings/split/split.pyx new file mode 100644 index 00000000000..a7d7f39fc47 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/split.pyx @@ -0,0 +1,326 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.split cimport split as cpp_split +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.regex_program cimport RegexProgram +from pylibcudf.table cimport Table + +from cython.operator import dereference + + +cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit): + """ + Returns a list of columns by splitting each string using the + specified delimiter. + + For details, see :cpp:func:`cudf::strings::split`. + + Parameters + ---------- + strings_column : Column + Strings instance for this operation + + delimiter : Scalar + UTF-8 encoded string indicating the split points in each string. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. 
+ + Returns + ------- + Table + New table of strings columns + """ + cdef unique_ptr[table] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_split.split( + strings_column.view(), + dereference(c_delimiter), + maxsplit, + ) + ) + + return Table.from_libcudf(move(c_result)) + + +cpdef Table rsplit(Column strings_column, Scalar delimiter, size_type maxsplit): + """ + Returns a list of columns by splitting each string using the + specified delimiter starting from the end of each string. + + For details, see :cpp:func:`cudf::strings::rsplit`. + + Parameters + ---------- + strings_column : Column + Strings instance for this operation + + delimiter : Scalar + UTF-8 encoded string indicating the split points in each string. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Table + New table of strings columns. + """ + cdef unique_ptr[table] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_split.rsplit( + strings_column.view(), + dereference(c_delimiter), + maxsplit, + ) + ) + + return Table.from_libcudf(move(c_result)) + +cpdef Column split_record(Column strings, Scalar delimiter, size_type maxsplit): + """ + Splits individual strings elements into a list of strings. + + For details, see :cpp:func:`cudf::strings::split_record`. + + Parameters + ---------- + strings : Column + A column of string elements to be split. + + delimiter : Scalar + The string to identify split points in each string. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Column + Lists column of strings. + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_split.split_record( + strings.view(), + dereference(c_delimiter), + maxsplit, + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column rsplit_record(Column strings, Scalar delimiter, size_type maxsplit): + """ + Splits individual strings elements into a list of strings starting + from the end of each string. + + For details, see :cpp:func:`cudf::strings::rsplit_record`. + + Parameters + ---------- + strings : Column + A column of string elements to be split. + + delimiter : Scalar + The string to identify split points in each string. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Column + Lists column of strings. + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_split.rsplit_record( + strings.view(), + dereference(c_delimiter), + maxsplit, + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Table split_re(Column input, RegexProgram prog, size_type maxsplit): + """ + Splits strings elements into a table of strings columns + using a regex_program's pattern to delimit each string. + + For details, see :cpp:func:`cudf::strings::split_re`. + + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Table + A table of columns of strings. 
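    For a concrete sketch (mirroring the accompanying pytest), splitting
    ["a_b_c", "d-e-f"] with a program compiled from "[_-]" and maxsplit=1
    yields two columns, ["a", "d"] and ["b_c", "e-f"].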
+ """ + cdef unique_ptr[table] c_result + + with nogil: + c_result = move( + cpp_split.split_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + ) + + return Table.from_libcudf(move(c_result)) + +cpdef Table rsplit_re(Column input, RegexProgram prog, size_type maxsplit): + """ + Splits strings elements into a table of strings columns + using a regex_program's pattern to delimit each string starting from + the end of the string. + + For details, see :cpp:func:`cudf::strings::rsplit_re`. + + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Table + A table of columns of strings. + """ + cdef unique_ptr[table] c_result + + with nogil: + c_result = move( + cpp_split.rsplit_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + ) + + return Table.from_libcudf(move(c_result)) + +cpdef Column split_record_re(Column input, RegexProgram prog, size_type maxsplit): + """ + Splits strings elements into a list column of strings using the given + regex_program to delimit each string. + + For details, see :cpp:func:`cudf::strings::split_record_re`. + + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Column + Lists column of strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_split.split_record_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column rsplit_record_re(Column input, RegexProgram prog, size_type maxsplit): + """ + Splits strings elements into a list column of strings using the given + regex_program to delimit each string starting from the end of the string. + + For details, see :cpp:func:`cudf::strings::rsplit_record_re`. + + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Column + Lists column of strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_split.rsplit_record_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py new file mode 100644 index 00000000000..80cae8d1c6b --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_table_eq + + +@pytest.fixture +def data_col(): + pa_arr = pa.array(["ab_cd", "def_g_h", None]) + plc_column = plc.interop.from_arrow(pa_arr) + return pa_arr, plc_column + + +def test_partition(data_col): + pa_arr, plc_column = data_col + result = plc.strings.split.partition.partition( + plc_column, plc.interop.from_arrow(pa.scalar("_")) + ) + expected = pa.table( + { + "a": ["ab", "def", None], + "b": ["_", "_", None], + "c": ["cd", "g_h", None], + } + ) + assert_table_eq(expected, result) + + +def test_rpartition(data_col): + pa_arr, plc_column = data_col + result = plc.strings.split.partition.rpartition( + plc_column, plc.interop.from_arrow(pa.scalar("_")) + ) + expected = pa.table( + { + "a": ["ab", "def_g", None], + "b": ["_", "_", None], + "c": ["cd", "h", None], + } + ) + assert_table_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_split.py b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py new file mode 100644 index 00000000000..2aeffac8209 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py @@ -0,0 +1,130 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc +import pytest +from utils import assert_column_eq, assert_table_eq + + +@pytest.fixture +def data_col(): + pa_array = pa.array(["a_b_c", "d-e-f", None]) + plc_column = plc.interop.from_arrow(pa_array) + return pa_array, plc_column + + +@pytest.fixture +def delimiter(): + delimiter = "_" + plc_delimiter = plc.interop.from_arrow(pa.scalar(delimiter)) + return delimiter, plc_delimiter + + +@pytest.fixture +def re_delimiter(): + return "[_-]" + + +def test_split(data_col, delimiter): + _, plc_column = data_col + _, plc_delimiter = delimiter + result = plc.strings.split.split.split(plc_column, plc_delimiter, 1) + expected = pa.table( + { + "a": ["a", "d-e-f", None], + "b": ["b_c", None, None], + } + ) + assert_table_eq(expected, result) + + +def test_rsplit(data_col, delimiter): + _, plc_column = data_col + _, plc_delimiter = delimiter + result = plc.strings.split.split.rsplit(plc_column, plc_delimiter, 1) + expected = pa.table( + { + "a": ["a_b", "d-e-f", None], + "b": ["c", None, None], + } + ) + assert_table_eq(expected, result) + + +def test_split_record(data_col, delimiter): + pa_array, plc_column = data_col + delim, plc_delim = delimiter + result = plc.strings.split.split.split_record(plc_column, plc_delim, 1) + expected = pc.split_pattern(pa_array, delim, max_splits=1) + assert_column_eq(expected, result) + + +def test_rsplit_record(data_col, delimiter): + pa_array, plc_column = data_col + delim, plc_delim = delimiter + result = plc.strings.split.split.split_record(plc_column, plc_delim, 1) + expected = pc.split_pattern(pa_array, delim, max_splits=1) + assert_column_eq(expected, result) + + +def test_split_re(data_col, re_delimiter): + _, plc_column = data_col + result = plc.strings.split.split.split_re( + plc_column, + plc.strings.regex_program.RegexProgram.create( + re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + 1, + ) + expected = pa.table( + { + "a": ["a", "d", None], + "b": ["b_c", "e-f", None], + } + ) + assert_table_eq(expected, result) + + +def test_rsplit_re(data_col, re_delimiter): + _, plc_column = data_col + result = plc.strings.split.split.rsplit_re( + plc_column, + plc.strings.regex_program.RegexProgram.create( + re_delimiter, 
plc.strings.regex_flags.RegexFlags.DEFAULT + ), + 1, + ) + expected = pa.table( + { + "a": ["a_b", "d-e", None], + "b": ["c", "f", None], + } + ) + assert_table_eq(expected, result) + + +def test_split_record_re(data_col, re_delimiter): + pa_array, plc_column = data_col + result = plc.strings.split.split.split_record_re( + plc_column, + plc.strings.regex_program.RegexProgram.create( + re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + 1, + ) + expected = pc.split_pattern_regex(pa_array, re_delimiter, max_splits=1) + assert_column_eq(expected, result) + + +def test_rsplit_record_re(data_col, re_delimiter): + pa_array, plc_column = data_col + result = plc.strings.split.split.rsplit_record_re( + plc_column, + plc.strings.regex_program.RegexProgram.create( + re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + -1, + ) + expected = pc.split_pattern_regex(pa_array, re_delimiter) + assert_column_eq(expected, result) From a6ca0f0068995e5080e1c8d04410a2a1b9dc8b37 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Wed, 2 Oct 2024 10:09:16 -0400 Subject: [PATCH 004/117] Use nvcomp wheel instead of bundling nvcomp (#16946) Contributes to https://github.com/rapidsai/rapids-wheels-planning/issues/74 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16946 --- ci/build_wheel_libcudf.sh | 6 +++++- cpp/cmake/thirdparty/get_nvcomp.cmake | 8 ++------ dependencies.yaml | 28 ++++++++++++++++++++++++++- python/libcudf/CMakeLists.txt | 15 +++++++++----- python/libcudf/pyproject.toml | 3 +++ 5 files changed, 47 insertions(+), 13 deletions(-) diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh index 8975381ceba..91bc071583e 100755 --- a/ci/build_wheel_libcudf.sh +++ b/ci/build_wheel_libcudf.sh @@ -5,11 +5,15 @@ set -euo pipefail package_dir="python/libcudf" +export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" ./ci/build_wheel.sh ${package_dir} RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" mkdir -p ${package_dir}/final_dist -python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/* +python -m auditwheel repair \ + --exclude libnvcomp.so.4 \ + -w ${package_dir}/final_dist \ + ${package_dir}/dist/* RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp ${package_dir}/final_dist diff --git a/cpp/cmake/thirdparty/get_nvcomp.cmake b/cpp/cmake/thirdparty/get_nvcomp.cmake index 41bbf44abc8..1b6a1730161 100644 --- a/cpp/cmake/thirdparty/get_nvcomp.cmake +++ b/cpp/cmake/thirdparty/get_nvcomp.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -16,11 +16,7 @@ function(find_and_configure_nvcomp) include(${rapids-cmake-dir}/cpm/nvcomp.cmake) - rapids_cpm_nvcomp( - BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports - USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP} - ) + rapids_cpm_nvcomp(USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP}) # Per-thread default stream if(TARGET nvcomp AND CUDF_USE_PER_THREAD_DEFAULT_STREAM) diff --git a/dependencies.yaml b/dependencies.yaml index ed36a23e5c3..b192158c4ea 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -15,6 +15,7 @@ files: - depends_on_cupy - depends_on_libkvikio - depends_on_librmm + - depends_on_nvcomp - depends_on_rmm - develop - docs @@ -152,6 +153,13 @@ files: - build_cpp - depends_on_libkvikio - depends_on_librmm + py_run_libcudf: + output: pyproject + pyproject_dir: python/libcudf + extras: + table: project + includes: + - depends_on_nvcomp py_build_pylibcudf: output: pyproject pyproject_dir: python/pylibcudf @@ -367,9 +375,27 @@ dependencies: - fmt>=11.0.2,<12 - flatbuffers==24.3.25 - librdkafka>=2.5.0,<2.6.0a0 + - spdlog>=1.14.1,<1.15 + depends_on_nvcomp: + common: + - output_types: conda + packages: # Align nvcomp version with rapids-cmake - nvcomp==4.0.1 - - spdlog>=1.14.1,<1.15 + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: + cuda: "12.*" + packages: + - nvidia-nvcomp-cu12==4.0.1 + - matrix: + cuda: "11.*" + packages: + - nvidia-nvcomp-cu11==4.0.1 + - matrix: + packages: + - nvidia-nvcomp==4.0.1 rapids_build_skbuild: common: - output_types: [conda, requirements, pyproject] diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt index 0a8f5c4807d..2b208e2e021 100644 --- a/python/libcudf/CMakeLists.txt +++ b/python/libcudf/CMakeLists.txt @@ -22,6 +22,8 @@ project( LANGUAGES CXX ) +option(USE_NVCOMP_RUNTIME_WHEEL "Use the nvcomp wheel at runtime instead of the system library" OFF) + # Check if cudf is already available. If so, it is the user's responsibility to ensure that the # CMake package is also available at build time of the Python cudf package. find_package(cudf "${RAPIDS_VERSION}") @@ -45,8 +47,11 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) add_subdirectory(../../cpp cudf-cpp) -# Ensure other libraries needed by libcudf.so get installed alongside it. -include(cmake/Modules/WheelHelpers.cmake) -install_aliased_imported_targets( - TARGETS cudf nvcomp::nvcomp DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY} -) +if(USE_NVCOMP_RUNTIME_WHEEL) + set(rpaths "$ORIGIN/../../nvidia/nvcomp") + set_property( + TARGET cudf + PROPERTY INSTALL_RPATH ${rpaths} + APPEND + ) +endif() diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml index 5bffe9fd96c..84660cbc276 100644 --- a/python/libcudf/pyproject.toml +++ b/python/libcudf/pyproject.toml @@ -37,6 +37,9 @@ classifiers = [ "Programming Language :: C++", "Environment :: GPU :: NVIDIA CUDA", ] +dependencies = [ + "nvidia-nvcomp==4.0.1", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
[project.urls] Homepage = "https://github.com/rapidsai/cudf" From 63a5d2e708fffde63891d3f4767d444748d8e1dd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 2 Oct 2024 07:24:20 -1000 Subject: [PATCH 005/117] Add string.wrap APIs to pylibcudf (#16935) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16935 --- .../api_docs/pylibcudf/strings/index.rst | 1 + .../api_docs/pylibcudf/strings/wrap.rst | 6 +++ python/cudf/cudf/_lib/strings/wrap.pyx | 24 ++++------- .../pylibcudf/libcudf/strings/wrap.pxd | 2 +- .../pylibcudf/strings/CMakeLists.txt | 1 + .../pylibcudf/pylibcudf/strings/__init__.pxd | 2 + .../pylibcudf/pylibcudf/strings/__init__.py | 2 + python/pylibcudf/pylibcudf/strings/wrap.pxd | 7 ++++ python/pylibcudf/pylibcudf/strings/wrap.pyx | 42 +++++++++++++++++++ .../pylibcudf/tests/test_string_wrap.py | 24 +++++++++++ 10 files changed, 93 insertions(+), 18 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/wrap.rst create mode 100644 python/pylibcudf/pylibcudf/strings/wrap.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/wrap.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_wrap.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index e73ea3370ec..5a06adf6a11 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -18,3 +18,4 @@ strings slice split strip + wrap diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/wrap.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/wrap.rst new file mode 100644 index 00000000000..bd825f78568 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/wrap.rst @@ -0,0 +1,6 @@ +==== +wrap +==== + +.. automodule:: pylibcudf.strings.wrap + :members: diff --git a/python/cudf/cudf/_lib/strings/wrap.pyx b/python/cudf/cudf/_lib/strings/wrap.pyx index eed5cf33b10..2b40f01f818 100644 --- a/python/cudf/cudf/_lib/strings/wrap.pyx +++ b/python/cudf/cudf/_lib/strings/wrap.pyx @@ -1,17 +1,13 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.wrap cimport wrap as cpp_wrap from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def wrap(Column source_strings, @@ -21,14 +17,8 @@ def wrap(Column source_strings, in the Column to be formatted in paragraphs with length less than a given `width`. 
""" - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_wrap( - source_view, - width - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.wrap.wrap( + source_strings.to_pylibcudf(mode="read"), + width + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd index c0053391328..abc1bd43ad2 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd @@ -9,5 +9,5 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/strings/wrap.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] wrap( - column_view source_strings, + column_view input, size_type width) except + diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index d92f806efbe..e3343b38740 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -30,6 +30,7 @@ set(cython_sources slice.pyx strip.pyx translate.pyx + wrap.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index 788e2c99ab1..a61c98fe77c 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -18,6 +18,7 @@ from . cimport ( split, strip, translate, + wrap, ) from .side_type cimport side_type @@ -39,4 +40,5 @@ __all__ = [ "split", "side_type", "translate", + "wrap", ] diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index bcaeb073d0b..ab3ad971db6 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -19,6 +19,7 @@ split, strip, translate, + wrap, ) from .side_type import SideType @@ -40,4 +41,5 @@ "split", "SideType", "translate", + "wrap", ] diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pxd b/python/pylibcudf/pylibcudf/strings/wrap.pxd new file mode 100644 index 00000000000..fcc86650acf --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/wrap.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column wrap(Column input, size_type width) diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pyx b/python/pylibcudf/pylibcudf/strings/wrap.pyx new file mode 100644 index 00000000000..11e31f54eee --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/wrap.pyx @@ -0,0 +1,42 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport wrap as cpp_wrap +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column wrap(Column input, size_type width): + """ + Wraps strings onto multiple lines shorter than `width` by + replacing appropriate white space with + new-line characters (ASCII 0x0A). + + For details, see :cpp:func:`cudf::strings::wrap`. 
+ + Parameters + ---------- + input : Column + String column + + width : int + Maximum character width of a line within each string + + Returns + ------- + Column + Column of wrapped strings + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_wrap.wrap( + input.view(), + width, + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py new file mode 100644 index 00000000000..85abd3a2bae --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py @@ -0,0 +1,24 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import textwrap + +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_wrap(): + pa_array = pa.array( + [ + "the quick brown fox jumped over the lazy brown dog", + "hello, world", + None, + ] + ) + result = plc.strings.wrap.wrap(plc.interop.from_arrow(pa_array), 12) + expected = pa.array( + [ + textwrap.fill(val, 12) if isinstance(val, str) else val + for val in pa_array.to_pylist() + ] + ) + assert_column_eq(expected, result) From 6af1d2294075e4ef6e5a77a52cdadf341a31b1a3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 2 Oct 2024 08:47:02 -1000 Subject: [PATCH 006/117] Add string padding and side_type APIs to pylibcudf (#16833) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16833 --- .../api_docs/pylibcudf/strings/index.rst | 2 + .../api_docs/pylibcudf/strings/padding.rst | 6 + .../api_docs/pylibcudf/strings/side_type.rst | 6 + python/cudf/cudf/_lib/strings/__init__.py | 9 +- python/cudf/cudf/_lib/strings/padding.pyx | 112 +++--------------- python/cudf/cudf/_lib/strings/strip.pyx | 67 +++-------- python/cudf/cudf/core/column/string.py | 4 +- .../pylibcudf/libcudf/strings/padding.pxd | 4 +- .../pylibcudf/libcudf/strings/side_type.pxd | 12 +- .../pylibcudf/libcudf/strings/strip.pxd | 4 +- .../pylibcudf/strings/CMakeLists.txt | 1 + .../pylibcudf/pylibcudf/strings/__init__.pxd | 2 + .../pylibcudf/pylibcudf/strings/__init__.py | 2 + .../pylibcudf/pylibcudf/strings/padding.pxd | 11 ++ .../pylibcudf/pylibcudf/strings/padding.pyx | 75 ++++++++++++ .../pylibcudf/pylibcudf/strings/side_type.pxd | 1 - .../pylibcudf/pylibcudf/strings/side_type.pyx | 1 - .../pylibcudf/tests/test_string_padding.py | 26 ++++ 18 files changed, 175 insertions(+), 170 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/padding.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/side_type.rst create mode 100644 python/pylibcudf/pylibcudf/strings/padding.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/padding.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_padding.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 5a06adf6a11..48dc8a13c3e 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -11,10 +11,12 @@ strings find find_multiple findall + padding regex_flags regex_program repeat replace + side_type slice split strip diff --git 
a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/padding.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/padding.rst new file mode 100644 index 00000000000..5b417024fd5 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/padding.rst @@ -0,0 +1,6 @@ +======= +padding +======= + +.. automodule:: pylibcudf.strings.padding + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/side_type.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/side_type.rst new file mode 100644 index 00000000000..d5aef9c4f75 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/side_type.rst @@ -0,0 +1,6 @@ +========= +side_type +========= + +.. automodule:: pylibcudf.strings.side_type + :members: diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 4bf8a9b1a8f..049dbab4851 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -73,14 +73,7 @@ from cudf._lib.strings.find_multiple import find_multiple from cudf._lib.strings.findall import findall from cudf._lib.strings.json import GetJsonObjectOptions, get_json_object -from cudf._lib.strings.padding import ( - SideType, - center, - ljust, - pad, - rjust, - zfill, -) +from cudf._lib.strings.padding import center, ljust, pad, rjust, zfill from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence from cudf._lib.strings.replace import ( insert, diff --git a/python/cudf/cudf/_lib/strings/padding.pyx b/python/cudf/cudf/_lib/strings/padding.pyx index d0239e91ec3..015a2ebab8a 100644 --- a/python/cudf/cudf/_lib/strings/padding.pyx +++ b/python/cudf/cudf/_lib/strings/padding.pyx @@ -1,64 +1,31 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from enum import IntEnum - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.strings.padding cimport ( - pad as cpp_pad, - zfill as cpp_zfill, -) -from pylibcudf.libcudf.strings.side_type cimport ( - side_type, - underlying_type_t_side_type, -) - - -class SideType(IntEnum): - LEFT = side_type.LEFT - RIGHT = side_type.RIGHT - BOTH = side_type.BOTH +import pylibcudf as plc @acquire_spill_lock() def pad(Column source_strings, size_type width, fill_char, - side=SideType.LEFT): + side=plc.strings.side_type.SideType.LEFT): """ Returns a Column by padding strings in `source_strings` up to the given `width`. Direction of padding is to be specified by `side`. The additional characters being filled can be changed by specifying `fill_char`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() - - cdef side_type pad_direction = ( - side + plc_result = plc.strings.padding.pad( + source_strings.to_pylibcudf(mode="read"), + width, + side, + fill_char, ) - - with nogil: - c_result = move(cpp_pad( - source_view, - width, - pad_direction, - f_char - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -68,19 +35,13 @@ def zfill(Column source_strings, Returns a Column by prepending strings in `source_strings` with '0' characters up to the given `width`. 
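    For example, "7" with `width=3` becomes "007".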
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_zfill( - source_view, - width - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.padding.zfill( + source_strings.to_pylibcudf(mode="read"), + width + ) + return Column.from_pylibcudf(plc_result) -@acquire_spill_lock() def center(Column source_strings, size_type width, fill_char): @@ -89,23 +50,9 @@ def center(Column source_strings, in `source_strings` with additional character, `fill_char` up to the given `width`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() - - with nogil: - c_result = move(cpp_pad( - source_view, - width, - side_type.BOTH, - f_char - )) + return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.BOTH) - return Column.from_unique_ptr(move(c_result)) - -@acquire_spill_lock() def ljust(Column source_strings, size_type width, fill_char): @@ -113,23 +60,9 @@ def ljust(Column source_strings, Returns a Column by filling right side of strings in `source_strings` with additional character, `fill_char` up to the given `width`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() + return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.RIGHT) - with nogil: - c_result = move(cpp_pad( - source_view, - width, - side_type.RIGHT, - f_char - )) - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() def rjust(Column source_strings, size_type width, fill_char): @@ -137,17 +70,4 @@ def rjust(Column source_strings, Returns a Column by filling left side of strings in `source_strings` with additional character, `fill_char` up to the given `width`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() - - with nogil: - c_result = move(cpp_pad( - source_view, - width, - side_type.LEFT, - f_char - )) - - return Column.from_unique_ptr(move(c_result)) + return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.LEFT) diff --git a/python/cudf/cudf/_lib/strings/strip.pyx b/python/cudf/cudf/_lib/strings/strip.pyx index 38ecb21a94c..982c5a600e7 100644 --- a/python/cudf/cudf/_lib/strings/strip.pyx +++ b/python/cudf/cudf/_lib/strings/strip.pyx @@ -1,18 +1,8 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.side_type cimport side_type -from pylibcudf.libcudf.strings.strip cimport strip as cpp_strip - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar import pylibcudf as plc @@ -24,15 +14,12 @@ def strip(Column source_strings, The set of characters need be stripped from left and right side can be specified by `py_repl`. 
""" - - cdef DeviceScalar repl = py_repl.device_value - return Column.from_pylibcudf( - plc.strings.strip.strip( - source_strings.to_pylibcudf(mode="read"), - plc.strings.SideType.BOTH, - repl.c_value - ) + plc_result = plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.side_type.SideType.BOTH, + py_repl.device_value.c_value, ) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -43,24 +30,12 @@ def lstrip(Column source_strings, The set of characters need be stripped from left side can be specified by `py_repl`. """ - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() + plc_result = plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.side_type.SideType.LEFT, + py_repl.device_value.c_value, ) - - with nogil: - c_result = move(cpp_strip( - source_view, - side_type.LEFT, - scalar_str[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -71,21 +46,9 @@ def rstrip(Column source_strings, The set of characters need be stripped from right side can be specified by `py_repl`. """ - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() + plc_result = plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.side_type.SideType.RIGHT, + py_repl.device_value.c_value, ) - - with nogil: - c_result = move(cpp_strip( - source_view, - side_type.RIGHT, - scalar_str[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index da422db5eae..88df57b1b3b 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -11,6 +11,8 @@ import pandas as pd import pyarrow as pa +import pylibcudf as plc + import cudf import cudf.api.types from cudf import _lib as libcudf @@ -2966,7 +2968,7 @@ def pad( raise TypeError(msg) try: - side = libstrings.SideType[side.upper()] + side = plc.strings.side_type.SideType[side.upper()] except KeyError: raise ValueError( "side has to be either one of {'left', 'right', 'both'}" diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd index 657fe61eb14..875f8cafd14 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd @@ -12,11 +12,11 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/strings/padding.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] pad( - column_view source_strings, + column_view input, size_type width, side_type side, string fill_char) except + cdef unique_ptr[column] zfill( - column_view source_strings, + column_view input, size_type width) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd index 019ff3f17ba..e92c5dc1d66 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd @@ -1,12 +1,10 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. 
-from libc.stdint cimport int32_t +from libcpp cimport int cdef extern from "cudf/strings/side_type.hpp" namespace "cudf::strings" nogil: - cpdef enum class side_type(int32_t): - LEFT 'cudf::strings::side_type::LEFT' - RIGHT 'cudf::strings::side_type::RIGHT' - BOTH 'cudf::strings::side_type::BOTH' - -ctypedef int32_t underlying_type_t_side_type + cpdef enum class side_type(int): + LEFT + RIGHT + BOTH diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd index b0ca771762d..dd527a78e7f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd @@ -10,6 +10,6 @@ from pylibcudf.libcudf.strings.side_type cimport side_type cdef extern from "cudf/strings/strip.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] strip( - column_view source_strings, - side_type stype, + column_view input, + side_type side, string_scalar to_strip) except + diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index e3343b38740..eeb44d19333 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -22,6 +22,7 @@ set(cython_sources find.pyx find_multiple.pyx findall.pyx + padding.pyx regex_flags.pyx regex_program.pyx repeat.pyx diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index a61c98fe77c..187ef113073 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -11,9 +11,11 @@ from . cimport ( find, find_multiple, findall, + padding, regex_flags, regex_program, replace, + side_type, slice, split, strip, diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index ab3ad971db6..6033cea0625 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -11,10 +11,12 @@ find, find_multiple, findall, + padding, regex_flags, regex_program, repeat, replace, + side_type, slice, split, strip, diff --git a/python/pylibcudf/pylibcudf/strings/padding.pxd b/python/pylibcudf/pylibcudf/strings/padding.pxd new file mode 100644 index 00000000000..a035a5ad187 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/padding.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.string cimport string +from pylibcudf.column cimport Column +from pylibcudf.libcudf.strings.side_type cimport side_type +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column pad(Column input, size_type width, side_type side, str fill_char) + +cpdef Column zfill(Column input, size_type width) diff --git a/python/pylibcudf/pylibcudf/strings/padding.pyx b/python/pylibcudf/pylibcudf/strings/padding.pyx new file mode 100644 index 00000000000..24daaaa3838 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/padding.pyx @@ -0,0 +1,75 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport padding as cpp_padding +from pylibcudf.libcudf.strings.side_type cimport side_type + + +cpdef Column pad(Column input, size_type width, side_type side, str fill_char): + """ + Add padding to each string using a provided character. 
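+
+ For example, padding "a" to ``width=3`` with ``fill_char="!"`` and
+ ``side=SideType.LEFT`` yields "!!a".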
+ + For details, see :cpp:func:`cudf::strings::pad`. + + Parameters + ---------- + input : Column + Strings instance for this operation + width : int + The minimum number of characters for each string. + side : SideType + Where to place the padding characters. + fill_char : str + Single UTF-8 character to use for padding + + Returns + ------- + Column + New column with padded strings. + """ + cdef unique_ptr[column] c_result + cdef string c_fill_char = fill_char.encode("utf-8") + + with nogil: + c_result = move( + cpp_padding.pad( + input.view(), + width, + side, + c_fill_char, + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column zfill(Column input, size_type width): + """ + Add '0' as padding to the left of each string. + + For details, see :cpp:func:`cudf::strings::zfill`. + + Parameters + ---------- + input : Column + Strings instance for this operation + width : int + The minimum number of characters for each string. + + Returns + ------- + Column + New column of strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_padding.zfill( + input.view(), + width, + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/strings/side_type.pxd index 34b7a580380..34b03e9bc27 100644 --- a/python/pylibcudf/pylibcudf/strings/side_type.pxd +++ b/python/pylibcudf/pylibcudf/strings/side_type.pxd @@ -1,3 +1,2 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - from pylibcudf.libcudf.strings.side_type cimport side_type diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/strings/side_type.pyx index acdc7d6ff1f..cf0c770cc11 100644 --- a/python/pylibcudf/pylibcudf/strings/side_type.pyx +++ b/python/pylibcudf/pylibcudf/strings/side_type.pyx @@ -1,4 +1,3 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - from pylibcudf.libcudf.strings.side_type import \ side_type as SideType # no-cython-lint diff --git a/python/pylibcudf/pylibcudf/tests/test_string_padding.py b/python/pylibcudf/pylibcudf/tests/test_string_padding.py new file mode 100644 index 00000000000..2ba775d17ae --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_padding.py @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc + + +def test_pad(): + arr = pa.array(["a", "1", None]) + plc_result = plc.strings.padding.pad( + plc.interop.from_arrow(arr), + 2, + plc.strings.side_type.SideType.LEFT, + "!", + ) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array(pc.utf8_lpad(arr, 2, padding="!")) + assert result.equals(expected) + + +def test_zfill(): + arr = pa.array(["a", "1", None]) + plc_result = plc.strings.padding.zfill(plc.interop.from_arrow(arr), 2) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array(pc.utf8_lpad(arr, 2, padding="0")) + assert result.equals(expected) From 8a9df040e18b2f54df67ad6fde94969990e61b7f Mon Sep 17 00:00:00 2001 From: Ray Douglass <3107146+raydouglass@users.noreply.github.com> Date: Wed, 2 Oct 2024 14:59:00 -0400 Subject: [PATCH 007/117] Add license to the pylibcudf wheel (#16976) Add the license file symlink to the `pylibcudf` wheels --- python/pylibcudf/LICENSE | 1 + 1 file changed, 1 insertion(+) create mode 120000 python/pylibcudf/LICENSE diff --git a/python/pylibcudf/LICENSE b/python/pylibcudf/LICENSE new file mode 120000 index 00000000000..30cff7403da --- /dev/null +++ b/python/pylibcudf/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file From 466e37973d3b9aef4d14a7aa0cd48df0b886300d Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 2 Oct 2024 20:09:21 -0400 Subject: [PATCH 008/117] Fix performance regression for generate_character_ngrams (#16849) Fixes performance regression in `nvtext::generate_character_ngrams` introduced in #16212. Thread-per-row kernel is faster for smaller strings. Authors: - David Wendt (https://github.com/davidwendt) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16849 --- cpp/src/text/generate_ngrams.cu | 50 ++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index a87ecb81b9d..997b0278fe2 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -48,6 +49,9 @@ namespace nvtext { namespace detail { namespace { +// long strings threshold found with benchmarking +constexpr cudf::size_type AVG_CHAR_BYTES_THRESHOLD = 64; + /** * @brief Generate ngrams from strings column. * @@ -173,33 +177,39 @@ constexpr cudf::thread_index_type bytes_per_thread = 4; /** * @brief Counts the number of ngrams in each row of the given strings column * - * Each warp processes a single string. + * Each warp/thread processes a single string. * Formula is `count = max(0,str.length() - ngrams + 1)` * If a string has less than ngrams characters, its count is 0. 
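 * For example, with `ngrams = 3` a string of length 5 yields max(0, 5 - 3 + 1) = 3 ngrams.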
*/ CUDF_KERNEL void count_char_ngrams_kernel(cudf::column_device_view const d_strings, cudf::size_type ngrams, + cudf::size_type tile_size, cudf::size_type* d_counts) { auto const idx = cudf::detail::grid_1d::global_thread_id(); - auto const str_idx = idx / cudf::detail::warp_size; + auto const str_idx = idx / tile_size; if (str_idx >= d_strings.size()) { return; } if (d_strings.is_null(str_idx)) { d_counts[str_idx] = 0; return; } + auto const d_str = d_strings.element(str_idx); + if (tile_size == 1) { + d_counts[str_idx] = cuda::std::max(0, (d_str.length() + 1 - ngrams)); + return; + } + namespace cg = cooperative_groups; auto const warp = cg::tiled_partition(cg::this_thread_block()); - auto const d_str = d_strings.element(str_idx); - auto const end = d_str.data() + d_str.size_bytes(); + auto const end = d_str.data() + d_str.size_bytes(); auto const lane_idx = warp.thread_rank(); cudf::size_type count = 0; for (auto itr = d_str.data() + (lane_idx * bytes_per_thread); itr < end; - itr += cudf::detail::warp_size * bytes_per_thread) { + itr += tile_size * bytes_per_thread) { for (auto s = itr; (s < (itr + bytes_per_thread)) && (s < end); ++s) { count += static_cast(cudf::strings::detail::is_begin_utf8_char(*s)); } @@ -256,19 +266,27 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie "Parameter ngrams should be an integer value of 2 or greater", std::invalid_argument); - auto const strings_count = input.size(); - if (strings_count == 0) { // if no strings, return an empty column - return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + if (input.is_empty()) { // if no strings, return an empty column + return cudf::lists::detail::make_empty_lists_column( + cudf::data_type{cudf::type_id::STRING}, stream, mr); + } + if (input.size() == input.null_count()) { + return cudf::lists::detail::make_all_nulls_lists_column( + input.size(), cudf::data_type{cudf::type_id::STRING}, stream, mr); } auto const d_strings = cudf::column_device_view::create(input.parent(), stream); auto [offsets, total_ngrams] = [&] { - auto counts = rmm::device_uvector(input.size(), stream); - auto const num_blocks = cudf::util::div_rounding_up_safe( - static_cast(input.size()) * cudf::detail::warp_size, block_size); - count_char_ngrams_kernel<<>>( - *d_strings, ngrams, counts.data()); + auto counts = rmm::device_uvector(input.size(), stream); + auto const avg_char_bytes = (input.chars_size(stream) / (input.size() - input.null_count())); + auto const tile_size = (avg_char_bytes < AVG_CHAR_BYTES_THRESHOLD) + ? 
1 // thread per row
+ : cudf::detail::warp_size; // warp per row
+ auto const grid = cudf::detail::grid_1d(
+ static_cast(input.size()) * tile_size, block_size);
+ count_char_ngrams_kernel<<>>(
+ *d_strings, ngrams, tile_size, counts.data());
 return cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr);
 }();
 auto d_offsets = offsets->view().data();
@@ -277,8 +295,8 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie
 "Insufficient number of characters in each string to generate ngrams");
 character_ngram_generator_fn generator{*d_strings, ngrams, d_offsets};
- auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
- generator, strings_count, total_ngrams, stream, mr);
+ auto [offsets_column, chars] =
+ cudf::strings::detail::make_strings_children(generator, input.size(), total_ngrams, stream, mr);
 auto output = cudf::make_strings_column(
 total_ngrams, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{});
@@ -368,7 +386,7 @@ std::unique_ptr hash_character_ngrams(cudf::strings_column_view co
 auto [offsets, total_ngrams] = [&] {
 auto counts = rmm::device_uvector(input.size(), stream);
 count_char_ngrams_kernel<<>>(
- *d_strings, ngrams, counts.data());
+ *d_strings, ngrams, cudf::detail::warp_size, counts.data());
 return cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr);
 }();
 auto d_offsets = offsets->view().data();

From 7ae536031effd31d1c7aab63d1af812b0fc2a291 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 2 Oct 2024 20:26:17 -0700
Subject: [PATCH 009/117] Batch memcpy the last offsets for output buffers of
 str and list cols in PQ reader (#16905)

This PR adds the capability to batch memcpy the last offsets for the
output buffers of string and list columns in the PQ reader. This reduces
the overhead from several `cudaMemcpyAsync` calls when reading tables
with wide string and/or list columns. This optimization was found by
@vuule, who also contributed the ORC changes. See this
[comment](https://github.com/rapidsai/cudf/pull/16905#issuecomment-2375532577)
for performance improvement data and discussion.
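As a rough sketch of the pattern (illustrative only: `scatter_buffers` and its
arguments are made-up names, but `cudf::detail::batched_memcpy_async` is the
helper this PR adds on top of `cub::DeviceMemcpy::Batched`):

```cpp
#include <cudf/detail/utilities/batched_memcpy.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <cstddef>

// Copy many small, scattered buffers with a single batched launch instead of
// issuing one cudaMemcpyAsync per buffer. All three arrays are device-resident.
void scatter_buffers(rmm::device_uvector<void*> const& srcs,
                     rmm::device_uvector<void*> const& dsts,
                     rmm::device_uvector<std::size_t> const& sizes,  // bytes
                     rmm::cuda_stream_view stream)
{
  cudf::detail::batched_memcpy_async(
    srcs.begin(), dsts.begin(), sizes.begin(), srcs.size(), stream);
}
```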
Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16905 --- cpp/benchmarks/CMakeLists.txt | 5 - .../io/utilities/batched_memset_bench.cpp | 101 ------------- .../cudf/detail/utilities/batched_memcpy.hpp | 67 +++++++++ .../utilities}/batched_memset.hpp | 4 +- cpp/src/io/orc/stripe_enc.cu | 64 +++++--- cpp/src/io/parquet/page_data.cu | 26 ++++ cpp/src/io/parquet/parquet_gpu.hpp | 12 ++ cpp/src/io/parquet/reader_impl.cpp | 24 ++- cpp/src/io/parquet/reader_impl_preprocess.cu | 6 +- cpp/tests/CMakeLists.txt | 3 +- .../utilities_tests/batched_memcpy_tests.cu | 139 ++++++++++++++++++ .../utilities_tests/batched_memset_tests.cu | 4 +- 12 files changed, 308 insertions(+), 147 deletions(-) delete mode 100644 cpp/benchmarks/io/utilities/batched_memset_bench.cpp create mode 100644 cpp/include/cudf/detail/utilities/batched_memcpy.hpp rename cpp/include/cudf/{io/detail => detail/utilities}/batched_memset.hpp (98%) create mode 100644 cpp/tests/utilities_tests/batched_memcpy_tests.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 4113e38dcf4..110b4557840 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -392,11 +392,6 @@ ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader ConfigureNVBench(JSON_READER_OPTION_NVBENCH io/json/json_reader_option.cpp) ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp) -# ################################################################################################## -# * multi buffer memset benchmark -# ---------------------------------------------------------------------- -ConfigureNVBench(BATCHED_MEMSET_BENCH io/utilities/batched_memset_bench.cpp) - # ################################################################################################## # * io benchmark --------------------------------------------------------------------- ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp) diff --git a/cpp/benchmarks/io/utilities/batched_memset_bench.cpp b/cpp/benchmarks/io/utilities/batched_memset_bench.cpp deleted file mode 100644 index 2905895a63b..00000000000 --- a/cpp/benchmarks/io/utilities/batched_memset_bench.cpp +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include - -#include -#include - -#include - -// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to -// run on most GPUs, but large enough to allow highest throughput -constexpr size_t data_size = 512 << 20; - -void parquet_read_common(cudf::size_type num_rows_to_read, - cudf::size_type num_cols_to_read, - cuio_source_sink_pair& source_sink, - nvbench::state& state) -{ - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); - - auto mem_stats_logger = cudf::memory_stats_logger(); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.exec( - nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { - try_drop_l3_cache(); - - timer.start(); - auto const result = cudf::io::read_parquet(read_opts); - timer.stop(); - - CUDF_EXPECTS(result.tbl->num_columns() == num_cols_to_read, "Unexpected number of columns"); - CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows"); - }); - - auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); - state.add_buffer_size( - mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); - state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); -} - -template -void bench_batched_memset(nvbench::state& state, nvbench::type_list>) -{ - auto const d_type = get_type_or_group(static_cast(DataType)); - auto const num_cols = static_cast(state.get_int64("num_cols")); - auto const cardinality = static_cast(state.get_int64("cardinality")); - auto const run_length = static_cast(state.get_int64("run_length")); - auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); - auto const compression = cudf::io::compression_type::NONE; - cuio_source_sink_pair source_sink(source_type); - auto const tbl = - create_random_table(cycle_dtypes(d_type, num_cols), - table_size_bytes{data_size}, - data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); - auto const view = tbl->view(); - - cudf::io::parquet_writer_options write_opts = - cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) - .compression(compression); - cudf::io::write_parquet(write_opts); - auto const num_rows = view.num_rows(); - - parquet_read_common(num_rows, num_cols, source_sink, state); -} - -using d_type_list = nvbench::enum_type_list; - -NVBENCH_BENCH_TYPES(bench_batched_memset, NVBENCH_TYPE_AXES(d_type_list)) - .set_name("batched_memset") - .set_type_axes_names({"data_type"}) - .add_int64_axis("num_cols", {1000}) - .add_string_axis("io_type", {"DEVICE_BUFFER"}) - .set_min_samples(4) - .add_int64_axis("cardinality", {0, 1000}) - .add_int64_axis("run_length", {1, 32}); diff --git a/cpp/include/cudf/detail/utilities/batched_memcpy.hpp b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp new file mode 100644 index 00000000000..ed0ab9e6e5b --- /dev/null +++ b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include +#include +#include + +namespace CUDF_EXPORT cudf { +namespace detail { + +/** + * @brief A helper function that copies a vector of vectors from source to destination addresses in + * a batched manner. + * + * @tparam SrcIterator **[inferred]** The type of device-accessible source addresses iterator + * @tparam DstIterator **[inferred]** The type of device-accessible destination address iterator + * @tparam SizeIterator **[inferred]** The type of device-accessible buffer size iterator + * + * @param src_iter Device-accessible iterator to source addresses + * @param dst_iter Device-accessible iterator to destination addresses + * @param size_iter Device-accessible iterator to the buffer sizes (in bytes) + * @param num_buffs Number of buffers to be copied + * @param stream CUDA stream to use + */ +template +void batched_memcpy_async(SrcIterator src_iter, + DstIterator dst_iter, + SizeIterator size_iter, + size_t num_buffs, + rmm::cuda_stream_view stream) +{ + size_t temp_storage_bytes = 0; + cub::DeviceMemcpy::Batched( + nullptr, temp_storage_bytes, src_iter, dst_iter, size_iter, num_buffs, stream.value()); + + rmm::device_buffer d_temp_storage{temp_storage_bytes, stream.value()}; + + cub::DeviceMemcpy::Batched(d_temp_storage.data(), + temp_storage_bytes, + src_iter, + dst_iter, + size_iter, + num_buffs, + stream.value()); +} + +} // namespace detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/detail/batched_memset.hpp b/cpp/include/cudf/detail/utilities/batched_memset.hpp similarity index 98% rename from cpp/include/cudf/io/detail/batched_memset.hpp rename to cpp/include/cudf/detail/utilities/batched_memset.hpp index 1c74be4a9fe..75f738f7529 100644 --- a/cpp/include/cudf/io/detail/batched_memset.hpp +++ b/cpp/include/cudf/detail/utilities/batched_memset.hpp @@ -28,7 +28,7 @@ #include namespace CUDF_EXPORT cudf { -namespace io::detail { +namespace detail { /** * @brief A helper function that takes in a vector of device spans and memsets them to the @@ -78,5 +78,5 @@ void batched_memset(std::vector> const& bufs, d_temp_storage.data(), temp_storage_bytes, iter_in, iter_out, sizes, num_bufs, stream); } -} // namespace io::detail +} // namespace detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 5c70e35fd2e..ed0b6969154 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -20,6 +20,8 @@ #include "orc_gpu.hpp" #include +#include +#include #include #include #include @@ -1087,37 +1089,42 @@ CUDF_KERNEL void __launch_bounds__(block_size) /** * @brief Merge chunked column data into a single contiguous stream * - * @param[in,out] strm_desc StripeStream device array [stripe][stream] - * @param[in,out] streams List of encoder chunk streams [column][rowgroup] + * @param[in] strm_desc StripeStream device array [stripe][stream] + * @param[in] streams List of encoder chunk streams [column][rowgroup] + * @param[out] srcs List of source encoder chunk stream data addresses + * @param[out] dsts List of destination 
StripeStream data addresses + * @param[out] sizes List of stream sizes in bytes */ // blockDim {compact_streams_block_size,1,1} CUDF_KERNEL void __launch_bounds__(compact_streams_block_size) - gpuCompactOrcDataStreams(device_2dspan strm_desc, - device_2dspan streams) + gpuInitBatchedMemcpy(device_2dspan strm_desc, + device_2dspan streams, + device_span srcs, + device_span dsts, + device_span sizes) { - __shared__ __align__(16) StripeStream ss; - - auto const stripe_id = blockIdx.x; + auto const stripe_id = cudf::detail::grid_1d::global_thread_id(); auto const stream_id = blockIdx.y; - auto const t = threadIdx.x; + if (stripe_id >= strm_desc.size().first) { return; } - if (t == 0) { ss = strm_desc[stripe_id][stream_id]; } - __syncthreads(); + auto const out_id = stream_id * strm_desc.size().first + stripe_id; + StripeStream ss = strm_desc[stripe_id][stream_id]; if (ss.data_ptr == nullptr) { return; } auto const cid = ss.stream_type; auto dst_ptr = ss.data_ptr; for (auto group = ss.first_chunk_id; group < ss.first_chunk_id + ss.num_chunks; ++group) { + auto const out_id = stream_id * streams.size().second + group; + srcs[out_id] = streams[ss.column_id][group].data_ptrs[cid]; + dsts[out_id] = dst_ptr; + + // Also update the stream here, data will be copied in a separate kernel + streams[ss.column_id][group].data_ptrs[cid] = dst_ptr; + auto const len = streams[ss.column_id][group].lengths[cid]; - if (len > 0) { - auto const src_ptr = streams[ss.column_id][group].data_ptrs[cid]; - for (uint32_t i = t; i < len; i += blockDim.x) { - dst_ptr[i] = src_ptr[i]; - } - __syncthreads(); - } - if (t == 0) { streams[ss.column_id][group].data_ptrs[cid] = dst_ptr; } + // len is the size (in bytes) of the current stream. + sizes[out_id] = len; dst_ptr += len; } } @@ -1325,9 +1332,26 @@ void CompactOrcDataStreams(device_2dspan strm_desc, device_2dspan enc_streams, rmm::cuda_stream_view stream) { + auto const num_rowgroups = enc_streams.size().second; + auto const num_streams = strm_desc.size().second; + auto const num_stripes = strm_desc.size().first; + auto const num_chunks = num_rowgroups * num_streams; + auto srcs = cudf::detail::make_zeroed_device_uvector_async( + num_chunks, stream, rmm::mr::get_current_device_resource()); + auto dsts = cudf::detail::make_zeroed_device_uvector_async( + num_chunks, stream, rmm::mr::get_current_device_resource()); + auto lengths = cudf::detail::make_zeroed_device_uvector_async( + num_chunks, stream, rmm::mr::get_current_device_resource()); + dim3 dim_block(compact_streams_block_size, 1); - dim3 dim_grid(strm_desc.size().first, strm_desc.size().second); - gpuCompactOrcDataStreams<<>>(strm_desc, enc_streams); + dim3 dim_grid(cudf::util::div_rounding_up_unsafe(num_stripes, compact_streams_block_size), + strm_desc.size().second); + gpuInitBatchedMemcpy<<>>( + strm_desc, enc_streams, srcs, dsts, lengths); + + // Copy streams in a batched manner. 
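+ // (a single cub::DeviceMemcpy::Batched launch instead of a per-stripe copy kernel)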
+ cudf::detail::batched_memcpy_async( + srcs.begin(), dsts.begin(), lengths.begin(), lengths.size(), stream); } std::optional CompressOrcDataStreams( diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index e0d50d7ccf9..b3276c81c1f 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -17,6 +17,8 @@ #include "page_data.cuh" #include "page_decode.cuh" +#include + #include #include @@ -466,4 +468,28 @@ void __host__ DecodeSplitPageData(cudf::detail::hostdevice_span pages, } } +void WriteFinalOffsets(host_span offsets, + host_span buff_addrs, + rmm::cuda_stream_view stream) +{ + // Copy offsets to device and create an iterator + auto d_src_data = cudf::detail::make_device_uvector_async( + offsets, stream, cudf::get_current_device_resource_ref()); + // Iterator for the source (scalar) data + auto src_iter = cudf::detail::make_counting_transform_iterator( + static_cast(0), + cuda::proclaim_return_type( + [src = d_src_data.begin()] __device__(std::size_t i) { return src + i; })); + + // Copy buffer addresses to device and create an iterator + auto d_dst_addrs = cudf::detail::make_device_uvector_async( + buff_addrs, stream, cudf::get_current_device_resource_ref()); + // size_iter is simply a constant iterator of sizeof(size_type) bytes. + auto size_iter = thrust::make_constant_iterator(sizeof(size_type)); + + // Copy offsets to buffers in batched manner. + cudf::detail::batched_memcpy_async( + src_iter, d_dst_addrs.begin(), size_iter, offsets.size(), stream); +} + } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index e631e12119d..a8ba3a969ce 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -797,6 +797,18 @@ void DecodeSplitPageData(cudf::detail::hostdevice_span pages, kernel_error::pointer error_code, rmm::cuda_stream_view stream); +/** + * @brief Writes the final offsets to the corresponding list and string buffer end addresses in a + * batched manner. + * + * @param offsets Host span of final offsets + * @param buff_addrs Host span of corresponding output col buffer end addresses + * @param stream CUDA stream to use + */ +void WriteFinalOffsets(host_span offsets, + host_span buff_addrs, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for reading the string column data stored in the pages * diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 7d817bde7af..1b69ccb7742 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -371,13 +371,15 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num CUDF_FAIL("Parquet data decode failed with code(s) " + kernel_error::to_string(error)); } - // for list columns, add the final offset to every offset buffer. - // TODO : make this happen in more efficiently. Maybe use thrust::for_each - // on each buffer. + // For list and string columns, add the final offset to every offset buffer. // Note : the reason we are doing this here instead of in the decode kernel is // that it is difficult/impossible for a given page to know that it is writing the very // last value that should then be followed by a terminator (because rows can span // page boundaries). 
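+ // Gather the (destination address, final offset) pairs here, then write them
+ // all at once with the batched WriteFinalOffsets call below.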
+ std::vector out_buffers; + std::vector final_offsets; + out_buffers.reserve(_input_columns.size()); + final_offsets.reserve(_input_columns.size()); for (size_t idx = 0; idx < _input_columns.size(); idx++) { input_column_info const& input_col = _input_columns[idx]; @@ -393,25 +395,21 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // the final offset for a list at level N is the size of it's child size_type const offset = child.type.id() == type_id::LIST ? child.size - 1 : child.size; - CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + (out_buf.size - 1), - &offset, - sizeof(size_type), - cudaMemcpyDefault, - _stream.value())); + out_buffers.emplace_back(static_cast(out_buf.data()) + (out_buf.size - 1)); + final_offsets.emplace_back(offset); out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } else if (out_buf.type.id() == type_id::STRING) { // need to cap off the string offsets column auto const sz = static_cast(col_string_sizes[idx]); if (sz <= strings::detail::get_offset64_threshold()) { - CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + out_buf.size, - &sz, - sizeof(size_type), - cudaMemcpyDefault, - _stream.value())); + out_buffers.emplace_back(static_cast(out_buf.data()) + out_buf.size); + final_offsets.emplace_back(sz); } } } } + // Write the final offsets for list and string columns in a batched manner + WriteFinalOffsets(final_offsets, out_buffers, _stream); // update null counts in the final column buffers for (size_t idx = 0; idx < subpass.pages.size(); idx++) { diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 3763c2e8e6d..8cab68ea721 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -19,9 +19,9 @@ #include #include +#include #include #include -#include #include #include @@ -1656,9 +1656,9 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num } } - cudf::io::detail::batched_memset(memset_bufs, static_cast(0), _stream); + cudf::detail::batched_memset(memset_bufs, static_cast(0), _stream); // Need to set null mask bufs to all high bits - cudf::io::detail::batched_memset( + cudf::detail::batched_memset( nullmask_bufs, std::numeric_limits::max(), _stream); } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index b67d922d377..4596ec65ce7 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -385,6 +385,8 @@ ConfigureTest( # * utilities tests ------------------------------------------------------------------------------- ConfigureTest( UTILITIES_TEST + utilities_tests/batched_memcpy_tests.cu + utilities_tests/batched_memset_tests.cu utilities_tests/column_debug_tests.cpp utilities_tests/column_utilities_tests.cpp utilities_tests/column_wrapper_tests.cpp @@ -395,7 +397,6 @@ ConfigureTest( utilities_tests/pinned_memory_tests.cpp utilities_tests/type_check_tests.cpp utilities_tests/type_list_tests.cpp - utilities_tests/batched_memset_tests.cu ) # ################################################################################################## diff --git a/cpp/tests/utilities_tests/batched_memcpy_tests.cu b/cpp/tests/utilities_tests/batched_memcpy_tests.cu new file mode 100644 index 00000000000..98657f8e224 --- /dev/null +++ b/cpp/tests/utilities_tests/batched_memcpy_tests.cu @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +template +struct BatchedMemcpyTest : public cudf::test::BaseFixture {}; + +TEST(BatchedMemcpyTest, BasicTest) +{ + using T1 = int64_t; + + // Device init + auto stream = cudf::get_default_stream(); + auto mr = cudf::get_current_device_resource_ref(); + + // Buffer lengths (in number of elements) + std::vector const h_lens{ + 50000, 4, 1000, 0, 250000, 1, 100, 8000, 0, 1, 100, 1000, 10000, 100000, 0, 1, 100000}; + + // Total number of buffers + auto const num_buffs = h_lens.size(); + + // Exclusive sum of buffer lengths for pointers + std::vector h_lens_excl_sum(num_buffs); + std::exclusive_scan(h_lens.begin(), h_lens.end(), h_lens_excl_sum.begin(), 0); + + // Corresponding buffer sizes (in bytes) + std::vector h_sizes_bytes; + h_sizes_bytes.reserve(num_buffs); + std::transform( + h_lens.cbegin(), h_lens.cend(), std::back_inserter(h_sizes_bytes), [&](auto& size) { + return size * sizeof(T1); + }); + + // Initialize random engine + auto constexpr seed = 0xcead; + std::mt19937 engine{seed}; + using uniform_distribution = + typename std::conditional_t, + std::bernoulli_distribution, + std::conditional_t, + std::uniform_real_distribution, + std::uniform_int_distribution>>; + uniform_distribution dist{}; + + // Generate a src vector of random data vectors + std::vector> h_sources; + h_sources.reserve(num_buffs); + std::transform(h_lens.begin(), h_lens.end(), std::back_inserter(h_sources), [&](auto size) { + std::vector data(size); + std::generate_n(data.begin(), size, [&]() { return T1{dist(engine)}; }); + return data; + }); + // Copy the vectors to device + std::vector> h_device_vecs; + h_device_vecs.reserve(h_sources.size()); + std::transform( + h_sources.begin(), h_sources.end(), std::back_inserter(h_device_vecs), [stream, mr](auto& vec) { + return cudf::detail::make_device_uvector_async(vec, stream, mr); + }); + // Pointers to the source vectors + std::vector h_src_ptrs; + h_src_ptrs.reserve(h_sources.size()); + std::transform( + h_device_vecs.begin(), h_device_vecs.end(), std::back_inserter(h_src_ptrs), [](auto& vec) { + return static_cast(vec.data()); + }); + // Copy the source data pointers to device + auto d_src_ptrs = cudf::detail::make_device_uvector_async(h_src_ptrs, stream, mr); + + // Total number of elements in all buffers + auto const total_buff_len = std::accumulate(h_lens.cbegin(), h_lens.cend(), 0); + + // Create one giant buffer for destination + auto d_dst_data = cudf::detail::make_zeroed_device_uvector_async(total_buff_len, stream, mr); + // Pointers to destination buffers within the giant destination buffer + std::vector h_dst_ptrs(num_buffs); + std::for_each(thrust::make_counting_iterator(static_cast(0)), + thrust::make_counting_iterator(num_buffs), + [&](auto i) { return h_dst_ptrs[i] = d_dst_data.data() + h_lens_excl_sum[i]; }); + 
+  auto d_dst_ptrs = cudf::detail::make_device_uvector_async(h_dst_ptrs, stream, mr);
+
+  // Copy buffer size iterators (in bytes) to device
+  auto d_sizes_bytes = cudf::detail::make_device_uvector_async(h_sizes_bytes, stream, mr);
+
+  // Run the batched memcpy
+  cudf::detail::batched_memcpy_async(
+    d_src_ptrs.begin(), d_dst_ptrs.begin(), d_sizes_bytes.begin(), num_buffs, stream);
+
+  // Expected giant destination buffer after the memcpy
+  std::vector<T1> expected_buffer;
+  expected_buffer.reserve(total_buff_len);
+  std::for_each(h_sources.cbegin(), h_sources.cend(), [&expected_buffer](auto& source) {
+    expected_buffer.insert(expected_buffer.end(), source.begin(), source.end());
+  });
+
+  // Copy over the result destination buffer to host and synchronize the stream
+  auto result_dst_buffer =
+    cudf::detail::make_std_vector_sync(cudf::device_span<T1 const>(d_dst_data), stream);
+
+  // Check if both vectors are equal
+  EXPECT_TRUE(
+    std::equal(expected_buffer.begin(), expected_buffer.end(), result_dst_buffer.begin()));
+}
diff --git a/cpp/tests/utilities_tests/batched_memset_tests.cu b/cpp/tests/utilities_tests/batched_memset_tests.cu
index bed0f40d70e..0eeb7b95318 100644
--- a/cpp/tests/utilities_tests/batched_memset_tests.cu
+++ b/cpp/tests/utilities_tests/batched_memset_tests.cu
@@ -18,8 +18,8 @@
 #include
 #include
+#include
 #include
-#include
 #include
 #include
 #include
@@ -78,7 +78,7 @@ TEST(MultiBufferTestIntegral, BasicTest1)
   });
 
   // Function Call
-  cudf::io::detail::batched_memset(memset_bufs, uint64_t{0}, stream);
+  cudf::detail::batched_memset(memset_bufs, uint64_t{0}, stream);
 
   // Set all buffer regions to 0 for expected comparison
   std::for_each(

From 2ec6cb32d825d2ef255d0e56497c20be30713d32 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 2 Oct 2024 18:07:52 -1000
Subject: [PATCH 010/117] Fix astype from tz-aware type to tz-aware type
 (#16980)

closes https://github.com/rapidsai/cudf/issues/16973

Also matches pandas behavior for astype from a tz-naive to a tz-aware type

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16980
---
 python/cudf/cudf/core/column/datetime.py      | 15 +++++++++++
 .../cudf/tests/series/test_datetimelike.py    | 22 +++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index d0ea4612a1b..2c9b0baa9b6 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -480,6 +480,11 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike:
     def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn:
         if dtype == self.dtype:
             return self
+        elif isinstance(dtype, pd.DatetimeTZDtype):
+            raise TypeError(
+                "Cannot use .astype to convert from timezone-naive dtype to timezone-aware dtype. "
+                "Use tz_localize instead."
+ ) return libcudf.unary.cast(self, dtype=dtype) def as_timedelta_column(self, dtype: Dtype) -> None: # type: ignore[override] @@ -940,6 +945,16 @@ def strftime(self, format: str) -> cudf.core.column.StringColumn: def as_string_column(self) -> cudf.core.column.StringColumn: return self._local_time.as_string_column() + def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn: + if isinstance(dtype, pd.DatetimeTZDtype) and dtype != self.dtype: + if dtype.unit != self.time_unit: + # TODO: Doesn't check that new unit is valid. + casted = self._with_type_metadata(dtype) + else: + casted = self + return casted.tz_convert(str(dtype.tz)) + return super().as_datetime_column(dtype) + def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component( self._local_time, field diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py index cea86a5499e..691da224f44 100644 --- a/python/cudf/cudf/tests/series/test_datetimelike.py +++ b/python/cudf/cudf/tests/series/test_datetimelike.py @@ -266,3 +266,25 @@ def test_pandas_compatible_non_zoneinfo_raises(klass): with cudf.option_context("mode.pandas_compatible", True): with pytest.raises(NotImplementedError): cudf.from_pandas(pandas_obj) + + +def test_astype_naive_to_aware_raises(): + ser = cudf.Series([datetime.datetime(2020, 1, 1)]) + with pytest.raises(TypeError): + ser.astype("datetime64[ns, UTC]") + with pytest.raises(TypeError): + ser.to_pandas().astype("datetime64[ns, UTC]") + + +@pytest.mark.parametrize("unit", ["ns", "us"]) +def test_astype_aware_to_aware(unit): + ser = cudf.Series( + [datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc)] + ) + result = ser.astype(f"datetime64[{unit}, US/Pacific]") + expected = ser.to_pandas().astype(f"datetime64[{unit}, US/Pacific]") + zoneinfo_type = pd.DatetimeTZDtype( + expected.dtype.unit, zoneinfo.ZoneInfo(str(expected.dtype.tz)) + ) + expected = ser.astype(zoneinfo_type) + assert_eq(result, expected) From 3faa3ee8b869a8450f6352c7770fb155b321d926 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 3 Oct 2024 08:53:08 -0400 Subject: [PATCH 011/117] Add cudf::strings::find_re API (#16742) Adds the `cudf::strings::find_re` and `str.find_re` API to libcudf/pylibcudf/cudf. This function returns the character position where the pattern first matches in each row of the input column. If a match is not found, -1 is returned for that corresponding row. 
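As a minimal C++ sketch of the new API (this mirrors the example in the doxygen
docs added by this change; header paths follow the usual cudf layout):

```cpp
#include <cudf_test/column_wrapper.hpp>

#include <cudf/strings/findall.hpp>
#include <cudf/strings/regex/regex_program.hpp>
#include <cudf/strings/strings_column_view.hpp>

// Values taken from the doxygen example below: rows without a match yield -1.
auto const input = cudf::test::strings_column_wrapper({"bunny", "rabbit", "hare", "dog"});
auto const prog  = cudf::strings::regex_program::create("[be]");
auto const positions =
  cudf::strings::find_re(cudf::strings_column_view(input), *prog);
// positions now holds [0, 2, 3, -1]
```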
Closes #16729

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Matthew Murray (https://github.com/Matt711)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16742
---
 cpp/doxygen/regex.md                          |  1 +
 cpp/include/cudf/strings/findall.hpp          | 29 ++++++++++++
 cpp/src/strings/search/findall.cu             | 46 +++++++++++++++++++
 cpp/tests/streams/strings/find_test.cpp       |  1 +
 cpp/tests/strings/findall_tests.cpp           | 35 +++++++++++---
 python/cudf/cudf/_lib/strings/__init__.py     |  2 +-
 python/cudf/cudf/_lib/strings/findall.pyx     | 16 +++++++
 python/cudf/cudf/core/column/string.py        | 40 ++++++++++++++++
 python/cudf/cudf/tests/test_string.py         | 20 ++++++++
 .../pylibcudf/libcudf/strings/findall.pxd     |  4 ++
 .../pylibcudf/pylibcudf/strings/findall.pxd   |  1 +
 .../pylibcudf/pylibcudf/strings/findall.pyx   | 32 +++++++++++++
 .../pylibcudf/tests/test_string_findall.py    | 17 +++++++
 13 files changed, 237 insertions(+), 7 deletions(-)

diff --git a/cpp/doxygen/regex.md b/cpp/doxygen/regex.md
index 6d1c91a5752..6902b1948bd 100644
--- a/cpp/doxygen/regex.md
+++ b/cpp/doxygen/regex.md
@@ -8,6 +8,7 @@ This page specifies which regular expression (regex) features are currently supp
 - cudf::strings::extract()
 - cudf::strings::extract_all_record()
 - cudf::strings::findall()
+- cudf::strings::find_re()
 - cudf::strings::replace_re()
 - cudf::strings::replace_with_backrefs()
 - cudf::strings::split_re()
diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp
index c6b9bc7e58a..867764b6d9a 100644
--- a/cpp/include/cudf/strings/findall.hpp
+++ b/cpp/include/cudf/strings/findall.hpp
@@ -66,6 +66,35 @@ std::unique_ptr<column> findall(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Returns the starting character index of the first match for the given pattern
+ * in each row of the input column
+ *
+ * @code{.pseudo}
+ * Example:
+ * s = ["bunny", "rabbit", "hare", "dog"]
+ * p = regex_program::create("[be]")
+ * r = find_re(s, p)
+ * r is now [0, 2, 3, -1]
+ * @endcode
+ *
+ * A null output row occurs if the corresponding input row is null.
+ * A -1 is returned for rows that do not contain a match.
+ *
+ * See the @ref md_regex "Regex Features" page for details on patterns supported by this API.
+ *
+ * @param input Strings instance for this operation
+ * @param prog Regex program instance
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New column of integers
+ */
+std::unique_ptr<column> find_re(
+  strings_column_view const& input,
+  regex_program const& prog,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /** @} */  // end of doxygen group
 }  // namespace strings
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu
index d8c1b50a94b..21708e48a25 100644
--- a/cpp/src/strings/search/findall.cu
+++ b/cpp/src/strings/search/findall.cu
@@ -126,6 +126,43 @@ std::unique_ptr<column> findall(strings_column_view const& input,
                          mr);
 }
 
+namespace {
+struct find_re_fn {
+  column_device_view d_strings;
+
+  __device__ size_type operator()(size_type const idx,
+                                  reprog_device const prog,
+                                  int32_t const thread_idx) const
+  {
+    if (d_strings.is_null(idx)) { return 0; }
+    auto const d_str = d_strings.element<string_view>(idx);
+
+    auto const result = prog.find(thread_idx, d_str, d_str.begin());
+    return result.has_value() ? result.value().first : -1;
+  }
+};
+}  // namespace
+
+std::unique_ptr<column> find_re(strings_column_view const& input,
+                                regex_program const& prog,
+                                rmm::cuda_stream_view stream,
+                                rmm::device_async_resource_ref mr)
+{
+  auto results = make_numeric_column(data_type{type_to_id<size_type>()},
+                                     input.size(),
+                                     cudf::detail::copy_bitmask(input.parent(), stream, mr),
+                                     input.null_count(),
+                                     stream,
+                                     mr);
+  if (input.is_empty()) { return results; }
+
+  auto d_results = results->mutable_view().data<size_type>();
+
+  auto d_prog = regex_device_builder::create_prog_device(prog, stream);
+
+  auto const d_strings = column_device_view::create(input.parent(), stream);
+  launch_transform_kernel(find_re_fn{*d_strings}, *d_prog, d_results, input.size(), stream);
+
+  return results;
+}
 }  // namespace detail
 
 // external API
@@ -139,5 +176,14 @@ std::unique_ptr<column> findall(strings_column_view const& input,
   return detail::findall(input, prog, stream, mr);
 }
 
+std::unique_ptr<column> find_re(strings_column_view const& input,
+                                regex_program const& prog,
+                                rmm::cuda_stream_view stream,
+                                rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::find_re(input, prog, stream, mr);
+}
+
 }  // namespace strings
 }  // namespace cudf
diff --git a/cpp/tests/streams/strings/find_test.cpp b/cpp/tests/streams/strings/find_test.cpp
index 52839c6fc9f..e5a1ee0988c 100644
--- a/cpp/tests/streams/strings/find_test.cpp
+++ b/cpp/tests/streams/strings/find_test.cpp
@@ -46,4 +46,5 @@ TEST_F(StringsFindTest, Find)
   auto const pattern = std::string("[a-z]");
   auto const prog    = cudf::strings::regex_program::create(pattern);
   cudf::strings::findall(view, *prog, cudf::test::get_default_stream());
+  cudf::strings::find_re(view, *prog, cudf::test::get_default_stream());
 }
diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp
index 73da4d081e2..4821a7fa999 100644
--- a/cpp/tests/strings/findall_tests.cpp
+++ b/cpp/tests/strings/findall_tests.cpp
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -149,6 +150,22 @@ TEST_F(StringsFindallTests, LargeRegex)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
 }
 
+TEST_F(StringsFindallTests, FindTest)
+{
+  auto const valids = cudf::test::iterators::null_at(5);
+  cudf::test::strings_column_wrapper input(
+    {"3A", "May4", "Jan2021", "March", "A9BC", "", "", "abcdef ghijklm 12345"}, valids);
+  auto sv = cudf::strings_column_view(input);
+
+  auto pattern = std::string("\\d+");
+
+  auto prog    = cudf::strings::regex_program::create(pattern);
+  auto results = cudf::strings::find_re(sv, *prog);
+  auto expected =
+    cudf::test::fixed_width_column_wrapper<cudf::size_type>({0, 3, 3, -1, 1, 0, -1, 15}, valids);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
+}
+
 TEST_F(StringsFindallTests, NoMatches)
 {
   cudf::test::strings_column_wrapper input({"abc\nfff\nabc", "fff\nabc\nlll", "abc", "", "abc\n"});
@@ -169,10 +186,16 @@ TEST_F(StringsFindallTests, EmptyTest)
   auto prog = cudf::strings::regex_program::create(pattern);
 
   cudf::test::strings_column_wrapper input;
-  auto sv      = cudf::strings_column_view(input);
-  auto results = cudf::strings::findall(sv, *prog);
-
-  using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
-  LCW expected;
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
+  auto sv = cudf::strings_column_view(input);
+  {
+    auto results = cudf::strings::findall(sv, *prog);
+    using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
+    LCW expected;
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
+  }
+  {
+    auto results  = cudf::strings::find_re(sv, *prog);
+    auto expected = cudf::test::fixed_width_column_wrapper<cudf::size_type>{};
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
+  }
 }
diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py
index 049dbab4851..e712937f816 100644
--- a/python/cudf/cudf/_lib/strings/__init__.py
+++ b/python/cudf/cudf/_lib/strings/__init__.py
@@ -71,7 +71,7 @@
     startswith_multiple,
 )
 from cudf._lib.strings.find_multiple import find_multiple
-from cudf._lib.strings.findall import findall
+from cudf._lib.strings.findall import find_re, findall
 from cudf._lib.strings.json import GetJsonObjectOptions, get_json_object
 from cudf._lib.strings.padding import center, ljust, pad, rjust, zfill
 from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence
diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx
index 0e758d5b322..3e7a504d535 100644
--- a/python/cudf/cudf/_lib/strings/findall.pyx
+++ b/python/cudf/cudf/_lib/strings/findall.pyx
@@ -23,3 +23,19 @@ def findall(Column source_strings, object pattern, uint32_t flags):
         prog,
     )
     return Column.from_pylibcudf(plc_result)
+
+
+@acquire_spill_lock()
+def find_re(Column source_strings, object pattern, uint32_t flags):
+    """
+    Returns character positions where the pattern first matches
+    the elements in source_strings.
+    """
+    prog = plc.strings.regex_program.RegexProgram.create(
+        str(pattern), flags
+    )
+    plc_result = plc.strings.findall.find_re(
+        source_strings.to_pylibcudf(mode="read"),
+        prog,
+    )
+    return Column.from_pylibcudf(plc_result)
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 88df57b1b3b..b50e23bd52e 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -3626,6 +3626,46 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex:
         data = libstrings.findall(self._column, pat, flags)
         return self._return_or_inplace(data)
 
+    def find_re(self, pat: str, flags: int = 0) -> SeriesOrIndex:
+        """
+        Find first occurrence of pattern or regular expression in the
+        Series/Index.
+
+        Parameters
+        ----------
+        pat : str
+            Pattern or regular expression.
+ flags : int, default 0 (no flags) + Flags to pass through to the regex engine (e.g. re.MULTILINE) + + Returns + ------- + Series + A Series of position values where the pattern first matches + each string. + + Examples + -------- + >>> import cudf + >>> s = cudf.Series(['Lion', 'Monkey', 'Rabbit', 'Cat']) + >>> s.str.find_re('[ti]') + 0 1 + 1 -1 + 2 4 + 3 2 + dtype: int32 + """ + if isinstance(pat, re.Pattern): + flags = pat.flags & ~re.U + pat = pat.pattern + if not _is_supported_regex_flags(flags): + raise NotImplementedError( + "Unsupported value for `flags` parameter" + ) + + data = libstrings.find_re(self._column, pat, flags) + return self._return_or_inplace(data) + def find_multiple(self, patterns: SeriesOrIndex) -> cudf.Series: """ Find all first occurrences of patterns in the Series/Index. diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index cc88cc79769..45143211a11 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1899,6 +1899,26 @@ def test_string_findall(pat, flags): assert_eq(expected, actual) +@pytest.mark.parametrize( + "pat, flags, pos", + [ + ("Monkey", 0, [-1, 0, -1, -1]), + ("on", 0, [2, 1, -1, 1]), + ("bit", 0, [-1, -1, 3, -1]), + ("on$", 0, [2, -1, -1, -1]), + ("on$", re.MULTILINE, [2, -1, -1, 1]), + ("o.*k", re.DOTALL, [-1, 1, -1, 1]), + ], +) +def test_string_find_re(pat, flags, pos): + test_data = ["Lion", "Monkey", "Rabbit", "Don\nkey"] + gs = cudf.Series(test_data) + + expected = pd.Series(pos, dtype=np.int32) + actual = gs.str.find_re(pat, flags) + assert_eq(expected, actual) + + def test_string_replace_multi(): ps = pd.Series(["hello", "goodbye"]) gs = cudf.Series(["hello", "goodbye"]) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd index e0a8b776465..0d286c36446 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd @@ -11,3 +11,7 @@ cdef extern from "cudf/strings/findall.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] findall( column_view input, regex_program prog) except + + + cdef unique_ptr[column] find_re( + column_view input, + regex_program prog) except + diff --git a/python/pylibcudf/pylibcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/strings/findall.pxd index 54afa088141..3c35a9c9aa9 100644 --- a/python/pylibcudf/pylibcudf/strings/findall.pxd +++ b/python/pylibcudf/pylibcudf/strings/findall.pxd @@ -4,4 +4,5 @@ from pylibcudf.column cimport Column from pylibcudf.strings.regex_program cimport RegexProgram +cpdef Column find_re(Column input, RegexProgram pattern) cpdef Column findall(Column input, RegexProgram pattern) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyx b/python/pylibcudf/pylibcudf/strings/findall.pyx index 3a6b87504b3..5212dc4594d 100644 --- a/python/pylibcudf/pylibcudf/strings/findall.pyx +++ b/python/pylibcudf/pylibcudf/strings/findall.pyx @@ -38,3 +38,35 @@ cpdef Column findall(Column input, RegexProgram pattern): ) return Column.from_libcudf(move(c_result)) + + +cpdef Column find_re(Column input, RegexProgram pattern): + """ + Returns character positions where the pattern first matches + the elements in input strings. 
+ + For details, see :cpp:func:`cudf::strings::find_re` + + Parameters + ---------- + input : Column + Strings instance for this operation + pattern : RegexProgram + Regex pattern + + Returns + ------- + Column + New column of integers + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_findall.find_re( + input.view(), + pattern.c_obj.get()[0] + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_findall.py b/python/pylibcudf/pylibcudf/tests/test_string_findall.py index 994552fa276..debfad92d00 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_findall.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_findall.py @@ -21,3 +21,20 @@ def test_findall(): type=pa_result.type, ) assert_column_eq(result, expected) + + +def test_find_re(): + arr = pa.array(["bunny", "rabbit", "hare", "dog"]) + pattern = "[eb]" + result = plc.strings.findall.find_re( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + pa_result = plc.interop.to_arrow(result) + expected = pa.array( + [0, 2, 3, -1], + type=pa_result.type, + ) + assert_column_eq(result, expected) From bd3b3327a6326ffea4658d682b8b9087e32da98a Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Thu, 3 Oct 2024 16:25:09 -0400 Subject: [PATCH 012/117] Restore export of nvcomp outside of wheel builds (#16988) Fixes https://github.com/rapidsai/cudf/issues/16986 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16988 --- cpp/CMakeLists.txt | 1 + cpp/cmake/thirdparty/get_nvcomp.cmake | 6 +++++- python/libcudf/CMakeLists.txt | 3 +++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 136f43ee706..f7a5dd2f2fb 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -52,6 +52,7 @@ option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON) option(CUDF_BUILD_TESTUTIL "Whether to build the test utilities contained in libcudf" ON) mark_as_advanced(CUDF_BUILD_TESTUTIL) option(CUDF_USE_PROPRIETARY_NVCOMP "Download and use NVCOMP with proprietary extensions" ON) +option(CUDF_EXPORT_NVCOMP "Export NVCOMP as a dependency" ON) option(CUDF_LARGE_STRINGS_DISABLED "Build with large string support disabled" OFF) mark_as_advanced(CUDF_LARGE_STRINGS_DISABLED) option( diff --git a/cpp/cmake/thirdparty/get_nvcomp.cmake b/cpp/cmake/thirdparty/get_nvcomp.cmake index 1b6a1730161..33b1b45fb44 100644 --- a/cpp/cmake/thirdparty/get_nvcomp.cmake +++ b/cpp/cmake/thirdparty/get_nvcomp.cmake @@ -16,7 +16,11 @@ function(find_and_configure_nvcomp) include(${rapids-cmake-dir}/cpm/nvcomp.cmake) - rapids_cpm_nvcomp(USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP}) + set(export_args) + if(CUDF_EXPORT_NVCOMP) + set(export_args BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) + endif() + rapids_cpm_nvcomp(${export_args} USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP}) # Per-thread default stream if(TARGET nvcomp AND CUDF_USE_PER_THREAD_DEFAULT_STREAM) diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt index 2b208e2e021..5f9a04d3cee 100644 --- a/python/libcudf/CMakeLists.txt +++ b/python/libcudf/CMakeLists.txt @@ -41,6 +41,9 @@ set(BUILD_TESTS OFF) set(BUILD_BENCHMARKS OFF) set(CUDF_BUILD_TESTUTIL OFF) set(CUDF_BUILD_STREAMS_TEST_UTIL OFF) +if(USE_NVCOMP_RUNTIME_WHEEL) + 
set(CUDF_EXPORT_NVCOMP OFF) +endif() set(CUDA_STATIC_RUNTIME ON) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) From 010839172ecb5a99609044a98031ff5b7578cd64 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Thu, 3 Oct 2024 19:44:20 -0500 Subject: [PATCH 013/117] Use `libcudf` wheel from PR rather than nightly for `polars-polars` CI test job (#16975) This PR fixes an issue where one `cudf-polars` CI job uses the `pylibcudf` wheel generated from the branch being tested, but pulls a libcudf nightly which can cause issues when introducing cython/c++ changes simultaneously. Authors: - https://github.com/brandon-b-miller Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16975 --- ci/test_cudf_polars_polars_tests.sh | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh index 55399d0371a..f5bcdc62604 100755 --- a/ci/test_cudf_polars_polars_tests.sh +++ b/ci/test_cudf_polars_polars_tests.sh @@ -24,14 +24,17 @@ rapids-logger "Download wheels" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist -# Download the pylibcudf built in the previous step -RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep +# Download libcudf and pylibcudf built in the previous step +RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./local-libcudf-dep +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./local-pylibcudf-dep -rapids-logger "Install pylibcudf" -python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl +rapids-logger "Install libcudf, pylibcudf and cudf_polars" +python -m pip install \ + -v \ + "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ + "$(echo ./local-libcudf-dep/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ + "$(echo ./local-pylibcudf-dep/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" -rapids-logger "Install cudf_polars" -python -m pip install $(echo ./dist/cudf_polars*.whl) TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")') rapids-logger "Clone polars to ${TAG}" From 2fa2e6a554096181b0a625cdc50368893dbaaa1f Mon Sep 17 00:00:00 2001 From: Basit Ayantunde Date: Fri, 4 Oct 2024 16:08:37 +0100 Subject: [PATCH 014/117] Switched AST benchmarks from GoogleBench to NVBench (#16952) This merge request switches the Benchmarking solution for the AST benchmark from GoogleBench to NVBench. 
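For reference, a minimal sketch of the NVBench registration pattern the diff
below adopts (the benchmark name and axis values here are illustrative, not
part of the change):

```cpp
#include <cudf/types.hpp>

#include <nvbench/nvbench.cuh>

// Illustrative only: a benchmark reads its axes from the state, declares the
// expected global-memory traffic, and runs the timed region via state.exec().
static void example_bench(nvbench::state& state)
{
  auto const table_size = static_cast<cudf::size_type>(state.get_int64("table_size"));
  state.add_global_memory_reads<int32_t>(table_size);
  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { /* timed work */ });
}
NVBENCH_BENCH(example_bench).add_int64_axis("table_size", {100'000, 1'000'000});
```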
~It also refactors the L2 cache flushing functionality of `cuda_event_timer` into a separate function `flush_L2_device_cache`, since NVBench already performs the timing, synchronization, and timer setup necessary.~

Authors:
  - Basit Ayantunde (https://github.com/lamarrr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Yunsong Wang (https://github.com/PointKernel)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16952
---
 cpp/benchmarks/CMakeLists.txt    |  2 +-
 cpp/benchmarks/ast/transform.cpp | 51 +++++++++++---------------------
 2 files changed, 18 insertions(+), 35 deletions(-)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 110b4557840..1e13bf176c1 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -330,7 +330,7 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp)
 
 # ##################################################################################################
 # * ast benchmark ---------------------------------------------------------------------------------
-ConfigureBench(AST_BENCH ast/transform.cpp)
+ConfigureNVBench(AST_NVBENCH ast/transform.cpp)
 
 # ##################################################################################################
 # * binaryop benchmark ----------------------------------------------------------------------------
diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp
index 65a44532cf1..f44f26e4d2c 100644
--- a/cpp/benchmarks/ast/transform.cpp
+++ b/cpp/benchmarks/ast/transform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,14 +15,16 @@
  */
 
 #include
-#include
-#include
 
 #include
 #include
+#include
+
 #include
+#include
+
 #include
 #include
 #include
@@ -35,13 +37,10 @@ enum class TreeType {
 };
 
 template <typename key_type, TreeType tree_type, bool reuse_columns, bool nullable>
-class AST : public cudf::benchmark {};
-
-template <typename key_type, TreeType tree_type, bool reuse_columns, bool nullable>
-static void BM_ast_transform(benchmark::State& state)
+static void BM_ast_transform(nvbench::state& state)
 {
-  auto const table_size{static_cast<cudf::size_type>(state.range(0))};
-  auto const tree_levels{static_cast<cudf::size_type>(state.range(1))};
+  auto const table_size  = static_cast<cudf::size_type>(state.get_int64("table_size"));
+  auto const tree_levels = static_cast<cudf::size_type>(state.get_int64("tree_levels"));
 
   // Create table data
   auto const n_cols = reuse_columns ? 1 : tree_levels + 1;
@@ -86,38 +85,22 @@ static void BM_ast_transform(benchmark::State& state)
 
   auto const& expression_tree_root = expressions.back();
 
-  // Execute benchmark
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
-    cudf::compute_column(table, expression_tree_root);
-  }
-
   // Use the number of bytes read from global memory
-  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * state.range(0) *
-                          (tree_levels + 1) * sizeof(key_type));
-}
+  state.add_global_memory_reads<key_type>(table_size * (tree_levels + 1));
 
-static void CustomRanges(benchmark::internal::Benchmark* b)
-{
-  auto row_counts       = std::vector<cudf::size_type>{100'000, 1'000'000, 10'000'000, 100'000'000};
-  auto operation_counts = std::vector<cudf::size_type>{1, 5, 10};
-  for (auto const& row_count : row_counts) {
-    for (auto const& operation_count : operation_counts) {
-      b->Args({row_count, operation_count});
-    }
-  }
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); });
 }
 
 #define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \
-  BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable)     \
-  (::benchmark::State & st)                                                                \
+  static void name(::nvbench::state& st)                                                   \
   {                                                                                        \
-    BM_ast_transform<key_type, tree_type, reuse_columns, nullable>(st);                    \
+    ::BM_ast_transform<key_type, tree_type, reuse_columns, nullable>(st);                  \
   }                                                                                        \
-  BENCHMARK_REGISTER_F(AST, name)                                                          \
-    ->Apply(CustomRanges)                                                                  \
-    ->Unit(benchmark::kMillisecond)                                                        \
-    ->UseManualTime();
+  NVBENCH_BENCH(name)                                                                      \
+    .set_name(#name)                                                                       \
+    .add_int64_axis("tree_levels", {1, 5, 10})                                             \
+    .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000})
 
 AST_TRANSFORM_BENCHMARK_DEFINE(
   ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false);

From a78432184f20f7acf493eaa8d1928cfee29d1771 Mon Sep 17 00:00:00 2001
From: Basit Ayantunde
Date: Fri, 4 Oct 2024 16:19:37 +0100
Subject: [PATCH 015/117] Switched BINARY_OP Benchmarks from GoogleBench to
 NVBench (#16963)

This merge request switches the Benchmarking solution for the BINARY_OP benchmarks from GoogleBench to NVBench

Authors:
  - Basit Ayantunde (https://github.com/lamarrr)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Tianyu Liu (https://github.com/kingcrimsontianyu)

URL: https://github.com/rapidsai/cudf/pull/16963
---
 cpp/benchmarks/CMakeLists.txt                 |  2 +-
 cpp/benchmarks/binaryop/binaryop.cpp          | 65 ++++++------------
 cpp/benchmarks/binaryop/compiled_binaryop.cpp | 47 ++++++--------
 3 files changed, 40 insertions(+), 74 deletions(-)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 1e13bf176c1..b8a53cd8bd9 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -334,7 +334,7 @@ ConfigureNVBench(AST_NVBENCH ast/transform.cpp)
 
 # ##################################################################################################
 # * binaryop benchmark ----------------------------------------------------------------------------
-ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp)
+ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp)
 
 # ##################################################################################################
 # * nvtext benchmark -------------------------------------------------------------------
diff --git a/cpp/benchmarks/binaryop/binaryop.cpp b/cpp/benchmarks/binaryop/binaryop.cpp
index fa98d9e601a..7d267a88764 100644
--- a/cpp/benchmarks/binaryop/binaryop.cpp
+++ b/cpp/benchmarks/binaryop/binaryop.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -15,15 +15,14 @@
 */
 
 #include
-#include
-#include
 
 #include
 #include
 #include
+#include
+
 #include
-#include
 
 // This set of benchmarks is designed to be a comparison for the AST benchmarks
@@ -33,13 +32,10 @@ enum class TreeType {
 };
 
 template <typename key_type, TreeType tree_type, bool reuse_columns>
-class BINARYOP : public cudf::benchmark {};
-
-template <typename key_type, TreeType tree_type, bool reuse_columns>
-static void BM_binaryop_transform(benchmark::State& state)
+static void BM_binaryop_transform(nvbench::state& state)
 {
-  auto const table_size{static_cast<cudf::size_type>(state.range(0))};
-  auto const tree_levels{static_cast<cudf::size_type>(state.range(1))};
+  auto const table_size{static_cast<cudf::size_type>(state.get_int64("table_size"))};
+  auto const tree_levels{static_cast<cudf::size_type>(state.get_int64("tree_levels"))};
 
   // Create table data
   auto const n_cols = reuse_columns ? 1 : tree_levels + 1;
@@ -47,9 +43,10 @@ static void BM_binaryop_transform(nvbench::state& state)
     cycle_dtypes({cudf::type_to_id<key_type>()}, n_cols), row_count{table_size});
   cudf::table_view table{*source_table};
 
-  // Execute benchmark
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
+  // Use the number of bytes read from global memory
+  state.add_global_memory_reads<key_type>(table_size * (tree_levels + 1));
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
     // Execute tree that chains additions like (((a + b) + c) + d)
     auto const op               = cudf::binary_operator::ADD;
     auto const result_data_type = cudf::data_type(cudf::type_to_id<key_type>());
@@ -64,16 +61,18 @@ static void BM_binaryop_transform(nvbench::state& state)
         result = cudf::binary_operation(result->view(), col, op, result_data_type);
       });
     }
-  }
-
-  // Use the number of bytes read from global memory
-  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * state.range(0) *
-                          (tree_levels + 1) * sizeof(key_type));
+  });
 }
 
 #define BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns) \
-  BENCHMARK_TEMPLATE_DEFINE_F(BINARYOP, name, key_type, tree_type, reuse_columns)     \
-  (::benchmark::State & st) { BM_binaryop_transform<key_type, tree_type, reuse_columns>(st); }
+                                                                                      \
+  static void name(::nvbench::state& st)                                              \
+  {                                                                                   \
+    BM_binaryop_transform<key_type, tree_type, reuse_columns>(st);                    \
+  }                                                                                   \
+  NVBENCH_BENCH(name)                                                                 \
+    .add_int64_axis("tree_levels", {1, 2, 5, 10})                                     \
+    .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000})
 
 BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_int32_imbalanced_unique,
                                     int32_t,
@@ -87,29 +86,3 @@ BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_double_imbalanced_unique,
                                     double,
                                     TreeType::IMBALANCED_LEFT,
                                     false);
-
-static void CustomRanges(benchmark::internal::Benchmark* b)
-{
-  auto row_counts       = std::vector<cudf::size_type>{100'000, 1'000'000, 10'000'000, 100'000'000};
-  auto operation_counts = std::vector<cudf::size_type>{1, 2, 5, 10};
-  for (auto const& row_count : row_counts) {
-    for (auto const& operation_count : operation_counts) {
-      b->Args({row_count, operation_count});
-    }
-  }
-}
-
-BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_unique)
-  ->Apply(CustomRanges)
-  ->Unit(benchmark::kMillisecond)
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_reuse)
-  ->Apply(CustomRanges)
-  ->Unit(benchmark::kMillisecond)
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(BINARYOP, binaryop_double_imbalanced_unique)
-  ->Apply(CustomRanges)
-  ->Unit(benchmark::kMillisecond)
-  ->UseManualTime();
diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp
index 7086a61c7c5..bc0ff69bce9 100644
--- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp
+++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp
@@ -15,20 +15,18 @@
 */
 
 #include
-#include
-#include
 
 #include
 
-class COMPILED_BINARYOP : public cudf::benchmark {};
+#include
 
 template <typename TypeLhs, typename TypeRhs, typename TypeOut>
-void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop)
+void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop)
 {
-  auto const column_size{static_cast<cudf::size_type>(state.range(0))};
+  auto const table_size = static_cast<cudf::size_type>(state.get_int64("table_size"));
 
   auto const source_table = create_random_table(
-    {cudf::type_to_id<TypeLhs>(), cudf::type_to_id<TypeRhs>()}, row_count{column_size});
+    {cudf::type_to_id<TypeLhs>(), cudf::type_to_id<TypeRhs>()}, row_count{table_size});
 
   auto lhs = cudf::column_view(source_table->get_column(0));
   auto rhs = cudf::column_view(source_table->get_column(1));
@@ -38,31 +36,26 @@ void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop)
   // Call once for hot cache.
   cudf::binary_operation(lhs, rhs, binop, output_dtype);
 
-  for (auto _ : state) {
-    cuda_event_timer timer(state, true);
-    cudf::binary_operation(lhs, rhs, binop, output_dtype);
-  }
-
   // use number of bytes read and written to global memory
-  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * column_size *
-                          (sizeof(TypeLhs) + sizeof(TypeRhs) + sizeof(TypeOut)));
+  state.add_global_memory_reads<TypeLhs>(table_size);
+  state.add_global_memory_reads<TypeRhs>(table_size);
+  state.add_global_memory_reads<TypeOut>(table_size);
+
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch&) { cudf::binary_operation(lhs, rhs, binop, output_dtype); });
 }
 
+#define BM_STRINGIFY(a) #a
+
 // TODO tparam boolean for null.
-#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout)            \
-  BENCHMARK_DEFINE_F(COMPILED_BINARYOP, name)                              \
-  (::benchmark::State & st)                                                \
-  {                                                                        \
-    BM_compiled_binaryop<lhs, rhs, tout>(st, cudf::binary_operator::bop);  \
-  }                                                                        \
-  BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name)                            \
-    ->Unit(benchmark::kMicrosecond)                                        \
-    ->UseManualTime()                                                      \
-    ->Arg(10000)     /* 10k */                                             \
-    ->Arg(100000)    /* 100k */                                            \
-    ->Arg(1000000)   /* 1M */                                              \
-    ->Arg(10000000)  /* 10M */                                             \
-    ->Arg(100000000); /* 100M */
+#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout)                            \
+  static void name(::nvbench::state& st)                                                   \
+  {                                                                                        \
+    ::BM_compiled_binaryop<lhs, rhs, tout>(st, ::cudf::binary_operator::bop);              \
+  }                                                                                        \
+  NVBENCH_BENCH(name)                                                                      \
+    .set_name("compiled_binary_op_" BM_STRINGIFY(name))                                    \
+    .add_int64_axis("table_size", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000})
 
 #define build_name(a, b, c, d) a##_##b##_##c##_##d

From bd3b3327a6326ffea4658d682b8b9087e32da98a Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Fri, 4 Oct 2024 09:39:20 -0700
Subject: [PATCH 016/117] Properly handle the mapped and registered regions in
 `memory_mapped_source` (#16865)

Depends on https://github.com/rapidsai/cudf/pull/16826

A set of fixes that improve the robustness of the non-GDS file input:

1. Avoid registering beyond the byte range - addresses problems when reading adjacent byte ranges from multiple threads (GH only).
2. Allow reading data outside of the memory mapped region. This prevents issues with very long rows in CSV or JSON input.
3. Copy host data when the range being read is only partially registered. This avoids errors when trying to copy the host data range to the device (GH only).
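To make the intended usage concrete, here is a sketch of how a reader can pass
byte-range hints through the updated `datasource::create` signature shown in the
header diff below (the file name and sizes are illustrative, not part of the change):

```cpp
#include <cudf/io/datasource.hpp>

#include <cstddef>

// Illustrative values for reading a byte range of a CSV file.
auto const offset  = std::size_t{1} << 20;  // start of the byte range
auto const range   = std::size_t{1} << 20;  // requested byte range size
auto const padding = std::size_t{4096};     // hypothetical padding for rows crossing the range end

// max_size_estimate may include padding past the range; min_size_estimate must not
// exceed the range, so registrations of adjacent ranges cannot overlap.
auto source = cudf::io::datasource::create(
  "input.csv", offset, range + padding /*max_size_estimate*/, range /*min_size_estimate*/);

// Reads inside the mapped region are served from the mapping; reads outside it
// now fall back to plain file reads instead of failing.
auto buffer = source->host_read(offset, range);
```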
Modifies the datasource class hierarchy to avoid reuse of direct file `host_read`s

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Basit Ayantunde (https://github.com/lamarrr)
  - Mads R. B. Kristensen (https://github.com/madsbk)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16865
---
 cpp/include/cudf/io/datasource.hpp  |  22 +++-
 cpp/src/io/functions.cpp            |  14 ++-
 cpp/src/io/utilities/datasource.cpp | 157 +++++++++++++++++-----------
 cpp/tests/io/csv_test.cpp           |  35 +++++++
 4 files changed, 158 insertions(+), 70 deletions(-)

diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp
index b12fbe39a57..dc14802adc1 100644
--- a/cpp/include/cudf/io/datasource.hpp
+++ b/cpp/include/cudf/io/datasource.hpp
@@ -86,14 +86,28 @@ class datasource {
   /**
    * @brief Creates a source from a file path.
    *
+   * @note Parameters `offset`, `max_size_estimate` and `min_size_estimate` are hints to the
+   * `datasource` implementation about the expected range of the data that will be read. The
+   * implementation may use these hints to optimize the read operation. These parameters are usually
+   * based on the byte range option. In this case, `min_size_estimate` should be no greater than the
+   * byte range to avoid potential issues when reading adjacent ranges. `max_size_estimate` can
+   * include padding after the byte range, to include additional data that may be needed for
+   * processing.
+   *
+   * @throws cudf::logic_error if the minimum size estimate is greater than the maximum size estimate
+   *
    * @param[in] filepath Path to the file to use
-   * @param[in] offset Bytes from the start of the file (the default is zero)
-   * @param[in] size Bytes from the offset; use zero for entire file (the default is zero)
+   * @param[in] offset Starting byte offset from which data will be read (the default is zero)
+   * @param[in] max_size_estimate Upper estimate of the data range that will be read (the default is
+   * zero, which means the whole file after `offset`)
+   * @param[in] min_size_estimate Lower estimate of the data range that will be read (the default is
+   * zero, which means the whole file after `offset`)
    * @return Constructed datasource object
    */
   static std::unique_ptr<datasource> create(std::string const& filepath,
-                                            size_t offset = 0,
-                                            size_t size   = 0);
+                                            size_t offset            = 0,
+                                            size_t max_size_estimate = 0,
+                                            size_t min_size_estimate = 0);
 
   /**
    * @brief Creates a source from a host memory buffer.
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index de8eea9e99b..5a060902eb2 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -122,14 +122,16 @@ chunked_parquet_writer_options_builder chunked_parquet_writer_options::builder(
 namespace {
 
 std::vector<std::unique_ptr<datasource>> make_datasources(source_info const& info,
-                                                          size_t range_offset = 0,
-                                                          size_t range_size   = 0)
+                                                          size_t offset            = 0,
+                                                          size_t max_size_estimate = 0,
+                                                          size_t min_size_estimate = 0)
 {
   switch (info.type()) {
     case io_type::FILEPATH: {
       auto sources = std::vector<std::unique_ptr<datasource>>();
       for (auto const& filepath : info.filepaths()) {
-        sources.emplace_back(cudf::io::datasource::create(filepath, range_offset, range_size));
+        sources.emplace_back(
+          cudf::io::datasource::create(filepath, offset, max_size_estimate, min_size_estimate));
       }
       return sources;
     }
@@ -211,7 +213,8 @@ table_with_metadata read_json(json_reader_options options,
 
   auto datasources = make_datasources(options.get_source(),
                                       options.get_byte_range_offset(),
-                                      options.get_byte_range_size_with_padding());
+                                      options.get_byte_range_size_with_padding(),
+                                      options.get_byte_range_size());
 
   return json::detail::read_json(datasources, options, stream, mr);
 }
@@ -238,7 +241,8 @@ table_with_metadata read_csv(csv_reader_options options,
 
   auto datasources = make_datasources(options.get_source(),
                                       options.get_byte_range_offset(),
-                                      options.get_byte_range_size_with_padding());
+                                      options.get_byte_range_size_with_padding(),
+                                      options.get_byte_range_size());
 
   CUDF_EXPECTS(datasources.size() == 1, "Only a single source is currently supported.");
 
diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp
index e4313eba454..0be976b6144 100644
--- a/cpp/src/io/utilities/datasource.cpp
+++ b/cpp/src/io/utilities/datasource.cpp
@@ -32,6 +32,7 @@
 
 #include
 #include
+#include
 
 namespace cudf {
 namespace io {
@@ -54,6 +55,30 @@ class file_source : public datasource {
     }
   }
 
+  std::unique_ptr<buffer> host_read(size_t offset, size_t size) override
+  {
+    lseek(_file.desc(), offset, SEEK_SET);
+
+    // Clamp length to available data
+    ssize_t const read_size = std::min(size, _file.size() - offset);
+
+    std::vector<uint8_t> v(read_size);
+    CUDF_EXPECTS(read(_file.desc(), v.data(), read_size) == read_size, "read failed");
+    return buffer::create(std::move(v));
+  }
+
+  size_t host_read(size_t offset, size_t size, uint8_t* dst) override
+  {
+    lseek(_file.desc(), offset, SEEK_SET);
+
+    // Clamp length to available data
+    auto const read_size = std::min(size, _file.size() - offset);
+
+    CUDF_EXPECTS(read(_file.desc(), dst, read_size) == static_cast<ssize_t>(read_size),
+                 "read failed");
+    return read_size;
+  }
+
   ~file_source() override = default;
 
   [[nodiscard]] bool supports_device_read() const override
@@ -138,40 +163,63 @@ class file_source : public datasource {
  */
 class memory_mapped_source : public file_source {
 public:
-  explicit memory_mapped_source(char const* filepath, size_t offset, size_t size)
+  explicit memory_mapped_source(char const* filepath,
+                                size_t offset,
+                                size_t max_size_estimate,
+                                size_t min_size_estimate)
     : file_source(filepath)
   {
     if (_file.size() != 0) {
-      map(_file.desc(), offset, size);
-      register_mmap_buffer();
+      // Memory mapping is not exclusive, so we can include the whole region we expect to read
+      map(_file.desc(), offset, max_size_estimate);
+      // Buffer registration is exclusive (can't overlap with other registered buffers) so we
+      // register the lower estimate; this avoids issues when reading adjacent ranges from the same
+      // file from multiple threads
+      register_mmap_buffer(offset, min_size_estimate);
+    }
+  }
 
   ~memory_mapped_source() override
   {
     if (_map_addr != nullptr) {
-      munmap(_map_addr, _map_size);
+      unmap();
       unregister_mmap_buffer();
     }
   }
 
   std::unique_ptr<buffer> host_read(size_t offset, size_t size) override
   {
-    CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping");
+    // Clamp length to available data
+    auto const read_size = std::min(size,
+                                    _file.size() - offset);
+
+    // If the requested range is outside of the mapped region, read from the file
+    if (offset < _map_offset or offset + read_size > (_map_offset + _map_size)) {
+      return file_source::host_read(offset, read_size);
+    }
 
-    // Clamp length to available data in the mapped region
-    auto const read_size = std::min(size, _map_size - (offset - _map_offset));
+    // If the requested range is only partially within the registered region, copy to a new
+    // host buffer to make the data safe to copy to the device
+    if (_reg_addr != nullptr and
+        (offset < _reg_offset or offset + read_size > (_reg_offset + _reg_size))) {
+      auto const src = static_cast<uint8_t*>(_map_addr) + (offset - _map_offset);
+
+      return std::make_unique<owning_buffer<std::vector<uint8_t>>>(
+        std::vector<uint8_t>(src, src + read_size));
+    }
 
     return std::make_unique<non_owning_buffer>(
-      static_cast<uint8_t*>(_map_addr) + (offset - _map_offset), read_size);
+      static_cast<uint8_t*>(_map_addr) + offset - _map_offset, read_size);
   }
 
   size_t host_read(size_t offset, size_t size, uint8_t* dst) override
   {
-    CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping");
+    // Clamp length to available data
+    auto const read_size = std::min(size,
+                                    _file.size() - offset);
 
-    // Clamp length to available data in the mapped region
-    auto const read_size = std::min(size, _map_size - (offset - _map_offset));
+    // If the requested range is outside of the mapped region, read from the file
+    if (offset < _map_offset or offset + read_size > (_map_offset + _map_size)) {
+      return file_source::host_read(offset, read_size, dst);
+    }
 
     auto const src = static_cast<uint8_t*>(_map_addr) + (offset - _map_offset);
     std::memcpy(dst, src, read_size);
@@ -184,16 +232,18 @@
    *
    * Fixes nvbugs/4215160
    */
-  void register_mmap_buffer()
+  void register_mmap_buffer(size_t offset, size_t size)
   {
-    if (_map_addr == nullptr or _map_size == 0 or not pageableMemoryAccessUsesHostPageTables()) {
-      return;
-    }
+    if (_map_addr == nullptr or not pageableMemoryAccessUsesHostPageTables()) { return; }
 
-    auto const result = cudaHostRegister(_map_addr, _map_size, cudaHostRegisterDefault);
-    if (result == cudaSuccess) {
-      _is_map_registered = true;
-    } else {
+    // Registered region must be within the mapped region
+    _reg_offset = std::max(offset, _map_offset);
+    _reg_size   = std::min(size != 0 ? size : _map_size, (_map_offset + _map_size) - _reg_offset);
+
+    _reg_addr         = static_cast<uint8_t*>(_map_addr) - _map_offset + _reg_offset;
+    auto const result = cudaHostRegister(_reg_addr, _reg_size, cudaHostRegisterReadOnly);
+    if (result != cudaSuccess) {
+      _reg_addr = nullptr;
       CUDF_LOG_WARN("cudaHostRegister failed with {} ({})",
                     static_cast<int>(result),
                     cudaGetErrorString(result));
@@ -205,10 +255,12 @@
    */
   void unregister_mmap_buffer()
   {
-    if (not _is_map_registered) { return; }
+    if (_reg_addr == nullptr) { return; }
 
-    auto const result = cudaHostUnregister(_map_addr);
-    if (result != cudaSuccess) {
+    auto const result = cudaHostUnregister(_reg_addr);
+    if (result == cudaSuccess) {
+      _reg_addr = nullptr;
+    } else {
       CUDF_LOG_WARN("cudaHostUnregister failed with {} ({})",
                     static_cast<int>(result),
                     cudaGetErrorString(result));
@@ -226,52 +278,30 @@
     // Size for `mmap()` needs to include the page padding
     _map_size = size + (offset - _map_offset);
+    if (_map_size == 0) { return; }
 
     // Check if accessing a region within already mapped area
     _map_addr = mmap(nullptr, _map_size, PROT_READ, MAP_PRIVATE, fd, _map_offset);
     CUDF_EXPECTS(_map_addr != MAP_FAILED, "Cannot create memory mapping");
   }
 
-  private:
-  size_t _map_size          = 0;
-  size_t _map_offset        = 0;
-  void* _map_addr           = nullptr;
-  bool _is_map_registered   = false;
-};
-
-/**
- * @brief Implementation class for reading from a file using `read` calls
- *
- * Potentially faster than `memory_mapped_source` when only a small portion of the file is read
- * through the host.
- */
-class direct_read_source : public file_source {
- public:
-  explicit direct_read_source(char const* filepath) : file_source(filepath) {}
-
-  std::unique_ptr<buffer> host_read(size_t offset, size_t size) override
+  void unmap()
   {
-    lseek(_file.desc(), offset, SEEK_SET);
-
-    // Clamp length to available data
-    ssize_t const read_size = std::min(size, _file.size() - offset);
-
-    std::vector<uint8_t> v(read_size);
-    CUDF_EXPECTS(read(_file.desc(), v.data(), read_size) == read_size, "read failed");
-    return buffer::create(std::move(v));
+    if (_map_addr != nullptr) {
+      auto const result = munmap(_map_addr, _map_size);
+      if (result != 0) { CUDF_LOG_WARN("munmap failed with {}", result); }
+      _map_addr = nullptr;
+    }
   }
 
-  size_t host_read(size_t offset, size_t size, uint8_t* dst) override
-  {
-    lseek(_file.desc(), offset, SEEK_SET);
-
-    // Clamp length to available data
-    auto const read_size = std::min(size, _file.size() - offset);
+  private:
+  size_t _map_offset = 0;
+  size_t _map_size   = 0;
+  void* _map_addr    = nullptr;
 
-    CUDF_EXPECTS(read(_file.desc(), dst, read_size) == static_cast<ssize_t>(read_size),
-                 "read failed");
-    return read_size;
-  }
+  size_t _reg_offset = 0;
+  size_t _reg_size   = 0;
+  void* _reg_addr    = nullptr;
 };
 
 /**
@@ -431,16 +461,21 @@ class user_datasource_wrapper : public datasource {
 
 std::unique_ptr<datasource> datasource::create(std::string const& filepath,
                                                size_t offset,
-                                               size_t size)
+                                               size_t max_size_estimate,
+                                               size_t min_size_estimate)
 {
+  CUDF_EXPECTS(max_size_estimate == 0 or min_size_estimate <= max_size_estimate,
+               "Invalid min/max size estimates for datasource creation");
+
 #ifdef CUFILE_FOUND
   if (cufile_integration::is_always_enabled()) {
     // avoid mmap as GDS is expected to be used for most reads
-    return std::make_unique<direct_read_source>(filepath.c_str());
+    return std::make_unique<file_source>(filepath.c_str());
   }
 #endif
 
   // Use our own memory mapping implementation for direct file reads
-  return std::make_unique<memory_mapped_source>(filepath.c_str(), offset, size);
+  return std::make_unique<memory_mapped_source>(
+    filepath.c_str(), offset, max_size_estimate, min_size_estimate);
size); + return std::make_unique( + filepath.c_str(), offset, max_size_estimate, min_size_estimate); } std::unique_ptr datasource::create(host_buffer const& buffer) diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index dc14824d834..0028dd946e3 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -2516,4 +2516,39 @@ TEST_F(CsvReaderTest, UTF8BOM) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result_view, expected); } +void expect_buffers_equal(cudf::io::datasource::buffer* lhs, cudf::io::datasource::buffer* rhs) +{ + ASSERT_EQ(lhs->size(), rhs->size()); + EXPECT_EQ(0, std::memcmp(lhs->data(), rhs->data(), lhs->size())); +} + +TEST_F(CsvReaderTest, OutOfMapBoundsReads) +{ + // write a lot of data into a file + auto filepath = temp_env->get_temp_dir() + "OutOfMapBoundsReads.csv"; + auto const num_rows = 1 << 20; + auto const row = std::string{"0,1,2,3,4,5,6,7,8,9\n"}; + auto const file_size = num_rows * row.size(); + { + std::ofstream outfile(filepath, std::ofstream::out); + for (size_t i = 0; i < num_rows; ++i) { + outfile << row; + } + } + + // Only memory map the middle of the file + auto source = cudf::io::datasource::create(filepath, file_size / 2, file_size / 4); + auto full_source = cudf::io::datasource::create(filepath); + auto const all_data = source->host_read(0, file_size); + auto ref_data = full_source->host_read(0, file_size); + expect_buffers_equal(ref_data.get(), all_data.get()); + + auto const start_data = source->host_read(file_size / 2, file_size / 2); + expect_buffers_equal(full_source->host_read(file_size / 2, file_size / 2).get(), + start_data.get()); + + auto const end_data = source->host_read(0, file_size / 2 + 512); + expect_buffers_equal(full_source->host_read(0, file_size / 2 + 512).get(), end_data.get()); +} + CUDF_TEST_PROGRAM_MAIN() From d15bbfdded7181fdc23d33fa5efae181b4af2e2b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 4 Oct 2024 07:45:54 -1000 Subject: [PATCH 017/117] Allow melt(var_name=) to be a falsy label (#16981) closes https://github.com/rapidsai/cudf/issues/16972 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16981 --- python/cudf/cudf/core/reshape.py | 2 +- python/cudf/cudf/tests/test_reshape.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 6e5abb2b82b..3d132c92d54 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -681,7 +681,7 @@ def _tile(A, reps): nval = len(value_vars) dtype = min_unsigned_type(nval) - if not var_name: + if var_name is None: var_name = "variable" if not value_vars: diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 4235affd4d1..3adbe1d2a74 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -119,6 +119,15 @@ def test_melt_str_scalar_id_var(): assert_eq(result, expected) +def test_melt_falsy_var_name(): + df = cudf.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5], "C": [2, 4, 6]}) + result = cudf.melt(df, id_vars=["A"], value_vars=["B"], var_name="") + expected = pd.melt( + df.to_pandas(), id_vars=["A"], value_vars=["B"], var_name="" + ) + assert_eq(result, expected) + + @pytest.mark.parametrize("num_cols", [1, 2, 10]) @pytest.mark.parametrize("num_rows", [1, 2, 1000]) @pytest.mark.parametrize( From 
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Fri, 4 Oct 2024 14:06:23 -0400
Subject: [PATCH 018/117] [FEA] Migrate nvtext/edit_distance APIs to pylibcudf
 (#16957)

Part of #15162. This PR migrates `edit_distance.pxd` to pylibcudf.

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Yunsong Wang (https://github.com/PointKernel)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/16957
---
 cpp/include/nvtext/edit_distance.hpp          |  2 +-
 .../user_guide/api_docs/pylibcudf/index.rst   |  1 +
 .../pylibcudf/nvtext/edit_distance.rst        |  6 ++
 .../api_docs/pylibcudf/nvtext/index.rst       |  7 +++
 .../cudf/cudf/_lib/nvtext/edit_distance.pyx   | 34 +++-------
 python/pylibcudf/pylibcudf/CMakeLists.txt     |  1 +
 python/pylibcudf/pylibcudf/__init__.pxd       |  2 +
 python/pylibcudf/pylibcudf/__init__.py        |  2 +
 .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt | 22 +++++++
 .../pylibcudf/pylibcudf/nvtext/__init__.pxd   |  7 +++
 python/pylibcudf/pylibcudf/nvtext/__init__.py |  7 +++
 .../pylibcudf/nvtext/edit_distance.pxd        |  8 +++
 .../pylibcudf/nvtext/edit_distance.pyx        | 63 +++++++++++++++++++
 .../tests/test_nvtext_edit_distance.py        | 34 ++++++++++
 14 files changed, 171 insertions(+), 25 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/__init__.pxd
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/__init__.py
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py

diff --git a/cpp/include/nvtext/edit_distance.hpp b/cpp/include/nvtext/edit_distance.hpp
index 723ba310a1e..dca590baebf 100644
--- a/cpp/include/nvtext/edit_distance.hpp
+++ b/cpp/include/nvtext/edit_distance.hpp
@@ -57,7 +57,7 @@ namespace CUDF_EXPORT nvtext {
 * @param targets Strings to compute edit distance against `input`
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned column's device memory
- * @return New strings columns of with replaced strings
+ * @return New lists column of edit distance values
 */
std::unique_ptr<cudf::column> edit_distance(
  cudf::strings_column_view const& input,
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index e21536e2e97..052479d6720 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -49,3 +49,4 @@ This page provides API documentation for pylibcudf.
 
    io/index.rst
    strings/index.rst
+   nvtext/index.rst
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst
new file mode 100644
index 00000000000..abb45e426a8
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst
@@ -0,0 +1,6 @@
+=============
+edit_distance
+=============
+
+.. automodule:: pylibcudf.nvtext.edit_distance
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
new file mode 100644
index 00000000000..b5cd5ee42c3
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
@@ -0,0 +1,7 @@
+nvtext
+======
+
+.. toctree::
+   :maxdepth: 1
+
+   edit_distance
diff --git a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx
index e3c2273345a..3dd99c42d76 100644
--- a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx
+++ b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx
@@ -2,37 +2,23 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.nvtext.edit_distance cimport (
-    edit_distance as cpp_edit_distance,
-    edit_distance_matrix as cpp_edit_distance_matrix,
-)
+from pylibcudf cimport nvtext
 
 from cudf._lib.column cimport Column
 
 
 @acquire_spill_lock()
 def edit_distance(Column strings, Column targets):
-    cdef column_view c_strings = strings.view()
-    cdef column_view c_targets = targets.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(cpp_edit_distance(c_strings, c_targets))
-
-    return Column.from_unique_ptr(move(c_result))
+    result = nvtext.edit_distance.edit_distance(
+        strings.to_pylibcudf(mode="read"),
+        targets.to_pylibcudf(mode="read")
+    )
+    return Column.from_pylibcudf(result)
 
 
 @acquire_spill_lock()
 def edit_distance_matrix(Column strings):
-    cdef column_view c_strings = strings.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(cpp_edit_distance_matrix(c_strings))
-
-    return Column.from_unique_ptr(move(c_result))
+    result = nvtext.edit_distance.edit_distance_matrix(
+        strings.to_pylibcudf(mode="read")
+    )
+    return Column.from_pylibcudf(result)
diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt
index a7cb66d7b16..1d72eacac12 100644
--- a/python/pylibcudf/pylibcudf/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/CMakeLists.txt
@@ -66,3 +66,4 @@ target_link_libraries(pylibcudf_interop PUBLIC nanoarrow)
 add_subdirectory(libcudf)
 add_subdirectory(strings)
 add_subdirectory(io)
+add_subdirectory(nvtext)
diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd
index a384edd456d..b98b37fe0fd 100644
--- a/python/pylibcudf/pylibcudf/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/__init__.pxd
@@ -17,6 +17,7 @@ from . cimport (
cimport ( lists, merge, null_mask, + nvtext, partitioning, quantiles, reduce, @@ -78,4 +79,5 @@ __all__ = [ "transpose", "types", "unary", + "nvtext", ] diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index 2a5365e8fad..304f27be340 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -28,6 +28,7 @@ lists, merge, null_mask, + nvtext, partitioning, quantiles, reduce, @@ -92,4 +93,5 @@ "transpose", "types", "unary", + "nvtext", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt new file mode 100644 index 00000000000..ebe1fda1f12 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources edit_distance.pyx) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_nvtext_ ASSOCIATED_TARGETS cudf +) diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd new file mode 100644 index 00000000000..82f7c425b1d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . cimport edit_distance + +__all__ = [ + "edit_distance", +] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py new file mode 100644 index 00000000000..986652a241f --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . import edit_distance + +__all__ = [ + "edit_distance", +] diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd new file mode 100644 index 00000000000..446b95afabb --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column edit_distance(Column input, Column targets) + +cpdef Column edit_distance_matrix(Column input) diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx new file mode 100644 index 00000000000..fc98ccbc50c --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx @@ -0,0 +1,63 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
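+#
+# A minimal usage sketch of the functions defined below (the pyarrow
+# interop helpers are borrowed from this PR's tests, not this module):
+#
+#   import pyarrow as pa
+#   import pylibcudf as plc
+#
+#   lhs = plc.interop.from_arrow(pa.array(["hallo", "goodbye", "world"]))
+#   rhs = plc.interop.from_arrow(pa.array(["hello", "", "world"]))
+#   dists = plc.nvtext.edit_distance.edit_distance(lhs, rhs)  # [1, 7, 0]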
+ +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.edit_distance cimport ( + edit_distance as cpp_edit_distance, + edit_distance_matrix as cpp_edit_distance_matrix, +) + + +cpdef Column edit_distance(Column input, Column targets): + """ + Returns the edit distance between individual strings in two strings columns + + For details, see :cpp:func:`edit_distance` + + Parameters + ---------- + input : Column + Input strings + targets : Column + Strings to compute edit distance against + + Returns + ------- + Column + New column of edit distance values + """ + cdef column_view c_strings = input.view() + cdef column_view c_targets = targets.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_edit_distance(c_strings, c_targets)) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column edit_distance_matrix(Column input): + """ + Returns the edit distance between all strings in the input strings column + + For details, see :cpp:func:`edit_distance_matrix` + + Parameters + ---------- + input : Column + Input strings + + Returns + ------- + Column + New column of edit distance values + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_edit_distance_matrix(c_strings)) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py new file mode 100644 index 00000000000..7d93c471cc4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture(scope="module") +def edit_distance_data(): + arr1 = ["hallo", "goodbye", "world"] + arr2 = ["hello", "", "world"] + return pa.array(arr1), pa.array(arr2) + + +def test_edit_distance(edit_distance_data): + input_col, targets = edit_distance_data + result = plc.nvtext.edit_distance.edit_distance( + plc.interop.from_arrow(input_col), + plc.interop.from_arrow(targets), + ) + expected = pa.array([1, 7, 0], type=pa.int32()) + assert_column_eq(result, expected) + + +def test_edit_distance_matrix(edit_distance_data): + input_col, _ = edit_distance_data + result = plc.nvtext.edit_distance.edit_distance_matrix( + plc.interop.from_arrow(input_col) + ) + expected = pa.array( + [[0, 7, 4], [7, 0, 6], [4, 6, 0]], type=pa.list_(pa.int32()) + ) + assert_column_eq(expected, result) From efaa0b50c6ffd15c6506847987cb531e5f6ba955 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 4 Oct 2024 08:20:34 -1000 Subject: [PATCH 019/117] Add string.convert.convert_datetime/convert_booleans APIs to pylibcudf (#16971) Contributes to https://github.com/rapidsai/cudf/issues/15162 Also address a review in https://github.com/rapidsai/cudf/pull/16935#discussion_r1783726477 This also modifies some `format` arguments in `convert_datetime.pyx` to accept `str` instead of `bytes` (`const string&`) to align more with Python. 
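For example, with the new signatures the format pattern is passed directly as a
Python str (a sketch mirroring this PR's tests; the interop calls come from
those tests):

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array(["2020-01-01T01:01:01"]))
    result = plc.strings.convert.convert_datetime.to_timestamps(
        col,
        plc.DataType(plc.TypeId.TIMESTAMP_SECONDS),
        "%Y-%m-%dT%H:%M:%S",
    )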
Let me know if you prefer to change this back Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16971 --- python/cudf/cudf/_lib/string_casting.pyx | 110 +++--------------- python/cudf_polars/cudf_polars/dsl/expr.py | 4 +- .../strings/convert/convert_booleans.pxd | 4 +- .../strings/convert/convert_datetime.pxd | 6 +- .../pylibcudf/strings/convert/CMakeLists.txt | 2 +- .../pylibcudf/strings/convert/__init__.pxd | 2 +- .../pylibcudf/strings/convert/__init__.py | 2 +- .../strings/convert/convert_booleans.pxd | 9 ++ .../strings/convert/convert_booleans.pyx | 91 +++++++++++++++ .../strings/convert/convert_datetime.pxd | 11 +- .../strings/convert/convert_datetime.pyx | 82 +++++++++++-- .../pylibcudf/tests/test_string_convert.py | 2 +- .../tests/test_string_convert_booleans.py | 26 +++++ .../tests/test_string_convert_datetime.py | 46 ++++++++ .../pylibcudf/tests/test_string_wrap.py | 5 +- 15 files changed, 286 insertions(+), 116 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 60a6795a402..55ff38f472d 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -3,9 +3,6 @@ from cudf._lib.column cimport Column from cudf._lib.scalar import as_device_scalar - -from cudf._lib.scalar cimport DeviceScalar - from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES from libcpp.memory cimport unique_ptr @@ -14,14 +11,6 @@ from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.convert.convert_booleans cimport ( - from_booleans as cpp_from_booleans, - to_booleans as cpp_to_booleans, -) -from pylibcudf.libcudf.strings.convert.convert_datetime cimport ( - is_timestamp as cpp_is_timestamp, -) from pylibcudf.libcudf.strings.convert.convert_floats cimport ( from_floats as cpp_from_floats, to_floats as cpp_to_floats, @@ -427,77 +416,21 @@ def stoul(Column input_col): return string_to_integer(input_col, cudf.dtype("uint64")) -def _to_booleans(Column input_col, object string_true="True"): - """ - Converting/Casting input column of type string to boolean column - - Parameters - ---------- - input_col : input column of type string - string_true : string that represents True - - Returns - ------- - A Column with string values cast to boolean - """ - - cdef DeviceScalar str_true = as_device_scalar(string_true) - cdef column_view input_column_view = input_col.view() - cdef const string_scalar* string_scalar_true = ( - str_true.get_raw_ptr()) - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_to_booleans( - input_column_view, - string_scalar_true[0])) - - return Column.from_unique_ptr(move(c_result)) - - def to_booleans(Column input_col): - - return _to_booleans(input_col) - - -def _from_booleans( - Column input_col, - object string_true="True", - object string_false="False"): - """ - Converting/Casting input column of type boolean to string column - - Parameters - 
---------- - input_col : input column of type boolean - string_true : string that represents True - string_false : string that represents False - - Returns - ------- - A Column with boolean values cast to string - """ - - cdef DeviceScalar str_true = as_device_scalar(string_true) - cdef DeviceScalar str_false = as_device_scalar(string_false) - cdef column_view input_column_view = input_col.view() - cdef const string_scalar* string_scalar_true = ( - str_true.get_raw_ptr()) - cdef const string_scalar* string_scalar_false = ( - str_false.get_raw_ptr()) - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_booleans( - input_column_view, - string_scalar_true[0], - string_scalar_false[0])) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_booleans.to_booleans( + input_col.to_pylibcudf(mode="read"), + as_device_scalar("True").c_value, + ) + return Column.from_pylibcudf(plc_column) def from_booleans(Column input_col): - return _from_booleans(input_col) + plc_column = plc.strings.convert.convert_booleans.from_booleans( + input_col.to_pylibcudf(mode="read"), + as_device_scalar("True").c_value, + as_device_scalar("False").c_value, + ) + return Column.from_pylibcudf(plc_column) def int2timestamp( @@ -520,11 +453,10 @@ def int2timestamp( A Column with date-time represented in string format """ - cdef string c_timestamp_format = format.encode("UTF-8") return Column.from_pylibcudf( plc.strings.convert.convert_datetime.from_timestamps( input_col.to_pylibcudf(mode="read"), - c_timestamp_format, + format, names.to_pylibcudf(mode="read") ) ) @@ -545,12 +477,11 @@ def timestamp2int(Column input_col, dtype, format): """ dtype = dtype_to_pylibcudf_type(dtype) - cdef string c_timestamp_format = format.encode('UTF-8') return Column.from_pylibcudf( plc.strings.convert.convert_datetime.to_timestamps( input_col.to_pylibcudf(mode="read"), dtype, - c_timestamp_format + format ) ) @@ -572,16 +503,11 @@ def istimestamp(Column input_col, str format): """ if input_col.size == 0: return cudf.core.column.column_empty(0, dtype=cudf.dtype("bool")) - cdef column_view input_column_view = input_col.view() - cdef string c_timestamp_format = str(format).encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_is_timestamp( - input_column_view, - c_timestamp_format)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_datetime.is_timestamp( + input_col.to_pylibcudf(mode="read"), + format + ) + return Column.from_pylibcudf(plc_column) def timedelta2int(Column input_col, dtype, format): diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index c401e5a2f17..54476b7fedc 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -914,7 +914,7 @@ def do_evaluate( col = self.children[0].evaluate(df, context=context, mapping=mapping) is_timestamps = plc.strings.convert.convert_datetime.is_timestamp( - col.obj, format.encode() + col.obj, format ) if strict: @@ -937,7 +937,7 @@ def do_evaluate( ) return Column( plc.strings.convert.convert_datetime.to_timestamps( - res.columns()[0], self.dtype, format.encode() + res.columns()[0], self.dtype, format ) ) elif self.name == pl_expr.StringFunction.Replace: diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd index 83a9573baad..e6688cfff81 100644 --- 
a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd @@ -8,10 +8,10 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar cdef extern from "cudf/strings/convert/convert_booleans.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_booleans( - column_view input_col, + column_view input, string_scalar true_string) except + cdef unique_ptr[column] from_booleans( - column_view input_col, + column_view booleans, string_scalar true_string, string_scalar false_string) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd index fa8975c4df9..fceddd58df0 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd @@ -10,14 +10,14 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_datetime.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_timestamps( - column_view input_col, + column_view input, data_type timestamp_type, string format) except + cdef unique_ptr[column] from_timestamps( - column_view input_col, + column_view timestamps, string format, - column_view input_strings_names) except + + column_view names) except + cdef unique_ptr[column] is_timestamp( column_view input_col, diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index 175c9b3738e..3febc78dfd2 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources convert_durations.pyx convert_datetime.pyx) +set(cython_sources convert_booleans.pyx convert_durations.pyx convert_datetime.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index 05324cb49df..5525bca46d6 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -1,2 +1,2 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport convert_datetime, convert_durations +from . cimport convert_booleans, convert_datetime, convert_durations diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index d803399d53c..2340ebe9a26 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -1,2 +1,2 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import convert_datetime, convert_durations +from . import convert_booleans, convert_datetime, convert_durations diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd new file mode 100644 index 00000000000..312ac3c0ca0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
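+#
+# A minimal usage sketch of these declarations (the pyarrow interop
+# helpers come from this PR's tests, not from this module):
+#
+#   import pyarrow as pa
+#   import pylibcudf as plc
+#
+#   col = plc.interop.from_arrow(pa.array(["true", "True"]))
+#   flags = plc.strings.convert.convert_booleans.to_booleans(
+#       col, plc.interop.from_arrow(pa.scalar("True"))
+#   )  # only exact "True" matches -> [False, True]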
+ +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar + + +cpdef Column to_booleans(Column input, Scalar true_string) + +cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_string) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx new file mode 100644 index 00000000000..0c10f821ab6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx @@ -0,0 +1,91 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.convert cimport ( + convert_booleans as cpp_convert_booleans, +) +from pylibcudf.scalar cimport Scalar + +from cython.operator import dereference + + +cpdef Column to_booleans(Column input, Scalar true_string): + """ + Returns a new bool column by parsing boolean values from the strings + in the provided strings column. + + For details, see :cpp:func:`cudf::strings::to_booleans`. + + Parameters + ---------- + input : Column + Strings instance for this operation + + true_string : Scalar + String to expect for true. Non-matching strings are false + + Returns + ------- + Column + New bool column converted from strings. + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_true_string = ( + true_string.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_convert_booleans.to_booleans( + input.view(), + dereference(c_true_string) + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_string): + """ + Returns a new strings column converting the boolean values from the + provided column into strings. + + For details, see :cpp:func:`cudf::strings::from_booleans`. + + Parameters + ---------- + booleans : Column + Boolean column to convert. + + true_string : Scalar + String to use for true in the output column. + + false_string : Scalar + String to use for false in the output column. + + Returns + ------- + Column + New strings column. 
+ """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_true_string = ( + true_string.c_obj.get() + ) + cdef const string_scalar* c_false_string = ( + false_string.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_convert_booleans.from_booleans( + booleans.view(), + dereference(c_true_string), + dereference(c_false_string), + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd index 07c84d263d6..80ec168644b 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd @@ -8,11 +8,16 @@ from pylibcudf.types cimport DataType cpdef Column to_timestamps( Column input, DataType timestamp_type, - const string& format + str format ) cpdef Column from_timestamps( - Column input, - const string& format, + Column timestamps, + str format, Column input_strings_names ) + +cpdef Column is_timestamp( + Column input, + str format, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx index fcacb096f87..0ee60812e00 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx @@ -15,28 +15,74 @@ from pylibcudf.types import DataType cpdef Column to_timestamps( Column input, DataType timestamp_type, - const string& format + str format ): + """ + Returns a new timestamp column converting a strings column into + timestamps using the provided format pattern. + + For details, see cpp:`cudf::strings::to_timestamps`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + timestamp_type : DataType + The timestamp type used for creating the output column. + + format : str + String specifying the timestamp format in strings. + + Returns + ------- + Column + New datetime column + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.to_timestamps( input.view(), timestamp_type.c_obj, - format + c_format ) return Column.from_libcudf(move(c_result)) cpdef Column from_timestamps( - Column input, - const string& format, + Column timestamps, + str format, Column input_strings_names ): + """ + Returns a new strings column converting a timestamp column into + strings using the provided format pattern. + + For details, see cpp:`cudf::strings::from_timestamps`. + + Parameters + ---------- + timestamps : Column + Timestamp values to convert + + format : str + The string specifying output format. + + input_strings_names : Column + The string names to use for weekdays ("%a", "%A") and months ("%b", "%B"). + + Returns + ------- + Column + New strings column with formatted timestamps. + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.from_timestamps( - input.view(), - format, + timestamps.view(), + c_format, input_strings_names.view() ) @@ -44,13 +90,33 @@ cpdef Column from_timestamps( cpdef Column is_timestamp( Column input, - const string& format + str format ): + """ + Verifies the given strings column can be parsed to timestamps + using the provided format pattern. + + For details, see cpp:`cudf::strings::is_timestamp`. + + Parameters + ---------- + input : Column + Strings instance for this operation. 
+ + format : str + String specifying the timestamp format in strings. + + Returns + ------- + Column + New bool column. + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.is_timestamp( input.view(), - format + c_format ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert.py b/python/pylibcudf/pylibcudf/tests/test_string_convert.py index e9e95459d0e..22bb4971cb1 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert.py @@ -62,7 +62,7 @@ def test_to_datetime( got = plc.strings.convert.convert_datetime.to_timestamps( plc_timestamp_col, plc.interop.from_arrow(timestamp_type), - format.encode(), + format, ) assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py new file mode 100644 index 00000000000..117c59ff1b8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_to_booleans(): + pa_array = pa.array(["true", None, "True"]) + result = plc.strings.convert.convert_booleans.to_booleans( + plc.interop.from_arrow(pa_array), + plc.interop.from_arrow(pa.scalar("True")), + ) + expected = pa.array([False, None, True]) + assert_column_eq(result, expected) + + +def test_from_booleans(): + pa_array = pa.array([True, None, False]) + result = plc.strings.convert.convert_booleans.from_booleans( + plc.interop.from_arrow(pa_array), + plc.interop.from_arrow(pa.scalar("A")), + plc.interop.from_arrow(pa.scalar("B")), + ) + expected = pa.array(["A", None, "B"]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py new file mode 100644 index 00000000000..f3e84286a36 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py @@ -0,0 +1,46 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
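+#
+# Round-trip coverage for the str-based format arguments: to_timestamps
+# parses strings with an explicit pattern, from_timestamps formats the
+# timestamps back, and is_timestamp flags which strings match the pattern.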
+import datetime + +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture +def fmt(): + return "%Y-%m-%dT%H:%M:%S" + + +def test_to_timestamp(fmt): + arr = pa.array(["2020-01-01T01:01:01", None]) + result = plc.strings.convert.convert_datetime.to_timestamps( + plc.interop.from_arrow(arr), + plc.DataType(plc.TypeId.TIMESTAMP_SECONDS), + fmt, + ) + expected = pc.strptime(arr, fmt, "s") + assert_column_eq(result, expected) + + +def test_from_timestamp(fmt): + arr = pa.array([datetime.datetime(2020, 1, 1, 1, 1, 1), None]) + result = plc.strings.convert.convert_datetime.from_timestamps( + plc.interop.from_arrow(arr), + fmt, + plc.interop.from_arrow(pa.array([], type=pa.string())), + ) + # pc.strftime will add the extra %f + expected = pa.array(["2020-01-01T01:01:01", None]) + assert_column_eq(result, expected) + + +def test_is_timestamp(fmt): + arr = pa.array(["2020-01-01T01:01:01", None, "2020-01-01"]) + result = plc.strings.convert.convert_datetime.is_timestamp( + plc.interop.from_arrow(arr), + fmt, + ) + expected = pa.array([True, None, False]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py index 85abd3a2bae..a1c820cd586 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py @@ -7,6 +7,7 @@ def test_wrap(): + width = 12 pa_array = pa.array( [ "the quick brown fox jumped over the lazy brown dog", @@ -14,10 +15,10 @@ def test_wrap(): None, ] ) - result = plc.strings.wrap.wrap(plc.interop.from_arrow(pa_array), 12) + result = plc.strings.wrap.wrap(plc.interop.from_arrow(pa_array), width) expected = pa.array( [ - textwrap.fill(val, 12) if isinstance(val, str) else val + textwrap.fill(val, width) if isinstance(val, str) else val for val in pa_array.to_pylist() ] ) From a8da1ff2b393abbafa27dddcf4c19481ec853c28 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 4 Oct 2024 12:11:31 -0700 Subject: [PATCH 020/117] Deprecate support for directly accessing logger (#16964) This PR removes support for accessing cudf's underlying spdlog logger directly. Contributes to https://github.com/rapidsai/build-planning/issues/104 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/16964 --- cpp/include/cudf/detail/utilities/logger.hpp | 14 ++++---- cpp/include/cudf/utilities/logger.hpp | 8 ++++- cpp/src/utilities/logger.cpp | 4 ++- cpp/tests/utilities_tests/logger_tests.cpp | 37 ++++++++++---------- 4 files changed, 36 insertions(+), 27 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/logger.hpp b/cpp/include/cudf/detail/utilities/logger.hpp index 8c1c3c28df8..e7643eb44bd 100644 --- a/cpp/include/cudf/detail/utilities/logger.hpp +++ b/cpp/include/cudf/detail/utilities/logger.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,9 +19,9 @@ #include // Log messages that require computation should only be used at level TRACE and DEBUG -#define CUDF_LOG_TRACE(...) 
SPDLOG_LOGGER_TRACE(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_INFO(...) SPDLOG_LOGGER_INFO(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_WARN(...) SPDLOG_LOGGER_WARN(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&cudf::logger(), __VA_ARGS__) +#define CUDF_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_INFO(...) SPDLOG_LOGGER_INFO(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_WARN(...) SPDLOG_LOGGER_WARN(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&cudf::detail::logger(), __VA_ARGS__) diff --git a/cpp/include/cudf/utilities/logger.hpp b/cpp/include/cudf/utilities/logger.hpp index 45d5d1b12e1..982554a23f5 100644 --- a/cpp/include/cudf/utilities/logger.hpp +++ b/cpp/include/cudf/utilities/logger.hpp @@ -22,6 +22,10 @@ namespace CUDF_EXPORT cudf { +namespace detail { +spdlog::logger& logger(); +} + /** * @brief Returns the global logger. * @@ -43,6 +47,8 @@ namespace CUDF_EXPORT cudf { * * @return spdlog::logger& The logger. */ -spdlog::logger& logger(); +[[deprecated( + "Support for direct access to spdlog loggers in cudf is planned for removal")]] spdlog::logger& +logger(); } // namespace CUDF_EXPORT cudf diff --git a/cpp/src/utilities/logger.cpp b/cpp/src/utilities/logger.cpp index d54f5677c4c..e52fffbd8c6 100644 --- a/cpp/src/utilities/logger.cpp +++ b/cpp/src/utilities/logger.cpp @@ -74,8 +74,10 @@ struct logger_wrapper { } // namespace -spdlog::logger& cudf::logger() +spdlog::logger& cudf::detail::logger() { static logger_wrapper wrapped{}; return wrapped.logger_; } + +spdlog::logger& cudf::logger() { return cudf::detail::logger(); } diff --git a/cpp/tests/utilities_tests/logger_tests.cpp b/cpp/tests/utilities_tests/logger_tests.cpp index d052e20eedb..cfab570833b 100644 --- a/cpp/tests/utilities_tests/logger_tests.cpp +++ b/cpp/tests/utilities_tests/logger_tests.cpp @@ -28,16 +28,17 @@ class LoggerTest : public cudf::test::BaseFixture { std::vector prev_sinks; public: - LoggerTest() : prev_level{cudf::logger().level()}, prev_sinks{cudf::logger().sinks()} + LoggerTest() + : prev_level{cudf::detail::logger().level()}, prev_sinks{cudf::detail::logger().sinks()} { - cudf::logger().sinks() = {std::make_shared(oss)}; - cudf::logger().set_formatter( + cudf::detail::logger().sinks() = {std::make_shared(oss)}; + cudf::detail::logger().set_formatter( std::unique_ptr(new spdlog::pattern_formatter("%v"))); } ~LoggerTest() override { - cudf::logger().set_level(prev_level); - cudf::logger().sinks() = prev_sinks; + cudf::detail::logger().set_level(prev_level); + cudf::detail::logger().sinks() = prev_sinks; } void clear_sink() { oss.str(""); } @@ -46,32 +47,32 @@ class LoggerTest : public cudf::test::BaseFixture { TEST_F(LoggerTest, Basic) { - cudf::logger().critical("crit msg"); + cudf::detail::logger().critical("crit msg"); ASSERT_EQ(this->sink_content(), "crit msg\n"); } TEST_F(LoggerTest, DefaultLevel) { - cudf::logger().trace("trace"); - cudf::logger().debug("debug"); - cudf::logger().info("info"); - cudf::logger().warn("warn"); - cudf::logger().error("error"); - cudf::logger().critical("critical"); + 
cudf::detail::logger().trace("trace"); + cudf::detail::logger().debug("debug"); + cudf::detail::logger().info("info"); + cudf::detail::logger().warn("warn"); + cudf::detail::logger().error("error"); + cudf::detail::logger().critical("critical"); ASSERT_EQ(this->sink_content(), "warn\nerror\ncritical\n"); } TEST_F(LoggerTest, CustomLevel) { - cudf::logger().set_level(spdlog::level::warn); - cudf::logger().info("info"); - cudf::logger().warn("warn"); + cudf::detail::logger().set_level(spdlog::level::warn); + cudf::detail::logger().info("info"); + cudf::detail::logger().warn("warn"); ASSERT_EQ(this->sink_content(), "warn\n"); this->clear_sink(); - cudf::logger().set_level(spdlog::level::debug); - cudf::logger().trace("trace"); - cudf::logger().debug("debug"); + cudf::detail::logger().set_level(spdlog::level::debug); + cudf::detail::logger().trace("trace"); + cudf::detail::logger().debug("debug"); ASSERT_EQ(this->sink_content(), "debug\n"); } From 119aa9d9c5cffc2de460f52f11fb4a78f8b51dce Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 4 Oct 2024 12:08:26 -1000 Subject: [PATCH 021/117] Add string.convert.convert_fixed_type APIs to pylibcudf (#16984) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16984 --- .../strings/convert/convert_fixed_point.pyx | 69 +++-------- .../strings/convert/convert_fixed_point.pxd | 8 +- .../pylibcudf/strings/convert/CMakeLists.txt | 4 +- .../pylibcudf/strings/convert/__init__.pxd | 7 +- .../pylibcudf/strings/convert/__init__.py | 7 +- .../strings/convert/convert_fixed_point.pxd | 11 ++ .../strings/convert/convert_fixed_point.pyx | 107 ++++++++++++++++++ .../tests/test_string_convert_fixed_point.py | 34 ++++++ 8 files changed, 188 insertions(+), 59 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx index a8df8c9a92c..96dcd021c3b 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx @@ -1,22 +1,11 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -import cudf - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_fixed_point cimport ( - from_fixed_point as cpp_from_fixed_point, - is_fixed_point as cpp_is_fixed_point, - to_fixed_point as cpp_to_fixed_point, -) -from pylibcudf.libcudf.types cimport data_type, type_id - from cudf._lib.column cimport Column +from cudf._lib.types cimport dtype_to_pylibcudf_type + +import pylibcudf as plc @acquire_spill_lock() @@ -32,14 +21,10 @@ def from_decimal(Column input_col): ------- A column of strings representing the input decimal values. 
""" - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_fixed_point( - input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.from_fixed_point( + input_col.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -57,25 +42,11 @@ def to_decimal(Column input_col, object out_type): ------- A column of decimals parsed from the string values. """ - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef int scale = out_type.scale - cdef data_type c_out_type - if isinstance(out_type, cudf.Decimal32Dtype): - c_out_type = data_type(type_id.DECIMAL32, -scale) - elif isinstance(out_type, cudf.Decimal64Dtype): - c_out_type = data_type(type_id.DECIMAL64, -scale) - elif isinstance(out_type, cudf.Decimal128Dtype): - c_out_type = data_type(type_id.DECIMAL128, -scale) - else: - raise TypeError("should be a decimal dtype") - with nogil: - c_result = move( - cpp_to_fixed_point( - input_column_view, - c_out_type)) - - result = Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point( + input_col.to_pylibcudf(mode="read"), + dtype_to_pylibcudf_type(out_type), + ) + result = Column.from_pylibcudf(plc_column) result.dtype.precision = out_type.precision return result @@ -98,14 +69,8 @@ def is_fixed_point(Column input_col, object dtype): ------- A Column of booleans indicating valid decimal conversion. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = input_col.view() - cdef int scale = dtype.scale - cdef data_type c_dtype = data_type(type_id.DECIMAL64, -scale) - with nogil: - c_result = move(cpp_is_fixed_point( - source_view, - c_dtype - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.is_fixed_point( + input_col.to_pylibcudf(mode="read"), + dtype_to_pylibcudf_type(dtype), + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd index 6f820f3c9a4..72ab329f2dd 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd @@ -9,13 +9,13 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_fixed_point.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_fixed_point( - column_view input_col, + column_view input, data_type output_type) except + cdef unique_ptr[column] from_fixed_point( - column_view input_col) except + + column_view input) except + cdef unique_ptr[column] is_fixed_point( - column_view source_strings, - data_type output_type + column_view input, + data_type decimal_type ) except + diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index 3febc78dfd2..fe8da975566 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -12,7 +12,9 @@ # the License. 
# ============================================================================= -set(cython_sources convert_booleans.pyx convert_durations.pyx convert_datetime.pyx) +set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx + convert_fixed_point.pyx +) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index 5525bca46d6..36abf463371 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -1,2 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport convert_booleans, convert_datetime, convert_durations +from . cimport ( + convert_booleans, + convert_datetime, + convert_durations, + convert_fixed_point, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index 2340ebe9a26..c0be4093836 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -1,2 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import convert_booleans, convert_datetime, convert_durations +from . import ( + convert_booleans, + convert_datetime, + convert_durations, + convert_fixed_point, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd new file mode 100644 index 00000000000..049b9b3fffe --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_fixed_point(Column input, DataType output_type) + +cpdef Column from_fixed_point(Column input) + +cpdef Column is_fixed_point(Column input, DataType decimal_type=*) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx new file mode 100644 index 00000000000..40dadf6f967 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx @@ -0,0 +1,107 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_fixed_point as cpp_fixed_point, +) +from pylibcudf.types cimport DataType, type_id + + +cpdef Column to_fixed_point(Column input, DataType output_type): + """ + Returns a new fixed-point column parsing decimal values from the + provided strings column. + + For details, see :cpp:details:`cudf::strings::to_fixed_point` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + output_type : DataType + Type of fixed-point column to return including the scale value. + + Returns + ------- + Column + New column of output_type. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_fixed_point.to_fixed_point( + input.view(), + output_type.c_obj, + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_fixed_point(Column input): + """ + Returns a new strings column converting the fixed-point values + into a strings column. 
+ + For details, see :cpp:details:`cudf::strings::from_fixed_point` + + Parameters + ---------- + input : Column + Fixed-point column to convert. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_fixed_point.from_fixed_point( + input.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column is_fixed_point(Column input, DataType decimal_type=None): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to fixed-point. + + For details, see :cpp:details:`cudf::strings::is_fixed_point` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + decimal_type : DataType + Fixed-point type (with scale) used only for checking overflow. + Defaults to Decimal64 + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + if decimal_type is None: + decimal_type = DataType(type_id.DECIMAL64) + + with nogil: + c_result = move( + cpp_fixed_point.is_fixed_point( + input.view(), + decimal_type.c_obj, + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py new file mode 100644 index 00000000000..b1c4d729604 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import decimal + +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_to_fixed_point(): + typ = pa.decimal128(38, 2) + arr = pa.array(["123", "1.23", None]) + result = plc.strings.convert.convert_fixed_point.to_fixed_point( + plc.interop.from_arrow(arr), plc.interop.from_arrow(typ) + ) + expected = arr.cast(typ) + assert_column_eq(result, expected) + + +def test_from_fixed_point(): + arr = pa.array([decimal.Decimal("1.1"), None]) + result = plc.strings.convert.convert_fixed_point.from_fixed_point( + plc.interop.from_arrow(arr), + ) + expected = pa.array(["1.1", None]) + assert_column_eq(result, expected) + + +def test_is_fixed_point(): + arr = pa.array(["123", "1.23", "1.2.3", "", None]) + result = plc.strings.convert.convert_fixed_point.is_fixed_point( + plc.interop.from_arrow(arr), + ) + expected = pa.array([True, True, False, False, None]) + assert_column_eq(result, expected) From 77f3a5d3229ed1b3186fe9f4d5b5b04d124c6a4d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 4 Oct 2024 12:33:44 -1000 Subject: [PATCH 022/117] Add docstrings and test for strings.convert_durations APIs for pylibcudf (#16982) Contributes to https://github.com/rapidsai/cudf/issues/15162 Since the implementation already existed: * Added docstrings * Like https://github.com/rapidsai/cudf/pull/16971, made the `format` parameter accept `str` instead * Aligned parameter names closer to pylibcudf * Added missing `move`s * Moved `convert_duration` tests to `test_string_convert_duration.py` and added a new test for `from_durations` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16982 --- python/cudf/cudf/_lib/string_casting.pyx | 7 +- .../strings/convert/convert_durations.pxd | 2 +- .../strings/convert/convert_durations.pxd | 6 +- .../strings/convert/convert_durations.pyx | 73 
++++++++++++++++--- .../pylibcudf/tests/test_string_convert.py | 43 ----------- .../tests/test_string_convert_durations.py | 61 ++++++++++++++++ 6 files changed, 130 insertions(+), 62 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 55ff38f472d..fe19379bf93 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -525,12 +525,11 @@ def timedelta2int(Column input_col, dtype, format): """ dtype = dtype_to_pylibcudf_type(dtype) - cdef string c_timestamp_format = format.encode('UTF-8') return Column.from_pylibcudf( plc.strings.convert.convert_durations.to_durations( input_col.to_pylibcudf(mode="read"), dtype, - c_timestamp_format + format ) ) @@ -549,12 +548,10 @@ def int2timedelta(Column input_col, str format): A Column with Timedelta represented in string format """ - - cdef string c_duration_format = format.encode('UTF-8') return Column.from_pylibcudf( plc.strings.convert.convert_durations.from_durations( input_col.to_pylibcudf(mode="read"), - c_duration_format + format ) ) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd index ebe10574353..43ffad1d89f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd @@ -10,7 +10,7 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_durations.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_durations( - const column_view & strings_col, + const column_view & input, data_type duration_type, const string & format) except + diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd index ac11b8959ed..eecdade4ef9 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd @@ -8,10 +8,10 @@ from pylibcudf.types cimport DataType cpdef Column to_durations( Column input, DataType duration_type, - const string& format + str format ) cpdef Column from_durations( - Column input, - const string& format + Column durations, + str format=* ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx index f3e0b7c9c8e..76c5809c3d5 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx @@ -15,27 +15,80 @@ from pylibcudf.types import DataType cpdef Column to_durations( Column input, DataType duration_type, - const string& format + str format ): + """ + Returns a new duration column converting a strings column into + durations using the provided format pattern. + + For details, see cpp:func:`cudf::strings::to_durations` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + duration_type : DataType + The duration type used for creating the output column. + + format : str + String specifying the duration format in strings. + + Returns + ------- + Column + New duration column. 
+ """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() + with nogil: - c_result = cpp_convert_durations.to_durations( - input.view(), - duration_type.c_obj, - format + c_result = move( + cpp_convert_durations.to_durations( + input.view(), + duration_type.c_obj, + c_format + ) ) return Column.from_libcudf(move(c_result)) cpdef Column from_durations( - Column input, - const string& format + Column durations, + str format=None ): + """ + Returns a new strings column converting a duration column into + strings using the provided format pattern. + + For details, see cpp:func:`cudf::strings::from_durations` + + Parameters + ---------- + durations : Column + Duration values to convert. + + format : str + The string specifying output format. + Default format is "%D days %H:%M:%S". + + Returns + ------- + Column + New strings column with formatted durations. + """ cdef unique_ptr[column] c_result + + if format is None: + format = "%D days %H:%M:%S" + cdef string c_format = format.encode() + with nogil: - c_result = cpp_convert_durations.from_durations( - input.view(), - format + c_result = move( + cpp_convert_durations.from_durations( + durations.view(), + c_format + ) ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert.py b/python/pylibcudf/pylibcudf/tests/test_string_convert.py index 22bb4971cb1..69f7a0fdd33 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert.py @@ -1,7 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from datetime import datetime - import pyarrow as pa import pylibcudf as plc import pytest @@ -21,39 +19,16 @@ def timestamp_type(request): return request.param -@pytest.fixture( - scope="module", - params=[ - pa.duration("ns"), - pa.duration("us"), - pa.duration("ms"), - pa.duration("s"), - ], -) -def duration_type(request): - return request.param - - @pytest.fixture(scope="module") def pa_timestamp_col(): return pa.array(["2011-01-01", "2011-01-02", "2011-01-03"]) -@pytest.fixture(scope="module") -def pa_duration_col(): - return pa.array(["05:20:25"]) - - @pytest.fixture(scope="module") def plc_timestamp_col(pa_timestamp_col): return plc.interop.from_arrow(pa_timestamp_col) -@pytest.fixture(scope="module") -def plc_duration_col(pa_duration_col): - return plc.interop.from_arrow(pa_duration_col) - - @pytest.mark.parametrize("format", ["%Y-%m-%d"]) def test_to_datetime( pa_timestamp_col, plc_timestamp_col, timestamp_type, format @@ -65,21 +40,3 @@ def test_to_datetime( format, ) assert_column_eq(expect, got) - - -@pytest.mark.parametrize("format", ["%H:%M:%S"]) -def test_to_duration(pa_duration_col, plc_duration_col, duration_type, format): - def to_timedelta(duration_str): - date = datetime.strptime(duration_str, format) - return date - datetime(1900, 1, 1) # "%H:%M:%S" zero date - - expect = pa.array([to_timedelta(d.as_py()) for d in pa_duration_col]).cast( - duration_type - ) - - got = plc.strings.convert.convert_durations.to_durations( - plc_duration_col, - plc.interop.from_arrow(duration_type), - format.encode(), - ) - assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py new file mode 100644 index 00000000000..6d704309bfd --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py @@ -0,0 +1,61 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from datetime import datetime, timedelta + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture( + params=[ + pa.duration("ns"), + pa.duration("us"), + pa.duration("ms"), + pa.duration("s"), + ], +) +def duration_type(request): + return request.param + + +@pytest.fixture(scope="module") +def pa_duration_col(): + return pa.array(["05:20:25"]) + + +@pytest.fixture(scope="module") +def plc_duration_col(pa_duration_col): + return plc.interop.from_arrow(pa_duration_col) + + +def test_to_duration(pa_duration_col, plc_duration_col, duration_type): + format = "%H:%M:%S" + + def to_timedelta(duration_str): + date = datetime.strptime(duration_str, format) + return date - datetime(1900, 1, 1) # "%H:%M:%S" zero date + + expect = pa.array([to_timedelta(d.as_py()) for d in pa_duration_col]).cast( + duration_type + ) + + got = plc.strings.convert.convert_durations.to_durations( + plc_duration_col, + plc.interop.from_arrow(duration_type), + format, + ) + assert_column_eq(expect, got) + + +@pytest.mark.parametrize("format", [None, "%D days %H:%M:%S"]) +def test_from_durations(format): + pa_array = pa.array( + [timedelta(days=1, hours=1, minutes=1, seconds=1), None] + ) + result = plc.strings.convert.convert_durations.from_durations( + plc.interop.from_arrow(pa_array), format + ) + expected = pa.array(["1 days 01:01:01", None]) + assert_column_eq(result, expected) From c958d8e88d8c0cb149b1442ab91705853167a609 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 4 Oct 2024 18:45:30 -0500 Subject: [PATCH 023/117] Upgrade pandas pinnings to support `2.2.3` (#16882) Pandas released a newer version `2.2.3` with very minimal fixes but one that adds support for python-3.13 and numpy 2.1 compatibility. This PR updates pinnings in `cudf` to support `2.2.3`. 
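The new ceiling of `<2.2.4dev0` is what admits `2.2.3` (a quick sketch of the
specifier semantics using `packaging`, not cudf code):

    from packaging.specifiers import SpecifierSet

    assert "2.2.3" in SpecifierSet(">=2.0,<2.2.4dev0")
    assert "2.2.4" not in SpecifierSet(">=2.0,<2.2.4dev0")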
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - James Lamb (https://github.com/jameslamb) - Vyas Ramasubramani (https://github.com/vyasr) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16882 --- ci/test_python_cudf.sh | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/pylibcudf/meta.yaml | 2 +- dependencies.yaml | 6 +++++- python/cudf/cudf/core/_compat.py | 2 +- python/cudf/pyproject.toml | 2 +- python/dask_cudf/pyproject.toml | 2 +- 9 files changed, 13 insertions(+), 9 deletions(-) diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh index 2386414b32e..9528549a562 100755 --- a/ci/test_python_cudf.sh +++ b/ci/test_python_cudf.sh @@ -9,7 +9,7 @@ source ./ci/test_python_common.sh test_python_cudf rapids-logger "Check GPU usage" nvidia-smi - +rapids-print-env EXITCODE=0 trap "EXITCODE=1" ERR set +e diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 8b45d26c367..bd5e6c3d569 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -63,7 +63,7 @@ dependencies: - openpyxl - packaging - pandas -- pandas>=2.0,<2.2.3dev0 +- pandas>=2.0,<2.2.4dev0 - pandoc - polars>=1.8,<1.9 - pre-commit diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 354c1360e5a..565a3ebfa3c 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -61,7 +61,7 @@ dependencies: - openpyxl - packaging - pandas -- pandas>=2.0,<2.2.3dev0 +- pandas>=2.0,<2.2.4dev0 - pandoc - polars>=1.8,<1.9 - pre-commit diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 25e69b89789..2c254415318 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -78,7 +78,7 @@ requirements: run: - python - typing_extensions >=4.0.0 - - pandas >=2.0,<2.2.3dev0 + - pandas >=2.0,<2.2.4dev0 - cupy >=12.0.0 - numba-cuda >=0.0.13 - numpy >=1.23,<3.0a0 diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index 7c1efa0176c..3d965f30986 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -77,7 +77,7 @@ requirements: run: - python - typing_extensions >=4.0.0 - - pandas >=2.0,<2.2.3dev0 + - pandas >=2.0,<2.2.4dev0 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<18.0.0a0 - {{ pin_compatible('rmm', max_pin='x.x') }} diff --git a/dependencies.yaml b/dependencies.yaml index b192158c4ea..3561b22965d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -602,7 +602,7 @@ dependencies: packages: - fsspec>=0.6.0 - &numpy numpy>=1.23,<3.0a0 - - pandas>=2.0,<2.2.3dev0 + - pandas>=2.0,<2.2.4dev0 run_pylibcudf: common: - output_types: [conda, requirements, pyproject] @@ -748,6 +748,10 @@ dependencies: packages: - *numba-cuda-dep - pandas==2.0.* + - matrix: {dependencies: "latest"} + packages: + - numba-cuda==0.0.15 + - pandas==2.2.3 - matrix: packages: - output_types: conda diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index e2bdecbe67a..871ffc6269d 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -3,7 +3,7 @@ import pandas as pd from packaging import version -PANDAS_CURRENT_SUPPORTED_VERSION = 
version.parse("2.2.2") +PANDAS_CURRENT_SUPPORTED_VERSION = version.parse("2.2.3") PANDAS_VERSION = version.parse(pd.__version__) diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 605f9be5a49..1b730ffd13c 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -28,7 +28,7 @@ dependencies = [ "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", "packaging", - "pandas>=2.0,<2.2.3dev0", + "pandas>=2.0,<2.2.4dev0", "ptxcompiler", "pyarrow>=14.0.0,<18.0.0a0", "pylibcudf==24.12.*,>=0.0.0a0", diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 76e47b50c3b..ce825c7647b 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "numpy>=1.23,<3.0a0", - "pandas>=2.0,<2.2.3dev0", + "pandas>=2.0,<2.2.4dev0", "rapids-dask-dependency==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ From 33b8dfa42ff9a600adfa6d10c7740169a0340338 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 4 Oct 2024 15:30:19 -1000 Subject: [PATCH 024/117] Add string.convert.convert_ipv4 APIs to pylibcudf (#16994) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16994 --- python/cudf/cudf/_lib/string_casting.pyx | 42 +++------ .../libcudf/strings/convert/convert_ipv4.pxd | 6 +- .../pylibcudf/strings/convert/CMakeLists.txt | 2 +- .../pylibcudf/strings/convert/__init__.pxd | 1 + .../pylibcudf/strings/convert/__init__.py | 1 + .../strings/convert/convert_ipv4.pxd | 10 ++ .../strings/convert/convert_ipv4.pyx | 92 +++++++++++++++++++ .../tests/test_string_convert_ipv4.py | 31 +++++++ 8 files changed, 151 insertions(+), 34 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index fe19379bf93..76c862a8657 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -22,11 +22,6 @@ from pylibcudf.libcudf.strings.convert.convert_integers cimport ( is_hex as cpp_is_hex, to_integers as cpp_to_integers, ) -from pylibcudf.libcudf.strings.convert.convert_ipv4 cimport ( - integers_to_ipv4 as cpp_integers_to_ipv4, - ipv4_to_integers as cpp_ipv4_to_integers, - is_ipv4 as cpp_is_ipv4, -) from pylibcudf.libcudf.types cimport data_type, type_id from cudf._lib.types cimport underlying_type_t_type_id @@ -569,14 +564,10 @@ def int2ip(Column input_col): A Column with integer represented in string ipv4 format """ - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_integers_to_ipv4(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.integers_to_ipv4( + input_col.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) def ip2int(Column input_col): @@ -592,14 +583,10 @@ def ip2int(Column input_col): A Column with ipv4 represented as integer """ - 
- cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_ipv4_to_integers(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.ipv4_to_integers( + input_col.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) def is_ipv4(Column source_strings): @@ -608,15 +595,10 @@ def is_ipv4(Column source_strings): that have strings in IPv4 format. This format is nnn.nnn.nnn.nnn where nnn is integer digits in [0,255]. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_ipv4( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.is_ipv4( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) def htoi(Column input_col, **kwargs): diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd index fe571cfced6..801db438e92 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd @@ -8,11 +8,11 @@ from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "cudf/strings/convert/convert_ipv4.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] ipv4_to_integers( - column_view input_col) except + + column_view input) except + cdef unique_ptr[column] integers_to_ipv4( - column_view input_col) except + + column_view integers) except + cdef unique_ptr[column] is_ipv4( - column_view source_strings + column_view input ) except + diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index fe8da975566..eb0d6ee6999 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx - convert_fixed_point.pyx + convert_fixed_point.pyx convert_ipv4.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index 36abf463371..431beed8e5d 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -4,4 +4,5 @@ from . cimport ( convert_datetime, convert_durations, convert_fixed_point, + convert_ipv4, ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index c0be4093836..a601b562c2e 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -4,4 +4,5 @@ convert_datetime, convert_durations, convert_fixed_point, + convert_ipv4, ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd new file mode 100644 index 00000000000..c61f5c0bdca --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column cimport Column + + +cpdef Column ipv4_to_integers(Column input) + +cpdef Column integers_to_ipv4(Column integers) + +cpdef Column is_ipv4(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx new file mode 100644 index 00000000000..f2a980d4269 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx @@ -0,0 +1,92 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport convert_ipv4 as cpp_convert_ipv4 + + +cpdef Column ipv4_to_integers(Column input): + """ + Converts IPv4 addresses into integers. + + For details, see cpp:func:`cudf::strings::ipv4_to_integers` + + Parameters + ---------- + input : Column + Strings instance for this operation + + Returns + ------- + Column + New uint32 column converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_ipv4.ipv4_to_integers( + input.view() + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column integers_to_ipv4(Column integers): + """ + Converts integers into IPv4 addresses as strings. + + For details, see cpp:func:`cudf::strings::integers_to_ipv4` + + Parameters + ---------- + integers : Column + Integer (uint32) column to convert. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_ipv4.integers_to_ipv4( + integers.view() + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_ipv4(Column input): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to integers from IPv4 format. + + For details, see cpp:func:`cudf::strings::is_ipv4` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_ipv4.is_ipv4( + input.view() + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py new file mode 100644 index 00000000000..4dc3e512624 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_ipv4_to_integers(): + arr = pa.array(["123.45.67.890", None]) + result = plc.strings.convert.convert_ipv4.ipv4_to_integers( + plc.interop.from_arrow(arr) + ) + expected = pa.array([2066564730, None], type=pa.uint32()) + assert_column_eq(result, expected) + + +def test_integers_to_ipv4(): + arr = pa.array([1, 0, None], type=pa.uint32()) + result = plc.strings.convert.convert_ipv4.integers_to_ipv4( + plc.interop.from_arrow(arr) + ) + expected = pa.array(["0.0.0.1", "0.0.0.0", None]) + assert_column_eq(result, expected) + + +def test_is_ipv4(): + arr = pa.array(["0.0.0.1", "1.2.34", "A", None]) + result = plc.strings.convert.convert_ipv4.is_ipv4( + plc.interop.from_arrow(arr) + ) + expected = pa.array([True, False, False, None]) + assert_column_eq(result, expected) From fcff2b6ef7d6db62fc064ad10ffc6c873fc85b58 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Sat, 5 Oct 2024 02:52:53 -0500 Subject: [PATCH 025/117] Fix write_json to handle empty string column (#16995) Add empty string column condition for write_json bypass make_strings_children for empty column because when grid size is zero, it throws cuda error. Authors: - Karthikeyan (https://github.com/karthikeyann) - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - David Wendt (https://github.com/davidwendt) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/16995 --- cpp/src/io/json/write_json.cu | 3 +++ cpp/tests/io/json/json_writer.cpp | 37 +++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index dc7199d7ab1..e1241f8f90c 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -170,6 +170,9 @@ struct escape_strings_fn { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + if (column_v.is_empty()) { // empty begets empty + return make_empty_column(type_id::STRING); + } auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(*this, column_v.size(), stream, mr); diff --git a/cpp/tests/io/json/json_writer.cpp b/cpp/tests/io/json/json_writer.cpp index 2c4e29a01b9..39d31c406a5 100644 --- a/cpp/tests/io/json/json_writer.cpp +++ b/cpp/tests/io/json/json_writer.cpp @@ -70,6 +70,43 @@ TEST_F(JsonWriterTest, EmptyInput) EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); } +TEST_F(JsonWriterTest, EmptyLeaf) +{ + cudf::test::strings_column_wrapper col1{""}; + cudf::test::fixed_width_column_wrapper offsets{0, 0}; + auto col2 = make_lists_column(1, + offsets.release(), + cudf::test::strings_column_wrapper{}.release(), + 0, + rmm::device_buffer{}, + cudf::test::get_default_stream()); + auto col3 = cudf::test::lists_column_wrapper::make_one_empty_row_column(); + cudf::table_view tbl_view{{col1, *col2, col3}}; + cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"col3"}}}; + + std::vector out_buffer; + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(true) + .metadata(mt) + .lines(false) + .na_rep("null") + .build(); + + // Empty columns in table + cudf::io::write_json(out_options, cudf::test::get_default_stream()); + std::string const expected = R"([{"col1":"","col2":[],"col3":[]}])"; + EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + + // Empty columns in 
table - JSON Lines
+  out_buffer.clear();
+  out_options.enable_lines(true);
+  cudf::io::write_json(out_options, cudf::test::get_default_stream());
+  std::string const expected_lines = R"({"col1":"","col2":[],"col3":[]})"
+                                     "\n";
+  EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size()));
+}
+
 TEST_F(JsonWriterTest, ErrorCases)
 {
   cudf::test::strings_column_wrapper col1{"a", "b", "c"};

From bfd568b4f5c4dd9799b60a2975c1fd183e9b99aa Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Mon, 7 Oct 2024 12:04:24 -0400
Subject: [PATCH 026/117] Remove unused import (#17005)

This PR removes an unused import in cudf which was causing errors in doc
builds.

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17005
---
 python/cudf/cudf/_lib/string_casting.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx
index 76c862a8657..d9595f4ab0a 100644
--- a/python/cudf/cudf/_lib/string_casting.pyx
+++ b/python/cudf/cudf/_lib/string_casting.pyx
@@ -6,7 +6,6 @@ from cudf._lib.scalar import as_device_scalar
 from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES

 from libcpp.memory cimport unique_ptr
-from libcpp.string cimport string
 from libcpp.utility cimport move

 from pylibcudf.libcudf.column.column cimport column

From f926a61c7d31b7b33c3a3482507e9efb44b2cc36 Mon Sep 17 00:00:00 2001
From: Ben Jarmak <104460670+jarmak-nv@users.noreply.github.com>
Date: Mon, 7 Oct 2024 12:37:55 -0400
Subject: [PATCH 027/117] Add release tracking to project automation scripts (#17001)

This PR adds two new jobs to the project automation workflow: one to extract
the version number from the branch name, and one to set the project `Release`
field to the version found.
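The `process-branch-name` job added below relies on plain shell prefix
stripping to turn a base branch name into a release number; here is a
standalone sketch of that behavior (the `branch-24.12` value is only an
example):

```bash
# Standalone illustration of the ${var#pattern} expansion used in the
# "Extract branch name" step; example value only.
branch="branch-24.12"
release=${branch#branch-}  # strips the shortest leading match of "branch-"
echo "$release"            # prints: 24.12
```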
Authors: - Ben Jarmak (https://github.com/jarmak-nv) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17001 --- .../workflows/pr_issue_status_automation.yml | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index af8d1289ea1..6f0e88fb245 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -62,3 +62,33 @@ jobs: UPDATE_ITEM: true UPDATE_LINKED_ISSUES: true secrets: inherit + + process-branch-name: + if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} + needs: get-project-id + runs-on: ubuntu-latest + outputs: + branch-name: ${{ steps.process-branch-name.outputs.branch-name }} + steps: + - name: Extract branch name + id: process-branch-name + run: | + branch=${{ github.event.pull_request.base.ref }} + release=${branch#branch-} + echo "branch-name=$release" >> "$GITHUB_OUTPUT" + + update-release: + # This job sets the PR and its linked issues to the release they are targeting + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.12 + if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} + needs: [get-project-id, process-branch-name] + with: + PROJECT_ID: "PVT_kwDOAp2shc4AiNzl" + SINGLE_SELECT_FIELD_ID: "PVTSSF_lADOAp2shc4AiNzlzgg52UQ" + SINGLE_SELECT_FIELD_NAME: "Release" + SINGLE_SELECT_OPTION_VALUE: "${{ needs.process-branch-name.outputs.branch-name }}" + ITEM_PROJECT_ID: "${{ needs.get-project-id.outputs.ITEM_PROJECT_ID }}" + ITEM_NODE_ID: "${{ github.event.pull_request.node_id }}" + UPDATE_ITEM: true + UPDATE_LINKED_ISSUES: true + secrets: inherit From 7e1e4757e753253a99df110fd3814d0136289ef2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 7 Oct 2024 12:41:47 -0700 Subject: [PATCH 028/117] Address all remaining clang-tidy errors (#16956) With this set of changes I get a clean run of clang-tidy (with one caveat that I'll explain in the follow-up PR to add clang-tidy to pre-commit/CI). 
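For reviewers unfamiliar with the suppression comments this change leans on,
a standalone sketch (the function and variable names are made up): `// NOLINT`
silences every check on a single line, while `// NOLINTBEGIN`/`// NOLINTEND`
bracket a whole region, as used in `host_memory.cpp`:

```cpp
// Made-up example illustrating the clang-tidy suppression comments used in
// this patch; apply_masks, bits, and mask are illustrative names only.
#include <cstdint>

void apply_masks(std::uint8_t* bits, std::uint8_t mask)
{
  // Silences every clang-tidy check on this one line:
  bits[0] &= mask;  // NOLINT

  // Silences all checks for the bracketed region:
  // NOLINTBEGIN
  bits[1] &= mask;
  bits[2] &= mask;
  // NOLINTEND
}
```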
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - MithunR (https://github.com/mythrocks) - David Wendt (https://github.com/davidwendt) - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/cudf/pull/16956 --- cpp/.clang-tidy | 43 ++++++++++++--- cpp/cmake/thirdparty/get_nanoarrow.cmake | 8 +-- .../nanoarrow_clang_tidy_compliance.diff | 38 ++++++++++++++ .../patches/nanoarrow_override.json | 18 +++++++ cpp/include/cudf/table/table.hpp | 2 +- cpp/include/cudf/table/table_view.hpp | 2 +- cpp/src/io/avro/avro.cpp | 3 +- cpp/src/io/orc/orc.hpp | 2 +- .../io/parquet/compact_protocol_reader.cpp | 6 ++- cpp/src/io/utilities/data_sink.cpp | 6 ++- cpp/src/io/utilities/hostdevice_span.hpp | 8 +-- cpp/src/utilities/host_memory.cpp | 13 ++++- cpp/tests/binaryop/binop-compiled-test.cpp | 6 ++- cpp/tests/binaryop/util/operation.h | 4 +- cpp/tests/column/column_test.cpp | 4 +- cpp/tests/copying/slice_tests.cpp | 12 +++-- cpp/tests/copying/slice_tests.cuh | 21 ++++---- cpp/tests/copying/split_tests.cpp | 52 ++++++++++--------- .../hashing/murmurhash3_x64_128_test.cpp | 4 +- cpp/tests/hashing/sha256_test.cpp | 2 - cpp/tests/interop/from_arrow_device_test.cpp | 12 ++--- cpp/tests/interop/from_arrow_host_test.cpp | 6 +-- cpp/tests/interop/from_arrow_test.cpp | 8 +-- cpp/tests/interop/to_arrow_device_test.cpp | 12 ++--- cpp/tests/interop/to_arrow_host_test.cpp | 6 +-- cpp/tests/interop/to_arrow_test.cpp | 14 ++--- cpp/tests/io/comp/decomp_test.cpp | 36 ++++++++----- cpp/tests/io/csv_test.cpp | 12 ++--- cpp/tests/io/json/json_test.cpp | 6 +-- cpp/tests/io/orc_test.cpp | 37 ++++++------- cpp/tests/io/parquet_misc_test.cpp | 2 +- cpp/tests/io/parquet_reader_test.cpp | 7 +-- cpp/tests/io/parquet_v2_test.cpp | 36 ++++++------- cpp/tests/io/parquet_writer_test.cpp | 17 +++--- cpp/tests/join/distinct_join_tests.cpp | 10 ++-- cpp/tests/merge/merge_string_test.cpp | 4 +- cpp/tests/merge/merge_test.cpp | 6 +-- .../reductions/segmented_reduction_tests.cpp | 9 ++-- cpp/tests/replace/replace_tests.cpp | 4 +- cpp/tests/rolling/collect_ops_test.cpp | 8 +-- cpp/tests/rolling/offset_row_window_test.cpp | 12 +++-- cpp/tests/rolling/rolling_test.cpp | 2 +- cpp/tests/scalar/scalar_test.cpp | 6 +-- cpp/tests/search/search_list_test.cpp | 3 +- cpp/tests/sort/sort_test.cpp | 2 +- cpp/tests/stream_compaction/unique_tests.cpp | 1 - cpp/tests/streams/stream_compaction_test.cpp | 2 - cpp/tests/strings/integers_tests.cpp | 3 +- cpp/tests/structs/structs_column_tests.cpp | 5 +- cpp/tests/transform/bools_to_mask_test.cpp | 2 +- .../integration/unary_transform_test.cpp | 28 +++++----- 51 files changed, 344 insertions(+), 228 deletions(-) create mode 100644 cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff create mode 100644 cpp/cmake/thirdparty/patches/nanoarrow_override.json diff --git a/cpp/.clang-tidy b/cpp/.clang-tidy index b791d846d1d..2d4f8c0d80e 100644 --- a/cpp/.clang-tidy +++ b/cpp/.clang-tidy @@ -1,18 +1,47 @@ --- +# Notes on disabled checks +# ------------------------ +# modernize-use-equals-default: +# auto-fix is broken (doesn't insert =default correctly) +# modernize-concat-nested-namespaces: +# auto-fix is broken (can delete code) +# modernize-use-trailing-return-type: +# Purely stylistic, no benefit to rewriting everything +# modernize-return-braced-init-list: +# Stylistically we prefer to see the return type at the return site. 
+# See https://github.com/rapidsai/cudf/pull/16956#pullrequestreview-2341891672 +# for more information. +# modernize-use-bool-literals: +# Our tests use int flags for validity masks extensively and we prefer that +# clang-analyzer-cplusplus.NewDeleteLeaks: +# This check has numerous bugs, see +# https://github.com/llvm/llvm-project/issues?q=is%3Aissue+is%3Aopen+newdeleteleaks +# We encounter at least +# https://github.com/llvm/llvm-project/issues/60896 +# https://github.com/llvm/llvm-project/issues/69602 +# clang-analyzer-optin.core.EnumCastOutOfRange +# We use enums as flags in multiple cases and this check makes ORing flags invalid +# clang-analyzer-optin.cplusplus.UninitializedObject' +# There is an error in nanoarrow that none of the clang-tidy filters (i.e. +# header-filter and exclude-header-filter are able to properly avoid. This +# merits further investigation +# +# We need to verify that broken checks are still broken Checks: 'modernize-*, -modernize-use-equals-default, -modernize-concat-nested-namespaces, -modernize-use-trailing-return-type, - -modernize-use-bool-literals' - - # -modernize-use-equals-default # auto-fix is broken (doesn't insert =default correctly) - # -modernize-concat-nested-namespaces # auto-fix is broken (can delete code) - # -modernize-use-trailing-return-type # just a preference + -modernize-return-braced-init-list, + -modernize-use-bool-literals, + clang-analyzer-*, + -clang-analyzer-cplusplus.NewDeleteLeaks, + -clang-analyzer-optin.core.EnumCastOutOfRange, + -clang-analyzer-optin.cplusplus.UninitializedObject' WarningsAsErrors: '' -HeaderFilterRegex: '' -AnalyzeTemporaryDtors: false +HeaderFilterRegex: '.*cudf/cpp/(src|include|tests).*' +ExcludeHeaderFilterRegex: '.*(Message_generated.h|Schema_generated.h|brotli_dict.hpp|unbz2.hpp|cxxopts.hpp).*' FormatStyle: none CheckOptions: - key: modernize-loop-convert.MaxCopySize diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake index 8df1b431095..d7d7fcca044 100644 --- a/cpp/cmake/thirdparty/get_nanoarrow.cmake +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -14,15 +14,17 @@ # This function finds nanoarrow and sets any additional necessary environment variables. 
function(find_and_configure_nanoarrow) + include(${rapids-cmake-dir}/cpm/package_override.cmake) + + set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") + rapids_cpm_package_override("${cudf_patch_dir}/nanoarrow_override.json") + # Currently we need to always build nanoarrow so we don't pickup a previous installed version set(CPM_DOWNLOAD_nanoarrow ON) rapids_cpm_find( nanoarrow 0.6.0.dev GLOBAL_TARGETS nanoarrow CPM_ARGS - GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git - GIT_TAG 1e2664a70ec14907409cadcceb14d79b9670bcdb - GIT_SHALLOW FALSE OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ) set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff b/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff new file mode 100644 index 00000000000..e9a36fcb567 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff @@ -0,0 +1,38 @@ +diff --git a/src/nanoarrow/common/inline_buffer.h b/src/nanoarrow/common/inline_buffer.h +index caa6be4..70ec8a2 100644 +--- a/src/nanoarrow/common/inline_buffer.h ++++ b/src/nanoarrow/common/inline_buffer.h +@@ -347,7 +347,7 @@ static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) { + } + + static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { +- *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | ++ *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | // NOLINT + ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | + ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | + ((values[7] + 0x7f) & 0x80)); +@@ -471,13 +471,13 @@ static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t l + // set bits within a single byte + const uint8_t only_byte_mask = + i_end % 8 == 0 ? 
first_byte_mask : (uint8_t)(first_byte_mask | last_byte_mask); +- bits[bytes_begin] &= only_byte_mask; ++ bits[bytes_begin] &= only_byte_mask; // NOLINT + bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask); + return; + } + + // set/clear trailing bits of first byte +- bits[bytes_begin] &= first_byte_mask; ++ bits[bytes_begin] &= first_byte_mask; // NOLINT + bits[bytes_begin] |= (uint8_t)(fill_byte & ~first_byte_mask); + + if (bytes_end - bytes_begin > 2) { +@@ -637,7 +637,7 @@ static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, + n_remaining -= n_full_bytes * 8; + if (n_remaining > 0) { + // Zero out the last byte +- *out_cursor = 0x00; ++ *out_cursor = 0x00; // NOLINT + for (int i = 0; i < n_remaining; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]); + } diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_override.json b/cpp/cmake/thirdparty/patches/nanoarrow_override.json new file mode 100644 index 00000000000..d529787e7c8 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/nanoarrow_override.json @@ -0,0 +1,18 @@ + +{ + "packages" : { + "nanoarrow" : { + "version" : "0.6.0.dev", + "git_url" : "https://github.com/apache/arrow-nanoarrow.git", + "git_tag" : "1e2664a70ec14907409cadcceb14d79b9670bcdb", + "git_shallow" : false, + "patches" : [ + { + "file" : "${current_json_dir}/nanoarrow_clang_tidy_compliance.diff", + "issue" : "https://github.com/apache/arrow-nanoarrow/issues/537", + "fixed_in" : "" + } + ] + } + } +} diff --git a/cpp/include/cudf/table/table.hpp b/cpp/include/cudf/table/table.hpp index 762131a174f..15fdad21d9f 100644 --- a/cpp/include/cudf/table/table.hpp +++ b/cpp/include/cudf/table/table.hpp @@ -148,7 +148,7 @@ class table { std::vector columns(std::distance(begin, end)); std::transform( begin, end, columns.begin(), [this](auto index) { return _columns.at(index)->view(); }); - return table_view(columns); + return table_view{columns}; } /** diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index 4a990f67ce4..d41176590ea 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -241,7 +241,7 @@ class table_view : public detail::table_view_base { { std::vector columns(std::distance(begin, end)); std::transform(begin, end, columns.begin(), [this](auto index) { return this->column(index); }); - return table_view(columns); + return table_view{columns}; } /** diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp index 03cf6d4a0e0..d5caa4720ac 100644 --- a/cpp/src/io/avro/avro.cpp +++ b/cpp/src/io/avro/avro.cpp @@ -16,6 +16,7 @@ #include "avro.hpp" +#include #include #include @@ -302,7 +303,7 @@ bool schema_parser::parse(std::vector& schema, std::string const& // Empty schema if (json_str == "[]") return true; - char depthbuf[MAX_SCHEMA_DEPTH]; + std::array depthbuf; int depth = 0, parent_idx = -1, entry_idx = -1; json_state_e state = state_attrname; std::string str; diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 790532c9d54..5ab36fdae8e 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -258,7 +258,7 @@ class ProtobufReader { private: template - friend class FunctionSwitchImpl; + friend struct FunctionSwitchImpl; void skip_bytes(size_t bytecnt) { diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index b978799b8bc..312a5243687 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ 
b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -228,7 +228,8 @@ class parquet_field_string : public parquet_field { * @return True if field types mismatch or if the process of reading a * string fails */ -struct parquet_field_string_list : public parquet_field_list { +class parquet_field_string_list : public parquet_field_list { + public: parquet_field_string_list(int f, std::vector& v) : parquet_field_list(f, v) { auto const read_value = [&val = v](uint32_t i, CompactProtocolReader* cpr) { @@ -396,8 +397,9 @@ class parquet_field_binary : public parquet_field { * @return True if field types mismatch or if the process of reading a * binary fails */ -struct parquet_field_binary_list +class parquet_field_binary_list : public parquet_field_list, FieldType::BINARY> { + public: parquet_field_binary_list(int f, std::vector>& v) : parquet_field_list(f, v) { auto const read_value = [&val = v](uint32_t i, CompactProtocolReader* cpr) { diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 1dbb9369115..0b76f3d3e8f 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -50,7 +50,8 @@ class file_sink : public data_sink { } } - ~file_sink() override { flush(); } + // Marked as NOLINT because we are calling a virtual method in the destructor + ~file_sink() override { flush(); } // NOLINT void host_write(void const* data, size_t size) override { @@ -114,7 +115,8 @@ class host_buffer_sink : public data_sink { public: explicit host_buffer_sink(std::vector* buffer) : buffer_(buffer) {} - ~host_buffer_sink() override { flush(); } + // Marked as NOLINT because we are calling a virtual method in the destructor + ~host_buffer_sink() override { flush(); } // NOLINT void host_write(void const* data, size_t size) override { diff --git a/cpp/src/io/utilities/hostdevice_span.hpp b/cpp/src/io/utilities/hostdevice_span.hpp index d9eac423901..1d8b34addbd 100644 --- a/cpp/src/io/utilities/hostdevice_span.hpp +++ b/cpp/src/io/utilities/hostdevice_span.hpp @@ -43,8 +43,8 @@ class hostdevice_span { template ().host_ptr())> (*)[], - T (*)[]>>* = nullptr> + std::remove_pointer_t().host_ptr())> (*)[], // NOLINT + T (*)[]>>* = nullptr> // NOLINT constexpr hostdevice_span(C& in) : hostdevice_span(in.host_ptr(), in.device_ptr(), in.size()) { } @@ -54,8 +54,8 @@ class hostdevice_span { template ().host_ptr())> (*)[], - T (*)[]>>* = nullptr> + std::remove_pointer_t().host_ptr())> (*)[], // NOLINT + T (*)[]>>* = nullptr> // NOLINT constexpr hostdevice_span(C const& in) : hostdevice_span(in.host_ptr(), in.device_ptr(), in.size()) { diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 125b98c4a67..9d8e3cf2fa6 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -115,12 +115,19 @@ class fixed_pinned_pool_memory_resource { return !operator==(other); } - friend void get_property(fixed_pinned_pool_memory_resource const&, + // clang-tidy will complain about this function because it is completely + // unused at runtime and only exist for tag introspection by CCCL, so we + // ignore linting. This masks a real issue if we ever want to compile with + // clang, though, which is that the function will actually be compiled out by + // clang. If cudf were ever to try to support clang as a compile we would + // need to force the compiler to emit this symbol. The same goes for the + // other get_property definitions in this file. 
+ friend void get_property(fixed_pinned_pool_memory_resource const&, // NOLINT cuda::mr::device_accessible) noexcept { } - friend void get_property(fixed_pinned_pool_memory_resource const&, + friend void get_property(fixed_pinned_pool_memory_resource const&, // NOLINT cuda::mr::host_accessible) noexcept { } @@ -235,7 +242,9 @@ class new_delete_memory_resource { bool operator!=(new_delete_memory_resource const& other) const { return !operator==(other); } + // NOLINTBEGIN friend void get_property(new_delete_memory_resource const&, cuda::mr::host_accessible) noexcept {} + // NOLINTEND }; static_assert(cuda::mr::resource_with, diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp index 06e0d193d80..aa5b49567e6 100644 --- a/cpp/tests/binaryop/binop-compiled-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-test.cpp @@ -557,7 +557,11 @@ auto NullOp_Result(cudf::column_view lhs, cudf::column_view rhs) std::transform(thrust::make_counting_iterator(0), thrust::make_counting_iterator(lhs.size()), result.begin(), - [&lhs_data, &lhs_mask, &rhs_data, &rhs_mask, &result_mask](auto i) -> TypeOut { + [&lhs_data = lhs_data, + &lhs_mask = lhs_mask, + &rhs_data = rhs_data, + &rhs_mask = rhs_mask, + &result_mask = result_mask](auto i) -> TypeOut { auto lhs_valid = lhs_mask.data() and cudf::bit_is_set(lhs_mask.data(), i); auto rhs_valid = rhs_mask.data() and cudf::bit_is_set(rhs_mask.data(), i); bool output_valid = lhs_valid or rhs_valid; diff --git a/cpp/tests/binaryop/util/operation.h b/cpp/tests/binaryop/util/operation.h index d36b48d666a..ef1ccfccab5 100644 --- a/cpp/tests/binaryop/util/operation.h +++ b/cpp/tests/binaryop/util/operation.h @@ -100,7 +100,7 @@ struct Mul { std::enable_if_t<(cudf::is_duration_t::value && std::is_integral_v) || (cudf::is_duration_t::value && std::is_integral_v), void>* = nullptr> - OutT DurationProduct(LhsT x, RhsT y) const + [[nodiscard]] OutT DurationProduct(LhsT x, RhsT y) const { return x * y; } @@ -128,7 +128,7 @@ struct Div { typename LhsT, typename RhsT, std::enable_if_t<(std::is_integral_v || cudf::is_duration()), void>* = nullptr> - OutT DurationDivide(LhsT x, RhsT y) const + [[nodiscard]] OutT DurationDivide(LhsT x, RhsT y) const { return x / y; } diff --git a/cpp/tests/column/column_test.cpp b/cpp/tests/column/column_test.cpp index 14b4197de71..631f5150829 100644 --- a/cpp/tests/column/column_test.cpp +++ b/cpp/tests/column/column_test.cpp @@ -340,7 +340,7 @@ TYPED_TEST(TypedColumnTest, MoveConstructorNoMask) cudf::column moved_to{std::move(original)}; - EXPECT_EQ(0, original.size()); + EXPECT_EQ(0, original.size()); // NOLINT EXPECT_EQ(cudf::data_type{cudf::type_id::EMPTY}, original.type()); verify_column_views(moved_to); @@ -359,7 +359,7 @@ TYPED_TEST(TypedColumnTest, MoveConstructorWithMask) cudf::column moved_to{std::move(original)}; verify_column_views(moved_to); - EXPECT_EQ(0, original.size()); + EXPECT_EQ(0, original.size()); // NOLINT EXPECT_EQ(cudf::data_type{cudf::type_id::EMPTY}, original.type()); // Verify move diff --git a/cpp/tests/copying/slice_tests.cpp b/cpp/tests/copying/slice_tests.cpp index bebd3d25610..aef0d4ad78a 100644 --- a/cpp/tests/copying/slice_tests.cpp +++ b/cpp/tests/copying/slice_tests.cpp @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -370,11 +371,12 @@ TEST_F(SliceStringTableTest, StringWithNulls) auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", 
"of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; - cudf::test::strings_column_wrapper sw[2] = {{strings[0].begin(), strings[0].end(), valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), valids}, + {strings[1].begin(), strings[1].end(), valids}}}; std::vector> scols; scols.push_back(sw[0].release()); diff --git a/cpp/tests/copying/slice_tests.cuh b/cpp/tests/copying/slice_tests.cuh index a180740f143..1e037294527 100644 --- a/cpp/tests/copying/slice_tests.cuh +++ b/cpp/tests/copying/slice_tests.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -148,7 +148,7 @@ std::vector create_expected_tables(cudf::size_type num_cols, } } - result.push_back(cudf::table(std::move(cols))); + result.emplace_back(std::move(cols)); } return result; @@ -163,13 +163,12 @@ inline std::vector create_expected_string_co for (unsigned long index = 0; index < indices.size(); index += 2) { if (not nullable) { - result.push_back(cudf::test::strings_column_wrapper(strings.begin() + indices[index], - strings.begin() + indices[index + 1])); + result.emplace_back(strings.begin() + indices[index], strings.begin() + indices[index + 1]); } else { auto valids = cudf::detail::make_counting_transform_iterator( indices[index], [](auto i) { return i % 2 == 0; }); - result.push_back(cudf::test::strings_column_wrapper( - strings.begin() + indices[index], strings.begin() + indices[index + 1], valids)); + result.emplace_back( + strings.begin() + indices[index], strings.begin() + indices[index + 1], valids); } } @@ -184,16 +183,16 @@ inline std::vector create_expected_string_co std::vector result = {}; for (unsigned long index = 0; index < indices.size(); index += 2) { - result.push_back(cudf::test::strings_column_wrapper(strings.begin() + indices[index], - strings.begin() + indices[index + 1], - validity.begin() + indices[index])); + result.emplace_back(strings.begin() + indices[index], + strings.begin() + indices[index + 1], + validity.begin() + indices[index]); } return result; } inline std::vector create_expected_string_tables( - std::vector const strings[2], + std::vector> const strings, std::vector const& indices, bool nullable) { @@ -216,7 +215,7 @@ inline std::vector create_expected_string_tables( } } - result.push_back(cudf::table(std::move(cols))); + result.emplace_back(std::move(cols)); } return result; diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp index ee3e7da5e0f..b56b0f2d3f8 100644 --- a/cpp/tests/copying/split_tests.cpp +++ b/cpp/tests/copying/split_tests.cpp @@ -35,6 +35,7 @@ #include #include +#include #include #include #include @@ -135,7 +136,7 @@ std::vector create_expected_tables_for_splits( } std::vector create_expected_string_tables_for_splits( - std::vector const strings[2], + std::vector> const strings, std::vector const& splits, bool nullable) { @@ -144,8 +145,8 @@ std::vector create_expected_string_tables_for_splits( } std::vector create_expected_string_tables_for_splits( - std::vector const strings[2], - std::vector const validity[2], + std::vector> 
const strings, + std::vector> const validity, std::vector const& splits) { std::vector indices = splits_to_indices(splits, strings[0].size()); @@ -627,11 +628,12 @@ void split_string_with_invalids(SplitFunc Split, auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; - cudf::test::strings_column_wrapper sw[2] = {{strings[0].begin(), strings[0].end(), valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), valids}, + {strings[1].begin(), strings[1].end(), valids}}}; std::vector> scols; scols.push_back(sw[0].release()); @@ -658,11 +660,12 @@ void split_empty_output_strings_column_value(SplitFunc Split, auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; - cudf::test::strings_column_wrapper sw[2] = {{strings[0].begin(), strings[0].end(), valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), valids}, + {strings[1].begin(), strings[1].end(), valids}}}; std::vector> scols; scols.push_back(sw[0].release()); @@ -684,9 +687,9 @@ void split_null_input_strings_column_value(SplitFunc Split, CompareFunc Compare) auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; std::vector splits{2, 5, 9}; @@ -699,16 +702,17 @@ void split_null_input_strings_column_value(SplitFunc Split, CompareFunc Compare) EXPECT_NO_THROW(Split(empty_table, splits)); } - cudf::test::strings_column_wrapper sw[2] = {{strings[0].begin(), strings[0].end(), no_valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), no_valids}, + {strings[1].begin(), strings[1].end(), valids}}}; std::vector> scols; scols.push_back(sw[0].release()); scols.push_back(sw[1].release()); cudf::table src_table(std::move(scols)); auto result = Split(src_table, splits); - std::vector validity_masks[2] = {std::vector(strings[0].size()), - std::vector(strings[0].size())}; + std::vector> validity_masks{std::vector(strings[0].size()), + std::vector(strings[0].size())}; std::generate( validity_masks[1].begin(), validity_masks[1].end(), [i = 0]() mutable { return i++ % 2 == 0; }); @@ -1913,9 +1917,9 @@ TEST_F(ContiguousSplitTableCornerCases, MixedColumnTypes) cudf::size_type start = 0; auto valids = cudf::detail::make_counting_transform_iterator(start, 
[](auto i) { return true; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; std::vector> cols; @@ -2377,7 +2381,7 @@ TEST_F(ContiguousSplitTableCornerCases, OutBufferToSmall) { // internally, contiguous split chunks GPU work in 1MB contiguous copies // so the output buffer must be 1MB or larger. - EXPECT_THROW(cudf::chunked_pack::create({}, 1 * 1024), cudf::logic_error); + EXPECT_THROW(auto _ = cudf::chunked_pack::create({}, 1 * 1024), cudf::logic_error); } TEST_F(ContiguousSplitTableCornerCases, ChunkSpanTooSmall) diff --git a/cpp/tests/hashing/murmurhash3_x64_128_test.cpp b/cpp/tests/hashing/murmurhash3_x64_128_test.cpp index 4fb8f78b558..0e68050f935 100644 --- a/cpp/tests/hashing/murmurhash3_x64_128_test.cpp +++ b/cpp/tests/hashing/murmurhash3_x64_128_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,8 +22,6 @@ #include -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; - using NumericTypesNoBools = cudf::test::Concat; diff --git a/cpp/tests/hashing/sha256_test.cpp b/cpp/tests/hashing/sha256_test.cpp index cc95c7a2f0f..8bc47c92c6b 100644 --- a/cpp/tests/hashing/sha256_test.cpp +++ b/cpp/tests/hashing/sha256_test.cpp @@ -23,8 +23,6 @@ #include #include -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; - class SHA256HashTest : public cudf::test::BaseFixture {}; TEST_F(SHA256HashTest, EmptyTable) diff --git a/cpp/tests/interop/from_arrow_device_test.cpp b/cpp/tests/interop/from_arrow_device_test.cpp index a4dc7531765..2151ec6e22f 100644 --- a/cpp/tests/interop/from_arrow_device_test.cpp +++ b/cpp/tests/interop/from_arrow_device_test.cpp @@ -270,9 +270,9 @@ TEST_F(FromArrowDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); @@ -414,9 +414,9 @@ TEST_F(FromArrowDeviceTest, DictionaryIndicesType) { std::vector> columns; auto col = cudf::test::fixed_width_column_wrapper({1, 2, 5, 2, 7}, {1, 0, 1, 1, 1}); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); + columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); cudf::table expected_table(std::move(columns)); cudf::table_view expected_table_view = expected_table.view(); diff --git a/cpp/tests/interop/from_arrow_host_test.cpp b/cpp/tests/interop/from_arrow_host_test.cpp index 
cbfa4911c3c..ef9936b214c 100644 --- a/cpp/tests/interop/from_arrow_host_test.cpp +++ b/cpp/tests/interop/from_arrow_host_test.cpp @@ -309,9 +309,9 @@ TEST_F(FromArrowHostDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 81c406c0faf..6e742b9e4cf 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -52,7 +52,7 @@ std::unique_ptr get_cudf_table() .release()); auto col4 = cudf::test::fixed_width_column_wrapper({1, 2, 5, 2, 7}, {true, false, true, true, true}); - columns.emplace_back(std::move(cudf::dictionary::encode(col4))); + columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper( {true, false, true, false, true}, {true, false, true, true, false}) .release()); @@ -339,9 +339,9 @@ TEST_F(FromArrowTest, DictionaryIndicesType) std::vector> columns; auto col = cudf::test::fixed_width_column_wrapper({1, 2, 5, 2, 7}, {true, false, true, true, true}); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); + columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); cudf::table expected_table(std::move(columns)); diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 51216a8512c..7ba586461dc 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -55,7 +55,7 @@ get_nanoarrow_cudf_table(cudf::size_type length) auto col4 = cudf::test::fixed_width_column_wrapper( test_data.int64_data.begin(), test_data.int64_data.end(), test_data.validity.begin()); auto dict_col = cudf::dictionary::encode(col4); - columns.emplace_back(std::move(cudf::dictionary::encode(col4))); + columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper(test_data.bool_data.begin(), test_data.bool_data.end(), test_data.bool_validity.begin()) @@ -82,8 +82,8 @@ get_nanoarrow_cudf_table(cudf::size_type length) test_data.string_data.begin(), test_data.string_data.end(), test_data.validity.begin()) .release(); vector_of_columns cols; - cols.push_back(move(int_column)); - cols.push_back(move(str_column)); + cols.push_back(std::move(int_column)); + cols.push_back(std::move(str_column)); auto [null_mask, null_count] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( test_data.bool_data_validity.begin(), test_data.bool_data_validity.end())); columns.emplace_back( @@ -575,9 +575,9 @@ TEST_F(ToArrowDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - 
cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); diff --git a/cpp/tests/interop/to_arrow_host_test.cpp b/cpp/tests/interop/to_arrow_host_test.cpp index fc0ed6c9352..fcb4433b42e 100644 --- a/cpp/tests/interop/to_arrow_host_test.cpp +++ b/cpp/tests/interop/to_arrow_host_test.cpp @@ -436,9 +436,9 @@ TEST_F(ToArrowHostDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 90ae12cdd90..a6aa4b22eca 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -90,7 +90,7 @@ std::pair, std::shared_ptr> get_table auto col4 = cudf::test::fixed_width_column_wrapper( int64_data.begin(), int64_data.end(), validity.begin()); auto dict_col = cudf::dictionary::encode(col4); - columns.emplace_back(std::move(cudf::dictionary::encode(col4))); + columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper( bool_data.begin(), bool_data.end(), bool_validity.begin()) .release()); @@ -112,8 +112,8 @@ std::pair, std::shared_ptr> get_table cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin()) .release(); vector_of_columns cols; - cols.push_back(move(int_column)); - cols.push_back(move(str_column)); + cols.push_back(std::move(int_column)); + cols.push_back(std::move(str_column)); auto [null_mask, null_count] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( bool_data_validity.begin(), bool_data_validity.end())); columns.emplace_back( @@ -294,9 +294,9 @@ TEST_F(ToArrowTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); @@ -438,7 +438,7 @@ TEST_F(ToArrowTest, FixedPoint64TableLarge) auto const schema = std::make_shared(schema_vector); auto const expected_arrow_table = arrow::Table::Make(schema, {arr}); - std::vector const metadata = {{"a"}}; + std::vector const metadata = {{"a"}}; // NOLINT ASSERT_TRUE(is_equal(input, metadata, expected_arrow_table)); } } diff --git a/cpp/tests/io/comp/decomp_test.cpp b/cpp/tests/io/comp/decomp_test.cpp index 840cf263ed9..54262dc3b44 100644 --- a/cpp/tests/io/comp/decomp_test.cpp +++ b/cpp/tests/io/comp/decomp_test.cpp @@ -39,19 +39,19 @@ using cudf::device_span; */ template struct DecompressTest : public 
cudf::test::BaseFixture { - std::vector vector_from_string(char const* str) const + [[nodiscard]] std::vector vector_from_string(std::string const str) const { - return std::vector(reinterpret_cast(str), - reinterpret_cast(str) + strlen(str)); + return {reinterpret_cast(str.c_str()), + reinterpret_cast(str.c_str()) + strlen(str.c_str())}; } - void Decompress(std::vector* decompressed, + void Decompress(std::vector& decompressed, uint8_t const* compressed, size_t compressed_size) { auto stream = cudf::get_default_stream(); rmm::device_buffer src{compressed, compressed_size, stream}; - rmm::device_uvector dst{decompressed->size(), stream}; + rmm::device_uvector dst{decompressed.size(), stream}; cudf::detail::hostdevice_vector> inf_in(1, stream); inf_in[0] = {static_cast(src.data()), src.size()}; @@ -67,7 +67,7 @@ struct DecompressTest : public cudf::test::BaseFixture { static_cast(this)->dispatch(inf_in, inf_out, inf_stat); CUDF_CUDA_TRY(cudaMemcpyAsync( - decompressed->data(), dst.data(), dst.size(), cudaMemcpyDefault, stream.value())); + decompressed.data(), dst.data(), dst.size(), cudaMemcpyDefault, stream.value())); inf_stat.device_to_host_sync(stream); ASSERT_EQ(inf_stat[0].status, cudf::io::compression_status::SUCCESS); } @@ -125,49 +125,57 @@ struct NvcompConfigTest : public cudf::test::BaseFixture {}; TEST_F(GzipDecompressTest, HelloWorld) { - constexpr char uncompressed[] = "hello world"; + std::string const uncompressed{"hello world"}; + // NOLINTBEGIN constexpr uint8_t compressed[] = { 0x1f, 0x8b, 0x8, 0x0, 0x9, 0x63, 0x99, 0x5c, 0x2, 0xff, 0xcb, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, 0x2f, 0xca, 0x49, 0x1, 0x0, 0x85, 0x11, 0x4a, 0xd, 0xb, 0x0, 0x0, 0x0}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, compressed, sizeof(compressed)); EXPECT_EQ(output, input); } TEST_F(SnappyDecompressTest, HelloWorld) { - constexpr char uncompressed[] = "hello world"; + std::string const uncompressed{"hello world"}; + // NOLINTBEGIN constexpr uint8_t compressed[] = { 0xb, 0x28, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, compressed, sizeof(compressed)); EXPECT_EQ(output, input); } TEST_F(SnappyDecompressTest, ShortLiteralAfterLongCopyAtStartup) { - constexpr char uncompressed[] = "Aaaaaaaaaaaah!"; + std::string const uncompressed{"Aaaaaaaaaaaah!"}; + // NOLINTBEGIN constexpr uint8_t compressed[] = {14, 0x0, 'A', 0x0, 'a', (10 - 4) * 4 + 1, 1, 0x4, 'h', '!'}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, compressed, sizeof(compressed)); EXPECT_EQ(output, input); } TEST_F(BrotliDecompressTest, HelloWorld) { - constexpr char uncompressed[] = "hello world"; + std::string const uncompressed{"hello world"}; + // NOLINTBEGIN constexpr uint8_t compressed[] = { 0xb, 0x5, 0x80, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x3}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, compressed, sizeof(compressed)); EXPECT_EQ(output, input); } diff --git a/cpp/tests/io/csv_test.cpp 
b/cpp/tests/io/csv_test.cpp index 0028dd946e3..b265dcf9273 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -63,9 +63,9 @@ auto dtype() template using column_wrapper = - typename std::conditional, - cudf::test::strings_column_wrapper, - cudf::test::fixed_width_column_wrapper>::type; + std::conditional_t, + cudf::test::strings_column_wrapper, + cudf::test::fixed_width_column_wrapper>; using column = cudf::column; using table = cudf::table; using table_view = cudf::table_view; @@ -954,7 +954,7 @@ TEST_F(CsvReaderTest, Strings) ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( - std::vector{"abc def ghi", "\"jkl mno pqr\"", "stu \"\"vwx\"\" yz"}, + std::vector{"abc def ghi", "\"jkl mno pqr\"", R"(stu ""vwx"" yz)"}, view.column(1)); } @@ -1014,7 +1014,7 @@ TEST_F(CsvReaderTest, StringsQuotesIgnored) ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( - std::vector{"\"abcdef ghi\"", "\"jkl \"\"mno\"\" pqr\"", "stu \"vwx\" yz"}, + std::vector{"\"abcdef ghi\"", R"("jkl ""mno"" pqr")", "stu \"vwx\" yz"}, view.column(1)); } @@ -1830,7 +1830,7 @@ TEST_F(CsvReaderTest, StringsWithWriter) auto int_column = column_wrapper{10, 20, 30}; auto string_column = - column_wrapper{"abc def ghi", "\"jkl mno pqr\"", "stu \"\"vwx\"\" yz"}; + column_wrapper{"abc def ghi", "\"jkl mno pqr\"", R"(stu ""vwx"" yz)"}; cudf::table_view input_table(std::vector{int_column, string_column}); // TODO add quoting style flag? diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 49ad0c408dc..cb6716f4a18 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -68,9 +68,9 @@ auto dtype() template using column_wrapper = - typename std::conditional, - cudf::test::strings_column_wrapper, - cudf::test::fixed_width_column_wrapper>::type; + std::conditional_t, + cudf::test::strings_column_wrapper, + cudf::test::fixed_width_column_wrapper>; cudf::test::TempDirTestEnvironment* const temp_env = static_cast( diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 89e704f3ed3..cce0adbf317 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -43,9 +43,9 @@ template using column_wrapper = - typename std::conditional, - cudf::test::strings_column_wrapper, - cudf::test::fixed_width_column_wrapper>::type; + std::conditional_t, + cudf::test::strings_column_wrapper, + cudf::test::fixed_width_column_wrapper>; using str_col = column_wrapper; using bool_col = column_wrapper; @@ -1358,21 +1358,22 @@ TEST_P(OrcWriterTestStripes, StripeSize) cols.push_back(col.release()); auto const expected = std::make_unique(std::move(cols)); - auto validate = [&](std::vector const& orc_buffer) { - auto const expected_stripe_num = - std::max(num_rows / size_rows, (num_rows * sizeof(int64_t)) / size_bytes); - auto const stats = cudf::io::read_parsed_orc_statistics( - cudf::io::source_info(orc_buffer.data(), orc_buffer.size())); - EXPECT_EQ(stats.stripes_stats.size(), expected_stripe_num); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder( - cudf::io::source_info(orc_buffer.data(), orc_buffer.size())) - .use_index(false); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - }; + auto validate = + [&, &size_bytes = size_bytes, &size_rows = size_rows](std::vector const& orc_buffer) { + auto const expected_stripe_num = + std::max(num_rows / size_rows, (num_rows * sizeof(int64_t)) / 
size_bytes); + auto const stats = cudf::io::read_parsed_orc_statistics( + cudf::io::source_info(orc_buffer.data(), orc_buffer.size())); + EXPECT_EQ(stats.stripes_stats.size(), expected_stripe_num); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder( + cudf::io::source_info(orc_buffer.data(), orc_buffer.size())) + .use_index(false); + auto result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + }; { std::vector out_buffer_chunked; diff --git a/cpp/tests/io/parquet_misc_test.cpp b/cpp/tests/io/parquet_misc_test.cpp index 8b03e94191e..f1286a00d22 100644 --- a/cpp/tests/io/parquet_misc_test.cpp +++ b/cpp/tests/io/parquet_misc_test.cpp @@ -98,7 +98,7 @@ TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaListSliced) // list constexpr int vals_per_row = 4; auto c1_offset_iter = cudf::detail::make_counting_transform_iterator( - 0, [vals_per_row](cudf::size_type idx) { return idx * vals_per_row; }); + 0, [](cudf::size_type idx) { return idx * vals_per_row; }); cudf::test::fixed_width_column_wrapper c1_offsets(c1_offset_iter, c1_offset_iter + num_rows + 1); cudf::test::fixed_width_column_wrapper c1_vals( diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index dc8e68b3a15..4a5309f3ba7 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -1189,15 +1189,12 @@ TEST_F(ParquetReaderTest, NestingOptimizationTest) cudf::test::fixed_width_column_wrapper values(value_iter, value_iter + num_values, validity); // ~256k values with num_nesting_levels = 16 - int total_values_produced = num_values; - auto prev_col = values.release(); + auto prev_col = values.release(); for (int idx = 0; idx < num_nesting_levels; idx++) { - auto const depth = num_nesting_levels - idx; auto const num_rows = (1 << (num_nesting_levels - idx)); auto offsets_iter = cudf::detail::make_counting_transform_iterator( - 0, [depth, rows_per_level](cudf::size_type i) { return i * rows_per_level; }); - total_values_produced += (num_rows + 1); + 0, [](cudf::size_type i) { return i * rows_per_level; }); cudf::test::fixed_width_column_wrapper offsets(offsets_iter, offsets_iter + num_rows + 1); diff --git a/cpp/tests/io/parquet_v2_test.cpp b/cpp/tests/io/parquet_v2_test.cpp index 7c305235ea6..a0b48f54854 100644 --- a/cpp/tests/io/parquet_v2_test.cpp +++ b/cpp/tests/io/parquet_v2_test.cpp @@ -1302,24 +1302,24 @@ TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) table_view expected({col0, col1, col2, col3, col4, col5, col6, col7}); std::array expected_null_counts{4, 4, 4, 6, 4, 6, 4, 5, 11}; - std::vector const expected_def_hists[] = {{1, 1, 2, 3}, - {1, 3, 10}, - {1, 1, 2, 10}, - {1, 1, 2, 2, 8}, - {1, 1, 1, 1, 10}, - {1, 1, 1, 1, 2, 8}, - {1, 3, 9}, - {1, 3, 1, 8}, - {1, 0, 4, 1, 1, 4, 9}}; - std::vector const expected_rep_hists[] = {{4, 3}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 5}, - {4, 4, 5}, - {4, 6, 2, 8}}; + std::vector> const expected_def_hists = {{1, 1, 2, 3}, + {1, 3, 10}, + {1, 1, 2, 10}, + {1, 1, 2, 2, 8}, + {1, 1, 1, 1, 10}, + {1, 1, 1, 1, 2, 8}, + {1, 3, 9}, + {1, 3, 1, 8}, + {1, 0, 4, 1, 1, 4, 9}}; + std::vector> const expected_rep_hists = {{4, 3}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 5}, + {4, 4, 5}, + {4, 6, 2, 8}}; auto const filepath = temp_env->get_temp_filepath("ColumnIndexListWithNulls.parquet"); auto out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, 
expected) diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 8794f2ee304..6c5e9cdf07a 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -290,7 +290,8 @@ class custom_test_data_sink : public cudf::io::data_sink { CUDF_EXPECTS(outfile_.is_open(), "Cannot open output file"); } - ~custom_test_data_sink() override { flush(); } + // Marked as NOLINT because we are calling a virtual method in the destructor + ~custom_test_data_sink() override { flush(); } // NOLINT void host_write(void const* data, size_t size) override { @@ -981,13 +982,15 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation) { - std::vector truncated_min[] = {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe}, - {0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; + std::array, 3> truncated_min{ + {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe}, + {0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}}; - std::vector truncated_max[] = {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xff}, - {0xff}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; + std::array, 3> truncated_max{ + {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xff}, + {0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}}; cudf::test::lists_column_wrapper col0{ {0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe}}; diff --git a/cpp/tests/join/distinct_join_tests.cpp b/cpp/tests/join/distinct_join_tests.cpp index 93754091b3f..178edc52dd3 100644 --- a/cpp/tests/join/distinct_join_tests.cpp +++ b/cpp/tests/join/distinct_join_tests.cpp @@ -314,7 +314,7 @@ TEST_F(DistinctJoinTest, EmptyBuildTableLeftJoin) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; this->compare_to_reference( build.view(), probe.view(), gather_map, probe.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -362,7 +362,7 @@ TEST_F(DistinctJoinTest, EmptyProbeTableLeftJoin) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; this->compare_to_reference( build.view(), probe.view(), gather_map, probe.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -398,7 +398,7 @@ TEST_F(DistinctJoinTest, LeftJoinNoNulls) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; this->compare_to_reference( build.view(), probe.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -423,7 +423,7 @@ TEST_F(DistinctJoinTest, LeftJoinWithNulls) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; 
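+    // Illustrative note: get_left_indices(result->size()) already yields a
+    // prvalue, so the std::move() previously wrapped around it was redundant;
+    // clang-tidy reports such calls as redundant moves, hence the
+    // simplification in the lines above.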
column_wrapper col_gold_0{{3, 1, 2, 0, 2}, {true, true, true, true, true}}; strcol_wrapper col_gold_1({"s1", "s1", "", "s4", "s0"}, {true, true, false, true, true}); @@ -468,7 +468,7 @@ TEST_F(DistinctJoinTest, LeftJoinWithStructsAndNulls) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; auto col0_gold_names_col = strcol_wrapper{ "Samuel Vimes", "Detritus", "Carrot Ironfoundersson", "Samuel Vimes", "Angua von Überwald"}; diff --git a/cpp/tests/merge/merge_string_test.cpp b/cpp/tests/merge/merge_string_test.cpp index 97979e79010..bea044496b3 100644 --- a/cpp/tests/merge/merge_string_test.cpp +++ b/cpp/tests/merge/merge_string_test.cpp @@ -97,7 +97,7 @@ TYPED_TEST(MergeStringTest, Merge1StringKeyColumns) "hi", "hj"}); - auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [outputRows](auto row) { + auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) return 0; else @@ -296,7 +296,7 @@ TYPED_TEST(MergeStringTest, Merge1StringKeyNullColumns) true, false, false}); - auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [outputRows](auto row) { + auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) return 0; else diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp index 2e09f25b51f..6208d395f0a 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -349,7 +349,7 @@ TYPED_TEST(MergeTest_, Merge1KeyColumns) cudf::test::fixed_width_column_wrapper expectedDataWrap1(seq_out1, seq_out1 + outputRows); - auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [outputRows](auto row) { + auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) return 0; else @@ -452,7 +452,7 @@ TYPED_TEST(MergeTest_, Merge1KeyNullColumns) cudf::size_type inputRows = 40; // data: 0 2 4 6 | valid: 1 1 1 0 - auto sequence1 = cudf::detail::make_counting_transform_iterator(0, [inputRows](auto row) { + auto sequence1 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) { return 0; // <- no shortcut to this can avoid compiler errors } else { @@ -465,7 +465,7 @@ TYPED_TEST(MergeTest_, Merge1KeyNullColumns) leftColWrap1(sequence1, sequence1 + inputRows, valid_sequence1); // data: 1 3 5 7 | valid: 1 1 1 0 - auto sequence2 = cudf::detail::make_counting_transform_iterator(0, [inputRows](auto row) { + auto sequence2 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) { return 1; } else diff --git a/cpp/tests/reductions/segmented_reduction_tests.cpp b/cpp/tests/reductions/segmented_reduction_tests.cpp index 19996f827cf..bc0321bd40a 100644 --- a/cpp/tests/reductions/segmented_reduction_tests.cpp +++ b/cpp/tests/reductions/segmented_reduction_tests.cpp @@ -1092,11 +1092,10 @@ TEST_F(SegmentedReductionTestUntyped, EmptyInputWithOffsets) auto aggregates = std::vector>>(); - aggregates.push_back(std::move(cudf::make_max_aggregation())); - aggregates.push_back(std::move(cudf::make_min_aggregation())); - aggregates.push_back(std::move(cudf::make_sum_aggregation())); - 
aggregates.push_back( - std::move(cudf::make_product_aggregation())); + aggregates.push_back(cudf::make_max_aggregation()); + aggregates.push_back(cudf::make_min_aggregation()); + aggregates.push_back(cudf::make_sum_aggregation()); + aggregates.push_back(cudf::make_product_aggregation()); auto output_type = cudf::data_type{cudf::type_to_id()}; for (auto&& agg : aggregates) { diff --git a/cpp/tests/replace/replace_tests.cpp b/cpp/tests/replace/replace_tests.cpp index 1858cd7782e..b12bf08520f 100644 --- a/cpp/tests/replace/replace_tests.cpp +++ b/cpp/tests/replace/replace_tests.cpp @@ -356,7 +356,7 @@ void test_replace(cudf::host_span input_column, for (size_t i = 0; i < values_to_replace_column.size(); i++) { size_t k = 0; - auto pred = [=, &k, &reference_result, &expected_valid, &isReplaced](T element) { + auto pred = [=, &k, &expected_valid, &isReplaced](T element) { bool toBeReplaced = false; if (!isReplaced[k]) { if (!input_has_nulls || expected_valid[k]) { @@ -503,7 +503,7 @@ TYPED_TEST(ReplaceTest, LargeScaleReplaceTest) const size_t REPLACE_SIZE = 10000; thrust::host_vector input_column(DATA_SIZE); - std::generate(std::begin(input_column), std::end(input_column), [REPLACE_SIZE]() { + std::generate(std::begin(input_column), std::end(input_column), []() { return std::rand() % (REPLACE_SIZE); }); diff --git a/cpp/tests/rolling/collect_ops_test.cpp b/cpp/tests/rolling/collect_ops_test.cpp index f702dc78371..165e0347785 100644 --- a/cpp/tests/rolling/collect_ops_test.cpp +++ b/cpp/tests/rolling/collect_ops_test.cpp @@ -214,7 +214,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowHonoursMinPeriods) *cudf::make_collect_list_aggregation()); auto expected_result_2 = cudf::test::lists_column_wrapper{ {{}, {0, 1, 2, 3}, {1, 2, 3, 4}, {2, 3, 4, 5}, {}, {}}, - cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) { + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0 && i < 4; })}.release(); @@ -338,7 +338,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods) cudf::test::fixed_width_column_wrapper{0, 0, 4, 8, 12, 12, 12}.release(); auto expected_num_rows = expected_offsets->size() - 1; auto null_mask_iter = cudf::detail::make_counting_transform_iterator( - cudf::size_type{0}, [expected_num_rows](auto i) { return i > 0 && i < 4; }); + cudf::size_type{0}, [](auto i) { return i > 0 && i < 4; }); auto [null_mask, null_count] = cudf::test::detail::make_null_mask(null_mask_iter, null_mask_iter + expected_num_rows); @@ -373,7 +373,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods) cudf::test::fixed_width_column_wrapper{0, 0, 3, 5, 8, 8, 8}.release(); auto expected_num_rows = expected_offsets->size() - 1; auto null_mask_iter = cudf::detail::make_counting_transform_iterator( - cudf::size_type{0}, [expected_num_rows](auto i) { return i > 0 && i < 4; }); + cudf::size_type{0}, [](auto i) { return i > 0 && i < 4; }); auto [null_mask, null_count] = cudf::test::detail::make_null_mask(null_mask_iter, null_mask_iter + expected_num_rows); @@ -1499,7 +1499,7 @@ TYPED_TEST(TypedCollectSetTest, RollingWindowHonoursMinPeriods) *cudf::make_collect_set_aggregation()); auto expected_result_2 = cudf::test::lists_column_wrapper{ {{}, {0, 1, 2}, {1, 2, 4}, {2, 4, 5}, {}, {}}, - cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) { + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0 && i < 4; })}.release(); diff --git a/cpp/tests/rolling/offset_row_window_test.cpp 
b/cpp/tests/rolling/offset_row_window_test.cpp index ec726878b34..0eaab0c9f7a 100644 --- a/cpp/tests/rolling/offset_row_window_test.cpp +++ b/cpp/tests/rolling/offset_row_window_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,6 +41,11 @@ using cudf::test::iterators::nulls_at; auto constexpr null = int32_t{0}; // NULL representation for int32_t; +// clang-tidy doesn't think std::transform can handle a +// thrust::constant_iterator, so this is a workaround that uses nulls_at +// instead of no_nulls +auto no_nulls_list() { return nulls_at({}); } + struct OffsetRowWindowTest : public cudf::test::BaseFixture { static ints_column const _keys; // {0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; static ints_column const _values; // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; @@ -210,7 +215,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Grouped_0_to_2) CUDF_TEST_EXPECT_COLUMNS_EQUAL( *run_rolling(*AGG_COLLECT_LIST), - lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5}, {}, {7, 8}, {8, 9}, {9}, {}}, no_nulls}); + lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5}, {}, {7, 8}, {8, 9}, {9}, {}}, + no_nulls_list()}); } TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_0_to_2) @@ -250,7 +256,7 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_0_to_2) CUDF_TEST_EXPECT_COLUMNS_EQUAL( *run_rolling(*AGG_COLLECT_LIST), lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9}, {}}, - no_nulls}); + no_nulls_list()}); } // To test that preceding bounds are clamped correctly at group boundaries. diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp index c2c22986975..6e0dc16dca9 100644 --- a/cpp/tests/rolling/rolling_test.cpp +++ b/cpp/tests/rolling/rolling_test.cpp @@ -541,7 +541,7 @@ class RollingTest : public cudf::test::BaseFixture { agg_op op; for (cudf::size_type i = 0; i < num_rows; i++) { - OutputType val = agg_op::template identity(); + auto val = agg_op::template identity(); // load sizes min_periods = std::max(min_periods, 1); // at least one observation is required diff --git a/cpp/tests/scalar/scalar_test.cpp b/cpp/tests/scalar/scalar_test.cpp index 2d37de920d5..2b79911a95a 100644 --- a/cpp/tests/scalar/scalar_test.cpp +++ b/cpp/tests/scalar/scalar_test.cpp @@ -190,7 +190,7 @@ TEST_F(ListScalarTest, MoveConstructorNonNested) EXPECT_EQ(mask_ptr, s2.validity_data()); EXPECT_EQ(data_ptr, s2.view().data()); - EXPECT_EQ(s.view().data(), nullptr); + EXPECT_EQ(s.view().data(), nullptr); // NOLINT } TEST_F(ListScalarTest, MoveConstructorNested) @@ -205,8 +205,8 @@ TEST_F(ListScalarTest, MoveConstructorNested) EXPECT_EQ(mask_ptr, s2.validity_data()); EXPECT_EQ(offset_ptr, s2.view().child(0).data()); EXPECT_EQ(data_ptr, s2.view().child(1).data()); - EXPECT_EQ(s.view().data(), nullptr); - EXPECT_EQ(s.view().num_children(), 0); + EXPECT_EQ(s.view().data(), nullptr); // NOLINT + EXPECT_EQ(s.view().num_children(), 0); // NOLINT } struct StructScalarTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/search/search_list_test.cpp b/cpp/tests/search/search_list_test.cpp index 48711c21715..7584003e800 100644 --- a/cpp/tests/search/search_list_test.cpp +++ b/cpp/tests/search/search_list_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,7 +35,6 @@ using strings_col = cudf::test::strings_column_wrapper; constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::FIRST_ERROR}; constexpr int32_t null{0}; // Mark for null child elements at the current level -constexpr int32_t XXX{0}; // Mark for null elements at all levels using TestTypes = cudf::test::Concat> grand_child; - grand_child.push_back(std::move(col4.release())); + grand_child.push_back(col4.release()); auto child_col_2 = cudf::make_structs_column(6, std::move(grand_child), 0, rmm::device_buffer{}); child_columns2.push_back(std::move(child_col_2)); auto struct_col3 = diff --git a/cpp/tests/stream_compaction/unique_tests.cpp b/cpp/tests/stream_compaction/unique_tests.cpp index 4d7d23dc881..d5b6915b520 100644 --- a/cpp/tests/stream_compaction/unique_tests.cpp +++ b/cpp/tests/stream_compaction/unique_tests.cpp @@ -43,7 +43,6 @@ auto constexpr KEEP_ANY = cudf::duplicate_keep_option::KEEP_ANY; auto constexpr KEEP_FIRST = cudf::duplicate_keep_option::KEEP_FIRST; auto constexpr KEEP_LAST = cudf::duplicate_keep_option::KEEP_LAST; auto constexpr KEEP_NONE = cudf::duplicate_keep_option::KEEP_NONE; -auto constexpr NULL_EQUAL = cudf::null_equality::EQUAL; auto constexpr NULL_UNEQUAL = cudf::null_equality::UNEQUAL; using int32s_col = cudf::test::fixed_width_column_wrapper; diff --git a/cpp/tests/streams/stream_compaction_test.cpp b/cpp/tests/streams/stream_compaction_test.cpp index 443f4548b2c..07b2d77cc04 100644 --- a/cpp/tests/streams/stream_compaction_test.cpp +++ b/cpp/tests/streams/stream_compaction_test.cpp @@ -29,8 +29,6 @@ #include -auto constexpr null{0}; // null at current level -auto constexpr XXX{0}; // null pushed down from parent level auto constexpr NaN = std::numeric_limits::quiet_NaN(); auto constexpr KEEP_ANY = cudf::duplicate_keep_option::KEEP_ANY; auto constexpr KEEP_FIRST = cudf::duplicate_keep_option::KEEP_FIRST; diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp index ce5f68de3c9..26bcfe8028d 100644 --- a/cpp/tests/strings/integers_tests.cpp +++ b/cpp/tests/strings/integers_tests.cpp @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -425,7 +426,7 @@ TYPED_TEST(StringsIntegerConvertTest, IntegerToHex) if (v == 0) { return std::string("00"); } // special handling for single-byte types if constexpr (std::is_same_v || std::is_same_v) { - char const hex_digits[16] = { + std::array const hex_digits = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; std::string str; str += hex_digits[(v & 0xF0) >> 4]; diff --git a/cpp/tests/structs/structs_column_tests.cpp b/cpp/tests/structs/structs_column_tests.cpp index f0010fc1ed9..219bd6d8b01 100644 --- a/cpp/tests/structs/structs_column_tests.cpp +++ b/cpp/tests/structs/structs_column_tests.cpp @@ -635,9 +635,8 @@ TEST_F(StructColumnWrapperTest, TestStructsColumnWithEmptyChild) auto mask_vec = std::vector{true, false, false}; auto [null_mask, null_count] = cudf::test::detail::make_null_mask(mask_vec.begin(), mask_vec.end()); - auto structs_col = - cudf::make_structs_column(num_rows, std::move(cols), null_count, std::move(null_mask)); - EXPECT_NO_THROW(structs_col->view()); + EXPECT_NO_THROW(auto structs_col = cudf::make_structs_column( + num_rows, std::move(cols), null_count, std::move(null_mask))); } CUDF_TEST_PROGRAM_MAIN() diff --git 
a/cpp/tests/transform/bools_to_mask_test.cpp b/cpp/tests/transform/bools_to_mask_test.cpp index 215ca158f37..2684123c08a 100644 --- a/cpp/tests/transform/bools_to_mask_test.cpp +++ b/cpp/tests/transform/bools_to_mask_test.cpp @@ -32,7 +32,7 @@ struct MaskToNullTest : public cudf::test::BaseFixture { { cudf::test::fixed_width_column_wrapper input_column( input.begin(), input.end(), val.begin()); - std::transform(val.begin(), val.end(), input.begin(), input.begin(), std::logical_and()); + std::transform(val.begin(), val.end(), input.begin(), input.begin(), std::logical_and<>()); auto sample = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); diff --git a/cpp/tests/transform/integration/unary_transform_test.cpp b/cpp/tests/transform/integration/unary_transform_test.cpp index 1785848ec77..0bdf5b321ac 100644 --- a/cpp/tests/transform/integration/unary_transform_test.cpp +++ b/cpp/tests/transform/integration/unary_transform_test.cpp @@ -47,7 +47,7 @@ void test_udf(char const* udf, Op op, Data data_init, cudf::size_type size, bool TEST_F(UnaryOperationIntegrationTest, Transform_FP32_FP32) { // c = a*a*a*a - char const* cuda = + std::string const cuda = R"***( __device__ inline void fdsf ( float* C, @@ -58,7 +58,7 @@ __device__ inline void fdsf ( } )***"; - char const* ptx = + std::string const ptx = R"***( // // Generated by NVIDIA NVVM Compiler @@ -101,17 +101,17 @@ __device__ inline void fdsf ( auto op = [](dtype a) { return a * a * a * a; }; auto data_init = [](cudf::size_type row) { return row % 3; }; - test_udf(cuda, op, data_init, 500, false); - test_udf(ptx, op, data_init, 500, true); + test_udf(cuda.c_str(), op, data_init, 500, false); + test_udf(ptx.c_str(), op, data_init, 500, true); } TEST_F(UnaryOperationIntegrationTest, Transform_INT32_INT32) { // c = a * a - a - char const cuda[] = + std::string const cuda = "__device__ inline void f(int* output,int input){*output = input*input - input;}"; - char const* ptx = + std::string const ptx = R"***( .func _Z1fPii( .param .b64 _Z1fPii_param_0, @@ -136,8 +136,8 @@ TEST_F(UnaryOperationIntegrationTest, Transform_INT32_INT32) auto op = [](dtype a) { return a * a - a; }; auto data_init = [](cudf::size_type row) { return row % 78; }; - test_udf(cuda, op, data_init, 500, false); - test_udf(ptx, op, data_init, 500, true); + test_udf(cuda.c_str(), op, data_init, 500, false); + test_udf(ptx.c_str(), op, data_init, 500, true); } TEST_F(UnaryOperationIntegrationTest, Transform_INT8_INT8) @@ -145,7 +145,7 @@ TEST_F(UnaryOperationIntegrationTest, Transform_INT8_INT8) // Capitalize all the lower case letters // Assuming ASCII, the PTX code is compiled from the following CUDA code - char const cuda[] = + std::string const cuda = R"***( __device__ inline void f( signed char* output, @@ -159,7 +159,7 @@ __device__ inline void f( } )***"; - char const ptx[] = + std::string const ptx = R"***( .func _Z1fPcc( .param .b64 _Z1fPcc_param_0, @@ -191,15 +191,15 @@ __device__ inline void f( auto op = [](dtype a) { return std::toupper(a); }; auto data_init = [](cudf::size_type row) { return 'a' + (row % 26); }; - test_udf(cuda, op, data_init, 500, false); - test_udf(ptx, op, data_init, 500, true); + test_udf(cuda.c_str(), op, data_init, 500, false); + test_udf(ptx.c_str(), op, data_init, 500, true); } TEST_F(UnaryOperationIntegrationTest, Transform_Datetime) { // Add one day to timestamp in microseconds - char const cuda[] = + std::string const cuda = R"***( __device__ inline void f(cudf::timestamp_us* output, cudf::timestamp_us input) { @@ 
-217,7 +217,7 @@ __device__ inline void f(cudf::timestamp_us* output, cudf::timestamp_us input) auto random_eng = cudf::test::UniformRandomGenerator(0, 100000000); auto data_init = [&random_eng](cudf::size_type row) { return random_eng.generate(); }; - test_udf(cuda, op, data_init, 500, false); + test_udf(cuda.c_str(), op, data_init, 500, false); } } // namespace transformation From 2d02bdce9e3efae232dea4a5b8b2eecf5c0f8a93 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Mon, 7 Oct 2024 16:30:58 -0500 Subject: [PATCH 029/117] Implement `extract_datetime_component` in `libcudf`/`pylibcudf` (#16776) Closes https://github.com/rapidsai/cudf/issues/16735 Authors: - https://github.com/brandon-b-miller - Lawrence Mitchell (https://github.com/wence-) Approvers: - Matthew Murray (https://github.com/Matt711) - Lawrence Mitchell (https://github.com/wence-) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16776 --- cpp/include/cudf/datetime.hpp | 34 +++++ cpp/include/cudf/detail/datetime.hpp | 10 ++ cpp/src/datetime/datetime_ops.cu | 88 ++++++------ cpp/tests/datetime/datetime_ops_test.cpp | 130 ++++++++++++++++++ python/cudf/cudf/_lib/datetime.pyx | 39 +++++- python/cudf_polars/cudf_polars/dsl/expr.py | 40 ++++-- python/pylibcudf/pylibcudf/datetime.pxd | 7 + python/pylibcudf/pylibcudf/datetime.pyx | 71 ++++------ .../pylibcudf/libcudf/CMakeLists.txt | 5 +- .../pylibcudf/pylibcudf/libcudf/datetime.pxd | 17 +++ .../pylibcudf/pylibcudf/libcudf/datetime.pyx | 0 .../pylibcudf/tests/test_datetime.py | 55 ++++---- 12 files changed, 358 insertions(+), 138 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/libcudf/datetime.pyx diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index 7359a0d5fde..1eaea5b6374 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -38,6 +38,22 @@ namespace datetime { * @file */ +/** + * @brief Types of datetime components that may be extracted. + */ +enum class datetime_component : uint8_t { + YEAR, + MONTH, + DAY, + WEEKDAY, + HOUR, + MINUTE, + SECOND, + MILLISECOND, + MICROSECOND, + NANOSECOND +}; + /** * @brief Extracts year from any datetime type and returns an int16_t * cudf::column. @@ -207,6 +223,24 @@ std::unique_ptr extract_nanosecond_fraction( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Extracts the specified datetime component from any datetime type and + * returns an int16_t cudf::column. 
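+ *
+ * @code{.cpp}
+ * // Illustrative sketch (the `timestamps` column is assumed): extract the
+ * // hour of each timestamp via the new single entry point; the stream and
+ * // memory-resource arguments are defaulted.
+ * auto hours = cudf::datetime::extract_datetime_component(
+ *   timestamps, cudf::datetime::datetime_component::HOUR);
+ * @endcode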
+ * + * @param column cudf::column_view of the input datetime values + * @param component The datetime component to extract + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate device memory of the returned column + * + * @returns cudf::column of the extracted int16_t datetime component + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr extract_datetime_component( + cudf::column_view const& column, + datetime_component component, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** @} */ // end of group /** * @addtogroup datetime_compute diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index 9db7e48498f..df3050d6494 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -115,6 +115,16 @@ std::unique_ptr extract_nanosecond_fraction(cudf::column_view cons rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); +/** + * @copydoc cudf::extract_datetime_component(cudf::column_view const&, datetime_component, + * rmm::cuda_stream_view, rmm::device_async_resource_ref) + * + */ +std::unique_ptr extract_datetime_component(cudf::column_view const& column, + datetime_component component, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + /** * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::cuda_stream_view, * rmm::device_async_resource_ref) diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index ddb0dbcd96d..a497cedb3bc 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -44,19 +44,6 @@ namespace cudf { namespace datetime { namespace detail { -enum class datetime_component { - INVALID = 0, - YEAR, - MONTH, - DAY, - WEEKDAY, - HOUR, - MINUTE, - SECOND, - MILLISECOND, - MICROSECOND, - NANOSECOND -}; enum class rounding_function { CEIL, ///< Rounds up to the next integer multiple of the provided frequency @@ -453,90 +440,70 @@ std::unique_ptr extract_year(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::YEAR, stream, mr); } std::unique_ptr extract_month(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MONTH, stream, mr); } std::unique_ptr extract_day(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::DAY, stream, mr); } std::unique_ptr extract_weekday(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::WEEKDAY, stream, mr); } std::unique_ptr extract_hour(column_view const& 
column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::HOUR, stream, mr); } std::unique_ptr extract_minute(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MINUTE, stream, mr); } std::unique_ptr extract_second(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::SECOND, stream, mr); } std::unique_ptr extract_millisecond_fraction(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MILLISECOND, stream, mr); } std::unique_ptr extract_microsecond_fraction(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MICROSECOND, stream, mr); } std::unique_ptr extract_nanosecond_fraction(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::NANOSECOND, stream, mr); } std::unique_ptr last_day_of_month(column_view const& column, @@ -576,6 +543,32 @@ std::unique_ptr extract_quarter(column_view const& column, return apply_datetime_op(column, stream, mr); } +std::unique_ptr extract_datetime_component(cudf::column_view const& column, + datetime_component component, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ +#define extract(field) \ + case field: \ + return apply_datetime_op, cudf::type_id::INT16>( \ + column, stream, mr) + + switch (component) { + extract(datetime_component::YEAR); + extract(datetime_component::MONTH); + extract(datetime_component::DAY); + extract(datetime_component::WEEKDAY); + extract(datetime_component::HOUR); + extract(datetime_component::MINUTE); + extract(datetime_component::SECOND); + extract(datetime_component::MILLISECOND); + extract(datetime_component::MICROSECOND); + extract(datetime_component::NANOSECOND); + default: CUDF_FAIL("Unsupported datetime component."); + } +#undef extract +} + } // namespace detail std::unique_ptr ceil_datetimes(column_view const& column, @@ -661,6 +654,15 @@ std::unique_ptr extract_second(column_view const& column, return detail::extract_second(column, stream, mr); } +std::unique_ptr extract_datetime_component(cudf::column_view const& column, + datetime_component component, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_datetime_component(column, component, stream, mr); +} + std::unique_ptr 
extract_millisecond_fraction(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 13577c4d0ea..603edb27c7c 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -196,6 +196,136 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) fixed_width_column_wrapper{0, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps_ns), fixed_width_column_wrapper{766, 424, 623}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1965, 2018, 2023}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1965, 2018, 2023}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1965, 2018, 2023}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1969, 1970, 1970}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MONTH), + fixed_width_column_wrapper{10, 7, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MONTH), + fixed_width_column_wrapper{10, 7, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MONTH), + fixed_width_column_wrapper{10, 7, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MONTH), + fixed_width_column_wrapper{12, 1, 1}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{26, 4, 25}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{26, 4, 25}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{26, 4, 25}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{31, 1, 1}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::HOUR), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::HOUR), + 
fixed_width_column_wrapper{14, 12, 7}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::HOUR), + fixed_width_column_wrapper{14, 12, 7}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::HOUR), + fixed_width_column_wrapper{23, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{1, 0, 32}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{1, 0, 32}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{59, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{12, 0, 12}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{12, 0, 12}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{59, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{762, 0, 929}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{976, 23, 987}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{675, 432, 234}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{0, 0, 
0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{766, 424, 623}); } template diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index bc5e085ec39..d844466120f 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -13,12 +13,11 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.filling cimport calendrical_month_sequence from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type +from pylibcudf.datetime import DatetimeComponent from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar -import pylibcudf as plc - @acquire_spill_lock() def add_months(Column col, Column months): @@ -40,9 +39,39 @@ def add_months(Column col, Column months): @acquire_spill_lock() def extract_datetime_component(Column col, object field): - result = Column.from_pylibcudf( - plc.datetime.extract_datetime_component(col.to_pylibcudf(mode="read"), field) - ) + + cdef unique_ptr[column] c_result + cdef column_view col_view = col.view() + cdef libcudf_datetime.datetime_component component + + component_names = { + "year": DatetimeComponent.YEAR, + "month": DatetimeComponent.MONTH, + "day": DatetimeComponent.DAY, + "weekday": DatetimeComponent.WEEKDAY, + "hour": DatetimeComponent.HOUR, + "minute": DatetimeComponent.MINUTE, + "second": DatetimeComponent.SECOND, + "millisecond": DatetimeComponent.MILLISECOND, + "microsecond": DatetimeComponent.MICROSECOND, + "nanosecond": DatetimeComponent.NANOSECOND, + } + if field == "day_of_year": + with nogil: + c_result = move(libcudf_datetime.day_of_year(col_view)) + elif field in component_names: + component = component_names[field] + with nogil: + c_result = move( + libcudf_datetime.extract_datetime_component( + col_view, + component + ) + ) + else: + raise ValueError(f"Invalid field: '{field}'") + + result = Column.from_unique_ptr(move(c_result)) if field == "weekday": # Pandas counts Monday-Sunday as 0-6 diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 54476b7fedc..a418560b31c 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -961,16 +961,16 @@ def do_evaluate( class TemporalFunction(Expr): __slots__ = ("name", "options", "children") _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = { - pl_expr.TemporalFunction.Year: "year", - pl_expr.TemporalFunction.Month: "month", - pl_expr.TemporalFunction.Day: "day", - pl_expr.TemporalFunction.WeekDay: "weekday", - pl_expr.TemporalFunction.Hour: "hour", - pl_expr.TemporalFunction.Minute: "minute", - pl_expr.TemporalFunction.Second: "second", - pl_expr.TemporalFunction.Millisecond: "millisecond", - pl_expr.TemporalFunction.Microsecond: "microsecond", - pl_expr.TemporalFunction.Nanosecond: "nanosecond", + pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR, + pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH, + pl_expr.TemporalFunction.Day: plc.datetime.DatetimeComponent.DAY, + pl_expr.TemporalFunction.WeekDay: plc.datetime.DatetimeComponent.WEEKDAY, + pl_expr.TemporalFunction.Hour: plc.datetime.DatetimeComponent.HOUR, + pl_expr.TemporalFunction.Minute: plc.datetime.DatetimeComponent.MINUTE, + pl_expr.TemporalFunction.Second: plc.datetime.DatetimeComponent.SECOND, + pl_expr.TemporalFunction.Millisecond: 
plc.datetime.DatetimeComponent.MILLISECOND, + pl_expr.TemporalFunction.Microsecond: plc.datetime.DatetimeComponent.MICROSECOND, + pl_expr.TemporalFunction.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND, } _non_child = ("dtype", "name", "options") children: tuple[Expr, ...] @@ -1003,8 +1003,12 @@ def do_evaluate( ] (column,) = columns if self.name == pl_expr.TemporalFunction.Microsecond: - millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") - micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") + millis = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MILLISECOND + ) + micros = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MICROSECOND + ) millis_as_micros = plc.binaryop.binary_operation( millis, plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), @@ -1019,9 +1023,15 @@ def do_evaluate( ) return Column(total_micros) elif self.name == pl_expr.TemporalFunction.Nanosecond: - millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") - micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") - nanos = plc.datetime.extract_datetime_component(column.obj, "nanosecond") + millis = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MILLISECOND + ) + micros = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MICROSECOND + ) + nanos = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.NANOSECOND + ) millis_as_nanos = plc.binaryop.binary_operation( millis, plc.interop.from_arrow(pa.scalar(1_000_000, type=pa.int32())), diff --git a/python/pylibcudf/pylibcudf/datetime.pxd b/python/pylibcudf/pylibcudf/datetime.pxd index 2fce48cf1b4..72ce680ba7a 100644 --- a/python/pylibcudf/pylibcudf/datetime.pxd +++ b/python/pylibcudf/pylibcudf/datetime.pxd @@ -1,8 +1,15 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
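+# Illustrative Python-level usage of the enum-based API declared below
+# (the timestamp Column `col` is assumed):
+#
+#   import pylibcudf as plc
+#   hours = plc.datetime.extract_datetime_component(
+#       col, plc.datetime.DatetimeComponent.HOUR)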
+from pylibcudf.libcudf.datetime cimport datetime_component + from .column cimport Column cpdef Column extract_year( Column col ) + +cpdef Column extract_datetime_component( + Column col, + datetime_component component +) diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx index e8e0caaf42d..784d29128bf 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyx +++ b/python/pylibcudf/pylibcudf/datetime.pyx @@ -3,19 +3,14 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.datetime cimport ( - day_of_year as cpp_day_of_year, - extract_day as cpp_extract_day, - extract_hour as cpp_extract_hour, - extract_microsecond_fraction as cpp_extract_microsecond_fraction, - extract_millisecond_fraction as cpp_extract_millisecond_fraction, - extract_minute as cpp_extract_minute, - extract_month as cpp_extract_month, - extract_nanosecond_fraction as cpp_extract_nanosecond_fraction, - extract_second as cpp_extract_second, - extract_weekday as cpp_extract_weekday, + datetime_component, + extract_datetime_component as cpp_extract_datetime_component, extract_year as cpp_extract_year, ) +from pylibcudf.libcudf.datetime import \ + datetime_component as DatetimeComponent # no-cython-lint + from .column cimport Column @@ -41,41 +36,29 @@ cpdef Column extract_year( result = move(cpp_extract_year(values.view())) return Column.from_libcudf(move(result)) +cpdef Column extract_datetime_component( + Column values, + datetime_component component +): + """ + Extract a datetime component from a datetime column. -def extract_datetime_component(Column col, str field): + For details, see :cpp:func:`cudf::extract_datetime_component`. - cdef unique_ptr[column] c_result + Parameters + ---------- + values : Column + The column to extract the component from. + component : DatetimeComponent + The datetime component to extract. - with nogil: - if field == "year": - c_result = move(cpp_extract_year(col.view())) - elif field == "month": - c_result = move(cpp_extract_month(col.view())) - elif field == "day": - c_result = move(cpp_extract_day(col.view())) - elif field == "weekday": - c_result = move(cpp_extract_weekday(col.view())) - elif field == "hour": - c_result = move(cpp_extract_hour(col.view())) - elif field == "minute": - c_result = move(cpp_extract_minute(col.view())) - elif field == "second": - c_result = move(cpp_extract_second(col.view())) - elif field == "millisecond": - c_result = move( - cpp_extract_millisecond_fraction(col.view()) - ) - elif field == "microsecond": - c_result = move( - cpp_extract_microsecond_fraction(col.view()) - ) - elif field == "nanosecond": - c_result = move( - cpp_extract_nanosecond_fraction(col.view()) - ) - elif field == "day_of_year": - c_result = move(cpp_day_of_year(col.view())) - else: - raise ValueError(f"Invalid datetime field: '{field}'") + Returns + ------- + Column + Column with the extracted component. + """ + cdef unique_ptr[column] result - return Column.from_libcudf(move(c_result)) + with nogil: + result = move(cpp_extract_datetime_component(values.view(), component)) + return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt index 2167616690f..15beaee47d4 100644 --- a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt @@ -12,8 +12,9 @@ # the License. 
# ============================================================================= -set(cython_sources aggregation.pyx binaryop.pyx copying.pyx expressions.pyx labeling.pyx reduce.pyx - replace.pyx round.pyx stream_compaction.pyx types.pyx unary.pyx +set(cython_sources + aggregation.pyx binaryop.pyx copying.pyx datetime.pyx expressions.pyx labeling.pyx reduce.pyx + replace.pyx round.pyx stream_compaction.pyx types.pyx unary.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/libcudf/datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd index a4465343197..73cdfb96af5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/datetime.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from libc.stdint cimport uint8_t from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -7,6 +8,18 @@ from pylibcudf.libcudf.scalar.scalar cimport scalar cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: + cpdef enum class datetime_component(uint8_t): + YEAR + MONTH + DAY + WEEKDAY + HOUR + MINUTE + SECOND + MILLISECOND + MICROSECOND + NANOSECOND + cdef unique_ptr[column] extract_year(const column_view& column) except + cdef unique_ptr[column] extract_month(const column_view& column) except + cdef unique_ptr[column] extract_day(const column_view& column) except + @@ -23,6 +36,10 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: cdef unique_ptr[column] extract_nanosecond_fraction( const column_view& column ) except + + cdef unique_ptr[column] extract_datetime_component( + const column_view& column, + datetime_component component + ) except + ctypedef enum rounding_frequency "cudf::datetime::rounding_frequency": DAY "cudf::datetime::rounding_frequency::DAY" diff --git a/python/pylibcudf/pylibcudf/libcudf/datetime.pyx b/python/pylibcudf/pylibcudf/libcudf/datetime.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/tests/test_datetime.py b/python/pylibcudf/pylibcudf/tests/test_datetime.py index 89c96829e71..75930d59058 100644 --- a/python/pylibcudf/pylibcudf/tests/test_datetime.py +++ b/python/pylibcudf/pylibcudf/tests/test_datetime.py @@ -1,7 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
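+# Note: libcudf's WEEKDAY component counts Monday through Sunday as 1-7
+# (the cudf layer above subtracts one to match pandas), which is why the
+# test below compares against pyarrow's day_of_week computed with
+# count_from_zero=False.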
import datetime
-import functools
 
 import pyarrow as pa
 import pyarrow.compute as pc
@@ -10,19 +9,6 @@
 from utils import assert_column_eq
 
 
-@pytest.fixture
-def date_column(has_nulls):
-    values = [
-        datetime.date(1999, 1, 1),
-        datetime.date(2024, 10, 12),
-        datetime.date(1, 1, 1),
-        datetime.date(9999, 1, 1),
-    ]
-    if has_nulls:
-        values[2] = None
-    return plc.interop.from_arrow(pa.array(values, type=pa.date32()))
-
-
 @pytest.fixture(scope="module", params=["s", "ms", "us", "ns"])
 def datetime_column(has_nulls, request):
     values = [
@@ -40,24 +26,35 @@ def datetime_column(has_nulls, request):
     )
 
 
-@pytest.mark.parametrize(
-    "component, pc_fun",
-    [
-        ("year", pc.year),
-        ("month", pc.month),
-        ("day", pc.day),
-        ("weekday", functools.partial(pc.day_of_week, count_from_zero=False)),
-        ("hour", pc.hour),
-        ("minute", pc.minute),
-        ("second", pc.second),
-        ("millisecond", pc.millisecond),
-        ("microsecond", pc.microsecond),
-        ("nanosecond", pc.nanosecond),
+@pytest.fixture(
+    params=[
+        ("year", plc.datetime.DatetimeComponent.YEAR),
+        ("month", plc.datetime.DatetimeComponent.MONTH),
+        ("day", plc.datetime.DatetimeComponent.DAY),
+        ("day_of_week", plc.datetime.DatetimeComponent.WEEKDAY),
+        ("hour", plc.datetime.DatetimeComponent.HOUR),
+        ("minute", plc.datetime.DatetimeComponent.MINUTE),
+        ("second", plc.datetime.DatetimeComponent.SECOND),
+        ("millisecond", plc.datetime.DatetimeComponent.MILLISECOND),
+        ("microsecond", plc.datetime.DatetimeComponent.MICROSECOND),
+        ("nanosecond", plc.datetime.DatetimeComponent.NANOSECOND),
     ],
+    ids=lambda x: x[0],
 )
-def test_extraction(datetime_column, component, pc_fun):
+def component(request):
+    return request.param
+
+
+def test_extract_datetime_component(datetime_column, component):
+    attr, component = component
+    kwargs = {}
+    if attr == "day_of_week":
+        kwargs = {"count_from_zero": False}
     got = plc.datetime.extract_datetime_component(datetime_column, component)
     # libcudf produces an int16, arrow produces an int64
-    expect = pc_fun(plc.interop.to_arrow(datetime_column)).cast(pa.int16())
+
+    expect = getattr(pc, attr)(
+        plc.interop.to_arrow(datetime_column), **kwargs
+    ).cast(pa.int16())
 
     assert_column_eq(expect, got)

From 09ed2105b841fe29be75af8b0d5a41fc09e7b6ac Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Mon, 7 Oct 2024 21:25:29 -0400
Subject: [PATCH 030/117] Migrate nvtext generate_ngrams APIs to pylibcudf (#17006)

A part of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17006
---
 .../pylibcudf/nvtext/generate_ngrams.rst      |   6 +
 .../api_docs/pylibcudf/nvtext/index.rst       |   1 +
 .../cudf/cudf/_lib/nvtext/generate_ngrams.pyx |  77 +++---------
 .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt |   2 +-
 .../pylibcudf/pylibcudf/nvtext/__init__.pxd   |   3 +-
 python/pylibcudf/pylibcudf/nvtext/__init__.py |   3 +-
 .../pylibcudf/nvtext/generate_ngrams.pxd      |  12 ++
 .../pylibcudf/nvtext/generate_ngrams.pyx      | 111 ++++++++++++++++++
 .../tests/test_nvtext_generate_ngrams.py      |  54 +++++++++
 9 files changed, 207 insertions(+), 62 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py

diff --git
a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst new file mode 100644 index 00000000000..d68199271bd --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst @@ -0,0 +1,6 @@ +=============== +generate_ngrams +=============== + +.. automodule:: pylibcudf.nvtext.generate_ngrams + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst index b5cd5ee42c3..2e03b589c8b 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -5,3 +5,4 @@ nvtext :maxdepth: 1 edit_distance + generate_ngrams diff --git a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx index 6591b527eec..7fdf9258b7f 100644 --- a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx +++ b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx @@ -2,75 +2,34 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.generate_ngrams cimport ( - generate_character_ngrams as cpp_generate_character_ngrams, - generate_ngrams as cpp_generate_ngrams, - hash_character_ngrams as cpp_hash_character_ngrams, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar + +from pylibcudf import nvtext @acquire_spill_lock() def generate_ngrams(Column strings, int ngrams, object py_separator): - - cdef DeviceScalar separator = py_separator.device_value - - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef const string_scalar* c_separator = separator\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_generate_ngrams( - c_strings, - c_ngrams, - c_separator[0] - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.generate_ngrams.generate_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams, + py_separator.device_value.c_value + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def generate_character_ngrams(Column strings, int ngrams): - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_generate_character_ngrams( - c_strings, - c_ngrams - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.generate_ngrams.generate_character_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def hash_character_ngrams(Column strings, int ngrams): - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_hash_character_ngrams( - c_strings, - c_ngrams - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.generate_ngrams.hash_character_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams + ) + return Column.from_pylibcudf(result) diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index 
ebe1fda1f12..eb5617a1da6 100644 --- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources edit_distance.pyx) +set(cython_sources edit_distance.pyx generate_ngrams.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd index 82f7c425b1d..7f5fa2b9925 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -1,7 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport edit_distance +from . cimport edit_distance, generate_ngrams __all__ = [ "edit_distance", + "generate_ngrams", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index 986652a241f..a66ce984745 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -1,7 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import edit_distance +from . import edit_distance, generate_ngrams __all__ = [ "edit_distance", + "generate_ngrams", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd new file mode 100644 index 00000000000..f15eb1f25e9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator) + +cpdef Column generate_character_ngrams(Column input, size_type ngrams=*) + +cpdef Column hash_character_ngrams(Column input, size_type ngrams=*) diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx new file mode 100644 index 00000000000..8c7a8edc01d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx @@ -0,0 +1,111 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.generate_ngrams cimport ( + generate_character_ngrams as cpp_generate_character_ngrams, + generate_ngrams as cpp_generate_ngrams, + hash_character_ngrams as cpp_hash_character_ngrams, +) +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator): + """ + Returns a single column of strings by generating ngrams from a strings column. 
+ + For details, see :cpp:func:`generate_ngrams`. + + Parameters + ---------- + input : Column + Input strings + ngrams : size_type + The ngram number to generate + separator : Scalar + The string to use for separating ngram tokens + + Returns + ------- + Column + New strings column of tokens + """ + cdef column_view c_strings = input.view() + cdef const string_scalar* c_separator = separator.c_obj.get() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_generate_ngrams( + c_strings, + ngrams, + c_separator[0] + ) + ) + return Column.from_libcudf(move(c_result)) + + +cpdef Column generate_character_ngrams(Column input, size_type ngrams = 2): + """ + Returns a lists column of ngrams of characters within each string. + + For details, see :cpp:func:`generate_character_ngrams`. + + Parameters + ---------- + input : Column + Input strings + ngrams : size_type + The ngram number to generate + + Returns + ------- + Column + Lists column of strings + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_generate_character_ngrams( + c_strings, + ngrams, + ) + ) + return Column.from_libcudf(move(c_result)) + + +cpdef Column hash_character_ngrams(Column input, size_type ngrams = 2): + """ + Returns a lists column of hash values of the characters in each string. + + For details, see :cpp:func:`hash_character_ngrams`. + + Parameters + ---------- + input : Column + Input strings + ngrams : size_type + The ngram number to generate + + Returns + ------- + Column + Lists column of hash values + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_hash_character_ngrams( + c_strings, + ngrams, + ) + ) + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py new file mode 100644 index 00000000000..5cf9874d595 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION.
+ +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["ab", "cde", "fgh"] + return pa.array(arr) + + +@pytest.mark.parametrize("ngram", [2, 3]) +@pytest.mark.parametrize("sep", ["_", "**", ","]) +def test_generate_ngrams(input_col, ngram, sep): + result = plc.nvtext.generate_ngrams.generate_ngrams( + plc.interop.from_arrow(input_col), + ngram, + plc.interop.from_arrow(pa.scalar(sep)), + ) + expected = pa.array([f"ab{sep}cde", f"cde{sep}fgh"]) + if ngram == 3: + expected = pa.array([f"ab{sep}cde{sep}fgh"]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("ngram", [2, 3]) +def test_generate_character_ngrams(input_col, ngram): + result = plc.nvtext.generate_ngrams.generate_character_ngrams( + plc.interop.from_arrow(input_col), + ngram, + ) + expected = pa.array([["ab"], ["cd", "de"], ["fg", "gh"]]) + if ngram == 3: + expected = pa.array([[], ["cde"], ["fgh"]]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("ngram", [2, 3]) +def test_hash_character_ngrams(input_col, ngram): + result = plc.nvtext.generate_ngrams.hash_character_ngrams( + plc.interop.from_arrow(input_col), + ngram, + ) + pa_result = plc.interop.to_arrow(result) + assert all( + len(got) == max(0, len(s.as_py()) - ngram + 1) + for got, s in zip(pa_result, input_col) + ) + assert pa_result.type == pa.list_( + pa.field("element", pa.uint32(), nullable=False) + ) From 219ec0e7fbff37d7387d25e93510b55a8782e2bf Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 8 Oct 2024 13:40:40 +0100 Subject: [PATCH 031/117] Expunge NamedColumn (#16962) Everything in the expression evaluation now operates on columns without names. DataFrame construction takes either a mapping from string-valued names to columns, or a sequence of pairs of names and columns. This removes some duplicate code in the NamedColumn class (by removing it) where we had to fight the inheritance hierarchy. 
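For example, the resulting interface can be used like this (an illustrative sketch, not part of the patch itself; it assumes only the `Column`/`DataFrame` behaviour shown in the diff below):

    import pylibcudf as plc

    from cudf_polars.containers import Column, DataFrame

    payload = plc.column_factories.make_numeric_column(
        plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID
    )
    # Columns are unnamed unless a name is attached explicitly;
    # rename() returns a shallow copy with the new name set.
    named = Column(payload).rename("a")
    df = DataFrame([named, Column(payload, name="b")])
    assert df.column_names == ["a", "b"]
    try:
        DataFrame([Column(payload)])
    except ValueError:
        pass  # unnamed columns are rejected at DataFrame construction time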
- Closes #16272 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/16962 --- .../cudf_polars/containers/__init__.py | 4 +- .../cudf_polars/containers/column.py | 110 ++++++---------- .../cudf_polars/containers/dataframe.py | 111 ++++++++--------- python/cudf_polars/cudf_polars/dsl/expr.py | 19 ++- python/cudf_polars/cudf_polars/dsl/ir.py | 117 +++++++++--------- python/cudf_polars/docs/overview.md | 18 +-- .../tests/containers/test_column.py | 9 +- .../tests/containers/test_dataframe.py | 39 +++--- .../tests/expressions/test_sort.py | 2 +- .../cudf_polars/tests/utils/test_broadcast.py | 20 +-- 10 files changed, 209 insertions(+), 240 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/__init__.py b/python/cudf_polars/cudf_polars/containers/__init__.py index 06bb08953f1..3b1eff4a0d0 100644 --- a/python/cudf_polars/cudf_polars/containers/__init__.py +++ b/python/cudf_polars/cudf_polars/containers/__init__.py @@ -5,7 +5,7 @@ from __future__ import annotations -__all__: list[str] = ["DataFrame", "Column", "NamedColumn"] +__all__: list[str] = ["DataFrame", "Column"] -from cudf_polars.containers.column import Column, NamedColumn +from cudf_polars.containers.column import Column from cudf_polars.containers.dataframe import DataFrame diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 3fe3e5557cb..00186098e54 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -15,7 +15,7 @@ import polars as pl -__all__: list[str] = ["Column", "NamedColumn"] +__all__: list[str] = ["Column"] class Column: @@ -26,6 +26,9 @@ class Column: order: plc.types.Order null_order: plc.types.NullOrder is_scalar: bool + # Optional name, only ever set by evaluation of NamedExpr nodes + # The internal evaluation should not care about the name. + name: str | None def __init__( self, @@ -34,14 +37,12 @@ def __init__( is_sorted: plc.types.Sorted = plc.types.Sorted.NO, order: plc.types.Order = plc.types.Order.ASCENDING, null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE, + name: str | None = None, ): self.obj = column self.is_scalar = self.obj.size() == 1 - if self.obj.size() <= 1: - is_sorted = plc.types.Sorted.YES - self.is_sorted = is_sorted - self.order = order - self.null_order = null_order + self.name = name + self.set_sorted(is_sorted=is_sorted, order=order, null_order=null_order) @functools.cached_property def obj_scalar(self) -> plc.Scalar: @@ -63,9 +64,26 @@ def obj_scalar(self) -> plc.Scalar: ) return plc.copying.get_element(self.obj, 0) + def rename(self, name: str | None, /) -> Self: + """ + Return a shallow copy with a new name. + + Parameters + ---------- + name + New name + + Returns + ------- + Shallow copy of self with new name set. + """ + new = self.copy() + new.name = name + return new + def sorted_like(self, like: Column, /) -> Self: """ - Copy sortedness properties from a column onto self. + Return a shallow copy with sortedness from like. Parameters ---------- @@ -74,20 +92,23 @@ def sorted_like(self, like: Column, /) -> Self: Returns ------- - Self with metadata set. + Shallow copy of self with metadata set. 
See Also -------- set_sorted, copy_metadata """ - return self.set_sorted( - is_sorted=like.is_sorted, order=like.order, null_order=like.null_order + return type(self)( + self.obj, + name=self.name, + is_sorted=like.is_sorted, + order=like.order, + null_order=like.null_order, ) - # TODO: Return Column once #16272 is fixed. - def astype(self, dtype: plc.DataType) -> plc.Column: + def astype(self, dtype: plc.DataType) -> Column: """ - Return the backing column as the requested dtype. + Cast the column to the requested dtype. Parameters ---------- dtype @@ -109,8 +130,10 @@ def astype(self, dtype: plc.DataType) -> plc.Column: the current one. """ if self.obj.type() != dtype: - return plc.unary.cast(self.obj, dtype) - return self.obj + return Column(plc.unary.cast(self.obj, dtype), name=self.name).sorted_like( + self + ) + return self def copy_metadata(self, from_: pl.Series, /) -> Self: """ @@ -129,6 +152,7 @@ def copy_metadata(self, from_: pl.Series, /) -> Self: -------- set_sorted, sorted_like """ + self.name = from_.name if len(from_) <= 1: return self ascending = from_.flags["SORTED_ASC"] @@ -192,6 +216,7 @@ def copy(self) -> Self: is_sorted=self.is_sorted, order=self.order, null_order=self.null_order, + name=self.name, ) def mask_nans(self) -> Self: @@ -217,58 +242,3 @@ ) ).as_py() return 0 - - -class NamedColumn(Column): - """A column with a name.""" - - name: str - - def __init__( - self, - column: plc.Column, - name: str, - *, - is_sorted: plc.types.Sorted = plc.types.Sorted.NO, - order: plc.types.Order = plc.types.Order.ASCENDING, - null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE, - ) -> None: - super().__init__( - column, is_sorted=is_sorted, order=order, null_order=null_order - ) - self.name = name - - def copy(self, *, new_name: str | None = None) -> Self: - """ - A shallow copy of the column. - - Parameters - ---------- - new_name - Optional new name for the copied column. - - Returns - ------- - New column sharing data with self. - """ - return type(self)( - self.obj, - self.name if new_name is None else new_name, - is_sorted=self.is_sorted, - order=self.order, - null_order=self.null_order, - ) - - def mask_nans(self) -> Self: - """Return a shallow copy of self with nans masked out.""" - # Annoying, the inheritance is not right (can't call the - # super-type mask_nans), but will sort that by refactoring - # later.
- if plc.traits.is_floating_point(self.obj.type()): - old_count = self.obj.null_count() - mask, new_count = plc.transform.nans_to_nulls(self.obj) - result = type(self)(self.obj.with_mask(mask, new_count), self.name) - if old_count == new_count: - return result.sorted_like(self) - return result - return self.copy() diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index f3e3862d0cc..2c195f6637c 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -5,43 +5,50 @@ from __future__ import annotations -import itertools from functools import cached_property -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import pyarrow as pa import pylibcudf as plc import polars as pl -from cudf_polars.containers.column import NamedColumn +from cudf_polars.containers import Column from cudf_polars.utils import dtypes if TYPE_CHECKING: - from collections.abc import Mapping, Sequence, Set + from collections.abc import Iterable, Mapping, Sequence, Set from typing_extensions import Self - from cudf_polars.containers import Column - __all__: list[str] = ["DataFrame"] +# Pacify the type checker. DataFrame init asserts that all the columns +# have a string name, so let's narrow the type. +class NamedColumn(Column): + name: str + + class DataFrame: """A representation of a dataframe.""" - columns: list[NamedColumn] + column_map: dict[str, Column] table: plc.Table + columns: list[NamedColumn] - def __init__(self, columns: Sequence[NamedColumn]) -> None: - self.columns = list(columns) - self._column_map = {c.name: c for c in self.columns} - self.table = plc.Table([c.obj for c in columns]) + def __init__(self, columns: Iterable[Column]) -> None: + columns = list(columns) + if any(c.name is None for c in columns): + raise ValueError("All columns must have a name") + self.columns = [cast(NamedColumn, c) for c in columns] + self.column_map = {c.name: c for c in self.columns} + self.table = plc.Table([c.obj for c in self.columns]) def copy(self) -> Self: """Return a shallow copy of self.""" - return type(self)([c.copy() for c in self.columns]) + return type(self)(c.copy() for c in self.columns) def to_polars(self) -> pl.DataFrame: """Convert to a polars DataFrame.""" @@ -51,42 +58,38 @@ def to_polars(self) -> pl.DataFrame: # https://github.com/pola-rs/polars/issues/11632 # To guarantee we produce correct names, we therefore # serialise with names we control and rename with that map. 
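# (For a frame with columns named "a" and "b", name_map below is # {"column_0": "a", "column_1": "b"}.)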
- name_map = {f"column_{i}": c.name for i, c in enumerate(self.columns)} + name_map = {f"column_{i}": name for i, name in enumerate(self.column_map)} table: pa.Table = plc.interop.to_arrow( self.table, [plc.interop.ColumnMetadata(name=name) for name in name_map], ) df: pl.DataFrame = pl.from_arrow(table) return df.rename(name_map).with_columns( - *( - pl.col(c.name).set_sorted( - descending=c.order == plc.types.Order.DESCENDING - ) - if c.is_sorted - else pl.col(c.name) - for c in self.columns - ) + pl.col(c.name).set_sorted(descending=c.order == plc.types.Order.DESCENDING) + if c.is_sorted + else pl.col(c.name) + for c in self.columns ) @cached_property def column_names_set(self) -> frozenset[str]: """Return the column names as a set.""" - return frozenset(c.name for c in self.columns) + return frozenset(self.column_map) @cached_property def column_names(self) -> list[str]: """Return a list of the column names.""" - return [c.name for c in self.columns] + return list(self.column_map) @cached_property def num_columns(self) -> int: """Number of columns.""" - return len(self.columns) + return len(self.column_map) @cached_property def num_rows(self) -> int: """Number of rows.""" - return 0 if len(self.columns) == 0 else self.table.num_rows() + return self.table.num_rows() if self.column_map else 0 @classmethod def from_polars(cls, df: pl.DataFrame) -> Self: @@ -111,12 +114,8 @@ def from_polars(cls, df: pl.DataFrame) -> Self: # No-op if the schema is unchanged. d_table = plc.interop.from_arrow(table.cast(schema)) return cls( - [ - NamedColumn(column, h_col.name).copy_metadata(h_col) - for column, h_col in zip( - d_table.columns(), df.iter_columns(), strict=True - ) - ] + Column(column).copy_metadata(h_col) + for column, h_col in zip(d_table.columns(), df.iter_columns(), strict=True) ) @classmethod @@ -144,17 +143,14 @@ def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: if table.num_columns() != len(names): raise ValueError("Mismatching name and table length.") return cls( - [ - NamedColumn(c, name) - for c, name in zip(table.columns(), names, strict=True) - ] + Column(c, name=name) for c, name in zip(table.columns(), names, strict=True) ) def sorted_like( self, like: DataFrame, /, *, subset: Set[str] | None = None ) -> Self: """ - Copy sortedness from a dataframe onto self. + Return a shallow copy with sortedness copied from like. Parameters ---------- @@ -165,7 +161,7 @@ def sorted_like( Returns ------- - Self with metadata set. + Shallow copy of self with metadata set. Raises ------ @@ -175,13 +171,12 @@ def sorted_like( if like.column_names != self.column_names: raise ValueError("Can only copy from identically named frame") subset = self.column_names_set if subset is None else subset - self.columns = [ + return type(self)( c.sorted_like(other) if c.name in subset else c for c, other in zip(self.columns, like.columns, strict=True) - ] - return self + ) - def with_columns(self, columns: Sequence[NamedColumn]) -> Self: + def with_columns(self, columns: Iterable[Column], *, replace_only=False) -> Self: """ Return a new dataframe with extra columns. @@ -189,6 +184,8 @@ def with_columns(self, columns: Sequence[NamedColumn]) -> Self: ---------- columns Columns to add + replace_only + If true, then only replacements are allowed (matching by name). Returns ------- @@ -196,36 +193,30 @@ def with_columns(self, columns: Sequence[NamedColumn]) -> Self: Notes ----- - If column names overlap, newer names replace older ones. 
+ If column names overlap, newer names replace older ones, and + appear in the same order as the original frame. """ - columns = list( - {c.name: c for c in itertools.chain(self.columns, columns)}.values() - ) - return type(self)(columns) + new = {c.name: c for c in columns} + if replace_only and not self.column_names_set.issuperset(new.keys()): + raise ValueError("Cannot replace with non-existing names") + return type(self)((self.column_map | new).values()) def discard_columns(self, names: Set[str]) -> Self: """Drop columns by name.""" - return type(self)([c for c in self.columns if c.name not in names]) + return type(self)(column for column in self.columns if column.name not in names) def select(self, names: Sequence[str]) -> Self: """Select columns by name returning DataFrame.""" - want = set(names) - if not want.issubset(self.column_names_set): - raise ValueError("Can't select missing names") - return type(self)([self._column_map[name] for name in names]) - - def replace_columns(self, *columns: NamedColumn) -> Self: - """Return a new dataframe with columns replaced by name.""" - new = {c.name: c for c in columns} - if not set(new).issubset(self.column_names_set): - raise ValueError("Cannot replace with non-existing names") - return type(self)([new.get(c.name, c) for c in self.columns]) + try: + return type(self)(self.column_map[name] for name in names) + except KeyError as e: + raise ValueError("Can't select missing names") from e def rename_columns(self, mapping: Mapping[str, str]) -> Self: """Rename some columns.""" - return type(self)([c.copy(new_name=mapping.get(c.name)) for c in self.columns]) + return type(self)(c.rename(mapping.get(c.name, c.name)) for c in self.columns) - def select_columns(self, names: Set[str]) -> list[NamedColumn]: + def select_columns(self, names: Set[str]) -> list[Column]: """Select columns by name.""" return [c for c in self.columns if c.name in names] diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index a418560b31c..f7775ceb238 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -27,7 +27,7 @@ from polars.exceptions import InvalidOperationError from polars.polars import _expr_nodes as pl_expr -from cudf_polars.containers import Column, NamedColumn +from cudf_polars.containers import Column from cudf_polars.utils import dtypes, sorting if TYPE_CHECKING: @@ -313,7 +313,7 @@ def evaluate( *, context: ExecutionContext = ExecutionContext.FRAME, mapping: Mapping[Expr, Column] | None = None, - ) -> NamedColumn: + ) -> Column: """ Evaluate this expression given a dataframe for context. @@ -328,20 +328,15 @@ def evaluate( Returns ------- - NamedColumn attaching a name to an evaluated Column + Evaluated Column with name attached. See Also -------- :meth:`Expr.evaluate` for details, this function just adds the name to a column produced from an expression. 
""" - obj = self.value.evaluate(df, context=context, mapping=mapping) - return NamedColumn( - obj.obj, - self.name, - is_sorted=obj.is_sorted, - order=obj.order, - null_order=obj.null_order, + return self.value.evaluate(df, context=context, mapping=mapping).rename( + self.name ) def collect_agg(self, *, depth: int) -> AggInfo: @@ -428,7 +423,9 @@ def do_evaluate( mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - return df._column_map[self.name] + # Deliberately remove the name here so that we guarantee + # evaluation of the IR produces names. + return df.column_map[self.name].rename(None) def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 1c61075be22..e319c363a23 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -26,7 +26,7 @@ import polars as pl import cudf_polars.dsl.expr as expr -from cudf_polars.containers import DataFrame, NamedColumn +from cudf_polars.containers import Column, DataFrame from cudf_polars.utils import dtypes, sorting if TYPE_CHECKING: @@ -57,9 +57,7 @@ ] -def broadcast( - *columns: NamedColumn, target_length: int | None = None -) -> list[NamedColumn]: +def broadcast(*columns: Column, target_length: int | None = None) -> list[Column]: """ Broadcast a sequence of columns to a common length. @@ -112,12 +110,12 @@ def broadcast( return [ column if column.obj.size() != 1 - else NamedColumn( + else Column( plc.Column.from_scalar(column.obj_scalar, nrows), - column.name, is_sorted=plc.types.Sorted.YES, order=plc.types.Order.ASCENDING, null_order=plc.types.NullOrder.BEFORE, + name=column.name, ) for column in columns ] @@ -385,15 +383,17 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: init = plc.interop.from_arrow( pa.scalar(offset, type=plc.interop.to_arrow(dtype)) ) - index = NamedColumn( + index = Column( plc.filling.sequence(df.num_rows, init, step), - name, is_sorted=plc.types.Sorted.YES, order=plc.types.Order.ASCENDING, null_order=plc.types.NullOrder.AFTER, + name=name, ) df = DataFrame([index, *df.columns]) - assert all(c.obj.type() == self.schema[c.name] for c in df.columns) + assert all( + c.obj.type() == self.schema[name] for name, c in df.column_map.items() + ) if self.predicate is None: return df else: @@ -588,15 +588,14 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: requests.append(plc.groupby.GroupByRequest(col, [req])) replacements.append(rep) group_keys, raw_tables = grouper.aggregate(requests) - # TODO: names - raw_columns: list[NamedColumn] = [] + raw_columns: list[Column] = [] for i, table in enumerate(raw_tables): (column,) = table.columns() - raw_columns.append(NamedColumn(column, f"tmp{i}")) + raw_columns.append(Column(column, name=f"tmp{i}")) mapping = dict(zip(replacements, raw_columns, strict=True)) result_keys = [ - NamedColumn(gk, k.name) - for gk, k in zip(group_keys.columns(), keys, strict=True) + Column(grouped_key, name=key.name) + for key, grouped_key in zip(keys, group_keys.columns(), strict=True) ] result_subs = DataFrame(raw_columns) results = [ @@ -639,8 +638,8 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) broadcasted = [ - NamedColumn(reordered, b.name) - for reordered, b in zip( + Column(reordered, name=old.name) + for 
reordered, old in zip( ordered_table.columns(), broadcasted, strict=True ) ] @@ -787,20 +786,20 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: # result, not the gather maps columns = plc.join.cross_join(left.table, right.table).columns() left_cols = [ - NamedColumn(new, old.name).sorted_like(old) + Column(new, name=old.name).sorted_like(old) for new, old in zip( columns[: left.num_columns], left.columns, strict=True ) ] right_cols = [ - NamedColumn( + Column( new, - old.name - if old.name not in left.column_names_set - else f"{old.name}{suffix}", + name=name + if name not in left.column_names_set + else f"{name}{suffix}", ) - for new, old in zip( - columns[left.num_columns :], right.columns, strict=True + for new, name in zip( + columns[left.num_columns :], right.column_names, strict=True ) ] return DataFrame([*left_cols, *right_cols]) @@ -838,18 +837,19 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: plc.copying.gather(right.table, rg, right_policy), right.column_names ) if coalesce and how != "inner": - left = left.replace_columns( - *( - NamedColumn( + left = left.with_columns( + ( + Column( plc.replace.replace_nulls(left_col.obj, right_col.obj), - left_col.name, + name=left_col.name, ) for left_col, right_col in zip( left.select_columns(left_on.column_names_set), right.select_columns(right_on.column_names_set), strict=True, ) - ) + ), + replace_only=True, ) right = right.discard_columns(right_on.column_names_set) if how == "right": @@ -931,9 +931,10 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: df = self.df.evaluate(cache=cache) if self.subset is None: indices = list(range(df.num_columns)) + keys_sorted = all(c.is_sorted for c in df.column_map.values()) else: indices = [i for i, k in enumerate(df.column_names) if k in self.subset] - keys_sorted = all(df.columns[i].is_sorted for i in indices) + keys_sorted = all(df.column_map[name].is_sorted for name in self.subset) if keys_sorted: table = plc.stream_compaction.unique( df.table, @@ -954,10 +955,11 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: plc.types.NullEquality.EQUAL, plc.types.NanEquality.ALL_EQUAL, ) + # TODO: Is this sortedness setting correct result = DataFrame( [ - NamedColumn(c, old.name).sorted_like(old) - for c, old in zip(table.columns(), df.columns, strict=True) + Column(new, name=old.name).sorted_like(old) + for new, old in zip(table.columns(), df.columns, strict=True) ] ) if keys_sorted or self.stable: @@ -1008,30 +1010,30 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: sort_keys = broadcast( *(k.evaluate(df) for k in self.by), target_length=df.num_rows ) - names = {c.name: i for i, c in enumerate(df.columns)} # TODO: More robust identification here. 
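# (A sort key only counts as present in the result if it matches a # result column both by name and by identity of the underlying # pylibcudf column.)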
- keys_in_result = [ - i - for k in sort_keys - if (i := names.get(k.name)) is not None and k.obj is df.columns[i].obj - ] + keys_in_result = { + k.name: i + for i, k in enumerate(sort_keys) + if k.name in df.column_map and k.obj is df.column_map[k.name].obj + } table = self.do_sort( df.table, plc.Table([k.obj for k in sort_keys]), self.order, self.null_order, ) - columns = [ - NamedColumn(c, old.name) - for c, old in zip(table.columns(), df.columns, strict=True) - ] - # If a sort key is in the result table, set the sortedness property - for k, i in enumerate(keys_in_result): - columns[i] = columns[i].set_sorted( - is_sorted=plc.types.Sorted.YES, - order=self.order[k], - null_order=self.null_order[k], - ) + columns: list[Column] = [] + for name, c in zip(df.column_map, table.columns(), strict=True): + column = Column(c, name=name) + # If a sort key is in the result table, set the sortedness property + if name in keys_in_result: + i = keys_in_result[name] + column = column.set_sorted( + is_sorted=plc.types.Sorted.YES, + order=self.order[i], + null_order=self.null_order[i], + ) + columns.append(column) return DataFrame(columns).slice(self.zlice) @@ -1080,7 +1082,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: df = self.df.evaluate(cache=cache) # This can reorder things. columns = broadcast( - *df.select(list(self.schema.keys())).columns, target_length=df.num_rows + *(df.column_map[name] for name in self.schema), target_length=df.num_rows ) return DataFrame(columns) @@ -1125,7 +1127,7 @@ def __post_init__(self) -> None: old, new, _ = self.options # TODO: perhaps polars should validate renaming in the IR? if len(new) != len(set(new)) or ( - set(new) & (set(self.df.schema.keys() - set(old))) + set(new) & (set(self.df.schema.keys()) - set(old)) ): raise NotImplementedError("Duplicate new names in rename.") elif self.name == "unpivot": @@ -1170,7 +1172,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: npiv = len(pivotees) df = self.df.evaluate(cache=cache) index_columns = [ - NamedColumn(col, name) + Column(col, name=name) for col, name in zip( plc.reshape.tile(df.select(indices).table, npiv).columns(), indices, @@ -1191,13 +1193,16 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: df.num_rows, ).columns() value_column = plc.concatenate.concatenate( - [c.astype(self.schema[value_name]) for c in df.select(pivotees).columns] + [ + df.column_map[pivotee].astype(self.schema[value_name]).obj + for pivotee in pivotees + ] ) return DataFrame( [ *index_columns, - NamedColumn(variable_column, variable_name), - NamedColumn(value_column, value_name), + Column(variable_column, name=variable_name), + Column(value_column, name=value_name), ] ) else: @@ -1278,6 +1283,4 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ) for df in dfs ] - return DataFrame( - list(itertools.chain.from_iterable(df.columns for df in dfs)), - ) + return DataFrame(itertools.chain.from_iterable(df.columns for df in dfs)) diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index bff44af1468..7837a275f20 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -201,21 +201,21 @@ the logical plan in any case, so is reasonably natural. # Containers Containers should be constructed as relatively lightweight objects -around their pylibcudf counterparts. We have four (in +around their pylibcudf counterparts. 
We have three (in `cudf_polars/containers/`): 1. `Scalar` (a wrapper around a pylibcudf `Scalar`) 2. `Column` (a wrapper around a pylibcudf `Column`) -3. `NamedColumn` (a `Column` with an additional name) -4. `DataFrame` (a wrapper around a pylibcudf `Table`) +3. `DataFrame` (a wrapper around a pylibcudf `Table`) The interfaces offered by these are somewhat in flux, but broadly -speaking, a `DataFrame` is just a list of `NamedColumn`s which each -hold a `Column` plus a string `name`. `NamedColumn`s are only ever -constructed via `NamedExpr`s, which are the top-level expression node -that lives inside an `IR` node. This means that the expression -evaluator never has to concern itself with column names: columns are -only ever decorated with names when constructing a `DataFrame`. +speaking, a `DataFrame` is just a mapping from string `name`s to +`Column`s, and thus also holds a pylibcudf `Table`. Names are only +attached to `Column`s and hence inserted into `DataFrames` via +`NamedExpr`s, which are the top-level expression nodes that live +inside an `IR` node. This means that the expression evaluator never +has to concern itself with column names: columns are only ever +decorated with names when constructing a `DataFrame`. The columns keep track of metadata (for example, whether or not they are sorted). We could imagine tracking more metadata, like minimum and diff --git a/python/cudf_polars/tests/containers/test_column.py b/python/cudf_polars/tests/containers/test_column.py index 19919877f84..1f26ab1af9f 100644 --- a/python/cudf_polars/tests/containers/test_column.py +++ b/python/cudf_polars/tests/containers/test_column.py @@ -3,13 +3,11 @@ from __future__ import annotations -from functools import partial - import pyarrow import pylibcudf as plc import pytest -from cudf_polars.containers import Column, NamedColumn +from cudf_polars.containers import Column def test_non_scalar_access_raises(): @@ -55,11 +53,10 @@ def test_shallow_copy(): @pytest.mark.parametrize("typeid", [plc.TypeId.INT8, plc.TypeId.FLOAT32]) -@pytest.mark.parametrize("constructor", [Column, partial(NamedColumn, name="name")]) -def test_mask_nans(typeid, constructor): +def test_mask_nans(typeid): dtype = plc.DataType(typeid) values = pyarrow.array([0, 0, 0], type=plc.interop.to_arrow(dtype)) - column = constructor(plc.interop.from_arrow(values)) + column = Column(plc.interop.from_arrow(values)) masked = column.mask_nans() assert column.obj.null_count() == masked.obj.null_count() diff --git a/python/cudf_polars/tests/containers/test_dataframe.py b/python/cudf_polars/tests/containers/test_dataframe.py index 39fb44d55a5..5c68fb8f0aa 100644 --- a/python/cudf_polars/tests/containers/test_dataframe.py +++ b/python/cudf_polars/tests/containers/test_dataframe.py @@ -8,18 +8,18 @@ import polars as pl -from cudf_polars.containers import DataFrame, NamedColumn +from cudf_polars.containers import Column, DataFrame from cudf_polars.testing.asserts import assert_gpu_result_equal def test_select_missing_raises(): df = DataFrame( [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID ), - "a", + name="a", ) ] ) @@ -30,17 +30,17 @@ def test_select_missing_raises(): def test_replace_missing_raises(): df = DataFrame( [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID ), - "a", + name="a", ) ] ) - replacement = df.columns[0].copy(new_name="b") + replacement = df.column_map["a"].copy().rename("b") 
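+    # "b" is not an existing column name, so replacement-only insertion must fail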
with pytest.raises(ValueError): - df.replace_columns(replacement) + df.with_columns([replacement], replace_only=True) def test_from_table_wrong_names(): @@ -55,14 +55,23 @@ def test_from_table_wrong_names(): DataFrame.from_table(table, ["a", "b"]) +def test_unnamed_column_raise(): + payload = plc.column_factories.make_numeric_column( + plc.DataType(plc.TypeId.INT8), 0, plc.MaskState.ALL_VALID + ) + + with pytest.raises(ValueError): + DataFrame([Column(payload, name="a"), Column(payload)]) + + def test_sorted_like_raises_mismatching_names(): df = DataFrame( [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID ), - "a", + name="a", ) ] ) @@ -72,11 +81,11 @@ def test_sorted_like_raises_mismatching_names(): def test_shallow_copy(): - column = NamedColumn( + column = Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID ), - "a", + name="a", ) column.set_sorted( is_sorted=plc.types.Sorted.YES, @@ -85,13 +94,13 @@ def test_shallow_copy(): ) df = DataFrame([column]) copy = df.copy() - copy.columns[0].set_sorted( + copy.column_map["a"].set_sorted( is_sorted=plc.types.Sorted.NO, order=plc.types.Order.ASCENDING, null_order=plc.types.NullOrder.AFTER, ) - assert df.columns[0].is_sorted == plc.types.Sorted.YES - assert copy.columns[0].is_sorted == plc.types.Sorted.NO + assert df.column_map["a"].is_sorted == plc.types.Sorted.YES + assert copy.column_map["a"].is_sorted == plc.types.Sorted.NO def test_sorted_flags_preserved_empty(): @@ -100,7 +109,7 @@ def test_sorted_flags_preserved_empty(): gf = DataFrame.from_polars(df) - (a,) = gf.columns + a = gf.column_map["a"] assert a.is_sorted == plc.types.Sorted.YES diff --git a/python/cudf_polars/tests/expressions/test_sort.py b/python/cudf_polars/tests/expressions/test_sort.py index 76c7648813a..2a37683478b 100644 --- a/python/cudf_polars/tests/expressions/test_sort.py +++ b/python/cudf_polars/tests/expressions/test_sort.py @@ -69,7 +69,7 @@ def test_setsorted(descending, nulls_last, with_nulls): df = translate_ir(q._ldf.visit()).evaluate(cache={}) - (a,) = df.columns + a = df.column_map["a"] assert a.is_sorted == plc.types.Sorted.YES null_order = ( diff --git a/python/cudf_polars/tests/utils/test_broadcast.py b/python/cudf_polars/tests/utils/test_broadcast.py index 35aaef44e1f..e7770bfadac 100644 --- a/python/cudf_polars/tests/utils/test_broadcast.py +++ b/python/cudf_polars/tests/utils/test_broadcast.py @@ -6,34 +6,35 @@ import pylibcudf as plc import pytest -from cudf_polars.containers import NamedColumn +from cudf_polars.containers import Column from cudf_polars.dsl.ir import broadcast @pytest.mark.parametrize("target", [4, None]) def test_broadcast_all_scalar(target): columns = [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 1, plc.MaskState.ALL_VALID ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] result = broadcast(*columns, target_length=target) expected = 1 if target is None else target + assert [c.name for c in result] == [f"col{i}" for i in range(3)] assert all(column.obj.size() == expected for column in result) def test_invalid_target_length(): columns = [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 4, plc.MaskState.ALL_VALID ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] @@ -43,11 +44,11 @@ def test_invalid_target_length(): def test_broadcast_mismatching_column_lengths(): columns = [ - NamedColumn( + 
Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), i + 1, plc.MaskState.ALL_VALID ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] @@ -58,16 +59,17 @@ def test_broadcast_mismatching_column_lengths(): @pytest.mark.parametrize("nrows", [0, 5]) def test_broadcast_with_scalars(nrows): columns = [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), nrows if i == 0 else 1, plc.MaskState.ALL_VALID, ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] result = broadcast(*columns) + assert [c.name for c in result] == [f"col{i}" for i in range(3)] assert all(column.obj.size() == nrows for column in result) From bcf9425a8fc8bfe4a08840749a16cf83e1bc89e8 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 8 Oct 2024 13:40:40 +0100 Subject: [PATCH 032/117] Compute whole column variance using numerically stable approach (#16448) We use the pairwise approach of Chan, Golub, and LeVeque (1983). - Closes #16444 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Bradley Dice (https://github.com/bdice) - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/16448 --- .../reduction/detail/reduction_operators.cuh | 48 ++++--- cpp/src/reductions/compound.cuh | 26 ++-- cpp/tests/reductions/reduction_tests.cpp | 131 ++++++++---------- .../java/ai/rapids/cudf/ReductionTest.java | 4 +- .../cudf/cudf/core/column/numerical_base.py | 10 +- python/cudf/cudf/core/series.py | 2 +- python/cudf/cudf/tests/test_datetime.py | 18 +-- 7 files changed, 116 insertions(+), 123 deletions(-) diff --git a/cpp/include/cudf/reduction/detail/reduction_operators.cuh b/cpp/include/cudf/reduction/detail/reduction_operators.cuh index 4cf8564ab3a..5694362af8f 100644 --- a/cpp/include/cudf/reduction/detail/reduction_operators.cuh +++ b/cpp/include/cudf/reduction/detail/reduction_operators.cuh @@ -31,17 +31,41 @@ namespace detail { // intermediate data structure to compute `var`, `std` template <typename ResultType> struct var_std { - ResultType value; /// the value - ResultType value_squared; /// the value of squared - - CUDF_HOST_DEVICE inline var_std(ResultType _value = 0, ResultType _value_squared = 0) - : value(_value), value_squared(_value_squared){}; + // Uses the pairwise approach of Chan, Golub, and LeVeque, + // _Algorithms for computing the sample variance: analysis and + // recommendations_ (1983) + // https://doi.org/10.1080/00031305.1983.10483115 + // Also http://www.cs.yale.edu/publications/techreports/tr222.pdf + // This is a modification of Youngs and Cramer's online approach. + ResultType running_sum; + ResultType running_square_deviations; + size_type count; + + CUDF_HOST_DEVICE inline var_std(ResultType t = 0, ResultType s = 0, size_type n = 0) : running_sum(t), running_square_deviations(s), count(n){}; using this_t = var_std<ResultType>; CUDF_HOST_DEVICE inline this_t operator+(this_t const& rhs) const { - return this_t((this->value + rhs.value), (this->value_squared + rhs.value_squared)); + // Updates as per equations 1.5a and 1.5b in the paper + // T_{1,m+n} = T_{1,m} + T_{m+1,m+n} + // S_{1,m+n} = S_{1,m} + S_{m+1,m+n} + m/(n(m+n)) * (n/m T_{1,m} - T_{m+1,m+n})**2 + // Here the first m samples are in this, the remaining n samples are in rhs. + auto const m = this->count; + auto const n = rhs.count; + // Avoid division by zero.
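+ // For example (illustrative numbers only): merging partitions with + // (T, S, n) = (3, 2, 2) and (9, 8, 3) gives T_{1,5} = 12 and + // S_{1,5} = 2 + 8 + (2/(3*5)) * ((3/2)*3 - 9)**2 = 12.7, which matches a + // direct computation over the five pooled samples.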
+ if (m == 0) { return rhs; } + if (n == 0) { return *this; } + auto const tm = this->running_sum; + auto const tn = rhs.running_sum; + auto const sm = this->running_square_deviations; + auto const sn = rhs.running_square_deviations; + auto const tmn = tm + tn; + auto const diff = ((static_cast<ResultType>(n) / m) * tm) - tn; + // Computing m/n(m+n) as m/n/(m+n) to avoid integer overflow + auto const smn = sm + sn + ((static_cast<ResultType>(m) / n) / (m + n)) * diff * diff; + return {tmn, smn, m + n}; }; }; @@ -50,10 +74,7 @@ template <typename ResultType> struct transformer_var_std { using OutputType = var_std<ResultType>; - CUDF_HOST_DEVICE inline OutputType operator()(ResultType const& value) - { - return OutputType(value, value * value); - }; + CUDF_HOST_DEVICE inline OutputType operator()(ResultType const& value) { return {value, 0, 1}; }; }; // ------------------------------------------------------------------------ @@ -257,12 +278,7 @@ struct variance : public compound_op<variance> { cudf::size_type const& count, cudf::size_type const& ddof) { - ResultType mean = input.value / count; - ResultType asum = input.value_squared; - cudf::size_type div = count - ddof; - ResultType var = asum / div - ((mean * mean) * count) / div; - - return var; + return input.running_square_deviations / (count - ddof); }; }; }; diff --git a/cpp/src/reductions/compound.cuh b/cpp/src/reductions/compound.cuh index 6bc8b48832f..cd9fade164a 100644 --- a/cpp/src/reductions/compound.cuh +++ b/cpp/src/reductions/compound.cuh @@ -18,13 +18,18 @@ #include #include +#include #include +#include #include #include #include #include +#include +#include + namespace cudf { namespace reduction { namespace compound { @@ -53,9 +58,17 @@ std::unique_ptr<scalar> compound_reduction(column_view const& col, { auto const valid_count = col.size() - col.null_count(); + // All null input produces all null output + if (valid_count == 0 || + // Only care about ddof for standard deviation and variance right now + valid_count <= ddof && (std::is_same_v<Op, cudf::reduction::detail::op::standard_deviation> || + std::is_same_v<Op, cudf::reduction::detail::op::variance>)) { + auto result = cudf::make_fixed_width_scalar(output_dtype, stream, mr); + result->set_valid_async(false, stream); + return result; + } // reduction by iterator auto dcol = cudf::column_device_view::create(col, stream); - std::unique_ptr<scalar> result; Op compound_op{}; if (!cudf::is_dictionary(col.type())) { @@ -63,25 +76,21 @@ std::unique_ptr<scalar> compound_reduction(column_view const& col, auto it = thrust::make_transform_iterator( dcol->pair_begin<ElementType, true>(), compound_op.template get_null_replacing_element_transformer<ResultType>()); - result = cudf::reduction::detail::reduce<Op, decltype(it), ResultType>( + return cudf::reduction::detail::reduce<Op, decltype(it), ResultType>( it, col.size(), compound_op, valid_count, ddof, stream, mr); } else { auto it = thrust::make_transform_iterator( dcol->begin<ElementType>(), compound_op.template get_element_transformer<ResultType>()); - result = cudf::reduction::detail::reduce<Op, decltype(it), ResultType>( + return cudf::reduction::detail::reduce<Op, decltype(it), ResultType>( it, col.size(), compound_op, valid_count, ddof, stream, mr); } } else { auto it = thrust::make_transform_iterator( cudf::dictionary::detail::make_dictionary_pair_iterator<ElementType>(*dcol, col.has_nulls()), compound_op.template get_null_replacing_element_transformer<ResultType>()); - result = cudf::reduction::detail::reduce<Op, decltype(it), ResultType>( + return cudf::reduction::detail::reduce<Op, decltype(it), ResultType>( it, col.size(), compound_op, valid_count, ddof, stream, mr); } - - // set scalar is valid - result->set_valid_async(col.null_count() < col.size(), stream); - return result; }; // @brief result type dispatcher for compound reduction (a.k.a.
mean, var, std) template <typename ElementType> struct element_type_dispatcher { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_EXPECTS(ddof >= 0, "ddof must be non-negative", std::domain_error); return cudf::type_dispatcher( output_dtype, result_type_dispatcher<ElementType>(), col, output_dtype, ddof, stream, mr); } diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index 1e9e13ded93..bdb98372836 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -33,8 +33,12 @@ #include #include +#include #include +#include +#include +#include #include using aggregation = cudf::aggregation; @@ -765,6 +769,25 @@ TYPED_TEST(MultiStepReductionTest, Mean) expected_value_nulls); } +template <typename T> +double calc_var(std::vector<T> const& v, int ddof, std::vector<bool> const& mask = {}) +{ + auto const values = [&]() { + if (mask.empty()) { return v; } + std::vector<T> masked{}; + thrust::copy_if( + v.begin(), v.end(), mask.begin(), std::back_inserter(masked), [](auto m) { return m; }); + return masked; + }(); + auto const valid_count = values.size(); + double const mean = std::accumulate(values.cbegin(), values.cend(), double{0}) / valid_count; + double const sq_sum_of_differences = + std::accumulate(values.cbegin(), values.cend(), double{0}, [mean](double acc, auto const v) { + return acc + std::pow(v - mean, 2); + }); + return sq_sum_of_differences / (valid_count - ddof); +} + // This test is disabled for only a Debug build because a compiler error // documented in cpp/src/reductions/std.cu and cpp/src/reductions/var.cu #ifdef NDEBUG @@ -777,25 +800,12 @@ TYPED_TEST(MultiStepReductionTest, DISABLED_var_std) std::vector<int> int_values({-3, 2, 1, 0, 5, -3, -2, 28}); std::vector<bool> host_bools({true, true, false, true, true, true, false, true}); - auto calc_var = [](std::vector<T>& v, cudf::size_type valid_count, int ddof) { - double mean = std::accumulate(v.begin(), v.end(), double{0}); - mean /= valid_count; - - double sum_of_sq = std::accumulate( - v.begin(), v.end(), double{0}, [](double acc, TypeParam i) { return acc + i * i; }); - - cudf::size_type div = valid_count - ddof; - - double var = sum_of_sq / div - ((mean * mean) * valid_count) / div; - return var; - }; - // test without nulls std::vector<T> v = convert_values<T>(int_values); cudf::test::fixed_width_column_wrapper<T> col(v.begin(), v.end()); auto const ddof = 1; - double var = calc_var(v, v.size(), ddof); + double var = calc_var(v, ddof); double std = std::sqrt(var); auto var_agg = cudf::make_variance_aggregation<reduce_aggregation>(ddof); auto std_agg = cudf::make_std_aggregation<reduce_aggregation>(ddof); EXPECT_EQ(this ->template reduction_test<double>( col, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) .first, var); EXPECT_EQ(this ->template reduction_test<double>( col, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) .first, std); // test with nulls cudf::test::fixed_width_column_wrapper<T> col_nulls = construct_null_column(v, host_bools); - cudf::size_type valid_count = - cudf::column_view(col_nulls).size() - cudf::column_view(col_nulls).null_count(); - auto replaced_array = replace_nulls(v, host_bools, T{0}); - - double var_nulls = calc_var(replaced_array, valid_count, ddof); - double std_nulls = std::sqrt(var_nulls); + double var_nulls = calc_var(v, ddof, host_bools); + double std_nulls = std::sqrt(var_nulls); - EXPECT_EQ(this - ->template reduction_test<double>( - col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - var_nulls); - EXPECT_EQ(this - ->template reduction_test<double>( - col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - std_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test<double>( + col_nulls, *var_agg,
cudf::data_type(cudf::type_id::FLOAT64)) + .first, + var_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test<double>( + col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + std_nulls); } // ---------------------------------------------------------------------------- @@ -1139,23 +1145,10 @@ TEST_P(ReductionParamTest, DISABLED_std_var) std::vector<int> int_values({-3, 2, 1, 0, 5, -3, -2, 28}); std::vector<bool> host_bools({true, true, false, true, true, true, false, true}); - auto calc_var = [ddof](std::vector<int>& v, cudf::size_type valid_count) { - double mean = std::accumulate(v.begin(), v.end(), double{0}); - mean /= valid_count; - - double sum_of_sq = std::accumulate( - v.begin(), v.end(), double{0}, [](double acc, double i) { return acc + i * i; }); - - cudf::size_type div = valid_count - ddof; - - double var = sum_of_sq / div - ((mean * mean) * valid_count) / div; - return var; - }; - // test without nulls cudf::test::fixed_width_column_wrapper<double> col(int_values.begin(), int_values.end()); - double var = calc_var(int_values, int_values.size()); + double var = calc_var(int_values, ddof); double std = std::sqrt(var); auto var_agg = cudf::make_variance_aggregation<reduce_aggregation>(ddof); auto std_agg = cudf::make_std_aggregation<reduce_aggregation>(ddof); @@ -1172,23 +1165,19 @@ TEST_P(ReductionParamTest, DISABLED_std_var) // test with nulls cudf::test::fixed_width_column_wrapper<double> col_nulls = construct_null_column(int_values, host_bools); - cudf::size_type valid_count = - cudf::column_view(col_nulls).size() - cudf::column_view(col_nulls).null_count(); - auto replaced_array = replace_nulls(int_values, host_bools, int{0}); - - double var_nulls = calc_var(replaced_array, valid_count); + double var_nulls = calc_var(int_values, ddof, host_bools); double std_nulls = std::sqrt(var_nulls); - EXPECT_EQ(this - ->template reduction_test<double>( - col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - var_nulls); - EXPECT_EQ(this - ->template reduction_test<double>( - col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - std_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test<double>( + col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + var_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test<double>( + col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + std_nulls); } //------------------------------------------------------------------- @@ -2471,21 +2460,11 @@ TYPED_TEST(DictionaryReductionTest, DISABLED_VarStd) std::vector<T> v = convert_values<T>(int_values); cudf::data_type output_type{cudf::type_to_id<T>()}; - auto calc_var = [](std::vector<T> const& v, cudf::size_type valid_count, cudf::size_type ddof) { - double mean = std::accumulate(v.cbegin(), v.cend(), double{0}); - mean /= valid_count; - double sum_of_sq = std::accumulate( - v.cbegin(), v.cend(), double{0}, [](double acc, TypeParam i) { return acc + i * i; }); - auto const div = valid_count - ddof; - double var = sum_of_sq / div - ((mean * mean) * valid_count) / div; - return var; - }; - // test without nulls cudf::test::dictionary_column_wrapper<T> col(v.begin(), v.end()); cudf::size_type const ddof = 1; - double var = calc_var(v, v.size(), ddof); + double var = calc_var(v, ddof); double std = std::sqrt(var); auto var_agg = cudf::make_variance_aggregation<reduce_aggregation>(ddof); auto std_agg = cudf::make_std_aggregation<reduce_aggregation>(ddof); EXPECT_EQ(this->template reduction_test<double>(col, *var_agg, output_type).first, var); EXPECT_EQ(this->template reduction_test<double>(col, *std_agg, output_type).first, std); // test with nulls std::vector<bool> validity({true, true, false, true, true, true, false, true}); cudf::test::dictionary_column_wrapper<T> col_nulls(v.begin(),
v.end(), validity.begin()); - cudf::size_type const valid_count = std::count(validity.begin(), validity.end(), true); - - double var_nulls = calc_var(replace_nulls(v, validity, T{0}), valid_count, ddof); + double var_nulls = calc_var(v, ddof, validity); double std_nulls = std::sqrt(var_nulls); - EXPECT_EQ(this->template reduction_test<double>(col_nulls, *var_agg, output_type).first, - var_nulls); - EXPECT_EQ(this->template reduction_test<double>(col_nulls, *std_agg, output_type).first, - std_nulls); + EXPECT_DOUBLE_EQ(this->template reduction_test<double>(col_nulls, *var_agg, output_type).first, + var_nulls); + EXPECT_DOUBLE_EQ(this->template reduction_test<double>(col_nulls, *std_agg, output_type).first, + std_nulls); } TYPED_TEST(DictionaryReductionTest, NthElement) diff --git a/java/src/test/java/ai/rapids/cudf/ReductionTest.java b/java/src/test/java/ai/rapids/cudf/ReductionTest.java index 8cc7df1ce7f..6bd6603d71b 100644 --- a/java/src/test/java/ai/rapids/cudf/ReductionTest.java +++ b/java/src/test/java/ai/rapids/cudf/ReductionTest.java @@ -612,13 +612,13 @@ void testWithSetOutputType() { assertEquals(expected, result); } - try (Scalar expected = Scalar.fromFloat(1.666667f); + try (Scalar expected = Scalar.fromFloat(1.6666666f); ColumnVector cv = ColumnVector.fromBytes(new byte[]{1, 2, 3, 4}); Scalar result = cv.variance(DType.FLOAT32)) { assertEquals(expected, result); } - try (Scalar expected = Scalar.fromFloat(1.2909945f); + try (Scalar expected = Scalar.fromFloat(1.2909944f); ColumnVector cv = ColumnVector.fromBytes(new byte[]{1, 2, 3, 4}); Scalar result = cv.standardDeviation(DType.FLOAT32)) { assertEquals(expected, result); diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 3b8dd05c13a..f6ab91f2f01 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -180,9 +180,12 @@ def var( min_count: int = 0, ddof=1, ): - return self._reduce( + result = self._reduce( "var", skipna=skipna, min_count=min_count, ddof=ddof ) + if result is NA: + return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) + return result def std( self, @@ -190,9 +193,12 @@ def std( min_count: int = 0, ddof=1, ): - return self._reduce( + result = self._reduce( "std", skipna=skipna, min_count=min_count, ddof=ddof ) + if result is NA: + return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) + return result def median(self, skipna: bool | None = None) -> NumericalBaseColumn: skipna = True if skipna is None else skipna diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index acd97c2047c..41ee94b72c8 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2943,7 +2943,7 @@ def corr(self, other, method="pearson", min_periods=None): >>> ser1 = cudf.Series([0.9, 0.13, 0.62]) >>> ser2 = cudf.Series([0.12, 0.26, 0.51]) >>> ser1.corr(ser2, method="pearson") - -0.20454263717316112 + -0.20454263717316126 >>> ser1.corr(ser2, method="spearman") -0.5 """ diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 4a2345fc009..976b12a9ab5 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2525,23 +2525,7 @@ def test_dti_asi8(): @pytest.mark.parametrize( "method, kwargs", - [ - ["mean", {}], - pytest.param( - "std", - {}, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/16444" - ), - ), - pytest.param( - "std", - {"ddof": 0}, - marks=pytest.mark.xfail( -
reason="https://github.com/rapidsai/cudf/issues/16444" - ), - ), - ], + [["mean", {}], ["std", {}], ["std", {"ddof": 0}]], ) def test_dti_reduction(method, kwargs): pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo") From cc23474129433d7700058c311be5154340a6bce3 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 8 Oct 2024 17:12:10 +0100 Subject: [PATCH 033/117] Turn on `xfail_strict = true` for all python packages (#16977) The cudf tests already treat tests that are expected to fail but pass as errors, but at the time we introduced that change, we didn't do the same for the other packages. Do that now, it turns out there are only a few xpassing tests. While here, it turns out that having multiple different pytest configuration files does not work. `pytest.ini` takes precedence over other options, and it's "first file wins". Consequently, the merge of #16851 turned off `xfail_strict = true` (and other options) for many of the subpackages. To fix this, migrate all pytest configuration into the appropriate section of the `pyproject.toml` files, so that all tool configuration lives in the same place. We also add a section in the developer guide to document this choice. - Closes #12391 - Closes #16974 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - James Lamb (https://github.com/jameslamb) - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16977 --- docs/cudf/source/developer_guide/testing.md | 17 ++++++++ python/cudf/cudf/tests/pytest.ini | 19 --------- python/cudf/cudf_pandas_tests/pytest.ini | 9 ++++ .../cudf_pandas_tests/test_cudf_pandas.py | 41 +++++++++++-------- python/cudf/pyproject.toml | 21 ++++++++++ python/cudf_kafka/cudf_kafka/tests/pytest.ini | 4 -- python/cudf_kafka/pyproject.toml | 3 ++ python/cudf_polars/pyproject.toml | 5 +++ python/cudf_polars/tests/pytest.ini | 4 -- python/custreamz/custreamz/tests/pytest.ini | 4 -- .../custreamz/tests/test_dataframes.py | 18 +++----- python/custreamz/pyproject.toml | 6 +++ python/dask_cudf/dask_cudf/tests/pytest.ini | 4 -- python/dask_cudf/pyproject.toml | 3 ++ python/pylibcudf/pylibcudf/tests/pytest.ini | 9 ---- python/pylibcudf/pyproject.toml | 10 +++++ 16 files changed, 103 insertions(+), 74 deletions(-) delete mode 100644 python/cudf/cudf/tests/pytest.ini create mode 100644 python/cudf/cudf_pandas_tests/pytest.ini delete mode 100644 python/cudf_kafka/cudf_kafka/tests/pytest.ini delete mode 100644 python/cudf_polars/tests/pytest.ini delete mode 100644 python/custreamz/custreamz/tests/pytest.ini delete mode 100644 python/dask_cudf/dask_cudf/tests/pytest.ini delete mode 100644 python/pylibcudf/pylibcudf/tests/pytest.ini diff --git a/docs/cudf/source/developer_guide/testing.md b/docs/cudf/source/developer_guide/testing.md index f12f809d5db..22cc1b5b8de 100644 --- a/docs/cudf/source/developer_guide/testing.md +++ b/docs/cudf/source/developer_guide/testing.md @@ -7,6 +7,23 @@ specifically the [`pytest-cov`](https://github.com/pytest-dev/pytest-cov) plugin Code coverage reports are uploaded to [Codecov](https://app.codecov.io/gh/rapidsai/cudf). Each PR also indicates whether it increases or decreases test coverage. +### Configuring pytest + +Pytest will accept configuration in [multiple different +files](https://docs.pytest.org/en/stable/reference/customize.html), +with a specified discovery and precedence order. 
Note in particular +that there is no automatic "include" mechanism, as soon as a matching +configuration file is found, discovery stops. + +For preference, so that all tool configuration lives in the same +place, we use `pyproject.toml`-based configuration. Test configuration +for a given package should live in that package's `pyproject.toml` +file. + +Where tests do not naturally belong to a project, for example the +`cudf.pandas` integration tests and the cuDF benchmarks, use a +`pytest.ini` file as close to the tests as possible. + ## Test organization How tests are organized depends on which of the following two groups they fall into: diff --git a/python/cudf/cudf/tests/pytest.ini b/python/cudf/cudf/tests/pytest.ini deleted file mode 100644 index 496a322ff80..00000000000 --- a/python/cudf/cudf/tests/pytest.ini +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. - -[pytest] -markers = - spilling: mark benchmark a good candidate to run with `CUDF_SPILL=ON` -xfail_strict = true -filterwarnings = - error - ignore:::.*xdist.* - ignore:::.*pytest.* - # some third-party dependencies (e.g. 'boto3') still using datetime.datetime.utcnow() - ignore:.*datetime.*utcnow.*scheduled for removal.*:DeprecationWarning:botocore - # Deprecation warning from Pyarrow Table.to_pandas() with pandas-2.2+ - ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning - # PerformanceWarning from cupy warming up the JIT cache - ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning - # Ignore numba PEP 456 warning specific to arm machines - ignore:FNV hashing is not implemented in Numba.*:UserWarning -addopts = --tb=native diff --git a/python/cudf/cudf_pandas_tests/pytest.ini b/python/cudf/cudf_pandas_tests/pytest.ini new file mode 100644 index 00000000000..46e2448ea24 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/pytest.ini @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +# Note, this config file overrides the default "cudf" test config in +# ../pyproject.toml We do so deliberately because we have different +# treatment of markers and warnings +[pytest] +addopts = --tb=native --strict-config --strict-markers +empty_parameter_set_mark = fail_at_collect +xfail_strict = true diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 2bbed40e34e..a74b7148c00 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import collections +import contextlib import copy import datetime import operator @@ -21,10 +22,15 @@ import pyarrow as pa import pytest from nbconvert.preprocessors import ExecutePreprocessor -from numba import NumbaDeprecationWarning, vectorize +from numba import ( + NumbaDeprecationWarning, + __version__ as numba_version, + vectorize, +) +from packaging import version from pytz import utc -from cudf.core._compat import PANDAS_GE_220 +from cudf.core._compat import PANDAS_GE_210, PANDAS_GE_220, PANDAS_VERSION from cudf.pandas import LOADED, Profiler from cudf.pandas.fast_slow_proxy import ( ProxyFallbackError, @@ -52,8 +58,6 @@ get_calendar, ) -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION - # Accelerated pandas has the real pandas and cudf modules as attributes pd = xpd._fsproxy_slow cudf = xpd._fsproxy_fast @@ -622,10 +626,6 @@ def test_array_function_series_fallback(series): tm.assert_equal(expect, got) -@pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) def test_timedeltaproperties(series): psr, sr = series psr, sr = psr.astype("timedelta64[ns]"), sr.astype("timedelta64[ns]") @@ -685,10 +685,6 @@ def test_maintain_container_subclasses(multiindex): assert isinstance(got, xpd.core.indexes.frozen.FrozenList) -@pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas due to unsupported boxcar window type", -) def test_rolling_win_type(): pdf = pd.DataFrame(range(5)) df = xpd.DataFrame(range(5)) @@ -697,8 +693,14 @@ def test_rolling_win_type(): tm.assert_equal(result, expected) -@pytest.mark.skip( - reason="Requires Numba 0.59 to fix segfaults on ARM. See https://github.com/numba/llvmlite/pull/1009" +@pytest.mark.skipif( + version.parse(numba_version) < version.parse("0.59"), + reason="Requires Numba 0.59 to fix segfaults on ARM. 
See https://github.com/numba/llvmlite/pull/1009", +) +@pytest.mark.xfail( + version.parse(numba_version) >= version.parse("0.59") + and PANDAS_VERSION < version.parse("2.1"), + reason="numba.generated_jit removed in 0.59, requires pandas >= 2.1", ) def test_rolling_apply_numba_engine(): def weighted_mean(x): @@ -709,7 +711,12 @@ def weighted_mean(x): pdf = pd.DataFrame([[1, 2, 0.6], [2, 3, 0.4], [3, 4, 0.2], [4, 5, 0.7]]) df = xpd.DataFrame([[1, 2, 0.6], [2, 3, 0.4], [3, 4, 0.2], [4, 5, 0.7]]) - with pytest.warns(NumbaDeprecationWarning): + ctx = ( + contextlib.nullcontext() + if PANDAS_GE_210 + else pytest.warns(NumbaDeprecationWarning) + ) + with ctx: expect = pdf.rolling(2, method="table", min_periods=0).apply( weighted_mean, raw=True, engine="numba" ) @@ -1305,7 +1312,7 @@ def max_times_two(self): @pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION < version.parse("2.1"), reason="DatetimeArray.__floordiv__ missing in pandas-2.0.0", ) def test_floordiv_array_vs_df(): @@ -1580,7 +1587,7 @@ def test_numpy_cupy_flatiter(series): @pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION < version.parse("2.1"), reason="pyarrow_numpy storage type was not supported in pandas-2.0.0", ) def test_arrow_string_arrays(): diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 1b730ffd13c..c0776fd0de6 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -124,6 +124,27 @@ skip = [ "__init__.py", ] +[tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" +filterwarnings = [ + "error", + "ignore:::.*xdist.*", + "ignore:::.*pytest.*", + # some third-party dependencies (e.g. 'boto3') still using datetime.datetime.utcnow() + "ignore:.*datetime.*utcnow.*scheduled for removal.*:DeprecationWarning:botocore", + # Deprecation warning from Pyarrow Table.to_pandas() with pandas-2.2+ + "ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning", + # PerformanceWarning from cupy warming up the JIT cache + "ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning", + # Ignore numba PEP 456 warning specific to arm machines + "ignore:FNV hashing is not implemented in Numba.*:UserWarning" +] +markers = [ + "spilling: mark benchmark a good candidate to run with `CUDF_SPILL=ON`" +] +xfail_strict = true + [tool.rapids-build-backend] build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" diff --git a/python/cudf_kafka/cudf_kafka/tests/pytest.ini b/python/cudf_kafka/cudf_kafka/tests/pytest.ini deleted file mode 100644 index 7b0a9f29fb1..00000000000 --- a/python/cudf_kafka/cudf_kafka/tests/pytest.ini +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. 
- -[pytest] -addopts = --tb=native diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index a1a3ec37842..87e19a2bccf 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -79,9 +79,12 @@ skip = [ ] [tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ "error" ] +xfail_strict = true [tool.scikit-build] build-dir = "build/{wheel_tag}" diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index f55031e0826..5345fad41a2 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -50,6 +50,11 @@ license-files = ["LICENSE"] version = {file = "cudf_polars/VERSION"} [tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" +filterwarnings = [ + "error" +] xfail_strict = true [tool.coverage.report] diff --git a/python/cudf_polars/tests/pytest.ini b/python/cudf_polars/tests/pytest.ini deleted file mode 100644 index 7b0a9f29fb1..00000000000 --- a/python/cudf_polars/tests/pytest.ini +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -[pytest] -addopts = --tb=native diff --git a/python/custreamz/custreamz/tests/pytest.ini b/python/custreamz/custreamz/tests/pytest.ini deleted file mode 100644 index 7b0a9f29fb1..00000000000 --- a/python/custreamz/custreamz/tests/pytest.ini +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -[pytest] -addopts = --tb=native diff --git a/python/custreamz/custreamz/tests/test_dataframes.py b/python/custreamz/custreamz/tests/test_dataframes.py index bae4b051cae..8c0130d2818 100644 --- a/python/custreamz/custreamz/tests/test_dataframes.py +++ b/python/custreamz/custreamz/tests/test_dataframes.py @@ -377,24 +377,16 @@ def test_setitem_overwrites(stream): [ ({}, "sum"), ({}, "mean"), - pytest.param({}, "min"), + ({}, "min"), pytest.param( {}, "median", marks=pytest.mark.xfail(reason="Unavailable for rolling objects"), ), - pytest.param({}, "max"), - pytest.param( - {}, - "var", - marks=pytest.mark.xfail(reason="Unavailable for rolling objects"), - ), - pytest.param({}, "count"), - pytest.param( - {"ddof": 0}, - "std", - marks=pytest.mark.xfail(reason="Unavailable for rolling objects"), - ), + ({}, "max"), + ({}, "var"), + ({}, "count"), + ({"ddof": 0}, "std"), pytest.param( {"quantile": 0.5}, "quantile", diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 85ab0024bb5..af45f49d9b4 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -111,6 +111,8 @@ skip = [ ] [tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ "error", "ignore:unclosed Date: Tue, 8 Oct 2024 11:25:42 -0500 Subject: [PATCH 034/117] Performance optimization of JSON validation (#16996) As part of JSON validation, field, value and string tokens are validated. Right now the code has single transform_inclusive_scan. Since this transform functor is a heavy operation, it slows down the entire scan drastically. This PR splits transform and scan in validation. The runtime of validation went from 200ms to 20ms. Also, a few hardcoded string comparisons are moved to trie. 
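
To make the restructuring concrete, here is a minimal, self-contained sketch of the before/after shape of the computation (illustrative only — `is_invalid` stands in for the real `validate_values`/`validate_strings` functors, and plain `int` for the token types; this is not the cudf code itself):

```cpp
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/scan.h>
#include <thrust/transform.h>

#include <cstdio>
#include <vector>

// Placeholder for the expensive per-token validation work.
struct is_invalid {
  __host__ __device__ bool operator()(int token) const { return token < 0; }
};

int main()
{
  std::vector<int> h_tokens{3, 7, -1, 2, 5};
  thrust::device_vector<int> tokens(h_tokens.begin(), h_tokens.end());

  // Pass 1: run the heavy predicate once per element, fully in parallel,
  // with no scan dependency chain behind it.
  thrust::device_vector<bool> invalid(tokens.size());
  thrust::transform(tokens.begin(), tokens.end(), invalid.begin(), is_invalid{});

  // Pass 2: the scan only propagates a cheap, precomputed flag.
  thrust::device_vector<bool> error_seen(tokens.size());
  thrust::inclusive_scan(invalid.begin(), invalid.end(), error_seen.begin(),
                         thrust::logical_or<bool>());

  for (std::size_t i = 0; i < tokens.size(); ++i) {
    std::printf("token %2d -> error_seen=%d\n",
                static_cast<int>(tokens[i]),
                static_cast<int>(error_seen[i]));
  }
  return 0;
}
```

In the actual change the final pass is still a `transform_inclusive_scan` over (token, line-end) pairs, but its transform only reads the precomputed `d_invalid` flags, which is what removes the bottleneck.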
Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/16996 --- cpp/src/io/json/process_tokens.cu | 88 ++++++++++++++++++------------- 1 file changed, 50 insertions(+), 38 deletions(-) diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 83c7b663980..d41d137a2c9 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -87,13 +88,25 @@ void validate_token_stream(device_span d_input, { CUDF_FUNC_RANGE(); if (!options.is_strict_validation()) { return; } + + rmm::device_uvector d_invalid = cudf::detail::make_zeroed_device_uvector_async( + tokens.size(), stream, cudf::get_current_device_resource_ref()); + using token_t = cudf::io::json::token_t; - cudf::detail::optional_trie trie_na = - cudf::detail::create_serialized_trie(options.get_na_values(), stream); - auto trie_na_view = cudf::detail::make_trie_view(trie_na); + auto literals = options.get_na_values(); + literals.emplace_back("null"); // added these too to single trie + literals.emplace_back("true"); + literals.emplace_back("false"); + + cudf::detail::optional_trie trie_literals = + cudf::detail::create_serialized_trie(literals, stream); + cudf::detail::optional_trie trie_nonnumeric = cudf::detail::create_serialized_trie( + {"NaN", "Infinity", "+INF", "+Infinity", "-INF", "-Infinity"}, stream); + auto validate_values = cuda::proclaim_return_type( [data = d_input.data(), - trie_na = trie_na_view, + trie_literals = cudf::detail::make_trie_view(trie_literals), + trie_nonnumeric = cudf::detail::make_trie_view(trie_nonnumeric), allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(), allow_nonnumeric = options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start, @@ -101,24 +114,15 @@ void validate_token_stream(device_span d_input, // This validates an unquoted value. 
A value must match https://www.json.org/json-en.html // but the leading and training whitespace should already have been removed, and is not // a string - auto c = data[start]; - auto is_null_literal = serialized_trie_contains(trie_na, {data + start, end - start}); - if (is_null_literal) { - return true; - } else if ('n' == c) { - return substr_eq(data, start, end, 4, "null"); - } else if ('t' == c) { - return substr_eq(data, start, end, 4, "true"); - } else if ('f' == c) { - return substr_eq(data, start, end, 5, "false"); - } else if (allow_nonnumeric && c == 'N') { - return substr_eq(data, start, end, 3, "NaN"); - } else if (allow_nonnumeric && c == 'I') { - return substr_eq(data, start, end, 8, "Infinity"); - } else if (allow_nonnumeric && c == '+') { - return substr_eq(data, start, end, 4, "+INF") || - substr_eq(data, start, end, 9, "+Infinity"); - } else if ('-' == c || c <= '9' && 'c' >= '0') { + auto const is_literal = serialized_trie_contains(trie_literals, {data + start, end - start}); + if (is_literal) { return true; } + if (allow_nonnumeric) { + auto const is_nonnumeric = + serialized_trie_contains(trie_nonnumeric, {data + start, end - start}); + if (is_nonnumeric) { return true; } + } + auto c = data[start]; + if ('-' == c || c <= '9' && 'c' >= '0') { // number auto num_state = number_state::START; for (auto at = start; at < end; at++) { @@ -140,9 +144,6 @@ void validate_token_stream(device_span d_input, num_state = number_state::LEADING_ZERO; } else if (c >= '1' && c <= '9') { num_state = number_state::WHOLE; - } else if (allow_nonnumeric && 'I' == c) { - return substr_eq(data, start, end, 4, "-INF") || - substr_eq(data, start, end, 9, "-Infinity"); } else { return false; } @@ -273,33 +274,44 @@ void validate_token_stream(device_span d_input, auto num_tokens = tokens.size(); auto count_it = thrust::make_counting_iterator(0); - auto predicate = [tokens = tokens.begin(), - token_indices = token_indices.begin(), - validate_values, - validate_strings] __device__(auto i) -> bool { + auto predicate = cuda::proclaim_return_type([tokens = tokens.begin(), + token_indices = token_indices.begin(), + validate_values, + validate_strings] __device__(auto i) -> bool { if (tokens[i] == token_t::ValueEnd) { return !validate_values(token_indices[i - 1], token_indices[i]); } else if (tokens[i] == token_t::FieldNameEnd || tokens[i] == token_t::StringEnd) { return !validate_strings(token_indices[i - 1], token_indices[i]); } return false; - }; + }); + + auto conditional_invalidout_it = + cudf::detail::make_tabulate_output_iterator(cuda::proclaim_return_type( + [d_invalid = d_invalid.begin()] __device__(size_type i, bool x) -> void { + if (x) { d_invalid[i] = true; } + })); + thrust::transform(rmm::exec_policy_nosync(stream), + count_it, + count_it + num_tokens, + conditional_invalidout_it, + predicate); using scan_type = write_if::scan_type; auto conditional_write = write_if{tokens.begin(), num_tokens}; auto conditional_output_it = cudf::detail::make_tabulate_output_iterator(conditional_write); - auto transform_op = cuda::proclaim_return_type( - [predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type { - if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd}; - return {static_cast(tokens[i]), tokens[i] == token_t::LineEnd}; - }); - auto binary_op = cuda::proclaim_return_type( + auto binary_op = cuda::proclaim_return_type( [] __device__(scan_type prev, scan_type curr) -> scan_type { auto op_result = (prev.first == token_t::ErrorBegin ? 
prev.first : curr.first);
-      return scan_type((curr.second ? curr.first : op_result), prev.second | curr.second);
+      return {(curr.second ? curr.first : op_result), prev.second | curr.second};
+    });
+  auto transform_op = cuda::proclaim_return_type(
+    [d_invalid = d_invalid.begin(), tokens = tokens.begin()] __device__(auto i) -> scan_type {
+      if (d_invalid[i]) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd};
+      return {static_cast(tokens[i]), tokens[i] == token_t::LineEnd};
     });
-  thrust::transform_inclusive_scan(rmm::exec_policy(stream),
+  thrust::transform_inclusive_scan(rmm::exec_policy_nosync(stream),
                                    count_it,
                                    count_it + num_tokens,
                                    conditional_output_it,

From 618a93fc99ccb916177cb03429c69c8bbd5639b3 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Tue, 8 Oct 2024 17:24:45 -0400
Subject: [PATCH 035/117] Migrate nvtext jaccard API to pylibcudf (#17007)

A part of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/17007
---
 .../api_docs/pylibcudf/nvtext/index.rst       |  1 +
 .../api_docs/pylibcudf/nvtext/jaccard.rst     |  6 +++
 python/cudf/cudf/_lib/nvtext/jaccard.pyx      | 33 ++++---------
 .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt |  2 +-
 .../pylibcudf/pylibcudf/nvtext/__init__.pxd   |  3 +-
 python/pylibcudf/pylibcudf/nvtext/__init__.py |  3 +-
 python/pylibcudf/pylibcudf/nvtext/jaccard.pxd |  7 +++
 python/pylibcudf/pylibcudf/nvtext/jaccard.pyx | 47 +++++++++++++++++++
 .../pylibcudf/tests/test_nvtext_jaccard.py    | 37 +++++++++++++++
 9 files changed, 111 insertions(+), 28 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/jaccard.pxd
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/jaccard.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
index 2e03b589c8b..6300f77d686 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
@@ -6,3 +6,4 @@ nvtext

    edit_distance
    generate_ngrams
+   jaccard
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst
new file mode 100644
index 00000000000..ea59657c25e
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst
@@ -0,0 +1,6 @@
+=======
+jaccard
+=======
+
+..
automodule:: pylibcudf.nvtext.jaccard + :members: diff --git a/python/cudf/cudf/_lib/nvtext/jaccard.pyx b/python/cudf/cudf/_lib/nvtext/jaccard.pyx index 0ebf7c281e3..c964d0206b7 100644 --- a/python/cudf/cudf/_lib/nvtext/jaccard.pyx +++ b/python/cudf/cudf/_lib/nvtext/jaccard.pyx @@ -2,33 +2,16 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.jaccard cimport ( - jaccard_index as cpp_jaccard_index, -) -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column +from pylibcudf import nvtext + @acquire_spill_lock() def jaccard_index(Column input1, Column input2, int width): - cdef column_view c_input1 = input1.view() - cdef column_view c_input2 = input2.view() - cdef size_type c_width = width - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_jaccard_index( - c_input1, - c_input2, - c_width - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.jaccard.jaccard_index( + input1.to_pylibcudf(mode="read"), + input2.to_pylibcudf(mode="read"), + width, + ) + return Column.from_pylibcudf(result) diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index eb5617a1da6..9913e1fbadb 100644 --- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources edit_distance.pyx generate_ngrams.pyx) +set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd index 7f5fa2b9925..5f1762b1e3d 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -1,8 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport edit_distance, generate_ngrams +from . cimport edit_distance, generate_ngrams, jaccard __all__ = [ "edit_distance", "generate_ngrams", + "jaccard", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index a66ce984745..1c0ddb1e5a4 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -1,8 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import edit_distance, generate_ngrams +from . import edit_distance, generate_ngrams, jaccard __all__ = [ "edit_distance", "generate_ngrams", + "jaccard", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd b/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd new file mode 100644 index 00000000000..a4d4a15335b --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column jaccard_index(Column input1, Column input2, size_type width) diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx new file mode 100644 index 00000000000..9334d7ce751 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx @@ -0,0 +1,47 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.jaccard cimport ( + jaccard_index as cpp_jaccard_index, +) +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column jaccard_index(Column input1, Column input2, size_type width): + """ + Returns the Jaccard similarity between individual rows in two strings columns. + + For details, see :cpp:func:`jaccard_index` + + Parameters + ---------- + input1 : Column + Input strings column + input2 : Column + Input strings column + width : size_type + The ngram number to generate + + Returns + ------- + Column + Index calculation values + """ + cdef column_view c_input1 = input1.view() + cdef column_view c_input2 = input2.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_jaccard_index( + c_input1, + c_input2, + width + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py new file mode 100644 index 00000000000..d5a168426b1 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py @@ -0,0 +1,37 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture(scope="module") +def input_data(): + input1 = ["the fuzzy dog", "little piggy", "funny bunny", "chatty parrot"] + input2 = ["the fuzzy cat", "bitty piggy", "funny bunny", "silent partner"] + return pa.array(input1), pa.array(input2) + + +@pytest.mark.parametrize("width", [2, 3]) +def test_jaccard_index(input_data, width): + def get_tokens(s, width): + return [s[i : i + width] for i in range(len(s) - width + 1)] + + def jaccard_index(s1, s2, width): + x = set(get_tokens(s1, width)) + y = set(get_tokens(s2, width)) + return len(x & y) / len(x | y) + + input1, input2 = input_data + result = plc.nvtext.jaccard.jaccard_index( + plc.interop.from_arrow(input1), plc.interop.from_arrow(input2), width + ) + expected = pa.array( + [ + jaccard_index(s1.as_py(), s2.as_py(), width) + for s1, s2 in zip(input1, input2) + ], + type=pa.float32(), + ) + assert_column_eq(result, expected) From 349ba5d37789938a34c1ad75eb5eb57f1db85b2c Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 8 Oct 2024 17:06:10 -0500 Subject: [PATCH 036/117] make conda installs in CI stricter (#17013) Contributes to https://github.com/rapidsai/build-planning/issues/106 Proposes specifying the RAPIDS version in `conda install` calls in CI that install CI artifacts, to reduce the risk of CI jobs picking up artifacts from other releases. 
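
Reduced to its essentials, the pattern applied to each CI script is the following (illustrative excerpt; the real edits are in the diff below):

```sh
# Before: unpinned, so the solver may resolve an artifact from another release
rapids-mamba-retry install --channel "${CPP_CHANNEL}" libcudf

# After: resolve the major.minor version under test and pin every artifact to it
RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
rapids-mamba-retry install --channel "${CPP_CHANNEL}" \
  "libcudf=${RAPIDS_VERSION_MAJOR_MINOR}"
```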
Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17013 --- ci/build_docs.sh | 8 +++++--- ci/test_cpp_common.sh | 7 ++++++- ci/test_java.sh | 4 +++- ci/test_notebooks.sh | 5 ++++- ci/test_python_common.sh | 5 ++++- ci/test_python_other.sh | 6 +++++- 6 files changed, 27 insertions(+), 8 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index c67d127e635..dae6ac46757 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -3,8 +3,7 @@ set -euo pipefail -export RAPIDS_VERSION="$(rapids-version)" -export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" +RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" export RAPIDS_VERSION_NUMBER="$RAPIDS_VERSION_MAJOR_MINOR" rapids-logger "Create test conda environment" @@ -29,7 +28,10 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - libcudf pylibcudf cudf dask-cudf + "libcudf=${RAPIDS_VERSION_MAJOR_MINOR}" \ + "pylibcudf=${RAPIDS_VERSION_MAJOR_MINOR}" \ + "cudf=${RAPIDS_VERSION_MAJOR_MINOR}" \ + "dask-cudf=${RAPIDS_VERSION_MAJOR_MINOR}" export RAPIDS_DOCS_DIR="$(mktemp -d)" diff --git a/ci/test_cpp_common.sh b/ci/test_cpp_common.sh index f5a8de543f6..e8f6e9388f4 100755 --- a/ci/test_cpp_common.sh +++ b/ci/test_cpp_common.sh @@ -5,6 +5,8 @@ set -euo pipefail . /opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" + rapids-logger "Generate C++ testing dependencies" ENV_YAML_DIR="$(mktemp -d)" @@ -31,7 +33,10 @@ rapids-print-env rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ - libcudf libcudf_kafka libcudf-tests libcudf-example + "libcudf=${RAPIDS_VERSION_MAJOR_MINOR}" \ + "libcudf_kafka=${RAPIDS_VERSION_MAJOR_MINOR}" \ + "libcudf-tests=${RAPIDS_VERSION_MAJOR_MINOR}" \ + "libcudf-example=${RAPIDS_VERSION_MAJOR_MINOR}" rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_java.sh b/ci/test_java.sh index 629ad11014a..9b7b2e48dd6 100755 --- a/ci/test_java.sh +++ b/ci/test_java.sh @@ -5,6 +5,8 @@ set -euo pipefail . /opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" + rapids-logger "Generate Java testing dependencies" ENV_YAML_DIR="$(mktemp -d)" @@ -30,7 +32,7 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ - libcudf + "libcudf=${RAPIDS_VERSION_MAJOR_MINOR}" rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_notebooks.sh b/ci/test_notebooks.sh index da9478ce25d..3e0712a0691 100755 --- a/ci/test_notebooks.sh +++ b/ci/test_notebooks.sh @@ -5,6 +5,8 @@ set -euo pipefail . /opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" + rapids-logger "Generate notebook testing dependencies" ENV_YAML_DIR="$(mktemp -d)" @@ -30,7 +32,8 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - cudf libcudf + "cudf=${RAPIDS_VERSION_MAJOR_MINOR}" \ + "libcudf=${RAPIDS_VERSION_MAJOR_MINOR}" NBTEST="$(realpath "$(dirname "$0")/utils/nbtest.sh")" pushd notebooks diff --git a/ci/test_python_common.sh b/ci/test_python_common.sh index dc70661a17a..81e82908eb4 100755 --- a/ci/test_python_common.sh +++ b/ci/test_python_common.sh @@ -7,6 +7,8 @@ set -euo pipefail . 
/opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" + rapids-logger "Generate Python testing dependencies" ENV_YAML_DIR="$(mktemp -d)" @@ -38,4 +40,5 @@ rapids-print-env rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - cudf libcudf + "cudf=${RAPIDS_VERSION_MAJOR_MINOR}" \ + "libcudf=${RAPIDS_VERSION_MAJOR_MINOR}" diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index 67c97ad29a5..eee1d54083f 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -7,10 +7,14 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ # Common setup steps shared by Python test jobs source ./ci/test_python_common.sh test_python_other +RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" + rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - dask-cudf cudf_kafka custreamz + "dask-cudf=${RAPIDS_VERSION_MAJOR_MINOR}" \ + "cudf_kafka=${RAPIDS_VERSION_MAJOR_MINOR}" \ + "custreamz=${RAPIDS_VERSION_MAJOR_MINOR}" rapids-logger "Check GPU usage" nvidia-smi From 5b931aca22a06734332963577a91e6af90bb6a68 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 8 Oct 2024 14:35:13 -1000 Subject: [PATCH 037/117] Add string.convert.convert_urls APIs to pylibcudf (#17003) Contributes to https://github.com/rapidsai/cudf/issues/15162 Also I believe the cpp docstrings were incorrect, but could use a second look. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - https://github.com/brandon-b-miller - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17003 --- .../cudf/strings/convert/convert_urls.hpp | 4 +- .../_lib/strings/convert/convert_urls.pyx | 36 +++-------- .../libcudf/strings/convert/convert_urls.pxd | 4 +- .../pylibcudf/strings/convert/CMakeLists.txt | 2 +- .../pylibcudf/strings/convert/__init__.pxd | 1 + .../pylibcudf/strings/convert/__init__.py | 1 + .../strings/convert/convert_urls.pxd | 8 +++ .../strings/convert/convert_urls.pyx | 63 +++++++++++++++++++ .../tests/test_string_convert_urls.py | 36 +++++++++++ 9 files changed, 121 insertions(+), 34 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py diff --git a/cpp/include/cudf/strings/convert/convert_urls.hpp b/cpp/include/cudf/strings/convert/convert_urls.hpp index d6e87f9d543..febc63d8779 100644 --- a/cpp/include/cudf/strings/convert/convert_urls.hpp +++ b/cpp/include/cudf/strings/convert/convert_urls.hpp @@ -28,7 +28,7 @@ namespace strings { */ /** - * @brief Decodes each string using URL encoding. + * @brief Encodes each string using URL encoding. * * Converts mostly non-ascii characters and control characters into UTF-8 hex code-points * prefixed with '%'. For example, the space character must be converted to characters '%20' where @@ -49,7 +49,7 @@ std::unique_ptr url_encode( rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** - * @brief Encodes each string using URL encoding. + * @brief Decodes each string using URL encoding. * * Converts all character sequences starting with '%' into character code-points * interpreting the 2 following characters as hex values to create the code-point. 
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx index e52116d6247..d5c2f771970 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx @@ -1,17 +1,9 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +import pylibcudf as plc from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_urls cimport ( - url_decode as cpp_url_decode, - url_encode as cpp_url_encode, -) - from cudf._lib.column cimport Column @@ -28,17 +20,10 @@ def url_decode(Column source_strings): ------- URL decoded string column """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_url_decode( - source_view - )) - - return Column.from_unique_ptr( - move(c_result) + plc_column = plc.strings.convert.convert_urls.url_decode( + source_strings.to_pylibcudf(mode="read") ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -57,14 +42,7 @@ def url_encode(Column source_strings): ------- URL encoded string column """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_url_encode( - source_view - )) - - return Column.from_unique_ptr( - move(c_result) + plc_column = plc.strings.convert.convert_urls.url_encode( + source_strings.to_pylibcudf(mode="read") ) + return Column.from_pylibcudf(plc_column) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd index 5c07b698454..cb319ad143b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd @@ -8,7 +8,7 @@ from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "cudf/strings/convert/convert_urls.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] url_encode( - column_view input_col) except + + column_view input) except + cdef unique_ptr[column] url_decode( - column_view input_col) except + + column_view input) except + diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index eb0d6ee6999..41aeb72039b 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx - convert_fixed_point.pyx convert_ipv4.pyx + convert_fixed_point.pyx convert_ipv4.pyx convert_urls.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index 431beed8e5d..b4b0b521e39 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -5,4 +5,5 @@ from . 
cimport ( convert_durations, convert_fixed_point, convert_ipv4, + convert_urls, ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index a601b562c2e..409620fce45 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -5,4 +5,5 @@ convert_durations, convert_fixed_point, convert_ipv4, + convert_urls, ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd new file mode 100644 index 00000000000..da05ce93426 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column url_encode(Column Input) + +cpdef Column url_decode(Column Input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx new file mode 100644 index 00000000000..a5e080e53b7 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx @@ -0,0 +1,63 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport convert_urls as cpp_convert_urls + + +cpdef Column url_encode(Column input): + """ + Encodes each string using URL encoding. + + For details, see :cpp:func:`cudf::strings::url_encode` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_urls.url_encode( + input.view() + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column url_decode(Column input): + """ + Decodes each string using URL encoding. + + For details, see :cpp:func:`cudf::strings::url_decode` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_urls.url_decode( + input.view() + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py new file mode 100644 index 00000000000..fee8c3fb8f6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py @@ -0,0 +1,36 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import urllib + +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_url_encode(): + data = ["/home/nfs", None] + arr = pa.array(data) + result = plc.strings.convert.convert_urls.url_encode( + plc.interop.from_arrow(arr) + ) + expected = pa.array( + [ + urllib.parse.quote(url, safe="") if isinstance(url, str) else url + for url in data + ] + ) + assert_column_eq(result, expected) + + +def test_url_decode(): + data = ["%2Fhome%2fnfs", None] + arr = pa.array(data) + result = plc.strings.convert.convert_urls.url_decode( + plc.interop.from_arrow(arr) + ) + expected = pa.array( + [ + urllib.parse.unquote(url) if isinstance(url, str) else url + for url in data + ] + ) + assert_column_eq(result, expected) From ded4dd2acbf2c5933765853eab56f4d37599c909 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 8 Oct 2024 18:02:14 -0700 Subject: [PATCH 038/117] Add pinning for pyarrow in wheels (#17018) We have recently observed a number of seg faults in our Python tests. From some investigation, the error comes from the import of pyarrow loading the bundled libarrow.so, and in particular when that library runs a jemalloc function `background_thread_entry`. We have observed similar (but not identical) errors in the past that have to do with as-yet unsolved problems in the way that arrow handles multi-threaded environments. The error is currently only observed on arm runners and with pyarrow 17.0.0. In my tests the error is highly sensitive to everything from import order to unrelated code segments, suggesting a race condition, some form of memory corruption, or perhaps symbol resolution errors at runtime. As a result, I have had limited success in drilling down further into specific causes, especially since attempts to rebuild libarrow.so also squash the error and I therefore cannot use debug symbols. From some offline discussion we decided that avoiding the problematic version is a sufficient fix for now. Due to the sensitivity, I am simply skipping 17.0.0 in this PR. I suspect that future builds of pyarrow will also usually not exhibit this bug (although it may recur occasionally on specific versions of pyarrow). Therefore, rather than lowering the upper bound I would prefer to allow us to float and see if and when this problem reappears. Since our DFG+RBB combination for wheel builds does not yet support any matrix entry other than `cuda`, I'm using environment markers to specify the constraint rather than a matrix entry in dependencies.yaml. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17018 --- dependencies.yaml | 11 ++++++++++- python/cudf/pyproject.toml | 3 ++- python/cudf_polars/tests/expressions/test_agg.py | 2 +- python/pylibcudf/pyproject.toml | 3 ++- 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index 3561b22965d..ca17917c905 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -421,9 +421,18 @@ dependencies: - cython>=3.0.3 pyarrow_run: common: - - output_types: [conda, requirements, pyproject] + - output_types: [conda] packages: - pyarrow>=14.0.0,<18.0.0a0 + - output_types: [requirements, pyproject] + packages: + # pyarrow 17.0.0 wheels have a subtle issue around threading that + # can cause segmentation faults around imports on arm. 
It appears to + # be highly dependent on the exact build configuration, so we'll just + # avoid 17.0.0 for now unless we observe similar issues in future + # releases as well. + - pyarrow>=14.0.0,<18.0.0a0; platform_machine=='x86_64' + - pyarrow>=14.0.0,<18.0.0a0,!=17.0.0; platform_machine=='aarch64' cuda_version: specific: - output_types: conda diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index c0776fd0de6..feab04ffadc 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -30,7 +30,8 @@ dependencies = [ "packaging", "pandas>=2.0,<2.2.4dev0", "ptxcompiler", - "pyarrow>=14.0.0,<18.0.0a0", + "pyarrow>=14.0.0,<18.0.0a0,!=17.0.0; platform_machine=='aarch64'", + "pyarrow>=14.0.0,<18.0.0a0; platform_machine=='x86_64'", "pylibcudf==24.12.*,>=0.0.0a0", "rich", "rmm==24.12.*,>=0.0.0a0", diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 56055f4c6c2..3001a61101a 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -93,7 +93,7 @@ def test_bool_agg(agg, request): expr = getattr(pl.col("a"), agg)() q = df.select(expr) - assert_gpu_result_equal(q) + assert_gpu_result_equal(q, check_exact=False) @pytest.mark.parametrize("cum_agg", expr.UnaryFunction._supported_cum_aggs) diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index be65142850f..c9a685de3e9 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -22,7 +22,8 @@ dependencies = [ "libcudf==24.12.*,>=0.0.0a0", "nvtx>=0.2.1", "packaging", - "pyarrow>=14.0.0,<18.0.0a0", + "pyarrow>=14.0.0,<18.0.0a0,!=17.0.0; platform_machine=='aarch64'", + "pyarrow>=14.0.0,<18.0.0a0; platform_machine=='x86_64'", "rmm==24.12.*,>=0.0.0a0", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From a6853f4b3832b5338a4d0cd9d0b93c7bcd1ce884 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav <43375352+srinivasyadav18@users.noreply.github.com> Date: Tue, 8 Oct 2024 23:03:03 -0500 Subject: [PATCH 039/117] Refactor `histogram` reduction using `cuco::static_set::insert_and_find` (#16485) Refactors `histogram` reduce and groupby aggregations using `cuco::static_set::insert_and_find`. Speed improvement results [here](https://github.com/rapidsai/cudf/pull/16485#issuecomment-2394855796) and [here](https://github.com/rapidsai/cudf/pull/16485#issuecomment-2394865692). 
Authors: - Srinivas Yadav (https://github.com/srinivasyadav18) - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16485 --- cpp/benchmarks/CMakeLists.txt | 10 +- cpp/benchmarks/groupby/group_histogram.cpp | 90 ++++++++++ cpp/benchmarks/reduction/histogram.cpp | 68 +++++++ .../cudf/detail/hash_reduce_by_row.cuh | 169 ------------------ cpp/src/reductions/histogram.cu | 164 +++++++---------- 5 files changed, 231 insertions(+), 270 deletions(-) create mode 100644 cpp/benchmarks/groupby/group_histogram.cpp create mode 100644 cpp/benchmarks/reduction/histogram.cpp delete mode 100644 cpp/include/cudf/detail/hash_reduce_by_row.cuh diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index b8a53cd8bd9..b0f75b25975 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -245,6 +245,7 @@ ConfigureNVBench( REDUCTION_NVBENCH reduction/anyall.cpp reduction/dictionary.cpp + reduction/histogram.cpp reduction/minmax.cpp reduction/rank.cpp reduction/reduce.cpp @@ -270,8 +271,13 @@ ConfigureBench( ) ConfigureNVBench( - GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_max_multithreaded.cpp - groupby/group_nunique.cpp groupby/group_rank.cpp groupby/group_struct_keys.cpp + GROUPBY_NVBENCH + groupby/group_histogram.cpp + groupby/group_max.cpp + groupby/group_max_multithreaded.cpp + groupby/group_nunique.cpp + groupby/group_rank.cpp + groupby/group_struct_keys.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/groupby/group_histogram.cpp b/cpp/benchmarks/groupby/group_histogram.cpp new file mode 100644 index 00000000000..cd7f9f298af --- /dev/null +++ b/cpp/benchmarks/groupby/group_histogram.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include + +#include + +template +void groupby_histogram_helper(nvbench::state& state, + cudf::size_type num_rows, + cudf::size_type cardinality, + double null_probability) +{ + auto const keys = [&] { + data_profile const profile = + data_profile_builder() + .cardinality(cardinality) + .no_validity() + .distribution(cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + return create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + }(); + + auto const values = [&] { + auto builder = data_profile_builder().cardinality(0).distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + if (null_probability > 0) { + builder.null_probability(null_probability); + } else { + builder.no_validity(); + } + return create_random_column( + cudf::type_to_id(), row_count{num_rows}, data_profile{builder}); + }(); + + // Vector of 1 request + std::vector requests(1); + requests.back().values = values->view(); + requests.back().aggregations.push_back( + cudf::make_histogram_aggregation()); + + auto const mem_stats_logger = cudf::memory_stats_logger(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys->view()})); + auto const result = gb_obj.aggregate(requests); + }); + + auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(num_rows) / elapsed_time, "rows/s"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); +} + +template +void bench_groupby_histogram(nvbench::state& state, nvbench::type_list) +{ + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const null_probability = state.get_float64("null_probability"); + + if (cardinality > num_rows) { + state.skip("cardinality > num_rows"); + return; + } + + groupby_histogram_helper(state, num_rows, cardinality, null_probability); +} + +NVBENCH_BENCH_TYPES(bench_groupby_histogram, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("groupby_histogram") + .add_float64_axis("null_probability", {0, 0.1, 0.9}) + .add_int64_axis("cardinality", {100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000}) + .add_int64_axis("num_rows", {100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000}); diff --git a/cpp/benchmarks/reduction/histogram.cpp b/cpp/benchmarks/reduction/histogram.cpp new file mode 100644 index 00000000000..d0925de5c87 --- /dev/null +++ b/cpp/benchmarks/reduction/histogram.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cudf/aggregation.hpp" +#include "cudf/detail/aggregation/aggregation.hpp" + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +template +static void nvbench_reduction_histogram(nvbench::state& state, nvbench::type_list) +{ + auto const dtype = cudf::type_to_id(); + + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const null_probability = state.get_float64("null_probability"); + + if (cardinality > num_rows) { + state.skip("cardinality > num_rows"); + return; + } + + data_profile const profile = data_profile_builder() + .null_probability(null_probability) + .cardinality(cardinality) + .distribution(dtype, distribution_id::UNIFORM, 0, num_rows); + + auto const input = create_random_column(dtype, row_count{num_rows}, profile); + auto agg = cudf::make_histogram_aggregation(); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + auto result = cudf::reduce(*input, *agg, input->type(), stream_view); + }); + + state.add_element_count(input->size()); +} + +using data_type = nvbench::type_list; + +NVBENCH_BENCH_TYPES(nvbench_reduction_histogram, NVBENCH_TYPE_AXES(data_type)) + .set_name("histogram") + .add_float64_axis("null_probability", {0.1}) + .add_int64_axis("cardinality", + {0, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000, 50'000'000}) + .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}); diff --git a/cpp/include/cudf/detail/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh deleted file mode 100644 index 7de79b31bc7..00000000000 --- a/cpp/include/cudf/detail/hash_reduce_by_row.cuh +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -namespace cudf::detail { - -using hash_map_type = cuco::legacy:: - static_map>; - -/** - * @brief The base struct for customized reduction functor to perform reduce-by-key with keys are - * rows that compared equal. - * - * TODO: We need to switch to use `static_reduction_map` when it is ready - * (https://github.com/NVIDIA/cuCollections/pull/98). - */ -template -struct reduce_by_row_fn_base { - protected: - MapView const d_map; - KeyHasher const d_hasher; - KeyEqual const d_equal; - OutputType* const d_output; - - reduce_by_row_fn_base(MapView const& d_map, - KeyHasher const& d_hasher, - KeyEqual const& d_equal, - OutputType* const d_output) - : d_map{d_map}, d_hasher{d_hasher}, d_equal{d_equal}, d_output{d_output} - { - } - - /** - * @brief Return a pointer to the output array at the given index. 
- * - * @param idx The access index - * @return A pointer to the given index in the output array - */ - __device__ OutputType* get_output_ptr(size_type const idx) const - { - auto const iter = d_map.find(idx, d_hasher, d_equal); - - if (iter != d_map.end()) { - // Only one (undetermined) index value of the duplicate rows could be inserted into the map. - // As such, looking up for all indices of duplicate rows always returns the same value. - auto const inserted_idx = iter->second.load(cuda::std::memory_order_relaxed); - - // All duplicate rows will have concurrent access to this same output slot. - return &d_output[inserted_idx]; - } else { - // All input `idx` values have been inserted into the map before. - // Thus, searching for an `idx` key resulting in the `end()` iterator only happens if - // `d_equal(idx, idx) == false`. - // Such situations are due to comparing nulls or NaNs which are considered as always unequal. - // In those cases, all rows containing nulls or NaNs are distinct. Just return their direct - // output slot. - return &d_output[idx]; - } - } -}; - -/** - * @brief Perform a reduction on groups of rows that are compared equal. - * - * This is essentially a reduce-by-key operation with keys are non-contiguous rows and are compared - * equal. A hash table is used to find groups of equal rows. - * - * At the beginning of the operation, the entire output array is filled with a value given by - * the `init` parameter. Then, the reduction result for each row group is written into the output - * array at the index of an unspecified row in the group. - * - * @tparam ReduceFuncBuilder The builder class that must have a `build()` method returning a - * reduction functor derived from `reduce_by_row_fn_base` - * @tparam OutputType Type of the reduction results - * @param map The auxiliary map to perform reduction - * @param preprocessed_input The preprocessed of the input rows for computing row hashing and row - * comparisons - * @param num_rows The number of all input rows - * @param has_nulls Indicate whether the input rows has any nulls at any nested levels - * @param has_nested_columns Indicates whether the input table has any nested columns - * @param nulls_equal Flag to specify whether null elements should be considered as equal - * @param nans_equal Flag to specify whether NaN values in floating point column should be - * considered equal. 
- * @param init The initial value for reduction of each row group - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned vector - * @return A device_uvector containing the reduction results - */ -template -rmm::device_uvector hash_reduce_by_row( - hash_map_type const& map, - std::shared_ptr const preprocessed_input, - size_type num_rows, - cudf::nullate::DYNAMIC has_nulls, - bool has_nested_columns, - null_equality nulls_equal, - nan_equality nans_equal, - ReduceFuncBuilder func_builder, - OutputType init, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto const map_dview = map.get_device_view(); - auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input); - auto const key_hasher = row_hasher.device_hasher(has_nulls); - auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); - - auto reduction_results = rmm::device_uvector(num_rows, stream, mr); - thrust::uninitialized_fill( - rmm::exec_policy(stream), reduction_results.begin(), reduction_results.end(), init); - - auto const reduce_by_row = [&](auto const value_comp) { - if (has_nested_columns) { - auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin())); - } else { - auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin())); - } - }; - - if (nans_equal == nan_equality::ALL_EQUAL) { - using nan_equal_comparator = - cudf::experimental::row::equality::nan_equal_physical_equality_comparator; - reduce_by_row(nan_equal_comparator{}); - } else { - using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator; - reduce_by_row(nan_unequal_comparator{}); - } - - return reduction_results; -} - -} // namespace cudf::detail diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu index 362b5f74c46..b40b2b6dd2e 100644 --- a/cpp/src/reductions/histogram.cu +++ b/cpp/src/reductions/histogram.cu @@ -15,18 +15,24 @@ */ #include +#include #include -#include #include #include #include +#include #include +#include + +#include +#include #include #include #include #include #include +#include #include @@ -34,61 +40,12 @@ namespace cudf::reduction::detail { namespace { +// A CUDA Cooperative Group of 1 thread for the hash set for histogram +auto constexpr DEFAULT_HISTOGRAM_CG_SIZE = 1; + // Always use 64-bit signed integer for storing count. using histogram_count_type = int64_t; -/** - * @brief The functor to accumulate the frequency of each distinct rows in the input table. - */ -template -struct reduce_fn : cudf::detail::reduce_by_row_fn_base { - CountType const* d_partial_output; - - reduce_fn(MapView const& d_map, - KeyHasher const& d_hasher, - KeyEqual const& d_equal, - CountType* const d_output, - CountType const* const d_partial_output) - : cudf::detail::reduce_by_row_fn_base{d_map, - d_hasher, - d_equal, - d_output}, - d_partial_output{d_partial_output} - { - } - - // Count the number of rows in each group of rows that are compared equal. 
-  __device__ void operator()(size_type const idx) const
-  {
-    auto const increment = d_partial_output ? d_partial_output[idx] : CountType{1};
-    auto const count =
-      cuda::atomic_ref<CountType, cuda::thread_scope_device>(*this->get_output_ptr(idx));
-    count.fetch_add(increment, cuda::std::memory_order_relaxed);
-  }
-};
-
-/**
- * @brief The builder to construct an instance of the `reduce_fn` functor.
- */
-template <typename CountType>
-struct reduce_func_builder {
-  CountType const* const d_partial_output;
-
-  reduce_func_builder(CountType const* const d_partial_output) : d_partial_output{d_partial_output}
-  {
-  }
-
-  template <typename MapView, typename KeyHasher, typename KeyEqual>
-  auto build(MapView const& d_map,
-             KeyHasher const& d_hasher,
-             KeyEqual const& d_equal,
-             CountType* const d_output)
-  {
-    return reduce_fn<MapView, KeyHasher, KeyEqual, CountType>{
-      d_map, d_hasher, d_equal, d_output, d_partial_output};
-  }
-};
-
 /**
  * @brief Specialized functor to check for not-zero of the second component of the input.
  */
@@ -163,14 +120,6 @@ compute_row_frequencies(table_view const& input,
                "Nested types are not yet supported in histogram aggregation.",
                std::invalid_argument);

-  auto map = cudf::detail::hash_map_type{
-    compute_hash_table_size(input.num_rows()),
-    cuco::empty_key{-1},
-    cuco::empty_value{std::numeric_limits<size_type>::min()},
-
-    cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator<char>{}, stream},
-    stream.value()};
-
   auto const preprocessed_input =
     cudf::experimental::row::hash::preprocessed_table::create(input, stream);
   auto const has_nulls = nullate::DYNAMIC{cudf::has_nested_nulls(input)};
@@ -179,51 +128,68 @@ compute_row_frequencies(table_view const& input,
   auto const key_hasher = row_hasher.device_hasher(has_nulls);
   auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input);

-  auto const pair_iter = cudf::detail::make_counting_transform_iterator(
-    size_type{0},
-    cuda::proclaim_return_type<cuco::pair<size_type, size_type>>(
-      [] __device__(size_type const i) { return cuco::make_pair(i, i); }));
-
   // Always compare NaNs as equal.
   using nan_equal_comparator =
     cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
   auto const value_comp = nan_equal_comparator{};

+  // Hard set the tparam `has_nested_columns` = false for now as we don't yet support nested
+  // columns
+  auto const key_equal = row_comp.equal_to<false>(has_nulls, null_equality::EQUAL, value_comp);
+
+  using row_hash = cudf::experimental::row::hash::device_row_hasher<
+    cudf::hashing::detail::default_hash, cudf::nullate::DYNAMIC>;
+
+  size_t const num_rows = input.num_rows();
+
+  // Construct a vector to store reduced counts and init to zero
+  rmm::device_uvector<histogram_count_type> reduction_results(num_rows, stream, mr);
+  thrust::uninitialized_fill(rmm::exec_policy_nosync(stream),
+                             reduction_results.begin(),
+                             reduction_results.end(),
+                             histogram_count_type{0});
+
+  // Construct a hash set
+  auto row_set = cuco::static_set{
+    cuco::extent{num_rows},
+    cudf::detail::CUCO_DESIRED_LOAD_FACTOR,
+    cuco::empty_key{-1},
+    key_equal,
+    cuco::linear_probing<DEFAULT_HISTOGRAM_CG_SIZE, row_hash>{key_hasher},
+    {},  // thread scope
+    {},  // storage
+    cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator<char>{}, stream},
+    stream.value()};

-  if (has_nested_columns) {
-    auto const key_equal = row_comp.equal_to<true>(has_nulls, null_equality::EQUAL, value_comp);
-    map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value());
-  } else {
-    auto const key_equal = row_comp.equal_to<false>(has_nulls, null_equality::EQUAL, value_comp);
-    map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value());
-  }
-
-  // Gather the indices of distinct rows.
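-  // Exactly one index per group of equal rows was inserted into the map, so the map size
-  // equals the number of distinct rows.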
- auto distinct_indices = std::make_unique>( - static_cast(map.get_size()), stream, mr); - - // Store the number of occurrences of each distinct row. - auto distinct_counts = make_numeric_column(data_type{type_to_id()}, - static_cast(map.get_size()), - mask_state::UNALLOCATED, - stream, - mr); + // Device-accessible reference to the hash set with `insert_and_find` operator + auto row_set_ref = row_set.ref(cuco::op::insert_and_find); // Compute frequencies (aka distinct counts) for the input rows. // Note that we consider null and NaNs as always equal. - auto const reduction_results = cudf::detail::hash_reduce_by_row( - map, - preprocessed_input, - input.num_rows(), - has_nulls, - has_nested_columns, - null_equality::EQUAL, - nan_equality::ALL_EQUAL, - reduce_func_builder{ - partial_counts ? partial_counts.value().begin() : nullptr}, - histogram_count_type{0}, - stream, - cudf::get_current_device_resource_ref()); - + thrust::for_each( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + [set_ref = row_set_ref, + increments = + partial_counts.has_value() ? partial_counts.value().begin() : nullptr, + counts = reduction_results.begin()] __device__(auto const idx) mutable { + auto const [inserted_idx_ptr, _] = set_ref.insert_and_find(idx); + cuda::atomic_ref count_ref{ + counts[*inserted_idx_ptr]}; + auto const increment = increments ? increments[idx] : histogram_count_type{1}; + count_ref.fetch_add(increment, cuda::std::memory_order_relaxed); + }); + + // Set-size is the number of distinct (inserted) rows + auto const set_size = row_set.size(stream); + + // Vector of distinct indices + auto distinct_indices = std::make_unique>(set_size, stream, mr); + // Column of distinct counts + auto distinct_counts = make_numeric_column( + data_type{type_to_id()}, set_size, mask_state::UNALLOCATED, stream, mr); + + // Copy row indices and counts to the output if counts are non-zero auto const input_it = thrust::make_zip_iterator( thrust::make_tuple(thrust::make_counting_iterator(0), reduction_results.begin())); auto const output_it = thrust::make_zip_iterator(thrust::make_tuple( @@ -232,7 +198,7 @@ compute_row_frequencies(table_view const& input, // Reduction results above are either group sizes of equal rows, or `0`. // The final output is non-zero group sizes only. thrust::copy_if( - rmm::exec_policy(stream), input_it, input_it + input.num_rows(), output_it, is_not_zero{}); + rmm::exec_policy_nosync(stream), input_it, input_it + num_rows, output_it, is_not_zero{}); return {std::move(distinct_indices), std::move(distinct_counts)}; } From bfac5e5d9b2c10718d2f0f925b4f2c9f62d8fea1 Mon Sep 17 00:00:00 2001 From: Peixin Date: Wed, 9 Oct 2024 13:56:10 +0800 Subject: [PATCH 040/117] Disable kvikio remote I/O to avoid openssl dependencies in JNI build (#17026) the same issue as https://github.com/NVIDIA/spark-rapids-jni/issues/2475 due to https://github.com/rapidsai/kvikio/pull/464 Port the fix from https://github.com/NVIDIA/spark-rapids-jni/pull/2476, verified locally Authors: - Peixin (https://github.com/pxLi) Approvers: - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/17026 --- java/ci/build-in-docker.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh index 5a429bdc739..4b5379cf0f1 100755 --- a/java/ci/build-in-docker.sh +++ b/java/ci/build-in-docker.sh @@ -64,7 +64,8 @@ cmake .. 
-G"${CMAKE_GENERATOR}" \ -DBUILD_TESTS=$BUILD_CPP_TESTS \ -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS \ -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL \ - -DBUILD_SHARED_LIBS=OFF + -DBUILD_SHARED_LIBS=OFF \ + -DKvikIO_REMOTE_SUPPORT=OFF if [[ -z "${PARALLEL_LEVEL}" ]]; then cmake --build . From 319a53327ac7c921a78979a1f23c5caf7171129d Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Wed, 9 Oct 2024 09:38:30 -0400 Subject: [PATCH 041/117] Update Changelog [skip ci] --- CHANGELOG.md | 296 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 296 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f2a7c337675..7a75b2a95a4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,299 @@ +# cudf 24.10.00 (9 Oct 2024) + +## 🚨 Breaking Changes + +- Whitespace normalization of nested column coerced as string column in JSONL inputs ([#16759](https://github.com/rapidsai/cudf/pull/16759)) [@shrshi](https://github.com/shrshi) +- Add libcudf wrappers around current_device_resource functions. ([#16679](https://github.com/rapidsai/cudf/pull/16679)) [@harrism](https://github.com/harrism) +- Fix empty cluster handling in tdigest merge ([#16675](https://github.com/rapidsai/cudf/pull/16675)) [@jihoonson](https://github.com/jihoonson) +- Remove java ColumnView.copyWithBooleanColumnAsValidity ([#16660](https://github.com/rapidsai/cudf/pull/16660)) [@revans2](https://github.com/revans2) +- Support reading multiple PQ sources with mismatching nullability for columns ([#16639](https://github.com/rapidsai/cudf/pull/16639)) [@mhaseeb123](https://github.com/mhaseeb123) +- Remove arrow_io_source ([#16607](https://github.com/rapidsai/cudf/pull/16607)) [@vyasr](https://github.com/vyasr) +- Remove legacy Arrow interop APIs ([#16590](https://github.com/rapidsai/cudf/pull/16590)) [@vyasr](https://github.com/vyasr) +- Remove NativeFile support from cudf Python ([#16589](https://github.com/rapidsai/cudf/pull/16589)) [@vyasr](https://github.com/vyasr) +- Revert "Make proxy NumPy arrays pass isinstance check in `cudf.pandas`" ([#16586](https://github.com/rapidsai/cudf/pull/16586)) [@Matt711](https://github.com/Matt711) +- Align public utility function signatures with pandas 2.x ([#16565](https://github.com/rapidsai/cudf/pull/16565)) [@mroeschke](https://github.com/mroeschke) +- Disallow cudf.Index accepting column in favor of ._from_column ([#16549](https://github.com/rapidsai/cudf/pull/16549)) [@mroeschke](https://github.com/mroeschke) +- Refactor dictionary encoding in PQ writer to migrate to the new `cuco::static_map` ([#16541](https://github.com/rapidsai/cudf/pull/16541)) [@mhaseeb123](https://github.com/mhaseeb123) +- Change IPv4 convert APIs to support UINT32 instead of INT64 ([#16489](https://github.com/rapidsai/cudf/pull/16489)) [@davidwendt](https://github.com/davidwendt) +- enable list to be forced as string in JSON reader. 
([#16472](https://github.com/rapidsai/cudf/pull/16472)) [@karthikeyann](https://github.com/karthikeyann) +- Disallow cudf.Series to accept column in favor of `._from_column` ([#16454](https://github.com/rapidsai/cudf/pull/16454)) [@mroeschke](https://github.com/mroeschke) +- Align groupby APIs with pandas 2.x ([#16403](https://github.com/rapidsai/cudf/pull/16403)) [@mroeschke](https://github.com/mroeschke) +- Align misc DataFrame and MultiIndex methods with pandas 2.x ([#16402](https://github.com/rapidsai/cudf/pull/16402)) [@mroeschke](https://github.com/mroeschke) +- Align Index APIs with pandas 2.x ([#16361](https://github.com/rapidsai/cudf/pull/16361)) [@mroeschke](https://github.com/mroeschke) +- Add `stream` param to stream compaction APIs ([#16295](https://github.com/rapidsai/cudf/pull/16295)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub) + +## 🐛 Bug Fixes + +- Add license to the pylibcudf wheel ([#16976](https://github.com/rapidsai/cudf/pull/16976)) [@raydouglass](https://github.com/raydouglass) +- Parse newline as whitespace character while tokenizing JSONL inputs with non-newline delimiter ([#16950](https://github.com/rapidsai/cudf/pull/16950)) [@shrshi](https://github.com/shrshi) +- Add dask-cudf workaround for missing `rename_axis` support in cudf ([#16899](https://github.com/rapidsai/cudf/pull/16899)) [@rjzamora](https://github.com/rjzamora) +- Update oldest deps for `pyarrow` & `numpy` ([#16883](https://github.com/rapidsai/cudf/pull/16883)) [@galipremsagar](https://github.com/galipremsagar) +- Update labeler for pylibcudf ([#16868](https://github.com/rapidsai/cudf/pull/16868)) [@vyasr](https://github.com/vyasr) +- Revert "Refactor mixed_semi_join using cuco::static_set" ([#16855](https://github.com/rapidsai/cudf/pull/16855)) [@mhaseeb123](https://github.com/mhaseeb123) +- Fix metadata after implicit array conversion from Dask cuDF ([#16842](https://github.com/rapidsai/cudf/pull/16842)) [@rjzamora](https://github.com/rjzamora) +- Add cudf.pandas dependencies.yaml to update-version.sh ([#16840](https://github.com/rapidsai/cudf/pull/16840)) [@raydouglass](https://github.com/raydouglass) +- Use cupy 12.2.0 as oldest dependency pinning on CUDA 12 ARM ([#16808](https://github.com/rapidsai/cudf/pull/16808)) [@bdice](https://github.com/bdice) +- Revert "Fix empty cluster handling in tdigest merge ([#16675)" (#16800](https://github.com/rapidsai/cudf/pull/16675)" (#16800)) [@jihoonson](https://github.com/jihoonson) +- Intentionally leak thread_local CUDA resources to avoid crash (part 1) ([#16787](https://github.com/rapidsai/cudf/pull/16787)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Fix `cov`/`corr` bug in dask-cudf ([#16786](https://github.com/rapidsai/cudf/pull/16786)) [@rjzamora](https://github.com/rjzamora) +- Fix slice_strings wide strings logic with multi-byte characters ([#16777](https://github.com/rapidsai/cudf/pull/16777)) [@davidwendt](https://github.com/davidwendt) +- Fix nvbench output for sha512 ([#16773](https://github.com/rapidsai/cudf/pull/16773)) [@davidwendt](https://github.com/davidwendt) +- Allow read_csv(header=None) to return int column labels in `mode.pandas_compatible` ([#16769](https://github.com/rapidsai/cudf/pull/16769)) [@mroeschke](https://github.com/mroeschke) +- Whitespace normalization of nested column coerced as string column in JSONL inputs ([#16759](https://github.com/rapidsai/cudf/pull/16759)) [@shrshi](https://github.com/shrshi) +- Fix DataFrame.drop(columns=cudf.Series/Index, axis=1) 
([#16712](https://github.com/rapidsai/cudf/pull/16712)) [@mroeschke](https://github.com/mroeschke) +- Use merge base when calculating changed files ([#16709](https://github.com/rapidsai/cudf/pull/16709)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Ensure we pass the has_nulls tparam to mixed_join kernels ([#16708](https://github.com/rapidsai/cudf/pull/16708)) [@abellina](https://github.com/abellina) +- Add boost-devel to Java CI Docker image ([#16707](https://github.com/rapidsai/cudf/pull/16707)) [@jlowe](https://github.com/jlowe) +- [BUG] Add gpu node type to cudf-pandas 3rd-party integration nightly CI job ([#16704](https://github.com/rapidsai/cudf/pull/16704)) [@Matt711](https://github.com/Matt711) +- Fix typo in column_factories.hpp comment from 'depth 1' to 'depth 2' ([#16700](https://github.com/rapidsai/cudf/pull/16700)) [@a-hirota](https://github.com/a-hirota) +- Fix Series.to_frame(name=None) setting a None name ([#16698](https://github.com/rapidsai/cudf/pull/16698)) [@mroeschke](https://github.com/mroeschke) +- Disable gtests/ERROR_TEST during compute-sanitizer memcheck test ([#16691](https://github.com/rapidsai/cudf/pull/16691)) [@davidwendt](https://github.com/davidwendt) +- Enable batched multi-source reading of JSONL files with large records ([#16687](https://github.com/rapidsai/cudf/pull/16687)) [@shrshi](https://github.com/shrshi) +- Handle `ordered` parameter in `CategoricalIndex.__repr__` ([#16683](https://github.com/rapidsai/cudf/pull/16683)) [@galipremsagar](https://github.com/galipremsagar) +- Fix loc/iloc.__setitem__[:, loc] with non cupy types ([#16677](https://github.com/rapidsai/cudf/pull/16677)) [@mroeschke](https://github.com/mroeschke) +- Fix empty cluster handling in tdigest merge ([#16675](https://github.com/rapidsai/cudf/pull/16675)) [@jihoonson](https://github.com/jihoonson) +- Fix `cudf::rank` not getting enough params ([#16666](https://github.com/rapidsai/cudf/pull/16666)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub) +- Fix slowdown in `CategoricalIndex.__repr__` ([#16665](https://github.com/rapidsai/cudf/pull/16665)) [@galipremsagar](https://github.com/galipremsagar) +- Remove java ColumnView.copyWithBooleanColumnAsValidity ([#16660](https://github.com/rapidsai/cudf/pull/16660)) [@revans2](https://github.com/revans2) +- Fix slowdown in DataFrame repr in jupyter notebook ([#16656](https://github.com/rapidsai/cudf/pull/16656)) [@galipremsagar](https://github.com/galipremsagar) +- Preserve Series name in duplicated method. ([#16655](https://github.com/rapidsai/cudf/pull/16655)) [@bdice](https://github.com/bdice) +- Fix interval_range right child non-zero offset ([#16651](https://github.com/rapidsai/cudf/pull/16651)) [@mroeschke](https://github.com/mroeschke) +- fix libcudf wheel publishing, make package-type explicit in wheel publishing ([#16650](https://github.com/rapidsai/cudf/pull/16650)) [@jameslamb](https://github.com/jameslamb) +- Revert "Hide all gtest symbols in cudftestutil ([#16546)" (#16644](https://github.com/rapidsai/cudf/pull/16546)" (#16644)) [@robertmaynard](https://github.com/robertmaynard) +- Fix integer overflow in indexalator pointer logic ([#16643](https://github.com/rapidsai/cudf/pull/16643)) [@davidwendt](https://github.com/davidwendt) +- Allow for binops between two differently sized DecimalDtypes ([#16638](https://github.com/rapidsai/cudf/pull/16638)) [@mroeschke](https://github.com/mroeschke) +- Move pragma once in rolling/jit/operation.hpp. 
([#16636](https://github.com/rapidsai/cudf/pull/16636)) [@bdice](https://github.com/bdice) +- Fix overflow bug in low-memory JSON reader ([#16632](https://github.com/rapidsai/cudf/pull/16632)) [@shrshi](https://github.com/shrshi) +- Add the missing `num_aggregations` axis for `groupby_max_cardinality` ([#16630](https://github.com/rapidsai/cudf/pull/16630)) [@PointKernel](https://github.com/PointKernel) +- Fix strings::detail::copy_range when target contains nulls ([#16626](https://github.com/rapidsai/cudf/pull/16626)) [@davidwendt](https://github.com/davidwendt) +- Fix function parameters with common dependency modified during their evaluation ([#16620](https://github.com/rapidsai/cudf/pull/16620)) [@ttnghia](https://github.com/ttnghia) +- bug-fix: Don't enable the CUDA language if testing was requested when finding cudf ([#16615](https://github.com/rapidsai/cudf/pull/16615)) [@cryos](https://github.com/cryos) +- bug-fix: cudf/io/json.hpp use after move ([#16609](https://github.com/rapidsai/cudf/pull/16609)) [@NicolasDenoyelle](https://github.com/NicolasDenoyelle) +- Remove CUDA whole compilation ODR violations ([#16603](https://github.com/rapidsai/cudf/pull/16603)) [@robertmaynard](https://github.com/robertmaynard) +- MAINT: Adapt to numpy hiding flagsobject away ([#16593](https://github.com/rapidsai/cudf/pull/16593)) [@seberg](https://github.com/seberg) +- Revert "Make proxy NumPy arrays pass isinstance check in `cudf.pandas`" ([#16586](https://github.com/rapidsai/cudf/pull/16586)) [@Matt711](https://github.com/Matt711) +- Switch python version to `3.10` in `cudf.pandas` pandas test scripts ([#16559](https://github.com/rapidsai/cudf/pull/16559)) [@galipremsagar](https://github.com/galipremsagar) +- Hide all gtest symbols in cudftestutil ([#16546](https://github.com/rapidsai/cudf/pull/16546)) [@robertmaynard](https://github.com/robertmaynard) +- Update the java code to properly deal with lists being returned as strings ([#16536](https://github.com/rapidsai/cudf/pull/16536)) [@revans2](https://github.com/revans2) +- Register `read_parquet` and `read_csv` with dask-expr ([#16535](https://github.com/rapidsai/cudf/pull/16535)) [@rjzamora](https://github.com/rjzamora) +- Change cudf::empty_like to not include offsets for empty strings columns ([#16529](https://github.com/rapidsai/cudf/pull/16529)) [@davidwendt](https://github.com/davidwendt) +- Fix DataFrame reductions with median returning scalar instead of Series ([#16527](https://github.com/rapidsai/cudf/pull/16527)) [@mroeschke](https://github.com/mroeschke) +- Allow DataFrame.sort_values(by=) to select an index level ([#16519](https://github.com/rapidsai/cudf/pull/16519)) [@mroeschke](https://github.com/mroeschke) +- Fix `date_range(start, end, freq)` when end-start is divisible by freq ([#16516](https://github.com/rapidsai/cudf/pull/16516)) [@mroeschke](https://github.com/mroeschke) +- Preserve array name in MultiIndex.from_arrays ([#16515](https://github.com/rapidsai/cudf/pull/16515)) [@mroeschke](https://github.com/mroeschke) +- Disallow indexing by selecting duplicate labels ([#16514](https://github.com/rapidsai/cudf/pull/16514)) [@mroeschke](https://github.com/mroeschke) +- Fix `.replace(Index, Index)` raising a TypeError ([#16513](https://github.com/rapidsai/cudf/pull/16513)) [@mroeschke](https://github.com/mroeschke) +- Check index bounds in compact protocol reader. 
([#16493](https://github.com/rapidsai/cudf/pull/16493)) [@bdice](https://github.com/bdice) +- Fix build failures with GCC 13 ([#16488](https://github.com/rapidsai/cudf/pull/16488)) [@PointKernel](https://github.com/PointKernel) +- Fix all-empty input column for strings split APIs ([#16466](https://github.com/rapidsai/cudf/pull/16466)) [@davidwendt](https://github.com/davidwendt) +- Fix segmented-sort overlapped input/output indices ([#16463](https://github.com/rapidsai/cudf/pull/16463)) [@davidwendt](https://github.com/davidwendt) +- Fix merge conflict for auto merge 16447 ([#16449](https://github.com/rapidsai/cudf/pull/16449)) [@davidwendt](https://github.com/davidwendt) + +## 📖 Documentation + +- Fix links in Dask cuDF documentation ([#16929](https://github.com/rapidsai/cudf/pull/16929)) [@rjzamora](https://github.com/rjzamora) +- Improve aggregation documentation ([#16822](https://github.com/rapidsai/cudf/pull/16822)) [@PointKernel](https://github.com/PointKernel) +- Add best practices page to Dask cuDF docs ([#16821](https://github.com/rapidsai/cudf/pull/16821)) [@rjzamora](https://github.com/rjzamora) +- [DOC] Update Pylibcudf doc strings ([#16810](https://github.com/rapidsai/cudf/pull/16810)) [@Matt711](https://github.com/Matt711) +- Recommending `miniforge` for conda install ([#16782](https://github.com/rapidsai/cudf/pull/16782)) [@mmccarty](https://github.com/mmccarty) +- Add labeling pylibcudf doc pages ([#16779](https://github.com/rapidsai/cudf/pull/16779)) [@mroeschke](https://github.com/mroeschke) +- Migrate dask-cudf README improvements to dask-cudf sphinx docs ([#16765](https://github.com/rapidsai/cudf/pull/16765)) [@rjzamora](https://github.com/rjzamora) +- [DOC] Remove out of date section from cudf.pandas docs ([#16697](https://github.com/rapidsai/cudf/pull/16697)) [@Matt711](https://github.com/Matt711) +- Add performance tips to cudf.pandas FAQ. ([#16693](https://github.com/rapidsai/cudf/pull/16693)) [@bdice](https://github.com/bdice) +- Update documentation for Dask cuDF ([#16671](https://github.com/rapidsai/cudf/pull/16671)) [@rjzamora](https://github.com/rjzamora) +- Add missing pylibcudf strings docs ([#16471](https://github.com/rapidsai/cudf/pull/16471)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- DOC: Refresh pylibcudf guide ([#15856](https://github.com/rapidsai/cudf/pull/15856)) [@lithomas1](https://github.com/lithomas1) + +## 🚀 New Features + +- Build `cudf-polars` with `build.sh` ([#16898](https://github.com/rapidsai/cudf/pull/16898)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add polars to "all" dependency list. ([#16875](https://github.com/rapidsai/cudf/pull/16875)) [@bdice](https://github.com/bdice) +- nvCOMP GZIP integration ([#16770](https://github.com/rapidsai/cudf/pull/16770)) [@vuule](https://github.com/vuule) +- [FEA] Add support for `cudf.NamedAgg` ([#16744](https://github.com/rapidsai/cudf/pull/16744)) [@Matt711](https://github.com/Matt711) +- Add experimental `filesystem="arrow"` support in `dask_cudf.read_parquet` ([#16684](https://github.com/rapidsai/cudf/pull/16684)) [@rjzamora](https://github.com/rjzamora) +- Relax Arrow pin ([#16681](https://github.com/rapidsai/cudf/pull/16681)) [@vyasr](https://github.com/vyasr) +- Add libcudf wrappers around current_device_resource functions. 
([#16679](https://github.com/rapidsai/cudf/pull/16679)) [@harrism](https://github.com/harrism) +- Move NDS-H examples into benchmarks ([#16663](https://github.com/rapidsai/cudf/pull/16663)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub) +- [FEA] Add third-party library integration testing of cudf.pandas to cudf ([#16645](https://github.com/rapidsai/cudf/pull/16645)) [@Matt711](https://github.com/Matt711) +- Make isinstance check pass for proxy ndarrays ([#16601](https://github.com/rapidsai/cudf/pull/16601)) [@Matt711](https://github.com/Matt711) +- [FEA] Add an environment variable to fail on fallback in `cudf.pandas` ([#16562](https://github.com/rapidsai/cudf/pull/16562)) [@Matt711](https://github.com/Matt711) +- [FEA] Add support for `cudf.unique` ([#16554](https://github.com/rapidsai/cudf/pull/16554)) [@Matt711](https://github.com/Matt711) +- [FEA] Support named aggregations in `df.groupby().agg()` ([#16528](https://github.com/rapidsai/cudf/pull/16528)) [@Matt711](https://github.com/Matt711) +- Change IPv4 convert APIs to support UINT32 instead of INT64 ([#16489](https://github.com/rapidsai/cudf/pull/16489)) [@davidwendt](https://github.com/davidwendt) +- enable list to be forced as string in JSON reader. ([#16472](https://github.com/rapidsai/cudf/pull/16472)) [@karthikeyann](https://github.com/karthikeyann) +- Remove cuDF dependency from pylibcudf column from_device tests ([#16441](https://github.com/rapidsai/cudf/pull/16441)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Enable cudf.pandas REPL and -c command support ([#16428](https://github.com/rapidsai/cudf/pull/16428)) [@bdice](https://github.com/bdice) +- Setup pylibcudf package ([#16299](https://github.com/rapidsai/cudf/pull/16299)) [@lithomas1](https://github.com/lithomas1) +- Add a libcudf/thrust-based TPC-H derived datagen ([#16294](https://github.com/rapidsai/cudf/pull/16294)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub) +- Make proxy NumPy arrays pass isinstance check in `cudf.pandas` ([#16286](https://github.com/rapidsai/cudf/pull/16286)) [@Matt711](https://github.com/Matt711) +- Add skiprows and nrows to parquet reader ([#16214](https://github.com/rapidsai/cudf/pull/16214)) [@lithomas1](https://github.com/lithomas1) +- Upgrade to nvcomp 4.0.1 ([#16076](https://github.com/rapidsai/cudf/pull/16076)) [@vuule](https://github.com/vuule) +- Migrate ORC reader to pylibcudf ([#16042](https://github.com/rapidsai/cudf/pull/16042)) [@lithomas1](https://github.com/lithomas1) +- JSON reader validation of values ([#15968](https://github.com/rapidsai/cudf/pull/15968)) [@karthikeyann](https://github.com/karthikeyann) +- Implement exposed null mask APIs in pylibcudf ([#15908](https://github.com/rapidsai/cudf/pull/15908)) [@charlesbluca](https://github.com/charlesbluca) +- Word-based nvtext::minhash function ([#15368](https://github.com/rapidsai/cudf/pull/15368)) [@davidwendt](https://github.com/davidwendt) + +## 🛠️ Improvements + +- Make tests deterministic ([#16910](https://github.com/rapidsai/cudf/pull/16910)) [@galipremsagar](https://github.com/galipremsagar) +- Update update-version.sh to use packaging lib ([#16891](https://github.com/rapidsai/cudf/pull/16891)) [@AyodeAwe](https://github.com/AyodeAwe) +- Pin polars for 24.10 and update polars test suite xfail list ([#16886](https://github.com/rapidsai/cudf/pull/16886)) [@wence-](https://github.com/wence-) +- Add in support for setting delim when parsing JSON through java ([#16867) (#16880](https://github.com/rapidsai/cudf/pull/16867) (#16880)) 
[@revans2](https://github.com/revans2) +- Remove unnecessary flag from build.sh ([#16879](https://github.com/rapidsai/cudf/pull/16879)) [@vyasr](https://github.com/vyasr) +- Ignore numba warning specific to ARM runners ([#16872](https://github.com/rapidsai/cudf/pull/16872)) [@galipremsagar](https://github.com/galipremsagar) +- Display deltas for `cudf.pandas` test summary ([#16864](https://github.com/rapidsai/cudf/pull/16864)) [@galipremsagar](https://github.com/galipremsagar) +- Switch to using native `traceback` ([#16851](https://github.com/rapidsai/cudf/pull/16851)) [@galipremsagar](https://github.com/galipremsagar) +- JSON tree algorithm code reorg ([#16836](https://github.com/rapidsai/cudf/pull/16836)) [@karthikeyann](https://github.com/karthikeyann) +- Add string.repeats API to pylibcudf ([#16834](https://github.com/rapidsai/cudf/pull/16834)) [@mroeschke](https://github.com/mroeschke) +- Use CI workflow branch 'branch-24.10' again ([#16832](https://github.com/rapidsai/cudf/pull/16832)) [@jameslamb](https://github.com/jameslamb) +- Rename the NDS-H benchmark binaries ([#16831](https://github.com/rapidsai/cudf/pull/16831)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub) +- Add string.findall APIs to pylibcudf ([#16825](https://github.com/rapidsai/cudf/pull/16825)) [@mroeschke](https://github.com/mroeschke) +- Add string.extract APIs to pylibcudf ([#16823](https://github.com/rapidsai/cudf/pull/16823)) [@mroeschke](https://github.com/mroeschke) +- use get-pr-info from nv-gha-runners ([#16819](https://github.com/rapidsai/cudf/pull/16819)) [@AyodeAwe](https://github.com/AyodeAwe) +- Add string.contains APIs to pylibcudf ([#16814](https://github.com/rapidsai/cudf/pull/16814)) [@mroeschke](https://github.com/mroeschke) +- Forward-merge branch-24.08 to branch-24.10 ([#16813](https://github.com/rapidsai/cudf/pull/16813)) [@bdice](https://github.com/bdice) +- Add io_type axis with default `PINNED_BUFFER` to nvbench PQ multithreaded reader ([#16809](https://github.com/rapidsai/cudf/pull/16809)) [@mhaseeb123](https://github.com/mhaseeb123) +- Update fmt (to 11.0.2) and spdlog (to 1.14.1). ([#16806](https://github.com/rapidsai/cudf/pull/16806)) [@jameslamb](https://github.com/jameslamb) +- Add ability to set parquet row group max #rows and #bytes in java ([#16805](https://github.com/rapidsai/cudf/pull/16805)) [@pmattione-nvidia](https://github.com/pmattione-nvidia) +- Add in option for Java JSON APIs to do column pruning in CUDF ([#16796](https://github.com/rapidsai/cudf/pull/16796)) [@revans2](https://github.com/revans2) +- Support drop_first in get_dummies ([#16795](https://github.com/rapidsai/cudf/pull/16795)) [@mroeschke](https://github.com/mroeschke) +- Exposed stream-ordering to join API ([#16793](https://github.com/rapidsai/cudf/pull/16793)) [@lamarrr](https://github.com/lamarrr) +- Add string.attributes APIs to pylibcudf ([#16785](https://github.com/rapidsai/cudf/pull/16785)) [@mroeschke](https://github.com/mroeschke) +- Java: Make ColumnVector.fromViewWithContiguousAllocation public ([#16784](https://github.com/rapidsai/cudf/pull/16784)) [@jlowe](https://github.com/jlowe) +- Add partitioning APIs to pylibcudf ([#16781](https://github.com/rapidsai/cudf/pull/16781)) [@mroeschke](https://github.com/mroeschke) +- Optimization of tdigest merge aggregation. 
([#16780](https://github.com/rapidsai/cudf/pull/16780)) [@nvdbaranec](https://github.com/nvdbaranec) +- use libkvikio wheels in wheel builds ([#16778](https://github.com/rapidsai/cudf/pull/16778)) [@jameslamb](https://github.com/jameslamb) +- Exposed stream-ordering to datetime API ([#16774](https://github.com/rapidsai/cudf/pull/16774)) [@lamarrr](https://github.com/lamarrr) +- Add io/timezone APIs to pylibcudf ([#16771](https://github.com/rapidsai/cudf/pull/16771)) [@mroeschke](https://github.com/mroeschke) +- Remove `MultiIndex._poplevel` inplace implementation. ([#16767](https://github.com/rapidsai/cudf/pull/16767)) [@mroeschke](https://github.com/mroeschke) +- allow pandas patch version to float in cudf-pandas unit tests ([#16763](https://github.com/rapidsai/cudf/pull/16763)) [@jameslamb](https://github.com/jameslamb) +- Simplify the nvCOMP adapter ([#16762](https://github.com/rapidsai/cudf/pull/16762)) [@vuule](https://github.com/vuule) +- Add labeling APIs to pylibcudf ([#16761](https://github.com/rapidsai/cudf/pull/16761)) [@mroeschke](https://github.com/mroeschke) +- Add transform APIs to pylibcudf ([#16760](https://github.com/rapidsai/cudf/pull/16760)) [@mroeschke](https://github.com/mroeschke) +- Add a benchmark to study Parquet reader's performance for wide tables ([#16751](https://github.com/rapidsai/cudf/pull/16751)) [@mhaseeb123](https://github.com/mhaseeb123) +- Change the Parquet writer's `default_row_group_size_bytes` from 128MB to inf ([#16750](https://github.com/rapidsai/cudf/pull/16750)) [@mhaseeb123](https://github.com/mhaseeb123) +- Add transpose API to pylibcudf ([#16749](https://github.com/rapidsai/cudf/pull/16749)) [@mroeschke](https://github.com/mroeschke) +- Add support for Python 3.12, update Kafka dependencies to 2.5.x ([#16745](https://github.com/rapidsai/cudf/pull/16745)) [@jameslamb](https://github.com/jameslamb) +- Generate GPU vs CPU usage metrics per pytest file in pandas testsuite for `cudf.pandas` ([#16739](https://github.com/rapidsai/cudf/pull/16739)) [@galipremsagar](https://github.com/galipremsagar) +- Refactor cudf pandas integration tests CI ([#16728](https://github.com/rapidsai/cudf/pull/16728)) [@Matt711](https://github.com/Matt711) +- Remove ERROR_TEST gtest from libcudf ([#16722](https://github.com/rapidsai/cudf/pull/16722)) [@davidwendt](https://github.com/davidwendt) +- Use Series._from_column more consistently to avoid validation ([#16716](https://github.com/rapidsai/cudf/pull/16716)) [@mroeschke](https://github.com/mroeschke) +- remove some unnecessary libcudf nightly builds ([#16714](https://github.com/rapidsai/cudf/pull/16714)) [@jameslamb](https://github.com/jameslamb) +- Remove xfail from torch-cudf.pandas integration test ([#16705](https://github.com/rapidsai/cudf/pull/16705)) [@Matt711](https://github.com/Matt711) +- Add return type annotations to MultiIndex ([#16696](https://github.com/rapidsai/cudf/pull/16696)) [@mroeschke](https://github.com/mroeschke) +- Add type annotations to Index classes, utilize _from_column more ([#16695](https://github.com/rapidsai/cudf/pull/16695)) [@mroeschke](https://github.com/mroeschke) +- Have interval_range use IntervalIndex.from_breaks, remove column_empty_same_mask ([#16694](https://github.com/rapidsai/cudf/pull/16694)) [@mroeschke](https://github.com/mroeschke) +- Increase timeouts for couple of tests ([#16692](https://github.com/rapidsai/cudf/pull/16692)) [@galipremsagar](https://github.com/galipremsagar) +- Replace raw device_memory_resource pointer in pylibcudf Cython 
([#16674](https://github.com/rapidsai/cudf/pull/16674)) [@harrism](https://github.com/harrism) +- switch from typing.Callable to collections.abc.Callable ([#16670](https://github.com/rapidsai/cudf/pull/16670)) [@jameslamb](https://github.com/jameslamb) +- Update rapidsai/pre-commit-hooks ([#16669](https://github.com/rapidsai/cudf/pull/16669)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Multi-file and Parquet-aware prefetching from remote storage ([#16657](https://github.com/rapidsai/cudf/pull/16657)) [@rjzamora](https://github.com/rjzamora) +- Access Frame attributes instead of ColumnAccessor attributes when available ([#16652](https://github.com/rapidsai/cudf/pull/16652)) [@mroeschke](https://github.com/mroeschke) +- Use non-mangled type names in nvbench output ([#16649](https://github.com/rapidsai/cudf/pull/16649)) [@davidwendt](https://github.com/davidwendt) +- Add pylibcudf build dir in build.sh for `clean` ([#16648](https://github.com/rapidsai/cudf/pull/16648)) [@galipremsagar](https://github.com/galipremsagar) +- Prune workflows based on changed files ([#16642](https://github.com/rapidsai/cudf/pull/16642)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Remove arrow dependency ([#16640](https://github.com/rapidsai/cudf/pull/16640)) [@vyasr](https://github.com/vyasr) +- Support reading multiple PQ sources with mismatching nullability for columns ([#16639](https://github.com/rapidsai/cudf/pull/16639)) [@mhaseeb123](https://github.com/mhaseeb123) +- Drop Python 3.9 support ([#16637](https://github.com/rapidsai/cudf/pull/16637)) [@jameslamb](https://github.com/jameslamb) +- Support DecimalDtype meta in dask_cudf ([#16634](https://github.com/rapidsai/cudf/pull/16634)) [@mroeschke](https://github.com/mroeschke) +- Add `num_multiprocessors` utility ([#16628](https://github.com/rapidsai/cudf/pull/16628)) [@PointKernel](https://github.com/PointKernel) +- Annotate `ColumnAccessor._data` labels as `Hashable` ([#16623](https://github.com/rapidsai/cudf/pull/16623)) [@mroeschke](https://github.com/mroeschke) +- Remove build_categorical_column in favor of CategoricalColumn constructor ([#16617](https://github.com/rapidsai/cudf/pull/16617)) [@mroeschke](https://github.com/mroeschke) +- Move apply_boolean_mask benchmark to nvbench ([#16616](https://github.com/rapidsai/cudf/pull/16616)) [@davidwendt](https://github.com/davidwendt) +- Revise `get_reader_filepath_or_buffer` to handle a list of data sources ([#16613](https://github.com/rapidsai/cudf/pull/16613)) [@rjzamora](https://github.com/rjzamora) +- do not install cudf in cudf_polars wheel tests ([#16612](https://github.com/rapidsai/cudf/pull/16612)) [@jameslamb](https://github.com/jameslamb) +- remove streamz git dependency, standardize build dependency names, consolidate some dependency lists ([#16611](https://github.com/rapidsai/cudf/pull/16611)) [@jameslamb](https://github.com/jameslamb) +- Fix C++ and Cython io types ([#16610](https://github.com/rapidsai/cudf/pull/16610)) [@vyasr](https://github.com/vyasr) +- Remove arrow_io_source ([#16607](https://github.com/rapidsai/cudf/pull/16607)) [@vyasr](https://github.com/vyasr) +- Remove thrust::optional from expression evaluator ([#16604](https://github.com/rapidsai/cudf/pull/16604)) [@bdice](https://github.com/bdice) +- Add stricter typing and validation to ColumnAccessor ([#16602](https://github.com/rapidsai/cudf/pull/16602)) [@mroeschke](https://github.com/mroeschke) +- make more use of YAML anchors in dependencies.yaml ([#16597](https://github.com/rapidsai/cudf/pull/16597)) 
[@jameslamb](https://github.com/jameslamb) +- Enable testing `cudf.pandas` unit tests for all minor versions of pandas ([#16595](https://github.com/rapidsai/cudf/pull/16595)) [@galipremsagar](https://github.com/galipremsagar) +- Extend the Parquet writer's dictionary encoding benchmark. ([#16591](https://github.com/rapidsai/cudf/pull/16591)) [@mhaseeb123](https://github.com/mhaseeb123) +- Remove legacy Arrow interop APIs ([#16590](https://github.com/rapidsai/cudf/pull/16590)) [@vyasr](https://github.com/vyasr) +- Remove NativeFile support from cudf Python ([#16589](https://github.com/rapidsai/cudf/pull/16589)) [@vyasr](https://github.com/vyasr) +- Add build job for pylibcudf ([#16587](https://github.com/rapidsai/cudf/pull/16587)) [@vyasr](https://github.com/vyasr) +- Add `public` qualifier for some member functions in Java class `Schema` ([#16583](https://github.com/rapidsai/cudf/pull/16583)) [@ttnghia](https://github.com/ttnghia) +- Enable gtests previously disabled for compute-sanitizer bug ([#16581](https://github.com/rapidsai/cudf/pull/16581)) [@davidwendt](https://github.com/davidwendt) +- [FEA] Add filesystem argument to `cudf.read_parquet` ([#16577](https://github.com/rapidsai/cudf/pull/16577)) [@rjzamora](https://github.com/rjzamora) +- Ensure size is always passed to NumericalColumn ([#16576](https://github.com/rapidsai/cudf/pull/16576)) [@mroeschke](https://github.com/mroeschke) +- standardize and consolidate wheel installations in testing scripts ([#16575](https://github.com/rapidsai/cudf/pull/16575)) [@jameslamb](https://github.com/jameslamb) +- Performance improvement for strings::slice for wide strings ([#16574](https://github.com/rapidsai/cudf/pull/16574)) [@davidwendt](https://github.com/davidwendt) +- Add `ToCudfBackend` expression to dask-cudf ([#16573](https://github.com/rapidsai/cudf/pull/16573)) [@rjzamora](https://github.com/rjzamora) +- CI: Test against old versions of key dependencies ([#16570](https://github.com/rapidsai/cudf/pull/16570)) [@seberg](https://github.com/seberg) +- Replace `NativeFile` dependency in dask-cudf Parquet reader ([#16569](https://github.com/rapidsai/cudf/pull/16569)) [@rjzamora](https://github.com/rjzamora) +- Align public utility function signatures with pandas 2.x ([#16565](https://github.com/rapidsai/cudf/pull/16565)) [@mroeschke](https://github.com/mroeschke) +- Move libcudf reduction google-benchmarks to nvbench ([#16564](https://github.com/rapidsai/cudf/pull/16564)) [@davidwendt](https://github.com/davidwendt) +- Rework strings::slice benchmark to use nvbench ([#16563](https://github.com/rapidsai/cudf/pull/16563)) [@davidwendt](https://github.com/davidwendt) +- Reenable arrow tests ([#16556](https://github.com/rapidsai/cudf/pull/16556)) [@vyasr](https://github.com/vyasr) +- Clean up reshaping ops ([#16553](https://github.com/rapidsai/cudf/pull/16553)) [@mroeschke](https://github.com/mroeschke) +- Disallow cudf.Index accepting column in favor of ._from_column ([#16549](https://github.com/rapidsai/cudf/pull/16549)) [@mroeschke](https://github.com/mroeschke) +- Rewrite remaining Python Arrow interop conversions using the C Data Interface ([#16548](https://github.com/rapidsai/cudf/pull/16548)) [@vyasr](https://github.com/vyasr) +- [REVIEW] JSON host tree algorithms ([#16545](https://github.com/rapidsai/cudf/pull/16545)) [@shrshi](https://github.com/shrshi) +- Refactor dictionary encoding in PQ writer to migrate to the new `cuco::static_map` ([#16541](https://github.com/rapidsai/cudf/pull/16541)) 
[@mhaseeb123](https://github.com/mhaseeb123) +- Remove hardcoded versions from workflows. ([#16540](https://github.com/rapidsai/cudf/pull/16540)) [@bdice](https://github.com/bdice) +- Ensure comparisons with pyints and integer series always succeed ([#16532](https://github.com/rapidsai/cudf/pull/16532)) [@seberg](https://github.com/seberg) +- Remove unneeded output size parameter from internal count_matches utility ([#16531](https://github.com/rapidsai/cudf/pull/16531)) [@davidwendt](https://github.com/davidwendt) +- Remove invalid column_view usage in string-scalar-to-column function ([#16530](https://github.com/rapidsai/cudf/pull/16530)) [@davidwendt](https://github.com/davidwendt) +- Raise NotImplementedError for Series.rename that's not a scalar ([#16525](https://github.com/rapidsai/cudf/pull/16525)) [@mroeschke](https://github.com/mroeschke) +- Remove deprecated public APIs from libcudf ([#16524](https://github.com/rapidsai/cudf/pull/16524)) [@davidwendt](https://github.com/davidwendt) +- Return Interval object in pandas compat mode for IntervalIndex reductions ([#16523](https://github.com/rapidsai/cudf/pull/16523)) [@mroeschke](https://github.com/mroeschke) +- Update json normalization to take device_buffer ([#16520](https://github.com/rapidsai/cudf/pull/16520)) [@karthikeyann](https://github.com/karthikeyann) +- Rework cudf::io::text::byte_range_info class member functions ([#16518](https://github.com/rapidsai/cudf/pull/16518)) [@davidwendt](https://github.com/davidwendt) +- Remove unneeded pair-iterator benchmark ([#16511](https://github.com/rapidsai/cudf/pull/16511)) [@davidwendt](https://github.com/davidwendt) +- Update pre-commit hooks ([#16510](https://github.com/rapidsai/cudf/pull/16510)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Improve update-version.sh ([#16506](https://github.com/rapidsai/cudf/pull/16506)) [@bdice](https://github.com/bdice) +- Use tool.scikit-build.cmake.version, set scikit-build-core minimum-version ([#16503](https://github.com/rapidsai/cudf/pull/16503)) [@jameslamb](https://github.com/jameslamb) +- Pass batch size to JSON reader using environment variable ([#16502](https://github.com/rapidsai/cudf/pull/16502)) [@shrshi](https://github.com/shrshi) +- Remove a deprecated multibyte_split API ([#16501](https://github.com/rapidsai/cudf/pull/16501)) [@davidwendt](https://github.com/davidwendt) +- Add interop example for `arrow::StringViewArray` to `cudf::column` ([#16498](https://github.com/rapidsai/cudf/pull/16498)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub) +- Add keep option to distinct nvbench ([#16497](https://github.com/rapidsai/cudf/pull/16497)) [@bdice](https://github.com/bdice) +- Use more idomatic cudf APIs in dask_cudf meta generation ([#16487](https://github.com/rapidsai/cudf/pull/16487)) [@mroeschke](https://github.com/mroeschke) +- Fix typo in dispatch_row_equal. 
([#16473](https://github.com/rapidsai/cudf/pull/16473)) [@bdice](https://github.com/bdice) +- Use explicit construction of column subclass instead of `build_column` when type is known ([#16470](https://github.com/rapidsai/cudf/pull/16470)) [@mroeschke](https://github.com/mroeschke) +- Move exception handler into pylibcudf from cudf ([#16468](https://github.com/rapidsai/cudf/pull/16468)) [@lithomas1](https://github.com/lithomas1) +- Make StructColumn.__init__ strict ([#16467](https://github.com/rapidsai/cudf/pull/16467)) [@mroeschke](https://github.com/mroeschke) +- Make ListColumn.__init__ strict ([#16465](https://github.com/rapidsai/cudf/pull/16465)) [@mroeschke](https://github.com/mroeschke) +- Make Timedelta/DatetimeColumn.__init__ strict ([#16464](https://github.com/rapidsai/cudf/pull/16464)) [@mroeschke](https://github.com/mroeschke) +- Make NumericalColumn.__init__ strict ([#16457](https://github.com/rapidsai/cudf/pull/16457)) [@mroeschke](https://github.com/mroeschke) +- Make CategoricalColumn.__init__ strict ([#16456](https://github.com/rapidsai/cudf/pull/16456)) [@mroeschke](https://github.com/mroeschke) +- Disallow cudf.Series to accept column in favor of `._from_column` ([#16454](https://github.com/rapidsai/cudf/pull/16454)) [@mroeschke](https://github.com/mroeschke) +- Expose `stream` param in transform APIs ([#16452](https://github.com/rapidsai/cudf/pull/16452)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub) +- Add upper bound pin for polars ([#16442](https://github.com/rapidsai/cudf/pull/16442)) [@wence-](https://github.com/wence-) +- Make (Indexed)Frame.__init__ require data (and index) ([#16430](https://github.com/rapidsai/cudf/pull/16430)) [@mroeschke](https://github.com/mroeschke) +- Add Java APIs to copy column data to host asynchronously ([#16429](https://github.com/rapidsai/cudf/pull/16429)) [@jlowe](https://github.com/jlowe) +- Update docs of the TPC-H derived examples ([#16423](https://github.com/rapidsai/cudf/pull/16423)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub) +- Use RMM adaptor constructors instead of factories. 
([#16414](https://github.com/rapidsai/cudf/pull/16414)) [@bdice](https://github.com/bdice) +- Align ewm APIs with pandas 2.x ([#16413](https://github.com/rapidsai/cudf/pull/16413)) [@mroeschke](https://github.com/mroeschke) +- Remove checking for specific tests in memcheck script ([#16412](https://github.com/rapidsai/cudf/pull/16412)) [@davidwendt](https://github.com/davidwendt) +- Add stream parameter to reshape APIs ([#16410](https://github.com/rapidsai/cudf/pull/16410)) [@davidwendt](https://github.com/davidwendt) +- Align groupby APIs with pandas 2.x ([#16403](https://github.com/rapidsai/cudf/pull/16403)) [@mroeschke](https://github.com/mroeschke) +- Align misc DataFrame and MultiIndex methods with pandas 2.x ([#16402](https://github.com/rapidsai/cudf/pull/16402)) [@mroeschke](https://github.com/mroeschke) +- update some branch references in GitHub Actions configs ([#16397](https://github.com/rapidsai/cudf/pull/16397)) [@jameslamb](https://github.com/jameslamb) +- Support reading matching projected and filter cols from Parquet files with otherwise mismatched schemas ([#16394](https://github.com/rapidsai/cudf/pull/16394)) [@mhaseeb123](https://github.com/mhaseeb123) +- Merge branch-24.08 into branch-24.10 ([#16393](https://github.com/rapidsai/cudf/pull/16393)) [@jameslamb](https://github.com/jameslamb) +- Add query 10 to the TPC-H suite ([#16392](https://github.com/rapidsai/cudf/pull/16392)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub) +- Use `make_host_vector` instead of `make_std_vector` to facilitate pinned memory optimizations ([#16386](https://github.com/rapidsai/cudf/pull/16386)) [@vuule](https://github.com/vuule) +- Fix some issues with deprecated / removed cccl facilities ([#16377](https://github.com/rapidsai/cudf/pull/16377)) [@miscco](https://github.com/miscco) +- Align IntervalIndex APIs with pandas 2.x ([#16371](https://github.com/rapidsai/cudf/pull/16371)) [@mroeschke](https://github.com/mroeschke) +- Align CategoricalIndex APIs with pandas 2.x ([#16369](https://github.com/rapidsai/cudf/pull/16369)) [@mroeschke](https://github.com/mroeschke) +- Align TimedeltaIndex APIs with pandas 2.x ([#16368](https://github.com/rapidsai/cudf/pull/16368)) [@mroeschke](https://github.com/mroeschke) +- Align DatetimeIndex APIs with pandas 2.x ([#16367](https://github.com/rapidsai/cudf/pull/16367)) [@mroeschke](https://github.com/mroeschke) +- fix [tool.setuptools] reference in custreamz config ([#16365](https://github.com/rapidsai/cudf/pull/16365)) [@jameslamb](https://github.com/jameslamb) +- Align Index APIs with pandas 2.x ([#16361](https://github.com/rapidsai/cudf/pull/16361)) [@mroeschke](https://github.com/mroeschke) +- Rebuild for & Support NumPy 2 ([#16300](https://github.com/rapidsai/cudf/pull/16300)) [@jakirkham](https://github.com/jakirkham) +- Add `stream` param to stream compaction APIs ([#16295](https://github.com/rapidsai/cudf/pull/16295)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub) +- Added batch memset to memset data and validity buffers in parquet reader ([#16281](https://github.com/rapidsai/cudf/pull/16281)) [@sdrp713](https://github.com/sdrp713) +- Deduplicate decimal32/decimal64 to decimal128 conversion function ([#16236](https://github.com/rapidsai/cudf/pull/16236)) [@mhaseeb123](https://github.com/mhaseeb123) +- Refactor mixed_semi_join using cuco::static_set ([#16230](https://github.com/rapidsai/cudf/pull/16230)) [@srinivasyadav18](https://github.com/srinivasyadav18) +- Improve performance of hash_character_ngrams using warp-per-string kernel 
([#16212](https://github.com/rapidsai/cudf/pull/16212)) [@davidwendt](https://github.com/davidwendt) +- Add environment variable to log cudf.pandas fallback calls ([#16161](https://github.com/rapidsai/cudf/pull/16161)) [@mroeschke](https://github.com/mroeschke) +- Add libcudf example with large strings ([#15983](https://github.com/rapidsai/cudf/pull/15983)) [@davidwendt](https://github.com/davidwendt) +- JSON tree algorithms refactor I: CSR data structure for column tree ([#15979](https://github.com/rapidsai/cudf/pull/15979)) [@shrshi](https://github.com/shrshi) +- Support multiple new-line characters in regex APIs ([#15961](https://github.com/rapidsai/cudf/pull/15961)) [@davidwendt](https://github.com/davidwendt) +- adding wheel build for libcudf ([#15483](https://github.com/rapidsai/cudf/pull/15483)) [@msarahan](https://github.com/msarahan) +- Replace usages of `thrust::optional` with `std::optional` ([#15091](https://github.com/rapidsai/cudf/pull/15091)) [@miscco](https://github.com/miscco) + # cudf 24.08.00 (7 Aug 2024) ## 🚨 Breaking Changes From dfdae599622841bf3f4d523c01eee3ae1fe933f0 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Wed, 9 Oct 2024 14:02:28 -0400 Subject: [PATCH 042/117] Use std::optional for host types (#17015) cuda::std::optional shouldn't be used for host types such as `std::vector` as it requires the constructors of the `T` types to be host+device. Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) - MithunR (https://github.com/mythrocks) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/17015 --- .../io/parquet/compact_protocol_reader.cpp | 8 +-- cpp/src/io/parquet/parquet.hpp | 64 +++++++++---------- cpp/src/io/parquet/parquet_gpu.hpp | 14 ++-- cpp/src/io/parquet/predicate_pushdown.cpp | 6 +- cpp/src/io/parquet/reader_impl.cpp | 2 +- cpp/src/io/parquet/reader_impl_chunking.cu | 8 +-- cpp/src/io/parquet/reader_impl_helpers.cpp | 6 +- cpp/src/io/parquet/writer_impl.cu | 8 +-- cpp/tests/io/parquet_common.cpp | 2 +- cpp/tests/io/parquet_common.hpp | 2 +- 10 files changed, 59 insertions(+), 61 deletions(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 312a5243687..d276e946a51 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -309,10 +309,10 @@ class parquet_field_struct : public parquet_field { template class parquet_field_union_struct : public parquet_field { E& enum_val; - cuda::std::optional& val; // union structs are always wrapped in std::optional + std::optional& val; // union structs are always wrapped in std::optional public: - parquet_field_union_struct(int f, E& ev, cuda::std::optional& v) + parquet_field_union_struct(int f, E& ev, std::optional& v) : parquet_field(f), enum_val(ev), val(v) { } @@ -439,10 +439,10 @@ class parquet_field_struct_blob : public parquet_field { */ template class parquet_field_optional : public parquet_field { - cuda::std::optional& val; + std::optional& val; public: - parquet_field_optional(int f, cuda::std::optional& v) : parquet_field(f), val(v) {} + parquet_field_optional(int f, std::optional& v) : parquet_field(f), val(v) {} inline void operator()(CompactProtocolReader* cpr, int field_type) { diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 7c985643887..2851ef67a65 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -20,8 +20,6 
@@ #include -#include - #include #include #include @@ -94,10 +92,10 @@ struct LogicalType { BSON }; Type type; - cuda::std::optional decimal_type; - cuda::std::optional time_type; - cuda::std::optional timestamp_type; - cuda::std::optional int_type; + std::optional decimal_type; + std::optional time_type; + std::optional timestamp_type; + std::optional int_type; LogicalType(Type tp = UNDEFINED) : type(tp) {} LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {} @@ -178,21 +176,21 @@ struct SchemaElement { // 5: nested fields int32_t num_children = 0; // 6: DEPRECATED: record the original type before conversion to parquet type - cuda::std::optional converted_type; + std::optional converted_type; // 7: DEPRECATED: record the scale for DECIMAL converted type int32_t decimal_scale = 0; // 8: DEPRECATED: record the precision for DECIMAL converted type int32_t decimal_precision = 0; // 9: save field_id from original schema - cuda::std::optional field_id; + std::optional field_id; // 10: replaces converted type - cuda::std::optional logical_type; + std::optional logical_type; // extra cudf specific fields bool output_as_byte_array = false; // cudf type determined from arrow:schema - cuda::std::optional arrow_type; + std::optional arrow_type; // The following fields are filled in later during schema initialization int max_definition_level = 0; @@ -258,21 +256,21 @@ struct SchemaElement { */ struct Statistics { // deprecated max value in signed comparison order - cuda::std::optional> max; + std::optional> max; // deprecated min value in signed comparison order - cuda::std::optional> min; + std::optional> min; // count of null values in the column - cuda::std::optional null_count; + std::optional null_count; // count of distinct values occurring - cuda::std::optional distinct_count; + std::optional distinct_count; // max value for column determined by ColumnOrder - cuda::std::optional> max_value; + std::optional> max_value; // min value for column determined by ColumnOrder - cuda::std::optional> min_value; + std::optional> min_value; // If true, max_value is the actual maximum value for a column - cuda::std::optional is_max_value_exact; + std::optional is_max_value_exact; // If true, min_value is the actual minimum value for a column - cuda::std::optional is_min_value_exact; + std::optional is_min_value_exact; }; /** @@ -281,7 +279,7 @@ struct Statistics { struct SizeStatistics { // Number of variable-width bytes stored for the page/chunk. Should not be set for anything // but the BYTE_ARRAY physical type. - cuda::std::optional unencoded_byte_array_data_bytes; + std::optional unencoded_byte_array_data_bytes; /** * When present, there is expected to be one element corresponding to each * repetition (i.e. size=max repetition_level+1) where each element @@ -290,14 +288,14 @@ struct SizeStatistics { * * This value should not be written if max_repetition_level is 0. */ - cuda::std::optional> repetition_level_histogram; + std::optional> repetition_level_histogram; /** * Same as repetition_level_histogram except for definition levels. * * This value should not be written if max_definition_level is 0 or 1. */ - cuda::std::optional> definition_level_histogram; + std::optional> definition_level_histogram; }; /** @@ -318,7 +316,7 @@ struct OffsetIndex { std::vector page_locations; // per-page size info. see description of the same field in SizeStatistics. only present for // columns with a BYTE_ARRAY physical type. 
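   // Host-side metadata only, so a host-only optional type is sufficient here.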
- cuda::std::optional> unencoded_byte_array_data_bytes; + std::optional> unencoded_byte_array_data_bytes; }; /** @@ -329,11 +327,11 @@ struct ColumnIndex { std::vector> min_values; // lower bound for values in each page std::vector> max_values; // upper bound for values in each page BoundaryOrder boundary_order = - BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered - cuda::std::optional> null_counts; // Optional count of null values per page + BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered + std::optional> null_counts; // Optional count of null values per page // Repetition/definition level histograms for the column chunk - cuda::std::optional> repetition_level_histogram; - cuda::std::optional> definition_level_histogram; + std::optional> repetition_level_histogram; + std::optional> definition_level_histogram; }; /** @@ -383,11 +381,11 @@ struct ColumnChunkMetaData { Statistics statistics; // Set of all encodings used for pages in this column chunk. This information can be used to // determine if all data pages are dictionary encoded for example. - cuda::std::optional> encoding_stats; + std::optional> encoding_stats; // Optional statistics to help estimate total memory when converted to in-memory representations. // The histograms contained in these statistics can also be useful in some cases for more // fine-grained nullability/list length filter pushdown. - cuda::std::optional size_statistics; + std::optional size_statistics; }; /** @@ -429,13 +427,13 @@ struct RowGroup { int64_t num_rows = 0; // If set, specifies a sort ordering of the rows in this RowGroup. // The sorting columns can be a subset of all the columns. - cuda::std::optional> sorting_columns; + std::optional> sorting_columns; // Byte offset from beginning of file to first page (data or dictionary) in this row group - cuda::std::optional file_offset; + std::optional file_offset; // Total byte size of all compressed (and potentially encrypted) column data in this row group - cuda::std::optional total_compressed_size; + std::optional total_compressed_size; // Row group ordinal in the file - cuda::std::optional ordinal; + std::optional ordinal; }; /** @@ -460,7 +458,7 @@ struct FileMetaData { std::vector row_groups; std::vector key_value_metadata; std::string created_by = ""; - cuda::std::optional> column_orders; + std::optional> column_orders; }; /** diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index a8ba3a969ce..4f6d41a97da 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -395,7 +395,7 @@ struct ColumnChunkDesc { uint8_t def_level_bits_, uint8_t rep_level_bits_, Compression codec_, - cuda::std::optional logical_type_, + std::optional logical_type_, int32_t ts_clock_rate_, int32_t src_col_index_, int32_t src_col_schema_, @@ -441,12 +441,12 @@ struct ColumnChunkDesc { int32_t num_data_pages{}; // number of data pages int32_t num_dict_pages{}; // number of dictionary pages PageInfo const* dict_page{}; - string_index_pair* str_dict_index{}; // index for string dictionary - bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column - void** column_data_base{}; // base pointers of column data - void** column_string_base{}; // base pointers of column string data - Compression codec{}; // compressed codec enum - cuda::std::optional logical_type{}; // logical type + string_index_pair* str_dict_index{}; // index for string dictionary + bitmask_type** valid_map_base{}; // base 
pointers of valid bit map for this column + void** column_data_base{}; // base pointers of column data + void** column_string_base{}; // base pointers of column string data + Compression codec{}; // compressed codec enum + std::optional logical_type{}; // logical type int32_t ts_clock_rate{}; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) int32_t src_col_index{}; // my input column index diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index b90ca36c8c7..f0a0bc0b51b 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -152,7 +152,7 @@ struct stats_caster { } void set_index(size_type index, - cuda::std::optional> const& binary_value, + std::optional> const& binary_value, Type const type) { if (binary_value.has_value()) { @@ -234,8 +234,8 @@ struct stats_caster { max.set_index(stats_idx, max_value, colchunk.meta_data.type); } else { // Marking it null, if column present in row group - min.set_index(stats_idx, cuda::std::nullopt, {}); - max.set_index(stats_idx, cuda::std::nullopt, {}); + min.set_index(stats_idx, std::nullopt, {}); + max.set_index(stats_idx, std::nullopt, {}); } stats_idx++; } diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 1b69ccb7742..f0865c715bc 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -38,7 +38,7 @@ namespace { // be treated as a string. Currently the only logical type that has special handling is DECIMAL. // Other valid types in the future would be UUID (still treated as string) and FLOAT16 (which // for now would also be treated as a string). -inline bool is_treat_fixed_length_as_string(cuda::std::optional const& logical_type) +inline bool is_treat_fixed_length_as_string(std::optional const& logical_type) { if (!logical_type.has_value()) { return true; } return logical_type->type != LogicalType::DECIMAL; diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index c588fedb85c..27312a4da89 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -371,11 +371,11 @@ int64_t find_next_split(int64_t cur_pos, * * @return A tuple of Parquet clock rate and Parquet decimal type. */ -[[nodiscard]] std::tuple> conversion_info( +[[nodiscard]] std::tuple> conversion_info( type_id column_type_id, type_id timestamp_type_id, Type physical, - cuda::std::optional logical_type) + std::optional logical_type) { int32_t const clock_rate = is_chrono(data_type{column_type_id}) ? 
to_clockrate(timestamp_type_id) : 0; @@ -386,11 +386,11 @@ int64_t find_next_split(int64_t cur_pos, // if decimal but not outputting as float or decimal, then convert to no logical type if (column_type_id != type_id::FLOAT64 and not cudf::is_fixed_point(data_type{column_type_id})) { - return std::make_tuple(clock_rate, cuda::std::nullopt); + return {clock_rate, std::nullopt}; } } - return std::make_tuple(clock_rate, std::move(logical_type)); + return {clock_rate, std::move(logical_type)}; } /** diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 6d566b5815e..a6562d33de2 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -38,7 +38,7 @@ namespace flatbuf = cudf::io::parquet::flatbuf; namespace { -cuda::std::optional converted_to_logical_type(SchemaElement const& schema) +std::optional converted_to_logical_type(SchemaElement const& schema) { if (schema.converted_type.has_value()) { switch (schema.converted_type.value()) { @@ -66,7 +66,7 @@ cuda::std::optional converted_to_logical_type(SchemaElement const& default: return LogicalType{LogicalType::UNDEFINED}; } } - return cuda::std::nullopt; + return std::nullopt; } } // namespace @@ -246,7 +246,7 @@ void metadata::sanitize_schema() struct_elem.repetition_type = REQUIRED; struct_elem.num_children = schema_elem.num_children; struct_elem.type = UNDEFINED_TYPE; - struct_elem.converted_type = cuda::std::nullopt; + struct_elem.converted_type = std::nullopt; // swap children struct_elem.children_idx = std::move(schema_elem.children_idx); diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index ec05f35d405..190f13eb688 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -186,7 +186,7 @@ struct aggregate_writer_metadata { std::vector> column_indexes; }; std::vector files; - cuda::std::optional> column_orders = cuda::std::nullopt; + std::optional> column_orders = std::nullopt; }; namespace { @@ -472,7 +472,7 @@ struct leaf_schema_fn { std::enable_if_t, void> operator()() { col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; - col_schema.converted_type = cuda::std::nullopt; + col_schema.converted_type = std::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; if (timestamp_is_int96) { col_schema.ts_scale = -1000; // negative value indicates division by absolute value @@ -750,7 +750,7 @@ std::vector construct_parquet_schema_tree( col_schema.type = Type::BYTE_ARRAY; } - col_schema.converted_type = cuda::std::nullopt; + col_schema.converted_type = std::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_byte_array; col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED; col_schema.name = (schema[parent_idx].name == "list") ? 
"element" : col_meta.get_name(); @@ -2795,7 +2795,7 @@ std::unique_ptr> writer::merge_row_group_metadata( // See https://github.com/rapidsai/cudf/pull/14264#issuecomment-1778311615 for (auto& se : md.schema) { if (se.logical_type.has_value() && se.logical_type.value().type == LogicalType::UNKNOWN) { - se.logical_type = cuda::std::nullopt; + se.logical_type = std::nullopt; } } diff --git a/cpp/tests/io/parquet_common.cpp b/cpp/tests/io/parquet_common.cpp index 6141a40bc95..a1b8677eac8 100644 --- a/cpp/tests/io/parquet_common.cpp +++ b/cpp/tests/io/parquet_common.cpp @@ -744,7 +744,7 @@ int32_t compare(T& v1, T& v2) int32_t compare_binary(std::vector const& v1, std::vector const& v2, cudf::io::parquet::detail::Type ptype, - cuda::std::optional const& ctype) + std::optional const& ctype) { auto ctype_val = ctype.value_or(cudf::io::parquet::detail::UNKNOWN); switch (ptype) { diff --git a/cpp/tests/io/parquet_common.hpp b/cpp/tests/io/parquet_common.hpp index bd1579eaa1b..c90b81ed27a 100644 --- a/cpp/tests/io/parquet_common.hpp +++ b/cpp/tests/io/parquet_common.hpp @@ -172,7 +172,7 @@ std::pair create_parquet_typed_with_stats(std::string int32_t compare_binary(std::vector const& v1, std::vector const& v2, cudf::io::parquet::detail::Type ptype, - cuda::std::optional const& ctype); + std::optional const& ctype); void expect_compression_stats_empty(std::shared_ptr stats); From bd51a25ea6fdab6ab11e95e2c8192ed7eee43e75 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 9 Oct 2024 16:05:05 -0400 Subject: [PATCH 043/117] [DOC] Document limitation using `cudf.pandas` proxy arrays (#16955) When instantiating a `cudf.pandas` proxy array, a DtoH transfer occurs so that the data buffer is set correctly. We do this because functions which utilize NumPy's C API can utilize the data buffer directly instead of going through `__array__`. This PR documents this limitation. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16955 --- docs/cudf/source/cudf_pandas/faq.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md index 34b657488c1..5024747227e 100644 --- a/docs/cudf/source/cudf_pandas/faq.md +++ b/docs/cudf/source/cudf_pandas/faq.md @@ -181,6 +181,32 @@ There are a few known limitations that you should be aware of: ``` - `cudf.pandas` (and cuDF in general) is only compatible with pandas 2. Version 24.02 of cudf was the last to support pandas 1.5.x. +- In order for `cudf.pandas` to produce a proxy array that ducktypes as a NumPy + array, we create a proxy type that actually subclasses `numpy.ndarray`. We can + verify this with an isinstance check. + + ```python + %load_ext cudf.pandas + import pandas as pd + import numpy as np + + arr = pd.Series([1, 1, 2]).unique() # returns a proxy array + isinstance(arr, np.ndarray) # returns True, where arr is a proxy array + ``` + Because the proxy type ducktypes as a NumPy array, NumPy functions may attempt to + access internal members, such as the [data buffer](https://numpy.org/doc/stable/dev/internals.html#internal-organization-of-numpy-arrays), via the NumPy C API. + However, our proxy mechanism is designed to proxy function calls at the Python + level, which is incompatible with these types of accesses. 
To handle these + situations, we perform an eager device-to-host (DtoH) copy, which sets the data + buffer correctly but incurs the cost of extra time when creating the proxy array. + In the previous example, creating `arr` performed this kind of implicit DtoH transfer. + + With this approach, we also get compatibility with third-party libraries like `torch`. + + ```python + import torch + x = torch.from_numpy(arr) + ``` ## Can I force running on the CPU? From c7b51195c675af47d0f3dd69c04d0fcc6920eca5 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 9 Oct 2024 15:17:32 -0700 Subject: [PATCH 044/117] Fix `host_span` constructor to correctly copy `is_device_accessible` (#17020) One of the `host_span` constructors was not updated when we added `is_device_accessible`, so the value was not assigned. This PR fixes this simple error and adds tests that check that this property is correctly set when creating `host_span`s. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/17020 --- cpp/include/cudf/utilities/span.hpp | 2 +- .../utilities_tests/pinned_memory_tests.cpp | 20 +++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 914731ea417..f3e1a61d075 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -288,7 +288,7 @@ struct host_span : public cudf::detail::span_base, // NOLINT void>* = nullptr> constexpr host_span(host_span const& other) noexcept - : base(other.data(), other.size()) + : base(other.data(), other.size()), _is_device_accessible{other.is_device_accessible()} { } diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp index ae7c6fa8b8c..1e1e21fe18a 100644 --- a/cpp/tests/utilities_tests/pinned_memory_tests.cpp +++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -125,3 +126,22 @@ TEST_F(PinnedMemoryTest, MakeHostVector) EXPECT_FALSE(vec.get_allocator().is_device_accessible()); } } + +TEST_F(PinnedMemoryTest, HostSpan) +{ + auto test_ctors = [](auto&& vec) { + auto const is_vec_device_accessible = vec.get_allocator().is_device_accessible(); + // Test conversion from a vector + auto const span = cudf::host_span{vec}; + EXPECT_EQ(span.is_device_accessible(), is_vec_device_accessible); + // Test conversion from host_span with different type + auto const span_converted = cudf::host_span{span}; + EXPECT_EQ(span_converted.is_device_accessible(), is_vec_device_accessible); + }; + + cudf::set_allocate_host_as_pinned_threshold(7); + for (int i = 1; i < 10; i++) { + // some iterations will use pinned memory, some will not + test_ctors(cudf::detail::make_host_vector(i, cudf::get_default_stream())); + } +} From 3791c8a9d1aeb7474bb9ef324a089a569183406c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 9 Oct 2024 13:45:02 -1000 Subject: [PATCH 045/117] Add string.convert_floats APIs to pylibcudf (#16990) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/16990 --- python/cudf/cudf/_lib/string_casting.pyx | 34 ++----
.../_lib/strings/convert/convert_floats.pyx | 24 ++--- .../strings/convert/convert_floats.pxd | 6 +- .../pylibcudf/strings/convert/CMakeLists.txt | 2 +- .../pylibcudf/strings/convert/__init__.pxd | 1 + .../pylibcudf/strings/convert/__init__.py | 1 + .../strings/convert/convert_floats.pxd | 11 ++ .../strings/convert/convert_floats.pyx | 101 ++++++++++++++++++ .../tests/test_string_convert_floats.py | 33 ++++++ 9 files changed, 165 insertions(+), 48 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index d9595f4ab0a..93b67bd4c9d 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -10,10 +10,6 @@ from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_floats cimport ( - from_floats as cpp_from_floats, - to_floats as cpp_to_floats, -) from pylibcudf.libcudf.strings.convert.convert_integers cimport ( from_integers as cpp_from_integers, hex_to_integers as cpp_hex_to_integers, @@ -33,32 +29,18 @@ from cudf._lib.types cimport dtype_to_pylibcudf_type def floating_to_string(Column input_col): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_floats( - input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_floats.from_floats( + input_col.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) def string_to_floating(Column input_col, object out_type): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type] - ) + plc_column = plc.strings.convert.convert_floats.to_floats( + input_col.to_pylibcudf(mode="read"), + dtype_to_pylibcudf_type(out_type) ) - cdef data_type c_out_type = data_type(tid) - with nogil: - c_result = move( - cpp_to_floats( - input_column_view, - c_out_type)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) def dtos(Column input_col): diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx index 7965b588703..5da6e3f10cc 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx @@ -1,18 +1,11 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_floats cimport ( - is_float as cpp_is_float, -) - from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def is_float(Column source_strings): @@ -20,12 +13,7 @@ def is_float(Column source_strings): Returns a Column of boolean values with True for `source_strings` that have floats. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_float( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_floats.is_float( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd index f4fc4674506..a45c7f9979e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd @@ -9,12 +9,12 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_floats.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_floats( - column_view input_col, + column_view strings, data_type output_type) except + cdef unique_ptr[column] from_floats( - column_view input_col) except + + column_view floats) except + cdef unique_ptr[column] is_float( - column_view source_strings + column_view input ) except + diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index 41aeb72039b..7b228c06a18 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx - convert_fixed_point.pyx convert_ipv4.pyx convert_urls.pyx + convert_fixed_point.pyx convert_floats.pyx convert_ipv4.pyx convert_urls.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index b4b0b521e39..be6145384ad 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -4,6 +4,7 @@ from . cimport ( convert_datetime, convert_durations, convert_fixed_point, + convert_floats, convert_ipv4, convert_urls, ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index 409620fce45..7c94387282b 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -4,6 +4,7 @@ convert_datetime, convert_durations, convert_fixed_point, + convert_floats, convert_ipv4, convert_urls, ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd new file mode 100644 index 00000000000..1284ff552aa --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_floats(Column strings, DataType output_type) + +cpdef Column from_floats(Column floats) + +cpdef Column is_float(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx new file mode 100644 index 00000000000..8081aadb085 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx @@ -0,0 +1,101 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_floats as cpp_convert_floats, +) +from pylibcudf.types cimport DataType + + +cpdef Column to_floats(Column strings, DataType output_type): + """ + Returns a new numeric column by parsing float values from each string + in the provided strings column. + + For details, see :cpp:func:`cudf::strings::to_floats` + + Parameters + ---------- + strings : Column + Strings instance for this operation. + + output_type : DataType + Type of float numeric column to return. + + Returns + ------- + Column + New column with floats converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_floats.to_floats( + strings.view(), + output_type.c_obj, + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column from_floats(Column floats): + """ + Returns a new strings column converting the float values from the + provided column into strings. + + For details, see :cpp:func:`cudf::strings::from_floats` + + Parameters + ---------- + floats : Column + Numeric column to convert. + + Returns + ------- + Column + New strings column with floats as strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_floats.from_floats( + floats.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_float(Column input): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to floats. + + For details, see :cpp:func:`cudf::strings::is_float` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_floats.is_float( + input.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py new file mode 100644 index 00000000000..e9918fab559 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py @@ -0,0 +1,33 @@ +# Copyright (c) 2024, NVIDIA CORPORATION.
+ +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_to_floats(): + typ = pa.float32() + arr = pa.array(["-1.23", "1", None]) + result = plc.strings.convert.convert_floats.to_floats( + plc.interop.from_arrow(arr), plc.interop.from_arrow(typ) + ) + expected = arr.cast(typ) + assert_column_eq(result, expected) + + +def test_from_floats(): + arr = pa.array([-1.23, 1, None]) + result = plc.strings.convert.convert_floats.from_floats( + plc.interop.from_arrow(arr), + ) + expected = pa.array(["-1.23", "1.0", None]) + assert_column_eq(result, expected) + + +def test_is_float(): + arr = pa.array(["-1.23", "1", "1.2.3", "A", None]) + result = plc.strings.convert.convert_floats.is_float( + plc.interop.from_arrow(arr), + ) + expected = pa.array([True, True, False, False, None]) + assert_column_eq(result, expected) From 31423d056c45bd6352f0c611ed5e63423b09b954 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 9 Oct 2024 21:01:30 -0400 Subject: [PATCH 046/117] Update all rmm imports to use pylibrmm/librmm (#16913) This PR updates all the RMM imports to use pylibrmm/librmm now that `rmm._lib` is deprecated. It should be merged after [rmm/1676](https://github.com/rapidsai/rmm/pull/1676). Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Charles Blackmon-Luca (https://github.com/charlesbluca) URL: https://github.com/rapidsai/cudf/pull/16913 --- docs/cudf/source/conf.py | 5 ++++- python/cudf/cudf/_lib/column.pxd | 2 +- python/cudf/cudf/_lib/column.pyx | 2 +- python/cudf/cudf/_lib/copying.pyx | 2 +- python/cudf/cudf/_lib/scalar.pxd | 2 +- python/cudf/cudf/_lib/strings_udf.pyx | 3 ++- python/cudf/cudf/core/buffer/buffer.py | 2 +- python/cudf/cudf/core/buffer/spillable_buffer.py | 4 ++-- python/cudf/cudf/core/udf/strings_typing.py | 2 +- python/pylibcudf/pylibcudf/column.pyx | 2 +- python/pylibcudf/pylibcudf/join.pyx | 2 +- python/pylibcudf/pylibcudf/libcudf/column/column.pxd | 2 +- .../pylibcudf/libcudf/column/column_factories.pxd | 2 +- python/pylibcudf/pylibcudf/libcudf/concatenate.pxd | 2 +- python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd | 2 +- python/pylibcudf/pylibcudf/libcudf/copying.pxd | 2 +- python/pylibcudf/pylibcudf/libcudf/join.pxd | 2 +- python/pylibcudf/pylibcudf/libcudf/null_mask.pxd | 2 +- python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd | 2 +- python/pylibcudf/pylibcudf/libcudf/transform.pxd | 2 +- python/pylibcudf/pylibcudf/null_mask.pxd | 2 +- python/pylibcudf/pylibcudf/null_mask.pyx | 7 ++++--- python/pylibcudf/pylibcudf/scalar.pxd | 2 +- python/pylibcudf/pylibcudf/scalar.pyx | 2 +- python/pylibcudf/pylibcudf/transform.pyx | 3 ++- 25 files changed, 34 insertions(+), 28 deletions(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 95813907bf4..ecf619ddc44 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -342,7 +342,10 @@ def clean_all_xml_files(path): "cudf.Series": ("cudf.core.series.Series", "cudf.Series"), "cudf.Index": ("cudf.core.index.Index", "cudf.Index"), "cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"), - "DeviceBuffer": ("rmm._lib.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"), + # TODO: Replace the first entry in a follow-up with rmm.pylibrmm.device_buffer.DeviceBuffer + # when the RMM objects inventory is generated from branch-24.12.
The RMM objects inventory + # can be accessed here : https://docs.rapids.ai/api/rmm/nightly/objects.inv + "DeviceBuffer": ("rmm.DeviceBuffer", "rmm.DeviceBuffer"), } diff --git a/python/cudf/cudf/_lib/column.pxd b/python/cudf/cudf/_lib/column.pxd index 8ceea4920e2..8b1d16f0d85 100644 --- a/python/cudf/cudf/_lib/column.pxd +++ b/python/cudf/cudf/_lib/column.pxd @@ -11,7 +11,7 @@ from pylibcudf.libcudf.column.column_view cimport ( mutable_column_view, ) from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef class Column: diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 99e4c21df8a..065655505b8 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -28,7 +28,7 @@ from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector -from rmm._lib.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from cudf._lib.types cimport ( dtype_from_column_view, diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 49714091f46..30353c4be6c 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -8,7 +8,7 @@ from libcpp.memory cimport make_shared, shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector -from rmm._lib.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer import pylibcudf diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index 27095ca02d4..0f9820ed1db 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -4,7 +4,7 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.scalar.scalar cimport scalar -from rmm._lib.memory_resource cimport DeviceMemoryResource +from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource cdef class DeviceScalar: diff --git a/python/cudf/cudf/_lib/strings_udf.pyx b/python/cudf/cudf/_lib/strings_udf.pyx index 78fc9f08bd8..dd2fafbe07f 100644 --- a/python/cudf/cudf/_lib/strings_udf.pyx +++ b/python/cudf/cudf/_lib/strings_udf.pyx @@ -23,7 +23,8 @@ from pylibcudf.libcudf.strings_udf cimport ( to_string_view_array as cpp_to_string_view_array, udf_string, ) -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from rmm.librmm.device_buffer cimport device_buffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from cudf._lib.column cimport Column diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 32ae8c5ee53..caff019f575 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -284,7 +284,7 @@ def memoryview( """Read-only access to the buffer through host memory.""" size = self._size if size is None else size host_buf = host_memory_allocation(size) - rmm._lib.device_buffer.copy_ptr_to_host( + rmm.pylibrmm.device_buffer.copy_ptr_to_host( self.get_ptr(mode="read") + offset, host_buf ) return memoryview(host_buf).toreadonly() diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index 4c9e524ee05..b40c56c9a6b 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -207,7 +207,7 @@ def spill(self, target: str = "cpu") -> None: domain="cudf_python-spill", ): 
host_mem = host_memory_allocation(self.size) - rmm._lib.device_buffer.copy_ptr_to_host( + rmm.pylibrmm.device_buffer.copy_ptr_to_host( self._ptr, host_mem ) self._ptr_desc["memoryview"] = host_mem @@ -352,7 +352,7 @@ def memoryview( else: assert self._ptr_desc["type"] == "gpu" ret = host_memory_allocation(size) - rmm._lib.device_buffer.copy_ptr_to_host( + rmm.pylibrmm.device_buffer.copy_ptr_to_host( self._ptr + offset, ret ) return ret diff --git a/python/cudf/cudf/core/udf/strings_typing.py b/python/cudf/cudf/core/udf/strings_typing.py index 43604ab21a7..a0cbe7ada19 100644 --- a/python/cudf/cudf/core/udf/strings_typing.py +++ b/python/cudf/cudf/core/udf/strings_typing.py @@ -99,7 +99,7 @@ def prepare_args(self, ty, val, **kwargs): ty.dtype, (StringView, UDFString) ): return types.uint64, val.ptr if isinstance( - val, rmm._lib.device_buffer.DeviceBuffer + val, rmm.pylibrmm.device_buffer.DeviceBuffer ) else val.get_ptr(mode="read") else: return ty, val diff --git a/python/pylibcudf/pylibcudf/column.pyx b/python/pylibcudf/pylibcudf/column.pyx index a37a12fc7e1..03808f0b664 100644 --- a/python/pylibcudf/pylibcudf/column.pyx +++ b/python/pylibcudf/pylibcudf/column.pyx @@ -8,7 +8,7 @@ from pylibcudf.libcudf.column.column_factories cimport make_column_from_scalar from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from .gpumemoryview cimport gpumemoryview from .scalar cimport Scalar diff --git a/python/pylibcudf/pylibcudf/join.pyx b/python/pylibcudf/pylibcudf/join.pyx index 25664286f19..b019ed8f099 100644 --- a/python/pylibcudf/pylibcudf/join.pyx +++ b/python/pylibcudf/pylibcudf/join.pyx @@ -9,7 +9,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport null_equality -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer from .column cimport Column from .table cimport Table diff --git a/python/pylibcudf/pylibcudf/libcudf/column/column.pxd b/python/pylibcudf/pylibcudf/libcudf/column/column.pxd index 7a369701bbd..76f35cbba71 100644 --- a/python/pylibcudf/pylibcudf/libcudf/column/column.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/column/column.pxd @@ -9,7 +9,7 @@ from pylibcudf.libcudf.column.column_view cimport ( ) from pylibcudf.libcudf.types cimport data_type, size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/column/column.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd b/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd index f1a326bcd40..b2388858127 100644 --- a/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd @@ -11,7 +11,7 @@ from pylibcudf.libcudf.types cimport ( type_id, ) -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd index 92f5a185a54..a09b6c01392 100644 --- a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd @@ -6,7 +6,7 @@ from pylibcudf.libcudf.column.column 
cimport column, column_view from pylibcudf.libcudf.table.table cimport table, table_view from pylibcudf.libcudf.utilities.host_span cimport host_span -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/concatenate.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd index cadac6a0022..6de9c4382d3 100644 --- a/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd @@ -6,7 +6,7 @@ from libcpp.vector cimport vector from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/contiguous_split.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/copying.pxd b/python/pylibcudf/pylibcudf/libcudf/copying.pxd index 4d4a4ba9b89..e6e719d6436 100644 --- a/python/pylibcudf/pylibcudf/libcudf/copying.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/copying.pxd @@ -16,7 +16,7 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer ctypedef const scalar constscalar diff --git a/python/pylibcudf/pylibcudf/libcudf/join.pxd b/python/pylibcudf/pylibcudf/libcudf/join.pxd index 6f6c145b23c..21033a0284e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/join.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/join.pxd @@ -9,7 +9,7 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport null_equality, size_type -from rmm._lib.device_uvector cimport device_uvector +from rmm.librmm.device_uvector cimport device_uvector ctypedef unique_ptr[device_uvector[size_type]] gather_map_type ctypedef pair[gather_map_type, gather_map_type] gather_map_pair_type diff --git a/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd index 5f582091b06..27af4a3bdb1 100644 --- a/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd @@ -6,7 +6,7 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport bitmask_type, mask_state, size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/null_mask.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd b/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd index 0c8fe1060ac..2eca043e451 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd @@ -8,7 +8,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/strings/udf/udf_string.hpp" namespace \ diff --git a/python/pylibcudf/pylibcudf/libcudf/transform.pxd b/python/pylibcudf/pylibcudf/libcudf/transform.pxd index 
38298a7c1f1..d21510bd731 100644 --- a/python/pylibcudf/pylibcudf/libcudf/transform.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/transform.pxd @@ -11,7 +11,7 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport bitmask_type, data_type, size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/null_mask.pxd index ab5c0080312..9bdfaee2842 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pxd +++ b/python/pylibcudf/pylibcudf/null_mask.pxd @@ -2,7 +2,7 @@ from pylibcudf.libcudf.types cimport mask_state, size_type -from rmm._lib.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from .column cimport Column diff --git a/python/pylibcudf/pylibcudf/null_mask.pyx b/python/pylibcudf/pylibcudf/null_mask.pyx index 5bdde06f21f..aae39987dac 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pyx +++ b/python/pylibcudf/pylibcudf/null_mask.pyx @@ -6,7 +6,8 @@ from libcpp.utility cimport move from pylibcudf.libcudf cimport null_mask as cpp_null_mask from pylibcudf.libcudf.types cimport mask_state, size_type -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from rmm.librmm.device_buffer cimport device_buffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from pylibcudf.libcudf.types import mask_state as MaskState # no-cython-lint @@ -31,8 +32,8 @@ cpdef DeviceBuffer copy_bitmask(Column col): Returns ------- rmm.DeviceBuffer - A ``DeviceBuffer`` containing ``col``'s bitmask, or an empty ``DeviceBuffer`` - if ``col`` is not nullable + A ``DeviceBuffer`` containing ``col``'s bitmask, or an empty + ``DeviceBuffer`` if ``col`` is not nullable """ cdef device_buffer db diff --git a/python/pylibcudf/pylibcudf/scalar.pxd b/python/pylibcudf/pylibcudf/scalar.pxd index 8664dfa4b7e..a273647c98d 100644 --- a/python/pylibcudf/pylibcudf/scalar.pxd +++ b/python/pylibcudf/pylibcudf/scalar.pxd @@ -4,7 +4,7 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.scalar.scalar cimport scalar -from rmm._lib.memory_resource cimport DeviceMemoryResource +from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from .column cimport Column from .types cimport DataType diff --git a/python/pylibcudf/pylibcudf/scalar.pyx b/python/pylibcudf/pylibcudf/scalar.pyx index 3e20938af0c..d4888a62ad1 100644 --- a/python/pylibcudf/pylibcudf/scalar.pyx +++ b/python/pylibcudf/pylibcudf/scalar.pyx @@ -6,7 +6,7 @@ from libcpp.utility cimport move from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.scalar.scalar_factories cimport make_empty_scalar_like -from rmm._lib.memory_resource cimport get_current_device_resource +from rmm.pylibrmm.memory_resource cimport get_current_device_resource from .column cimport Column from .types cimport DataType diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx index de425a27c15..74134caeb78 100644 --- a/python/pylibcudf/pylibcudf/transform.pyx +++ b/python/pylibcudf/pylibcudf/transform.pyx @@ -9,7 +9,8 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport bitmask_type, size_type -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from 
rmm.librmm.device_buffer cimport device_buffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from .column cimport Column from .gpumemoryview cimport gpumemoryview From 7173b52fce25937bb69e22a083a5de4655078fa1 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 10 Oct 2024 08:48:05 -0400 Subject: [PATCH 047/117] Fix regex parsing logic handling of nested quantifiers (#16798) Fixes the libcudf regex parsing logic when handling nested fixed quantifiers. The logic handles fixed quantifiers by simply repeating the previous instruction. If the previous item is a group (capture or non-capture), that group may also contain an internal fixed quantifier as well. Found while working on #16730 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16798 --- cpp/src/strings/regex/regcomp.cpp | 40 +++++++++++++++++++--------- cpp/tests/strings/contains_tests.cpp | 14 ++++++++++ cpp/tests/strings/extract_tests.cpp | 16 ++++++++++- 3 files changed, 56 insertions(+), 14 deletions(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 51c6e765edd..775a2580f60 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -716,13 +716,13 @@ class regex_parser { if (item.type != COUNTED && item.type != COUNTED_LAZY) { out.push_back(item); if (item.type == LBRA || item.type == LBRA_NC) { - lbra_stack.push(index); + lbra_stack.push(out.size() - 1); repeat_start_index = -1; } else if (item.type == RBRA) { repeat_start_index = lbra_stack.top(); lbra_stack.pop(); } else if ((item.type & ITEM_MASK) != OPERATOR_MASK) { - repeat_start_index = index; + repeat_start_index = out.size() - 1; } } else { // item is of type COUNTED or COUNTED_LAZY @@ -731,26 +731,39 @@ class regex_parser { CUDF_EXPECTS(repeat_start_index >= 0, "regex: invalid counted quantifier location"); // range of affected item(s) to repeat - auto const begin = in.begin() + repeat_start_index; - auto const end = in.begin() + index; + auto const begin = out.begin() + repeat_start_index; + auto const end = out.end(); + // count range values auto const n = item.d.count.n; // minimum count auto const m = item.d.count.m; // maximum count - assert(n >= 0 && "invalid repeat count value n"); // zero-repeat edge-case: need to erase the previous items - if (n == 0) { out.erase(out.end() - (index - repeat_start_index), out.end()); } - - // minimum repeats (n) - for (int j = 1; j < n; j++) { - out.insert(out.end(), begin, end); + if (n == 0 && m == 0) { out.erase(begin, end); } + + std::vector repeat_copy(begin, end); + // special handling for quantified capture groups + if ((n > 1) && (*begin).type == LBRA) { + (*begin).type = LBRA_NC; // change first one to non-capture + // add intermediate groups as non-capture + std::vector ncg_copy(begin, end); + for (int j = 1; j < (n - 1); j++) { + out.insert(out.end(), ncg_copy.begin(), ncg_copy.end()); + } + // add the last entry as a regular capture-group + out.insert(out.end(), repeat_copy.begin(), repeat_copy.end()); + } else { + // minimum repeats (n) + for (int j = 1; j < n; j++) { + out.insert(out.end(), repeat_copy.begin(), repeat_copy.end()); + } } // optional maximum repeats (m) if (m >= 0) { for (int j = n; j < m; j++) { out.emplace_back(LBRA_NC, 0); + out.insert(out.end(), repeat_copy.begin(), repeat_copy.end()); }
for (int j = n; j < m; j++) { out.emplace_back(RBRA, 0); @@ -760,8 +773,9 @@ class regex_parser { // infinite repeats if (n > 0) { // append '+' after last repetition out.emplace_back(item.type == COUNTED ? PLUS : PLUS_LAZY, 0); - } else { // copy it once then append '*' - out.insert(out.end(), begin, end); + } else { + // copy it once then append '*' + out.insert(out.end(), repeat_copy.begin(), repeat_copy.end()); out.emplace_back(item.type == COUNTED ? STAR : STAR_LAZY, 0); } } diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index bdfd38267e6..216ddfce5f1 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -474,6 +474,20 @@ TEST_F(StringsContainsTests, FixedQuantifier) } } +TEST_F(StringsContainsTests, NestedQuantifier) +{ + auto input = cudf::test::strings_column_wrapper({"TEST12 1111 2222 3333 4444 5555", + "0000 AAAA 9999 BBBB 8888", + "7777 6666 4444 3333", + "12345 3333 4444 1111 ABCD"}); + auto sv = cudf::strings_column_view(input); + auto pattern = std::string(R"((\d{4}\s){4})"); + cudf::test::fixed_width_column_wrapper expected({true, false, false, true}); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::contains_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsContainsTests, QuantifierErrors) { EXPECT_THROW(cudf::strings::regex_program::create("^+"), cudf::logic_error); diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index 61246fb098d..7e0338f1bf4 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include @@ -240,6 +239,21 @@ TEST_F(StringsExtractTests, SpecialNewLines) CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); } +TEST_F(StringsExtractTests, NestedQuantifier) +{ + auto input = cudf::test::strings_column_wrapper({"TEST12 1111 2222 3333 4444 5555", + "0000 AAAA 9999 BBBB 8888", + "7777 6666 4444 3333", + "12345 3333 4444 1111 ABCD"}); + auto sv = cudf::strings_column_view(input); + auto pattern = std::string(R"((\d{4}\s){4})"); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::extract(sv, *prog); + // fixed quantifier on capture group only honors the last group + auto expected = cudf::test::strings_column_wrapper({"4444 ", "", "", "1111 "}, {1, 0, 0, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); +} + TEST_F(StringsExtractTests, EmptyExtractTest) { std::vector h_strings{nullptr, "AAA", "AAA_A", "AAA_AAA_", "A__", ""}; From 69b0f661ff2fc4c12bb0fe696e556f6b3224b381 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 10 Oct 2024 08:38:11 -1000 Subject: [PATCH 048/117] Add string.convert.convert_lists APIs to pylibcudf (#16997) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16997 --- .../strings/convert/convert_booleans.rst | 6 ++ .../strings/convert/convert_datetime.rst | 6 ++ .../strings/convert/convert_durations.rst | 6 ++ .../strings/convert/convert_fixed_point.rst | 6 ++ .../strings/convert/convert_floats.rst | 6 ++ .../strings/convert/convert_ipv4.rst | 6 ++ .../strings/convert/convert_lists.rst | 6 ++ .../strings/convert/convert_urls.rst | 6 ++ 
.../pylibcudf/strings/convert/index.rst | 14 ++++ .../api_docs/pylibcudf/strings/index.rst | 6 ++ .../_lib/strings/convert/convert_lists.pyx | 32 ++------- .../libcudf/strings/convert/convert_lists.pxd | 2 +- .../pylibcudf/strings/convert/CMakeLists.txt | 5 +- .../pylibcudf/strings/convert/__init__.pxd | 1 + .../pylibcudf/strings/convert/__init__.py | 1 + .../strings/convert/convert_fixed_point.pyx | 6 +- .../strings/convert/convert_lists.pxd | 11 +++ .../strings/convert/convert_lists.pyx | 72 +++++++++++++++++++ .../tests/test_string_convert_lists.py | 21 ++++++ 19 files changed, 187 insertions(+), 32 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_booleans.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_datetime.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_durations.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_fixed_point.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_floats.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_ipv4.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_lists.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_urls.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_booleans.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_booleans.rst new file mode 100644 index 00000000000..de62221456f --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_booleans.rst @@ -0,0 +1,6 @@ +================ +convert_booleans +================ + +.. automodule:: pylibcudf.strings.convert.convert_booleans + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_datetime.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_datetime.rst new file mode 100644 index 00000000000..fc5d5204ab3 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_datetime.rst @@ -0,0 +1,6 @@ +================ +convert_datetime +================ + +.. automodule:: pylibcudf.strings.convert.convert_datetime + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_durations.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_durations.rst new file mode 100644 index 00000000000..e80b0c15a61 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_durations.rst @@ -0,0 +1,6 @@ +================= +convert_durations +================= + +.. 
automodule:: pylibcudf.strings.convert.convert_durations + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_fixed_point.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_fixed_point.rst new file mode 100644 index 00000000000..16d971a6849 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_fixed_point.rst @@ -0,0 +1,6 @@ +=================== +convert_fixed_point +=================== + +.. automodule:: pylibcudf.strings.convert.convert_fixed_point + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_floats.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_floats.rst new file mode 100644 index 00000000000..9ae4004cea9 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_floats.rst @@ -0,0 +1,6 @@ +============== +convert_floats +============== + +.. automodule:: pylibcudf.strings.convert.convert_floats + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_ipv4.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_ipv4.rst new file mode 100644 index 00000000000..4ead8677a69 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_ipv4.rst @@ -0,0 +1,6 @@ +============ +convert_ipv4 +============ + +.. automodule:: pylibcudf.strings.convert.convert_ipv4 + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_lists.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_lists.rst new file mode 100644 index 00000000000..33a719a42e1 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_lists.rst @@ -0,0 +1,6 @@ +============= +convert_lists +============= + +.. automodule:: pylibcudf.strings.convert.convert_lists + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_urls.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_urls.rst new file mode 100644 index 00000000000..f20d95e0cdd --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_urls.rst @@ -0,0 +1,6 @@ +============ +convert_urls +============ + +.. automodule:: pylibcudf.strings.convert.convert_urls + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst new file mode 100644 index 00000000000..fa05cb7d786 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst @@ -0,0 +1,14 @@ +convert +======= + +.. toctree:: + :maxdepth: 1 + + convert_booleans + convert_datetime + convert_durations + convert_fixed_point + convert_floats + convert_ipv4 + convert_lists + convert_urls diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 48dc8a13c3e..65dc5d2d1c3 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -21,3 +21,9 @@ strings split strip wrap + +.. 
toctree:: + :maxdepth: 2 + :caption: Subpackages + + convert/index.rst diff --git a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx index 73aebf8ab35..3a2cb4bd5c7 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx @@ -1,23 +1,13 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +import pylibcudf as plc from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.convert.convert_lists cimport ( - format_list_column as cpp_format_list_column, -) - from cudf._lib.column cimport Column from cudf._lib.scalar import as_device_scalar -from cudf._lib.scalar cimport DeviceScalar - @acquire_spill_lock() def format_list_column(Column source_list, Column separators): @@ -34,19 +24,9 @@ def format_list_column(Column source_list, Column separators): ------- Formatted strings column """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_list.view() - cdef column_view separators_view = separators.view() - # Use 'None' as null-replacement string - cdef DeviceScalar str_na_rep = as_device_scalar("None") - cdef const string_scalar* string_scalar_na_rep = ( - str_na_rep.get_raw_ptr()) - - with nogil: - c_result = move(cpp_format_list_column( - source_view, string_scalar_na_rep[0], separators_view - )) - - return Column.from_unique_ptr( - move(c_result) + plc_column = plc.strings.convert.convert_lists.format_list_column( + source_list.to_pylibcudf(mode="read"), + as_device_scalar("None").c_value, + separators.to_pylibcudf(mode="read"), ) + return Column.from_pylibcudf(plc_column) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd index 109111568d8..6e1ecd30539 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd @@ -9,6 +9,6 @@ cdef extern from "cudf/strings/convert/convert_lists.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] format_list_column( - column_view input_col, + column_view input, string_scalar na_rep, column_view separators) except + diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index 7b228c06a18..846070870b1 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -12,8 +12,9 @@ # the License. 
# ============================================================================= -set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx - convert_fixed_point.pyx convert_floats.pyx convert_ipv4.pyx convert_urls.pyx +set(cython_sources + convert_booleans.pyx convert_datetime.pyx convert_durations.pyx convert_fixed_point.pyx + convert_floats.pyx convert_ipv4.pyx convert_lists.pyx convert_urls.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index be6145384ad..799532d72c6 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -6,5 +6,6 @@ from . cimport ( convert_fixed_point, convert_floats, convert_ipv4, + convert_lists, convert_urls, ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index 7c94387282b..deb2d8ab74b 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -6,5 +6,6 @@ convert_fixed_point, convert_floats, convert_ipv4, + convert_lists, convert_urls, ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx index 40dadf6f967..60a8fca8baf 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx @@ -15,7 +15,7 @@ cpdef Column to_fixed_point(Column input, DataType output_type): Returns a new fixed-point column parsing decimal values from the provided strings column. - For details, see :cpp:details:`cudf::strings::to_fixed_point` + For details, see :cpp:func:`cudf::strings::to_fixed_point` Parameters ---------- @@ -47,7 +47,7 @@ cpdef Column from_fixed_point(Column input): Returns a new strings column converting the fixed-point values into a strings column. - For details, see :cpp:details:`cudf::strings::from_fixed_point` + For details, see :cpp:func:`cudf::strings::from_fixed_point` Parameters ---------- @@ -75,7 +75,7 @@ cpdef Column is_fixed_point(Column input, DataType decimal_type=None): Returns a boolean column identifying strings in which all characters are valid for conversion to fixed-point. - For details, see :cpp:details:`cudf::strings::is_fixed_point` + For details, see :cpp:func:`cudf::strings::is_fixed_point` Parameters ---------- diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd new file mode 100644 index 00000000000..1ba4272afa2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar + + +cpdef Column format_list_column( + Column input, + Scalar na_rep=*, + Column separators=* +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx new file mode 100644 index 00000000000..3fbc08a9ab5 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx @@ -0,0 +1,72 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.column_factories cimport make_empty_column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_string_scalar as cpp_make_string_scalar,
+)
+from pylibcudf.libcudf.strings.convert cimport (
+    convert_lists as cpp_convert_lists,
+)
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.types cimport type_id
+
+from cython.operator import dereference
+
+
+cpdef Column format_list_column(
+    Column input,
+    Scalar na_rep=None,
+    Column separators=None
+):
+    """
+    Convert a list column of strings into a formatted strings column.
+
+    For details, see :cpp:func:`cudf::strings::format_list_column`
+
+    Parameters
+    ----------
+    input : Column
+        Lists column to format
+
+    na_rep : Scalar
+        Replacement string for null elements.
+        Default is the empty string.
+
+    separators : Column
+        Strings to use for enclosing list components and separating elements.
+        Default is ``,``, ``[``, ``]``.
+
+    Returns
+    -------
+    Column
+        New strings column
+    """
+    cdef unique_ptr[column] c_result
+
+    if na_rep is None:
+        na_rep = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+
+    cdef const string_scalar* c_na_rep = <const string_scalar*>(
+        na_rep.c_obj.get()
+    )
+
+    if separators is None:
+        separators = make_empty_column(type_id.STRING)
+
+    with nogil:
+        c_result = move(
+            cpp_convert_lists.format_list_column(
+                input.view(),
+                dereference(c_na_rep),
+                separators.view()
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py
new file mode 100644
index 00000000000..8591732b39e
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py
@@ -0,0 +1,21 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
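+#
+# Round-trip check for format_list_column, parametrized over explicit and
+# defaulted na_rep/separators arguments.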
+ +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.mark.parametrize("na_rep", [None, pa.scalar("")]) +@pytest.mark.parametrize("separators", [None, pa.array([",", "[", "]"])]) +def test_format_list_column(na_rep, separators): + arr = pa.array([["1", "A"], None]) + result = plc.strings.convert.convert_lists.format_list_column( + plc.interop.from_arrow(arr), + na_rep if na_rep is None else plc.interop.from_arrow(na_rep), + separators + if separators is None + else plc.interop.from_arrow(separators), + ) + expected = pa.array(["[1,A]", ""]) + assert_column_eq(result, expected) From 7d49df7d8a1a49de628181d81ef87b186b1ea594 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 10 Oct 2024 11:52:17 -1000 Subject: [PATCH 049/117] Add json APIs to pylibcudf (#17025) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - James Lamb (https://github.com/jameslamb) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17025 --- .../user_guide/api_docs/pylibcudf/index.rst | 1 + .../user_guide/api_docs/pylibcudf/json.rst | 6 + python/cudf/cudf/_lib/strings/__init__.py | 2 +- python/cudf/cudf/_lib/strings/json.pyx | 80 ++------- python/cudf/cudf/core/column/string.py | 3 +- python/pylibcudf/pylibcudf/CMakeLists.txt | 1 + python/pylibcudf/pylibcudf/__init__.pxd | 2 + python/pylibcudf/pylibcudf/__init__.py | 2 + python/pylibcudf/pylibcudf/json.pxd | 16 ++ python/pylibcudf/pylibcudf/json.pyx | 154 ++++++++++++++++++ .../pylibcudf/libcudf/{strings => }/json.pxd | 0 .../pylibcudf/pylibcudf/strings/__init__.pxd | 4 + .../pylibcudf/pylibcudf/strings/__init__.py | 3 + python/pylibcudf/pylibcudf/tests/test_json.py | 42 +++++ python/pylibcudf/pyproject.toml | 3 +- 15 files changed, 246 insertions(+), 73 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/json.rst create mode 100644 python/pylibcudf/pylibcudf/json.pxd create mode 100644 python/pylibcudf/pylibcudf/json.pyx rename python/pylibcudf/pylibcudf/libcudf/{strings => }/json.pxd (100%) create mode 100644 python/pylibcudf/pylibcudf/tests/test_json.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 052479d6720..62e14a67ee5 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -21,6 +21,7 @@ This page provides API documentation for pylibcudf. groupby interop join + json labeling lists merge diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/json.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/json.rst new file mode 100644 index 00000000000..bb38d179a57 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/json.rst @@ -0,0 +1,6 @@ +==== +json +==== + +.. 
automodule:: pylibcudf.json + :members: diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index e712937f816..ffa5e603408 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -72,7 +72,7 @@ ) from cudf._lib.strings.find_multiple import find_multiple from cudf._lib.strings.findall import find_re, findall -from cudf._lib.strings.json import GetJsonObjectOptions, get_json_object +from cudf._lib.strings.json import get_json_object from cudf._lib.strings.padding import center, ljust, pad, rjust, zfill from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence from cudf._lib.strings.replace import ( diff --git a/python/cudf/cudf/_lib/strings/json.pyx b/python/cudf/cudf/_lib/strings/json.pyx index c9b0bba088d..374a104635a 100644 --- a/python/cudf/cudf/_lib/strings/json.pyx +++ b/python/cudf/cudf/_lib/strings/json.pyx @@ -1,84 +1,26 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +import pylibcudf as plc +from pylibcudf.json cimport GetJsonObjectOptions from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.json cimport ( - get_json_object as cpp_get_json_object, - get_json_object_options, -) - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar @acquire_spill_lock() def get_json_object( - Column col, object py_json_path, GetJsonObjectOptions options): + Column col, + object py_json_path, + GetJsonObjectOptions options +): """ Apply a JSONPath string to all rows in an input column of json strings. 
""" - cdef unique_ptr[column] c_result - - cdef column_view col_view = col.view() - cdef DeviceScalar json_path = py_json_path.device_value - - cdef const string_scalar* scalar_json_path = ( - json_path.get_raw_ptr() + plc_column = plc.json.get_json_object( + col.to_pylibcudf(mode="read"), + py_json_path.device_value.c_value, + options ) - - with nogil: - c_result = move(cpp_get_json_object( - col_view, - scalar_json_path[0], - options.options, - )) - - return Column.from_unique_ptr(move(c_result)) - - -cdef class GetJsonObjectOptions: - cdef get_json_object_options options - - def __init__( - self, - *, - allow_single_quotes=False, - strip_quotes_from_single_strings=True, - missing_fields_as_nulls=False - ): - self.options.set_allow_single_quotes(allow_single_quotes) - self.options.set_strip_quotes_from_single_strings( - strip_quotes_from_single_strings - ) - self.options.set_missing_fields_as_nulls(missing_fields_as_nulls) - - @property - def allow_single_quotes(self): - return self.options.get_allow_single_quotes() - - @property - def strip_quotes_from_single_strings(self): - return self.options.get_strip_quotes_from_single_strings() - - @property - def missing_fields_as_nulls(self): - return self.options.get_missing_fields_as_nulls() - - @allow_single_quotes.setter - def allow_single_quotes(self, val): - self.options.set_allow_single_quotes(val) - - @strip_quotes_from_single_strings.setter - def strip_quotes_from_single_strings(self, val): - self.options.set_strip_quotes_from_single_strings(val) - - @missing_fields_as_nulls.setter - def missing_fields_as_nulls(self, val): - self.options.set_missing_fields_as_nulls(val) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index b50e23bd52e..45d1a8b087b 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -2385,8 +2385,7 @@ def get_json_object( 0 [\n { "category": "reference",\n ... dtype: object """ - - options = libstrings.GetJsonObjectOptions( + options = plc.json.GetJsonObjectOptions( allow_single_quotes=allow_single_quotes, strip_quotes_from_single_strings=( strip_quotes_from_single_strings diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index 1d72eacac12..2854d7c42ac 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -27,6 +27,7 @@ set(cython_sources groupby.pyx interop.pyx join.pyx + json.pyx labeling.pyx lists.pyx merge.pyx diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index b98b37fe0fd..79c2f0c5455 100644 --- a/python/pylibcudf/pylibcudf/__init__.pxd +++ b/python/pylibcudf/pylibcudf/__init__.pxd @@ -13,6 +13,7 @@ from . 
cimport ( filling, groupby, join, + json, labeling, lists, merge, @@ -60,6 +61,7 @@ __all__ = [ "gpumemoryview", "groupby", "join", + "json", "lists", "merge", "null_mask", diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index 304f27be340..88e72418cda 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -24,6 +24,7 @@ interop, io, join, + json, labeling, lists, merge, @@ -73,6 +74,7 @@ "interop", "io", "join", + "json", "labeling", "lists", "merge", diff --git a/python/pylibcudf/pylibcudf/json.pxd b/python/pylibcudf/pylibcudf/json.pxd new file mode 100644 index 00000000000..87a87349b8a --- /dev/null +++ b/python/pylibcudf/pylibcudf/json.pxd @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.json cimport get_json_object_options +from pylibcudf.scalar cimport Scalar + + +cdef class GetJsonObjectOptions: + cdef get_json_object_options options + + +cpdef Column get_json_object( + Column col, + Scalar json_path, + GetJsonObjectOptions options=* +) diff --git a/python/pylibcudf/pylibcudf/json.pyx b/python/pylibcudf/pylibcudf/json.pyx new file mode 100644 index 00000000000..4a8d11068f9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/json.pyx @@ -0,0 +1,154 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cython.operator cimport dereference +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf cimport json as cpp_json +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.scalar cimport Scalar + + +cdef class GetJsonObjectOptions: + """Settings for ``get_json_object()``""" + def __init__( + self, + *, + allow_single_quotes=False, + strip_quotes_from_single_strings=True, + missing_fields_as_nulls=False + ): + self.set_allow_single_quotes(allow_single_quotes) + self.set_strip_quotes_from_single_strings( + strip_quotes_from_single_strings + ) + self.set_missing_fields_as_nulls(missing_fields_as_nulls) + + def get_allow_single_quotes(self): + """ + Returns true/false depending on whether single-quotes for representing strings + are allowed. + + Returns + ------- + bool + true if single-quotes are allowed, false otherwise. + """ + return self.options.get_allow_single_quotes() + + def get_strip_quotes_from_single_strings(self): + """ + Returns true/false depending on whether individually returned string values have + their quotes stripped. + + Returns + ------- + bool + true if individually returned string values have their quotes stripped. + """ + return self.options.get_strip_quotes_from_single_strings() + + def get_missing_fields_as_nulls(self): + """ + Whether a field not contained by an object is to be interpreted as null. + + Returns + ------- + bool + true if missing fields are interpreted as null. + """ + return self.options.get_missing_fields_as_nulls() + + def set_allow_single_quotes(self, bool val): + """ + Set whether single-quotes for strings are allowed. + + Parameters + ---------- + val : bool + Whether to allow single quotes + + Returns + ------- + None + """ + self.options.set_allow_single_quotes(val) + + def set_strip_quotes_from_single_strings(self, bool val): + """ + Set whether individually returned string values have their quotes stripped. + + Parameters + ---------- + val : bool + Whether to strip quotes from single strings. 
+ + Returns + ------- + None + """ + self.options.set_strip_quotes_from_single_strings(val) + + def set_missing_fields_as_nulls(self, bool val): + """ + Set whether missing fields are interpreted as null. + + Parameters + ---------- + val : bool + Whether to treat missing fields as nulls. + + Returns + ------- + None + """ + self.options.set_missing_fields_as_nulls(val) + + +cpdef Column get_json_object( + Column col, + Scalar json_path, + GetJsonObjectOptions options=None +): + """ + Apply a JSONPath string to all rows in an input strings column. + + For details, see :cpp:func:`cudf::get_json_object` + + Parameters + ---------- + col : Column + The input strings column. Each row must contain a valid json string. + + json_path : Scalar + The JSONPath string to be applied to each row. + + options : GetJsonObjectOptions + Options for controlling the behavior of the function. + + Returns + ------- + Column + New strings column containing the retrieved json object strings. + """ + cdef unique_ptr[column] c_result + cdef string_scalar* c_json_path = ( + json_path.c_obj.get() + ) + if options is None: + options = GetJsonObjectOptions() + + cdef cpp_json.get_json_object_options c_options = options.options + + with nogil: + c_result = move( + cpp_json.get_json_object( + col.view(), + dereference(c_json_path), + c_options + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/json.pxd b/python/pylibcudf/pylibcudf/libcudf/json.pxd similarity index 100% rename from python/pylibcudf/pylibcudf/libcudf/strings/json.pxd rename to python/pylibcudf/pylibcudf/libcudf/json.pxd diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index 187ef113073..e45048a500f 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -14,6 +14,7 @@ from . cimport ( padding, regex_flags, regex_program, + repeat, replace, side_type, slice, @@ -33,9 +34,12 @@ __all__ = [ "convert", "extract", "find", + "find_multiple", "findall", + "padding", "regex_flags", "regex_program", + "repeat", "replace", "slice", "strip", diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index 6033cea0625..c6253d94b40 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -34,9 +34,12 @@ "convert", "extract", "find", + "find_multiple", "findall", + "padding", "regex_flags", "regex_program", + "repeat", "replace", "slice", "strip", diff --git a/python/pylibcudf/pylibcudf/tests/test_json.py b/python/pylibcudf/pylibcudf/tests/test_json.py new file mode 100644 index 00000000000..3d2955211f8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_json.py @@ -0,0 +1,42 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
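+#
+# Exercises get_json_object over every combination of the three
+# GetJsonObjectOptions flags; the expected output is identical in all cases.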
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from utils import assert_column_eq
+
+
+@pytest.fixture(scope="module")
+def plc_col():
+    arr = pa.array(
+        ['{"foo": {"bar": [{"a": 1, "b": 2}, {"a": 3, "b": 4}]}}', None]
+    )
+    return plc.interop.from_arrow(arr)
+
+
+@pytest.fixture(scope="module")
+def json_path():
+    slr = pa.scalar("$.foo.bar")
+    return plc.interop.from_arrow(slr)
+
+
+@pytest.mark.parametrize("allow_single_quotes", [True, False])
+@pytest.mark.parametrize("strip_quotes_from_single_strings", [True, False])
+@pytest.mark.parametrize("missing_fields_as_nulls", [True, False])
+def test_get_json_object(
+    plc_col,
+    json_path,
+    allow_single_quotes,
+    strip_quotes_from_single_strings,
+    missing_fields_as_nulls,
+):
+    result = plc.json.get_json_object(
+        plc_col,
+        json_path,
+        plc.json.GetJsonObjectOptions(
+            allow_single_quotes=allow_single_quotes,
+            strip_quotes_from_single_strings=strip_quotes_from_single_strings,
+            missing_fields_as_nulls=missing_fields_as_nulls,
+        ),
+    )
+    expected = pa.array(['[{"a": 1, "b": 2}, {"a": 3, "b": 4}]', None])
+    assert_column_eq(result, expected)
diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml
index c9a685de3e9..ea5b3065896 100644
--- a/python/pylibcudf/pyproject.toml
+++ b/python/pylibcudf/pyproject.toml
@@ -97,7 +97,8 @@ skip = [
 ]
 
 [tool.pytest.ini_options]
-addopts = "--tb=native --strict-config --strict-markers"
+# --import-mode=importlib because two test_json.py files exist and the tests directory is not a structured module
+addopts = "--tb=native --strict-config --strict-markers --import-mode=importlib"
 empty_parameter_set_mark = "fail_at_collect"
 filterwarnings = [
     "error",

From 097778e82a6b580bfa91d7941379d33acc324c60 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 10 Oct 2024 14:25:27 -1000
Subject: [PATCH 050/117] Move pylibcudf/libcudf/wrappers/decimals to
 pylibcudf/libcudf/fixed_point (#17048)

Contributes to https://github.com/rapidsai/cudf/issues/15162

I don't think there are any types in this file that need to be exposed on
the Python side; they're just used internally in pylibcudf. Also moves this
to `libcudf/fixed_point` to match the libcudf location more closely.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17048
---
 .../pylibcudf/libcudf/fixed_point/__init__.pxd     |  0
 .../libcudf/fixed_point/fixed_point.pxd            |  8 ++++++++
 .../pylibcudf/libcudf/scalar/scalar.pxd            |  2 +-
 .../pylibcudf/libcudf/wrappers/decimals.pxd        | 16 ----------------
 4 files changed, 9 insertions(+), 17 deletions(-)
 create mode 100644 python/pylibcudf/pylibcudf/libcudf/fixed_point/__init__.pxd
 create mode 100644 python/pylibcudf/pylibcudf/libcudf/fixed_point/fixed_point.pxd
 delete mode 100644 python/pylibcudf/pylibcudf/libcudf/wrappers/decimals.pxd

diff --git a/python/pylibcudf/pylibcudf/libcudf/fixed_point/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/fixed_point/__init__.pxd
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/pylibcudf/pylibcudf/libcudf/fixed_point/fixed_point.pxd b/python/pylibcudf/pylibcudf/libcudf/fixed_point/fixed_point.pxd
new file mode 100644
index 00000000000..e55574020f4
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/libcudf/fixed_point/fixed_point.pxd
@@ -0,0 +1,8 @@
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
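+#
+# Minimal declaration of numeric::scale_type, which scalar.pxd imports to
+# construct fixed-point scalars.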
+ +from libc.stdint cimport int32_t + + +cdef extern from "cudf/fixed_point/fixed_point.hpp" namespace "numeric" nogil: + cdef cppclass scale_type: + scale_type(int32_t) diff --git a/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd index 4b40a8a26f6..a51413669c5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd @@ -4,9 +4,9 @@ from libc.stdint cimport int32_t, int64_t from libcpp cimport bool from libcpp.string cimport string from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.fixed_point.fixed_point cimport scale_type from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport data_type -from pylibcudf.libcudf.wrappers.decimals cimport scale_type cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/wrappers/decimals.pxd b/python/pylibcudf/pylibcudf/libcudf/wrappers/decimals.pxd deleted file mode 100644 index 558299501d6..00000000000 --- a/python/pylibcudf/pylibcudf/libcudf/wrappers/decimals.pxd +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from libc.stdint cimport int32_t, int64_t -from pylibcudf.libcudf.types cimport int128 - - -cdef extern from "cudf/fixed_point/fixed_point.hpp" namespace "numeric" nogil: - # cython type stub to help resolve to numeric::decimal64 - ctypedef int64_t decimal64 - # cython type stub to help resolve to numeric::decimal32 - ctypedef int64_t decimal32 - # cython type stub to help resolve to numeric::decimal128 - ctypedef int128 decimal128 - - cdef cppclass scale_type: - scale_type(int32_t) From 1436cac9de8b450a32e71d5b779503e9a29edaa6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 10 Oct 2024 14:26:44 -1000 Subject: [PATCH 051/117] Remove unneeded pylibcudf.libcudf.wrappers.duration usage in cudf (#17010) Contributes to https://github.com/rapidsai/cudf/issues/15162 ~I just assumed since the associated libcudf files just publicly expose C types, we just want to match the name spacing when importing from pylibcudf (avoid importing from `pylibcudf.libcudf`) and not necessary expose a Python equivalent?~ ~Let me know if I am misunderstanding how to expose these types.~ https://github.com/rapidsai/cudf/pull/17010#issuecomment-2403658378 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17010 --- python/cudf/cudf/_lib/scalar.pyx | 96 +------------------------------- 1 file changed, 1 insertion(+), 95 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 0dde91316fb..56712402919 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -6,7 +6,6 @@ import numpy as np import pandas as pd import pyarrow as pa -from libc.stdint cimport int64_t from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -25,25 +24,7 @@ cimport pylibcudf.libcudf.types as libcudf_types # DeviceScalar is phased out entirely from cuDF Cython (at which point # cudf.Scalar will be directly backed by pylibcudf.Scalar). 
from pylibcudf cimport Scalar as plc_Scalar -from pylibcudf.libcudf.scalar.scalar cimport ( - duration_scalar, - list_scalar, - scalar, - struct_scalar, - timestamp_scalar, -) -from pylibcudf.libcudf.wrappers.durations cimport ( - duration_ms, - duration_ns, - duration_s, - duration_us, -) -from pylibcudf.libcudf.wrappers.timestamps cimport ( - timestamp_ms, - timestamp_ns, - timestamp_s, - timestamp_us, -) +from pylibcudf.libcudf.scalar.scalar cimport list_scalar, scalar, struct_scalar from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id @@ -284,62 +265,6 @@ cdef class DeviceScalar: ] -# TODO: Currently the only uses of this function and the one below are in -# _create_proxy_nat_scalar. See if that code path can be simplified to excise -# or at least simplify these implementations. -cdef _set_datetime64_from_np_scalar(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - - value = value if valid else 0 - - if dtype == "datetime64[s]": - s.reset( - new timestamp_scalar[timestamp_s](np.int64(value), valid) - ) - elif dtype == "datetime64[ms]": - s.reset( - new timestamp_scalar[timestamp_ms](np.int64(value), valid) - ) - elif dtype == "datetime64[us]": - s.reset( - new timestamp_scalar[timestamp_us](np.int64(value), valid) - ) - elif dtype == "datetime64[ns]": - s.reset( - new timestamp_scalar[timestamp_ns](np.int64(value), valid) - ) - else: - raise ValueError(f"dtype not supported: {dtype}") - -cdef _set_timedelta64_from_np_scalar(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - - value = value if valid else 0 - - if dtype == "timedelta64[s]": - s.reset( - new duration_scalar[duration_s](np.int64(value), valid) - ) - elif dtype == "timedelta64[ms]": - s.reset( - new duration_scalar[duration_ms](np.int64(value), valid) - ) - elif dtype == "timedelta64[us]": - s.reset( - new duration_scalar[duration_us](np.int64(value), valid) - ) - elif dtype == "timedelta64[ns]": - s.reset( - new duration_scalar[duration_ns](np.int64(value), valid) - ) - else: - raise ValueError(f"dtype not supported: {dtype}") - - def as_device_scalar(val, dtype=None): if isinstance(val, (cudf.Scalar, DeviceScalar)): if dtype == val.dtype or dtype is None: @@ -361,22 +286,3 @@ def _is_null_host_scalar(slr): return True else: return False - - -def _create_proxy_nat_scalar(dtype): - cdef DeviceScalar result = DeviceScalar.__new__(DeviceScalar) - - dtype = cudf.dtype(dtype) - if dtype.char in 'mM': - nat = dtype.type('NaT').astype(dtype) - if dtype.type == np.datetime64: - _set_datetime64_from_np_scalar( - ( result.c_value).c_obj, nat, dtype, True - ) - elif dtype.type == np.timedelta64: - _set_timedelta64_from_np_scalar( - ( result.c_value).c_obj, nat, dtype, True - ) - return result - else: - raise TypeError('NAT only valid for datetime and timedelta') From 89a6fe575f2ac0caa661dd51f87b37dba07507a7 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 11 Oct 2024 08:59:20 -0500 Subject: [PATCH 052/117] make conda installs in CI stricter (part 2) (#17042) Follow-up to #17013 Changes relative to that PR: * switches to pinning CI conda installs to the output of `rapids-version` (`{major}.{minor}.{patch}`) instead of `rapids-version-major-minor` (`{major}.{minor}`), to get a bit more protection in the presence of hotfix releases * restores some exporting of variables needed for docs builds I made some mistakes in https://github.com/rapidsai/cudf/pull/17013#discussion_r1792317422. 
Missed that this project's Doxygen setup is expecting to find `RAPIDS_VERSION` and `RAPIDS_VERSION_MAJOR_MINOR` defined in the environment. https://github.com/rapidsai/cudf/blob/7173b52fce25937bb69e22a083a5de4655078fa1/cpp/doxygen/Doxyfile#L41 https://github.com/rapidsai/cudf/blob/7173b52fce25937bb69e22a083a5de4655078fa1/cpp/doxygen/Doxyfile#L2229 Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/cudf/pull/17042 --- ci/build_docs.sh | 16 ++++++++-------- ci/test_cpp_common.sh | 10 +++++----- ci/test_java.sh | 4 ++-- ci/test_notebooks.sh | 6 +++--- ci/test_python_common.sh | 6 +++--- ci/test_python_other.sh | 8 ++++---- 6 files changed, 25 insertions(+), 25 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index dae6ac46757..4290d013fe4 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -3,8 +3,8 @@ set -euo pipefail -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" -export RAPIDS_VERSION_NUMBER="$RAPIDS_VERSION_MAJOR_MINOR" +export RAPIDS_VERSION="$(rapids-version)" +export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" rapids-logger "Create test conda environment" . /opt/conda/etc/profile.d/conda.sh @@ -28,16 +28,16 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - "libcudf=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "pylibcudf=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "cudf=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "dask-cudf=${RAPIDS_VERSION_MAJOR_MINOR}" + "libcudf=${RAPIDS_VERSION}" \ + "pylibcudf=${RAPIDS_VERSION}" \ + "cudf=${RAPIDS_VERSION}" \ + "dask-cudf=${RAPIDS_VERSION}" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build CPP docs" pushd cpp/doxygen -aws s3 cp s3://rapidsai-docs/librmm/html/${RAPIDS_VERSION_NUMBER}/rmm.tag . || echo "Failed to download rmm Doxygen tag" +aws s3 cp s3://rapidsai-docs/librmm/html/${RAPIDS_VERSION_MAJOR_MINOR}/rmm.tag . || echo "Failed to download rmm Doxygen tag" doxygen Doxyfile mkdir -p "${RAPIDS_DOCS_DIR}/libcudf/html" mv html/* "${RAPIDS_DOCS_DIR}/libcudf/html" @@ -57,4 +57,4 @@ mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/html" mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/dask-cudf/html" popd -rapids-upload-docs +RAPIDS_VERSION_NUMBER="${RAPIDS_VERSION_MAJOR_MINOR}" rapids-upload-docs diff --git a/ci/test_cpp_common.sh b/ci/test_cpp_common.sh index e8f6e9388f4..8cd78eb11c2 100755 --- a/ci/test_cpp_common.sh +++ b/ci/test_cpp_common.sh @@ -5,7 +5,7 @@ set -euo pipefail . /opt/conda/etc/profile.d/conda.sh -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" +RAPIDS_VERSION="$(rapids-version)" rapids-logger "Generate C++ testing dependencies" @@ -33,10 +33,10 @@ rapids-print-env rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ - "libcudf=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "libcudf_kafka=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "libcudf-tests=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "libcudf-example=${RAPIDS_VERSION_MAJOR_MINOR}" + "libcudf=${RAPIDS_VERSION}" \ + "libcudf_kafka=${RAPIDS_VERSION}" \ + "libcudf-tests=${RAPIDS_VERSION}" \ + "libcudf-example=${RAPIDS_VERSION}" rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_java.sh b/ci/test_java.sh index 9b7b2e48dd6..7f1aa633afc 100755 --- a/ci/test_java.sh +++ b/ci/test_java.sh @@ -5,7 +5,7 @@ set -euo pipefail . 
/opt/conda/etc/profile.d/conda.sh -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" +RAPIDS_VERSION="$(rapids-version)" rapids-logger "Generate Java testing dependencies" @@ -32,7 +32,7 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ - "libcudf=${RAPIDS_VERSION_MAJOR_MINOR}" + "libcudf=${RAPIDS_VERSION}" rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_notebooks.sh b/ci/test_notebooks.sh index 3e0712a0691..4197dc5617f 100755 --- a/ci/test_notebooks.sh +++ b/ci/test_notebooks.sh @@ -5,7 +5,7 @@ set -euo pipefail . /opt/conda/etc/profile.d/conda.sh -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" +RAPIDS_VERSION="$(rapids-version)" rapids-logger "Generate notebook testing dependencies" @@ -32,8 +32,8 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - "cudf=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "libcudf=${RAPIDS_VERSION_MAJOR_MINOR}" + "cudf=${RAPIDS_VERSION}" \ + "libcudf=${RAPIDS_VERSION}" NBTEST="$(realpath "$(dirname "$0")/utils/nbtest.sh")" pushd notebooks diff --git a/ci/test_python_common.sh b/ci/test_python_common.sh index 81e82908eb4..4327bfff3af 100755 --- a/ci/test_python_common.sh +++ b/ci/test_python_common.sh @@ -7,7 +7,7 @@ set -euo pipefail . /opt/conda/etc/profile.d/conda.sh -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" +RAPIDS_VERSION="$(rapids-version)" rapids-logger "Generate Python testing dependencies" @@ -40,5 +40,5 @@ rapids-print-env rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - "cudf=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "libcudf=${RAPIDS_VERSION_MAJOR_MINOR}" + "cudf=${RAPIDS_VERSION}" \ + "libcudf=${RAPIDS_VERSION}" diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index eee1d54083f..21a59fa1494 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -7,14 +7,14 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ # Common setup steps shared by Python test jobs source ./ci/test_python_common.sh test_python_other -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" +RAPIDS_VERSION="$(rapids-version)" rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - "dask-cudf=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "cudf_kafka=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "custreamz=${RAPIDS_VERSION_MAJOR_MINOR}" + "dask-cudf=${RAPIDS_VERSION}" \ + "cudf_kafka=${RAPIDS_VERSION}" \ + "custreamz=${RAPIDS_VERSION}" rapids-logger "Check GPU usage" nvidia-smi From 7cf0a1b4c741f6cc3e4599d41d614e1c046f8a13 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 11 Oct 2024 16:46:53 +0200 Subject: [PATCH 053/117] Pylibcudf: pack and unpack (#17012) Adding python bindings to [`cudf::pack()`](https://docs.rapids.ai/api/libcudf/legacy/group__copy__split#ga86716e7ec841541deb6edc7e91fcb9e4), [`cudf::unpack()`](https://docs.rapids.ai/api/libcudf/legacy/group__copy__split#ga1d62a18c2e6f087a92289c63693762cc), and [`cudf::packed_columns`](https://docs.rapids.ai/api/libcudf/legacy/structcudf_1_1packed__columns). This is the first step to support serialization of cudf.polars' IR. cc. @wence- @rjzamora # Authors: - Mads R. B. 
Kristensen (https://github.com/madsbk) - Matthew Murray (https://github.com/Matt711) - Lawrence Mitchell (https://github.com/wence-) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17012 --- python/pylibcudf/pylibcudf/CMakeLists.txt | 1 + python/pylibcudf/pylibcudf/__init__.pxd | 2 + python/pylibcudf/pylibcudf/__init__.py | 2 + .../pylibcudf/pylibcudf/contiguous_split.pxd | 20 ++ .../pylibcudf/pylibcudf/contiguous_split.pyx | 198 ++++++++++++++++++ .../pylibcudf/libcudf/contiguous_split.pxd | 5 + .../pylibcudf/tests/test_contiguous_split.py | 49 +++++ 7 files changed, 277 insertions(+) create mode 100644 python/pylibcudf/pylibcudf/contiguous_split.pxd create mode 100644 python/pylibcudf/pylibcudf/contiguous_split.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_contiguous_split.py diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index 2854d7c42ac..15dd2b4c34f 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -17,6 +17,7 @@ set(cython_sources binaryop.pyx column.pyx column_factories.pyx + contiguous_split.pyx concatenate.pyx copying.pyx datetime.pyx diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index 79c2f0c5455..aa67b4b1149 100644 --- a/python/pylibcudf/pylibcudf/__init__.pxd +++ b/python/pylibcudf/pylibcudf/__init__.pxd @@ -6,6 +6,7 @@ from . cimport ( binaryop, column_factories, concatenate, + contiguous_split, copying, datetime, experimental, @@ -52,6 +53,7 @@ __all__ = [ "aggregation", "binaryop", "column_factories", + "contiguous_split", "concatenate", "copying", "datetime", diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index 88e72418cda..4033062b7e2 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -15,6 +15,7 @@ binaryop, column_factories, concatenate, + contiguous_split, copying, datetime, experimental, @@ -63,6 +64,7 @@ "aggregation", "binaryop", "column_factories", + "contiguous_split", "concatenate", "copying", "datetime", diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pxd b/python/pylibcudf/pylibcudf/contiguous_split.pxd new file mode 100644 index 00000000000..2a10cb5b3d5 --- /dev/null +++ b/python/pylibcudf/pylibcudf/contiguous_split.pxd @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from pylibcudf.libcudf.contiguous_split cimport packed_columns + +from .gpumemoryview cimport gpumemoryview +from .table cimport Table + + +cdef class PackedColumns: + cdef unique_ptr[packed_columns] c_obj + + @staticmethod + cdef PackedColumns from_libcudf(unique_ptr[packed_columns] data) + +cpdef PackedColumns pack(Table input) + +cpdef Table unpack(PackedColumns input) + +cpdef Table unpack_from_memoryviews(memoryview metadata, gpumemoryview gpu_data) diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx new file mode 100644 index 00000000000..ed926a3fcc0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx @@ -0,0 +1,198 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
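+#
+# Python bindings for cudf::pack/cudf::unpack, plus a HostBuffer helper that
+# exposes the packed host-side metadata through the buffer protocol.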
+ +from cython.operator cimport dereference +from libc.stdint cimport uint8_t +from libcpp.memory cimport make_unique, unique_ptr +from libcpp.utility cimport move +from libcpp.vector cimport vector +from pylibcudf.libcudf.contiguous_split cimport ( + pack as cpp_pack, + packed_columns, + unpack as cpp_unpack, +) +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view + +from rmm.pylibrmm.device_buffer cimport DeviceBuffer + +from .gpumemoryview cimport gpumemoryview +from .table cimport Table +from .utils cimport int_to_void_ptr + + +cdef class HostBuffer: + """Owning host buffer that implements the buffer protocol""" + cdef unique_ptr[vector[uint8_t]] c_obj + cdef size_t nbytes + cdef Py_ssize_t[1] shape + cdef Py_ssize_t[1] strides + + @staticmethod + cdef HostBuffer from_unique_ptr( + unique_ptr[vector[uint8_t]] vec + ): + cdef HostBuffer out = HostBuffer() + out.c_obj = move(vec) + out.nbytes = dereference(out.c_obj).size() + out.shape[0] = out.nbytes + out.strides[0] = 1 + return out + + def __getbuffer__(self, Py_buffer *buffer, int flags): + buffer.buf = dereference(self.c_obj).data() + buffer.format = NULL # byte + buffer.internal = NULL + buffer.itemsize = 1 + buffer.len = self.nbytes + buffer.ndim = 1 + buffer.obj = self + buffer.readonly = 0 + buffer.shape = self.shape + buffer.strides = self.strides + buffer.suboffsets = NULL + + def __releasebuffer__(self, Py_buffer *buffer): + pass + +cdef class PackedColumns: + """Column data in a serialized format. + + Contains data from an array of columns in two contiguous buffers: + one on host, which contains table metadata and one on device which + contains the table data. + + For details, see :cpp:class:`cudf::packed_columns`. + """ + def __init__(self): + raise ValueError( + "PackedColumns should not be constructed directly. " + "Use one of the factories." + ) + + @staticmethod + cdef PackedColumns from_libcudf(unique_ptr[packed_columns] data): + """Create a Python PackedColumns from a libcudf packed_columns.""" + cdef PackedColumns out = PackedColumns.__new__(PackedColumns) + out.c_obj = move(data) + return out + + def release(self): + """Releases and returns the underlying serialized metadata and gpu data. + + The ownership of the memory are transferred to the returned buffers. After + this call, `self` is empty. + + Returns + ------- + memoryview (of a HostBuffer) + The serialized metadata as contiguous host memory. + gpumemoryview (of a rmm.DeviceBuffer) + The serialized gpu data as contiguous device memory. + """ + if not (dereference(self.c_obj).metadata and dereference(self.c_obj).gpu_data): + raise ValueError("Cannot release empty PackedColumns") + + return ( + memoryview( + HostBuffer.from_unique_ptr(move(dereference(self.c_obj).metadata)) + ), + gpumemoryview( + DeviceBuffer.c_from_unique_ptr(move(dereference(self.c_obj).gpu_data)) + ) + ) + + +cpdef PackedColumns pack(Table input): + """Deep-copy a table into a serialized contiguous memory format. + + Later use `unpack` or `unpack_from_memoryviews` to unpack the serialized + data back into the table. + + Examples + -------- + >>> packed = pylibcudf.contiguous_split.pack(...) + >>> # Either unpack the whole `PackedColumns` at once. + >>> pylibcudf.contiguous_split.unpack(packed) + >>> # Or unpack the two serialized buffers in `PackedColumns`. + >>> metadata, gpu_data = packed.release() + >>> pylibcudf.contiguous_split.unpack_from_memoryviews(metadata, gpu_data) + + For details, see :cpp:func:`cudf::pack`. 
+ + Parameters + ---------- + input : Table + Table to pack. + + Returns + ------- + PackedColumns + The packed columns. + """ + return PackedColumns.from_libcudf( + make_unique[packed_columns](cpp_pack(input.view())) + ) + + +cpdef Table unpack(PackedColumns input): + """Deserialize the result of `pack`. + + Copies the result of a serialized table into a table. + Contrary to the libcudf C++ function, the returned table is a copy + of the serialized data. + + For details, see :cpp:func:`cudf::unpack`. + + Parameters + ---------- + input : PackedColumns + The packed columns to unpack. + + Returns + ------- + Table + Copy of the packed columns. + """ + cdef table_view v = cpp_unpack(dereference(input.c_obj)) + # Since `Table.from_table_view` doesn't support an arbitrary owning object, + # we copy the table, see . + cdef unique_ptr[table] t = make_unique[table](v) + return Table.from_libcudf(move(t)) + + +cpdef Table unpack_from_memoryviews(memoryview metadata, gpumemoryview gpu_data): + """Deserialize the result of `pack`. + + Copies the result of a serialized table into a table. + Contrary to the libcudf C++ function, the returned table is a copy + of the serialized data. + + For details, see :cpp:func:`cudf::unpack`. + + Parameters + ---------- + metadata : memoryview + The packed metadata to unpack. + gpu_data : gpumemoryview + The packed gpu_data to unpack. + + Returns + ------- + Table + Copy of the packed columns. + """ + if metadata.nbytes == 0: + if gpu_data.__cuda_array_interface__["data"][0] != 0: + raise ValueError("Expected an empty gpu_data from unpacking an empty table") + return Table.from_libcudf(make_unique[table](table_view())) + + # Extract the raw data pointers + cdef const uint8_t[::1] _metadata = metadata + cdef const uint8_t* metadata_ptr = &_metadata[0] + cdef const uint8_t* gpu_data_ptr = int_to_void_ptr(gpu_data.ptr) + + cdef table_view v = cpp_unpack(metadata_ptr, gpu_data_ptr) + # Since `Table.from_table_view` doesn't support an arbitrary owning object, + # we copy the table, see . + cdef unique_ptr[table] t = make_unique[table](v) + return Table.from_libcudf(move(t)) diff --git a/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd index 6de9c4382d3..12090af16cc 100644 --- a/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd @@ -26,3 +26,8 @@ cdef extern from "cudf/contiguous_split.hpp" namespace "cudf" nogil: cdef packed_columns pack (const table_view& input) except + cdef table_view unpack (const packed_columns& input) except + + + cdef table_view unpack ( + const uint8_t* metadata, + const uint8_t* gpu_data + ) except + diff --git a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py new file mode 100644 index 00000000000..7a5c1664eed --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py @@ -0,0 +1,49 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
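+#
+# Round-trips tables through pack/unpack, both as a PackedColumns object and
+# via the released (metadata, gpu_data) memoryviews.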
+ +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_table_eq + +param_pyarrow_tables = [ + pa.table([]), + pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}), + pa.table({"a": [1, 2, 3]}), + pa.table({"a": [1], "b": [2], "c": [3]}), + pa.table({"a": ["a", "bb", "ccc"]}), + pa.table({"a": [1, 2, None], "b": [None, 3, 4]}), + pa.table( + { + "a": [["a", "b"], ["cde"]], + "b": [ + {"alpha": [1, 2], "beta": None}, + {"alpha": [3, 4], "beta": 5}, + ], + } + ), +] + + +@pytest.mark.parametrize("arrow_tbl", param_pyarrow_tables) +def test_pack_and_unpack(arrow_tbl): + plc_tbl = plc.interop.from_arrow(arrow_tbl) + packed = plc.contiguous_split.pack(plc_tbl) + + res = plc.contiguous_split.unpack(packed) + assert_table_eq(arrow_tbl, res) + + +@pytest.mark.parametrize("arrow_tbl", param_pyarrow_tables) +def test_pack_and_unpack_from_memoryviews(arrow_tbl): + plc_tbl = plc.interop.from_arrow(arrow_tbl) + packed = plc.contiguous_split.pack(plc_tbl) + + metadata, gpudata = packed.release() + + with pytest.raises(ValueError, match="Cannot release empty"): + packed.release() + + del packed # `metadata` and `gpudata` will survive + + res = plc.contiguous_split.unpack_from_memoryviews(metadata, gpudata) + assert_table_eq(arrow_tbl, res) From 66a94c3d025931b50b08a8a7bdda3363904dbef4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Oct 2024 08:41:47 -0700 Subject: [PATCH 054/117] Replace deprecated cuco APIs with updated versions (#17052) This PR replaces the deprecated cuco APIs with the new ones, ensuring the code is up to date with the latest API changes. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Nghia Truong (https://github.com/ttnghia) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/17052 --- cpp/src/io/parquet/chunk_dict.cu | 4 ++-- cpp/src/join/mixed_join_kernels_semi.cu | 2 +- cpp/src/join/mixed_join_semi.cu | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 17ccb73c0a8..1a2a9eac17d 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -84,7 +84,7 @@ struct map_insert_fn { storage_ref}; // Create a map ref with `cuco::insert` operator - auto map_insert_ref = hash_map_ref.with_operators(cuco::insert); + auto map_insert_ref = hash_map_ref.rebind_operators(cuco::insert); auto const t = threadIdx.x; // Create atomic refs to the current chunk's num_dict_entries and uniq_data_size @@ -186,7 +186,7 @@ struct map_find_fn { storage_ref}; // Create a map ref with `cuco::find` operator - auto const map_find_ref = hash_map_ref.with_operators(cuco::find); + auto const map_find_ref = hash_map_ref.rebind_operators(cuco::find); auto const t = threadIdx.x; // Note: Adjust the following loop to use `cg::tiles` if needed in the future. 
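The rename is mechanical throughout this patch; a minimal sketch of the
before/after pattern, where `map_ref`, `set_ref`, `row_set`, `custom_equal`,
and `hasher` are placeholder names rather than symbols from these files:

    // deprecated cuco spellings
    auto insert_ref = map_ref.with_operators(cuco::insert);
    auto eq_ref     = set_ref.with_key_eq(custom_equal);
    auto probe_ref  = row_set.ref(cuco::contains).with_hash_function(hasher);

    // current spellings, as used in the hunks above and below
    auto insert_ref2 = map_ref.rebind_operators(cuco::insert);
    auto eq_ref2     = set_ref.rebind_key_eq(custom_equal);
    auto probe_ref2  = row_set.ref(cuco::contains).rebind_hash_function(hasher);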
diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index bd8c80652a0..a4ec97af235 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -67,7 +67,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) evaluator, thread_intermediate_storage, swap_tables, equality_probe}; // Create set ref with the new equality comparator - auto const set_ref_equality = set_ref.with_key_eq(equality); + auto const set_ref_equality = set_ref.rebind_key_eq(equality); // Total number of rows to query the set auto const outer_num_rows = left_table.num_rows(); diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index 83a55eca50f..62ba558b0bd 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -184,7 +184,8 @@ std::unique_ptr> mixed_join_semi( auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; auto const hash_probe = row_hash.device_hasher(has_nulls); - hash_set_ref_type const row_set_ref = row_set.ref(cuco::contains).with_hash_function(hash_probe); + hash_set_ref_type const row_set_ref = + row_set.ref(cuco::contains).rebind_hash_function(hash_probe); // Vector used to indicate indices from left/probe table which are present in output auto left_table_keep_mask = rmm::device_uvector(probe.num_rows(), stream); From 349010e75f94488c6385f773bbca872ccc5f34b6 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 11 Oct 2024 08:42:12 -0700 Subject: [PATCH 055/117] Remove unused hash helper functions (#17056) This PR removes unused hash detail implementations. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17056 --- .../cudf/hashing/detail/helper_functions.cuh | 194 ------------------ 1 file changed, 194 deletions(-) diff --git a/cpp/include/cudf/hashing/detail/helper_functions.cuh b/cpp/include/cudf/hashing/detail/helper_functions.cuh index 3489fdeccee..ea1accc62a4 100644 --- a/cpp/include/cudf/hashing/detail/helper_functions.cuh +++ b/cpp/include/cudf/hashing/detail/helper_functions.cuh @@ -47,197 +47,3 @@ inline size_t compute_hash_table_size(cudf::size_type num_keys_to_insert, return hash_table_size; } - -template -__forceinline__ __device__ pair_type load_pair_vectorized(pair_type const* __restrict__ const ptr) -{ - if (sizeof(uint4) == sizeof(pair_type)) { - union pair_type2vec_type { - uint4 vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0, 0, 0, 0}; - converter.vec_val = *reinterpret_cast(ptr); - return converter.pair_val; - } else if (sizeof(uint2) == sizeof(pair_type)) { - union pair_type2vec_type { - uint2 vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0, 0}; - converter.vec_val = *reinterpret_cast(ptr); - return converter.pair_val; - } else if (sizeof(int) == sizeof(pair_type)) { - union pair_type2vec_type { - int vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0}; - converter.vec_val = *reinterpret_cast(ptr); - return converter.pair_val; - } else if (sizeof(short) == sizeof(pair_type)) { - union pair_type2vec_type { - short vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0}; - converter.vec_val = *reinterpret_cast(ptr); - return converter.pair_val; - } else { - return *ptr; - } -} - -template -__forceinline__ __device__ void store_pair_vectorized(pair_type* __restrict__ const ptr, - 
pair_type const val) -{ - if (sizeof(uint4) == sizeof(pair_type)) { - union pair_type2vec_type { - uint4 vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0, 0, 0, 0}; - converter.pair_val = val; - *reinterpret_cast(ptr) = converter.vec_val; - } else if (sizeof(uint2) == sizeof(pair_type)) { - union pair_type2vec_type { - uint2 vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0, 0}; - converter.pair_val = val; - *reinterpret_cast(ptr) = converter.vec_val; - } else if (sizeof(int) == sizeof(pair_type)) { - union pair_type2vec_type { - int vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0}; - converter.pair_val = val; - *reinterpret_cast(ptr) = converter.vec_val; - } else if (sizeof(short) == sizeof(pair_type)) { - union pair_type2vec_type { - short vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0}; - converter.pair_val = val; - *reinterpret_cast(ptr) = converter.vec_val; - } else { - *ptr = val; - } -} - -template -CUDF_KERNEL void init_hashtbl(value_type* __restrict__ const hashtbl_values, - size_type const n, - key_type const key_val, - elem_type const elem_val) -{ - size_type const idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < n) { - store_pair_vectorized(hashtbl_values + idx, thrust::make_pair(key_val, elem_val)); - } -} - -template -struct equal_to { - using result_type = bool; - using first_argument_type = T; - using second_argument_type = T; - __forceinline__ __host__ __device__ constexpr bool operator()( - first_argument_type const& lhs, second_argument_type const& rhs) const - { - return lhs == rhs; - } -}; - -template -class cycle_iterator_adapter { - public: - using value_type = typename std::iterator_traits::value_type; - using difference_type = typename std::iterator_traits::difference_type; - using pointer = typename std::iterator_traits::pointer; - using reference = typename std::iterator_traits::reference; - using iterator_type = Iterator; - - cycle_iterator_adapter() = delete; - - __host__ __device__ explicit cycle_iterator_adapter(iterator_type const& begin, - iterator_type const& end, - iterator_type const& current) - : m_begin(begin), m_end(end), m_current(current) - { - } - - __host__ __device__ cycle_iterator_adapter& operator++() - { - if (m_end == (m_current + 1)) - m_current = m_begin; - else - ++m_current; - return *this; - } - - __host__ __device__ cycle_iterator_adapter const& operator++() const - { - if (m_end == (m_current + 1)) - m_current = m_begin; - else - ++m_current; - return *this; - } - - __host__ __device__ cycle_iterator_adapter& operator++(int) - { - cycle_iterator_adapter old(m_begin, m_end, m_current); - if (m_end == (m_current + 1)) - m_current = m_begin; - else - ++m_current; - return old; - } - - __host__ __device__ cycle_iterator_adapter const& operator++(int) const - { - cycle_iterator_adapter old(m_begin, m_end, m_current); - if (m_end == (m_current + 1)) - m_current = m_begin; - else - ++m_current; - return old; - } - - __host__ __device__ bool equal(cycle_iterator_adapter const& other) const - { - return m_current == other.m_current && m_begin == other.m_begin && m_end == other.m_end; - } - - __host__ __device__ reference& operator*() { return *m_current; } - - __host__ __device__ reference const& operator*() const { return *m_current; } - - __host__ __device__ const pointer operator->() const { return m_current.operator->(); } - - __host__ __device__ pointer operator->() { return m_current; } - - private: - iterator_type m_current; - 
iterator_type m_begin; - iterator_type m_end; -}; - -template -__host__ __device__ bool operator==(cycle_iterator_adapter const& lhs, - cycle_iterator_adapter const& rhs) -{ - return lhs.equal(rhs); -} - -template -__host__ __device__ bool operator!=(cycle_iterator_adapter const& lhs, - cycle_iterator_adapter const& rhs) -{ - return !lhs.equal(rhs); -} From 891e5aa7bf00355dea2b10906cebbe02f9ba25f5 Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Fri, 11 Oct 2024 12:18:18 -0400 Subject: [PATCH 056/117] Organize parquet reader mukernel non-nullable code, introduce manual block scans (#16830) This is a collection of a few small optimizations and tweaks for the parquet reader fixed-width mukernels (flat & nested, lists not implemented yet). The benchmark changes are negligible, this is mainly cleanup and code in preparation for the upcoming list mukernel. 1) If not reading the whole page (chunked reads) exit sooner 2) By having each thread keep track of the current valid_count (and not saving-to or reading-from the nesting_info until the end), we don't need to synchronize the block threads as frequently, so these extra syncs are removed. 3) For (non-list) nested columns that aren't nullable, we don't need to loop over the whole nesting depth; only the last level of nesting is used. After removing this loop, the non-nullable code for nested and flat hierarchies is identical, so they're extracted and consolidated into a new function. 4) When doing block scans in the parquet reader we also need to know the per-warp results of the scan. Because cub doesn't return those, we then do an additional warp-wide ballot that is unnecessary. This introduces code that does a block scan manually, saving the intermediate results. However using this code in the flat & nested kernels uses 8 more registers, so it isn't used yet. 5) By doing an exclusive-scan instead of an inclusive-scan, we don't need the extra "- 1's" that were everywhere. Authors: - Paul Mattione (https://github.com/pmattione-nvidia) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/16830 --- cpp/src/io/parquet/decode_fixed.cu | 389 +++++++++++++++++------------ 1 file changed, 235 insertions(+), 154 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 8a866141c4b..4522ea7fe56 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -24,6 +24,59 @@ namespace cudf::io::parquet::detail { namespace { +// Unlike cub's algorithm, this provides warp-wide and block-wide results simultaneously. +// Also, this provides the ability to compute warp_bits & lane_mask manually, which we need for +// lists. 
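+//
+// Illustrative use (names here are hypothetical; per the PR description this
+// helper is not yet wired into the flat/nested kernels):
+//   block_scan_results results;
+//   scan_block_exclusive_sum<decode_block_size>(is_valid, results);
+//   int const thread_offset = results.thread_count_within_block;  // exclusive prefix
+//   int const total_valid   = results.block_count;                // block-wide total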
+struct block_scan_results { + uint32_t warp_bits; + int thread_count_within_warp; + int warp_count; + + int thread_count_within_block; + int block_count; +}; + +template +static __device__ void scan_block_exclusive_sum(int thread_bit, block_scan_results& results) +{ + int const t = threadIdx.x; + int const warp_index = t / cudf::detail::warp_size; + int const warp_lane = t % cudf::detail::warp_size; + uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1; + + uint32_t warp_bits = ballot(thread_bit); + scan_block_exclusive_sum(warp_bits, warp_lane, warp_index, lane_mask, results); +} + +template +__device__ static void scan_block_exclusive_sum(uint32_t warp_bits, + int warp_lane, + int warp_index, + uint32_t lane_mask, + block_scan_results& results) +{ + // Compute # warps + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + + // Compute the warp-wide results + results.warp_bits = warp_bits; + results.warp_count = __popc(results.warp_bits); + results.thread_count_within_warp = __popc(results.warp_bits & lane_mask); + + // Share the warp counts amongst the block threads + __shared__ int warp_counts[num_warps]; + if (warp_lane == 0) { warp_counts[warp_index] = results.warp_count; } + __syncthreads(); + + // Compute block-wide results + results.block_count = 0; + results.thread_count_within_block = results.thread_count_within_warp; + for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { + results.block_count += warp_counts[warp_idx]; + if (warp_idx < warp_index) { results.thread_count_within_block += warp_counts[warp_idx]; } + } +} + template __device__ inline void gpuDecodeFixedWidthValues( page_state_s* s, state_buf* const sb, int start, int end, int t) @@ -194,7 +247,7 @@ struct decode_fixed_width_split_values_func { } }; -template +template static __device__ int gpuUpdateValidityAndRowIndicesNested( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) { @@ -211,29 +264,28 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const row_index_lower_bound = s->row_index_lower_bound; - int const max_depth = s->col.max_nesting_depth - 1; + int const max_depth = s->col.max_nesting_depth - 1; + auto& max_depth_ni = s->nesting_info[max_depth]; + int max_depth_valid_count = max_depth_ni.valid_count; + __syncthreads(); while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - // definition level. only need to process for nullable columns - int d = 0; - if constexpr (nullable) { - if (def) { - d = t < batch_size - ? static_cast(def[rolling_index(value_count + t)]) - : -1; - } else { - d = t < batch_size ? 
1 : -1; - } + // definition level + int d = 1; + if (t >= batch_size) { + d = -1; + } else if (def) { + d = static_cast(def[rolling_index(value_count + t)]); } - int const thread_value_count = t + 1; + int const thread_value_count = t; int const block_value_count = batch_size; // compute our row index, whether we're in row bounds, and validity - int const row_index = (thread_value_count + value_count) - 1; + int const row_index = thread_value_count + value_count; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store @@ -242,90 +294,75 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( for (int d_idx = 0; d_idx <= max_depth; d_idx++) { auto& ni = s->nesting_info[d_idx]; - int is_valid; - if constexpr (nullable) { - is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 1 : 0; - } else { - is_valid = in_row_bounds; - } + int const is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 1 : 0; // thread and block validity count + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; int thread_valid_count, block_valid_count; - if constexpr (nullable) { - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; - block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); - __syncthreads(); - - // validity is processed per-warp - // - // nested schemas always read and write to the same bounds (that is, read and write - // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading - // at the first value, even if that is before first_row, because we cannot trivially jump to - // the correct position to start reading. since we are about to write the validity vector - // here we need to adjust our computed mask to take into account the write row bounds. - int warp_null_count = 0; - if (write_start >= 0 && ni.valid_map != nullptr) { - int const valid_map_offset = ni.valid_map_offset; - uint32_t const warp_validity_mask = ballot(is_valid); - // lane 0 from each warp writes out validity - if ((t % cudf::detail::warp_size) == 0) { - int const vindex = - (value_count + thread_value_count) - 1; // absolute input value index - int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map - int const write_end = cudf::detail::warp_size - - __clz(in_write_row_bounds); // last bit in the warp to store - int const bit_count = write_end - write_start; - warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); - - store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); - } - } + block_scan(scan_storage).ExclusiveSum(is_valid, thread_valid_count, block_valid_count); - // sum null counts. we have to do it this way instead of just incrementing by (value_count - - // valid_count) because valid_count also includes rows that potentially start before our row - // bounds. if we could come up with a way to clean that up, we could remove this and just - // compute it directly at the end of the kernel. 
- size_type const block_null_count = - cudf::detail::single_lane_block_sum_reduce(warp_null_count); - if (t == 0) { ni.null_count += block_null_count; } - } - // trivial for non-nullable columns - else { - thread_valid_count = thread_value_count; - block_valid_count = block_value_count; + // validity is processed per-warp + // + // nested schemas always read and write to the same bounds (that is, read and write + // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading + // at the first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector + // here we need to adjust our computed mask to take into account the write row bounds. + int warp_null_count = 0; + if (ni.valid_map != nullptr) { + uint32_t const warp_validity_mask = ballot(is_valid); + // lane 0 from each warp writes out validity + if ((write_start >= 0) && ((t % cudf::detail::warp_size) == 0)) { + int const valid_map_offset = ni.valid_map_offset; + int const vindex = value_count + thread_value_count; // absolute input value index + int const bit_offset = (valid_map_offset + vindex + write_start) - + first_row; // absolute bit offset into the output validity map + int const write_end = + cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store + int const bit_count = write_end - write_start; + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); + + store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); + } } + // sum null counts. we have to do it this way instead of just incrementing by (value_count - + // valid_count) because valid_count also includes rows that potentially start before our row + // bounds. if we could come up with a way to clean that up, we could remove this and just + // compute it directly at the end of the kernel. 
+ size_type const block_null_count = + cudf::detail::single_lane_block_sum_reduce(warp_null_count); + if (t == 0) { ni.null_count += block_null_count; } + // if this is valid and we're at the leaf, output dst_pos - __syncthreads(); // handle modification of ni.value_count from below - if (is_valid && d_idx == max_depth) { - // for non-list types, the value count is always the same across - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (ni.valid_count + thread_valid_count) - 1; - sb->nz_idx[rolling_index(src_pos)] = dst_pos; + if (d_idx == max_depth) { + if (is_valid) { + int const dst_pos = value_count + thread_value_count; + int const src_pos = max_depth_valid_count + thread_valid_count; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + // update stuff + max_depth_valid_count += block_valid_count; } - __syncthreads(); // handle modification of ni.value_count from below - // update stuff - if (t == 0) { ni.valid_count += block_valid_count; } - } + } // end depth loop value_count += block_value_count; - } + } // end loop if (t == 0) { // update valid value count for decoding and total # of values we've processed - s->nz_count = s->nesting_info[max_depth].valid_count; - s->input_value_count = value_count; - s->input_row_count = value_count; + max_depth_ni.valid_count = max_depth_valid_count; + s->nz_count = max_depth_valid_count; + s->input_value_count = value_count; + s->input_row_count = value_count; } - __syncthreads(); - return s->nesting_info[max_depth].valid_count; + return max_depth_valid_count; } -template +template static __device__ int gpuUpdateValidityAndRowIndicesFlat( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) { @@ -351,83 +388,67 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - // definition level. only need to process for nullable columns - int d = 0; - if constexpr (nullable) { - if (def) { - d = t < batch_size - ? static_cast(def[rolling_index(value_count + t)]) - : -1; - } else { - d = t < batch_size ? 1 : -1; - } - } - - int const thread_value_count = t + 1; + int const thread_value_count = t; int const block_value_count = batch_size; // compute our row index, whether we're in row bounds, and validity - int const row_index = (thread_value_count + value_count) - 1; + int const row_index = thread_value_count + value_count; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + + // use definition level & row bounds to determine if is valid int is_valid; - if constexpr (nullable) { - is_valid = ((d > 0) && in_row_bounds) ? 1 : 0; + if (t >= batch_size) { + is_valid = 0; + } else if (def) { + int const def_level = + static_cast(def[rolling_index(value_count + t)]); + is_valid = ((def_level > 0) && in_row_bounds) ? 
1 : 0; } else { is_valid = in_row_bounds; } // thread and block validity count + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; int thread_valid_count, block_valid_count; - if constexpr (nullable) { - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; - block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); - __syncthreads(); - - // validity is processed per-warp - // - // nested schemas always read and write to the same bounds (that is, read and write - // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading - // at the first value, even if that is before first_row, because we cannot trivially jump to - // the correct position to start reading. since we are about to write the validity vector - // here we need to adjust our computed mask to take into account the write row bounds. - int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); - int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store - int warp_null_count = 0; - if (write_start >= 0) { - uint32_t const warp_validity_mask = ballot(is_valid); - // lane 0 from each warp writes out validity - if ((t % cudf::detail::warp_size) == 0) { - int const vindex = (value_count + thread_value_count) - 1; // absolute input value index - int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map - int const write_end = - cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store - int const bit_count = write_end - write_start; - warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); - - store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); - } - } - - // sum null counts. we have to do it this way instead of just incrementing by (value_count - - // valid_count) because valid_count also includes rows that potentially start before our row - // bounds. if we could come up with a way to clean that up, we could remove this and just - // compute it directly at the end of the kernel. - size_type const block_null_count = - cudf::detail::single_lane_block_sum_reduce(warp_null_count); - if (t == 0) { ni.null_count += block_null_count; } - } - // trivial for non-nullable columns - else { - thread_valid_count = thread_value_count; - block_valid_count = block_value_count; + block_scan(scan_storage).ExclusiveSum(is_valid, thread_valid_count, block_valid_count); + uint32_t const warp_validity_mask = ballot(is_valid); + + // validity is processed per-warp + // + // nested schemas always read and write to the same bounds (that is, read and write + // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading + // at the first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector + // here we need to adjust our computed mask to take into account the write row bounds. 
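+    // Worked example (illustrative): if only lanes 3..30 of this warp fall inside
+    // [first_row, last_row), then in_write_row_bounds has bits 3..30 set, so
+    // write_start == 3, write_end == 31, and lane 0 stores bit_count == 28 bits.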
+ int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); + int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store + int warp_null_count = 0; + // lane 0 from each warp writes out validity + if ((write_start >= 0) && ((t % cudf::detail::warp_size) == 0)) { + int const vindex = value_count + thread_value_count; // absolute input value index + int const bit_offset = (valid_map_offset + vindex + write_start) - + first_row; // absolute bit offset into the output validity map + int const write_end = + cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store + int const bit_count = write_end - write_start; + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); + + store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); } + // sum null counts. we have to do it this way instead of just incrementing by (value_count - + // valid_count) because valid_count also includes rows that potentially start before our row + // bounds. if we could come up with a way to clean that up, we could remove this and just + // compute it directly at the end of the kernel. + size_type const block_null_count = + cudf::detail::single_lane_block_sum_reduce(warp_null_count); + if (t == 0) { ni.null_count += block_null_count; } + // output offset if (is_valid) { - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (valid_count + thread_valid_count) - 1; + int const dst_pos = value_count + thread_value_count; + int const src_pos = valid_count + thread_valid_count; sb->nz_idx[rolling_index(src_pos)] = dst_pos; } @@ -448,6 +469,70 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( return valid_count; } +template +static __device__ int gpuUpdateValidityAndRowIndicesNonNullable(int32_t target_value_count, + page_state_s* s, + state_buf* sb, + int t) +{ + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + constexpr int max_batch_size = num_warps * cudf::detail::warp_size; + + // cap by last row so that we don't process any rows past what we want to output. 
+ int const first_row = s->first_row; + int const last_row = first_row + s->num_rows; + int const capped_target_value_count = min(target_value_count, last_row); + int const row_index_lower_bound = s->row_index_lower_bound; + + // how many (input) values we've processed in the page so far + int value_count = s->input_value_count; + + int const max_depth = s->col.max_nesting_depth - 1; + auto& ni = s->nesting_info[max_depth]; + int valid_count = ni.valid_count; + + __syncthreads(); + + while (value_count < capped_target_value_count) { + int const batch_size = min(max_batch_size, capped_target_value_count - value_count); + + int const thread_value_count = t; + int const block_value_count = batch_size; + + // compute our row index, whether we're in row bounds, and validity + int const row_index = thread_value_count + value_count; + int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + + int const is_valid = in_row_bounds; + int const thread_valid_count = thread_value_count; + int const block_valid_count = block_value_count; + + // if this is valid and we're at the leaf, output dst_pos + if (is_valid) { + // for non-list types, the value count is always the same across + int const dst_pos = value_count + thread_value_count; + int const src_pos = valid_count + thread_valid_count; + + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + + // update stuff + value_count += block_value_count; + valid_count += block_valid_count; + } // end loop + + if (t == 0) { + // update valid value count for decoding and total # of values we've processed + ni.valid_count = valid_count; + ni.value_count = value_count; + s->nz_count = valid_count; + s->input_value_count = value_count; + s->input_row_count = value_count; + } + + return valid_count; +} + // is the page marked nullable or not __device__ inline bool is_nullable(page_state_s* s) { @@ -605,7 +690,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) int valid_count = 0; // the core loop. decode batches of level stream data using rle_stream objects // and pass the results to gpuDecodeValues - while (s->error == 0 && processed_count < s->page.num_input_values) { + // For chunked reads we may not process all of the rows on the page; if not stop early + int last_row = s->first_row + s->num_rows; + while ((s->error == 0) && (processed_count < s->page.num_input_values) && + (s->input_row_count <= last_row)) { int next_valid_count; // only need to process definition levels if this is a nullable column @@ -614,10 +702,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) __syncthreads(); if constexpr (has_nesting_t) { - next_valid_count = gpuUpdateValidityAndRowIndicesNested( + next_valid_count = gpuUpdateValidityAndRowIndicesNested( processed_count, s, sb, def, t); } else { - next_valid_count = gpuUpdateValidityAndRowIndicesFlat( + next_valid_count = gpuUpdateValidityAndRowIndicesFlat( processed_count, s, sb, def, t); } } @@ -626,15 +714,8 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // nz_idx. gpuDecodeFixedWidthValues would be the only work that happens. 
    else {
      processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count);
-
-      if constexpr (has_nesting_t) {
-        next_valid_count =
-          gpuUpdateValidityAndRowIndicesNested<false, decode_block_size_t, level_t>(
-            processed_count, s, sb, nullptr, t);
-      } else {
-        next_valid_count = gpuUpdateValidityAndRowIndicesFlat<false, decode_block_size_t, level_t>(
-          processed_count, s, sb, nullptr, t);
-      }
+      next_valid_count =
+        gpuUpdateValidityAndRowIndicesNonNullable<decode_block_size_t>(processed_count, s, sb, t);
     }
     __syncthreads();

From 0b840bb0deeffffba8875f5a49395b13334f4f98 Mon Sep 17 00:00:00 2001
From: Hirota Akio <33370421+a-hirota@users.noreply.github.com>
Date: Sat, 12 Oct 2024 02:04:52 +0900
Subject: [PATCH 057/117] docs: change 'CSV' to 'csv' in
 python/custreamz/README.md to match kafka.py (#17041)

This PR corrects a typo in the `python/custreamz/README.md` file by changing
the uppercase `'CSV'` to lowercase `'csv'`. This change aligns the
documentation with the `message_format` options defined in
`python/custreamz/custreamz/kafka.py`, ensuring consistency across the
codebase.

Authors:
  - Hirota Akio (https://github.com/a-hirota)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/17041
---
 python/custreamz/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/custreamz/README.md b/python/custreamz/README.md
index 8da17ef09dc..e81fc35c544 100644
--- a/python/custreamz/README.md
+++ b/python/custreamz/README.md
@@ -26,7 +26,7 @@ tips_df = consumer.read_gdf(topic="custreamz_tips",
                             partition=0,
                             start=0,
                             end=10000,
-                            message_format="CSV")
+                            message_format="csv")
 print(tips_df.head())
 
 tips_df['tip_percentage'] = tips_df['tip'] / tips_df['total_bill'] * 100

From b8f3e2100cff86cb48d23200b8250ecfc8714433 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Fri, 11 Oct 2024 12:23:37 -0500
Subject: [PATCH 058/117] Reorganize `cudf_polars` expression code (#17014)

This PR breaks the unwieldy `expr.py` monolith up into a set of smaller,
focused modules under `cudf_polars/dsl/expressions/`.
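As a sketch of the resulting layout (illustrative; only the module and class
names below are taken from this diff), call sites can either keep importing
from `cudf_polars.dsl.expr`, which now just re-exports the classes, or import
from the new submodules directly:

    # Both names refer to the same class object after this change.
    from cudf_polars.dsl.expr import BinOp as compat_BinOp   # re-export for existing imports
    from cudf_polars.dsl.expressions.binaryop import BinOp   # new canonical location

    assert compat_BinOp is BinOp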
Authors: - https://github.com/brandon-b-miller Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17014 --- python/cudf_polars/cudf_polars/dsl/expr.py | 1826 +---------------- .../cudf_polars/dsl/expressions/__init__.py | 8 + .../dsl/expressions/aggregation.py | 229 +++ .../cudf_polars/dsl/expressions/base.py | 334 +++ .../cudf_polars/dsl/expressions/binaryop.py | 135 ++ .../cudf_polars/dsl/expressions/boolean.py | 269 +++ .../cudf_polars/dsl/expressions/datetime.py | 132 ++ .../cudf_polars/dsl/expressions/literal.py | 88 + .../cudf_polars/dsl/expressions/rolling.py | 40 + .../cudf_polars/dsl/expressions/selection.py | 91 + .../cudf_polars/dsl/expressions/sorting.py | 97 + .../cudf_polars/dsl/expressions/string.py | 283 +++ .../cudf_polars/dsl/expressions/ternary.py | 53 + .../cudf_polars/dsl/expressions/unary.py | 328 +++ 14 files changed, 2108 insertions(+), 1805 deletions(-) create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/__init__.py create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/base.py create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/boolean.py create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/datetime.py create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/literal.py create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/rolling.py create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/selection.py create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/sorting.py create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/string.py create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/ternary.py create mode 100644 python/cudf_polars/cudf_polars/dsl/expressions/unary.py diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index f7775ceb238..e748ec16f14 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -15,33 +15,30 @@ from __future__ import annotations -import enum -from enum import IntEnum -from functools import partial, reduce -from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple - -import pyarrow as pa -import pyarrow.compute as pc -import pylibcudf as plc - -from polars.exceptions import InvalidOperationError -from polars.polars import _expr_nodes as pl_expr - -from cudf_polars.containers import Column -from cudf_polars.utils import dtypes, sorting - -if TYPE_CHECKING: - from collections.abc import Mapping, Sequence - - import polars as pl - import polars.type_aliases as pl_types - - from cudf_polars.containers import DataFrame +from cudf_polars.dsl.expressions.aggregation import Agg +from cudf_polars.dsl.expressions.base import ( + AggInfo, + Col, + Expr, + NamedExpr, +) +from cudf_polars.dsl.expressions.binaryop import BinOp +from cudf_polars.dsl.expressions.boolean import BooleanFunction +from cudf_polars.dsl.expressions.datetime import TemporalFunction +from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn +from cudf_polars.dsl.expressions.rolling import GroupedRollingWindow, RollingWindow +from cudf_polars.dsl.expressions.selection import Filter, Gather +from cudf_polars.dsl.expressions.sorting import Sort, SortBy +from cudf_polars.dsl.expressions.string import 
StringFunction +from cudf_polars.dsl.expressions.ternary import Ternary +from cudf_polars.dsl.expressions.unary import Cast, Len, UnaryFunction __all__ = [ "Expr", "NamedExpr", "Literal", + "LiteralColumn", + "Len", "Col", "BooleanFunction", "StringFunction", @@ -54,1789 +51,8 @@ "GroupedRollingWindow", "Cast", "Agg", + "AggInfo", "Ternary", "BinOp", + "UnaryFunction", ] - - -class ExecutionContext(IntEnum): - FRAME = enum.auto() - GROUPBY = enum.auto() - ROLLING = enum.auto() - - -class AggInfo(NamedTuple): - requests: list[tuple[Expr | None, plc.aggregation.Aggregation, Expr]] - - -class Expr: - """ - An abstract expression object. - - This contains a (potentially empty) tuple of child expressions, - along with non-child data. For uniform reconstruction and - implementation of hashing and equality schemes, child classes need - to provide a certain amount of metadata when they are defined. - Specifically, the ``_non_child`` attribute must list, in-order, - the names of the slots that are passed to the constructor. The - constructor must take arguments in the order ``(*_non_child, - *children).`` - """ - - __slots__ = ("dtype", "_hash_value", "_repr_value") - dtype: plc.DataType - """Data type of the expression.""" - _hash_value: int - """Caching slot for the hash of the expression.""" - _repr_value: str - """Caching slot for repr of the expression.""" - children: tuple[Expr, ...] = () - """Children of the expression.""" - _non_child: ClassVar[tuple[str, ...]] = ("dtype",) - """Names of non-child data (not Exprs) for reconstruction.""" - - # Constructor must take arguments in order (*_non_child, *children) - def __init__(self, dtype: plc.DataType) -> None: - self.dtype = dtype - - def _ctor_arguments(self, children: Sequence[Expr]) -> Sequence: - return (*(getattr(self, attr) for attr in self._non_child), *children) - - def get_hash(self) -> int: - """ - Return the hash of this expr. - - Override this in subclasses, rather than __hash__. - - Returns - ------- - The integer hash value. - """ - return hash((type(self), self._ctor_arguments(self.children))) - - def __hash__(self) -> int: - """Hash of an expression with caching.""" - try: - return self._hash_value - except AttributeError: - self._hash_value = self.get_hash() - return self._hash_value - - def is_equal(self, other: Any) -> bool: - """ - Equality of two expressions. - - Override this in subclasses, rather than __eq__. - - Parameter - --------- - other - object to compare to - - Returns - ------- - True if the two expressions are equal, false otherwise. 
- """ - if type(self) is not type(other): - return False # pragma: no cover; __eq__ trips first - return self._ctor_arguments(self.children) == other._ctor_arguments( - other.children - ) - - def __eq__(self, other: Any) -> bool: - """Equality of expressions.""" - if type(self) is not type(other) or hash(self) != hash(other): - return False - else: - return self.is_equal(other) - - def __ne__(self, other: Any) -> bool: - """Inequality of expressions.""" - return not self.__eq__(other) - - def __repr__(self) -> str: - """String representation of an expression with caching.""" - try: - return self._repr_value - except AttributeError: - args = ", ".join(f"{arg!r}" for arg in self._ctor_arguments(self.children)) - self._repr_value = f"{type(self).__name__}({args})" - return self._repr_value - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """ - Evaluate this expression given a dataframe for context. - - Parameters - ---------- - df - DataFrame that will provide columns. - context - What context are we performing this evaluation in? - mapping - Substitution mapping from expressions to Columns, used to - override the evaluation of a given expression if we're - performing a simple rewritten evaluation. - - Notes - ----- - Do not call this function directly, but rather - :meth:`evaluate` which handles the mapping lookups. - - Returns - ------- - Column representing the evaluation of the expression. - - Raises - ------ - NotImplementedError - If we couldn't evaluate the expression. Ideally all these - are returned during translation to the IR, but for now we - are not perfect. - """ - raise NotImplementedError( - f"Evaluation of expression {type(self).__name__}" - ) # pragma: no cover; translation of unimplemented nodes trips first - - def evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """ - Evaluate this expression given a dataframe for context. - - Parameters - ---------- - df - DataFrame that will provide columns. - context - What context are we performing this evaluation in? - mapping - Substitution mapping from expressions to Columns, used to - override the evaluation of a given expression if we're - performing a simple rewritten evaluation. - - Notes - ----- - Individual subclasses should implement :meth:`do_evaluate`, - this method provides logic to handle lookups in the - substitution mapping. - - Returns - ------- - Column representing the evaluation of the expression. - - Raises - ------ - NotImplementedError - If we couldn't evaluate the expression. Ideally all these - are returned during translation to the IR, but for now we - are not perfect. - """ - if mapping is None: - return self.do_evaluate(df, context=context, mapping=mapping) - try: - return mapping[self] - except KeyError: - return self.do_evaluate(df, context=context, mapping=mapping) - - def collect_agg(self, *, depth: int) -> AggInfo: - """ - Collect information about aggregations in groupbys. - - Parameters - ---------- - depth - The depth of aggregating (reduction or sampling) - expressions we are currently at. - - Returns - ------- - Aggregation info describing the expression to aggregate in the - groupby. - - Raises - ------ - NotImplementedError - If we can't currently perform the aggregation request, for - example nested aggregations like ``a.max().min()``. 
- """ - raise NotImplementedError( - f"Collecting aggregation info for {type(self).__name__}" - ) # pragma: no cover; check_agg trips first - - -class NamedExpr: - # NamedExpr does not inherit from Expr since it does not appear - # when evaluating expressions themselves, only when constructing - # named return values in dataframe (IR) nodes. - __slots__ = ("name", "value") - value: Expr - name: str - - def __init__(self, name: str, value: Expr) -> None: - self.name = name - self.value = value - - def __hash__(self) -> int: - """Hash of the expression.""" - return hash((type(self), self.name, self.value)) - - def __repr__(self) -> str: - """Repr of the expression.""" - return f"NamedExpr({self.name}, {self.value})" - - def __eq__(self, other: Any) -> bool: - """Equality of two expressions.""" - return ( - type(self) is type(other) - and self.name == other.name - and self.value == other.value - ) - - def __ne__(self, other: Any) -> bool: - """Inequality of expressions.""" - return not self.__eq__(other) - - def evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """ - Evaluate this expression given a dataframe for context. - - Parameters - ---------- - df - DataFrame providing context - context - Execution context - mapping - Substitution mapping - - Returns - ------- - Evaluated Column with name attached. - - See Also - -------- - :meth:`Expr.evaluate` for details, this function just adds the - name to a column produced from an expression. - """ - return self.value.evaluate(df, context=context, mapping=mapping).rename( - self.name - ) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - return self.value.collect_agg(depth=depth) - - -class Literal(Expr): - __slots__ = ("value",) - _non_child = ("dtype", "value") - value: pa.Scalar[Any] - children: tuple[()] - - def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None: - super().__init__(dtype) - assert value.type == plc.interop.to_arrow(dtype) - self.value = value - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - # datatype of pyarrow scalar is correct by construction. - return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1)) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - return AggInfo([]) - - -class LiteralColumn(Expr): - __slots__ = ("value",) - _non_child = ("dtype", "value") - value: pa.Array[Any, Any] - children: tuple[()] - - def __init__(self, dtype: plc.DataType, value: pl.Series) -> None: - super().__init__(dtype) - data = value.to_arrow() - self.value = data.cast(dtypes.downcast_arrow_lists(data.type)) - - def get_hash(self) -> int: - """Compute a hash of the column.""" - # This is stricter than necessary, but we only need this hash - # for identity in groupby replacements so it's OK. And this - # way we avoid doing potentially expensive compute. 
- return hash((type(self), self.dtype, id(self.value))) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - # datatype of pyarrow array is correct by construction. - return Column(plc.interop.from_arrow(self.value)) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - return AggInfo([]) - - -class Col(Expr): - __slots__ = ("name",) - _non_child = ("dtype", "name") - name: str - children: tuple[()] - - def __init__(self, dtype: plc.DataType, name: str) -> None: - self.dtype = dtype - self.name = name - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - # Deliberately remove the name here so that we guarantee - # evaluation of the IR produces names. - return df.column_map[self.name].rename(None) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - return AggInfo([(self, plc.aggregation.collect_list(), self)]) - - -class Len(Expr): - children: tuple[()] - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - return Column( - plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar(df.num_rows, type=plc.interop.to_arrow(self.dtype)) - ), - 1, - ) - ) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - # TODO: polars returns a uint, not an int for count - return AggInfo( - [(None, plc.aggregation.count(plc.types.NullPolicy.INCLUDE), self)] - ) - - -class BooleanFunction(Expr): - __slots__ = ("name", "options", "children") - _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] - - def __init__( - self, - dtype: plc.DataType, - name: pl_expr.BooleanFunction, - options: tuple[Any, ...], - *children: Expr, - ) -> None: - super().__init__(dtype) - self.options = options - self.name = name - self.children = children - if self.name == pl_expr.BooleanFunction.IsIn and not all( - c.dtype == self.children[0].dtype for c in self.children - ): - # TODO: If polars IR doesn't put the casts in, we need to - # mimic the supertype promotion rules. 
- raise NotImplementedError("IsIn doesn't support supertype casting") - - @staticmethod - def _distinct( - column: Column, - *, - keep: plc.stream_compaction.DuplicateKeepOption, - source_value: plc.Scalar, - target_value: plc.Scalar, - ) -> Column: - table = plc.Table([column.obj]) - indices = plc.stream_compaction.distinct_indices( - table, - keep, - # TODO: polars doesn't expose options for these - plc.types.NullEquality.EQUAL, - plc.types.NanEquality.ALL_EQUAL, - ) - return Column( - plc.copying.scatter( - [source_value], - indices, - plc.Table([plc.Column.from_scalar(target_value, table.num_rows())]), - ).columns()[0] - ) - - _BETWEEN_OPS: ClassVar[ - dict[ - pl_types.ClosedInterval, - tuple[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator], - ] - ] = { - "none": ( - plc.binaryop.BinaryOperator.GREATER, - plc.binaryop.BinaryOperator.LESS, - ), - "left": ( - plc.binaryop.BinaryOperator.GREATER_EQUAL, - plc.binaryop.BinaryOperator.LESS, - ), - "right": ( - plc.binaryop.BinaryOperator.GREATER, - plc.binaryop.BinaryOperator.LESS_EQUAL, - ), - "both": ( - plc.binaryop.BinaryOperator.GREATER_EQUAL, - plc.binaryop.BinaryOperator.LESS_EQUAL, - ), - } - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - if self.name in ( - pl_expr.BooleanFunction.IsFinite, - pl_expr.BooleanFunction.IsInfinite, - ): - # Avoid evaluating the child if the dtype tells us it's unnecessary. - (child,) = self.children - is_finite = self.name == pl_expr.BooleanFunction.IsFinite - if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64): - value = plc.interop.from_arrow( - pa.scalar(value=is_finite, type=plc.interop.to_arrow(self.dtype)) - ) - return Column(plc.Column.from_scalar(value, df.num_rows)) - needles = child.evaluate(df, context=context, mapping=mapping) - to_search = [-float("inf"), float("inf")] - if is_finite: - # NaN is neither finite not infinite - to_search.append(float("nan")) - haystack = plc.interop.from_arrow( - pa.array( - to_search, - type=plc.interop.to_arrow(needles.obj.type()), - ) - ) - result = plc.search.contains(haystack, needles.obj) - if is_finite: - result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT) - return Column(result) - columns = [ - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ] - # Kleene logic for Any (OR) and All (AND) if ignore_nulls is - # False - if self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All): - (ignore_nulls,) = self.options - (column,) = columns - is_any = self.name == pl_expr.BooleanFunction.Any - agg = plc.aggregation.any() if is_any else plc.aggregation.all() - result = plc.reduce.reduce(column.obj, agg, self.dtype) - if not ignore_nulls and column.obj.null_count() > 0: - # Truth tables - # Any All - # | F U T | F U T - # --+------ --+------ - # F | F U T F | F F F - # U | U U T U | F U U - # T | T T T T | F U T - # - # If the input null count was non-zero, we must - # post-process the result to insert the correct value. 
- h_result = plc.interop.to_arrow(result).as_py() - if is_any and not h_result or not is_any and h_result: - # Any All - # False || Null => Null True && Null => Null - return Column(plc.Column.all_null_like(column.obj, 1)) - return Column(plc.Column.from_scalar(result, 1)) - if self.name == pl_expr.BooleanFunction.IsNull: - (column,) = columns - return Column(plc.unary.is_null(column.obj)) - elif self.name == pl_expr.BooleanFunction.IsNotNull: - (column,) = columns - return Column(plc.unary.is_valid(column.obj)) - elif self.name == pl_expr.BooleanFunction.IsNan: - (column,) = columns - return Column( - plc.unary.is_nan(column.obj).with_mask( - column.obj.null_mask(), column.obj.null_count() - ) - ) - elif self.name == pl_expr.BooleanFunction.IsNotNan: - (column,) = columns - return Column( - plc.unary.is_not_nan(column.obj).with_mask( - column.obj.null_mask(), column.obj.null_count() - ) - ) - elif self.name == pl_expr.BooleanFunction.IsFirstDistinct: - (column,) = columns - return self._distinct( - column, - keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, - source_value=plc.interop.from_arrow( - pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) - ), - target_value=plc.interop.from_arrow( - pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) - ), - ) - elif self.name == pl_expr.BooleanFunction.IsLastDistinct: - (column,) = columns - return self._distinct( - column, - keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST, - source_value=plc.interop.from_arrow( - pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) - ), - target_value=plc.interop.from_arrow( - pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) - ), - ) - elif self.name == pl_expr.BooleanFunction.IsUnique: - (column,) = columns - return self._distinct( - column, - keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, - source_value=plc.interop.from_arrow( - pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) - ), - target_value=plc.interop.from_arrow( - pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) - ), - ) - elif self.name == pl_expr.BooleanFunction.IsDuplicated: - (column,) = columns - return self._distinct( - column, - keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, - source_value=plc.interop.from_arrow( - pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) - ), - target_value=plc.interop.from_arrow( - pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) - ), - ) - elif self.name == pl_expr.BooleanFunction.AllHorizontal: - return Column( - reduce( - partial( - plc.binaryop.binary_operation, - op=plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, - output_type=self.dtype, - ), - (c.obj for c in columns), - ) - ) - elif self.name == pl_expr.BooleanFunction.AnyHorizontal: - return Column( - reduce( - partial( - plc.binaryop.binary_operation, - op=plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, - output_type=self.dtype, - ), - (c.obj for c in columns), - ) - ) - elif self.name == pl_expr.BooleanFunction.IsIn: - needles, haystack = columns - return Column(plc.search.contains(haystack.obj, needles.obj)) - elif self.name == pl_expr.BooleanFunction.Not: - (column,) = columns - return Column( - plc.unary.unary_operation(column.obj, plc.unary.UnaryOperator.NOT) - ) - else: - raise NotImplementedError( - f"BooleanFunction {self.name}" - ) # pragma: no cover; handled by init raising - - -class StringFunction(Expr): - __slots__ = ("name", "options", "children", "_regex_program") - _non_child = ("dtype", "name", "options") - children: 
tuple[Expr, ...] - - def __init__( - self, - dtype: plc.DataType, - name: pl_expr.StringFunction, - options: tuple[Any, ...], - *children: Expr, - ) -> None: - super().__init__(dtype) - self.options = options - self.name = name - self.children = children - self._validate_input() - - def _validate_input(self): - if self.name not in ( - pl_expr.StringFunction.Contains, - pl_expr.StringFunction.EndsWith, - pl_expr.StringFunction.Lowercase, - pl_expr.StringFunction.Replace, - pl_expr.StringFunction.ReplaceMany, - pl_expr.StringFunction.Slice, - pl_expr.StringFunction.Strptime, - pl_expr.StringFunction.StartsWith, - pl_expr.StringFunction.StripChars, - pl_expr.StringFunction.StripCharsStart, - pl_expr.StringFunction.StripCharsEnd, - pl_expr.StringFunction.Uppercase, - ): - raise NotImplementedError(f"String function {self.name}") - if self.name == pl_expr.StringFunction.Contains: - literal, strict = self.options - if not literal: - if not strict: - raise NotImplementedError( - "f{strict=} is not supported for regex contains" - ) - if not isinstance(self.children[1], Literal): - raise NotImplementedError( - "Regex contains only supports a scalar pattern" - ) - pattern = self.children[1].value.as_py() - try: - self._regex_program = plc.strings.regex_program.RegexProgram.create( - pattern, - flags=plc.strings.regex_flags.RegexFlags.DEFAULT, - ) - except RuntimeError as e: - raise NotImplementedError( - f"Unsupported regex {pattern} for GPU engine." - ) from e - elif self.name == pl_expr.StringFunction.Replace: - _, literal = self.options - if not literal: - raise NotImplementedError("literal=False is not supported for replace") - if not all(isinstance(expr, Literal) for expr in self.children[1:]): - raise NotImplementedError("replace only supports scalar target") - target = self.children[1] - if target.value == pa.scalar("", type=pa.string()): - raise NotImplementedError( - "libcudf replace does not support empty strings" - ) - elif self.name == pl_expr.StringFunction.ReplaceMany: - (ascii_case_insensitive,) = self.options - if ascii_case_insensitive: - raise NotImplementedError( - "ascii_case_insensitive not implemented for replace_many" - ) - if not all( - isinstance(expr, (LiteralColumn, Literal)) for expr in self.children[1:] - ): - raise NotImplementedError("replace_many only supports literal inputs") - target = self.children[1] - if pc.any(pc.equal(target.value, "")).as_py(): - raise NotImplementedError( - "libcudf replace_many is implemented differently from polars " - "for empty strings" - ) - elif self.name == pl_expr.StringFunction.Slice: - if not all(isinstance(child, Literal) for child in self.children[1:]): - raise NotImplementedError( - "Slice only supports literal start and stop values" - ) - elif self.name == pl_expr.StringFunction.Strptime: - format, _, exact, cache = self.options - if cache: - raise NotImplementedError("Strptime cache is a CPU feature") - if format is None: - raise NotImplementedError("Strptime format is required") - if not exact: - raise NotImplementedError("Strptime does not support exact=False") - elif self.name in { - pl_expr.StringFunction.StripChars, - pl_expr.StringFunction.StripCharsStart, - pl_expr.StringFunction.StripCharsEnd, - }: - if not isinstance(self.children[1], Literal): - raise NotImplementedError( - "strip operations only support scalar patterns" - ) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this 
expression given a dataframe for context.""" - if self.name == pl_expr.StringFunction.Contains: - child, arg = self.children - column = child.evaluate(df, context=context, mapping=mapping) - - literal, _ = self.options - if literal: - pat = arg.evaluate(df, context=context, mapping=mapping) - pattern = ( - pat.obj_scalar - if pat.is_scalar and pat.obj.size() != column.obj.size() - else pat.obj - ) - return Column(plc.strings.find.contains(column.obj, pattern)) - else: - return Column( - plc.strings.contains.contains_re(column.obj, self._regex_program) - ) - elif self.name == pl_expr.StringFunction.Slice: - child, expr_offset, expr_length = self.children - assert isinstance(expr_offset, Literal) - assert isinstance(expr_length, Literal) - - column = child.evaluate(df, context=context, mapping=mapping) - # libcudf slices via [start,stop). - # polars slices with offset + length where start == offset - # stop = start + length. Negative values for start look backward - # from the last element of the string. If the end index would be - # below zero, an empty string is returned. - # Do this maths on the host - start = expr_offset.value.as_py() - length = expr_length.value.as_py() - - if length == 0: - stop = start - else: - # No length indicates a scan to the end - # The libcudf equivalent is a null stop - stop = start + length if length else None - if length and start < 0 and length >= -start: - stop = None - return Column( - plc.strings.slice.slice_strings( - column.obj, - plc.interop.from_arrow(pa.scalar(start, type=pa.int32())), - plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), - ) - ) - elif self.name in { - pl_expr.StringFunction.StripChars, - pl_expr.StringFunction.StripCharsStart, - pl_expr.StringFunction.StripCharsEnd, - }: - column, chars = ( - c.evaluate(df, context=context, mapping=mapping) for c in self.children - ) - if self.name == pl_expr.StringFunction.StripCharsStart: - side = plc.strings.SideType.LEFT - elif self.name == pl_expr.StringFunction.StripCharsEnd: - side = plc.strings.SideType.RIGHT - else: - side = plc.strings.SideType.BOTH - return Column(plc.strings.strip.strip(column.obj, side, chars.obj_scalar)) - - columns = [ - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ] - if self.name == pl_expr.StringFunction.Lowercase: - (column,) = columns - return Column(plc.strings.case.to_lower(column.obj)) - elif self.name == pl_expr.StringFunction.Uppercase: - (column,) = columns - return Column(plc.strings.case.to_upper(column.obj)) - elif self.name == pl_expr.StringFunction.EndsWith: - column, suffix = columns - return Column( - plc.strings.find.ends_with( - column.obj, - suffix.obj_scalar - if column.obj.size() != suffix.obj.size() and suffix.is_scalar - else suffix.obj, - ) - ) - elif self.name == pl_expr.StringFunction.StartsWith: - column, prefix = columns - return Column( - plc.strings.find.starts_with( - column.obj, - prefix.obj_scalar - if column.obj.size() != prefix.obj.size() and prefix.is_scalar - else prefix.obj, - ) - ) - elif self.name == pl_expr.StringFunction.Strptime: - # TODO: ignores ambiguous - format, strict, exact, cache = self.options - col = self.children[0].evaluate(df, context=context, mapping=mapping) - - is_timestamps = plc.strings.convert.convert_datetime.is_timestamp( - col.obj, format - ) - - if strict: - if not plc.interop.to_arrow( - plc.reduce.reduce( - is_timestamps, - plc.aggregation.all(), - plc.DataType(plc.TypeId.BOOL8), - ) - ).as_py(): - raise InvalidOperationError("conversion from `str` 
failed.") - else: - not_timestamps = plc.unary.unary_operation( - is_timestamps, plc.unary.UnaryOperator.NOT - ) - - null = plc.interop.from_arrow(pa.scalar(None, type=pa.string())) - res = plc.copying.boolean_mask_scatter( - [null], plc.Table([col.obj]), not_timestamps - ) - return Column( - plc.strings.convert.convert_datetime.to_timestamps( - res.columns()[0], self.dtype, format - ) - ) - elif self.name == pl_expr.StringFunction.Replace: - column, target, repl = columns - n, _ = self.options - return Column( - plc.strings.replace.replace( - column.obj, target.obj_scalar, repl.obj_scalar, maxrepl=n - ) - ) - elif self.name == pl_expr.StringFunction.ReplaceMany: - column, target, repl = columns - return Column( - plc.strings.replace.replace_multiple(column.obj, target.obj, repl.obj) - ) - raise NotImplementedError( - f"StringFunction {self.name}" - ) # pragma: no cover; handled by init raising - - -class TemporalFunction(Expr): - __slots__ = ("name", "options", "children") - _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = { - pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR, - pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH, - pl_expr.TemporalFunction.Day: plc.datetime.DatetimeComponent.DAY, - pl_expr.TemporalFunction.WeekDay: plc.datetime.DatetimeComponent.WEEKDAY, - pl_expr.TemporalFunction.Hour: plc.datetime.DatetimeComponent.HOUR, - pl_expr.TemporalFunction.Minute: plc.datetime.DatetimeComponent.MINUTE, - pl_expr.TemporalFunction.Second: plc.datetime.DatetimeComponent.SECOND, - pl_expr.TemporalFunction.Millisecond: plc.datetime.DatetimeComponent.MILLISECOND, - pl_expr.TemporalFunction.Microsecond: plc.datetime.DatetimeComponent.MICROSECOND, - pl_expr.TemporalFunction.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND, - } - _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] 
- - def __init__( - self, - dtype: plc.DataType, - name: pl_expr.TemporalFunction, - options: tuple[Any, ...], - *children: Expr, - ) -> None: - super().__init__(dtype) - self.options = options - self.name = name - self.children = children - if self.name not in self._COMPONENT_MAP: - raise NotImplementedError(f"Temporal function {self.name}") - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - columns = [ - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ] - (column,) = columns - if self.name == pl_expr.TemporalFunction.Microsecond: - millis = plc.datetime.extract_datetime_component( - column.obj, plc.datetime.DatetimeComponent.MILLISECOND - ) - micros = plc.datetime.extract_datetime_component( - column.obj, plc.datetime.DatetimeComponent.MICROSECOND - ) - millis_as_micros = plc.binaryop.binary_operation( - millis, - plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), - plc.binaryop.BinaryOperator.MUL, - plc.DataType(plc.TypeId.INT32), - ) - total_micros = plc.binaryop.binary_operation( - micros, - millis_as_micros, - plc.binaryop.BinaryOperator.ADD, - plc.types.DataType(plc.types.TypeId.INT32), - ) - return Column(total_micros) - elif self.name == pl_expr.TemporalFunction.Nanosecond: - millis = plc.datetime.extract_datetime_component( - column.obj, plc.datetime.DatetimeComponent.MILLISECOND - ) - micros = plc.datetime.extract_datetime_component( - column.obj, plc.datetime.DatetimeComponent.MICROSECOND - ) - nanos = plc.datetime.extract_datetime_component( - column.obj, plc.datetime.DatetimeComponent.NANOSECOND - ) - millis_as_nanos = plc.binaryop.binary_operation( - millis, - plc.interop.from_arrow(pa.scalar(1_000_000, type=pa.int32())), - plc.binaryop.BinaryOperator.MUL, - plc.types.DataType(plc.types.TypeId.INT32), - ) - micros_as_nanos = plc.binaryop.binary_operation( - micros, - plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), - plc.binaryop.BinaryOperator.MUL, - plc.types.DataType(plc.types.TypeId.INT32), - ) - total_nanos = plc.binaryop.binary_operation( - nanos, - millis_as_nanos, - plc.binaryop.BinaryOperator.ADD, - plc.types.DataType(plc.types.TypeId.INT32), - ) - total_nanos = plc.binaryop.binary_operation( - total_nanos, - micros_as_nanos, - plc.binaryop.BinaryOperator.ADD, - plc.types.DataType(plc.types.TypeId.INT32), - ) - return Column(total_nanos) - - return Column( - plc.datetime.extract_datetime_component( - column.obj, - self._COMPONENT_MAP[self.name], - ) - ) - - -class UnaryFunction(Expr): - __slots__ = ("name", "options", "children") - _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] 
- - # Note: log, and pow are handled via translation to binops - _OP_MAPPING: ClassVar[dict[str, plc.unary.UnaryOperator]] = { - "sin": plc.unary.UnaryOperator.SIN, - "cos": plc.unary.UnaryOperator.COS, - "tan": plc.unary.UnaryOperator.TAN, - "arcsin": plc.unary.UnaryOperator.ARCSIN, - "arccos": plc.unary.UnaryOperator.ARCCOS, - "arctan": plc.unary.UnaryOperator.ARCTAN, - "sinh": plc.unary.UnaryOperator.SINH, - "cosh": plc.unary.UnaryOperator.COSH, - "tanh": plc.unary.UnaryOperator.TANH, - "arcsinh": plc.unary.UnaryOperator.ARCSINH, - "arccosh": plc.unary.UnaryOperator.ARCCOSH, - "arctanh": plc.unary.UnaryOperator.ARCTANH, - "exp": plc.unary.UnaryOperator.EXP, - "sqrt": plc.unary.UnaryOperator.SQRT, - "cbrt": plc.unary.UnaryOperator.CBRT, - "ceil": plc.unary.UnaryOperator.CEIL, - "floor": plc.unary.UnaryOperator.FLOOR, - "abs": plc.unary.UnaryOperator.ABS, - "bit_invert": plc.unary.UnaryOperator.BIT_INVERT, - "not": plc.unary.UnaryOperator.NOT, - } - _supported_misc_fns = frozenset( - { - "drop_nulls", - "fill_null", - "mask_nans", - "round", - "set_sorted", - "unique", - } - ) - _supported_cum_aggs = frozenset( - { - "cum_min", - "cum_max", - "cum_prod", - "cum_sum", - } - ) - _supported_fns = frozenset().union( - _supported_misc_fns, _supported_cum_aggs, _OP_MAPPING.keys() - ) - - def __init__( - self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr - ) -> None: - super().__init__(dtype) - self.name = name - self.options = options - self.children = children - - if self.name not in UnaryFunction._supported_fns: - raise NotImplementedError(f"Unary function {name=}") - if self.name in UnaryFunction._supported_cum_aggs: - (reverse,) = self.options - if reverse: - raise NotImplementedError( - "reverse=True is not supported for cumulative aggregations" - ) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - if self.name == "mask_nans": - (child,) = self.children - return child.evaluate(df, context=context, mapping=mapping).mask_nans() - if self.name == "round": - (decimal_places,) = self.options - (values,) = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - return Column( - plc.round.round( - values.obj, decimal_places, plc.round.RoundingMethod.HALF_UP - ) - ).sorted_like(values) - elif self.name == "unique": - (maintain_order,) = self.options - (values,) = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - # Only one column, so keep_any is the same as keep_first - # for stable distinct - keep = plc.stream_compaction.DuplicateKeepOption.KEEP_ANY - if values.is_sorted: - maintain_order = True - result = plc.stream_compaction.unique( - plc.Table([values.obj]), - [0], - keep, - plc.types.NullEquality.EQUAL, - ) - else: - distinct = ( - plc.stream_compaction.stable_distinct - if maintain_order - else plc.stream_compaction.distinct - ) - result = distinct( - plc.Table([values.obj]), - [0], - keep, - plc.types.NullEquality.EQUAL, - plc.types.NanEquality.ALL_EQUAL, - ) - (column,) = result.columns() - if maintain_order: - return Column(column).sorted_like(values) - return Column(column) - elif self.name == "set_sorted": - (column,) = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - (asc,) = self.options - order = ( - plc.types.Order.ASCENDING - if asc == "ascending" - 
else plc.types.Order.DESCENDING - ) - null_order = plc.types.NullOrder.BEFORE - if column.obj.null_count() > 0 and (n := column.obj.size()) > 1: - # PERF: This invokes four stream synchronisations! - has_nulls_first = not plc.copying.get_element(column.obj, 0).is_valid() - has_nulls_last = not plc.copying.get_element( - column.obj, n - 1 - ).is_valid() - if (order == plc.types.Order.DESCENDING and has_nulls_first) or ( - order == plc.types.Order.ASCENDING and has_nulls_last - ): - null_order = plc.types.NullOrder.AFTER - return column.set_sorted( - is_sorted=plc.types.Sorted.YES, - order=order, - null_order=null_order, - ) - elif self.name == "drop_nulls": - (column,) = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - return Column( - plc.stream_compaction.drop_nulls( - plc.Table([column.obj]), [0], 1 - ).columns()[0] - ) - elif self.name == "fill_null": - column = self.children[0].evaluate(df, context=context, mapping=mapping) - if isinstance(self.children[1], Literal): - arg = plc.interop.from_arrow(self.children[1].value) - else: - evaluated = self.children[1].evaluate( - df, context=context, mapping=mapping - ) - arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj - return Column(plc.replace.replace_nulls(column.obj, arg)) - elif self.name in self._OP_MAPPING: - column = self.children[0].evaluate(df, context=context, mapping=mapping) - if column.obj.type().id() != self.dtype.id(): - arg = plc.unary.cast(column.obj, self.dtype) - else: - arg = column.obj - return Column(plc.unary.unary_operation(arg, self._OP_MAPPING[self.name])) - elif self.name in UnaryFunction._supported_cum_aggs: - column = self.children[0].evaluate(df, context=context, mapping=mapping) - plc_col = column.obj - col_type = column.obj.type() - # cum_sum casts - # Int8, UInt8, Int16, UInt16 -> Int64 for overflow prevention - # Bool -> UInt32 - # cum_prod casts integer dtypes < int64 and bool to int64 - # See: - # https://github.com/pola-rs/polars/blob/main/crates/polars-ops/src/series/ops/cum_agg.rs - if ( - self.name == "cum_sum" - and col_type.id() - in { - plc.types.TypeId.INT8, - plc.types.TypeId.UINT8, - plc.types.TypeId.INT16, - plc.types.TypeId.UINT16, - } - ) or ( - self.name == "cum_prod" - and plc.traits.is_integral(col_type) - and plc.types.size_of(col_type) <= 4 - ): - plc_col = plc.unary.cast( - plc_col, plc.types.DataType(plc.types.TypeId.INT64) - ) - elif ( - self.name == "cum_sum" - and column.obj.type().id() == plc.types.TypeId.BOOL8 - ): - plc_col = plc.unary.cast( - plc_col, plc.types.DataType(plc.types.TypeId.UINT32) - ) - if self.name == "cum_sum": - agg = plc.aggregation.sum() - elif self.name == "cum_prod": - agg = plc.aggregation.product() - elif self.name == "cum_min": - agg = plc.aggregation.min() - elif self.name == "cum_max": - agg = plc.aggregation.max() - - return Column(plc.reduce.scan(plc_col, agg, plc.reduce.ScanType.INCLUSIVE)) - raise NotImplementedError( - f"Unimplemented unary function {self.name=}" - ) # pragma: no cover; init trips first - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - if self.name in {"unique", "drop_nulls"} | self._supported_cum_aggs: - raise NotImplementedError(f"{self.name} in groupby") - if depth == 1: - # inside aggregation, need to pre-evaluate, groupby - # construction has checked that we don't have nested aggs, - # so stop the recursion and return ourselves for pre-eval - return AggInfo([(self, plc.aggregation.collect_list(), 
self)]) - else: - (child,) = self.children - return child.collect_agg(depth=depth) - - -class Sort(Expr): - __slots__ = ("options", "children") - _non_child = ("dtype", "options") - children: tuple[Expr] - - def __init__( - self, dtype: plc.DataType, options: tuple[bool, bool, bool], column: Expr - ) -> None: - super().__init__(dtype) - self.options = options - self.children = (column,) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - (child,) = self.children - column = child.evaluate(df, context=context, mapping=mapping) - (stable, nulls_last, descending) = self.options - order, null_order = sorting.sort_order( - [descending], nulls_last=[nulls_last], num_keys=1 - ) - do_sort = plc.sorting.stable_sort if stable else plc.sorting.sort - table = do_sort(plc.Table([column.obj]), order, null_order) - return Column( - table.columns()[0], - is_sorted=plc.types.Sorted.YES, - order=order[0], - null_order=null_order[0], - ) - - -class SortBy(Expr): - __slots__ = ("options", "children") - _non_child = ("dtype", "options") - children: tuple[Expr, ...] - - def __init__( - self, - dtype: plc.DataType, - options: tuple[bool, tuple[bool], tuple[bool]], - column: Expr, - *by: Expr, - ) -> None: - super().__init__(dtype) - self.options = options - self.children = (column, *by) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - column, *by = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - (stable, nulls_last, descending) = self.options - order, null_order = sorting.sort_order( - descending, nulls_last=nulls_last, num_keys=len(by) - ) - do_sort = plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key - table = do_sort( - plc.Table([column.obj]), plc.Table([c.obj for c in by]), order, null_order - ) - return Column(table.columns()[0]) - - -class Gather(Expr): - __slots__ = ("children",) - _non_child = ("dtype",) - children: tuple[Expr, Expr] - - def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr) -> None: - super().__init__(dtype) - self.children = (values, indices) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - values, indices = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - lo, hi = plc.reduce.minmax(indices.obj) - lo = plc.interop.to_arrow(lo).as_py() - hi = plc.interop.to_arrow(hi).as_py() - n = df.num_rows - if hi >= n or lo < -n: - raise ValueError("gather indices are out of bounds") - if indices.obj.null_count(): - bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY - obj = plc.replace.replace_nulls( - indices.obj, - plc.interop.from_arrow( - pa.scalar(n, type=plc.interop.to_arrow(indices.obj.type())) - ), - ) - else: - bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK - obj = indices.obj - table = plc.copying.gather(plc.Table([values.obj]), obj, bounds_policy) - return Column(table.columns()[0]) - - -class Filter(Expr): - __slots__ = ("children",) - _non_child = ("dtype",) - children: tuple[Expr, Expr] - - def __init__(self, 
dtype: plc.DataType, values: Expr, indices: Expr): - super().__init__(dtype) - self.children = (values, indices) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - values, mask = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - table = plc.stream_compaction.apply_boolean_mask( - plc.Table([values.obj]), mask.obj - ) - return Column(table.columns()[0]).sorted_like(values) - - -class RollingWindow(Expr): - __slots__ = ("options", "children") - _non_child = ("dtype", "options") - children: tuple[Expr] - - def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None: - super().__init__(dtype) - self.options = options - self.children = (agg,) - raise NotImplementedError("Rolling window not implemented") - - -class GroupedRollingWindow(Expr): - __slots__ = ("options", "children") - _non_child = ("dtype", "options") - children: tuple[Expr, ...] - - def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr) -> None: - super().__init__(dtype) - self.options = options - self.children = (agg, *by) - raise NotImplementedError("Grouped rolling window not implemented") - - -class Cast(Expr): - __slots__ = ("children",) - _non_child = ("dtype",) - children: tuple[Expr] - - def __init__(self, dtype: plc.DataType, value: Expr) -> None: - super().__init__(dtype) - self.children = (value,) - if not dtypes.can_cast(value.dtype, self.dtype): - raise NotImplementedError( - f"Can't cast {self.dtype.id().name} to {value.dtype.id().name}" - ) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - (child,) = self.children - column = child.evaluate(df, context=context, mapping=mapping) - return Column(plc.unary.cast(column.obj, self.dtype)).sorted_like(column) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - # TODO: Could do with sort-based groupby and segmented filter - (child,) = self.children - return child.collect_agg(depth=depth) - - -class Agg(Expr): - __slots__ = ("name", "options", "op", "request", "children") - _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] 
- - def __init__( - self, dtype: plc.DataType, name: str, options: Any, *children: Expr - ) -> None: - super().__init__(dtype) - self.name = name - self.options = options - self.children = children - if name not in Agg._SUPPORTED: - raise NotImplementedError( - f"Unsupported aggregation {name=}" - ) # pragma: no cover; all valid aggs are supported - # TODO: nan handling in groupby case - if name == "min": - req = plc.aggregation.min() - elif name == "max": - req = plc.aggregation.max() - elif name == "median": - req = plc.aggregation.median() - elif name == "n_unique": - # TODO: datatype of result - req = plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE) - elif name == "first" or name == "last": - req = None - elif name == "mean": - req = plc.aggregation.mean() - elif name == "sum": - req = plc.aggregation.sum() - elif name == "std": - # TODO: handle nans - req = plc.aggregation.std(ddof=options) - elif name == "var": - # TODO: handle nans - req = plc.aggregation.variance(ddof=options) - elif name == "count": - req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE) - elif name == "quantile": - _, quantile = self.children - if not isinstance(quantile, Literal): - raise NotImplementedError("Only support literal quantile values") - req = plc.aggregation.quantile( - quantiles=[quantile.value.as_py()], interp=Agg.interp_mapping[options] - ) - else: - raise NotImplementedError( - f"Unreachable, {name=} is incorrectly listed in _SUPPORTED" - ) # pragma: no cover - self.request = req - op = getattr(self, f"_{name}", None) - if op is None: - op = partial(self._reduce, request=req) - elif name in {"min", "max"}: - op = partial(op, propagate_nans=options) - elif name in {"count", "first", "last"}: - pass - else: - raise NotImplementedError( - f"Unreachable, supported agg {name=} has no implementation" - ) # pragma: no cover - self.op = op - - _SUPPORTED: ClassVar[frozenset[str]] = frozenset( - [ - "min", - "max", - "median", - "n_unique", - "first", - "last", - "mean", - "sum", - "count", - "std", - "var", - "quantile", - ] - ) - - interp_mapping: ClassVar[dict[str, plc.types.Interpolation]] = { - "nearest": plc.types.Interpolation.NEAREST, - "higher": plc.types.Interpolation.HIGHER, - "lower": plc.types.Interpolation.LOWER, - "midpoint": plc.types.Interpolation.MIDPOINT, - "linear": plc.types.Interpolation.LINEAR, - } - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - if depth >= 1: - raise NotImplementedError( - "Nested aggregations in groupby" - ) # pragma: no cover; check_agg trips first - if (isminmax := self.name in {"min", "max"}) and self.options: - raise NotImplementedError("Nan propagation in groupby for min/max") - (child,) = self.children - ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests - request = self.request - # These are handled specially here because we don't set up the - # request for the whole-frame agg because we can avoid a - # reduce for these. 
- if self.name == "first": - request = plc.aggregation.nth_element( - 0, null_handling=plc.types.NullPolicy.INCLUDE - ) - elif self.name == "last": - request = plc.aggregation.nth_element( - -1, null_handling=plc.types.NullPolicy.INCLUDE - ) - if request is None: - raise NotImplementedError( - f"Aggregation {self.name} in groupby" - ) # pragma: no cover; __init__ trips first - if isminmax and plc.traits.is_floating_point(self.dtype): - assert expr is not None - # Ignore nans in these groupby aggs, do this by masking - # nans in the input - expr = UnaryFunction(self.dtype, "mask_nans", (), expr) - return AggInfo([(expr, request, self)]) - - def _reduce( - self, column: Column, *, request: plc.aggregation.Aggregation - ) -> Column: - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, request, self.dtype), - 1, - ) - ) - - def _count(self, column: Column) -> Column: - return Column( - plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar( - column.obj.size() - column.obj.null_count(), - type=plc.interop.to_arrow(self.dtype), - ), - ), - 1, - ) - ) - - def _min(self, column: Column, *, propagate_nans: bool) -> Column: - if propagate_nans and column.nan_count > 0: - return Column( - plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) - ), - 1, - ) - ) - if column.nan_count > 0: - column = column.mask_nans() - return self._reduce(column, request=plc.aggregation.min()) - - def _max(self, column: Column, *, propagate_nans: bool) -> Column: - if propagate_nans and column.nan_count > 0: - return Column( - plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) - ), - 1, - ) - ) - if column.nan_count > 0: - column = column.mask_nans() - return self._reduce(column, request=plc.aggregation.max()) - - def _first(self, column: Column) -> Column: - return Column(plc.copying.slice(column.obj, [0, 1])[0]) - - def _last(self, column: Column) -> Column: - n = column.obj.size() - return Column(plc.copying.slice(column.obj, [n - 1, n])[0]) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - if context is not ExecutionContext.FRAME: - raise NotImplementedError( - f"Agg in context {context}" - ) # pragma: no cover; unreachable - - # Aggregations like quantiles may have additional children that were - # preprocessed into pylibcudf requests. 
- child = self.children[0] - return self.op(child.evaluate(df, context=context, mapping=mapping)) - - -class Ternary(Expr): - __slots__ = ("children",) - _non_child = ("dtype",) - children: tuple[Expr, Expr, Expr] - - def __init__( - self, dtype: plc.DataType, when: Expr, then: Expr, otherwise: Expr - ) -> None: - super().__init__(dtype) - self.children = (when, then, otherwise) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - when, then, otherwise = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - then_obj = then.obj_scalar if then.is_scalar else then.obj - otherwise_obj = otherwise.obj_scalar if otherwise.is_scalar else otherwise.obj - return Column(plc.copying.copy_if_else(then_obj, otherwise_obj, when.obj)) - - -class BinOp(Expr): - __slots__ = ("op", "children") - _non_child = ("dtype", "op") - children: tuple[Expr, Expr] - - def __init__( - self, - dtype: plc.DataType, - op: plc.binaryop.BinaryOperator, - left: Expr, - right: Expr, - ) -> None: - super().__init__(dtype) - if plc.traits.is_boolean(self.dtype): - # For boolean output types, bitand and bitor implement - # boolean logic, so translate. bitxor also does, but the - # default behaviour is correct. - op = BinOp._BOOL_KLEENE_MAPPING.get(op, op) - self.op = op - self.children = (left, right) - if not plc.binaryop.is_supported_operation( - self.dtype, left.dtype, right.dtype, op - ): - raise NotImplementedError( - f"Operation {op.name} not supported " - f"for types {left.dtype.id().name} and {right.dtype.id().name} " - f"with output type {self.dtype.id().name}" - ) - - _BOOL_KLEENE_MAPPING: ClassVar[ - dict[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator] - ] = { - plc.binaryop.BinaryOperator.BITWISE_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, - plc.binaryop.BinaryOperator.BITWISE_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, - plc.binaryop.BinaryOperator.LOGICAL_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, - plc.binaryop.BinaryOperator.LOGICAL_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, - } - - _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { - pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, - pl_expr.Operator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, - pl_expr.Operator.NotEq: plc.binaryop.BinaryOperator.NOT_EQUAL, - pl_expr.Operator.NotEqValidity: plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, - pl_expr.Operator.Lt: plc.binaryop.BinaryOperator.LESS, - pl_expr.Operator.LtEq: plc.binaryop.BinaryOperator.LESS_EQUAL, - pl_expr.Operator.Gt: plc.binaryop.BinaryOperator.GREATER, - pl_expr.Operator.GtEq: plc.binaryop.BinaryOperator.GREATER_EQUAL, - pl_expr.Operator.Plus: plc.binaryop.BinaryOperator.ADD, - pl_expr.Operator.Minus: plc.binaryop.BinaryOperator.SUB, - pl_expr.Operator.Multiply: plc.binaryop.BinaryOperator.MUL, - pl_expr.Operator.Divide: plc.binaryop.BinaryOperator.DIV, - pl_expr.Operator.TrueDivide: plc.binaryop.BinaryOperator.TRUE_DIV, - pl_expr.Operator.FloorDivide: plc.binaryop.BinaryOperator.FLOOR_DIV, - pl_expr.Operator.Modulus: plc.binaryop.BinaryOperator.PYMOD, - pl_expr.Operator.And: plc.binaryop.BinaryOperator.BITWISE_AND, - pl_expr.Operator.Or: plc.binaryop.BinaryOperator.BITWISE_OR, - pl_expr.Operator.Xor: plc.binaryop.BinaryOperator.BITWISE_XOR, - pl_expr.Operator.LogicalAnd: 
plc.binaryop.BinaryOperator.LOGICAL_AND, - pl_expr.Operator.LogicalOr: plc.binaryop.BinaryOperator.LOGICAL_OR, - } - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - left, right = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - lop = left.obj - rop = right.obj - if left.obj.size() != right.obj.size(): - if left.is_scalar: - lop = left.obj_scalar - elif right.is_scalar: - rop = right.obj_scalar - return Column( - plc.binaryop.binary_operation(lop, rop, self.op, self.dtype), - ) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - if depth == 1: - # inside aggregation, need to pre-evaluate, - # groupby construction has checked that we don't have - # nested aggs, so stop the recursion and return ourselves - # for pre-eval - return AggInfo([(self, plc.aggregation.collect_list(), self)]) - else: - left_info, right_info = ( - child.collect_agg(depth=depth) for child in self.children - ) - requests = [*left_info.requests, *right_info.requests] - # TODO: Hack, if there were no reductions inside this - # binary expression then we want to pre-evaluate and - # collect ourselves. Otherwise we want to collect the - # aggregations inside and post-evaluate. This is a bad way - # of checking that we are in case 1. - if all( - agg.kind() == plc.aggregation.Kind.COLLECT_LIST - for _, agg, _ in requests - ): - return AggInfo([(self, plc.aggregation.collect_list(), self)]) - return AggInfo( - [*left_info.requests, *right_info.requests], - ) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/__init__.py b/python/cudf_polars/cudf_polars/dsl/expressions/__init__.py new file mode 100644 index 00000000000..acbea129088 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Implementations of various expressions.""" + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py new file mode 100644 index 00000000000..b8b18ec5039 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py @@ -0,0 +1,229 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""DSL nodes for aggregations.""" + +from __future__ import annotations + +from functools import partial +from typing import TYPE_CHECKING, Any, ClassVar + +import pyarrow as pa +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ( + AggInfo, + ExecutionContext, + Expr, +) +from cudf_polars.dsl.expressions.literal import Literal +from cudf_polars.dsl.expressions.unary import UnaryFunction + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["Agg"] + + +class Agg(Expr): + __slots__ = ("name", "options", "op", "request", "children") + _non_child = ("dtype", "name", "options") + children: tuple[Expr, ...] 
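+    # Note: ``request`` (the pylibcudf aggregation request) and ``op`` (the
+    # callable applied in ``do_evaluate``) are derived from ``name`` and
+    # ``options`` in ``__init__`` below rather than passed by the caller.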
+ + def __init__( + self, dtype: plc.DataType, name: str, options: Any, *children: Expr + ) -> None: + super().__init__(dtype) + self.name = name + self.options = options + self.children = children + if name not in Agg._SUPPORTED: + raise NotImplementedError( + f"Unsupported aggregation {name=}" + ) # pragma: no cover; all valid aggs are supported + # TODO: nan handling in groupby case + if name == "min": + req = plc.aggregation.min() + elif name == "max": + req = plc.aggregation.max() + elif name == "median": + req = plc.aggregation.median() + elif name == "n_unique": + # TODO: datatype of result + req = plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE) + elif name == "first" or name == "last": + req = None + elif name == "mean": + req = plc.aggregation.mean() + elif name == "sum": + req = plc.aggregation.sum() + elif name == "std": + # TODO: handle nans + req = plc.aggregation.std(ddof=options) + elif name == "var": + # TODO: handle nans + req = plc.aggregation.variance(ddof=options) + elif name == "count": + req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE) + elif name == "quantile": + _, quantile = self.children + if not isinstance(quantile, Literal): + raise NotImplementedError("Only support literal quantile values") + req = plc.aggregation.quantile( + quantiles=[quantile.value.as_py()], interp=Agg.interp_mapping[options] + ) + else: + raise NotImplementedError( + f"Unreachable, {name=} is incorrectly listed in _SUPPORTED" + ) # pragma: no cover + self.request = req + op = getattr(self, f"_{name}", None) + if op is None: + op = partial(self._reduce, request=req) + elif name in {"min", "max"}: + op = partial(op, propagate_nans=options) + elif name in {"count", "first", "last"}: + pass + else: + raise NotImplementedError( + f"Unreachable, supported agg {name=} has no implementation" + ) # pragma: no cover + self.op = op + + _SUPPORTED: ClassVar[frozenset[str]] = frozenset( + [ + "min", + "max", + "median", + "n_unique", + "first", + "last", + "mean", + "sum", + "count", + "std", + "var", + "quantile", + ] + ) + + interp_mapping: ClassVar[dict[str, plc.types.Interpolation]] = { + "nearest": plc.types.Interpolation.NEAREST, + "higher": plc.types.Interpolation.HIGHER, + "lower": plc.types.Interpolation.LOWER, + "midpoint": plc.types.Interpolation.MIDPOINT, + "linear": plc.types.Interpolation.LINEAR, + } + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth >= 1: + raise NotImplementedError( + "Nested aggregations in groupby" + ) # pragma: no cover; check_agg trips first + if (isminmax := self.name in {"min", "max"}) and self.options: + raise NotImplementedError("Nan propagation in groupby for min/max") + (child,) = self.children + ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests + request = self.request + # These are handled specially here because we don't set up the + # request for the whole-frame agg because we can avoid a + # reduce for these. 
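+        # Example: ``df.group_by("k").agg(pl.col("x").first())`` becomes a
+        # single groupby nth_element(0) request; nulls are included so a
+        # leading null in a group is returned rather than skipped.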
+ if self.name == "first": + request = plc.aggregation.nth_element( + 0, null_handling=plc.types.NullPolicy.INCLUDE + ) + elif self.name == "last": + request = plc.aggregation.nth_element( + -1, null_handling=plc.types.NullPolicy.INCLUDE + ) + if request is None: + raise NotImplementedError( + f"Aggregation {self.name} in groupby" + ) # pragma: no cover; __init__ trips first + if isminmax and plc.traits.is_floating_point(self.dtype): + assert expr is not None + # Ignore nans in these groupby aggs, do this by masking + # nans in the input + expr = UnaryFunction(self.dtype, "mask_nans", (), expr) + return AggInfo([(expr, request, self)]) + + def _reduce( + self, column: Column, *, request: plc.aggregation.Aggregation + ) -> Column: + return Column( + plc.Column.from_scalar( + plc.reduce.reduce(column.obj, request, self.dtype), + 1, + ) + ) + + def _count(self, column: Column) -> Column: + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar( + column.obj.size() - column.obj.null_count(), + type=plc.interop.to_arrow(self.dtype), + ), + ), + 1, + ) + ) + + def _min(self, column: Column, *, propagate_nans: bool) -> Column: + if propagate_nans and column.nan_count > 0: + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) + ), + 1, + ) + ) + if column.nan_count > 0: + column = column.mask_nans() + return self._reduce(column, request=plc.aggregation.min()) + + def _max(self, column: Column, *, propagate_nans: bool) -> Column: + if propagate_nans and column.nan_count > 0: + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) + ), + 1, + ) + ) + if column.nan_count > 0: + column = column.mask_nans() + return self._reduce(column, request=plc.aggregation.max()) + + def _first(self, column: Column) -> Column: + return Column(plc.copying.slice(column.obj, [0, 1])[0]) + + def _last(self, column: Column) -> Column: + n = column.obj.size() + return Column(plc.copying.slice(column.obj, [n - 1, n])[0]) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if context is not ExecutionContext.FRAME: + raise NotImplementedError( + f"Agg in context {context}" + ) # pragma: no cover; unreachable + + # Aggregations like quantiles may have additional children that were + # preprocessed into pylibcudf requests. + child = self.children[0] + return self.op(child.evaluate(df, context=context, mapping=mapping)) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/base.py b/python/cudf_polars/cudf_polars/dsl/expressions/base.py new file mode 100644 index 00000000000..8d021b0231d --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/base.py @@ -0,0 +1,334 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0
+# TODO: remove need for this
+# ruff: noqa: D101
+"""Base and common classes for expression DSL nodes."""
+
+from __future__ import annotations
+
+import enum
+from enum import IntEnum
+from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple
+
+import pylibcudf as plc
+
+from cudf_polars.containers import Column
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping, Sequence
+
+    from cudf_polars.containers import Column, DataFrame
+
+__all__ = ["Expr", "NamedExpr", "Col", "AggInfo", "ExecutionContext"]
+
+
+class AggInfo(NamedTuple):
+    requests: list[tuple[Expr | None, plc.aggregation.Aggregation, Expr]]
+
+
+class ExecutionContext(IntEnum):
+    FRAME = enum.auto()
+    GROUPBY = enum.auto()
+    ROLLING = enum.auto()
+
+
+class Expr:
+    """
+    An abstract expression object.
+
+    This contains a (potentially empty) tuple of child expressions,
+    along with non-child data. For uniform reconstruction and
+    implementation of hashing and equality schemes, child classes need
+    to provide a certain amount of metadata when they are defined.
+    Specifically, the ``_non_child`` attribute must list, in order,
+    the names of the slots that are passed to the constructor. The
+    constructor must take arguments in the order ``(*_non_child,
+    *children)``.
+    """
+
+    __slots__ = ("dtype", "_hash_value", "_repr_value")
+    dtype: plc.DataType
+    """Data type of the expression."""
+    _hash_value: int
+    """Caching slot for the hash of the expression."""
+    _repr_value: str
+    """Caching slot for repr of the expression."""
+    children: tuple[Expr, ...] = ()
+    """Children of the expression."""
+    _non_child: ClassVar[tuple[str, ...]] = ("dtype",)
+    """Names of non-child data (not Exprs) for reconstruction."""
+
+    # Constructor must take arguments in order (*_non_child, *children)
+    def __init__(self, dtype: plc.DataType) -> None:
+        self.dtype = dtype
+
+    def _ctor_arguments(self, children: Sequence[Expr]) -> Sequence:
+        return (*(getattr(self, attr) for attr in self._non_child), *children)
+
+    def get_hash(self) -> int:
+        """
+        Return the hash of this expr.
+
+        Override this in subclasses, rather than __hash__.
+
+        Returns
+        -------
+        The integer hash value.
+        """
+        return hash((type(self), self._ctor_arguments(self.children)))
+
+    def __hash__(self) -> int:
+        """Hash of an expression with caching."""
+        try:
+            return self._hash_value
+        except AttributeError:
+            self._hash_value = self.get_hash()
+            return self._hash_value
+
+    def is_equal(self, other: Any) -> bool:
+        """
+        Equality of two expressions.
+
+        Override this in subclasses, rather than __eq__.
+
+        Parameters
+        ----------
+        other
+            object to compare to
+
+        Returns
+        -------
+        True if the two expressions are equal, false otherwise.
+ """ + if type(self) is not type(other): + return False # pragma: no cover; __eq__ trips first + return self._ctor_arguments(self.children) == other._ctor_arguments( + other.children + ) + + def __eq__(self, other: Any) -> bool: + """Equality of expressions.""" + if type(self) is not type(other) or hash(self) != hash(other): + return False + else: + return self.is_equal(other) + + def __ne__(self, other: Any) -> bool: + """Inequality of expressions.""" + return not self.__eq__(other) + + def __repr__(self) -> str: + """String representation of an expression with caching.""" + try: + return self._repr_value + except AttributeError: + args = ", ".join(f"{arg!r}" for arg in self._ctor_arguments(self.children)) + self._repr_value = f"{type(self).__name__}({args})" + return self._repr_value + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """ + Evaluate this expression given a dataframe for context. + + Parameters + ---------- + df + DataFrame that will provide columns. + context + What context are we performing this evaluation in? + mapping + Substitution mapping from expressions to Columns, used to + override the evaluation of a given expression if we're + performing a simple rewritten evaluation. + + Notes + ----- + Do not call this function directly, but rather + :meth:`evaluate` which handles the mapping lookups. + + Returns + ------- + Column representing the evaluation of the expression. + + Raises + ------ + NotImplementedError + If we couldn't evaluate the expression. Ideally all these + are returned during translation to the IR, but for now we + are not perfect. + """ + raise NotImplementedError( + f"Evaluation of expression {type(self).__name__}" + ) # pragma: no cover; translation of unimplemented nodes trips first + + def evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """ + Evaluate this expression given a dataframe for context. + + Parameters + ---------- + df + DataFrame that will provide columns. + context + What context are we performing this evaluation in? + mapping + Substitution mapping from expressions to Columns, used to + override the evaluation of a given expression if we're + performing a simple rewritten evaluation. + + Notes + ----- + Individual subclasses should implement :meth:`do_evaluate`, + this method provides logic to handle lookups in the + substitution mapping. + + Returns + ------- + Column representing the evaluation of the expression. + + Raises + ------ + NotImplementedError + If we couldn't evaluate the expression. Ideally all these + are returned during translation to the IR, but for now we + are not perfect. + """ + if mapping is None: + return self.do_evaluate(df, context=context, mapping=mapping) + try: + return mapping[self] + except KeyError: + return self.do_evaluate(df, context=context, mapping=mapping) + + def collect_agg(self, *, depth: int) -> AggInfo: + """ + Collect information about aggregations in groupbys. + + Parameters + ---------- + depth + The depth of aggregating (reduction or sampling) + expressions we are currently at. + + Returns + ------- + Aggregation info describing the expression to aggregate in the + groupby. + + Raises + ------ + NotImplementedError + If we can't currently perform the aggregation request, for + example nested aggregations like ``a.max().min()``. 
+ """ + raise NotImplementedError( + f"Collecting aggregation info for {type(self).__name__}" + ) # pragma: no cover; check_agg trips first + + +class NamedExpr: + # NamedExpr does not inherit from Expr since it does not appear + # when evaluating expressions themselves, only when constructing + # named return values in dataframe (IR) nodes. + __slots__ = ("name", "value") + value: Expr + name: str + + def __init__(self, name: str, value: Expr) -> None: + self.name = name + self.value = value + + def __hash__(self) -> int: + """Hash of the expression.""" + return hash((type(self), self.name, self.value)) + + def __repr__(self) -> str: + """Repr of the expression.""" + return f"NamedExpr({self.name}, {self.value})" + + def __eq__(self, other: Any) -> bool: + """Equality of two expressions.""" + return ( + type(self) is type(other) + and self.name == other.name + and self.value == other.value + ) + + def __ne__(self, other: Any) -> bool: + """Inequality of expressions.""" + return not self.__eq__(other) + + def evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """ + Evaluate this expression given a dataframe for context. + + Parameters + ---------- + df + DataFrame providing context + context + Execution context + mapping + Substitution mapping + + Returns + ------- + Evaluated Column with name attached. + + See Also + -------- + :meth:`Expr.evaluate` for details, this function just adds the + name to a column produced from an expression. + """ + return self.value.evaluate(df, context=context, mapping=mapping).rename( + self.name + ) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return self.value.collect_agg(depth=depth) + + +class Col(Expr): + __slots__ = ("name",) + _non_child = ("dtype", "name") + name: str + children: tuple[()] + + def __init__(self, dtype: plc.DataType, name: str) -> None: + self.dtype = dtype + self.name = name + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + # Deliberately remove the name here so that we guarantee + # evaluation of the IR produces names. + return df.column_map[self.name].rename(None) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([(self, plc.aggregation.collect_list(), self)]) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py new file mode 100644 index 00000000000..19baae3611d --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py @@ -0,0 +1,135 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""BinaryOp DSL nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, ClassVar + +import pylibcudf as plc + +from polars.polars import _expr_nodes as pl_expr + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["BinOp"] + + +class BinOp(Expr): + __slots__ = ("op", "children") + _non_child = ("dtype", "op") + children: tuple[Expr, Expr] + + def __init__( + self, + dtype: plc.DataType, + op: plc.binaryop.BinaryOperator, + left: Expr, + right: Expr, + ) -> None: + super().__init__(dtype) + if plc.traits.is_boolean(self.dtype): + # For boolean output types, bitand and bitor implement + # boolean logic, so translate. bitxor also does, but the + # default behaviour is correct. + op = BinOp._BOOL_KLEENE_MAPPING.get(op, op) + self.op = op + self.children = (left, right) + if not plc.binaryop.is_supported_operation( + self.dtype, left.dtype, right.dtype, op + ): + raise NotImplementedError( + f"Operation {op.name} not supported " + f"for types {left.dtype.id().name} and {right.dtype.id().name} " + f"with output type {self.dtype.id().name}" + ) + + _BOOL_KLEENE_MAPPING: ClassVar[ + dict[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator] + ] = { + plc.binaryop.BinaryOperator.BITWISE_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.BITWISE_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + plc.binaryop.BinaryOperator.LOGICAL_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.LOGICAL_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + } + + _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { + pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, + pl_expr.Operator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, + pl_expr.Operator.NotEq: plc.binaryop.BinaryOperator.NOT_EQUAL, + pl_expr.Operator.NotEqValidity: plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, + pl_expr.Operator.Lt: plc.binaryop.BinaryOperator.LESS, + pl_expr.Operator.LtEq: plc.binaryop.BinaryOperator.LESS_EQUAL, + pl_expr.Operator.Gt: plc.binaryop.BinaryOperator.GREATER, + pl_expr.Operator.GtEq: plc.binaryop.BinaryOperator.GREATER_EQUAL, + pl_expr.Operator.Plus: plc.binaryop.BinaryOperator.ADD, + pl_expr.Operator.Minus: plc.binaryop.BinaryOperator.SUB, + pl_expr.Operator.Multiply: plc.binaryop.BinaryOperator.MUL, + pl_expr.Operator.Divide: plc.binaryop.BinaryOperator.DIV, + pl_expr.Operator.TrueDivide: plc.binaryop.BinaryOperator.TRUE_DIV, + pl_expr.Operator.FloorDivide: plc.binaryop.BinaryOperator.FLOOR_DIV, + pl_expr.Operator.Modulus: plc.binaryop.BinaryOperator.PYMOD, + pl_expr.Operator.And: plc.binaryop.BinaryOperator.BITWISE_AND, + pl_expr.Operator.Or: plc.binaryop.BinaryOperator.BITWISE_OR, + pl_expr.Operator.Xor: plc.binaryop.BinaryOperator.BITWISE_XOR, + pl_expr.Operator.LogicalAnd: plc.binaryop.BinaryOperator.LOGICAL_AND, + pl_expr.Operator.LogicalOr: plc.binaryop.BinaryOperator.LOGICAL_OR, + } + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + left, right = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + 
lop = left.obj + rop = right.obj + if left.obj.size() != right.obj.size(): + if left.is_scalar: + lop = left.obj_scalar + elif right.is_scalar: + rop = right.obj_scalar + return Column( + plc.binaryop.binary_operation(lop, rop, self.op, self.dtype), + ) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth == 1: + # inside aggregation, need to pre-evaluate, + # groupby construction has checked that we don't have + # nested aggs, so stop the recursion and return ourselves + # for pre-eval + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + else: + left_info, right_info = ( + child.collect_agg(depth=depth) for child in self.children + ) + requests = [*left_info.requests, *right_info.requests] + # TODO: Hack, if there were no reductions inside this + # binary expression then we want to pre-evaluate and + # collect ourselves. Otherwise we want to collect the + # aggregations inside and post-evaluate. This is a bad way + # of checking that we are in case 1. + if all( + agg.kind() == plc.aggregation.Kind.COLLECT_LIST + for _, agg, _ in requests + ): + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + return AggInfo( + [*left_info.requests, *right_info.requests], + ) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py new file mode 100644 index 00000000000..ff9973a47d5 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py @@ -0,0 +1,269 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""Boolean DSL nodes.""" + +from __future__ import annotations + +from functools import partial, reduce +from typing import TYPE_CHECKING, Any, ClassVar + +import pyarrow as pa +import pylibcudf as plc + +from polars.polars import _expr_nodes as pl_expr + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ( + ExecutionContext, + Expr, +) + +if TYPE_CHECKING: + from collections.abc import Mapping + + import polars.type_aliases as pl_types + + from cudf_polars.containers import DataFrame + +__all__ = ["BooleanFunction"] + + +class BooleanFunction(Expr): + __slots__ = ("name", "options", "children") + _non_child = ("dtype", "name", "options") + children: tuple[Expr, ...] + + def __init__( + self, + dtype: plc.DataType, + name: pl_expr.BooleanFunction, + options: tuple[Any, ...], + *children: Expr, + ) -> None: + super().__init__(dtype) + self.options = options + self.name = name + self.children = children + if self.name == pl_expr.BooleanFunction.IsIn and not all( + c.dtype == self.children[0].dtype for c in self.children + ): + # TODO: If polars IR doesn't put the casts in, we need to + # mimic the supertype promotion rules. 
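+            # (e.g. an Int64 needle column checked against a Float64
+            # haystack would first need both sides promoted to a common
+            # supertype, which the translation does not attempt)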
+            raise NotImplementedError("IsIn doesn't support supertype casting")
+
+    @staticmethod
+    def _distinct(
+        column: Column,
+        *,
+        keep: plc.stream_compaction.DuplicateKeepOption,
+        source_value: plc.Scalar,
+        target_value: plc.Scalar,
+    ) -> Column:
+        table = plc.Table([column.obj])
+        indices = plc.stream_compaction.distinct_indices(
+            table,
+            keep,
+            # TODO: polars doesn't expose options for these
+            plc.types.NullEquality.EQUAL,
+            plc.types.NanEquality.ALL_EQUAL,
+        )
+        return Column(
+            plc.copying.scatter(
+                [source_value],
+                indices,
+                plc.Table([plc.Column.from_scalar(target_value, table.num_rows())]),
+            ).columns()[0]
+        )
+
+    _BETWEEN_OPS: ClassVar[
+        dict[
+            pl_types.ClosedInterval,
+            tuple[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator],
+        ]
+    ] = {
+        "none": (
+            plc.binaryop.BinaryOperator.GREATER,
+            plc.binaryop.BinaryOperator.LESS,
+        ),
+        "left": (
+            plc.binaryop.BinaryOperator.GREATER_EQUAL,
+            plc.binaryop.BinaryOperator.LESS,
+        ),
+        "right": (
+            plc.binaryop.BinaryOperator.GREATER,
+            plc.binaryop.BinaryOperator.LESS_EQUAL,
+        ),
+        "both": (
+            plc.binaryop.BinaryOperator.GREATER_EQUAL,
+            plc.binaryop.BinaryOperator.LESS_EQUAL,
+        ),
+    }
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: Mapping[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        if self.name in (
+            pl_expr.BooleanFunction.IsFinite,
+            pl_expr.BooleanFunction.IsInfinite,
+        ):
+            # Avoid evaluating the child if the dtype tells us it's unnecessary.
+            (child,) = self.children
+            is_finite = self.name == pl_expr.BooleanFunction.IsFinite
+            if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64):
+                value = plc.interop.from_arrow(
+                    pa.scalar(value=is_finite, type=plc.interop.to_arrow(self.dtype))
+                )
+                return Column(plc.Column.from_scalar(value, df.num_rows))
+            needles = child.evaluate(df, context=context, mapping=mapping)
+            to_search = [-float("inf"), float("inf")]
+            if is_finite:
+                # NaN is neither finite nor infinite
+                to_search.append(float("nan"))
+            haystack = plc.interop.from_arrow(
+                pa.array(
+                    to_search,
+                    type=plc.interop.to_arrow(needles.obj.type()),
+                )
+            )
+            result = plc.search.contains(haystack, needles.obj)
+            if is_finite:
+                result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT)
+            return Column(result)
+        columns = [
+            child.evaluate(df, context=context, mapping=mapping)
+            for child in self.children
+        ]
+        # Kleene logic for Any (OR) and All (AND) if ignore_nulls is
+        # False
+        if self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All):
+            (ignore_nulls,) = self.options
+            (column,) = columns
+            is_any = self.name == pl_expr.BooleanFunction.Any
+            agg = plc.aggregation.any() if is_any else plc.aggregation.all()
+            result = plc.reduce.reduce(column.obj, agg, self.dtype)
+            if not ignore_nulls and column.obj.null_count() > 0:
+                # Truth tables
+                #     Any         All
+                #   | F U T     | F U T
+                # --+------   --+------
+                # F | F U T   F | F F F
+                # U | U U T   U | F U U
+                # T | T T T   T | F U T
+                #
+                # If the input null count was non-zero, we must
+                # post-process the result to insert the correct value.
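+                # Worked example for Any: [False, None] reduces (nulls
+                # skipped) to False, but False || Null is Null under
+                # Kleene logic, so the all-null branch below is taken;
+                # [True, None] reduces to True, and True || Null is True,
+                # so the reduced scalar stands.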
+ h_result = plc.interop.to_arrow(result).as_py() + if is_any and not h_result or not is_any and h_result: + # Any All + # False || Null => Null True && Null => Null + return Column(plc.Column.all_null_like(column.obj, 1)) + return Column(plc.Column.from_scalar(result, 1)) + if self.name == pl_expr.BooleanFunction.IsNull: + (column,) = columns + return Column(plc.unary.is_null(column.obj)) + elif self.name == pl_expr.BooleanFunction.IsNotNull: + (column,) = columns + return Column(plc.unary.is_valid(column.obj)) + elif self.name == pl_expr.BooleanFunction.IsNan: + (column,) = columns + return Column( + plc.unary.is_nan(column.obj).with_mask( + column.obj.null_mask(), column.obj.null_count() + ) + ) + elif self.name == pl_expr.BooleanFunction.IsNotNan: + (column,) = columns + return Column( + plc.unary.is_not_nan(column.obj).with_mask( + column.obj.null_mask(), column.obj.null_count() + ) + ) + elif self.name == pl_expr.BooleanFunction.IsFirstDistinct: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, + source_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), + ) + elif self.name == pl_expr.BooleanFunction.IsLastDistinct: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST, + source_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), + ) + elif self.name == pl_expr.BooleanFunction.IsUnique: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, + source_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), + ) + elif self.name == pl_expr.BooleanFunction.IsDuplicated: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, + source_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + ) + elif self.name == pl_expr.BooleanFunction.AllHorizontal: + return Column( + reduce( + partial( + plc.binaryop.binary_operation, + op=plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + output_type=self.dtype, + ), + (c.obj for c in columns), + ) + ) + elif self.name == pl_expr.BooleanFunction.AnyHorizontal: + return Column( + reduce( + partial( + plc.binaryop.binary_operation, + op=plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + output_type=self.dtype, + ), + (c.obj for c in columns), + ) + ) + elif self.name == pl_expr.BooleanFunction.IsIn: + needles, haystack = columns + return Column(plc.search.contains(haystack.obj, needles.obj)) + elif self.name == pl_expr.BooleanFunction.Not: + (column,) = columns + return Column( + plc.unary.unary_operation(column.obj, plc.unary.UnaryOperator.NOT) + ) + else: + raise NotImplementedError( + f"BooleanFunction {self.name}" + ) # pragma: no cover; handled by init raising diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py new file mode 100644 
index 00000000000..f752a23b628
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
@@ -0,0 +1,132 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+# TODO: remove need for this
+# ruff: noqa: D101
+"""DSL nodes for datetime operations."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, ClassVar
+
+import pyarrow as pa
+import pylibcudf as plc
+
+from polars.polars import _expr_nodes as pl_expr
+
+from cudf_polars.containers import Column
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
+    from cudf_polars.containers import DataFrame
+
+__all__ = ["TemporalFunction"]
+
+
+class TemporalFunction(Expr):
+    __slots__ = ("name", "options", "children")
+    _COMPONENT_MAP: ClassVar[
+        dict[pl_expr.TemporalFunction, plc.datetime.DatetimeComponent]
+    ] = {
+        pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR,
+        pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH,
+        pl_expr.TemporalFunction.Day: plc.datetime.DatetimeComponent.DAY,
+        pl_expr.TemporalFunction.WeekDay: plc.datetime.DatetimeComponent.WEEKDAY,
+        pl_expr.TemporalFunction.Hour: plc.datetime.DatetimeComponent.HOUR,
+        pl_expr.TemporalFunction.Minute: plc.datetime.DatetimeComponent.MINUTE,
+        pl_expr.TemporalFunction.Second: plc.datetime.DatetimeComponent.SECOND,
+        pl_expr.TemporalFunction.Millisecond: plc.datetime.DatetimeComponent.MILLISECOND,
+        pl_expr.TemporalFunction.Microsecond: plc.datetime.DatetimeComponent.MICROSECOND,
+        pl_expr.TemporalFunction.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND,
+    }
+    _non_child = ("dtype", "name", "options")
+    children: tuple[Expr, ...]
+
+    def __init__(
+        self,
+        dtype: plc.DataType,
+        name: pl_expr.TemporalFunction,
+        options: tuple[Any, ...],
+        *children: Expr,
+    ) -> None:
+        super().__init__(dtype)
+        self.options = options
+        self.name = name
+        self.children = children
+        if self.name not in self._COMPONENT_MAP:
+            raise NotImplementedError(f"Temporal function {self.name}")
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: Mapping[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        columns = [
+            child.evaluate(df, context=context, mapping=mapping)
+            for child in self.children
+        ]
+        (column,) = columns
+        if self.name == pl_expr.TemporalFunction.Microsecond:
+            millis = plc.datetime.extract_datetime_component(
+                column.obj, plc.datetime.DatetimeComponent.MILLISECOND
+            )
+            micros = plc.datetime.extract_datetime_component(
+                column.obj, plc.datetime.DatetimeComponent.MICROSECOND
+            )
+            millis_as_micros = plc.binaryop.binary_operation(
+                millis,
+                plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())),
+                plc.binaryop.BinaryOperator.MUL,
+                plc.DataType(plc.TypeId.INT32),
+            )
+            total_micros = plc.binaryop.binary_operation(
+                micros,
+                millis_as_micros,
+                plc.binaryop.BinaryOperator.ADD,
+                plc.types.DataType(plc.types.TypeId.INT32),
+            )
+            return Column(total_micros)
+        elif self.name == pl_expr.TemporalFunction.Nanosecond:
+            millis = plc.datetime.extract_datetime_component(
+                column.obj, plc.datetime.DatetimeComponent.MILLISECOND
+            )
+            micros = plc.datetime.extract_datetime_component(
+                column.obj, plc.datetime.DatetimeComponent.MICROSECOND
+            )
+            nanos = plc.datetime.extract_datetime_component(
+                column.obj, plc.datetime.DatetimeComponent.NANOSECOND
+            )
+            millis_as_nanos =
plc.binaryop.binary_operation( + millis, + plc.interop.from_arrow(pa.scalar(1_000_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.types.DataType(plc.types.TypeId.INT32), + ) + micros_as_nanos = plc.binaryop.binary_operation( + micros, + plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.types.DataType(plc.types.TypeId.INT32), + ) + total_nanos = plc.binaryop.binary_operation( + nanos, + millis_as_nanos, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + total_nanos = plc.binaryop.binary_operation( + total_nanos, + micros_as_nanos, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + return Column(total_nanos) + + return Column( + plc.datetime.extract_datetime_component( + column.obj, + self._COMPONENT_MAP[self.name], + ) + ) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py new file mode 100644 index 00000000000..562a2255033 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py @@ -0,0 +1,88 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""Literal DSL nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +import pyarrow as pa +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr +from cudf_polars.utils import dtypes + +if TYPE_CHECKING: + from collections.abc import Mapping + + import pyarrow as pa + + import polars as pl + + from cudf_polars.containers import DataFrame + +__all__ = ["Literal", "LiteralColumn"] + + +class Literal(Expr): + __slots__ = ("value",) + _non_child = ("dtype", "value") + value: pa.Scalar[Any] + children: tuple[()] + + def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None: + super().__init__(dtype) + assert value.type == plc.interop.to_arrow(dtype) + self.value = value + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + # datatype of pyarrow scalar is correct by construction. + return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1)) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([]) + + +class LiteralColumn(Expr): + __slots__ = ("value",) + _non_child = ("dtype", "value") + value: pa.Array[Any, Any] + children: tuple[()] + + def __init__(self, dtype: plc.DataType, value: pl.Series) -> None: + super().__init__(dtype) + data = value.to_arrow() + self.value = data.cast(dtypes.downcast_arrow_lists(data.type)) + + def get_hash(self) -> int: + """Compute a hash of the column.""" + # This is stricter than necessary, but we only need this hash + # for identity in groupby replacements so it's OK. And this + # way we avoid doing potentially expensive compute. 
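+        # Note: because Expr.__eq__ short-circuits on a hash mismatch, two
+        # LiteralColumns wrapping equal but distinct arrays also compare
+        # unequal; that is acceptable for the identity-based groupby
+        # substitution described above.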
+ return hash((type(self), self.dtype, id(self.value))) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + # datatype of pyarrow array is correct by construction. + return Column(plc.interop.from_arrow(self.value)) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([]) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py new file mode 100644 index 00000000000..f7dcc3c542c --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""Rolling DSL nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from cudf_polars.dsl.expressions.base import Expr + +if TYPE_CHECKING: + import pylibcudf as plc + +__all__ = ["RollingWindow", "GroupedRollingWindow"] + + +class RollingWindow(Expr): + __slots__ = ("options", "children") + _non_child = ("dtype", "options") + children: tuple[Expr] + + def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None: + super().__init__(dtype) + self.options = options + self.children = (agg,) + raise NotImplementedError("Rolling window not implemented") + + +class GroupedRollingWindow(Expr): + __slots__ = ("options", "children") + _non_child = ("dtype", "options") + children: tuple[Expr, ...] + + def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr) -> None: + super().__init__(dtype) + self.options = options + self.children = (agg, *by) + raise NotImplementedError("Grouped rolling window not implemented") diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py new file mode 100644 index 00000000000..a7a3e68a28c --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""DSL nodes for selection operations.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pyarrow as pa +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ExecutionContext, Expr + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["Gather", "Filter"] + + +class Gather(Expr): + __slots__ = ("children",) + _non_child = ("dtype",) + children: tuple[Expr, Expr] + + def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr) -> None: + super().__init__(dtype) + self.children = (values, indices) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + values, indices = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + lo, hi = plc.reduce.minmax(indices.obj) + lo = plc.interop.to_arrow(lo).as_py() + hi = plc.interop.to_arrow(hi).as_py() + n = df.num_rows + if hi >= n or lo < -n: + raise ValueError("gather indices are out of bounds") + if indices.obj.null_count(): + bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY + obj = plc.replace.replace_nulls( + indices.obj, + plc.interop.from_arrow( + pa.scalar(n, type=plc.interop.to_arrow(indices.obj.type())) + ), + ) + else: + bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK + obj = indices.obj + table = plc.copying.gather(plc.Table([values.obj]), obj, bounds_policy) + return Column(table.columns()[0]) + + +class Filter(Expr): + __slots__ = ("children",) + _non_child = ("dtype",) + children: tuple[Expr, Expr] + + def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): + super().__init__(dtype) + self.children = (values, indices) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + values, mask = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + table = plc.stream_compaction.apply_boolean_mask( + plc.Table([values.obj]), mask.obj + ) + return Column(table.columns()[0]).sorted_like(values) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py b/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py new file mode 100644 index 00000000000..861b73ce6a0 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py @@ -0,0 +1,97 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""Sorting DSL nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ExecutionContext, Expr +from cudf_polars.utils import sorting + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["Sort", "SortBy"] + + +class Sort(Expr): + __slots__ = ("options", "children") + _non_child = ("dtype", "options") + children: tuple[Expr] + + def __init__( + self, dtype: plc.DataType, options: tuple[bool, bool, bool], column: Expr + ) -> None: + super().__init__(dtype) + self.options = options + self.children = (column,) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + (child,) = self.children + column = child.evaluate(df, context=context, mapping=mapping) + (stable, nulls_last, descending) = self.options + order, null_order = sorting.sort_order( + [descending], nulls_last=[nulls_last], num_keys=1 + ) + do_sort = plc.sorting.stable_sort if stable else plc.sorting.sort + table = do_sort(plc.Table([column.obj]), order, null_order) + return Column( + table.columns()[0], + is_sorted=plc.types.Sorted.YES, + order=order[0], + null_order=null_order[0], + ) + + +class SortBy(Expr): + __slots__ = ("options", "children") + _non_child = ("dtype", "options") + children: tuple[Expr, ...] + + def __init__( + self, + dtype: plc.DataType, + options: tuple[bool, tuple[bool], tuple[bool]], + column: Expr, + *by: Expr, + ) -> None: + super().__init__(dtype) + self.options = options + self.children = (column, *by) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + column, *by = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + (stable, nulls_last, descending) = self.options + order, null_order = sorting.sort_order( + descending, nulls_last=nulls_last, num_keys=len(by) + ) + do_sort = plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key + table = do_sort( + plc.Table([column.obj]), plc.Table([c.obj for c in by]), order, null_order + ) + return Column(table.columns()[0]) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py new file mode 100644 index 00000000000..6669669aadc --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -0,0 +1,283 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0
+# TODO: remove need for this
+# ruff: noqa: D101
+"""DSL nodes for string operations."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
+
+from polars.exceptions import InvalidOperationError
+from polars.polars import _expr_nodes as pl_expr
+
+from cudf_polars.containers import Column
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
+from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
+    from cudf_polars.containers import DataFrame
+
+__all__ = ["StringFunction"]
+
+
+class StringFunction(Expr):
+    __slots__ = ("name", "options", "children", "_regex_program")
+    _non_child = ("dtype", "name", "options")
+    children: tuple[Expr, ...]
+
+    def __init__(
+        self,
+        dtype: plc.DataType,
+        name: pl_expr.StringFunction,
+        options: tuple[Any, ...],
+        *children: Expr,
+    ) -> None:
+        super().__init__(dtype)
+        self.options = options
+        self.name = name
+        self.children = children
+        self._validate_input()
+
+    def _validate_input(self):
+        if self.name not in (
+            pl_expr.StringFunction.Contains,
+            pl_expr.StringFunction.EndsWith,
+            pl_expr.StringFunction.Lowercase,
+            pl_expr.StringFunction.Replace,
+            pl_expr.StringFunction.ReplaceMany,
+            pl_expr.StringFunction.Slice,
+            pl_expr.StringFunction.Strptime,
+            pl_expr.StringFunction.StartsWith,
+            pl_expr.StringFunction.StripChars,
+            pl_expr.StringFunction.StripCharsStart,
+            pl_expr.StringFunction.StripCharsEnd,
+            pl_expr.StringFunction.Uppercase,
+        ):
+            raise NotImplementedError(f"String function {self.name}")
+        if self.name == pl_expr.StringFunction.Contains:
+            literal, strict = self.options
+            if not literal:
+                if not strict:
+                    raise NotImplementedError(
+                        f"{strict=} is not supported for regex contains"
+                    )
+                if not isinstance(self.children[1], Literal):
+                    raise NotImplementedError(
+                        "Regex contains only supports a scalar pattern"
+                    )
+                pattern = self.children[1].value.as_py()
+                try:
+                    self._regex_program = plc.strings.regex_program.RegexProgram.create(
+                        pattern,
+                        flags=plc.strings.regex_flags.RegexFlags.DEFAULT,
+                    )
+                except RuntimeError as e:
+                    raise NotImplementedError(
+                        f"Unsupported regex {pattern} for GPU engine."
+ ) from e + elif self.name == pl_expr.StringFunction.Replace: + _, literal = self.options + if not literal: + raise NotImplementedError("literal=False is not supported for replace") + if not all(isinstance(expr, Literal) for expr in self.children[1:]): + raise NotImplementedError("replace only supports scalar target") + target = self.children[1] + if target.value == pa.scalar("", type=pa.string()): + raise NotImplementedError( + "libcudf replace does not support empty strings" + ) + elif self.name == pl_expr.StringFunction.ReplaceMany: + (ascii_case_insensitive,) = self.options + if ascii_case_insensitive: + raise NotImplementedError( + "ascii_case_insensitive not implemented for replace_many" + ) + if not all( + isinstance(expr, (LiteralColumn, Literal)) for expr in self.children[1:] + ): + raise NotImplementedError("replace_many only supports literal inputs") + target = self.children[1] + if pc.any(pc.equal(target.value, "")).as_py(): + raise NotImplementedError( + "libcudf replace_many is implemented differently from polars " + "for empty strings" + ) + elif self.name == pl_expr.StringFunction.Slice: + if not all(isinstance(child, Literal) for child in self.children[1:]): + raise NotImplementedError( + "Slice only supports literal start and stop values" + ) + elif self.name == pl_expr.StringFunction.Strptime: + format, _, exact, cache = self.options + if cache: + raise NotImplementedError("Strptime cache is a CPU feature") + if format is None: + raise NotImplementedError("Strptime format is required") + if not exact: + raise NotImplementedError("Strptime does not support exact=False") + elif self.name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + if not isinstance(self.children[1], Literal): + raise NotImplementedError( + "strip operations only support scalar patterns" + ) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if self.name == pl_expr.StringFunction.Contains: + child, arg = self.children + column = child.evaluate(df, context=context, mapping=mapping) + + literal, _ = self.options + if literal: + pat = arg.evaluate(df, context=context, mapping=mapping) + pattern = ( + pat.obj_scalar + if pat.is_scalar and pat.obj.size() != column.obj.size() + else pat.obj + ) + return Column(plc.strings.find.contains(column.obj, pattern)) + else: + return Column( + plc.strings.contains.contains_re(column.obj, self._regex_program) + ) + elif self.name == pl_expr.StringFunction.Slice: + child, expr_offset, expr_length = self.children + assert isinstance(expr_offset, Literal) + assert isinstance(expr_length, Literal) + + column = child.evaluate(df, context=context, mapping=mapping) + # libcudf slices via [start,stop). + # polars slices with offset + length where start == offset + # stop = start + length. Negative values for start look backward + # from the last element of the string. If the end index would be + # below zero, an empty string is returned. 
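+            # A worked example (illustrative values, added for clarity):
+            # slicing "abcdef" with offset=-3, length=2 maps to
+            # start=-3, stop=-1 and yields "de", while offset=-3,
+            # length=5 runs past the end of the string, so stop becomes
+            # None and the result is "def".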
+ # Do this maths on the host + start = expr_offset.value.as_py() + length = expr_length.value.as_py() + + if length == 0: + stop = start + else: + # No length indicates a scan to the end + # The libcudf equivalent is a null stop + stop = start + length if length else None + if length and start < 0 and length >= -start: + stop = None + return Column( + plc.strings.slice.slice_strings( + column.obj, + plc.interop.from_arrow(pa.scalar(start, type=pa.int32())), + plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), + ) + ) + elif self.name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + column, chars = ( + c.evaluate(df, context=context, mapping=mapping) for c in self.children + ) + if self.name == pl_expr.StringFunction.StripCharsStart: + side = plc.strings.SideType.LEFT + elif self.name == pl_expr.StringFunction.StripCharsEnd: + side = plc.strings.SideType.RIGHT + else: + side = plc.strings.SideType.BOTH + return Column(plc.strings.strip.strip(column.obj, side, chars.obj_scalar)) + + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] + if self.name == pl_expr.StringFunction.Lowercase: + (column,) = columns + return Column(plc.strings.case.to_lower(column.obj)) + elif self.name == pl_expr.StringFunction.Uppercase: + (column,) = columns + return Column(plc.strings.case.to_upper(column.obj)) + elif self.name == pl_expr.StringFunction.EndsWith: + column, suffix = columns + return Column( + plc.strings.find.ends_with( + column.obj, + suffix.obj_scalar + if column.obj.size() != suffix.obj.size() and suffix.is_scalar + else suffix.obj, + ) + ) + elif self.name == pl_expr.StringFunction.StartsWith: + column, prefix = columns + return Column( + plc.strings.find.starts_with( + column.obj, + prefix.obj_scalar + if column.obj.size() != prefix.obj.size() and prefix.is_scalar + else prefix.obj, + ) + ) + elif self.name == pl_expr.StringFunction.Strptime: + # TODO: ignores ambiguous + format, strict, exact, cache = self.options + col = self.children[0].evaluate(df, context=context, mapping=mapping) + + is_timestamps = plc.strings.convert.convert_datetime.is_timestamp( + col.obj, format + ) + + if strict: + if not plc.interop.to_arrow( + plc.reduce.reduce( + is_timestamps, + plc.aggregation.all(), + plc.DataType(plc.TypeId.BOOL8), + ) + ).as_py(): + raise InvalidOperationError("conversion from `str` failed.") + else: + not_timestamps = plc.unary.unary_operation( + is_timestamps, plc.unary.UnaryOperator.NOT + ) + + null = plc.interop.from_arrow(pa.scalar(None, type=pa.string())) + res = plc.copying.boolean_mask_scatter( + [null], plc.Table([col.obj]), not_timestamps + ) + return Column( + plc.strings.convert.convert_datetime.to_timestamps( + res.columns()[0], self.dtype, format + ) + ) + elif self.name == pl_expr.StringFunction.Replace: + column, target, repl = columns + n, _ = self.options + return Column( + plc.strings.replace.replace( + column.obj, target.obj_scalar, repl.obj_scalar, maxrepl=n + ) + ) + elif self.name == pl_expr.StringFunction.ReplaceMany: + column, target, repl = columns + return Column( + plc.strings.replace.replace_multiple(column.obj, target.obj, repl.obj) + ) + raise NotImplementedError( + f"StringFunction {self.name}" + ) # pragma: no cover; handled by init raising diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py new file mode 100644 index 00000000000..c7d7a802ded 
--- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py @@ -0,0 +1,53 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""DSL nodes for ternary operations.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ( + ExecutionContext, + Expr, +) + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + + +__all__ = ["Ternary"] + + +class Ternary(Expr): + __slots__ = ("children",) + _non_child = ("dtype",) + children: tuple[Expr, Expr, Expr] + + def __init__( + self, dtype: plc.DataType, when: Expr, then: Expr, otherwise: Expr + ) -> None: + super().__init__(dtype) + self.children = (when, then, otherwise) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + when, then, otherwise = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + then_obj = then.obj_scalar if then.is_scalar else then.obj + otherwise_obj = otherwise.obj_scalar if otherwise.is_scalar else otherwise.obj + return Column(plc.copying.copy_if_else(then_obj, otherwise_obj, when.obj)) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py new file mode 100644 index 00000000000..3d4d15be1ce --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -0,0 +1,328 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0
+# TODO: remove need for this
+"""DSL nodes for unary operations."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, ClassVar
+
+import pyarrow as pa
+import pylibcudf as plc
+
+from cudf_polars.containers import Column
+from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr
+from cudf_polars.dsl.expressions.literal import Literal
+from cudf_polars.utils import dtypes
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
+    from cudf_polars.containers import DataFrame
+
+__all__ = ["Cast", "UnaryFunction", "Len"]
+
+
+class Cast(Expr):
+    """Class representing a cast of an expression."""
+
+    __slots__ = ("children",)
+    _non_child = ("dtype",)
+    children: tuple[Expr]
+
+    def __init__(self, dtype: plc.DataType, value: Expr) -> None:
+        super().__init__(dtype)
+        self.children = (value,)
+        if not dtypes.can_cast(value.dtype, self.dtype):
+            raise NotImplementedError(
+                f"Can't cast {value.dtype.id().name} to {self.dtype.id().name}"
+            )
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: Mapping[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        (child,) = self.children
+        column = child.evaluate(df, context=context, mapping=mapping)
+        return Column(plc.unary.cast(column.obj, self.dtype)).sorted_like(column)
+
+    def collect_agg(self, *, depth: int) -> AggInfo:
+        """Collect information about aggregations in groupbys."""
+        # TODO: Could do with sort-based groupby and segmented filter
+        (child,) = self.children
+        return child.collect_agg(depth=depth)
+
+
+class Len(Expr):
+    """Class representing the length of an expression."""
+
+    children: tuple[()]
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: Mapping[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        return Column(
+            plc.Column.from_scalar(
+                plc.interop.from_arrow(
+                    pa.scalar(df.num_rows, type=plc.interop.to_arrow(self.dtype))
+                ),
+                1,
+            )
+        )
+
+    def collect_agg(self, *, depth: int) -> AggInfo:
+        """Collect information about aggregations in groupbys."""
+        # TODO: polars returns a uint, not an int for count
+        return AggInfo(
+            [(None, plc.aggregation.count(plc.types.NullPolicy.INCLUDE), self)]
+        )
+
+
+class UnaryFunction(Expr):
+    """Class representing unary functions of an expression."""
+
+    __slots__ = ("name", "options", "children")
+    _non_child = ("dtype", "name", "options")
+    children: tuple[Expr, ...]
+ + # Note: log, and pow are handled via translation to binops + _OP_MAPPING: ClassVar[dict[str, plc.unary.UnaryOperator]] = { + "sin": plc.unary.UnaryOperator.SIN, + "cos": plc.unary.UnaryOperator.COS, + "tan": plc.unary.UnaryOperator.TAN, + "arcsin": plc.unary.UnaryOperator.ARCSIN, + "arccos": plc.unary.UnaryOperator.ARCCOS, + "arctan": plc.unary.UnaryOperator.ARCTAN, + "sinh": plc.unary.UnaryOperator.SINH, + "cosh": plc.unary.UnaryOperator.COSH, + "tanh": plc.unary.UnaryOperator.TANH, + "arcsinh": plc.unary.UnaryOperator.ARCSINH, + "arccosh": plc.unary.UnaryOperator.ARCCOSH, + "arctanh": plc.unary.UnaryOperator.ARCTANH, + "exp": plc.unary.UnaryOperator.EXP, + "sqrt": plc.unary.UnaryOperator.SQRT, + "cbrt": plc.unary.UnaryOperator.CBRT, + "ceil": plc.unary.UnaryOperator.CEIL, + "floor": plc.unary.UnaryOperator.FLOOR, + "abs": plc.unary.UnaryOperator.ABS, + "bit_invert": plc.unary.UnaryOperator.BIT_INVERT, + "not": plc.unary.UnaryOperator.NOT, + } + _supported_misc_fns = frozenset( + { + "drop_nulls", + "fill_null", + "mask_nans", + "round", + "set_sorted", + "unique", + } + ) + _supported_cum_aggs = frozenset( + { + "cum_min", + "cum_max", + "cum_prod", + "cum_sum", + } + ) + _supported_fns = frozenset().union( + _supported_misc_fns, _supported_cum_aggs, _OP_MAPPING.keys() + ) + + def __init__( + self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr + ) -> None: + super().__init__(dtype) + self.name = name + self.options = options + self.children = children + + if self.name not in UnaryFunction._supported_fns: + raise NotImplementedError(f"Unary function {name=}") + if self.name in UnaryFunction._supported_cum_aggs: + (reverse,) = self.options + if reverse: + raise NotImplementedError( + "reverse=True is not supported for cumulative aggregations" + ) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if self.name == "mask_nans": + (child,) = self.children + return child.evaluate(df, context=context, mapping=mapping).mask_nans() + if self.name == "round": + (decimal_places,) = self.options + (values,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + return Column( + plc.round.round( + values.obj, decimal_places, plc.round.RoundingMethod.HALF_UP + ) + ).sorted_like(values) + elif self.name == "unique": + (maintain_order,) = self.options + (values,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + # Only one column, so keep_any is the same as keep_first + # for stable distinct + keep = plc.stream_compaction.DuplicateKeepOption.KEEP_ANY + if values.is_sorted: + maintain_order = True + result = plc.stream_compaction.unique( + plc.Table([values.obj]), + [0], + keep, + plc.types.NullEquality.EQUAL, + ) + else: + distinct = ( + plc.stream_compaction.stable_distinct + if maintain_order + else plc.stream_compaction.distinct + ) + result = distinct( + plc.Table([values.obj]), + [0], + keep, + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + (column,) = result.columns() + if maintain_order: + return Column(column).sorted_like(values) + return Column(column) + elif self.name == "set_sorted": + (column,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + (asc,) = self.options + order = ( + plc.types.Order.ASCENDING + if asc == "ascending" + 
else plc.types.Order.DESCENDING + ) + null_order = plc.types.NullOrder.BEFORE + if column.obj.null_count() > 0 and (n := column.obj.size()) > 1: + # PERF: This invokes four stream synchronisations! + has_nulls_first = not plc.copying.get_element(column.obj, 0).is_valid() + has_nulls_last = not plc.copying.get_element( + column.obj, n - 1 + ).is_valid() + if (order == plc.types.Order.DESCENDING and has_nulls_first) or ( + order == plc.types.Order.ASCENDING and has_nulls_last + ): + null_order = plc.types.NullOrder.AFTER + return column.set_sorted( + is_sorted=plc.types.Sorted.YES, + order=order, + null_order=null_order, + ) + elif self.name == "drop_nulls": + (column,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + return Column( + plc.stream_compaction.drop_nulls( + plc.Table([column.obj]), [0], 1 + ).columns()[0] + ) + elif self.name == "fill_null": + column = self.children[0].evaluate(df, context=context, mapping=mapping) + if isinstance(self.children[1], Literal): + arg = plc.interop.from_arrow(self.children[1].value) + else: + evaluated = self.children[1].evaluate( + df, context=context, mapping=mapping + ) + arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj + return Column(plc.replace.replace_nulls(column.obj, arg)) + elif self.name in self._OP_MAPPING: + column = self.children[0].evaluate(df, context=context, mapping=mapping) + if column.obj.type().id() != self.dtype.id(): + arg = plc.unary.cast(column.obj, self.dtype) + else: + arg = column.obj + return Column(plc.unary.unary_operation(arg, self._OP_MAPPING[self.name])) + elif self.name in UnaryFunction._supported_cum_aggs: + column = self.children[0].evaluate(df, context=context, mapping=mapping) + plc_col = column.obj + col_type = column.obj.type() + # cum_sum casts + # Int8, UInt8, Int16, UInt16 -> Int64 for overflow prevention + # Bool -> UInt32 + # cum_prod casts integer dtypes < int64 and bool to int64 + # See: + # https://github.com/pola-rs/polars/blob/main/crates/polars-ops/src/series/ops/cum_agg.rs + if ( + self.name == "cum_sum" + and col_type.id() + in { + plc.types.TypeId.INT8, + plc.types.TypeId.UINT8, + plc.types.TypeId.INT16, + plc.types.TypeId.UINT16, + } + ) or ( + self.name == "cum_prod" + and plc.traits.is_integral(col_type) + and plc.types.size_of(col_type) <= 4 + ): + plc_col = plc.unary.cast( + plc_col, plc.types.DataType(plc.types.TypeId.INT64) + ) + elif ( + self.name == "cum_sum" + and column.obj.type().id() == plc.types.TypeId.BOOL8 + ): + plc_col = plc.unary.cast( + plc_col, plc.types.DataType(plc.types.TypeId.UINT32) + ) + if self.name == "cum_sum": + agg = plc.aggregation.sum() + elif self.name == "cum_prod": + agg = plc.aggregation.product() + elif self.name == "cum_min": + agg = plc.aggregation.min() + elif self.name == "cum_max": + agg = plc.aggregation.max() + + return Column(plc.reduce.scan(plc_col, agg, plc.reduce.ScanType.INCLUSIVE)) + raise NotImplementedError( + f"Unimplemented unary function {self.name=}" + ) # pragma: no cover; init trips first + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if self.name in {"unique", "drop_nulls"} | self._supported_cum_aggs: + raise NotImplementedError(f"{self.name} in groupby") + if depth == 1: + # inside aggregation, need to pre-evaluate, groupby + # construction has checked that we don't have nested aggs, + # so stop the recursion and return ourselves for pre-eval + return AggInfo([(self, plc.aggregation.collect_list(), 
self)])
+        else:
+            (child,) = self.children
+            return child.collect_agg(depth=depth)

From fea87cb85cb2aa1dd2dd1c767bc56c256a734659 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Fri, 11 Oct 2024 12:39:24 -0700
Subject: [PATCH 059/117] Move `flatten_single_pass_aggs` to its own TU
 (#17053)

Part of splitting the original bulk shared memory groupby PR
https://github.com/rapidsai/cudf/pull/16619.

This PR separates `flatten_single_pass_aggs` into its own translation unit
without making any code modifications.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Shruti Shivakumar (https://github.com/shrshi)
  - Karthikeyan (https://github.com/karthikeyann)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/17053
---
 cpp/CMakeLists.txt                            |   1 +
 .../groupby/hash/flatten_single_pass_aggs.cpp | 139 ++++++++++++++++++
 .../groupby/hash/flatten_single_pass_aggs.hpp |  33 +++++
 cpp/src/groupby/hash/groupby.cu               | 105 +------------
 4 files changed, 174 insertions(+), 104 deletions(-)
 create mode 100644 cpp/src/groupby/hash/flatten_single_pass_aggs.cpp
 create mode 100644 cpp/src/groupby/hash/flatten_single_pass_aggs.hpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index f7a5dd2f2fb..a0ea9579475 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -315,6 +315,7 @@ add_library(
   src/filling/repeat.cu
   src/filling/sequence.cu
   src/groupby/groupby.cu
+  src/groupby/hash/flatten_single_pass_aggs.cpp
   src/groupby/hash/groupby.cu
   src/groupby/sort/aggregate.cpp
   src/groupby/sort/group_argmax.cu
diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp
new file mode 100644
index 00000000000..b2048a9fbb8
--- /dev/null
+++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "flatten_single_pass_aggs.hpp"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+namespace cudf::groupby::detail::hash {
+
+class groupby_simple_aggregations_collector final
+  : public cudf::detail::simple_aggregations_collector {
+ public:
+  using cudf::detail::simple_aggregations_collector::visit;
+
+  std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
+                                                  cudf::detail::min_aggregation const&) override
+  {
+    std::vector<std::unique_ptr<aggregation>> aggs;
+    aggs.push_back(col_type.id() == type_id::STRING ? make_argmin_aggregation()
+                                                    : make_min_aggregation());
+    return aggs;
+  }
+
+  std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
+                                                  cudf::detail::max_aggregation const&) override
+  {
+    std::vector<std::unique_ptr<aggregation>> aggs;
+    aggs.push_back(col_type.id() == type_id::STRING ?
make_argmax_aggregation()
+                                                    : make_max_aggregation());
+    return aggs;
+  }
+
+  std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
+                                                  cudf::detail::mean_aggregation const&) override
+  {
+    (void)col_type;
+    CUDF_EXPECTS(is_fixed_width(col_type), "MEAN aggregation expects fixed width type");
+    std::vector<std::unique_ptr<aggregation>> aggs;
+    aggs.push_back(make_sum_aggregation());
+    // COUNT_VALID
+    aggs.push_back(make_count_aggregation());
+
+    return aggs;
+  }
+
+  std::vector<std::unique_ptr<aggregation>> visit(data_type,
+                                                  cudf::detail::var_aggregation const&) override
+  {
+    std::vector<std::unique_ptr<aggregation>> aggs;
+    aggs.push_back(make_sum_aggregation());
+    // COUNT_VALID
+    aggs.push_back(make_count_aggregation());
+
+    return aggs;
+  }
+
+  std::vector<std::unique_ptr<aggregation>> visit(data_type,
+                                                  cudf::detail::std_aggregation const&) override
+  {
+    std::vector<std::unique_ptr<aggregation>> aggs;
+    aggs.push_back(make_sum_aggregation());
+    // COUNT_VALID
+    aggs.push_back(make_count_aggregation());
+
+    return aggs;
+  }
+
+  std::vector<std::unique_ptr<aggregation>> visit(
+    data_type, cudf::detail::correlation_aggregation const&) override
+  {
+    std::vector<std::unique_ptr<aggregation>> aggs;
+    aggs.push_back(make_sum_aggregation());
+    // COUNT_VALID
+    aggs.push_back(make_count_aggregation());
+
+    return aggs;
+  }
+};
+
+// flatten aggs to filter in single pass aggs
+std::tuple<table_view, std::vector<aggregation::Kind>, std::vector<std::unique_ptr<aggregation>>>
+flatten_single_pass_aggs(host_span<aggregation_request const> requests)
+{
+  std::vector<column_view> columns;
+  std::vector<std::unique_ptr<aggregation>> aggs;
+  std::vector<aggregation::Kind> agg_kinds;
+
+  for (auto const& request : requests) {
+    auto const& agg_v = request.aggregations;
+
+    std::unordered_set<aggregation::Kind> agg_kinds_set;
+    auto insert_agg = [&](column_view const& request_values, std::unique_ptr<aggregation>&& agg) {
+      if (agg_kinds_set.insert(agg->kind).second) {
+        agg_kinds.push_back(agg->kind);
+        aggs.push_back(std::move(agg));
+        columns.push_back(request_values);
+      }
+    };
+
+    auto values_type = cudf::is_dictionary(request.values.type())
+                         ? cudf::dictionary_column_view(request.values).keys().type()
+                         : request.values.type();
+    for (auto&& agg : agg_v) {
+      groupby_simple_aggregations_collector collector;
+
+      for (auto& agg_s : agg->get_simple_aggregations(values_type, collector)) {
+        insert_agg(request.values, std::move(agg_s));
+      }
+    }
+  }
+
+  return std::make_tuple(table_view(columns), std::move(agg_kinds), std::move(aggs));
+}
+
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp
new file mode 100644
index 00000000000..2bf983e5e90
--- /dev/null
+++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+namespace cudf::groupby::detail::hash {
+
+// flatten aggs to filter in single pass aggs
+std::tuple<table_view, std::vector<aggregation::Kind>, std::vector<std::unique_ptr<aggregation>>>
+flatten_single_pass_aggs(host_span<aggregation_request const> requests);
+
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index f9a80a048b5..75767786272 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "flatten_single_pass_aggs.hpp"
 #include "groupby/common/utils.hpp"
 #include "groupby/hash/groupby_kernels.cuh"
 
@@ -110,76 +111,6 @@ bool constexpr is_hash_aggregation(aggregation::Kind t)
   return array_contains(hash_aggregations, t);
 }
 
-class groupby_simple_aggregations_collector final
-  : public cudf::detail::simple_aggregations_collector {
- public:
-  using cudf::detail::simple_aggregations_collector::visit;
-
-  std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
-                                                  cudf::detail::min_aggregation const&) override
-  {
-    std::vector<std::unique_ptr<aggregation>> aggs;
-    aggs.push_back(col_type.id() == type_id::STRING ? make_argmin_aggregation()
-                                                    : make_min_aggregation());
-    return aggs;
-  }
-
-  std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
-                                                  cudf::detail::max_aggregation const&) override
-  {
-    std::vector<std::unique_ptr<aggregation>> aggs;
-    aggs.push_back(col_type.id() == type_id::STRING ? make_argmax_aggregation()
-                                                    : make_max_aggregation());
-    return aggs;
-  }
-
-  std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
-                                                  cudf::detail::mean_aggregation const&) override
-  {
-    (void)col_type;
-    CUDF_EXPECTS(is_fixed_width(col_type), "MEAN aggregation expects fixed width type");
-    std::vector<std::unique_ptr<aggregation>> aggs;
-    aggs.push_back(make_sum_aggregation());
-    // COUNT_VALID
-    aggs.push_back(make_count_aggregation());
-
-    return aggs;
-  }
-
-  std::vector<std::unique_ptr<aggregation>> visit(data_type,
-                                                  cudf::detail::var_aggregation const&) override
-  {
-    std::vector<std::unique_ptr<aggregation>> aggs;
-    aggs.push_back(make_sum_aggregation());
-    // COUNT_VALID
-    aggs.push_back(make_count_aggregation());
-
-    return aggs;
-  }
-
-  std::vector<std::unique_ptr<aggregation>> visit(data_type,
-                                                  cudf::detail::std_aggregation const&) override
-  {
-    std::vector<std::unique_ptr<aggregation>> aggs;
-    aggs.push_back(make_sum_aggregation());
-    // COUNT_VALID
-    aggs.push_back(make_count_aggregation());
-
-    return aggs;
-  }
-
-  std::vector<std::unique_ptr<aggregation>> visit(
-    data_type, cudf::detail::correlation_aggregation const&) override
-  {
-    std::vector<std::unique_ptr<aggregation>> aggs;
-    aggs.push_back(make_sum_aggregation());
-    // COUNT_VALID
-    aggs.push_back(make_count_aggregation());
-
-    return aggs;
-  }
-};
-
 template <typename SetType>
 class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer {
   column_view col;
@@ -347,40 +278,6 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer {
     dense_results->add_result(col, agg, std::move(result));
   }
 };
-// flatten aggs to filter in single pass aggs
-std::tuple<table_view, std::vector<aggregation::Kind>, std::vector<std::unique_ptr<aggregation>>>
-flatten_single_pass_aggs(host_span<aggregation_request const> requests)
-{
-  std::vector<column_view> columns;
-  std::vector<std::unique_ptr<aggregation>> aggs;
-  std::vector<aggregation::Kind> agg_kinds;
-
-  for (auto const& request : requests) {
-    auto const& agg_v = request.aggregations;
-
-    std::unordered_set<aggregation::Kind> agg_kinds_set;
-    auto insert_agg = [&](column_view const& request_values, std::unique_ptr<aggregation>&& agg) {
-      if (agg_kinds_set.insert(agg->kind).second) {
-        agg_kinds.push_back(agg->kind);
-        aggs.push_back(std::move(agg));
-        columns.push_back(request_values);
-      }
-    };
-
-    auto values_type = cudf::is_dictionary(request.values.type())
-                         ?
cudf::dictionary_column_view(request.values).keys().type() - : request.values.type(); - for (auto&& agg : agg_v) { - groupby_simple_aggregations_collector collector; - - for (auto& agg_s : agg->get_simple_aggregations(values_type, collector)) { - insert_agg(request.values, std::move(agg_s)); - } - } - } - - return std::make_tuple(table_view(columns), std::move(agg_kinds), std::move(aggs)); -} /** * @brief Gather sparse results into dense using `gather_map` and add to From c8a56a54204ba5473013d83f96192aa626f4b21b Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri, 11 Oct 2024 16:56:44 -0400 Subject: [PATCH 060/117] Migrate Min Hashing APIs to pylibcudf (#17021) Apart of #15162 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17021 --- .../api_docs/pylibcudf/nvtext/index.rst | 1 + .../api_docs/pylibcudf/nvtext/minhash.rst | 6 + python/cudf/cudf/_lib/nvtext/minhash.pyx | 101 +++-------- .../pylibcudf/libcudf/concatenate.pxd | 2 +- .../pylibcudf/pylibcudf/libcudf/groupby.pxd | 1 - .../pylibcudf/libcudf/nvtext/minhash.pxd | 14 ++ .../utilities/{host_span.pxd => span.pxd} | 0 .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt | 2 +- .../pylibcudf/pylibcudf/nvtext/__init__.pxd | 3 +- python/pylibcudf/pylibcudf/nvtext/__init__.py | 3 +- python/pylibcudf/pylibcudf/nvtext/minhash.pxd | 18 ++ python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 160 ++++++++++++++++++ .../pylibcudf/tests/test_nvtext_minhash.py | 54 ++++++ 13 files changed, 285 insertions(+), 80 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst rename python/pylibcudf/pylibcudf/libcudf/utilities/{host_span.pxd => span.pxd} (100%) create mode 100644 python/pylibcudf/pylibcudf/nvtext/minhash.pxd create mode 100644 python/pylibcudf/pylibcudf/nvtext/minhash.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst index 6300f77d686..f6caabe324d 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -7,3 +7,4 @@ nvtext edit_distance generate_ngrams jaccard + minhash diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst new file mode 100644 index 00000000000..b8ec02fca35 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst @@ -0,0 +1,6 @@ +======= +minhash +======= + +.. 
automodule:: pylibcudf.nvtext.minhash + :members: diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 59cb8d51440..5e39cafa47b 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -2,93 +2,44 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.minhash cimport ( - minhash as cpp_minhash, - minhash64 as cpp_minhash64, - word_minhash as cpp_word_minhash, - word_minhash64 as cpp_word_minhash64, -) -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column - -@acquire_spill_lock() -def minhash(Column strings, Column seeds, int width): - - cdef column_view c_strings = strings.view() - cdef size_type c_width = width - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_minhash( - c_strings, - c_seeds, - c_width - ) - ) - - return Column.from_unique_ptr(move(c_result)) +from pylibcudf import nvtext @acquire_spill_lock() -def minhash64(Column strings, Column seeds, int width): - - cdef column_view c_strings = strings.view() - cdef size_type c_width = width - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result +def minhash(Column input, Column seeds, int width=4): + result = nvtext.minhash.minhash( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + width, + ) + return Column.from_pylibcudf(result) - with nogil: - c_result = move( - cpp_minhash64( - c_strings, - c_seeds, - c_width - ) - ) - return Column.from_unique_ptr(move(c_result)) +@acquire_spill_lock() +def minhash64(Column input, Column seeds, int width=4): + result = nvtext.minhash.minhash64( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + width, + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def word_minhash(Column input, Column seeds): - - cdef column_view c_input = input.view() - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_word_minhash( - c_input, - c_seeds - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.minhash.word_minhash( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def word_minhash64(Column input, Column seeds): - - cdef column_view c_input = input.view() - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_word_minhash64( - c_input, - c_seeds - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.minhash.word_minhash64( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(result) diff --git a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd index a09b6c01392..def292148c5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd @@ -4,7 +4,7 @@ from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector from pylibcudf.libcudf.column.column cimport column, column_view from pylibcudf.libcudf.table.table cimport table, table_view -from pylibcudf.libcudf.utilities.host_span cimport host_span +from 
pylibcudf.libcudf.utilities.span cimport host_span from rmm.librmm.device_buffer cimport device_buffer diff --git a/python/pylibcudf/pylibcudf/libcudf/groupby.pxd b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd index 848462131fe..17ea33a2066 100644 --- a/python/pylibcudf/pylibcudf/libcudf/groupby.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd @@ -22,7 +22,6 @@ from pylibcudf.libcudf.types cimport ( size_type, sorted, ) -from pylibcudf.libcudf.utilities.host_span cimport host_span # workaround for https://github.com/cython/cython/issues/3885 ctypedef const scalar constscalar diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index f2dd22f43aa..41250037dcf 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -1,13 +1,21 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. +from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar from pylibcudf.libcudf.types cimport size_type cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: + cdef unique_ptr[column] minhash( + const column_view &strings, + const numeric_scalar[uint32_t] seed, + const size_type width, + ) except + + cdef unique_ptr[column] minhash( const column_view &strings, const column_view &seeds, @@ -20,6 +28,12 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const size_type width, ) except + + cdef unique_ptr[column] minhash64( + const column_view &strings, + const numeric_scalar[uint64_t] seed, + const size_type width, + ) except + + cdef unique_ptr[column] word_minhash( const column_view &input, const column_view &seeds diff --git a/python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd similarity index 100% rename from python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd rename to python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index 9913e1fbadb..7fd65beeeb0 100644 --- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx) +set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd index 5f1762b1e3d..9eed1da1ab5 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport edit_distance, generate_ngrams, jaccard +from . 
cimport edit_distance, generate_ngrams, jaccard, minhash __all__ = [ "edit_distance", "generate_ngrams", "jaccard", + "minhash" ] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index 1c0ddb1e5a4..a3a2363f7ef 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import edit_distance, generate_ngrams, jaccard +from . import edit_distance, generate_ngrams, jaccard, minhash __all__ = [ "edit_distance", "generate_ngrams", "jaccard", + "minhash", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd new file mode 100644 index 00000000000..97e8c9dc83c --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libc.stdint cimport uint32_t, uint64_t +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + +ctypedef fused ColumnOrScalar: + Column + Scalar + +cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*) + +cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*) + +cpdef Column word_minhash(Column input, Column seeds) + +cpdef Column word_minhash64(Column input, Column seeds) diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx new file mode 100644 index 00000000000..5fabf6a3f89 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -0,0 +1,160 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libc.stdint cimport uint32_t, uint64_t +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.minhash cimport ( + minhash as cpp_minhash, + minhash64 as cpp_minhash64, + word_minhash as cpp_word_minhash, + word_minhash64 as cpp_word_minhash64, +) +from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + +from cython.operator import dereference + + +cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): + """ + Returns the minhash values for each string per seed. + This function uses MurmurHash3_x86_32 for the hash algorithm. + + For details, see :cpp:func:`minhash`. + + Parameters + ---------- + input : Column + Strings column to compute minhash + seeds : Column or Scalar + Seed value(s) used for the hash algorithm. + width : size_type + Character width used for apply substrings; + Default is 4 characters. + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + if not isinstance(seeds, (Column, Scalar)): + raise TypeError("Must pass a Column or Scalar") + + with nogil: + c_result = move( + cpp_minhash( + input.view(), + seeds.view() if ColumnOrScalar is Column else + dereference(seeds.c_obj.get()), + width + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): + """ + Returns the minhash values for each string per seed. + This function uses MurmurHash3_x64_128 for the hash algorithm. + + For details, see :cpp:func:`minhash64`. 
+ + Parameters + ---------- + input : Column + Strings column to compute minhash + seeds : Column or Scalar + Seed value(s) used for the hash algorithm. + width : size_type + Character width used for apply substrings; + Default is 4 characters. + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + if not isinstance(seeds, (Column, Scalar)): + raise TypeError("Must pass a Column or Scalar") + + with nogil: + c_result = move( + cpp_minhash64( + input.view(), + seeds.view() if ColumnOrScalar is Column else + dereference(seeds.c_obj.get()), + width + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column word_minhash(Column input, Column seeds): + """ + Returns the minhash values for each row of strings per seed. + This function uses MurmurHash3_x86_32 for the hash algorithm. + + For details, see :cpp:func:`word_minhash`. + + Parameters + ---------- + input : Column + Lists column of strings to compute minhash + seeds : Column or Scalar + Seed values used for the hash algorithm. + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_word_minhash( + input.view(), + seeds.view() + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column word_minhash64(Column input, Column seeds): + """ + Returns the minhash values for each row of strings per seed. + This function uses MurmurHash3_x64_128 for the hash algorithm though + only the first 64-bits of the hash are used in computing the output. + + For details, see :cpp:func:`word_minhash64`. + + Parameters + ---------- + input : Column + Lists column of strings to compute minhash + seeds : Column or Scalar + Seed values used for the hash algorithm. + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_word_minhash64( + input.view(), + seeds.view() + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py new file mode 100644 index 00000000000..4e389a63f90 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pylibcudf as plc +import pytest + + +@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()]) +def minhash_input_data(request): + input_arr = pa.array(["foo", "bar", "foo foo", "bar bar"]) + seeds = pa.array([2, 3, 4, 5], request.param) + return input_arr, seeds, request.param + + +@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()]) +def word_minhash_input_data(request): + input_arr = pa.array([["foo", "bar"], ["foo foo", "bar bar"]]) + seeds = pa.array([2, 3, 4, 5], request.param) + return input_arr, seeds, request.param + + +@pytest.mark.parametrize("width", [5, 12]) +def test_minhash(minhash_input_data, width): + input_arr, seeds, seed_type = minhash_input_data + minhash_func = ( + plc.nvtext.minhash.minhash + if seed_type == pa.uint32() + else plc.nvtext.minhash.minhash64 + ) + result = minhash_func( + plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds), width + ) + pa_result = plc.interop.to_arrow(result) + assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr)) + assert pa_result.type == pa.list_( + pa.field("element", seed_type, nullable=False) + ) + + +def test_word_minhash(word_minhash_input_data): + input_arr, seeds, seed_type = word_minhash_input_data + word_minhash_func = ( + plc.nvtext.minhash.word_minhash + if seed_type == pa.uint32() + else plc.nvtext.minhash.word_minhash64 + ) + result = word_minhash_func( + plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds) + ) + pa_result = plc.interop.to_arrow(result) + assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr)) + assert pa_result.type == pa.list_( + pa.field("element", seed_type, nullable=False) + ) From be1dd3267ed3cf7045c573ccc622f34fd159675f Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 11 Oct 2024 14:29:47 -0700 Subject: [PATCH 061/117] Add an example to demonstrate multithreaded `read_parquet` pipelines (#16828) Closes #16717. This PR adds a new example to read multiple parquet files using multiple threads. 
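For example, the updated `ci/run_cudf_examples.sh` below exercises the new
example with an invocation of the form:

    parquet_io_multithreaded example.parquet 4 DEVICE_BUFFER 2 2

(the meaning of each positional argument is defined by the argument parsing
in `parquet_io_multithreaded.cpp`).
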
Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Bradley Dice (https://github.com/bdice)
  - David Wendt (https://github.com/davidwendt)
  - Basit Ayantunde (https://github.com/lamarrr)

URL: https://github.com/rapidsai/cudf/pull/16828
---
 ci/run_cudf_examples.sh                       |   5 +-
 cpp/examples/parquet_io/CMakeLists.txt        |  19 +-
 .../{parquet_io.hpp => common_utils.cpp}      | 105 ++--
 cpp/examples/parquet_io/common_utils.hpp      |  89 ++++
 cpp/examples/parquet_io/io_source.cpp         |  97 ++++
 cpp/examples/parquet_io/io_source.hpp         | 101 ++++
 cpp/examples/parquet_io/parquet_io.cpp        |  91 ++--
 .../parquet_io/parquet_io_multithreaded.cpp   | 466 ++++++++++++++++++
 8 files changed, 875 insertions(+), 98 deletions(-)
 rename cpp/examples/parquet_io/{parquet_io.hpp => common_utils.cpp} (50%)
 create mode 100644 cpp/examples/parquet_io/common_utils.hpp
 create mode 100644 cpp/examples/parquet_io/io_source.cpp
 create mode 100644 cpp/examples/parquet_io/io_source.hpp
 create mode 100644 cpp/examples/parquet_io/parquet_io_multithreaded.cpp

diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh
index 0819eacf636..2439af5b644 100755
--- a/ci/run_cudf_examples.sh
+++ b/ci/run_cudf_examples.sh
@@ -23,7 +23,10 @@ compute-sanitizer --tool memcheck custom_optimized names.csv
 compute-sanitizer --tool memcheck custom_prealloc names.csv
 compute-sanitizer --tool memcheck custom_with_malloc names.csv
 
-compute-sanitizer --tool memcheck parquet_io
+compute-sanitizer --tool memcheck parquet_io example.parquet
 compute-sanitizer --tool memcheck parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD TRUE
 
+compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet
+compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet 4 DEVICE_BUFFER 2 2
+
 exit ${EXITCODE}
diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt
index d8e9205ffd4..a7d0146b170 100644
--- a/cpp/examples/parquet_io/CMakeLists.txt
+++ b/cpp/examples/parquet_io/CMakeLists.txt
@@ -16,10 +16,23 @@ project(
 
 include(../fetch_dependencies.cmake)
 
-# Configure your project here
+add_library(parquet_io_utils OBJECT common_utils.cpp io_source.cpp)
+target_compile_features(parquet_io_utils PRIVATE cxx_std_17)
+target_link_libraries(parquet_io_utils PRIVATE cudf::cudf)
+
+# Build and install parquet_io
 add_executable(parquet_io parquet_io.cpp)
-target_link_libraries(parquet_io PRIVATE cudf::cudf)
+target_link_libraries(parquet_io PRIVATE cudf::cudf nvToolsExt $<TARGET_OBJECTS:parquet_io_utils>)
 target_compile_features(parquet_io PRIVATE cxx_std_17)
-
 install(TARGETS parquet_io DESTINATION bin/examples/libcudf)
+
+# Build and install parquet_io_multithreaded
+add_executable(parquet_io_multithreaded parquet_io_multithreaded.cpp)
+target_link_libraries(
+  parquet_io_multithreaded PRIVATE cudf::cudf nvToolsExt $<TARGET_OBJECTS:parquet_io_utils>
+)
+target_compile_features(parquet_io_multithreaded PRIVATE cxx_std_17)
+install(TARGETS parquet_io_multithreaded DESTINATION bin/examples/libcudf)
+
+# Install the example.parquet file
 install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.parquet DESTINATION bin/examples/libcudf)
diff --git a/cpp/examples/parquet_io/parquet_io.hpp b/cpp/examples/parquet_io/common_utils.cpp
similarity index 50%
rename from cpp/examples/parquet_io/parquet_io.hpp
rename to cpp/examples/parquet_io/common_utils.cpp
index e27cbec4fce..a79ca48af86 100644
--- a/cpp/examples/parquet_io/parquet_io.hpp
+++ b/cpp/examples/parquet_io/common_utils.cpp
@@ -14,30 +14,27 @@
  * limitations under the License.
*/ -#pragma once +#include "common_utils.hpp" -#include +#include #include #include #include -#include -#include #include #include #include #include -#include -#include +#include #include /** - * @brief Create memory resource for libcudf functions + * @file common_utils.cpp + * @brief Definitions for common utilities for `parquet_io` examples * - * @param pool Whether to use a pool memory resource. - * @return Memory resource instance */ + std::shared_ptr create_memory_resource(bool is_pool_used) { auto cuda_mr = std::make_shared(); @@ -48,17 +45,11 @@ std::shared_ptr create_memory_resource(bool is_ return cuda_mr; } -/** - * @brief Get encoding type from the keyword - * - * @param name encoding keyword name - * @return corresponding column encoding type - */ -[[nodiscard]] cudf::io::column_encoding get_encoding_type(std::string name) +cudf::io::column_encoding get_encoding_type(std::string name) { using encoding_type = cudf::io::column_encoding; - static const std::unordered_map map = { + static std::unordered_map const map = { {"DEFAULT", encoding_type::USE_DEFAULT}, {"DICTIONARY", encoding_type::DICTIONARY}, {"PLAIN", encoding_type::PLAIN}, @@ -69,26 +60,18 @@ std::shared_ptr create_memory_resource(bool is_ std::transform(name.begin(), name.end(), name.begin(), ::toupper); if (map.find(name) != map.end()) { return map.at(name); } - throw std::invalid_argument("FATAL: " + std::string(name) + + throw std::invalid_argument(name + " is not a valid encoding type.\n\n" "Available encoding types: DEFAULT, DICTIONARY, PLAIN,\n" "DELTA_BINARY_PACKED, DELTA_LENGTH_BYTE_ARRAY,\n" - "DELTA_BYTE_ARRAY\n" - "\n" - "Exiting...\n"); + "DELTA_BYTE_ARRAY\n\n"); } -/** - * @brief Get compression type from the keyword - * - * @param name compression keyword name - * @return corresponding compression type - */ -[[nodiscard]] cudf::io::compression_type get_compression_type(std::string name) +cudf::io::compression_type get_compression_type(std::string name) { using compression_type = cudf::io::compression_type; - static const std::unordered_map map = { + static std::unordered_map const map = { {"NONE", compression_type::NONE}, {"AUTO", compression_type::AUTO}, {"SNAPPY", compression_type::SNAPPY}, @@ -97,30 +80,58 @@ std::shared_ptr create_memory_resource(bool is_ std::transform(name.begin(), name.end(), name.begin(), ::toupper); if (map.find(name) != map.end()) { return map.at(name); } - throw std::invalid_argument("FATAL: " + std::string(name) + + throw std::invalid_argument(name + " is not a valid compression type.\n\n" - "Available compression_type types: NONE, AUTO, SNAPPY,\n" - "LZ4, ZSTD\n" - "\n" - "Exiting...\n"); + "Available compression types: NONE, AUTO, SNAPPY,\n" + "LZ4, ZSTD\n\n"); } -/** - * @brief Get the optional page size stat frequency from they keyword - * - * @param use_stats keyword affirmation string such as: Y, T, YES, TRUE, ON - * @return optional page statistics frequency set to full (STATISTICS_COLUMN) - */ -[[nodiscard]] std::optional get_page_size_stats(std::string use_stats) +bool get_boolean(std::string input) { - std::transform(use_stats.begin(), use_stats.end(), use_stats.begin(), ::toupper); + std::transform(input.begin(), input.end(), input.begin(), ::toupper); // Check if the input string matches to any of the following - if (not use_stats.compare("ON") or not use_stats.compare("TRUE") or - not use_stats.compare("YES") or not use_stats.compare("Y") or not use_stats.compare("T")) { - // Full column and offset indices - STATISTICS_COLUMN - return 
std::make_optional(cudf::io::statistics_freq::STATISTICS_COLUMN); + return input == "ON" or input == "TRUE" or input == "YES" or input == "Y" or input == "T"; +} + +void check_tables_equal(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table) +{ + try { + // Left anti-join the original and transcoded tables + // identical tables should not throw an exception and + // return an empty indices vector + auto const indices = cudf::left_anti_join(lhs_table, rhs_table, cudf::null_equality::EQUAL); + + // No exception thrown, check indices + auto const valid = indices->size() == 0; + std::cout << "Tables identical: " << valid << "\n\n"; + } catch (std::exception& e) { + std::cerr << e.what() << std::endl << std::endl; + throw std::runtime_error("Tables identical: false\n\n"); } +} - return std::nullopt; +std::unique_ptr concatenate_tables(std::vector> tables, + rmm::cuda_stream_view stream) +{ + if (tables.size() == 1) { return std::move(tables[0]); } + + std::vector table_views; + table_views.reserve(tables.size()); + std::transform( + tables.begin(), tables.end(), std::back_inserter(table_views), [&](auto const& tbl) { + return tbl->view(); + }); + // Construct the final table + return cudf::concatenate(table_views, stream); +} + +std::string current_date_and_time() +{ + auto const time = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + auto const local_time = *std::localtime(&time); + // Stringstream to format the date and time + std::stringstream ss; + ss << std::put_time(&local_time, "%Y-%m-%d-%H-%M-%S"); + return ss.str(); } diff --git a/cpp/examples/parquet_io/common_utils.hpp b/cpp/examples/parquet_io/common_utils.hpp new file mode 100644 index 00000000000..12896e61a0d --- /dev/null +++ b/cpp/examples/parquet_io/common_utils.hpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +#include +#include + +/** + * @file common_utils.hpp + * @brief Common utilities for `parquet_io` examples + * + */ + +/** + * @brief Create memory resource for libcudf functions + * + * @param pool Whether to use a pool memory resource. 
+ * @return Memory resource instance
+ */
+std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(bool is_pool_used);
+
+/**
+ * @brief Get encoding type from the keyword
+ *
+ * @param name encoding keyword name
+ * @return corresponding column encoding type
+ */
+[[nodiscard]] cudf::io::column_encoding get_encoding_type(std::string name);
+
+/**
+ * @brief Get compression type from the keyword
+ *
+ * @param name compression keyword name
+ * @return corresponding compression type
+ */
+[[nodiscard]] cudf::io::compression_type get_compression_type(std::string name);
+
+/**
+ * @brief Get boolean from the keyword
+ *
+ * @param input keyword affirmation string such as: Y, T, YES, TRUE, ON
+ * @return true or false
+ */
+[[nodiscard]] bool get_boolean(std::string input);
+
+/**
+ * @brief Check if two tables are identical, throw an error otherwise
+ *
+ * @param lhs_table View to lhs table
+ * @param rhs_table View to rhs table
+ */
+void check_tables_equal(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table);
+
+/**
+ * @brief Concatenate a vector of tables and return the resultant table
+ *
+ * @param tables Vector of tables to concatenate
+ * @param stream CUDA stream to use
+ *
+ * @return Unique pointer to the resultant concatenated table.
+ */
+std::unique_ptr<cudf::table> concatenate_tables(std::vector<std::unique_ptr<cudf::table>> tables,
+                                                rmm::cuda_stream_view stream);
+
+/**
+ * @brief Returns a string containing current date and time
+ *
+ */
+std::string current_date_and_time();
diff --git a/cpp/examples/parquet_io/io_source.cpp b/cpp/examples/parquet_io/io_source.cpp
new file mode 100644
index 00000000000..019b3f96474
--- /dev/null
+++ b/cpp/examples/parquet_io/io_source.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "io_source.hpp"
+
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+
+rmm::host_async_resource_ref pinned_memory_resource()
+{
+  static auto mr = rmm::mr::pinned_host_memory_resource{};
+  return mr;
+}
+
+io_source_type get_io_source_type(std::string name)
+{
+  static std::unordered_map<std::string, io_source_type> const map = {
+    {"FILEPATH", io_source_type::FILEPATH},
+    {"HOST_BUFFER", io_source_type::HOST_BUFFER},
+    {"PINNED_BUFFER", io_source_type::PINNED_BUFFER},
+    {"DEVICE_BUFFER", io_source_type::DEVICE_BUFFER}};
+
+  std::transform(name.begin(), name.end(), name.begin(), ::toupper);
+  if (map.find(name) != map.end()) {
+    return map.at(name);
+  } else {
+    throw std::invalid_argument(name +
+                                " is not a valid io source type. 
Available: FILEPATH,\n" + "HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER.\n\n"); + } +} + +io_source::io_source(std::string_view file_path, io_source_type type, rmm::cuda_stream_view stream) + : pinned_buffer({pinned_memory_resource(), stream}), d_buffer{0, stream} +{ + std::string const file_name{file_path}; + auto const file_size = std::filesystem::file_size(file_name); + + // For filepath make a quick source_info and return early + if (type == io_source_type::FILEPATH) { + source_info = cudf::io::source_info(file_name); + return; + } + + std::ifstream file{file_name, std::ifstream::binary}; + + // Copy file contents to the specified io source buffer + switch (type) { + case io_source_type::HOST_BUFFER: { + h_buffer.resize(file_size); + file.read(h_buffer.data(), file_size); + source_info = cudf::io::source_info(h_buffer.data(), file_size); + break; + } + case io_source_type::PINNED_BUFFER: { + pinned_buffer.resize(file_size); + file.read(pinned_buffer.data(), file_size); + source_info = cudf::io::source_info(pinned_buffer.data(), file_size); + break; + } + case io_source_type::DEVICE_BUFFER: { + h_buffer.resize(file_size); + file.read(h_buffer.data(), file_size); + d_buffer.resize(file_size, stream); + CUDF_CUDA_TRY(cudaMemcpyAsync( + d_buffer.data(), h_buffer.data(), file_size, cudaMemcpyDefault, stream.value())); + + source_info = cudf::io::source_info(d_buffer); + break; + } + default: { + throw std::runtime_error("Encountered unexpected source type\n\n"); + } + } +} diff --git a/cpp/examples/parquet_io/io_source.hpp b/cpp/examples/parquet_io/io_source.hpp new file mode 100644 index 00000000000..a614d348fae --- /dev/null +++ b/cpp/examples/parquet_io/io_source.hpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +#include + +#include + +/** + * @file io_source.hpp + * @brief Utilities for constructing the specified IO sources from the input parquet files. + * + */ + +/** + * @brief Available IO source types + */ +enum class io_source_type { FILEPATH, HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER }; + +/** + * @brief Get io source type from the string keyword argument + * + * @param name io source type keyword name + * @return io source type + */ +[[nodiscard]] io_source_type get_io_source_type(std::string name); + +/** + * @brief Create and return a reference to a static pinned memory pool + * + * @return Reference to a static pinned memory pool + */ +rmm::host_async_resource_ref pinned_memory_resource(); + +/** + * @brief Custom allocator for pinned_buffer via RMM. 
+ */
+template <typename T>
+struct pinned_allocator : public std::allocator<T> {
+  pinned_allocator(rmm::host_async_resource_ref _mr, rmm::cuda_stream_view _stream)
+    : mr{_mr}, stream{_stream}
+  {
+  }
+
+  T* allocate(std::size_t n)
+  {
+    auto ptr = mr.allocate_async(n * sizeof(T), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream);
+    stream.synchronize();
+    return static_cast<T*>(ptr);
+  }
+
+  void deallocate(T* ptr, std::size_t n)
+  {
+    mr.deallocate_async(ptr, n * sizeof(T), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream);
+  }
+
+ private:
+  rmm::host_async_resource_ref mr;
+  rmm::cuda_stream_view stream;
+};
+
+/**
+ * @brief Class to create a cudf::io::source_info of given type from the input parquet file
+ *
+ */
+class io_source {
+ public:
+  io_source(std::string_view file_path, io_source_type io_type, rmm::cuda_stream_view stream);
+
+  // Get the internal source info
+  [[nodiscard]] cudf::io::source_info get_source_info() const { return source_info; }
+
+ private:
+  // alias for pinned vector
+  template <typename T>
+  using pinned_vector = thrust::host_vector<T, pinned_allocator<T>>;
+  cudf::io::source_info source_info;
+  std::vector<char> h_buffer;
+  pinned_vector<char> pinned_buffer;
+  rmm::device_uvector<std::byte> d_buffer;
+};
diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp
index 9cda22d0695..c11b8de82b5 100644
--- a/cpp/examples/parquet_io/parquet_io.cpp
+++ b/cpp/examples/parquet_io/parquet_io.cpp
@@ -14,11 +14,15 @@
  * limitations under the License.
  */
 
-#include "parquet_io.hpp"
-
 #include "../utilities/timer.hpp"
+#include "common_utils.hpp"
+#include "io_source.hpp"
+
+#include
+#include
+#include
 
-#include
+#include
 
 /**
  * @file parquet_io.cpp
@@ -81,6 +85,18 @@ void write_parquet(cudf::table_view input,
   cudf::io::write_parquet(options);
 }
 
+/**
+ * @brief Function to print example usage and argument information.
+ */
+void print_usage()
+{
+  std::cout << "\nUsage: parquet_io <input parquet file> <output parquet file> <encoding type>\n"
+               "                  <compression type> <write page stats: yes/no>\n\n"
+               "Available encoding types: DEFAULT, DICTIONARY, PLAIN, DELTA_BINARY_PACKED,\n"
+               "                          DELTA_LENGTH_BYTE_ARRAY, DELTA_BYTE_ARRAY\n\n"
+               "Available compression types: NONE, AUTO, SNAPPY, LZ4, ZSTD\n\n";
+}
+
 /**
  * @brief Main for nested_types examples
  *
@@ -97,29 +113,28 @@ void write_parquet(cudf::table_view input,
  */
 int main(int argc, char const** argv)
 {
-  std::string input_filepath;
-  std::string output_filepath;
-  cudf::io::column_encoding encoding;
-  cudf::io::compression_type compression;
-  std::optional<cudf::io::statistics_freq> page_stats;
+  std::string input_filepath             = "example.parquet";
+  std::string output_filepath            = "output.parquet";
+  cudf::io::column_encoding encoding     = get_encoding_type("DELTA_BINARY_PACKED");
+  cudf::io::compression_type compression = get_compression_type("ZSTD");
+  std::optional<cudf::io::statistics_freq> page_stats = std::nullopt;
 
   switch (argc) {
-    case 1:
-      input_filepath  = "example.parquet";
-      output_filepath = "output.parquet";
-      encoding        = get_encoding_type("DELTA_BINARY_PACKED");
-      compression     = get_compression_type("ZSTD");
-      break;
-    case 6: page_stats = get_page_size_stats(argv[5]); [[fallthrough]];
-    case 5:
-      input_filepath  = argv[1];
-      output_filepath = argv[2];
-      encoding        = get_encoding_type(argv[3]);
-      compression     = get_compression_type(argv[4]);
-      break;
-    default:
-      throw std::runtime_error(
-        "Either provide all command-line arguments, or none to use defaults\n");
+    case 6:
+      page_stats = get_boolean(argv[5])
+                     ? 
std::make_optional(cudf::io::statistics_freq::STATISTICS_COLUMN) + : std::nullopt; + [[fallthrough]]; + case 5: compression = get_compression_type(argv[4]); [[fallthrough]]; + case 4: encoding = get_encoding_type(argv[3]); [[fallthrough]]; + case 3: output_filepath = argv[2]; [[fallthrough]]; + case 2: // Check if instead of input_paths, the first argument is `-h` or `--help` + if (auto arg = std::string{argv[1]}; arg != "-h" and arg != "--help") { + input_filepath = std::move(arg); + break; + } + [[fallthrough]]; + default: print_usage(); throw std::runtime_error(""); } // Create and use a memory pool @@ -130,18 +145,16 @@ int main(int argc, char const** argv) // Read input parquet file // We do not want to time the initial read time as it may include // time for nvcomp, cufile loading and RMM growth - std::cout << std::endl << "Reading " << input_filepath << "..." << std::endl; + std::cout << "\nReading " << input_filepath << "...\n"; std::cout << "Note: Not timing the initial parquet read as it may include\n" - "times for nvcomp, cufile loading and RMM growth." - << std::endl - << std::endl; + "times for nvcomp, cufile loading and RMM growth.\n\n"; auto [input, metadata] = read_parquet(input_filepath); // Status string to indicate if page stats are set to be written or not auto page_stat_string = (page_stats.has_value()) ? "page stats" : "no page stats"; // Write parquet file with the specified encoding and compression std::cout << "Writing " << output_filepath << " with encoding, compression and " - << page_stat_string << ".." << std::endl; + << page_stat_string << "..\n"; // `timer` is automatically started here cudf::examples::timer timer; @@ -149,7 +162,7 @@ int main(int argc, char const** argv) timer.print_elapsed_millis(); // Read the parquet file written with encoding and compression - std::cout << "Reading " << output_filepath << "..." << std::endl; + std::cout << "Reading " << output_filepath << "...\n"; // Reset the timer timer.reset(); @@ -157,23 +170,7 @@ int main(int argc, char const** argv) timer.print_elapsed_millis(); // Check for validity - try { - // Left anti-join the original and transcoded tables - // identical tables should not throw an exception and - // return an empty indices vector - auto const indices = cudf::left_anti_join(input->view(), - transcoded_input->view(), - cudf::null_equality::EQUAL, - cudf::get_default_stream(), - resource.get()); - - // No exception thrown, check indices - auto const valid = indices->size() == 0; - std::cout << "Transcoding valid: " << std::boolalpha << valid << std::endl; - } catch (std::exception& e) { - std::cerr << e.what() << std::endl << std::endl; - std::cout << "Transcoding valid: false" << std::endl; - } + check_tables_equal(input->view(), transcoded_input->view()); return 0; } diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp new file mode 100644 index 00000000000..6ad4b862240 --- /dev/null +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../utilities/timer.hpp"
+#include "common_utils.hpp"
+#include "io_source.hpp"
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+/**
+ * @file parquet_io_multithreaded.cpp
+ * @brief Demonstrates reading parquet data from the specified io source using multiple threads.
+ *
+ * The input parquet data is provided via files which are converted to the specified io source type
+ * to be read using multiple threads. Optionally, the parquet data read by each thread can be
+ * written to corresponding output files, which are then validated against the input data.
+ *
+ * Run: ``parquet_io_multithreaded -h`` to see help with input args and more information.
+ *
+ * The following io source types are supported:
+ * IO source types: FILEPATH, HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER
+ *
+ */
+
+// Type alias for unique ptr to cudf table
+using table_t = std::unique_ptr<cudf::table>;
+
+/**
+ * @brief Behavior when handling the read tables by multiple threads
+ */
+enum class read_mode {
+  NO_CONCATENATE,      ///< Only read and discard tables
+  CONCATENATE_THREAD,  ///< Read and concatenate tables from each thread
+  CONCATENATE_ALL,     ///< Read and concatenate everything to a single table
+};
+
+/**
+ * @brief Functor for multithreaded parquet reading based on the provided read_mode
+ */
+template <read_mode read_mode>
+struct read_fn {
+  std::vector<io_source> const& input_sources;
+  std::vector<table_t>& tables;
+  int const thread_id;
+  int const thread_count;
+  rmm::cuda_stream_view stream;
+
+  void operator()()
+  {
+    // Tables read by this thread
+    std::vector<table_t> tables_this_thread;
+
+    // Sweep the available input files
+    for (auto curr_file_idx = thread_id; curr_file_idx < input_sources.size();
+         curr_file_idx += thread_count) {
+      auto builder =
+        cudf::io::parquet_reader_options::builder(input_sources[curr_file_idx].get_source_info());
+      auto const options = builder.build();
+      if constexpr (read_mode != read_mode::NO_CONCATENATE) {
+        tables_this_thread.push_back(cudf::io::read_parquet(options, stream).tbl);
+      } else {
+        cudf::io::read_parquet(options, stream);
+      }
+    }
+
+    // Concatenate the tables read by this thread if not NO_CONCATENATE read_mode.
+    if constexpr (read_mode != read_mode::NO_CONCATENATE) {
+      auto table = concatenate_tables(std::move(tables_this_thread), stream);
+      stream.synchronize_no_throw();
+      tables[thread_id] = std::move(table);
+    } else {
+      // Just synchronize this stream and exit
+      stream.synchronize_no_throw();
+    }
+  }
+};
+
+/**
+ * @brief Function to setup and launch multithreaded parquet reading.
+ *
+ * @tparam read_mode Specifies whether to concatenate and return the actual
+ * tables or discard them and return an empty vector
+ *
+ * @param input_sources List of input sources to read
+ * @param thread_count Number of threads
+ * @param stream_pool CUDA stream pool to use for threads
+ *
+ * @return Vector of read tables. 
+ */ +template +std::vector read_parquet_multithreaded(std::vector const& input_sources, + int32_t thread_count, + rmm::cuda_stream_pool& stream_pool) +{ + // Tables read by each thread + std::vector tables(thread_count); + + // Table reading tasks + std::vector> read_tasks; + read_tasks.reserve(thread_count); + + // Create the read tasks + std::for_each( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(thread_count), [&](auto tid) { + read_tasks.emplace_back( + read_fn{input_sources, tables, tid, thread_count, stream_pool.get_stream()}); + }); + + // Create threads with tasks + std::vector threads; + threads.reserve(thread_count); + for (auto& c : read_tasks) { + threads.emplace_back(c); + } + for (auto& t : threads) { + t.join(); + } + + // If CONCATENATE_ALL mode, then concatenate to a vector of one final table. + if (read_mode == read_mode::CONCATENATE_ALL) { + auto stream = stream_pool.get_stream(); + auto final_tbl = concatenate_tables(std::move(tables), stream); + stream.synchronize(); + tables.clear(); + tables.emplace_back(std::move(final_tbl)); + } + + return tables; +} + +/** + * @brief Functor for multithreaded parquet writing + */ +struct write_fn { + std::string const& output_path; + std::vector const& table_views; + int const thread_id; + rmm::cuda_stream_view stream; + + void operator()() + { + // Create a sink + cudf::io::sink_info const sink_info{output_path + "/table_" + std::to_string(thread_id) + + ".parquet"}; + // Writer options builder + auto builder = cudf::io::parquet_writer_options::builder(sink_info, table_views[thread_id]); + // Create a new metadata for the table + auto table_metadata = cudf::io::table_input_metadata{table_views[thread_id]}; + + builder.metadata(table_metadata); + auto options = builder.build(); + + // Write parquet data + cudf::io::write_parquet(options, stream); + + // Done with this stream + stream.synchronize_no_throw(); + } +}; + +/** + * @brief Function to setup and launch multithreaded writing parquet files. + * + * @param output_path Path to output directory + * @param tables List of at least table views to be written + * @param thread_count Number of threads to use for writing tables. + * @param stream_pool CUDA stream pool to use for threads + * + */ +void write_parquet_multithreaded(std::string const& output_path, + std::vector const& tables, + int32_t thread_count, + rmm::cuda_stream_pool& stream_pool) +{ + // Table writing tasks + std::vector write_tasks; + write_tasks.reserve(thread_count); + std::for_each( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(thread_count), [&](auto tid) { + write_tasks.emplace_back(write_fn{output_path, tables, tid, stream_pool.get_stream()}); + }); + + // Writer threads + std::vector threads; + threads.reserve(thread_count); + for (auto& c : write_tasks) { + threads.emplace_back(c); + } + for (auto& t : threads) { + t.join(); + } +} + +/** + * @brief Function to print example usage and argument information. + */ +void print_usage() +{ + std::cout + << "\nUsage: parquet_io_multithreaded \n" + " \n" + " \n\n" + "Available IO source types: FILEPATH, HOST_BUFFER, PINNED_BUFFER (Default), " + "DEVICE_BUFFER\n\n" + "Note: Provide as many arguments as you like in the above order. Default values\n" + " for the unprovided arguments will be used. 
All input parquet files will\n"
+       "          be converted to the specified IO source type before reading\n\n";
+}
+
+/**
+ * @brief Function to process a comma-delimited input paths string to parquet files and/or dirs
+ * and convert them to the specified io sources.
+ *
+ * Process the input path string containing directories (of parquet files) and/or individual
+ * parquet files into a list of input parquet files, multiply the list by `input_multiplier`,
+ * ensure there are at least `thread_count` files so that each parallel thread has at least one
+ * file, and convert the final list of files to a list of `io_source` objects and return it.
+ *
+ * @param paths Comma delimited input paths string
+ * @param input_multiplier Multiplier for the input files list
+ * @param thread_count Number of threads being used in the example
+ * @param io_source_type Specified IO source type to convert input files to
+ * @param stream CUDA stream to use
+ *
+ * @return Vector of input sources for the given paths
+ */
+std::vector<io_source> extract_input_sources(std::string const& paths,
+                                             int32_t input_multiplier,
+                                             int32_t thread_count,
+                                             io_source_type io_source_type,
+                                             rmm::cuda_stream_view stream)
+{
+  // Get the delimited paths to directory and/or files.
+  std::vector<std::string> const delimited_paths = [&]() {
+    std::vector<std::string> paths_list;
+    std::stringstream strstream{paths};
+    std::string path;
+    // Extract the delimited paths.
+    while (std::getline(strstream, path, char{','})) {
+      paths_list.push_back(path);
+    }
+    return paths_list;
+  }();
+
+  // List of parquet files
+  std::vector<std::string> parquet_files;
+  std::for_each(delimited_paths.cbegin(), delimited_paths.cend(), [&](auto const& path_string) {
+    std::filesystem::path path{path_string};
+    // If this is a parquet file, add it.
+    if (std::filesystem::is_regular_file(path)) {
+      parquet_files.push_back(path_string);
+    }
+    // If this is a directory, add all files in the directory.
+    else if (std::filesystem::is_directory(path)) {
+      for (auto const& file : std::filesystem::directory_iterator(path)) {
+        if (std::filesystem::is_regular_file(file.path())) {
+          parquet_files.push_back(file.path().string());
+        } else {
+          std::cout << "Skipping sub-directory: " << file.path().string() << "\n";
+        }
+      }
+    } else {
+      print_usage();
+      throw std::runtime_error("Encountered an invalid input path\n");
+    }
+  });
+
+  // Current size of list of parquet files
+  auto const initial_size = parquet_files.size();
+  if (initial_size == 0) { return {}; }
+
+  // Reserve space
+  parquet_files.reserve(std::max<size_t>(thread_count, input_multiplier * parquet_files.size()));
+
+  // Append the input files by input_multiplier times
+  std::for_each(thrust::make_counting_iterator(1),
+                thrust::make_counting_iterator(input_multiplier),
+                [&](auto i) {
+                  parquet_files.insert(parquet_files.end(),
+                                       parquet_files.begin(),
+                                       parquet_files.begin() + initial_size);
+                });
+
+  // Cycle-append parquet files from the existing ones if there are fewer than thread_count
+  if (thread_count > static_cast<int32_t>(parquet_files.size())) {
+    std::cout << "Warning: Number of input sources < thread count. Cycling from\n"
+                 "and appending to current input sources such that the number of\n"
+                 "input sources == thread count\n";
+  }
+  for (size_t idx = 0; thread_count > static_cast<int32_t>(parquet_files.size()); idx++) {
+    parquet_files.emplace_back(parquet_files[idx % initial_size]);
+  }
+
+  // Vector of io sources
+  std::vector<io_source> input_sources;
+  input_sources.reserve(parquet_files.size());
+  // Transform input files to the specified io sources
+  std::transform(parquet_files.begin(),
+                 parquet_files.end(),
+                 std::back_inserter(input_sources),
+                 [&](auto const& file_name) {
+                   return io_source{file_name, io_source_type, stream};
+                 });
+  stream.synchronize();
+  return input_sources;
+}
+
+/**
+ * @brief The main function
+ */
+int32_t main(int argc, char const** argv)
+{
+  // Set arguments to defaults
+  std::string input_paths       = "example.parquet";
+  int32_t input_multiplier      = 1;
+  int32_t num_reads             = 1;
+  int32_t thread_count          = 1;
+  io_source_type io_source_type = io_source_type::PINNED_BUFFER;
+  bool write_and_validate       = false;
+
+  // Set to the provided args
+  switch (argc) {
+    case 7: write_and_validate = get_boolean(argv[6]); [[fallthrough]];
+    case 6: thread_count = std::max(thread_count, std::stoi(std::string{argv[5]})); [[fallthrough]];
+    case 5: num_reads = std::max(1, std::stoi(argv[4])); [[fallthrough]];
+    case 4: io_source_type = get_io_source_type(argv[3]); [[fallthrough]];
+    case 3:
+      input_multiplier = std::max(input_multiplier, std::stoi(std::string{argv[2]}));
+      [[fallthrough]];
+    case 2:
+      // Check if instead of input_paths, the first argument is `-h` or `--help`
+      if (auto arg = std::string{argv[1]}; arg != "-h" and arg != "--help") {
+        input_paths = std::move(arg);
+        break;
+      }
+      [[fallthrough]];
+    default: print_usage(); throw std::runtime_error("");
+  }
+
+  // Initialize mr, default stream and stream pool
+  auto const is_pool_used = true;
+  auto resource           = create_memory_resource(is_pool_used);
+  auto default_stream     = cudf::get_default_stream();
+  auto stream_pool        = rmm::cuda_stream_pool(thread_count);
+  auto stats_mr =
+    rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource>(resource.get());
+  rmm::mr::set_current_device_resource(&stats_mr);
+
+  // List of input sources from the input_paths string.
+  auto const input_sources = extract_input_sources(
+    input_paths, input_multiplier, thread_count, io_source_type, default_stream);
+
+  // Check if there is nothing to do
+  if (input_sources.empty()) {
+    print_usage();
+    throw std::runtime_error("No input files to read. 
Exiting early.\n");
+  }
+
+  // Read the same parquet files specified times with multiple threads and discard the read tables
+  {
+    // Print status
+    std::cout << "\nReading " << input_sources.size() << " input sources " << num_reads
+              << " time(s) using " << thread_count
+              << " threads and discarding output "
+                 "tables..\n";
+
+    if (io_source_type == io_source_type::FILEPATH) {
+      std::cout << "Note that the first read may include times for nvcomp, cufile loading and RMM "
+                   "growth.\n\n";
+    }
+
+    cudf::examples::timer timer;
+    std::for_each(thrust::make_counting_iterator(0),
+                  thrust::make_counting_iterator(num_reads),
+                  [&](auto i) {  // Read parquet files and discard the tables
+                    std::ignore = read_parquet_multithreaded<read_mode::NO_CONCATENATE>(
+                      input_sources, thread_count, stream_pool);
+                  });
+    default_stream.synchronize();
+    timer.print_elapsed_millis();
+  }
+
+  // Write parquet files and validate if needed
+  if (write_and_validate) {
+    // read_mode::CONCATENATE_THREAD returns a vector of `thread_count` tables
+    auto const tables = read_parquet_multithreaded<read_mode::CONCATENATE_THREAD>(
+      input_sources, thread_count, stream_pool);
+    default_stream.synchronize();
+
+    // Construct a vector of table views for write_parquet_multithreaded
+    auto const table_views = [&tables]() {
+      std::vector<cudf::table_view> table_views;
+      table_views.reserve(tables.size());
+      std::transform(
+        tables.cbegin(), tables.cend(), std::back_inserter(table_views), [](auto const& tbl) {
+          return tbl->view();
+        });
+      return table_views;
+    }();
+
+    // Write tables to parquet
+    std::cout << "Writing parquet output files..\n";
+
+    // Create a directory at the tmpdir path.
+    std::string output_path =
+      std::filesystem::temp_directory_path().string() + "/output_" + current_date_and_time();
+    std::filesystem::create_directory({output_path});
+    cudf::examples::timer timer;
+    write_parquet_multithreaded(output_path, table_views, thread_count, stream_pool);
+    default_stream.synchronize();
+    timer.print_elapsed_millis();
+
+    // Verify the output
+    std::cout << "Verifying output..\n";
+
+    // Simply concatenate the previously read tables from input sources
+    auto const input_table = cudf::concatenate(table_views, default_stream);
+
+    // Sources from written parquet files
+    auto const written_pq_sources = extract_input_sources(
+      output_path, input_multiplier, thread_count, io_source_type, default_stream);
+
+    // read_mode::CONCATENATE_ALL returns a concatenated vector of 1 table only
+    auto const transcoded_table = std::move(read_parquet_multithreaded<read_mode::CONCATENATE_ALL>(
+                                              written_pq_sources, thread_count, stream_pool)
+                                              .back());
+    default_stream.synchronize();
+
+    // Check if the tables are identical
+    check_tables_equal(input_table->view(), transcoded_table->view());
+
+    // Remove the created temp directory and parquet data
+    std::filesystem::remove_all(output_path);
+  }
+
+  // Print peak memory
+  std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n\n";
+
+  return 0;
+}
From 4dbb8a354a9d4f0b4d82a5bf9747409c6304358f Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Fri, 11 Oct 2024 19:11:09 -0700
Subject: [PATCH 062/117] Refactor ORC dictionary encoding to migrate to the
 new `cuco::static_map` (#17049)

Part of #12261.

This PR refactors ORC writer's dictionary encoding to migrate from `cuco::legacy::static_map` to the new `cuco::static_map`. No performance impact measured. Results [here](https://github.com/rapidsai/cudf/pull/17049#issuecomment-2405980218).
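
For orientation, the core of the migration is the device-side "ref" pattern: kernels build a non-owning view over pre-allocated bulk slot storage and rebind only the operators they need. A condensed sketch, stitched together from the kernels in this patch (it reuses the `storage_ref_type`, `probing_scheme_type`, functors and sentinels introduced below; `row_idx` is a stand-in for the per-thread loop variable, so this is illustrative rather than verbatim):

```cpp
// Build a lightweight, non-owning map ref over the stripe's slot storage
auto const hash_fn     = hash_functor{col};
auto const equality_fn = equality_functor{col};
storage_ref_type const storage_ref{dict.map_slots.size(), dict.map_slots.data()};

auto map_ref = cuco::static_map_ref{cuco::empty_key{KEY_SENTINEL},
                                    cuco::empty_value{VALUE_SENTINEL},
                                    equality_fn,
                                    probing_scheme_type{hash_fn},
                                    cuco::thread_scope_block,
                                    storage_ref};

// Each kernel rebinds just the operation it needs
auto insert_ref = map_ref.rebind_operators(cuco::insert);  // population kernel
auto find_ref   = map_ref.rebind_operators(cuco::find);    // index-lookup kernel

bool const is_unique = insert_ref.insert(cuco::pair{row_idx, row_idx});
auto const slot      = find_ref.find(row_idx);  // slot->second is the dictionary index
```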
Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/17049 --- cpp/src/io/orc/dict_enc.cu | 107 ++++++++++++++----------------- cpp/src/io/orc/orc_gpu.hpp | 39 ++++++----- cpp/src/io/orc/writer_impl.cu | 25 ++++++-- cpp/src/io/parquet/chunk_dict.cu | 15 ++--- 4 files changed, 96 insertions(+), 90 deletions(-) diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 5be75350951..0cb5c382631 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -77,20 +77,6 @@ void rowgroup_char_counts(device_2dspan counts, counts, orc_columns, rowgroup_bounds, str_col_indexes); } -template -CUDF_KERNEL void __launch_bounds__(block_size) - initialize_dictionary_hash_maps_kernel(device_span dictionaries) -{ - auto const dict_map = dictionaries[blockIdx.x].map_slots; - auto const t = threadIdx.x; - for (size_type i = 0; i < dict_map.size(); i += block_size) { - if (t + i < dict_map.size()) { - new (&dict_map[t + i].first) map_type::atomic_key_type{KEY_SENTINEL}; - new (&dict_map[t + i].second) map_type::atomic_mapped_type{VALUE_SENTINEL}; - } - } -} - struct equality_functor { column_device_view const& col; __device__ bool operator()(size_type lhs_idx, size_type rhs_idx) const @@ -109,6 +95,9 @@ struct hash_functor { } }; +// Probing scheme to use for the hash map +using probing_scheme_type = cuco::linear_probing; + template CUDF_KERNEL void __launch_bounds__(block_size) populate_dictionary_hash_maps_kernel(device_2dspan dictionaries, @@ -121,26 +110,34 @@ CUDF_KERNEL void __launch_bounds__(block_size) auto const& col = columns[dict.column_idx]; // Make a view of the hash map - auto hash_map_mutable = map_type::device_mutable_view(dict.map_slots.data(), - dict.map_slots.size(), - cuco::empty_key{KEY_SENTINEL}, - cuco::empty_value{VALUE_SENTINEL}); auto const hash_fn = hash_functor{col}; auto const equality_fn = equality_functor{col}; + storage_ref_type const storage_ref{dict.map_slots.size(), dict.map_slots.data()}; + // Make a view of the hash map. 
+ auto hash_map_ref = cuco::static_map_ref{cuco::empty_key{KEY_SENTINEL}, + cuco::empty_value{VALUE_SENTINEL}, + equality_fn, + probing_scheme_type{hash_fn}, + cuco::thread_scope_block, + storage_ref}; + + // Create a map ref with `cuco::insert` operator + auto has_map_insert_ref = hash_map_ref.rebind_operators(cuco::insert); + auto const start_row = dict.start_row; auto const end_row = dict.start_row + dict.num_rows; size_type entry_count{0}; size_type char_count{0}; + // all threads should loop the same number of times for (thread_index_type cur_row = start_row + t; cur_row - t < end_row; cur_row += block_size) { auto const is_valid = cur_row < end_row and col.is_valid(cur_row); if (is_valid) { // insert element at cur_row to hash map and count successful insertions - auto const is_unique = - hash_map_mutable.insert(std::pair(cur_row, cur_row), hash_fn, equality_fn); + auto const is_unique = has_map_insert_ref.insert(cuco::pair{cur_row, cur_row}); if (is_unique) { ++entry_count; @@ -175,24 +172,23 @@ CUDF_KERNEL void __launch_bounds__(block_size) if (not dict.is_enabled) { return; } auto const t = threadIdx.x; - auto map = map_type::device_view(dict.map_slots.data(), - dict.map_slots.size(), - cuco::empty_key{KEY_SENTINEL}, - cuco::empty_value{VALUE_SENTINEL}); - __shared__ cuda::atomic counter; using cuda::std::memory_order_relaxed; if (t == 0) { new (&counter) cuda::atomic{0}; } __syncthreads(); + for (size_type i = 0; i < dict.map_slots.size(); i += block_size) { if (t + i < dict.map_slots.size()) { - auto* slot = reinterpret_cast(map.begin_slot() + t + i); - auto key = slot->first; - if (key != KEY_SENTINEL) { - auto loc = counter.fetch_add(1, memory_order_relaxed); - dict.data[loc] = key; - slot->second = loc; + auto window = dict.map_slots.begin() + t + i; + // Collect all slots from each window. + for (auto& slot : *window) { + auto const key = slot.first; + if (key != KEY_SENTINEL) { + auto loc = counter.fetch_add(1, memory_order_relaxed); + dict.data[loc] = key; + slot.second = loc; + } } } } @@ -205,47 +201,42 @@ CUDF_KERNEL void __launch_bounds__(block_size) { auto const col_idx = blockIdx.x; auto const stripe_idx = blockIdx.y; + auto const t = threadIdx.x; auto const& dict = dictionaries[col_idx][stripe_idx]; auto const& col = columns[dict.column_idx]; if (not dict.is_enabled) { return; } - auto const t = threadIdx.x; + // Make a view of the hash map + auto const hash_fn = hash_functor{col}; + auto const equality_fn = equality_functor{col}; + + storage_ref_type const storage_ref{dict.map_slots.size(), dict.map_slots.data()}; + // Make a view of the hash map. 
+  auto hash_map_ref = cuco::static_map_ref{cuco::empty_key{KEY_SENTINEL},
+                                           cuco::empty_value{VALUE_SENTINEL},
+                                           equality_fn,
+                                           probing_scheme_type{hash_fn},
+                                           cuco::thread_scope_block,
+                                           storage_ref};
+
+  // Create a map ref with `cuco::find` operator
+  auto has_map_find_ref = hash_map_ref.rebind_operators(cuco::find);
+
   auto const start_row = dict.start_row;
   auto const end_row   = dict.start_row + dict.num_rows;
 
-  auto const map = map_type::device_view(dict.map_slots.data(),
-                                         dict.map_slots.size(),
-                                         cuco::empty_key{KEY_SENTINEL},
-                                         cuco::empty_value{VALUE_SENTINEL});
-
-  thread_index_type cur_row = start_row + t;
-  while (cur_row < end_row) {
+  for (thread_index_type cur_row = start_row + t; cur_row < end_row; cur_row += block_size) {
     if (col.is_valid(cur_row)) {
-      auto const hash_fn     = hash_functor{col};
-      auto const equality_fn = equality_functor{col};
-      auto const found_slot  = map.find(cur_row, hash_fn, equality_fn);
-      cudf_assert(found_slot != map.end() &&
+      auto const found_slot = has_map_find_ref.find(cur_row);
+      // Fail if we didn't find the previously inserted key.
+      cudf_assert(found_slot != has_map_find_ref.end() &&
                   "Unable to find value in map in dictionary index construction");
-      if (found_slot != map.end()) {
-        // No need for atomic as this is not going to be modified by any other thread
-        auto const val_ptr  = reinterpret_cast(&found_slot->second);
-        dict.index[cur_row] = *val_ptr;
-      }
+      dict.index[cur_row] = found_slot->second;
     }
-    cur_row += block_size;
   }
 }
 
-void initialize_dictionary_hash_maps(device_2dspan dictionaries,
-                                     rmm::cuda_stream_view stream)
-{
-  if (dictionaries.count() == 0) { return; }
-  constexpr int block_size = 1024;
-  initialize_dictionary_hash_maps_kernel
-    <<>>(dictionaries.flat_view());
-}
-
 void populate_dictionary_hash_maps(device_2dspan dictionaries,
                                    device_span columns,
                                    rmm::cuda_stream_view stream)
diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp
index 8c7ccf0527f..0949fafe9a4 100644
--- a/cpp/src/io/orc/orc_gpu.hpp
+++ b/cpp/src/io/orc/orc_gpu.hpp
@@ -21,6 +21,7 @@
 #include "io/utilities/column_buffer.hpp"
 #include "orc.hpp"
 
+#include
 #include
 #include
 #include
@@ -40,19 +41,27 @@ namespace gpu {
 using cudf::detail::device_2dspan;
 using cudf::detail::host_2dspan;
 
+using key_type    = size_type;
+using mapped_type = size_type;
+using slot_type   = cuco::pair<key_type, mapped_type>;
+auto constexpr map_cg_size =
+  1;  ///< A CUDA Cooperative Group of 1 thread (set for best performance) to handle each subset.
+      ///< Note: Adjust insert and find loops to use `cg::tile` if increasing this.
+auto constexpr window_size =
+  1;  ///< Number of concurrent slots (set for best performance) handled by each thread.
+auto constexpr occupancy_factor = 1.43f;  ///< cuCollections suggests using a hash map of size
+                                          ///< N * (1/0.7) = 1.43 to target a 70% occupancy factor.
+using storage_type     = cuco::aow_storage<slot_type,
+                                           window_size,
+                                           cuco::extent<std::size_t>,
+                                           cudf::detail::cuco_allocator<char>>;
+using storage_ref_type = typename storage_type::ref_type;
+using window_type      = typename storage_type::window_type;
+
 auto constexpr KEY_SENTINEL   = size_type{-1};
 auto constexpr VALUE_SENTINEL = size_type{-1};
 
-using map_type = cuco::legacy::static_map;
-
-/**
- * @brief The alias of `map_type::pair_atomic_type` class.
- *
- * Declare this struct by trivial subclassing instead of type aliasing so we can have forward
- * declaration of this struct somewhere else. 
- */ -struct slot_type : public map_type::slot_type {}; - struct CompressedStreamInfo { CompressedStreamInfo() = default; explicit constexpr CompressedStreamInfo(uint8_t const* compressed_data_, size_t compressed_size_) @@ -184,11 +193,11 @@ struct StripeStream { */ struct stripe_dictionary { // input - device_span map_slots; // hash map storage - uint32_t column_idx = 0; // column index - size_type start_row = 0; // first row in the stripe - size_type start_rowgroup = 0; // first rowgroup in the stripe - size_type num_rows = 0; // number of rows in the stripe + device_span map_slots; // hash map (windows) storage + uint32_t column_idx = 0; // column index + size_type start_row = 0; // first row in the stripe + size_type start_rowgroup = 0; // first rowgroup in the stripe + size_type num_rows = 0; // number of rows in the stripe // output device_span data; // index of elements in the column to include in the dictionary diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 60a64fb0ee6..b09062f700e 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -20,6 +20,7 @@ */ #include "io/comp/nvcomp_adapter.hpp" +#include "io/orc/orc_gpu.hpp" #include "io/statistics/column_statistics.cuh" #include "io/utilities/column_utils.cuh" #include "writer_impl.hpp" @@ -2110,7 +2111,9 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, bool sort_dictionaries, rmm::cuda_stream_view stream) { - std::vector>> hash_maps_storage( + // Variable to keep track of the current total map storage size + size_t total_map_storage_size = 0; + std::vector> hash_maps_storage_offsets( orc_table.string_column_indices.size()); for (auto col_idx : orc_table.string_column_indices) { auto& str_column = orc_table.column(col_idx); @@ -2119,14 +2122,21 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, stripe.size == 0 ? 
0 : segmentation.rowgroups[stripe.first + stripe.size - 1][col_idx].end - segmentation.rowgroups[stripe.first][col_idx].begin; - hash_maps_storage[str_column.str_index()].emplace_back(stripe_num_rows * 1.43, stream); + hash_maps_storage_offsets[str_column.str_index()].emplace_back(total_map_storage_size); + total_map_storage_size += stripe_num_rows * gpu::occupancy_factor; } + hash_maps_storage_offsets[str_column.str_index()].emplace_back(total_map_storage_size); } hostdevice_2dvector stripe_dicts( orc_table.num_string_columns(), segmentation.num_stripes(), stream); if (stripe_dicts.count() == 0) return {std::move(stripe_dicts), {}, {}}; + // Create a single bulk storage to use for all sub-dictionaries + auto map_storage = std::make_unique( + total_map_storage_size, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}); + // Initialize stripe dictionaries for (auto col_idx : orc_table.string_column_indices) { auto& str_column = orc_table.column(col_idx); @@ -2137,7 +2147,9 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, auto const stripe_idx = stripe.id; auto& sd = stripe_dicts[str_col_idx][stripe_idx]; - sd.map_slots = hash_maps_storage[str_col_idx][stripe_idx]; + sd.map_slots = {map_storage->data() + hash_maps_storage_offsets[str_col_idx][stripe_idx], + hash_maps_storage_offsets[str_col_idx][stripe_idx + 1] - + hash_maps_storage_offsets[str_col_idx][stripe_idx]}; sd.column_idx = col_idx; sd.start_row = segmentation.rowgroups[stripe.first][col_idx].begin; sd.start_rowgroup = stripe.first; @@ -2150,7 +2162,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, } stripe_dicts.host_to_device_async(stream); - gpu::initialize_dictionary_hash_maps(stripe_dicts, stream); + map_storage->initialize_async({gpu::KEY_SENTINEL, gpu::VALUE_SENTINEL}, {stream.value()}); gpu::populate_dictionary_hash_maps(stripe_dicts, orc_table.d_columns, stream); // Copy the entry counts and char counts from the device to the host stripe_dicts.device_to_host_sync(stream); @@ -2184,8 +2196,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, col_use_dictionary = true; } else { // Clear hash map storage as dictionary encoding is not used for this stripe - hash_maps_storage[str_col_idx][stripe_idx] = rmm::device_uvector(0, stream); - sd.map_slots = {}; + sd.map_slots = {}; } } // If any stripe uses dictionary encoding, allocate index storage for the whole column @@ -2203,7 +2214,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, gpu::get_dictionary_indices(stripe_dicts, orc_table.d_columns, stream); // deallocate hash map storage, unused after this point - hash_maps_storage.clear(); + map_storage.reset(); // Clear map slots and attach order buffers auto dictionaries_flat = stripe_dicts.host_view().flat_view(); diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 1a2a9eac17d..b85ebf2fa1a 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -194,17 +194,12 @@ struct map_find_fn { val_idx += block_size) { // Find the key using a single thread for best performance for now. if (data_col.is_valid(val_idx)) { + auto const found_slot = map_find_ref.find(val_idx); + // Fail if we didn't find the previously inserted key. + cudf_assert(found_slot != map_find_ref.end() && + "Unable to find value in map in dictionary index construction"); // No need for atomic as this is not going to be modified by any other thread. 
-      chunk->dict_index[val_idx - s_ck_start_val_idx] = [&]() {
-        auto const found_slot = map_find_ref.find(val_idx);
-
-        // Fail if we didn't find the previously inserted key.
-        cudf_assert(found_slot != map_find_ref.end() &&
-                    "Unable to find value in map in dictionary index construction");
-
-        // Return the found value.
-        return found_slot->second;
-      }();
+      chunk->dict_index[val_idx - s_ck_start_val_idx] = found_slot->second;
     }
   }
 } else {
From 3bee6788a795ebc22125d8726fbc79dbfb50b0b5 Mon Sep 17 00:00:00 2001
From: Basit Ayantunde
Date: Mon, 14 Oct 2024 18:13:29 +0100
Subject: [PATCH 063/117] Made cudftestutil header-only and removed GTest
 dependency (#16839)

This merge request follows up on https://github.com/rapidsai/cudf/issues/16658. It removes cudftestutil's dependency on GTest. It satisfies the requirement that we only need API compatibility with the GTest API and we don't expose the GTest symbols to our consumers nor ship any binary artifact. The source files defining the symbols are late-bound to the resulting executable (via library INTERFACE sources). The user has to manually link the GTest and GMock libraries into the final executable, as illustrated below.

Closes #16658

### Usage

CMakeLists.txt:

```cmake
add_executable(test1 test1.cpp)
target_link_libraries(test1 PRIVATE GTest::gtest GTest::gmock GTest::gtest_main cudf::cudftestutil cudf::cudftestutil_impl)
```

Authors:
  - Basit Ayantunde (https://github.com/lamarrr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Robert Maynard (https://github.com/robertmaynard)
  - David Wendt (https://github.com/davidwendt)
  - Mike Sarahan (https://github.com/msarahan)

URL: https://github.com/rapidsai/cudf/pull/16839
---
 cpp/CMakeLists.txt                            | 65 +++++++++++-------
 cpp/benchmarks/CMakeLists.txt                 | 25 +++----
 cpp/include/cudf_test/testing_main.hpp        | 67 +++++++++++++------
 cpp/tests/CMakeLists.txt                      | 11 ++-
 cpp/tests/io/metadata_utilities.cpp           |  5 +-
 .../large_strings/large_strings_fixture.cpp   |  9 +--
 cpp/tests/utilities/table_utilities.cu        |  5 +-
 7 files changed, 115 insertions(+), 72 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index a0ea9579475..c8f8ae2dcfe 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -863,15 +863,7 @@ if(CUDF_BUILD_TESTUTIL)
 
   add_library(cudf::cudftest_default_stream ALIAS cudftest_default_stream)
 
-  add_library(
-    cudftestutil SHARED
-    tests/io/metadata_utilities.cpp
-    tests/utilities/column_utilities.cu
-    tests/utilities/debug_utilities.cu
-    tests/utilities/random_seed.cpp
-    tests/utilities/table_utilities.cu
-    tests/utilities/tdigest_utilities.cu
-  )
+  add_library(cudftestutil INTERFACE)
 
   set_target_properties(
     cudftestutil
@@ -880,32 +872,56 @@ if(CUDF_BUILD_TESTUTIL)
       # set target compile options
       CXX_STANDARD 17
       CXX_STANDARD_REQUIRED ON
-      CXX_VISIBILITY_PRESET hidden
       CUDA_STANDARD 17
      CUDA_STANDARD_REQUIRED ON
-      CUDA_VISIBILITY_PRESET hidden
-      POSITION_INDEPENDENT_CODE ON
-      INTERFACE_POSITION_INDEPENDENT_CODE ON
   )
 
   target_compile_options(
-    cudftestutil PUBLIC "$:${CUDF_CXX_FLAGS}>>"
-                        "$:${CUDF_CUDA_FLAGS}>>"
+    cudftestutil INTERFACE "$:${CUDF_CXX_FLAGS}>>"
+                           "$:${CUDF_CUDA_FLAGS}>>"
   )
 
   target_link_libraries(
-    cudftestutil
-    PUBLIC Threads::Threads cudf cudftest_default_stream
-    PRIVATE GTest::gmock GTest::gtest $
+    cudftestutil INTERFACE Threads::Threads cudf cudftest_default_stream
+                           $
   )
 
   target_include_directories(
-    cudftestutil PUBLIC "$"
-                        "$"
+    cudftestutil INTERFACE "$"
+                           "$"
   )
 
   rapids_cuda_set_runtime(cudftestutil USE_STATIC ${CUDA_STATIC_RUNTIME})
add_library(cudf::cudftestutil ALIAS cudftestutil) + add_library(cudftestutil_impl INTERFACE) + add_library(cudf::cudftestutil_impl ALIAS cudftestutil_impl) + target_sources( + cudftestutil_impl + INTERFACE $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + ) + target_link_libraries(cudftestutil_impl INTERFACE cudf::cudftestutil) + + install(FILES tests/io/metadata_utilities.cpp DESTINATION src/cudftestutil/io) + install( + FILES tests/utilities/column_utilities.cu + tests/utilities/debug_utilities.cu + tests/utilities/random_seed.cpp + tests/utilities/table_utilities.cu + tests/utilities/tdigest_utilities.cu + DESTINATION src/cudftestutil/utilities + ) + endif() # * build cudf_identify_stream_usage -------------------------------------------------------------- @@ -1006,7 +1022,7 @@ install( set(_components_export_string) if(TARGET cudftestutil) install( - TARGETS cudftest_default_stream cudftestutil + TARGETS cudftest_default_stream cudftestutil cudftestutil_impl DESTINATION ${lib_dir} EXPORT cudf-testing-exports ) @@ -1046,14 +1062,15 @@ targets: This module offers an optional testing component which defines the following IMPORTED GLOBAL targets: - cudf::cudftestutil - The main cudf testing library + cudf::cudftestutil - The main cudf testing library + cudf::cudftestutil_impl - C++ and CUDA sources to compile for definitions in cudf::cudftestutil ]=] ) rapids_export( INSTALL cudf EXPORT_SET cudf-exports ${_components_export_string} - GLOBAL_TARGETS cudf cudftestutil + GLOBAL_TARGETS cudf cudftestutil cudftestutil_impl NAMESPACE cudf:: DOCUMENTATION doc_string ) @@ -1074,7 +1091,7 @@ endif() rapids_export( BUILD cudf EXPORT_SET cudf-exports ${_components_export_string} - GLOBAL_TARGETS cudf cudftestutil + GLOBAL_TARGETS cudf cudftestutil cudftestutil_impl NAMESPACE cudf:: DOCUMENTATION doc_string FINAL_CODE_BLOCK build_code_string diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index b0f75b25975..d6fc5dc6039 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -25,7 +25,7 @@ target_compile_options( target_link_libraries( cudf_datagen PUBLIC GTest::gmock GTest::gtest benchmark::benchmark nvbench::nvbench Threads::Threads cudf - cudftestutil nvtx3::nvtx3-cpp + cudf::cudftestutil nvtx3::nvtx3-cpp PRIVATE $ ) @@ -49,7 +49,7 @@ target_compile_options( target_link_libraries( ndsh_data_generator - PUBLIC cudf cudftestutil nvtx3::nvtx3-cpp + PUBLIC cudf GTest::gmock GTest::gtest cudf::cudftestutil nvtx3::nvtx3-cpp PRIVATE $ ) @@ -65,14 +65,14 @@ target_include_directories( # Use an OBJECT library so we only compile these helper source files only once add_library( cudf_benchmark_common OBJECT - "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp" - synchronization/synchronization.cpp - io/cuio_common.cpp - common/table_utilities.cpp - common/benchmark_utilities.cpp - common/nvbench_utilities.cpp + synchronization/synchronization.cpp io/cuio_common.cpp common/table_utilities.cpp + common/benchmark_utilities.cpp common/nvbench_utilities.cpp ) -target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $) +target_link_libraries( + cudf_benchmark_common PRIVATE cudf_datagen $ GTest::gmock + GTest::gtest +) + add_custom_command( OUTPUT CUDF_BENCHMARKS COMMAND echo Running benchmarks @@ -99,7 +99,7 @@ function(ConfigureBench CMAKE_BENCH_NAME) ) target_link_libraries( ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main - $ + cudf::cudftestutil_impl $ ) add_custom_command( OUTPUT CUDF_BENCHMARKS @@ -127,8 
+127,9 @@ function(ConfigureNVBench CMAKE_BENCH_NAME)
     INSTALL_RPATH "\$ORIGIN/../../../lib"
   )
   target_link_libraries(
-    ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common ndsh_data_generator cudf_datagen
-                                nvbench::nvbench $
+    ${CMAKE_BENCH_NAME}
+    PRIVATE cudf_benchmark_common ndsh_data_generator cudf_datagen nvbench::nvbench
+            $ cudf::cudftestutil_impl
   )
   install(
     TARGETS ${CMAKE_BENCH_NAME}
diff --git a/cpp/include/cudf_test/testing_main.hpp b/cpp/include/cudf_test/testing_main.hpp
index 272c91133f8..2bd08f410e0 100644
--- a/cpp/include/cudf_test/testing_main.hpp
+++ b/cpp/include/cudf_test/testing_main.hpp
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include
 #include
 #include
 
@@ -36,6 +37,12 @@ namespace CUDF_EXPORT cudf {
 namespace test {
 
+struct config {
+  std::string rmm_mode;
+  std::string stream_mode;
+  std::string stream_error_mode;
+};
+
 /// MR factory functions
 inline auto make_cuda() { return std::make_shared(); }
 
@@ -157,10 +164,9 @@ inline auto parse_cudf_test_opts(int argc, char** argv)
  * @param cmd_opts Command line options returned by parse_cudf_test_opts
  * @return Memory resource adaptor
  */
-inline auto make_memory_resource_adaptor(cxxopts::ParseResult const& cmd_opts)
+inline auto make_memory_resource_adaptor(cudf::test::config const& config)
 {
-  auto const rmm_mode = cmd_opts["rmm_mode"].as();
-  auto resource       = cudf::test::create_memory_resource(rmm_mode);
+  auto resource = cudf::test::create_memory_resource(config.rmm_mode);
   cudf::set_current_device_resource(resource.get());
   return resource;
 }
@@ -176,37 +182,54 @@ inline auto make_memory_resource_adaptor(cxxopts::ParseResult const& cmd_opts)
  * @param cmd_opts Command line options returned by parse_cudf_test_opts
  * @return Memory resource adaptor
  */
-inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts)
+inline auto make_stream_mode_adaptor(cudf::test::config const& config)
 {
   auto resource                      = cudf::get_current_device_resource_ref();
-  auto const stream_mode             = cmd_opts["stream_mode"].as();
-  auto const stream_error_mode       = cmd_opts["stream_error_mode"].as();
-  auto const error_on_invalid_stream = (stream_error_mode == "error");
-  auto const check_default_stream    = (stream_mode == "new_cudf_default");
+  auto const error_on_invalid_stream = (config.stream_error_mode == "error");
+  auto const check_default_stream    = (config.stream_mode == "new_cudf_default");
   auto adaptor                       = cudf::test::stream_checking_resource_adaptor(
     resource, error_on_invalid_stream, check_default_stream);
-  if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) {
+  if ((config.stream_mode == "new_cudf_default") || (config.stream_mode == "new_testing_default")) {
     cudf::set_current_device_resource(&adaptor);
   }
   return adaptor;
 }
 
+/**
+ * @brief Should be called in every test program that uses rmm allocators since it maintains the
+ * lifespan of the rmm default memory resource. This function parses the command line to customize
+ * test behavior, like the allocation mode used for creating the default memory resource. 
+ * + */ +inline void init_cudf_test(int argc, char** argv, cudf::test::config const& config_override = {}) +{ + // static lifetime to keep rmm resource alive till tests end + auto const cmd_opts = parse_cudf_test_opts(argc, argv); + cudf::test::config config = config_override; + if (config.rmm_mode.empty()) { config.rmm_mode = cmd_opts["rmm_mode"].as(); } + + if (config.stream_mode.empty()) { + config.stream_mode = cmd_opts["stream_mode"].as(); + } + + if (config.stream_error_mode.empty()) { + config.stream_error_mode = cmd_opts["stream_error_mode"].as(); + } + + [[maybe_unused]] static auto mr = make_memory_resource_adaptor(config); + [[maybe_unused]] static auto adaptor = make_stream_mode_adaptor(config); +} + /** * @brief Macro that defines main function for gtest programs that use rmm * - * Should be included in every test program that uses rmm allocators since - * it maintains the lifespan of the rmm default memory resource. * This `main` function is a wrapper around the google test generated `main`, - * maintaining the original functionality. In addition, this custom `main` - * function parses the command line to customize test behavior, like the - * allocation mode used for creating the default memory resource. + * maintaining the original functionality. */ -#define CUDF_TEST_PROGRAM_MAIN() \ - int main(int argc, char** argv) \ - { \ - ::testing::InitGoogleTest(&argc, argv); \ - auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ - [[maybe_unused]] auto mr = make_memory_resource_adaptor(cmd_opts); \ - [[maybe_unused]] auto adaptor = make_stream_mode_adaptor(cmd_opts); \ - return RUN_ALL_TESTS(); \ +#define CUDF_TEST_PROGRAM_MAIN() \ + int main(int argc, char** argv) \ + { \ + ::testing::InitGoogleTest(&argc, argv); \ + init_cudf_test(argc, argv); \ + return RUN_ALL_TESTS(); \ } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 4596ec65ce7..62189f76cae 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -56,8 +56,15 @@ function(ConfigureTest CMAKE_TEST_NAME) target_link_libraries( ${CMAKE_TEST_NAME} - PRIVATE cudftestutil GTest::gmock GTest::gmock_main GTest::gtest GTest::gtest_main - nvtx3::nvtx3-cpp $ "${_CUDF_TEST_EXTRA_LIBS}" + PRIVATE cudf::cudftestutil + cudf::cudftestutil_impl + GTest::gmock + GTest::gmock_main + GTest::gtest + GTest::gtest_main + nvtx3::nvtx3-cpp + $ + "${_CUDF_TEST_EXTRA_LIBS}" ) rapids_cuda_set_runtime(${CMAKE_TEST_NAME} USE_STATIC ${CUDA_STATIC_RUNTIME}) rapids_test_add( diff --git a/cpp/tests/io/metadata_utilities.cpp b/cpp/tests/io/metadata_utilities.cpp index 84f04f67038..380d66c53f9 100644 --- a/cpp/tests/io/metadata_utilities.cpp +++ b/cpp/tests/io/metadata_utilities.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,10 +14,9 @@ * limitations under the License. 
*/ +#include #include -#include - namespace cudf::test { void expect_metadata_equal(cudf::io::table_input_metadata in_meta, diff --git a/cpp/tests/large_strings/large_strings_fixture.cpp b/cpp/tests/large_strings/large_strings_fixture.cpp index 249319da7f7..7b61be113f9 100644 --- a/cpp/tests/large_strings/large_strings_fixture.cpp +++ b/cpp/tests/large_strings/large_strings_fixture.cpp @@ -123,12 +123,9 @@ LargeStringsData* StringsLargeTest::g_ls_data = nullptr; int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - auto const cmd_opts = parse_cudf_test_opts(argc, argv); - // hardcoding the CUDA memory resource to keep from exceeding the pool - auto mr = cudf::test::make_cuda(); - cudf::set_current_device_resource(mr.get()); - auto adaptor = make_stream_mode_adaptor(cmd_opts); - + cudf::test::config config; + config.rmm_mode = "cuda"; + init_cudf_test(argc, argv, config); // create object to automatically be destroyed at the end of main() auto lsd = cudf::test::StringsLargeTest::get_ls_data(); diff --git a/cpp/tests/utilities/table_utilities.cu b/cpp/tests/utilities/table_utilities.cu index 354c0b1b57e..8e4906408de 100644 --- a/cpp/tests/utilities/table_utilities.cu +++ b/cpp/tests/utilities/table_utilities.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,10 +15,9 @@ */ #include +#include #include -#include - namespace cudf::test::detail { void expect_table_properties_equal(cudf::table_view lhs, cudf::table_view rhs) { From e41dea933f044183ccbfe26875a2b6c3ff383814 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 14 Oct 2024 10:35:37 -0700 Subject: [PATCH 064/117] Add profilers to CUDA 12 conda devcontainers (#17066) This will make sure that profilers are available by default for everyone using our devcontainers. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/17066 --- .../cuda12.5-conda/devcontainer.json | 22 +++++++++++++++++++ ci/release/update-version.sh | 1 + 2 files changed, 23 insertions(+) diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json index 2a195c6c81d..a0e193ff0bf 100644 --- a/.devcontainer/cuda12.5-conda/devcontainer.json +++ b/.devcontainer/cuda12.5-conda/devcontainer.json @@ -15,9 +15,31 @@ ], "hostRequirements": {"gpu": "optional"}, "features": { + "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": { + "version": "12.5", + "installCompilers": false, + "installProfilers": true, + "installDevPackages": false, + "installcuDNN": false, + "installcuTensor": false, + "installNCCL": false, + "installCUDARuntime": false, + "installNVRTC": false, + "installOpenCL": false, + "installcuBLAS": false, + "installcuSPARSE": false, + "installcuFFT": false, + "installcuFile": false, + "installcuRAND": false, + "installcuSOLVER": false, + "installNPP": false, + "installnvJPEG": false, + "pruneStaticLibs": true + }, "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/cuda", "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}"], diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 870901d223b..95f36653c2c 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -93,6 +93,7 @@ sed_runner "s/cudf-.*-SNAPSHOT/cudf-${NEXT_FULL_JAVA_TAG}/g" java/ci/README.md # .devcontainer files find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${filename}" + sed_runner "s@rapidsai/devcontainers/features/cuda:[0-9.]*@rapidsai/devcontainers/features/cuda:${NEXT_SHORT_TAG_PEP440}@" "${filename}" sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}" sed_runner "s@rapids-\${localWorkspaceFolderBasename}-[0-9.]*@rapids-\${localWorkspaceFolderBasename}-${NEXT_SHORT_TAG}@g" "${filename}" done From 768fbaa28033446dc899872b3c94213c75bc1a98 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Mon, 14 Oct 2024 13:53:28 -0700 Subject: [PATCH 065/117] Fix ORC reader when using `device_read_async` while the destination device buffers are not ready (#17074) This fixes a bug in ORC reader when `device_read_async` is called while the destination device buffers are not ready to write in. In detail, this bug is because `device_read_async` does not use the user-provided stream but its own generated stream for data copying. As such, the copying ops could happen before the destination device buffers are being allocated, causing data corruption. This bug only shows up in certain conditions, and also hard to reproduce. It occurs when copying buffers with small sizes (below `gds_threshold`) and most likely to show up with setting `rmm_mode=async`. 
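To make the hazard concrete, here is a minimal standalone sketch (plain CUDA runtime calls rather than cudf internals; all names are illustrative). A stream-ordered allocation is only ordered with respect to its own stream, so a copy issued on a different stream has to wait until the allocating stream is synchronized:

```cpp
// Illustrative sketch, not cudf code.
#include <cuda_runtime.h>

#include <vector>

int main()
{
  cudaStream_t alloc_stream, copy_stream;
  cudaStreamCreate(&alloc_stream);
  cudaStreamCreate(&copy_stream);

  std::vector<char> host(1024, 0);
  void* dst = nullptr;
  // The allocation below is ordered only on alloc_stream.
  cudaMallocAsync(&dst, host.size(), alloc_stream);

  // Without this synchronization, the copy issued on copy_stream may race
  // the allocation; this is the analogue of the `_stream.synchronize()`
  // call added by this fix.
  cudaStreamSynchronize(alloc_stream);

  cudaMemcpyAsync(dst, host.data(), host.size(), cudaMemcpyHostToDevice, copy_stream);

  cudaStreamSynchronize(copy_stream);
  cudaFreeAsync(dst, copy_stream);
  cudaStreamDestroy(alloc_stream);
  cudaStreamDestroy(copy_stream);
  return 0;
}
```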
Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17074 --- cpp/src/io/orc/reader_impl_chunking.cu | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 01ee5ad177d..ecf319e75ab 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -500,6 +500,8 @@ void reader_impl::load_next_stripe_data(read_mode mode) auto const [read_begin, read_end] = merge_selected_ranges(_file_itm_data.stripe_data_read_ranges, load_stripe_range); + bool stream_synchronized{false}; + for (auto read_idx = read_begin; read_idx < read_end; ++read_idx) { auto const& read_info = _file_itm_data.data_read_info[read_idx]; auto const source_ptr = _metadata.per_file_metadata[read_info.source_idx].source; @@ -507,6 +509,13 @@ void reader_impl::load_next_stripe_data(read_mode mode) lvl_stripe_data[read_info.level][read_info.stripe_idx - stripe_start].data()); if (source_ptr->is_device_read_preferred(read_info.length)) { + // `device_read_async` may not use _stream at all. + // Instead, it may use some other stream(s) to sync the H->D memcpy. + // As such, we need to make sure the device buffers in `lvl_stripe_data` are ready first. + if (!stream_synchronized) { + _stream.synchronize(); + stream_synchronized = true; + } device_read_tasks.push_back( std::pair(source_ptr->device_read_async( read_info.offset, read_info.length, dst_base + read_info.dst_pos, _stream), From 44afc5109b342b653797a20db1e2654fa450417f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 14 Oct 2024 16:02:14 -0700 Subject: [PATCH 066/117] Add clang-tidy to CI (#16958) This PR adds clang-tidy checks to our CI. clang-tidy will be run in nightly CI via CMake. For now, only the parts of the code base that were already made compliant in the PRs leading up to this have been enabled, namely cudf source and test cpp files. Over time we can add more files like benchmarks and examples, add or subtract more rules, or enable linting of cu files (see https://gitlab.kitware.com/cmake/cmake/-/issues/25399). This PR is intended to be the starting point enabling systematic linting, at which point everything else should be significantly easier. 
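For files that are enabled but contain an individual line that cannot yet be made compliant, the usual escape hatch is an inline `NOLINT` comment, which this PR already uses for a `memset` in the nanoarrow test utilities. A minimal sketch (the function and check names below are illustrative):

```cpp
// Illustrative example of suppressing clang-tidy on a single line.
#include <cstring>

void zero_fill(char* buf, std::size_t size)
{
  // NOLINT alone silences every check on this line; appending a check name,
  // e.g. NOLINT(bugprone-branch-clone), scopes the suppression to one check.
  std::memset(buf, 0, size);  // NOLINT
}
```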
Resolves #584 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16958 --- .github/workflows/test.yaml | 14 +- ci/clang_tidy.sh | 29 +++ cpp/.clang-tidy | 2 +- cpp/CMakeLists.txt | 54 ++++++ cpp/scripts/run-clang-tidy.py | 253 -------------------------- cpp/tests/CMakeLists.txt | 1 + cpp/tests/interop/nanoarrow_utils.hpp | 3 +- dependencies.yaml | 30 ++- 8 files changed, 129 insertions(+), 257 deletions(-) create mode 100755 ci/clang_tidy.sh delete mode 100644 cpp/scripts/run-clang-tidy.py diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a22d3c5b9cc..1275aad757c 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -47,11 +47,23 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: - build_type: pull-request + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} # Use the wheel container so we can skip conda solves and since our # primary static consumers (Spark) are not in conda anyway. container_image: "rapidsai/ci-wheel:latest" run_script: "ci/configure_cpp_static.sh" + clang-tidy: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + run_script: "ci/clang_tidy.sh" conda-python-cudf-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 diff --git a/ci/clang_tidy.sh b/ci/clang_tidy.sh new file mode 100755 index 00000000000..4d5d3fc3136 --- /dev/null +++ b/ci/clang_tidy.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +rapids-logger "Create clang-tidy conda environment" +. /opt/conda/etc/profile.d/conda.sh + +ENV_YAML_DIR="$(mktemp -d)" + +rapids-dependency-file-generator \ + --output conda \ + --file-key clang_tidy \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml" + +rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n clang_tidy + +# Temporarily allow unbound variables for conda activation. +set +u +conda activate clang_tidy +set -u + +RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" + +source rapids-configure-sccache + +# Run the build via CMake, which will run clang-tidy when CUDF_CLANG_TIDY is enabled. 
+cmake -S cpp -B cpp/build -DCMAKE_BUILD_TYPE=Release -DCUDF_CLANG_TIDY=ON -GNinja +cmake --build cpp/build diff --git a/cpp/.clang-tidy b/cpp/.clang-tidy index 2d4f8c0d80e..12120a5c6d1 100644 --- a/cpp/.clang-tidy +++ b/cpp/.clang-tidy @@ -39,7 +39,7 @@ Checks: -clang-analyzer-optin.core.EnumCastOutOfRange, -clang-analyzer-optin.cplusplus.UninitializedObject' -WarningsAsErrors: '' +WarningsAsErrors: '*' HeaderFilterRegex: '.*cudf/cpp/(src|include|tests).*' ExcludeHeaderFilterRegex: '.*(Message_generated.h|Schema_generated.h|brotli_dict.hpp|unbz2.hpp|cxxopts.hpp).*' FormatStyle: none diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c8f8ae2dcfe..32a753c9f40 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -88,6 +88,7 @@ option( ${DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL} ) mark_as_advanced(CUDF_BUILD_STREAMS_TEST_UTIL) +option(CUDF_CLANG_TIDY "Enable clang-tidy checking" OFF) message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}") message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}") @@ -144,6 +145,58 @@ if(NOT CUDF_GENERATED_INCLUDE_DIR) set(CUDF_GENERATED_INCLUDE_DIR ${CUDF_BINARY_DIR}) endif() +# ################################################################################################## +# * clang-tidy configuration ---------------------------------------------------------------------- +if(CUDF_CLANG_TIDY) + find_program( + CLANG_TIDY_EXE + NAMES "clang-tidy" + DOC "Path to clang-tidy executable" REQUIRED + ) + + execute_process( + COMMAND ${CLANG_TIDY_EXE} --version + OUTPUT_VARIABLE CLANG_TIDY_OUTPUT + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + string(REGEX MATCH "LLVM version ([0-9]+\\.[0-9]+)\\.[0-9]+" LLVM_VERSION_MATCH + "${CLANG_TIDY_OUTPUT}" + ) + # Discard the patch version and allow it to float. Empirically, results between patch versions are + # mostly stable, and looking at available packages on some package managers sometimes patch + # versions are skipped so we don't want to constrain to a patch version that the user can't + # install. + set(LLVM_VERSION "${CMAKE_MATCH_1}") + set(expected_clang_tidy_version 19.1) + if(NOT expected_clang_tidy_version VERSION_EQUAL LLVM_VERSION) + message( + FATAL_ERROR + "clang-tidy version ${expected_clang_tidy_version} is required, but found ${LLVM_VERSION}" + ) + endif() +endif() + +# Turn on the clang-tidy property for a target excluding the files specified in SKIPPED_FILES. +function(enable_clang_tidy target) + set(_tidy_options) + set(_tidy_one_value) + set(_tidy_multi_value SKIPPED_FILES) + cmake_parse_arguments( + _TIDY "${_tidy_options}" "${_tidy_one_value}" "${_tidy_multi_value}" ${ARGN} + ) + + if(CUDF_CLANG_TIDY) + # clang will complain about unused link libraries on the compile line unless we specify + # -Qunused-arguments. 
+ set_target_properties( + ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments" + ) + foreach(file IN LISTS _TIDY_SKIPPED_FILES) + set_source_files_properties(${file} PROPERTIES SKIP_LINTING ON) + endforeach() + endif() +endfunction() + # ################################################################################################## # * conda environment ----------------------------------------------------------------------------- rapids_cmake_support_conda_env(conda_env MODIFY_PREFIX_PATH) @@ -714,6 +767,7 @@ target_compile_options( cudf PRIVATE "$<$:${CUDF_CXX_FLAGS}>" "$<$:${CUDF_CUDA_FLAGS}>" ) +enable_clang_tidy(cudf SKIPPED_FILES src/io/comp/cpu_unbz2.cpp src/io/comp/brotli_dict.cpp) if(CUDF_BUILD_STACKTRACE_DEBUG) # Remove any optimization level to avoid nvcc warning "incompatible redefinition for option diff --git a/cpp/scripts/run-clang-tidy.py b/cpp/scripts/run-clang-tidy.py deleted file mode 100644 index e5e57dbf562..00000000000 --- a/cpp/scripts/run-clang-tidy.py +++ /dev/null @@ -1,253 +0,0 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import re -import os -import subprocess -import argparse -import json -import multiprocessing as mp -import shutil - - -EXPECTED_VERSION = "16.0.6" -VERSION_REGEX = re.compile(r" LLVM version ([0-9.]+)") -GPU_ARCH_REGEX = re.compile(r"sm_(\d+)") -SPACES = re.compile(r"\s+") -SEPARATOR = "-" * 16 - - -def parse_args(): - argparser = argparse.ArgumentParser("Runs clang-tidy on a project") - argparser.add_argument("-cdb", type=str, - # TODO This is a hack, needs to be fixed - default="cpp/build/cuda-11.5.0/clang-tidy/release/compile_commands.clangd.json", - help="Path to cmake-generated compilation database" - " file. It is always found inside the root of the " - "cmake build folder. 
So make sure that `cmake` has " - "been run once before running this script!") - argparser.add_argument("-exe", type=str, default="clang-tidy", - help="Path to clang-tidy exe") - argparser.add_argument("-ignore", type=str, default="[.]cu$|examples/kmeans/", - help="Regex used to ignore files from checking") - argparser.add_argument("-select", type=str, default=None, - help="Regex used to select files for checking") - argparser.add_argument("-j", type=int, default=-1, - help="Number of parallel jobs to launch.") - args = argparser.parse_args() - if args.j <= 0: - args.j = mp.cpu_count() - args.ignore_compiled = re.compile(args.ignore) if args.ignore else None - args.select_compiled = re.compile(args.select) if args.select else None - ret = subprocess.check_output("%s --version" % args.exe, shell=True) - ret = ret.decode("utf-8") - version = VERSION_REGEX.search(ret) - if version is None: - raise Exception("Failed to figure out clang-tidy version!") - version = version.group(1) - if version != EXPECTED_VERSION: - raise Exception("clang-tidy exe must be v%s found '%s'" % \ - (EXPECTED_VERSION, version)) - if not os.path.exists(args.cdb): - raise Exception("Compilation database '%s' missing" % args.cdb) - return args - - -def get_all_commands(cdb): - with open(cdb) as fp: - return json.load(fp) - - -def get_gpu_archs(command): - archs = [] - for loc in range(len(command)): - if command[loc] != "-gencode": - continue - arch_flag = command[loc + 1] - match = GPU_ARCH_REGEX.search(arch_flag) - if match is not None: - archs.append("--cuda-gpu-arch=sm_%s" % match.group(1)) - return archs - - -def get_index(arr, item): - try: - return arr.index(item) - except: - return -1 - - -def remove_item(arr, item): - loc = get_index(arr, item) - if loc >= 0: - del arr[loc] - return loc - - -def remove_item_plus_one(arr, item): - loc = get_index(arr, item) - if loc >= 0: - del arr[loc + 1] - del arr[loc] - return loc - - -def get_clang_includes(exe): - dir = os.getenv("CONDA_PREFIX") - if dir is None: - ret = subprocess.check_output("which %s 2>&1" % exe, shell=True) - ret = ret.decode("utf-8") - dir = os.path.dirname(os.path.dirname(ret)) - header = os.path.join(dir, "include", "ClangHeaders") - return ["-I", header] - - -def get_tidy_args(cmd, exe): - command, file = cmd["command"], cmd["file"] - is_cuda = file.endswith(".cu") - command = re.split(SPACES, command) - # compiler is always clang++! - command[0] = "clang++" - # remove compilation and output targets from the original command - remove_item_plus_one(command, "-c") - remove_item_plus_one(command, "-o") - if is_cuda: - # replace nvcc's "-gencode ..." with clang's "--cuda-gpu-arch ..." 
- archs = get_gpu_archs(command) - command.extend(archs) - while True: - loc = remove_item_plus_one(command, "-gencode") - if loc < 0: - break - # "-x cuda" is the right usage in clang - loc = get_index(command, "-x") - if loc >= 0: - command[loc + 1] = "cuda" - remove_item_plus_one(command, "-ccbin") - remove_item(command, "--expt-extended-lambda") - remove_item(command, "--diag_suppress=unrecognized_gcc_pragma") - command.extend(get_clang_includes(exe)) - return command, is_cuda - - -def run_clang_tidy_command(tidy_cmd): - cmd = " ".join(tidy_cmd) - result = subprocess.run(cmd, check=False, shell=True, - stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - status = result.returncode == 0 - if status: - out = "" - else: - out = "CMD: " + cmd - out += result.stdout.decode("utf-8").rstrip() - return status, out - - -def run_clang_tidy(cmd, args): - command, is_cuda = get_tidy_args(cmd, args.exe) - tidy_cmd = [args.exe, - "-header-filter='.*cudf/cpp/(src|include|bench|comms).*'", - cmd["file"], "--", ] - tidy_cmd.extend(command) - status = True - out = "" - if is_cuda: - tidy_cmd.append("--cuda-device-only") - tidy_cmd.append(cmd["file"]) - ret, out1 = run_clang_tidy_command(tidy_cmd) - out += out1 - out += "%s" % SEPARATOR - if not ret: - status = ret - tidy_cmd[-2] = "--cuda-host-only" - ret, out1 = run_clang_tidy_command(tidy_cmd) - if not ret: - status = ret - out += out1 - else: - tidy_cmd.append(cmd["file"]) - ret, out1 = run_clang_tidy_command(tidy_cmd) - if not ret: - status = ret - out += out1 - return status, out, cmd["file"] - - -# yikes! global var :( -results = [] -def collect_result(result): - global results - results.append(result) - - -def print_result(passed, stdout, file): - status_str = "PASSED" if passed else "FAILED" - print(f"{SEPARATOR} File:{file} {status_str} {SEPARATOR}") - if stdout: - print(stdout) - print(f"{SEPARATOR} File:{file} ENDS {SEPARATOR}") - - -def print_results(): - global results - status = True - for passed, stdout, file in results: - print_result(passed, stdout, file) - if not passed: - status = False - return status - - -def run_tidy_for_all_files(args, all_files): - pool = None if args.j == 1 else mp.Pool(args.j) - # actual tidy checker - for cmd in all_files: - # skip files that we don't want to look at - if args.ignore_compiled is not None and \ - re.search(args.ignore_compiled, cmd["file"]) is not None: - continue - if args.select_compiled is not None and \ - re.search(args.select_compiled, cmd["file"]) is None: - continue - if pool is not None: - pool.apply_async(run_clang_tidy, args=(cmd, args), - callback=collect_result) - else: - passed, stdout, file = run_clang_tidy(cmd, args) - collect_result((passed, stdout, file)) - if pool is not None: - pool.close() - pool.join() - return print_results() - - -def main(): - args = parse_args() - # Attempt to making sure that we run this script from root of repo always - if not os.path.exists(".git"): - raise Exception("This needs to always be run from the root of repo") - # Check whether clang-tidy exists - # print(args) - if "exe" not in args and shutil.which("clang-tidy") is not None: - print("clang-tidy not found. Exiting...") - return - all_files = get_all_commands(args.cdb) - status = run_tidy_for_all_files(args, all_files) - if not status: - raise Exception("clang-tidy failed! 
Refer to the errors above.") - - -if __name__ == "__main__": - main() diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 62189f76cae..799a84cbc37 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -83,6 +83,7 @@ function(ConfigureTest CMAKE_TEST_NAME) "GTEST_CUDF_STREAM_MODE=new_${_CUDF_TEST_STREAM_MODE}_default;LD_PRELOAD=$" ) endif() + enable_clang_tidy(${CMAKE_TEST_NAME}) endfunction() # ################################################################################################## diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index a961f73d955..8be7e087b6d 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -256,7 +256,8 @@ std::enable_if_t, nanoarrow::UniqueArray> get_nanoarrow_ ArrowBitmap out; ArrowBitmapInit(&out); NANOARROW_THROW_NOT_OK(ArrowBitmapResize(&out, b.size(), 1)); - std::memset(out.buffer.data, 0, out.buffer.size_bytes); + // TODO: Investigate clang-tidy issue further after nanoarrow is made compliant + std::memset(out.buffer.data, 0, out.buffer.size_bytes); // NOLINT for (size_t i = 0; i < b.size(); ++i) { ArrowBitSetTo(out.buffer.data, i, static_cast(b[i])); diff --git a/dependencies.yaml b/dependencies.yaml index ca17917c905..ff97b67f0ce 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -6,10 +6,18 @@ files: cuda: ["11.8", "12.5"] arch: [x86_64] includes: + # Note that clang-tidy is not included here because cudf's preferred + # version conflicts with the rest of RAPIDS as well as its own + # clang-format version. Until we update our clang-format version we will + # not be able to install both into the same environment. Moreover, using + # this version will break compatibility with other RAPIDS libraries that + # are still using 16.0.6, and as such will and that would break any + # unified environment like that used in unified devcontainers. - build_base - build_all - build_cpp - build_python_common + - clang_format - cuda - cuda_version - depends_on_cupy @@ -86,6 +94,16 @@ files: includes: - develop - py_version + clang_tidy: + output: none + includes: + - build_all + - build_base + - clang_tidy + - cuda + - cuda_version + - develop + - py_version docs: output: none includes: @@ -553,11 +571,21 @@ dependencies: # pre-commit requires identify minimum version 1.0, but clang-format requires textproto support and that was # added in 2.5.20, so we need to call out the minimum version needed for our plugins - identify>=2.5.20 + - output_types: conda + packages: + - &doxygen doxygen=1.9.1 # pre-commit hook needs a specific version. + clang_format: + common: - output_types: conda packages: - clang==16.0.6 - clang-tools=16.0.6 - - &doxygen doxygen=1.9.1 # pre-commit hook needs a specific version. + clang_tidy: + common: + - output_types: conda + packages: + - clang==19.1.0 + - clang-tools==19.1.0 docs: common: - output_types: [conda] From 86db9804746fb20554c1900b311a228dc1d6e349 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 14 Oct 2024 16:30:22 -0700 Subject: [PATCH 067/117] Clean up hash-groupby `var_hash_functor` (#17034) This work is part of splitting the original bulk shared memory groupby PR #16619. This PR renames the file originally titled `multi_pass_kernels.cuh`, which contains the `var_hash_functor`, to `var_hash_functor.cuh`. It also includes cleanups such as utilizing `cuda::std::` utilities in device code and removing redundant template parameters. 
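A minimal sketch of the `cuda::std::` SFINAE pattern the functor now relies on (standalone CUDA code with illustrative names, not the cudf internals):

```cpp
#include <cuda/std/type_traits>

// Illustrative stand-in for cudf's check of whether an aggregation
// supports a given type.
template <typename T>
constexpr bool is_supported()
{
  return cuda::std::is_arithmetic_v<T>;
}

// Selected for supported types; this overload does the real work.
template <typename T>
__device__ cuda::std::enable_if_t<is_supported<T>()> accumulate(T value)
{
  // accumulation logic would go here
}

// Selected for unsupported types; the real functor calls CUDF_UNREACHABLE here.
template <typename T>
__device__ cuda::std::enable_if_t<!is_supported<T>()> accumulate(T)
{
}
```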
Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17034 --- cpp/src/groupby/hash/groupby.cu | 5 +- cpp/src/groupby/hash/groupby_kernels.cuh | 2 - ..._pass_kernels.cuh => var_hash_functor.cuh} | 51 ++++++++----------- 3 files changed, 25 insertions(+), 33 deletions(-) rename cpp/src/groupby/hash/{multi_pass_kernels.cuh => var_hash_functor.cuh} (69%) diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 75767786272..0432b9d120a 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -16,7 +16,8 @@ #include "flatten_single_pass_aggs.hpp" #include "groupby/common/utils.hpp" -#include "groupby/hash/groupby_kernels.cuh" +#include "groupby_kernels.cuh" +#include "var_hash_functor.cuh" #include #include @@ -261,7 +262,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final rmm::exec_policy(stream), thrust::make_counting_iterator(0), col.size(), - ::cudf::detail::var_hash_functor{ + var_hash_functor{ set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); sparse_results->add_result(col, agg, std::move(var_result)); dense_results->add_result(col, agg, to_dense_agg_result(agg)); diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/groupby_kernels.cuh index 188d0cff3f1..86f4d76487f 100644 --- a/cpp/src/groupby/hash/groupby_kernels.cuh +++ b/cpp/src/groupby/hash/groupby_kernels.cuh @@ -16,8 +16,6 @@ #pragma once -#include "multi_pass_kernels.cuh" - #include #include #include diff --git a/cpp/src/groupby/hash/multi_pass_kernels.cuh b/cpp/src/groupby/hash/var_hash_functor.cuh similarity index 69% rename from cpp/src/groupby/hash/multi_pass_kernels.cuh rename to cpp/src/groupby/hash/var_hash_functor.cuh index 7043eafdc10..bb55cc9188c 100644 --- a/cpp/src/groupby/hash/multi_pass_kernels.cuh +++ b/cpp/src/groupby/hash/var_hash_functor.cuh @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - #pragma once #include @@ -21,17 +20,14 @@ #include #include #include -#include #include +#include #include +#include -#include - -namespace cudf { -namespace detail { - -template +namespace cudf::groupby::detail::hash { +template struct var_hash_functor { SetType set; bitmask_type const* __restrict__ row_bitmask; @@ -47,13 +43,13 @@ struct var_hash_functor { column_device_view sum, column_device_view count, size_type ddof) - : set(set), - row_bitmask(row_bitmask), - target(target), - source(source), - sum(sum), - count(count), - ddof(ddof) + : set{set}, + row_bitmask{row_bitmask}, + target{target}, + source{source}, + sum{sum}, + count{count}, + ddof{ddof} { } @@ -64,23 +60,21 @@ struct var_hash_functor { } template - __device__ std::enable_if_t()> operator()(column_device_view const& source, - size_type source_index, - size_type target_index) noexcept + __device__ cuda::std::enable_if_t()> operator()( + column_device_view const& source, size_type source_index, size_type target_index) noexcept { CUDF_UNREACHABLE("Invalid source type for std, var aggregation combination."); } template - __device__ std::enable_if_t()> operator()(column_device_view const& source, - size_type source_index, - size_type target_index) noexcept + __device__ cuda::std::enable_if_t()> operator()( + column_device_view const& source, size_type source_index, size_type target_index) noexcept { - using Target = target_type_t; - using SumType = target_type_t; - using CountType = target_type_t; + using Target = cudf::detail::target_type_t; + using SumType = cudf::detail::target_type_t; + using CountType = cudf::detail::target_type_t; - if (source_has_nulls and source.is_null(source_index)) return; + if (source.is_null(source_index)) return; CountType group_size = count.element(target_index); if (group_size == 0 or group_size - ddof <= 0) return; @@ -91,8 +85,9 @@ struct var_hash_functor { ref.fetch_add(result, cuda::std::memory_order_relaxed); // STD sqrt is applied in finalize() - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } + __device__ inline void operator()(size_type source_index) { if (row_bitmask == nullptr or cudf::bit_is_set(row_bitmask, source_index)) { @@ -110,6 +105,4 @@ struct var_hash_functor { } } }; - -} // namespace detail -} // namespace cudf +} // namespace cudf::groupby::detail::hash From 319ec3b8031e4deb7dfc3f4c4a07a10ef88c131f Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 14 Oct 2024 19:58:30 -0400 Subject: [PATCH 068/117] Adding assertion to check for regular JSON inputs of size greater than `INT_MAX` bytes (#17057) Addresses #17017 Libcudf does not support parsing regular JSON inputs of size greater than `INT_MAX` bytes. Note that the batched reader can only be used for JSON lines inputs. 
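The shape of the new guard, as a standalone sketch (illustrative names; the real check is a `CUDF_EXPECTS` at the top of `read_json`):

```cpp
// Illustrative; the actual check lives in cudf's read_json.
#include <cstddef>
#include <limits>
#include <stdexcept>

void check_regular_json_size(std::size_t total_source_size, bool is_json_lines)
{
  // Regular JSON must fit within INT_MAX bytes because the reader's internal
  // offsets are 32-bit (cudf::size_type); JSON lines inputs can instead be
  // split into batches, so no such cap applies to them here.
  if (!is_json_lines &&
      total_source_size >= static_cast<std::size_t>(std::numeric_limits<int>::max())) {
    throw std::invalid_argument(
      "Parsing regular JSON inputs of size greater than INT_MAX bytes is not supported");
  }
}
```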
Authors: - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Vukasin Milovanovic (https://github.com/vuule) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/17057 --- cpp/src/io/json/nested_json_gpu.cu | 3 +-- cpp/src/io/json/read_json.cu | 14 ++++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 76816071d8c..69a51fab5dc 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -83,8 +83,7 @@ struct tree_node { void check_input_size(std::size_t input_size) { // Transduce() writes symbol offsets that may be as large input_size-1 - CUDF_EXPECTS(input_size == 0 || - (input_size - 1) <= std::numeric_limits::max(), + CUDF_EXPECTS(input_size == 0 || (input_size - 1) <= std::numeric_limits::max(), "Given JSON input is too large"); } } // namespace diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 99a5b17bce8..c424d2b3b62 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -351,10 +351,16 @@ table_with_metadata read_json(host_span> sources, * JSON inputs. */ std::size_t const total_source_size = sources_size(sources, 0, 0); - std::size_t chunk_offset = reader_opts.get_byte_range_offset(); - std::size_t chunk_size = reader_opts.get_byte_range_size(); - chunk_size = !chunk_size ? total_source_size - chunk_offset - : std::min(chunk_size, total_source_size - chunk_offset); + + // Batching is enabled only for JSONL inputs, not regular JSON files + CUDF_EXPECTS( + reader_opts.is_enabled_lines() || total_source_size < std::numeric_limits::max(), + "Parsing Regular JSON inputs of size greater than INT_MAX bytes is not supported"); + + std::size_t chunk_offset = reader_opts.get_byte_range_offset(); + std::size_t chunk_size = reader_opts.get_byte_range_size(); + chunk_size = !chunk_size ? 
total_source_size - chunk_offset + : std::min(chunk_size, total_source_size - chunk_offset); std::size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); std::size_t const batch_size_upper_bound = get_batch_size_upper_bound(); From c141ca5ae2867909911839ad680bbf52580f8305 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 15 Oct 2024 05:42:17 -1000 Subject: [PATCH 069/117] Add string.convert.convert_integers APIs to pylibcudf (#16991) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) - https://github.com/brandon-b-miller Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - https://github.com/brandon-b-miller - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/16991 --- .../strings/convert/convert_integers.rst | 6 + .../pylibcudf/strings/convert/index.rst | 1 + python/cudf/cudf/_lib/string_casting.pyx | 128 ++++------- .../strings/convert/convert_integers.pxd | 24 +- .../pylibcudf/strings/convert/CMakeLists.txt | 2 +- .../pylibcudf/strings/convert/__init__.pxd | 1 + .../pylibcudf/strings/convert/__init__.py | 1 + .../strings/convert/convert_integers.pxd | 17 ++ .../strings/convert/convert_integers.pyx | 206 ++++++++++++++++++ .../tests/test_string_convert_integers.py | 69 ++++++ 10 files changed, 354 insertions(+), 101 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_integers.rst create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_integers.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_integers.rst new file mode 100644 index 00000000000..71d146c0379 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_integers.rst @@ -0,0 +1,6 @@ +================ +convert_integers +================ + +.. 
automodule:: pylibcudf.strings.convert.convert_integers + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst index fa05cb7d786..3d07c1271b4 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst @@ -9,6 +9,7 @@ convert convert_durations convert_fixed_point convert_floats + convert_integers convert_ipv4 convert_lists convert_urls diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 93b67bd4c9d..06ee07d8e2b 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -2,28 +2,10 @@ from cudf._lib.column cimport Column -from cudf._lib.scalar import as_device_scalar -from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_integers cimport ( - from_integers as cpp_from_integers, - hex_to_integers as cpp_hex_to_integers, - integers_to_hex as cpp_integers_to_hex, - is_hex as cpp_is_hex, - to_integers as cpp_to_integers, -) -from pylibcudf.libcudf.types cimport data_type, type_id - -from cudf._lib.types cimport underlying_type_t_type_id - import pylibcudf as plc +from pylibcudf.types cimport DataType -import cudf +from cudf._lib.scalar import as_device_scalar from cudf._lib.types cimport dtype_to_pylibcudf_type @@ -35,10 +17,10 @@ def floating_to_string(Column input_col): return Column.from_pylibcudf(plc_column) -def string_to_floating(Column input_col, object out_type): +def string_to_floating(Column input_col, DataType out_type): plc_column = plc.strings.convert.convert_floats.to_floats( input_col.to_pylibcudf(mode="read"), - dtype_to_pylibcudf_type(out_type) + out_type ) return Column.from_pylibcudf(plc_column) @@ -72,7 +54,7 @@ def stod(Column input_col): A Column with strings cast to double """ - return string_to_floating(input_col, cudf.dtype("float64")) + return string_to_floating(input_col, plc.DataType(plc.TypeId.FLOAT64)) def ftos(Column input_col): @@ -104,36 +86,22 @@ def stof(Column input_col): A Column with strings cast to float """ - return string_to_floating(input_col, cudf.dtype("float32")) + return string_to_floating(input_col, plc.DataType(plc.TypeId.FLOAT32)) def integer_to_string(Column input_col): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_integers( - input_column_view)) - - return Column.from_unique_ptr(move(c_result)) - - -def string_to_integer(Column input_col, object out_type): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type] - ) + plc_column = plc.strings.convert.convert_integers.from_integers( + input_col.to_pylibcudf(mode="read"), ) - cdef data_type c_out_type = data_type(tid) - with nogil: - c_result = move( - cpp_to_integers( - input_column_view, - c_out_type)) + return Column.from_pylibcudf(plc_column) - return Column.from_unique_ptr(move(c_result)) + +def string_to_integer(Column input_col, DataType out_type): + plc_column = plc.strings.convert.convert_integers.to_integers( + 
input_col.to_pylibcudf(mode="read"), + out_type + ) + return Column.from_pylibcudf(plc_column) def i8tos(Column input_col): @@ -165,7 +133,7 @@ def stoi8(Column input_col): A Column with strings cast to int8 """ - return string_to_integer(input_col, cudf.dtype("int8")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT8)) def i16tos(Column input_col): @@ -197,7 +165,7 @@ def stoi16(Column input_col): A Column with strings cast to int16 """ - return string_to_integer(input_col, cudf.dtype("int16")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT16)) def itos(Column input_col): @@ -229,7 +197,7 @@ def stoi(Column input_col): A Column with strings cast to int32 """ - return string_to_integer(input_col, cudf.dtype("int32")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT32)) def ltos(Column input_col): @@ -261,7 +229,7 @@ def stol(Column input_col): A Column with strings cast to int64 """ - return string_to_integer(input_col, cudf.dtype("int64")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT64)) def ui8tos(Column input_col): @@ -293,7 +261,7 @@ def stoui8(Column input_col): A Column with strings cast to uint8 """ - return string_to_integer(input_col, cudf.dtype("uint8")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT8)) def ui16tos(Column input_col): @@ -325,7 +293,7 @@ def stoui16(Column input_col): A Column with strings cast to uint16 """ - return string_to_integer(input_col, cudf.dtype("uint16")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT16)) def uitos(Column input_col): @@ -357,7 +325,7 @@ def stoui(Column input_col): A Column with strings cast to uint32 """ - return string_to_integer(input_col, cudf.dtype("uint32")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT32)) def ultos(Column input_col): @@ -389,7 +357,7 @@ def stoul(Column input_col): A Column with strings cast to uint64 """ - return string_to_integer(input_col, cudf.dtype("uint64")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT64)) def to_booleans(Column input_col): @@ -477,8 +445,6 @@ def istimestamp(Column input_col, str format): A Column of boolean values identifying strings that matched the format. """ - if input_col.size == 0: - return cudf.core.column.column_empty(0, dtype=cudf.dtype("bool")) plc_column = plc.strings.convert.convert_datetime.is_timestamp( input_col.to_pylibcudf(mode="read"), format @@ -582,7 +548,7 @@ def is_ipv4(Column source_strings): return Column.from_pylibcudf(plc_column) -def htoi(Column input_col, **kwargs): +def htoi(Column input_col): """ Converting input column of type string having hex values to integer of out_type @@ -595,22 +561,11 @@ def htoi(Column input_col, **kwargs): ------- A Column of integers parsed from hexadecimal string values. 
""" - - cdef column_view input_column_view = input_col.view() - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[cudf.dtype("int64")] - ) + plc_column = plc.strings.convert.convert_integers.hex_to_integers( + input_col.to_pylibcudf(mode="read"), + plc.DataType(plc.TypeId.INT64) ) - cdef data_type c_out_type = data_type(tid) - - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_hex_to_integers(input_column_view, - c_out_type)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) def is_hex(Column source_strings): @@ -618,15 +573,10 @@ def is_hex(Column source_strings): Returns a Column of boolean values with True for `source_strings` that have hex characters. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_hex( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_integers.is_hex( + source_strings.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) def itoh(Column input_col): @@ -642,11 +592,7 @@ def itoh(Column input_col): ------- A Column of strings with hexadecimal characters. """ - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_integers_to_hex(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_integers.integers_to_hex( + input_col.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd index f12aab0a2e4..69d566b8c49 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport data_type @@ -9,23 +10,28 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_integers( - column_view input_col, - data_type output_type) except + + column_view input, + data_type output_type) except +libcudf_exception_handler cdef unique_ptr[column] from_integers( - column_view input_col) except + + column_view integers) except +libcudf_exception_handler + + cdef unique_ptr[column] is_integer( + column_view input + ) except +libcudf_exception_handler cdef unique_ptr[column] is_integer( - column_view source_strings - ) except + + column_view input, + data_type int_type + ) except +libcudf_exception_handler cdef unique_ptr[column] hex_to_integers( - column_view input_col, + column_view input, data_type output_type) except + cdef unique_ptr[column] is_hex( - column_view source_strings - ) except + + column_view input + ) except +libcudf_exception_handler cdef unique_ptr[column] integers_to_hex( - column_view input_col) except + + column_view input) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index 846070870b1..8ba84ba7d50 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -14,7 +14,7 @@ set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx convert_fixed_point.pyx - convert_floats.pyx convert_ipv4.pyx convert_lists.pyx convert_urls.pyx + convert_floats.pyx convert_integers.pyx convert_ipv4.pyx convert_lists.pyx convert_urls.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index 799532d72c6..85300936e4d 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -5,6 +5,7 @@ from . cimport ( convert_durations, convert_fixed_point, convert_floats, + convert_integers, convert_ipv4, convert_lists, convert_urls, diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index deb2d8ab74b..aa27a7c8929 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -5,6 +5,7 @@ convert_durations, convert_fixed_point, convert_floats, + convert_integers, convert_ipv4, convert_lists, convert_urls, diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd new file mode 100644 index 00000000000..eff2e080c27 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_integers(Column input, DataType output_type) + +cpdef Column from_integers(Column integers) + +cpdef Column is_integer(Column input, DataType int_type=*) + +cpdef Column hex_to_integers(Column input, DataType output_type) + +cpdef Column is_hex(Column input) + +cpdef Column integers_to_hex(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx new file mode 100644 index 00000000000..5558683a502 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx @@ -0,0 +1,206 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_integers as cpp_convert_integers, +) +from pylibcudf.types cimport DataType + + +cpdef Column to_integers(Column input, DataType output_type): + """ + Returns a new integer numeric column parsing integer values from the + provided strings column. + + For details, cpp:func:`cudf::strings::to_integers`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + output_type : DataType + Type of integer numeric column to return. + + Returns + ------- + Column + New column with integers converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.to_integers( + input.view(), + output_type.c_obj + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column from_integers(Column integers): + """ + Returns a new strings column converting the integer values from the + provided column into strings. + + For details, cpp:func:`cudf::strings::from_integers`. + + Parameters + ---------- + integers : Column + Strings instance for this operation. + + Returns + ------- + Column + New strings column with integers as strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.from_integers( + integers.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_integer(Column input, DataType int_type=None): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to integers. + + For details, cpp:func:`cudf::strings::is_integer`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + int_type : DataType + Integer type used for checking underflow and overflow. + By default, does not check an integer type for underflow + or overflow. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + if int_type is None: + with nogil: + c_result = move( + cpp_convert_integers.is_integer( + input.view(), + ) + ) + else: + with nogil: + c_result = move( + cpp_convert_integers.is_integer( + input.view(), + int_type.c_obj + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column hex_to_integers(Column input, DataType output_type): + """ + Returns a new integer numeric column parsing hexadecimal values + from the provided strings column. + + For details, cpp:func:`cudf::strings::hex_to_integers`. + + Parameters + ---------- + input : Column + Strings instance for this operation. 
+ + output_type : DataType + Type of integer numeric column to return. + + Returns + ------- + Column + New column with integers converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.hex_to_integers( + input.view(), + output_type.c_obj + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_hex(Column input): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to integers from hex. + + For details, cpp:func:`cudf::strings::is_hex`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.is_hex( + input.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column integers_to_hex(Column input): + """ + Returns a new strings column converting integer columns to hexadecimal + characters. + + For details, cpp:func:`cudf::strings::integers_to_hex`. + + Parameters + ---------- + input : Column + Integer column to convert to hex. + + Returns + ------- + Column + New strings column with hexadecimal characters. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.integers_to_hex( + input.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py new file mode 100644 index 00000000000..6d1d565af30 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py @@ -0,0 +1,69 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_to_integers(): + typ = pa.int8() + arr = pa.array(["1", "-1", None]) + result = plc.strings.convert.convert_integers.to_integers( + plc.interop.from_arrow(arr), plc.interop.from_arrow(typ) + ) + expected = arr.cast(typ) + assert_column_eq(result, expected) + + +def test_from_integers(): + arr = pa.array([1, -1, None]) + result = plc.strings.convert.convert_integers.from_integers( + plc.interop.from_arrow(arr) + ) + expected = pa.array(["1", "-1", None]) + assert_column_eq(result, expected) + + +def test_is_integer(): + arr = pa.array(["1", "-1", "1.2", "A", None]) + plc_column = plc.interop.from_arrow(arr) + result = plc.strings.convert.convert_integers.is_integer(plc_column) + expected = pa.array([True, True, False, False, None]) + assert_column_eq(result, expected) + + result = plc.strings.convert.convert_integers.is_integer( + plc_column, plc.interop.from_arrow(pa.uint8()) + ) + expected = pa.array([True, False, False, False, None]) + assert_column_eq(result, expected) + + +def test_hex_to_integers(): + typ = pa.int32() + data = ["0xff", "0x2a", None] + result = plc.strings.convert.convert_integers.hex_to_integers( + plc.interop.from_arrow(pa.array(data)), plc.interop.from_arrow(typ) + ) + expected = pa.array( + [int(val, 16) if isinstance(val, str) else val for val in data], + type=typ, + ) + assert_column_eq(result, expected) + + +def test_is_hex(): + arr = pa.array(["0xff", "123", "!", None]) + result = plc.strings.convert.convert_integers.is_hex( + plc.interop.from_arrow(arr) + ) + expected = pa.array([True, True, False, None]) + assert_column_eq(result, expected) + + +def test_integers_to_hex(): + data = [255, -42, None] + arr = pa.array(data) + result = plc.strings.convert.convert_integers.integers_to_hex( + plc.interop.from_arrow(arr) + ) + expected = pa.array(["FF", "FFFFFFFFFFFFFFD6", None]) + assert_column_eq(result, expected) From 7bcfc87935b7a202002d54e17e140789b02f16e9 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 15 Oct 2024 12:17:53 -0400 Subject: [PATCH 070/117] Fix regex handling of fixed quantifier with 0 range (#17067) Fixes regex logic handling of a pattern with a fixed quantifier that includes a zero-range. Added new gtests for this specific case. 
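In user-facing terms, the behavior being restored looks like the following (a condensed, illustrative version of the new gtests):

```cpp
#include <cudf/strings/contains.hpp>
#include <cudf/strings/regex/regex_program.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <cudf_test/column_wrapper.hpp>

void demo()
{
  // "A{0,}" can match the empty string, so every row should report a match,
  // including rows that contain no 'A' at all.
  auto const input = cudf::test::strings_column_wrapper({"a", "", "XYZ"});
  auto const prog  = cudf::strings::regex_program::create("A{0,}");
  auto const found =
    cudf::strings::contains_re(cudf::strings_column_view(input), *prog);
  // expected result: {true, true, true}
}
```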
Bug was introduced in #16798 Closes #17065 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Vyas Ramasubramani (https://github.com/vyasr) - MithunR (https://github.com/mythrocks) - Basit Ayantunde (https://github.com/lamarrr) URL: https://github.com/rapidsai/cudf/pull/17067 --- cpp/src/strings/regex/regcomp.cpp | 6 ++-- cpp/tests/strings/contains_tests.cpp | 34 +++++++++++++++++++++++ cpp/tests/strings/replace_regex_tests.cpp | 28 +++++++++++++++++++ 3 files changed, 64 insertions(+), 4 deletions(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 775a2580f60..b923a301f84 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -710,9 +710,7 @@ class regex_parser { std::stack lbra_stack; auto repeat_start_index = -1; - for (std::size_t index = 0; index < in.size(); index++) { - auto const item = in[index]; - + for (auto const item : in) { if (item.type != COUNTED && item.type != COUNTED_LAZY) { out.push_back(item); if (item.type == LBRA || item.type == LBRA_NC) { @@ -739,7 +737,7 @@ class regex_parser { auto const m = item.d.count.m; // maximum count assert(n >= 0 && "invalid repeat count value n"); // zero-repeat edge-case: need to erase the previous items - if (n == 0 && m == 0) { out.erase(begin, end); } + if (n == 0) { out.erase(begin, end); } std::vector repeat_copy(begin, end); // special handling for quantified capture groups diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 216ddfce5f1..cceec1d3537 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -474,6 +474,40 @@ TEST_F(StringsContainsTests, FixedQuantifier) } } +TEST_F(StringsContainsTests, ZeroRangeQuantifier) +{ + auto input = cudf::test::strings_column_wrapper({"a", "", "abc", "XYAZ", "ABC", "ZYXA"}); + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("A{0,}"); // should match everyting + auto prog = cudf::strings::regex_program::create(pattern); + + { + auto expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1}); + auto results = cudf::strings::contains_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + { + auto expected = cudf::test::fixed_width_column_wrapper({2, 1, 4, 5, 4, 5}); + auto results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + + pattern = std::string("(?:ab){0,3}"); + prog = cudf::strings::regex_program::create(pattern); + + { + auto expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1}); + auto results = cudf::strings::contains_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + { + auto expected = cudf::test::fixed_width_column_wrapper({2, 1, 3, 5, 4, 5}); + auto results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } +} + TEST_F(StringsContainsTests, NestedQuantifier) { auto input = cudf::test::strings_column_wrapper({"TEST12 1111 2222 3333 4444 5555", diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 9847d8d6bb5..abc12b00a81 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -200,6 +200,34 @@ TEST_F(StringsReplaceRegexTest, ZeroLengthMatch) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } +TEST_F(StringsReplaceRegexTest, ZeroRangeQuantifier) +{ + auto input = 
cudf::test::strings_column_wrapper({"a", "", "123", "XYAZ", "abc", "zéyab"}); + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("A{0,5}"); + auto prog = cudf::strings::regex_program::create(pattern); + auto repl = cudf::string_scalar("_"); + auto expected = cudf::test::strings_column_wrapper( + {"_a_", "_", "_1_2_3_", "_X_Y__Z_", "_a_b_c_", "_z_é_y_a_b_"}); + auto results = cudf::strings::replace_re(sv, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + pattern = std::string("[a0-9]{0,2}"); + prog = cudf::strings::regex_program::create(pattern); + expected = + cudf::test::strings_column_wrapper({"__", "_", "___", "_X_Y_A_Z_", "__b_c_", "_z_é_y__b_"}); + results = cudf::strings::replace_re(sv, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + pattern = std::string("(?:ab){0,3}"); + prog = cudf::strings::regex_program::create(pattern); + expected = + cudf::test::strings_column_wrapper({"_a_", "_", "_1_2_3_", "_X_Y_A_Z_", "__c_", "_z_é_y__"}); + results = cudf::strings::replace_re(sv, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsReplaceRegexTest, Multiline) { auto const multiline = cudf::strings::regex_flags::MULTILINE; From 3420c71cb72f63db8d63164446cca042f354a08e Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 15 Oct 2024 23:45:17 -0400 Subject: [PATCH 071/117] Migrate remaining nvtext NGrams APIs to pylibcudf (#17070) Part of #15162 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/17070 --- .../api_docs/pylibcudf/nvtext/index.rst | 1 + .../pylibcudf/nvtext/ngrams_tokenize.rst | 6 +++ .../cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx | 46 ++++------------ .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt | 4 +- .../pylibcudf/pylibcudf/nvtext/__init__.pxd | 11 +++- python/pylibcudf/pylibcudf/nvtext/__init__.py | 3 +- .../pylibcudf/nvtext/ngrams_tokenize.pxd | 13 +++++ .../pylibcudf/nvtext/ngrams_tokenize.pyx | 54 +++++++++++++++++++ .../tests/test_nvtext_ngrams_tokenize.py | 37 +++++++++++++ 9 files changed, 135 insertions(+), 40 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/ngrams_tokenize.rst create mode 100644 python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pxd create mode 100644 python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst index f6caabe324d..58303356336 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -8,3 +8,4 @@ nvtext generate_ngrams jaccard minhash + ngrams_tokenize diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/ngrams_tokenize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/ngrams_tokenize.rst new file mode 100644 index 00000000000..ce6db76f889 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/ngrams_tokenize.rst @@ -0,0 +1,6 @@ +=============== +ngrams_tokenize +=============== + +.. 
automodule:: pylibcudf.nvtext.ngrams_tokenize + :members: diff --git a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx index dec4f037d98..6521116eafe 100644 --- a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx +++ b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx @@ -2,48 +2,22 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.ngrams_tokenize cimport ( - ngrams_tokenize as cpp_ngrams_tokenize, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar + +from pylibcudf import nvtext @acquire_spill_lock() def ngrams_tokenize( - Column strings, + Column input, int ngrams, object py_delimiter, object py_separator ): - - cdef DeviceScalar delimiter = py_delimiter.device_value - cdef DeviceScalar separator = py_separator.device_value - - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef const string_scalar* c_separator = <const string_scalar*>separator\ .get_raw_ptr() - cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\ .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_ngrams_tokenize( - c_strings, - c_ngrams, - c_delimiter[0], - c_separator[0] - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.ngrams_tokenize.ngrams_tokenize( + input.to_pylibcudf(mode="read"), + ngrams, + py_delimiter.device_value.c_value, + py_separator.device_value.c_value ) + return Column.from_pylibcudf(result) diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index 7fd65beeeb0..94df9bbbebb 100644 --- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -12,7 +12,9 @@ # the License. # ============================================================================= -set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx) +set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx + ngrams_tokenize.pyx +) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd index 9eed1da1ab5..b6659827688 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -1,10 +1,17 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport edit_distance, generate_ngrams, jaccard, minhash +from . cimport ( + edit_distance, + generate_ngrams, + jaccard, + minhash, + ngrams_tokenize, +) __all__ = [ "edit_distance", "generate_ngrams", "jaccard", - "minhash" + "minhash", + "ngrams_tokenize" ] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index a3a2363f7ef..f74633a3521 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import edit_distance, generate_ngrams, jaccard, minhash +from . 
import edit_distance, generate_ngrams, jaccard, minhash, ngrams_tokenize __all__ = [ "edit_distance", "generate_ngrams", "jaccard", "minhash", + "ngrams_tokenize", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pxd b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pxd new file mode 100644 index 00000000000..4f791ba1ee9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pxd @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column ngrams_tokenize( + Column input, + size_type ngrams, + Scalar delimiter, + Scalar separator +) diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx new file mode 100644 index 00000000000..8a1854c5f0d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cython.operator cimport dereference +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.ngrams_tokenize cimport ( + ngrams_tokenize as cpp_ngrams_tokenize, +) +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column ngrams_tokenize( + Column input, + size_type ngrams, + Scalar delimiter, + Scalar separator +): + """ + Returns a single column of strings by tokenizing the input strings column + and then producing ngrams of each string. + + For details, see :cpp:func:`ngrams_tokenize` + + Parameters + ---------- + input : Column + Input strings + ngrams : size_type + The ngram number to generate + delimiter : Scalar + UTF-8 characters used to separate each string into tokens. + An empty string will separate tokens using whitespace. + separator : Scalar + The string to use for separating ngram tokens + + Returns + ------- + Column + New strings column of tokens + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_ngrams_tokenize( + input.view(), + ngrams, + dereference(<const string_scalar*>delimiter.get()), + dereference(<const string_scalar*>separator.get()), + ) + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py new file mode 100644 index 00000000000..283a009288d --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py @@ -0,0 +1,37 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
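+# A concrete example of what ngrams_tokenize computes, mirroring the +# pure-Python reference implementation in this test: with ngrams=2, +# delim="*" and sep="_", the input "a*b*c*d" tokenizes to +# ["a", "b", "c", "d"] and adjacent pairs join to ["a_b", "b_c", "c_d"]. 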
+ +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["a*b*c*d", "a b c d", "a-b-c-d", "a*b c-d"] + return pa.array(arr) + + +@pytest.mark.parametrize("ngrams", [2, 3]) +@pytest.mark.parametrize("delim", ["*", " ", "-"]) +@pytest.mark.parametrize("sep", ["_", "&", ","]) +def test_ngrams_tokenize(input_col, ngrams, delim, sep): + def ngrams_tokenize(strings, ngrams, delim, sep): + tokens = [] + for s in strings: + ss = s.split(delim) + for i in range(len(ss) - ngrams + 1): + token = sep.join(ss[i : i + ngrams]) + tokens.append(token) + return tokens + + result = plc.nvtext.ngrams_tokenize.ngrams_tokenize( + plc.interop.from_arrow(input_col), + ngrams, + plc.interop.from_arrow(pa.scalar(delim)), + plc.interop.from_arrow(pa.scalar(sep)), + ) + expected = pa.array( + ngrams_tokenize(input_col.to_pylist(), ngrams, delim, sep) + ) + assert_column_eq(result, expected) From 95df62a1d76026e876fff4e51811de5e95a6b06e Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 16 Oct 2024 15:15:50 -0400 Subject: [PATCH 072/117] Remove unnecessary `std::move`'s in pylibcudf (#16983) This PR removes a lot of unnecessary `std::move`'s from pylibcudf. These were necessary with older versions of Cython, but newer versions appear to generate the correct C++ without needing the extra hints. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16983 --- python/pylibcudf/pylibcudf/binaryop.pyx | 36 +++--- python/pylibcudf/pylibcudf/column.pyx | 8 +- .../pylibcudf/pylibcudf/column_factories.pxd | 4 +- .../pylibcudf/pylibcudf/column_factories.pyx | 68 ++++------ python/pylibcudf/pylibcudf/concatenate.pyx | 4 +- python/pylibcudf/pylibcudf/copying.pyx | 103 +++++++-------- python/pylibcudf/pylibcudf/datetime.pyx | 4 +- python/pylibcudf/pylibcudf/filling.pyx | 38 +++--- python/pylibcudf/pylibcudf/groupby.pyx | 17 +-- python/pylibcudf/pylibcudf/interop.pyx | 4 +- python/pylibcudf/pylibcudf/io/avro.pyx | 2 +- python/pylibcudf/pylibcudf/io/csv.pyx | 2 +- python/pylibcudf/pylibcudf/io/json.pyx | 2 +- python/pylibcudf/pylibcudf/io/orc.pyx | 2 +- python/pylibcudf/pylibcudf/io/timezone.pyx | 8 +- python/pylibcudf/pylibcudf/join.pyx | 2 +- python/pylibcudf/pylibcudf/json.pyx | 10 +- python/pylibcudf/pylibcudf/labeling.pyx | 14 +-- python/pylibcudf/pylibcudf/lists.pyx | 74 +++++------ python/pylibcudf/pylibcudf/merge.pyx | 12 +- python/pylibcudf/pylibcudf/null_mask.pyx | 8 +- .../pylibcudf/nvtext/edit_distance.pyx | 4 +- .../pylibcudf/nvtext/generate_ngrams.pyx | 26 ++-- python/pylibcudf/pylibcudf/nvtext/jaccard.pyx | 10 +- python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 40 +++--- python/pylibcudf/pylibcudf/partitioning.pyx | 20 +-- python/pylibcudf/pylibcudf/quantiles.pyx | 30 ++--- python/pylibcudf/pylibcudf/reduce.pyx | 22 ++-- python/pylibcudf/pylibcudf/replace.pyx | 57 ++++----- python/pylibcudf/pylibcudf/reshape.pyx | 4 +- python/pylibcudf/pylibcudf/rolling.pyx | 29 ++--- python/pylibcudf/pylibcudf/round.pyx | 10 +- python/pylibcudf/pylibcudf/search.pyx | 32 ++--- python/pylibcudf/pylibcudf/sorting.pyx | 118 ++++++++---------- .../pylibcudf/pylibcudf/stream_compaction.pyx | 42 +++---- .../pylibcudf/strings/attributes.pyx | 6 +- .../pylibcudf/strings/char_types.pyx | 22 ++-- .../pylibcudf/pylibcudf/strings/contains.pyx | 20 +-- .../strings/convert/convert_booleans.pyx 
| 18 ++- .../strings/convert/convert_durations.pyx | 18 ++- .../strings/convert/convert_fixed_point.pyx | 22 ++-- .../strings/convert/convert_floats.pyx | 20 +-- .../strings/convert/convert_ipv4.pyx | 18 +-- .../strings/convert/convert_lists.pyx | 10 +- .../strings/convert/convert_urls.pyx | 12 +- .../pylibcudf/pylibcudf/strings/extract.pyx | 16 +-- python/pylibcudf/pylibcudf/strings/find.pyx | 82 +++++------- .../pylibcudf/strings/find_multiple.pyx | 8 +- .../pylibcudf/pylibcudf/strings/findall.pyx | 16 +-- .../pylibcudf/pylibcudf/strings/padding.pyx | 20 ++- python/pylibcudf/pylibcudf/strings/repeat.pyx | 16 +-- .../pylibcudf/pylibcudf/strings/replace.pyx | 12 +- .../pylibcudf/strings/split/partition.pyx | 16 +-- .../pylibcudf/strings/split/split.pyx | 80 +++++------- .../pylibcudf/pylibcudf/strings/translate.pyx | 20 ++- python/pylibcudf/pylibcudf/strings/wrap.pyx | 8 +- python/pylibcudf/pylibcudf/table.pyx | 4 +- python/pylibcudf/pylibcudf/transform.pyx | 16 ++- python/pylibcudf/pylibcudf/transpose.pyx | 2 +- python/pylibcudf/pylibcudf/unary.pyx | 12 +- 60 files changed, 544 insertions(+), 816 deletions(-) diff --git a/python/pylibcudf/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx index 5f9d145139a..51b2b4cfaa3 100644 --- a/python/pylibcudf/pylibcudf/binaryop.pyx +++ b/python/pylibcudf/pylibcudf/binaryop.pyx @@ -52,33 +52,27 @@ cpdef Column binary_operation( if LeftBinaryOperand is Column and RightBinaryOperand is Column: with nogil: - result = move( - cpp_binaryop.binary_operation( - lhs.view(), - rhs.view(), - op, - output_type.c_obj - ) + result = cpp_binaryop.binary_operation( + lhs.view(), + rhs.view(), + op, + output_type.c_obj ) elif LeftBinaryOperand is Column and RightBinaryOperand is Scalar: with nogil: - result = move( - cpp_binaryop.binary_operation( - lhs.view(), - dereference(rhs.c_obj), - op, - output_type.c_obj - ) + result = cpp_binaryop.binary_operation( + lhs.view(), + dereference(rhs.c_obj), + op, + output_type.c_obj ) elif LeftBinaryOperand is Scalar and RightBinaryOperand is Column: with nogil: - result = move( - cpp_binaryop.binary_operation( - dereference(lhs.c_obj), - rhs.view(), - op, - output_type.c_obj - ) + result = cpp_binaryop.binary_operation( + dereference(lhs.c_obj), + rhs.view(), + op, + output_type.c_obj ) else: raise ValueError(f"Invalid arguments {lhs} and {rhs}") diff --git a/python/pylibcudf/pylibcudf/column.pyx b/python/pylibcudf/pylibcudf/column.pyx index 03808f0b664..4e5698566d0 100644 --- a/python/pylibcudf/pylibcudf/column.pyx +++ b/python/pylibcudf/pylibcudf/column.pyx @@ -138,7 +138,7 @@ cdef class Column: cdef size_type null_count = libcudf_col.get().null_count() - cdef column_contents contents = move(libcudf_col.get().release()) + cdef column_contents contents = libcudf_col.get().release() # Note that when converting to cudf Column objects we'll need to pull # out the base object. 
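The rewrites in this patch all follow one rule: when the right-hand side of an assignment is a C++ call, the returned prvalue already move-constructs the `unique_ptr` target, so wrapping the call in `move()` adds nothing; the hint is only kept where an lvalue is handed off afterwards. A minimal Cython sketch of the rule, using a hypothetical wrapped function `cpp_frob` (any libcudf call returning `unique_ptr[column]` behaves the same):

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column

cpdef Column frob(Column input):
    cdef unique_ptr[column] c_result
    with nogil:
        # rvalue result: recent Cython emits the move-construction on its own
        c_result = cpp_frob(input.view())
    # c_result is an lvalue here, so the explicit hint is still required
    return Column.from_libcudf(move(c_result))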
@@ -247,7 +247,7 @@ cdef class Column: cdef const scalar* c_scalar = slr.get() cdef unique_ptr[column] c_result with nogil: - c_result = move(make_column_from_scalar(dereference(c_scalar), size)) + c_result = make_column_from_scalar(dereference(c_scalar), size) return Column.from_libcudf(move(c_result)) @staticmethod @@ -269,7 +269,7 @@ cdef class Column: cdef Scalar slr = Scalar.empty_like(like) cdef unique_ptr[column] c_result with nogil: - c_result = move(make_column_from_scalar(dereference(slr.get()), size)) + c_result = make_column_from_scalar(dereference(slr.get()), size) return Column.from_libcudf(move(c_result)) @staticmethod @@ -373,7 +373,7 @@ cdef class Column: """Create a copy of the column.""" cdef unique_ptr[column] c_result with nogil: - c_result = move(make_unique[column](self.view())) + c_result = make_unique[column](self.view()) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/column_factories.pxd b/python/pylibcudf/pylibcudf/column_factories.pxd index fef02359240..d556085ab64 100644 --- a/python/pylibcudf/pylibcudf/column_factories.pxd +++ b/python/pylibcudf/pylibcudf/column_factories.pxd @@ -1,7 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move -from pylibcudf.libcudf.types cimport mask_state, size_type +from pylibcudf.libcudf.types cimport mask_state from .column cimport Column from .types cimport DataType, size_type, type_id diff --git a/python/pylibcudf/pylibcudf/column_factories.pyx b/python/pylibcudf/pylibcudf/column_factories.pyx index e9085e3ea02..ac942a620b5 100644 --- a/python/pylibcudf/pylibcudf/column_factories.pyx +++ b/python/pylibcudf/pylibcudf/column_factories.pyx @@ -39,29 +39,17 @@ cpdef Column make_empty_column(MakeEmptyColumnOperand type_or_id): if isinstance(type_or_id, TypeId): id = type_or_id with nogil: - result = move( - cpp_make_empty_column( - id - ) - ) + result = cpp_make_empty_column(id) else: raise TypeError( "Must pass a TypeId or DataType" ) elif MakeEmptyColumnOperand is DataType: with nogil: - result = move( - cpp_make_empty_column( - type_or_id.c_obj - ) - ) + result = cpp_make_empty_column(type_or_id.c_obj) elif MakeEmptyColumnOperand is type_id: with nogil: - result = move( - cpp_make_empty_column( - type_or_id - ) - ) + result = cpp_make_empty_column(type_or_id) else: raise TypeError( "Must pass a TypeId or DataType" @@ -92,12 +80,10 @@ cpdef Column make_numeric_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_numeric_column( - type_.c_obj, - size, - state - ) + result = cpp_make_numeric_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) @@ -121,12 +107,10 @@ cpdef Column make_fixed_point_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_fixed_point_column( - type_.c_obj, - size, - state - ) + result = cpp_make_fixed_point_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) @@ -151,12 +135,10 @@ cpdef Column make_timestamp_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_timestamp_column( - type_.c_obj, - size, - state - ) + result = cpp_make_timestamp_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) @@ -181,12 +163,10 @@ cpdef Column make_duration_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_duration_column( - type_.c_obj, - size, - state - ) + result 
= cpp_make_duration_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) @@ -211,12 +191,10 @@ cpdef Column make_fixed_width_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_fixed_width_column( - type_.c_obj, - size, - state - ) + result = cpp_make_fixed_width_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/concatenate.pyx b/python/pylibcudf/pylibcudf/concatenate.pyx index 8bdcc086e0f..10c860d97bb 100644 --- a/python/pylibcudf/pylibcudf/concatenate.pyx +++ b/python/pylibcudf/pylibcudf/concatenate.pyx @@ -40,14 +40,14 @@ cpdef concatenate(list objects): c_tables.push_back((<Table> tbl).view()) with nogil: - c_tbl_result = move(cpp_concatenate.concatenate(c_tables)) + c_tbl_result = cpp_concatenate.concatenate(c_tables) return Table.from_libcudf(move(c_tbl_result)) elif isinstance(objects[0], Column): for column in objects: c_columns.push_back((<Column> column).view()) with nogil: - c_col_result = move(cpp_concatenate.concatenate(c_columns)) + c_col_result = cpp_concatenate.concatenate(c_columns) return Column.from_libcudf(move(c_col_result)) else: raise ValueError("input must be a list of Columns or Tables") diff --git a/python/pylibcudf/pylibcudf/copying.pyx b/python/pylibcudf/pylibcudf/copying.pyx index 9743119d92a..4938f1a3dda 100644 --- a/python/pylibcudf/pylibcudf/copying.pyx +++ b/python/pylibcudf/pylibcudf/copying.pyx @@ -67,13 +67,12 @@ cpdef Table gather( """ cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_copying.gather( - source_table.view(), - gather_map.view(), - bounds_policy - ) + c_result = cpp_copying.gather( + source_table.view(), + gather_map.view(), + bounds_policy ) + return Table.from_libcudf(move(c_result)) @@ -121,22 +120,18 @@ cpdef Table scatter( cdef vector[reference_wrapper[const scalar]] source_scalars if TableOrListOfScalars is Table: with nogil: - c_result = move( - cpp_copying.scatter( - source.view(), - scatter_map.view(), - target_table.view(), - ) + c_result = cpp_copying.scatter( + source.view(), + scatter_map.view(), + target_table.view(), ) else: source_scalars = _as_vector(source) with nogil: - c_result = move( - cpp_copying.scatter( - source_scalars, - scatter_map.view(), - target_table.view(), - ) + c_result = cpp_copying.scatter( + source_scalars, + scatter_map.view(), + target_table.view(), ) return Table.from_libcudf(move(c_result)) @@ -160,11 +155,11 @@ cpdef ColumnOrTable empty_like(ColumnOrTable input): cdef unique_ptr[column] c_col_result if ColumnOrTable is Column: with nogil: - c_col_result = move(cpp_copying.empty_like(input.view())) + c_col_result = cpp_copying.empty_like(input.view()) return Column.from_libcudf(move(c_col_result)) else: with nogil: - c_tbl_result = move(cpp_copying.empty_like(input.view())) + c_tbl_result = cpp_copying.empty_like(input.view()) return Table.from_libcudf(move(c_tbl_result)) @@ -195,13 +190,11 @@ cpdef Column allocate_like( cdef size_type c_size = size if size is not None else input_column.size() with nogil: - c_result = move( - cpp_copying.allocate_like( + c_result = cpp_copying.allocate_like( input_column.view(), c_size, policy, ) - ) return Column.from_libcudf(move(c_result)) @@ -298,12 +291,12 @@ cpdef Column copy_range( cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_copying.copy_range( + c_result = cpp_copying.copy_range( input_column.view(), target_column.view(), input_begin, input_end, - target_begin) + target_begin ) return 
Column.from_libcudf(move(c_result)) @@ -337,13 +330,11 @@ cpdef Column shift(Column input, size_type offset, Scalar fill_value): """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_copying.shift( + c_result = cpp_copying.shift( input.view(), offset, dereference(fill_value.c_obj) ) - ) return Column.from_libcudf(move(c_result)) @@ -378,7 +369,7 @@ cpdef list slice(ColumnOrTable input, list indices): cdef int i if ColumnOrTable is Column: with nogil: - c_col_result = move(cpp_copying.slice(input.view(), c_indices)) + c_col_result = cpp_copying.slice(input.view(), c_indices) return [ Column.from_column_view(c_col_result[i], input) @@ -386,7 +377,7 @@ cpdef list slice(ColumnOrTable input, list indices): ] else: with nogil: - c_tbl_result = move(cpp_copying.slice(input.view(), c_indices)) + c_tbl_result = cpp_copying.slice(input.view(), c_indices) return [ Table.from_table_view(c_tbl_result[i], input) @@ -418,7 +409,7 @@ cpdef list split(ColumnOrTable input, list splits): if ColumnOrTable is Column: with nogil: - c_col_result = move(cpp_copying.split(input.view(), c_splits)) + c_col_result = cpp_copying.split(input.view(), c_splits) return [ Column.from_column_view(c_col_result[i], input) @@ -426,7 +417,7 @@ cpdef list split(ColumnOrTable input, list splits): ] else: with nogil: - c_tbl_result = move(cpp_copying.split(input.view(), c_splits)) + c_tbl_result = cpp_copying.split(input.view(), c_splits) return [ Table.from_table_view(c_tbl_result[i], input) @@ -472,29 +463,25 @@ cpdef Column copy_if_else( if LeftCopyIfElseOperand is Column and RightCopyIfElseOperand is Column: with nogil: - result = move( - cpp_copying.copy_if_else(lhs.view(), rhs.view(), boolean_mask.view()) + result = cpp_copying.copy_if_else( + lhs.view(), + rhs.view(), + boolean_mask.view() ) elif LeftCopyIfElseOperand is Column and RightCopyIfElseOperand is Scalar: with nogil: - result = move( - cpp_copying.copy_if_else( - lhs.view(), dereference(rhs.c_obj), boolean_mask.view() - ) + result = cpp_copying.copy_if_else( + lhs.view(), dereference(rhs.c_obj), boolean_mask.view() ) elif LeftCopyIfElseOperand is Scalar and RightCopyIfElseOperand is Column: with nogil: - result = move( - cpp_copying.copy_if_else( - dereference(lhs.c_obj), rhs.view(), boolean_mask.view() - ) + result = cpp_copying.copy_if_else( + dereference(lhs.c_obj), rhs.view(), boolean_mask.view() ) else: with nogil: - result = move( - cpp_copying.copy_if_else( - dereference(lhs.c_obj), dereference(rhs.c_obj), boolean_mask.view() - ) + result = cpp_copying.copy_if_else( + dereference(lhs.c_obj), dereference(rhs.c_obj), boolean_mask.view() ) return Column.from_libcudf(move(result)) @@ -541,22 +528,18 @@ cpdef Table boolean_mask_scatter( if TableOrListOfScalars is Table: with nogil: - result = move( - cpp_copying.boolean_mask_scatter( - input.view(), - target.view(), - boolean_mask.view() - ) + result = cpp_copying.boolean_mask_scatter( + input.view(), + target.view(), + boolean_mask.view() ) else: source_scalars = _as_vector(input) with nogil: - result = move( - cpp_copying.boolean_mask_scatter( - source_scalars, - target.view(), - boolean_mask.view(), - ) + result = cpp_copying.boolean_mask_scatter( + source_scalars, + target.view(), + boolean_mask.view(), ) return Table.from_libcudf(move(result)) @@ -586,8 +569,6 @@ cpdef Scalar get_element(Column input_column, size_type index): """ cdef unique_ptr[scalar] c_output with nogil: - c_output = move( - cpp_copying.get_element(input_column.view(), index) - ) + c_output = 
cpp_copying.get_element(input_column.view(), index) return Scalar.from_libcudf(move(c_output)) diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx index 784d29128bf..ac4335cca56 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyx +++ b/python/pylibcudf/pylibcudf/datetime.pyx @@ -33,7 +33,7 @@ cpdef Column extract_year( cdef unique_ptr[column] result with nogil: - result = move(cpp_extract_year(values.view())) + result = cpp_extract_year(values.view()) return Column.from_libcudf(move(result)) cpdef Column extract_datetime_component( @@ -60,5 +60,5 @@ cdef unique_ptr[column] result with nogil: - result = move(cpp_extract_datetime_component(values.view(), component)) + result = cpp_extract_datetime_component(values.view(), component) return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/filling.pyx b/python/pylibcudf/pylibcudf/filling.pyx index 61b430e64aa..0372e1132cc 100644 --- a/python/pylibcudf/pylibcudf/filling.pyx +++ b/python/pylibcudf/pylibcudf/filling.pyx @@ -48,13 +48,11 @@ cpdef Column fill( cdef unique_ptr[column] result with nogil: - result = move( - cpp_fill( - destination.view(), - begin, - end, - dereference((<Scalar> value).c_obj) - ) + result = cpp_fill( + destination.view(), + begin, + end, + dereference((<Scalar> value).c_obj) ) return Column.from_libcudf(move(result)) @@ -112,12 +110,10 @@ cpdef Column sequence(size_type size, Scalar init, Scalar step): cdef unique_ptr[column] result cdef size_type c_size = size with nogil: - result = move( - cpp_sequence( - c_size, - dereference(init.c_obj), - dereference(step.c_obj), - ) + result = cpp_sequence( + c_size, + dereference(init.c_obj), + dereference(step.c_obj), ) return Column.from_libcudf(move(result)) @@ -152,18 +148,14 @@ cpdef Table repeat( if ColumnOrSize is Column: with nogil: - result = move( - cpp_repeat( - input_table.view(), - count.view() - ) + result = cpp_repeat( + input_table.view(), + count.view() ) if ColumnOrSize is size_type: with nogil: - result = move( - cpp_repeat( - input_table.view(), - count - ) + result = cpp_repeat( + input_table.view(), + count ) return Table.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/groupby.pyx b/python/pylibcudf/pylibcudf/groupby.pyx index afb95dba5b3..71f9ecb0453 100644 --- a/python/pylibcudf/pylibcudf/groupby.pyx +++ b/python/pylibcudf/pylibcudf/groupby.pyx @@ -176,7 +176,7 @@ cdef class GroupBy: # We rely on libcudf to tell us this rather than checking the types beforehand # ourselves. 
with nogil: - c_res = move(dereference(self.c_obj).aggregate(c_requests)) + c_res = dereference(self.c_obj).aggregate(c_requests) return GroupBy._parse_outputs(move(c_res)) cpdef tuple scan(self, list requests): @@ -205,7 +205,7 @@ cdef class GroupBy: cdef pair[unique_ptr[table], vector[aggregation_result]] c_res with nogil: - c_res = move(dereference(self.c_obj).scan(c_requests)) + c_res = dereference(self.c_obj).scan(c_requests) return GroupBy._parse_outputs(move(c_res)) cpdef tuple shift(self, Table values, list offset, list fill_values): @@ -234,10 +234,11 @@ cdef class GroupBy: cdef vector[size_type] c_offset = offset cdef pair[unique_ptr[table], unique_ptr[table]] c_res with nogil: - c_res = move( - dereference(self.c_obj).shift(values.view(), c_offset, c_fill_values) + c_res = dereference(self.c_obj).shift( + values.view(), + c_offset, + c_fill_values ) - return ( Table.from_libcudf(move(c_res.first)), Table.from_libcudf(move(c_res.second)), @@ -264,10 +265,10 @@ cdef class GroupBy: cdef pair[unique_ptr[table], unique_ptr[table]] c_res cdef vector[replace_policy] c_replace_policies = replace_policies with nogil: - c_res = move( - dereference(self.c_obj).replace_nulls(value.view(), c_replace_policies) + c_res = dereference(self.c_obj).replace_nulls( + value.view(), + c_replace_policies ) - return ( Table.from_libcudf(move(c_res.first)), Table.from_libcudf(move(c_res.second)), diff --git a/python/pylibcudf/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx index 1a03fa5b45b..642516a1b90 100644 --- a/python/pylibcudf/pylibcudf/interop.pyx +++ b/python/pylibcudf/pylibcudf/interop.pyx @@ -131,7 +131,7 @@ def _from_arrow_table(pyarrow_object, *, DataType data_type=None): cdef unique_ptr[table] c_result with nogil: # The libcudf function here will release the stream. - c_result = move(cpp_from_arrow_stream(c_stream)) + c_result = cpp_from_arrow_stream(c_stream) return Table.from_libcudf(move(c_result)) @@ -166,7 +166,7 @@ def _from_arrow_column(pyarrow_object, *, DataType data_type=None): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_from_arrow_column(c_schema, c_array)) + c_result = cpp_from_arrow_column(c_schema, c_array) # The capsule destructors should release automatically for us, but we # choose to do it explicitly here for clarity. 
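The io modules that follow apply the same reasoning to reader-options builders: the chained `.build()` already yields a temporary that binds as an rvalue, so the `move(...)` wrapper around the whole builder expression is dropped. A sketch of the resulting shape (options abbreviated from the avro reader below):

cdef avro_reader_options avro_opts = (
    avro_reader_options.builder(source_info.c_obj)
    .skip_rows(skip_rows)
    .build()
)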
diff --git a/python/pylibcudf/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx index 438b0ff1634..fe765b34f82 100644 --- a/python/pylibcudf/pylibcudf/io/avro.pyx +++ b/python/pylibcudf/pylibcudf/io/avro.pyx @@ -45,7 +45,7 @@ cpdef TableWithMetadata read_avro( for col in columns: c_columns.push_back(str(col).encode()) - cdef avro_reader_options avro_opts = move( + cdef avro_reader_options avro_opts = ( avro_reader_options.builder(source_info.c_obj) .columns(c_columns) .skip_rows(skip_rows) diff --git a/python/pylibcudf/pylibcudf/io/csv.pyx b/python/pylibcudf/pylibcudf/io/csv.pyx index b53d6771cd6..2c61cc42d82 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyx +++ b/python/pylibcudf/pylibcudf/io/csv.pyx @@ -168,7 +168,7 @@ def read_csv( cdef vector[data_type] c_dtypes_list cdef map[string, data_type] c_dtypes_map - cdef csv_reader_options options = move( + cdef csv_reader_options options = ( csv_reader_options.builder(source_info.c_obj) .compression(compression) .mangle_dupe_cols(mangle_dupe_cols) diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx index 29e49083bc6..65f78f830f1 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyx +++ b/python/pylibcudf/pylibcudf/io/json.pyx @@ -59,7 +59,7 @@ cdef json_reader_options _setup_json_reader_options( json_recovery_mode_t recovery_mode): cdef vector[data_type] types_vec - cdef json_reader_options opts = move( + cdef json_reader_options opts = ( json_reader_options.builder(source_info.c_obj) .compression(compression) .lines(lines) diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx index 01a5e4b04a1..70e0a7995a2 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyx +++ b/python/pylibcudf/pylibcudf/io/orc.pyx @@ -252,7 +252,7 @@ cpdef TableWithMetadata read_orc( """ cdef orc_reader_options opts cdef vector[vector[size_type]] c_stripes - opts = move( + opts = ( orc_reader_options.builder(source_info.c_obj) .use_index(use_index) .build() diff --git a/python/pylibcudf/pylibcudf/io/timezone.pyx b/python/pylibcudf/pylibcudf/io/timezone.pyx index e02239d7252..f120b65fb2c 100644 --- a/python/pylibcudf/pylibcudf/io/timezone.pyx +++ b/python/pylibcudf/pylibcudf/io/timezone.pyx @@ -33,11 +33,9 @@ cpdef Table make_timezone_transition_table(str tzif_dir, str timezone_name): cdef string c_tzname = timezone_name.encode() with nogil: - c_result = move( - cpp_make_timezone_transition_table( - make_optional[string](c_tzdir), - c_tzname - ) + c_result = cpp_make_timezone_transition_table( + make_optional[string](c_tzdir), + c_tzname ) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/join.pyx b/python/pylibcudf/pylibcudf/join.pyx index b019ed8f099..bc72647ea8e 100644 --- a/python/pylibcudf/pylibcudf/join.pyx +++ b/python/pylibcudf/pylibcudf/join.pyx @@ -212,5 +212,5 @@ cpdef Table cross_join(Table left, Table right): """ cdef unique_ptr[table] result with nogil: - result = move(cpp_join.cross_join(left.view(), right.view())) + result = cpp_join.cross_join(left.view(), right.view()) return Table.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/json.pyx b/python/pylibcudf/pylibcudf/json.pyx index 4a8d11068f9..ebb82f80408 100644 --- a/python/pylibcudf/pylibcudf/json.pyx +++ b/python/pylibcudf/pylibcudf/json.pyx @@ -143,12 +143,10 @@ cpdef Column get_json_object( cdef cpp_json.get_json_object_options c_options = options.options with nogil: - c_result = move( - cpp_json.get_json_object( - col.view(), - dereference(c_json_path), - 
c_options - ) + c_result = cpp_json.get_json_object( + col.view(), + dereference(c_json_path), + c_options ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/labeling.pyx b/python/pylibcudf/pylibcudf/labeling.pyx index b3f6a92d85c..226a9e14172 100644 --- a/python/pylibcudf/pylibcudf/labeling.pyx +++ b/python/pylibcudf/pylibcudf/labeling.pyx @@ -54,14 +54,12 @@ cpdef Column label_bins( ) with nogil: - c_result = move( - cpp_labeling.label_bins( - input.view(), - left_edges.view(), - c_left_inclusive, - right_edges.view(), - c_right_inclusive, - ) + c_result = cpp_labeling.label_bins( + input.view(), + left_edges.view(), + c_left_inclusive, + right_edges.view(), + c_right_inclusive, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/lists.pyx b/python/pylibcudf/pylibcudf/lists.pyx index 6f82124d06e..ecaf62d6895 100644 --- a/python/pylibcudf/pylibcudf/lists.pyx +++ b/python/pylibcudf/pylibcudf/lists.pyx @@ -69,7 +69,7 @@ cpdef Table explode_outer(Table input, size_type explode_column_idx): cdef unique_ptr[table] c_result with nogil: - c_result = move(cpp_explode.explode_outer(input.view(), explode_column_idx)) + c_result = cpp_explode.explode_outer(input.view(), explode_column_idx) return Table.from_libcudf(move(c_result)) @@ -92,7 +92,7 @@ cpdef Column concatenate_rows(Table input): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_concatenate_rows(input.view())) + c_result = cpp_concatenate_rows(input.view()) return Column.from_libcudf(move(c_result)) @@ -123,10 +123,7 @@ cpdef Column concatenate_list_elements(Column input, bool dropna): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_concatenate_list_elements( - input.view(), - null_policy, - )) + c_result = cpp_concatenate_list_elements(input.view(), null_policy) return Column.from_libcudf(move(c_result)) @@ -161,12 +158,12 @@ cpdef Column contains(Column input, ColumnOrScalar search_key): raise TypeError("Must pass a Column or Scalar") with nogil: - c_result = move(cpp_contains.contains( + c_result = cpp_contains.contains( list_view.view(), search_key.view() if ColumnOrScalar is Column else dereference( search_key.get() ), - )) + ) return Column.from_libcudf(move(c_result)) @@ -190,7 +187,7 @@ cpdef Column contains_nulls(Column input): cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() with nogil: - c_result = move(cpp_contains.contains_nulls(list_view.view())) + c_result = cpp_contains.contains_nulls(list_view.view()) return Column.from_libcudf(move(c_result)) @@ -229,13 +226,13 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o ) with nogil: - c_result = move(cpp_contains.index_of( + c_result = cpp_contains.index_of( list_view.view(), search_key.view() if ColumnOrScalar is Column else dereference( search_key.get() ), find_option, - )) + ) return Column.from_libcudf(move(c_result)) @@ -258,9 +255,7 @@ cpdef Column reverse(Column input): cdef ListColumnView list_view = input.list_view() with nogil: - c_result = move(cpp_reverse.reverse( - list_view.view(), - )) + c_result = cpp_reverse.reverse(list_view.view()) return Column.from_libcudf(move(c_result)) @@ -288,10 +283,10 @@ cpdef Column segmented_gather(Column input, Column gather_map_list): cdef ListColumnView list_view2 = gather_map_list.list_view() with nogil: - c_result = move(cpp_gather.segmented_gather( + c_result = cpp_gather.segmented_gather( list_view1.view(), list_view2.view(), - )) + ) return 
Column.from_libcudf(move(c_result)) @@ -316,10 +311,10 @@ cpdef Column extract_list_element(Column input, ColumnOrSizeType index): cdef ListColumnView list_view = input.list_view() with nogil: - c_result = move(cpp_extract_list_element( + c_result = cpp_extract_list_element( list_view.view(), index.view() if ColumnOrSizeType is Column else index, - )) + ) return Column.from_libcudf(move(c_result)) @@ -344,7 +339,7 @@ cpdef Column count_elements(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_count_elements(list_view.view())) + c_result = cpp_count_elements(list_view.view()) return Column.from_libcudf(move(c_result)) @@ -373,17 +368,14 @@ cpdef Column sequences(Column starts, Column sizes, Column steps = None): if steps is not None: with nogil: - c_result = move(cpp_filling.sequences( + c_result = cpp_filling.sequences( starts.view(), steps.view(), sizes.view(), - )) + ) else: with nogil: - c_result = move(cpp_filling.sequences( - starts.view(), - sizes.view(), - )) + c_result = cpp_filling.sequences(starts.view(), sizes.view()) return Column.from_libcudf(move(c_result)) cpdef Column sort_lists( @@ -423,17 +415,17 @@ cpdef Column sort_lists( with nogil: if stable: - c_result = move(cpp_stable_sort_lists( + c_result = cpp_stable_sort_lists( list_view.view(), c_sort_order, na_position, - )) + ) else: - c_result = move(cpp_sort_lists( + c_result = cpp_sort_lists( list_view.view(), c_sort_order, na_position, - )) + ) return Column.from_libcudf(move(c_result)) @@ -477,12 +469,12 @@ cpdef Column difference_distinct( ) with nogil: - c_result = move(cpp_set_operations.difference_distinct( + c_result = cpp_set_operations.difference_distinct( lhs_view.view(), rhs_view.view(), c_nulls_equal, c_nans_equal, - )) + ) return Column.from_libcudf(move(c_result)) @@ -525,12 +517,12 @@ cpdef Column have_overlap( ) with nogil: - c_result = move(cpp_set_operations.have_overlap( + c_result = cpp_set_operations.have_overlap( lhs_view.view(), rhs_view.view(), c_nulls_equal, c_nans_equal, - )) + ) return Column.from_libcudf(move(c_result)) @@ -573,12 +565,12 @@ cpdef Column intersect_distinct( ) with nogil: - c_result = move(cpp_set_operations.intersect_distinct( + c_result = cpp_set_operations.intersect_distinct( lhs_view.view(), rhs_view.view(), c_nulls_equal, c_nans_equal, - )) + ) return Column.from_libcudf(move(c_result)) @@ -622,12 +614,12 @@ cpdef Column union_distinct( ) with nogil: - c_result = move(cpp_set_operations.union_distinct( + c_result = cpp_set_operations.union_distinct( lhs_view.view(), rhs_view.view(), c_nulls_equal, c_nans_equal, - )) + ) return Column.from_libcudf(move(c_result)) @@ -652,10 +644,10 @@ cpdef Column apply_boolean_mask(Column input, Column boolean_mask): cdef ListColumnView list_view = input.list_view() cdef ListColumnView mask_view = boolean_mask.list_view() with nogil: - c_result = move(cpp_apply_boolean_mask( + c_result = cpp_apply_boolean_mask( list_view.view(), mask_view.view(), - )) + ) return Column.from_libcudf(move(c_result)) @@ -690,9 +682,9 @@ cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal): ) with nogil: - c_result = move(cpp_distinct( + c_result = cpp_distinct( list_view.view(), c_nulls_equal, c_nans_equal, - )) + ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/merge.pyx b/python/pylibcudf/pylibcudf/merge.pyx index 6d707b67449..61a21aafdb2 100644 --- a/python/pylibcudf/pylibcudf/merge.pyx +++ b/python/pylibcudf/pylibcudf/merge.pyx @@ -47,12 +47,10 @@ cpdef Table merge ( 
cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_merge.merge( - c_tables_to_merge, - c_key_cols, - c_column_order, - c_null_precedence, - ) + c_result = cpp_merge.merge( + c_tables_to_merge, + c_key_cols, + c_column_order, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/null_mask.pyx b/python/pylibcudf/pylibcudf/null_mask.pyx index aae39987dac..74180951562 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pyx +++ b/python/pylibcudf/pylibcudf/null_mask.pyx @@ -38,7 +38,7 @@ cpdef DeviceBuffer copy_bitmask(Column col): cdef device_buffer db with nogil: - db = move(cpp_null_mask.copy_bitmask(col.view())) + db = cpp_null_mask.copy_bitmask(col.view()) return buffer_to_python(move(db)) @@ -90,7 +90,7 @@ cpdef DeviceBuffer create_null_mask( cdef device_buffer db with nogil: - db = move(cpp_null_mask.create_null_mask(size, state)) + db = cpp_null_mask.create_null_mask(size, state) return buffer_to_python(move(db)) @@ -114,7 +114,7 @@ cpdef tuple bitmask_and(list columns): cdef pair[device_buffer, size_type] c_result with nogil: - c_result = move(cpp_null_mask.bitmask_and(c_table.view())) + c_result = cpp_null_mask.bitmask_and(c_table.view()) return buffer_to_python(move(c_result.first)), c_result.second @@ -138,6 +138,6 @@ cpdef tuple bitmask_or(list columns): cdef pair[device_buffer, size_type] c_result with nogil: - c_result = move(cpp_null_mask.bitmask_or(c_table.view())) + c_result = cpp_null_mask.bitmask_or(c_table.view()) return buffer_to_python(move(c_result.first)), c_result.second diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx index fc98ccbc50c..dcacb2e1267 100644 --- a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx @@ -33,7 +33,7 @@ cpdef Column edit_distance(Column input, Column targets): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_edit_distance(c_strings, c_targets)) + c_result = cpp_edit_distance(c_strings, c_targets) return Column.from_libcudf(move(c_result)) @@ -58,6 +58,6 @@ cpdef Column edit_distance_matrix(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_edit_distance_matrix(c_strings)) + c_result = cpp_edit_distance_matrix(c_strings) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx index 8c7a8edc01d..09859d09e9e 100644 --- a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx @@ -40,12 +40,10 @@ cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_generate_ngrams( - c_strings, - ngrams, - c_separator[0] - ) + c_result = cpp_generate_ngrams( + c_strings, + ngrams, + c_separator[0] ) return Column.from_libcudf(move(c_result)) @@ -72,11 +70,9 @@ cpdef Column generate_character_ngrams(Column input, size_type ngrams = 2): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_generate_character_ngrams( - c_strings, - ngrams, - ) + c_result = cpp_generate_character_ngrams( + c_strings, + ngrams, ) return Column.from_libcudf(move(c_result)) @@ -102,10 +98,8 @@ cpdef Column hash_character_ngrams(Column input, size_type ngrams = 2): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_hash_character_ngrams( - c_strings, - 
ngrams, - ) + c_result = cpp_hash_character_ngrams( + c_strings, + ngrams, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx index 9334d7ce751..3d8669865d9 100644 --- a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx @@ -36,12 +36,10 @@ cpdef Column jaccard_index(Column input1, Column input2, size_type width): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_jaccard_index( - c_input1, - c_input2, - width - ) + c_result = cpp_jaccard_index( + c_input1, + c_input2, + width ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx index 5fabf6a3f89..f1e012e60e5 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -46,13 +46,11 @@ cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): raise TypeError("Must pass a Column or Scalar") with nogil: - c_result = move( - cpp_minhash( - input.view(), - seeds.view() if ColumnOrScalar is Column else - dereference(seeds.c_obj.get()), - width - ) + c_result = cpp_minhash( + input.view(), + seeds.view() if ColumnOrScalar is Column else + dereference(seeds.c_obj.get()), + width ) return Column.from_libcudf(move(c_result)) @@ -85,13 +83,11 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): raise TypeError("Must pass a Column or Scalar") with nogil: - c_result = move( - cpp_minhash64( - input.view(), - seeds.view() if ColumnOrScalar is Column else - dereference(seeds.c_obj.get()), - width - ) + c_result = cpp_minhash64( + input.view(), + seeds.view() if ColumnOrScalar is Column else + dereference(seeds.c_obj.get()), + width ) return Column.from_libcudf(move(c_result)) @@ -118,11 +114,9 @@ cpdef Column word_minhash(Column input, Column seeds): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_word_minhash( - input.view(), - seeds.view() - ) + c_result = cpp_word_minhash( + input.view(), + seeds.view() ) return Column.from_libcudf(move(c_result)) @@ -150,11 +144,9 @@ cpdef Column word_minhash64(Column input, Column seeds): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_word_minhash64( - input.view(), - seeds.view() - ) + c_result = cpp_word_minhash64( + input.view(), + seeds.view() ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/partitioning.pyx b/python/pylibcudf/pylibcudf/partitioning.pyx index 8fa70daab5a..3cff4843735 100644 --- a/python/pylibcudf/pylibcudf/partitioning.pyx +++ b/python/pylibcudf/pylibcudf/partitioning.pyx @@ -41,10 +41,10 @@ cpdef tuple[Table, list] hash_partition( cdef int c_num_partitions = num_partitions with nogil: - c_result = move( - cpp_partitioning.hash_partition( - input.view(), c_columns_to_hash, c_num_partitions - ) + c_result = cpp_partitioning.hash_partition( + input.view(), + c_columns_to_hash, + c_num_partitions ) return Table.from_libcudf(move(c_result.first)), list(c_result.second) @@ -74,8 +74,10 @@ cpdef tuple[Table, list] partition(Table t, Column partition_map, int num_partit cdef int c_num_partitions = num_partitions with nogil: - c_result = move( - cpp_partitioning.partition(t.view(), partition_map.view(), c_num_partitions) + c_result = cpp_partitioning.partition( + t.view(), + partition_map.view(), + c_num_partitions ) return 
Table.from_libcudf(move(c_result.first)), list(c_result.second) @@ -111,10 +113,8 @@ cpdef tuple[Table, list] round_robin_partition( cdef int c_start_partition = start_partition with nogil: - c_result = move( - cpp_partitioning.round_robin_partition( - input.view(), c_num_partitions, c_start_partition - ) + c_result = cpp_partitioning.round_robin_partition( + input.view(), c_num_partitions, c_start_partition ) return Table.from_libcudf(move(c_result.first)), list(c_result.second) diff --git a/python/pylibcudf/pylibcudf/quantiles.pyx b/python/pylibcudf/pylibcudf/quantiles.pyx index 3a771fbe7ef..7d92b598bd0 100644 --- a/python/pylibcudf/pylibcudf/quantiles.pyx +++ b/python/pylibcudf/pylibcudf/quantiles.pyx @@ -66,14 +66,12 @@ cpdef Column quantile( ordered_indices_view = ordered_indices.view() with nogil: - c_result = move( - cpp_quantile( - input.view(), - q, - interp, - ordered_indices_view, - exact, - ) + c_result = cpp_quantile( + input.view(), + q, + interp, + ordered_indices_view, + exact, ) return Column.from_libcudf(move(c_result)) @@ -141,15 +139,13 @@ cpdef Table quantiles( null_precedence_vec = null_precedence with nogil: - c_result = move( - cpp_quantiles( - input.view(), - q, - interp, - is_input_sorted, - column_order_vec, - null_precedence_vec, - ) + c_result = cpp_quantiles( + input.view(), + q, + interp, + is_input_sorted, + column_order_vec, + null_precedence_vec, ) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/reduce.pyx b/python/pylibcudf/pylibcudf/reduce.pyx index b0212a5b9c1..d9ec3a9bdc4 100644 --- a/python/pylibcudf/pylibcudf/reduce.pyx +++ b/python/pylibcudf/pylibcudf/reduce.pyx @@ -39,12 +39,10 @@ cpdef Scalar reduce(Column col, Aggregation agg, DataType data_type): cdef unique_ptr[scalar] result cdef const reduce_aggregation *c_agg = agg.view_underlying_as_reduce() with nogil: - result = move( - cpp_reduce.cpp_reduce( - col.view(), - dereference(c_agg), - data_type.c_obj - ) + result = cpp_reduce.cpp_reduce( + col.view(), + dereference(c_agg), + data_type.c_obj ) return Scalar.from_libcudf(move(result)) @@ -71,12 +69,10 @@ cpdef Column scan(Column col, Aggregation agg, scan_type inclusive): cdef unique_ptr[column] result cdef const scan_aggregation *c_agg = agg.view_underlying_as_scan() with nogil: - result = move( - cpp_reduce.cpp_scan( - col.view(), - dereference(c_agg), - inclusive, - ) + result = cpp_reduce.cpp_scan( + col.view(), + dereference(c_agg), + inclusive, ) return Column.from_libcudf(move(result)) @@ -99,7 +95,7 @@ cpdef tuple minmax(Column col): """ cdef pair[unique_ptr[scalar], unique_ptr[scalar]] result with nogil: - result = move(cpp_reduce.cpp_minmax(col.view())) + result = cpp_reduce.cpp_minmax(col.view()) return ( Scalar.from_libcudf(move(result.first)), diff --git a/python/pylibcudf/pylibcudf/replace.pyx b/python/pylibcudf/pylibcudf/replace.pyx index 115dee132fd..f77eba7ace5 100644 --- a/python/pylibcudf/pylibcudf/replace.pyx +++ b/python/pylibcudf/pylibcudf/replace.pyx @@ -56,28 +56,23 @@ cpdef Column replace_nulls(Column source_column, ReplacementType replacement): if isinstance(replacement, ReplacePolicy): policy = replacement with nogil: - c_result = move( - cpp_replace.replace_nulls(source_column.view(), policy) - ) + c_result = cpp_replace.replace_nulls(source_column.view(), policy) return Column.from_libcudf(move(c_result)) else: raise TypeError("replacement must be a Column, Scalar, or replace_policy") with nogil: if ReplacementType is Column: - c_result = move( - 
cpp_replace.replace_nulls(source_column.view(), replacement.view()) + c_result = cpp_replace.replace_nulls( + source_column.view(), + replacement.view() ) elif ReplacementType is Scalar: - c_result = move( - cpp_replace.replace_nulls( - source_column.view(), dereference(replacement.c_obj) - ) + c_result = cpp_replace.replace_nulls( + source_column.view(), dereference(replacement.c_obj) ) elif ReplacementType is replace_policy: - c_result = move( - cpp_replace.replace_nulls(source_column.view(), replacement) - ) + c_result = cpp_replace.replace_nulls(source_column.view(), replacement) else: assert False, "Internal error. Please contact pylibcudf developers" return Column.from_libcudf(move(c_result)) @@ -109,12 +104,10 @@ cpdef Column find_and_replace_all( """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_replace.find_and_replace_all( - source_column.view(), - values_to_replace.view(), - replacement_values.view(), - ) + c_result = cpp_replace.find_and_replace_all( + source_column.view(), + values_to_replace.view(), + replacement_values.view(), ) return Column.from_libcudf(move(c_result)) @@ -156,22 +149,18 @@ cpdef Column clamp( cdef unique_ptr[column] c_result with nogil: if lo_replace is None: - c_result = move( - cpp_replace.clamp( - source_column.view(), - dereference(lo.c_obj), - dereference(hi.c_obj), - ) + c_result = cpp_replace.clamp( + source_column.view(), + dereference(lo.c_obj), + dereference(hi.c_obj), ) else: - c_result = move( - cpp_replace.clamp( - source_column.view(), - dereference(lo.c_obj), - dereference(hi.c_obj), - dereference(lo_replace.c_obj), - dereference(hi_replace.c_obj), - ) + c_result = cpp_replace.clamp( + source_column.view(), + dereference(lo.c_obj), + dereference(hi.c_obj), + dereference(lo_replace.c_obj), + dereference(hi_replace.c_obj), ) return Column.from_libcudf(move(c_result)) @@ -199,9 +188,7 @@ cpdef Column normalize_nans_and_zeros(Column source_column, bool inplace=False): if inplace: cpp_replace.normalize_nans_and_zeros(source_column.mutable_view()) else: - c_result = move( - cpp_replace.normalize_nans_and_zeros(source_column.view()) - ) + c_result = cpp_replace.normalize_nans_and_zeros(source_column.view()) if not inplace: return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/reshape.pyx b/python/pylibcudf/pylibcudf/reshape.pyx index eb1499ebbea..6540b5198ab 100644 --- a/python/pylibcudf/pylibcudf/reshape.pyx +++ b/python/pylibcudf/pylibcudf/reshape.pyx @@ -38,7 +38,7 @@ cpdef Column interleave_columns(Table source_table): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_interleave_columns(source_table.view())) + c_result = cpp_interleave_columns(source_table.view()) return Column.from_libcudf(move(c_result)) @@ -63,6 +63,6 @@ cpdef Table tile(Table source_table, size_type count): cdef unique_ptr[table] c_result with nogil: - c_result = move(cpp_tile(source_table.view(), count)) + c_result = cpp_tile(source_table.view(), count) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/rolling.pyx b/python/pylibcudf/pylibcudf/rolling.pyx index a46540d7ffa..4fd0b005431 100644 --- a/python/pylibcudf/pylibcudf/rolling.pyx +++ b/python/pylibcudf/pylibcudf/rolling.pyx @@ -49,24 +49,21 @@ cpdef Column rolling_window( cdef const rolling_aggregation *c_agg = agg.view_underlying_as_rolling() if WindowType is Column: with nogil: - result = move( - cpp_rolling.rolling_window( - source.view(), - preceding_window.view(), - following_window.view(), - min_periods, - 
dereference(c_agg), - ) + result = cpp_rolling.rolling_window( + source.view(), + preceding_window.view(), + following_window.view(), + min_periods, + dereference(c_agg), ) else: with nogil: - result = move( - cpp_rolling.rolling_window( - source.view(), - preceding_window, - following_window, - min_periods, - dereference(c_agg), - ) + result = cpp_rolling.rolling_window( + source.view(), + preceding_window, + following_window, + min_periods, + dereference(c_agg), ) + return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/round.pyx b/python/pylibcudf/pylibcudf/round.pyx index dc60d53b07e..689363e652d 100644 --- a/python/pylibcudf/pylibcudf/round.pyx +++ b/python/pylibcudf/pylibcudf/round.pyx @@ -39,12 +39,10 @@ cpdef Column round( """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_round( - source.view(), - decimal_places, - round_method - ) + c_result = cpp_round( + source.view(), + decimal_places, + round_method ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/search.pyx b/python/pylibcudf/pylibcudf/search.pyx index 814bc6553d8..1a870248046 100644 --- a/python/pylibcudf/pylibcudf/search.pyx +++ b/python/pylibcudf/pylibcudf/search.pyx @@ -41,13 +41,11 @@ cpdef Column lower_bound( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_search.lower_bound( - haystack.view(), - needles.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_search.lower_bound( + haystack.view(), + needles.view(), + c_orders, + c_null_precedence, ) return Column.from_libcudf(move(c_result)) @@ -82,13 +80,11 @@ cpdef Column upper_bound( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_search.upper_bound( - haystack.view(), - needles.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_search.upper_bound( + haystack.view(), + needles.view(), + c_orders, + c_null_precedence, ) return Column.from_libcudf(move(c_result)) @@ -112,10 +108,8 @@ cpdef Column contains(Column haystack, Column needles): """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_search.contains( - haystack.view(), - needles.view(), - ) + c_result = cpp_search.contains( + haystack.view(), + needles.view(), ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/sorting.pyx b/python/pylibcudf/pylibcudf/sorting.pyx index 42289d54bca..fc40f03e1fd 100644 --- a/python/pylibcudf/pylibcudf/sorting.pyx +++ b/python/pylibcudf/pylibcudf/sorting.pyx @@ -36,12 +36,10 @@ cpdef Column sorted_order(Table source_table, list column_order, list null_prece cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.sorted_order( - source_table.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.sorted_order( + source_table.view(), + c_orders, + c_null_precedence, ) return Column.from_libcudf(move(c_result)) @@ -74,12 +72,10 @@ cpdef Column stable_sorted_order( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.stable_sorted_order( - source_table.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.stable_sorted_order( + source_table.view(), + c_orders, + c_null_precedence, ) return Column.from_libcudf(move(c_result)) @@ -118,15 
+114,13 @@ cpdef Column rank( """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_sorting.rank( - input_view.view(), - method, - column_order, - null_handling, - null_precedence, - percentage, - ) + c_result = cpp_sorting.rank( + input_view.view(), + method, + column_order, + null_handling, + null_precedence, + percentage, ) return Column.from_libcudf(move(c_result)) @@ -154,12 +148,10 @@ cpdef bool is_sorted(Table tbl, list column_order, list null_precedence): cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.is_sorted( - tbl.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.is_sorted( + tbl.view(), + c_orders, + c_null_precedence, ) return c_result @@ -197,14 +189,12 @@ cpdef Table segmented_sort_by_key( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.segmented_sort_by_key( - values.view(), - keys.view(), - segment_offsets.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.segmented_sort_by_key( + values.view(), + keys.view(), + segment_offsets.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -243,14 +233,12 @@ cpdef Table stable_segmented_sort_by_key( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.stable_segmented_sort_by_key( - values.view(), - keys.view(), - segment_offsets.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.stable_segmented_sort_by_key( + values.view(), + keys.view(), + segment_offsets.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -285,13 +273,11 @@ cpdef Table sort_by_key( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.sort_by_key( - values.view(), - keys.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.sort_by_key( + values.view(), + keys.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -326,13 +312,11 @@ cpdef Table stable_sort_by_key( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.stable_sort_by_key( - values.view(), - keys.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.stable_sort_by_key( + values.view(), + keys.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -360,12 +344,10 @@ cpdef Table sort(Table source_table, list column_order, list null_precedence): cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.sort( - source_table.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.sort( + source_table.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -393,11 +375,9 @@ cpdef Table stable_sort(Table source_table, list column_order, list null_precede cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.stable_sort( - source_table.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.stable_sort( + source_table.view(), + c_orders, + 
c_null_precedence, ) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyx b/python/pylibcudf/pylibcudf/stream_compaction.pyx index d5475ea79d5..2145398a191 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pyx +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyx @@ -44,10 +44,8 @@ cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold): cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.drop_nulls( - source_table.view(), c_keys, keep_threshold - ) + c_result = cpp_stream_compaction.drop_nulls( + source_table.view(), c_keys, keep_threshold ) return Table.from_libcudf(move(c_result)) @@ -74,10 +72,8 @@ cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold): cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.drop_nulls( - source_table.view(), c_keys, keep_threshold - ) + c_result = cpp_stream_compaction.drop_nulls( + source_table.view(), c_keys, keep_threshold ) return Table.from_libcudf(move(c_result)) @@ -101,10 +97,8 @@ cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask): """ cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_stream_compaction.apply_boolean_mask( - source_table.view(), boolean_mask.view() - ) + c_result = cpp_stream_compaction.apply_boolean_mask( + source_table.view(), boolean_mask.view() ) return Table.from_libcudf(move(c_result)) @@ -144,10 +138,8 @@ cpdef Table unique( cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.unique( - input.view(), c_keys, keep, nulls_equal - ) + c_result = cpp_stream_compaction.unique( + input.view(), c_keys, keep, nulls_equal ) return Table.from_libcudf(move(c_result)) @@ -185,10 +177,8 @@ cpdef Table distinct( cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.distinct( - input.view(), c_keys, keep, nulls_equal, nans_equal - ) + c_result = cpp_stream_compaction.distinct( + input.view(), c_keys, keep, nulls_equal, nans_equal ) return Table.from_libcudf(move(c_result)) @@ -221,10 +211,8 @@ cpdef Column distinct_indices( """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_stream_compaction.distinct_indices( - input.view(), keep, nulls_equal, nans_equal - ) + c_result = cpp_stream_compaction.distinct_indices( + input.view(), keep, nulls_equal, nans_equal ) return Column.from_libcudf(move(c_result)) @@ -262,10 +250,8 @@ cpdef Table stable_distinct( cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.stable_distinct( - input.view(), c_keys, keep, nulls_equal, nans_equal - ) + c_result = cpp_stream_compaction.stable_distinct( + input.view(), c_keys, keep, nulls_equal, nans_equal ) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pyx b/python/pylibcudf/pylibcudf/strings/attributes.pyx index 36bee7bd1d9..8e46a32835d 100644 --- a/python/pylibcudf/pylibcudf/strings/attributes.pyx +++ b/python/pylibcudf/pylibcudf/strings/attributes.pyx @@ -25,7 +25,7 @@ cpdef Column count_characters(Column source_strings): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_attributes.count_characters(source_strings.view())) + c_result = 
cpp_attributes.count_characters(source_strings.view()) return Column.from_libcudf(move(c_result)) @@ -48,7 +48,7 @@ cpdef Column count_bytes(Column source_strings): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_attributes.count_bytes(source_strings.view())) + c_result = cpp_attributes.count_bytes(source_strings.view()) return Column.from_libcudf(move(c_result)) @@ -71,6 +71,6 @@ cpdef Column code_points(Column source_strings): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_attributes.code_points(source_strings.view())) + c_result = cpp_attributes.code_points(source_strings.view()) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyx b/python/pylibcudf/pylibcudf/strings/char_types.pyx index 6a24d79bc4b..cb04efe5e8f 100644 --- a/python/pylibcudf/pylibcudf/strings/char_types.pyx +++ b/python/pylibcudf/pylibcudf/strings/char_types.pyx @@ -38,12 +38,10 @@ cpdef Column all_characters_of_type( cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_char_types.all_characters_of_type( - source_strings.view(), - types, - verify_types, - ) + c_result = cpp_char_types.all_characters_of_type( + source_strings.view(), + types, + verify_types, ) return Column.from_libcudf(move(c_result)) @@ -81,13 +79,11 @@ cpdef Column filter_characters_of_type( cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_char_types.filter_characters_of_type( - source_strings.view(), - types_to_remove, - dereference(c_replacement), - types_to_keep, - ) + c_result = cpp_char_types.filter_characters_of_type( + source_strings.view(), + types_to_remove, + dereference(c_replacement), + types_to_keep, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyx b/python/pylibcudf/pylibcudf/strings/contains.pyx index 82bd1fbea32..d4b1130241d 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pyx +++ b/python/pylibcudf/pylibcudf/strings/contains.pyx @@ -38,10 +38,10 @@ cpdef Column contains_re( cdef unique_ptr[column] result with nogil: - result = move(cpp_contains.contains_re( + result = cpp_contains.contains_re( input.view(), prog.c_obj.get()[0] - )) + ) return Column.from_libcudf(move(result)) @@ -71,10 +71,10 @@ cpdef Column count_re( cdef unique_ptr[column] result with nogil: - result = move(cpp_contains.count_re( + result = cpp_contains.count_re( input.view(), prog.c_obj.get()[0] - )) + ) return Column.from_libcudf(move(result)) @@ -105,10 +105,10 @@ cpdef Column matches_re( cdef unique_ptr[column] result with nogil: - result = move(cpp_contains.matches_re( + result = cpp_contains.matches_re( input.view(), prog.c_obj.get()[0] - )) + ) return Column.from_libcudf(move(result)) @@ -149,19 +149,19 @@ cpdef Column like(Column input, ColumnOrScalar pattern, Scalar escape_character= if ColumnOrScalar is Column: with nogil: - result = move(cpp_contains.like( + result = cpp_contains.like( input.view(), pattern.view(), dereference(c_escape_character) - )) + ) elif ColumnOrScalar is Scalar: c_pattern = (pattern.c_obj.get()) with nogil: - result = move(cpp_contains.like( + result = cpp_contains.like( input.view(), dereference(c_pattern), dereference(c_escape_character) - )) + ) else: raise ValueError("pattern must be a Column or a Scalar") diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx index 0c10f821ab6..dc12b291b11 100644 --- 
a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx @@ -39,11 +39,9 @@ cpdef Column to_booleans(Column input, Scalar true_string): ) with nogil: - c_result = move( - cpp_convert_booleans.to_booleans( - input.view(), - dereference(c_true_string) - ) + c_result = cpp_convert_booleans.to_booleans( + input.view(), + dereference(c_true_string) ) return Column.from_libcudf(move(c_result)) @@ -80,12 +78,10 @@ cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_str ) with nogil: - c_result = move( - cpp_convert_booleans.from_booleans( - booleans.view(), - dereference(c_true_string), - dereference(c_false_string), - ) + c_result = cpp_convert_booleans.from_booleans( + booleans.view(), + dereference(c_true_string), + dereference(c_false_string), ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx index 76c5809c3d5..31980ace418 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx @@ -43,12 +43,10 @@ cpdef Column to_durations( cdef string c_format = format.encode() with nogil: - c_result = move( - cpp_convert_durations.to_durations( - input.view(), - duration_type.c_obj, - c_format - ) + c_result = cpp_convert_durations.to_durations( + input.view(), + duration_type.c_obj, + c_format ) return Column.from_libcudf(move(c_result)) @@ -84,11 +82,9 @@ cpdef Column from_durations( cdef string c_format = format.encode() with nogil: - c_result = move( - cpp_convert_durations.from_durations( - durations.view(), - c_format - ) + c_result = cpp_convert_durations.from_durations( + durations.view(), + c_format ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx index 60a8fca8baf..962a47dfadf 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx @@ -33,11 +33,9 @@ cpdef Column to_fixed_point(Column input, DataType output_type): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_fixed_point.to_fixed_point( - input.view(), - output_type.c_obj, - ) + c_result = cpp_fixed_point.to_fixed_point( + input.view(), + output_type.c_obj, ) return Column.from_libcudf(move(c_result)) @@ -62,11 +60,7 @@ cpdef Column from_fixed_point(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_fixed_point.from_fixed_point( - input.view(), - ) - ) + c_result = cpp_fixed_point.from_fixed_point(input.view()) return Column.from_libcudf(move(c_result)) @@ -97,11 +91,9 @@ cpdef Column is_fixed_point(Column input, DataType decimal_type=None): decimal_type = DataType(type_id.DECIMAL64) with nogil: - c_result = move( - cpp_fixed_point.is_fixed_point( - input.view(), - decimal_type.c_obj, - ) + c_result = cpp_fixed_point.is_fixed_point( + input.view(), + decimal_type.c_obj, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx index 8081aadb085..1296f4f9db5 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx 
@@ -33,11 +33,9 @@ cpdef Column to_floats(Column strings, DataType output_type): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_convert_floats.to_floats( - strings.view(), - output_type.c_obj, - ) + c_result = cpp_convert_floats.to_floats( + strings.view(), + output_type.c_obj, ) return Column.from_libcudf(move(c_result)) @@ -63,11 +61,7 @@ cpdef Column from_floats(Column floats): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_convert_floats.from_floats( - floats.view(), - ) - ) + c_result = cpp_convert_floats.from_floats(floats.view()) return Column.from_libcudf(move(c_result)) @@ -92,10 +86,6 @@ cpdef Column is_float(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_convert_floats.is_float( - input.view(), - ) - ) + c_result = cpp_convert_floats.is_float(input.view()) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx index f2a980d4269..834781f95f3 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx @@ -26,11 +26,7 @@ cpdef Column ipv4_to_integers(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_convert_ipv4.ipv4_to_integers( - input.view() - ) - ) + c_result = cpp_convert_ipv4.ipv4_to_integers(input.view()) return Column.from_libcudf(move(c_result)) @@ -54,11 +50,7 @@ cpdef Column integers_to_ipv4(Column integers): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_convert_ipv4.integers_to_ipv4( - integers.view() - ) - ) + c_result = cpp_convert_ipv4.integers_to_ipv4(integers.view()) return Column.from_libcudf(move(c_result)) @@ -83,10 +75,6 @@ cpdef Column is_ipv4(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_convert_ipv4.is_ipv4( - input.view() - ) - ) + c_result = cpp_convert_ipv4.is_ipv4(input.view()) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx index 3fbc08a9ab5..cbfe5f5aa8b 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx @@ -61,12 +61,10 @@ cpdef Column format_list_column( separators = make_empty_column(type_id.STRING) with nogil: - c_result = move( - cpp_convert_lists.format_list_column( - input.view(), - dereference(c_na_rep), - separators.view() - ) + c_result = cpp_convert_lists.format_list_column( + input.view(), + dereference(c_na_rep), + separators.view() ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx index a5e080e53b7..82f8a75f1d9 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx @@ -26,11 +26,7 @@ cpdef Column url_encode(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_convert_urls.url_encode( - input.view() - ) - ) + c_result = cpp_convert_urls.url_encode(input.view()) return Column.from_libcudf(move(c_result)) @@ -54,10 +50,6 @@ cpdef Column url_decode(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_convert_urls.url_decode( - input.view() - ) - ) + c_result = 
cpp_convert_urls.url_decode(input.view()) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyx b/python/pylibcudf/pylibcudf/strings/extract.pyx index dcb11ca10ce..b56eccc8287 100644 --- a/python/pylibcudf/pylibcudf/strings/extract.pyx +++ b/python/pylibcudf/pylibcudf/strings/extract.pyx @@ -33,11 +33,9 @@ cpdef Table extract(Column input, RegexProgram prog): cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_extract.extract( - input.view(), - prog.c_obj.get()[0] - ) + c_result = cpp_extract.extract( + input.view(), + prog.c_obj.get()[0] ) return Table.from_libcudf(move(c_result)) @@ -66,11 +64,9 @@ cpdef Column extract_all_record(Column input, RegexProgram prog): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_extract.extract_all_record( - input.view(), - prog.c_obj.get()[0] - ) + c_result = cpp_extract.extract_all_record( + input.view(), + prog.c_obj.get()[0] ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/find.pyx b/python/pylibcudf/pylibcudf/strings/find.pyx index 22d370bf7e8..6fc6dca24fd 100644 --- a/python/pylibcudf/pylibcudf/strings/find.pyx +++ b/python/pylibcudf/pylibcudf/strings/find.pyx @@ -50,22 +50,18 @@ cpdef Column find( cdef unique_ptr[column] result if ColumnOrScalar is Column: with nogil: - result = move( - cpp_find.find( - input.view(), - target.view(), - start - ) + result = cpp_find.find( + input.view(), + target.view(), + start ) elif ColumnOrScalar is Scalar: with nogil: - result = move( - cpp_find.find( - input.view(), - dereference((target.c_obj.get())), - start, - stop - ) + result = cpp_find.find( + input.view(), + dereference((target.c_obj.get())), + start, + stop ) else: raise ValueError(f"Invalid target {target}") @@ -104,13 +100,11 @@ cpdef Column rfind( """ cdef unique_ptr[column] result with nogil: - result = move( - cpp_find.rfind( - input.view(), - dereference((target.c_obj.get())), - start, - stop - ) + result = cpp_find.rfind( + input.view(), + dereference((target.c_obj.get())), + start, + stop ) return Column.from_libcudf(move(result)) @@ -149,19 +143,15 @@ cpdef Column contains( cdef unique_ptr[column] result if ColumnOrScalar is Column: with nogil: - result = move( - cpp_find.contains( - input.view(), - target.view() - ) + result = cpp_find.contains( + input.view(), + target.view() ) elif ColumnOrScalar is Scalar: with nogil: - result = move( - cpp_find.contains( - input.view(), - dereference((target.c_obj.get())) - ) + result = cpp_find.contains( + input.view(), + dereference((target.c_obj.get())) ) else: raise ValueError(f"Invalid target {target}") @@ -204,19 +194,15 @@ cpdef Column starts_with( if ColumnOrScalar is Column: with nogil: - result = move( - cpp_find.starts_with( - input.view(), - target.view() - ) + result = cpp_find.starts_with( + input.view(), + target.view() ) elif ColumnOrScalar is Scalar: with nogil: - result = move( - cpp_find.starts_with( - input.view(), - dereference((target.c_obj.get())) - ) + result = cpp_find.starts_with( + input.view(), + dereference((target.c_obj.get())) ) else: raise ValueError(f"Invalid target {target}") @@ -256,19 +242,15 @@ cpdef Column ends_with( cdef unique_ptr[column] result if ColumnOrScalar is Column: with nogil: - result = move( - cpp_find.ends_with( - input.view(), - target.view() - ) + result = cpp_find.ends_with( + input.view(), + target.view() ) elif ColumnOrScalar is Scalar: with nogil: - result = move( - cpp_find.ends_with( - input.view(), - 
dereference((target.c_obj.get())) - ) + result = cpp_find.ends_with( + input.view(), + dereference((target.c_obj.get())) ) else: raise ValueError(f"Invalid target {target}") diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx index 413fc1cb79d..672aa606bd0 100644 --- a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx @@ -29,11 +29,9 @@ cpdef Column find_multiple(Column input, Column targets): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_find_multiple.find_multiple( - input.view(), - targets.view() - ) + c_result = cpp_find_multiple.find_multiple( + input.view(), + targets.view() ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyx b/python/pylibcudf/pylibcudf/strings/findall.pyx index 5212dc4594d..89fa4302824 100644 --- a/python/pylibcudf/pylibcudf/strings/findall.pyx +++ b/python/pylibcudf/pylibcudf/strings/findall.pyx @@ -30,11 +30,9 @@ cpdef Column findall(Column input, RegexProgram pattern): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_findall.findall( - input.view(), - pattern.c_obj.get()[0] - ) + c_result = cpp_findall.findall( + input.view(), + pattern.c_obj.get()[0] ) return Column.from_libcudf(move(c_result)) @@ -62,11 +60,9 @@ cpdef Column find_re(Column input, RegexProgram pattern): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_findall.find_re( - input.view(), - pattern.c_obj.get()[0] - ) + c_result = cpp_findall.find_re( + input.view(), + pattern.c_obj.get()[0] ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/padding.pyx b/python/pylibcudf/pylibcudf/strings/padding.pyx index 24daaaa3838..f6950eecf60 100644 --- a/python/pylibcudf/pylibcudf/strings/padding.pyx +++ b/python/pylibcudf/pylibcudf/strings/padding.pyx @@ -33,13 +33,11 @@ cpdef Column pad(Column input, size_type width, side_type side, str fill_char): cdef string c_fill_char = fill_char.encode("utf-8") with nogil: - c_result = move( - cpp_padding.pad( - input.view(), - width, - side, - c_fill_char, - ) + c_result = cpp_padding.pad( + input.view(), + width, + side, + c_fill_char, ) return Column.from_libcudf(move(c_result)) @@ -65,11 +63,9 @@ cpdef Column zfill(Column input, size_type width): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_padding.zfill( - input.view(), - width, - ) + c_result = cpp_padding.zfill( + input.view(), + width, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyx b/python/pylibcudf/pylibcudf/strings/repeat.pyx index 5f627218f6e..fb2bb13c666 100644 --- a/python/pylibcudf/pylibcudf/strings/repeat.pyx +++ b/python/pylibcudf/pylibcudf/strings/repeat.pyx @@ -31,19 +31,15 @@ cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times): if ColumnorSizeType is Column: with nogil: - c_result = move( - cpp_repeat.repeat_strings( - input.view(), - repeat_times.view() - ) + c_result = cpp_repeat.repeat_strings( + input.view(), + repeat_times.view() ) elif ColumnorSizeType is size_type: with nogil: - c_result = move( - cpp_repeat.repeat_strings( - input.view(), - repeat_times - ) + c_result = cpp_repeat.repeat_strings( + input.view(), + repeat_times ) else: raise ValueError("repeat_times must be size_type or integer") diff --git a/python/pylibcudf/pylibcudf/strings/replace.pyx b/python/pylibcudf/pylibcudf/strings/replace.pyx 
index 9d0ebf4a814..6db7f04fcbb 100644 --- a/python/pylibcudf/pylibcudf/strings/replace.pyx +++ b/python/pylibcudf/pylibcudf/strings/replace.pyx @@ -55,12 +55,12 @@ cpdef Column replace( repl_str = (repl.c_obj.get()) with nogil: - c_result = move(cpp_replace( + c_result = cpp_replace( input.view(), target_str[0], repl_str[0], maxrepl, - )) + ) return Column.from_libcudf(move(c_result)) @@ -98,11 +98,11 @@ cpdef Column replace_multiple( cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_replace_multiple( + c_result = cpp_replace_multiple( input.view(), target.view(), repl.view(), - )) + ) return Column.from_libcudf(move(c_result)) @@ -151,11 +151,11 @@ cpdef Column replace_slice( cdef const string_scalar* scalar_str = (repl.c_obj.get()) with nogil: - c_result = move(cpp_replace_slice( + c_result = cpp_replace_slice( input.view(), scalar_str[0], start, stop - )) + ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyx b/python/pylibcudf/pylibcudf/strings/split/partition.pyx index ecc959e65b0..0fb4f186c41 100644 --- a/python/pylibcudf/pylibcudf/strings/split/partition.pyx +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyx @@ -45,11 +45,9 @@ cpdef Table partition(Column input, Scalar delimiter=None): ) with nogil: - c_result = move( - cpp_partition.partition( - input.view(), - dereference(c_delimiter) - ) + c_result = cpp_partition.partition( + input.view(), + dereference(c_delimiter) ) return Table.from_libcudf(move(c_result)) @@ -85,11 +83,9 @@ cpdef Table rpartition(Column input, Scalar delimiter=None): ) with nogil: - c_result = move( - cpp_partition.rpartition( - input.view(), - dereference(c_delimiter) - ) + c_result = cpp_partition.rpartition( + input.view(), + dereference(c_delimiter) ) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyx b/python/pylibcudf/pylibcudf/strings/split/split.pyx index a7d7f39fc47..e3827f6645e 100644 --- a/python/pylibcudf/pylibcudf/strings/split/split.pyx +++ b/python/pylibcudf/pylibcudf/strings/split/split.pyx @@ -44,12 +44,10 @@ cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit): ) with nogil: - c_result = move( - cpp_split.split( - strings_column.view(), - dereference(c_delimiter), - maxsplit, - ) + c_result = cpp_split.split( + strings_column.view(), + dereference(c_delimiter), + maxsplit, ) return Table.from_libcudf(move(c_result)) @@ -85,12 +83,10 @@ cpdef Table rsplit(Column strings_column, Scalar delimiter, size_type maxsplit): ) with nogil: - c_result = move( - cpp_split.rsplit( - strings_column.view(), - dereference(c_delimiter), - maxsplit, - ) + c_result = cpp_split.rsplit( + strings_column.view(), + dereference(c_delimiter), + maxsplit, ) return Table.from_libcudf(move(c_result)) @@ -124,12 +120,10 @@ cpdef Column split_record(Column strings, Scalar delimiter, size_type maxsplit): ) with nogil: - c_result = move( - cpp_split.split_record( - strings.view(), - dereference(c_delimiter), - maxsplit, - ) + c_result = cpp_split.split_record( + strings.view(), + dereference(c_delimiter), + maxsplit, ) return Column.from_libcudf(move(c_result)) @@ -165,12 +159,10 @@ cpdef Column rsplit_record(Column strings, Scalar delimiter, size_type maxsplit) ) with nogil: - c_result = move( - cpp_split.rsplit_record( - strings.view(), - dereference(c_delimiter), - maxsplit, - ) + c_result = cpp_split.rsplit_record( + strings.view(), + dereference(c_delimiter), + maxsplit, ) return 
Column.from_libcudf(move(c_result)) @@ -203,12 +195,10 @@ cpdef Table split_re(Column input, RegexProgram prog, size_type maxsplit): cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_split.split_re( - input.view(), - prog.c_obj.get()[0], - maxsplit, - ) + c_result = cpp_split.split_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, ) return Table.from_libcudf(move(c_result)) @@ -241,12 +231,10 @@ cpdef Table rsplit_re(Column input, RegexProgram prog, size_type maxsplit): cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_split.rsplit_re( - input.view(), - prog.c_obj.get()[0], - maxsplit, - ) + c_result = cpp_split.rsplit_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, ) return Table.from_libcudf(move(c_result)) @@ -278,12 +266,10 @@ cpdef Column split_record_re(Column input, RegexProgram prog, size_type maxsplit cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_split.split_record_re( - input.view(), - prog.c_obj.get()[0], - maxsplit, - ) + c_result = cpp_split.split_record_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, ) return Column.from_libcudf(move(c_result)) @@ -315,12 +301,10 @@ cpdef Column rsplit_record_re(Column input, RegexProgram prog, size_type maxspli cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_split.rsplit_record_re( - input.view(), - prog.c_obj.get()[0], - maxsplit, - ) + c_result = cpp_split.rsplit_record_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyx b/python/pylibcudf/pylibcudf/strings/translate.pyx index a62c7ec4528..d85da8e6cdd 100644 --- a/python/pylibcudf/pylibcudf/strings/translate.pyx +++ b/python/pylibcudf/pylibcudf/strings/translate.pyx @@ -62,11 +62,9 @@ cpdef Column translate(Column input, dict chars_table): ) with nogil: - c_result = move( - cpp_translate.translate( - input.view(), - c_chars_table - ) + c_result = cpp_translate.translate( + input.view(), + c_chars_table ) return Column.from_libcudf(move(c_result)) @@ -111,12 +109,10 @@ cpdef Column filter_characters( ) with nogil: - c_result = move( - cpp_translate.filter_characters( - input.view(), - c_characters_to_filter, - keep_characters, - dereference(c_replacement), - ) + c_result = cpp_translate.filter_characters( + input.view(), + c_characters_to_filter, + keep_characters, + dereference(c_replacement), ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pyx b/python/pylibcudf/pylibcudf/strings/wrap.pyx index 11e31f54eee..2ced250f837 100644 --- a/python/pylibcudf/pylibcudf/strings/wrap.pyx +++ b/python/pylibcudf/pylibcudf/strings/wrap.pyx @@ -32,11 +32,9 @@ cpdef Column wrap(Column input, size_type width): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_wrap.wrap( - input.view(), - width, - ) + c_result = cpp_wrap.wrap( + input.view(), + width, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/table.pyx b/python/pylibcudf/pylibcudf/table.pyx index 5f77b89a605..d0d6f2343d0 100644 --- a/python/pylibcudf/pylibcudf/table.pyx +++ b/python/pylibcudf/pylibcudf/table.pyx @@ -49,9 +49,7 @@ cdef class Table: calling libcudf algorithms, and should generally not be needed by users (even direct pylibcudf Cython users). 
""" - cdef vector[unique_ptr[column]] c_columns = move( - dereference(libcudf_tbl).release() - ) + cdef vector[unique_ptr[column]] c_columns = dereference(libcudf_tbl).release() cdef vector[unique_ptr[column]].size_type i return Table([ diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx index 74134caeb78..bce9702752a 100644 --- a/python/pylibcudf/pylibcudf/transform.pyx +++ b/python/pylibcudf/pylibcudf/transform.pyx @@ -35,7 +35,7 @@ cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): cdef pair[unique_ptr[device_buffer], size_type] c_result with nogil: - c_result = move(cpp_transform.nans_to_nulls(input.view())) + c_result = cpp_transform.nans_to_nulls(input.view()) return ( gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), @@ -59,7 +59,7 @@ cpdef tuple[gpumemoryview, int] bools_to_mask(Column input): cdef pair[unique_ptr[device_buffer], size_type] c_result with nogil: - c_result = move(cpp_transform.bools_to_mask(input.view())) + c_result = cpp_transform.bools_to_mask(input.view()) return ( gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), @@ -88,7 +88,7 @@ cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit): cdef bitmask_type * bitmask_ptr = int_to_bitmask_ptr(bitmask) with nogil: - c_result = move(cpp_transform.mask_to_bools(bitmask_ptr, begin_bit, end_bit)) + c_result = cpp_transform.mask_to_bools(bitmask_ptr, begin_bit, end_bit) return Column.from_libcudf(move(c_result)) @@ -119,10 +119,8 @@ cpdef Column transform(Column input, str unary_udf, DataType output_type, bool i cdef bool c_is_ptx = is_ptx with nogil: - c_result = move( - cpp_transform.transform( - input.view(), c_unary_udf, output_type.c_obj, c_is_ptx - ) + c_result = cpp_transform.transform( + input.view(), c_unary_udf, output_type.c_obj, c_is_ptx ) return Column.from_libcudf(move(c_result)) @@ -144,7 +142,7 @@ cpdef tuple[Table, Column] encode(Table input): cdef pair[unique_ptr[table], unique_ptr[column]] c_result with nogil: - c_result = move(cpp_transform.encode(input.view())) + c_result = cpp_transform.encode(input.view()) return ( Table.from_libcudf(move(c_result.first)), @@ -172,7 +170,7 @@ cpdef Table one_hot_encode(Column input, Column categories): cdef Table owner_table with nogil: - c_result = move(cpp_transform.one_hot_encode(input.view(), categories.view())) + c_result = cpp_transform.one_hot_encode(input.view(), categories.view()) owner_table = Table( [Column.from_libcudf(move(c_result.first))] * c_result.second.num_columns() diff --git a/python/pylibcudf/pylibcudf/transpose.pyx b/python/pylibcudf/pylibcudf/transpose.pyx index a708f6cc37f..a24f937ced3 100644 --- a/python/pylibcudf/pylibcudf/transpose.pyx +++ b/python/pylibcudf/pylibcudf/transpose.pyx @@ -29,7 +29,7 @@ cpdef Table transpose(Table input_table): cdef Table owner_table with nogil: - c_result = move(cpp_transpose.transpose(input_table.view())) + c_result = cpp_transpose.transpose(input_table.view()) owner_table = Table( [Column.from_libcudf(move(c_result.first))] * c_result.second.num_columns() diff --git a/python/pylibcudf/pylibcudf/unary.pyx b/python/pylibcudf/pylibcudf/unary.pyx index 839360ef406..53e8c382b5e 100644 --- a/python/pylibcudf/pylibcudf/unary.pyx +++ b/python/pylibcudf/pylibcudf/unary.pyx @@ -34,7 +34,7 @@ cpdef Column unary_operation(Column input, unary_operator op): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.unary_operation(input.view(), op)) + result = cpp_unary.unary_operation(input.view(), 
op) return Column.from_libcudf(move(result)) @@ -57,7 +57,7 @@ cpdef Column is_null(Column input): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.is_null(input.view())) + result = cpp_unary.is_null(input.view()) return Column.from_libcudf(move(result)) @@ -80,7 +80,7 @@ cpdef Column is_valid(Column input): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.is_valid(input.view())) + result = cpp_unary.is_valid(input.view()) return Column.from_libcudf(move(result)) @@ -105,7 +105,7 @@ cpdef Column cast(Column input, DataType data_type): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.cast(input.view(), data_type.c_obj)) + result = cpp_unary.cast(input.view(), data_type.c_obj) return Column.from_libcudf(move(result)) @@ -128,7 +128,7 @@ cpdef Column is_nan(Column input): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.is_nan(input.view())) + result = cpp_unary.is_nan(input.view()) return Column.from_libcudf(move(result)) @@ -151,7 +151,7 @@ cpdef Column is_not_nan(Column input): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.is_not_nan(input.view())) + result = cpp_unary.is_not_nan(input.view()) return Column.from_libcudf(move(result)) From f1cbbcc1c8586bf68403f9abbc1a38fc527bdef4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 16 Oct 2024 12:16:54 -0700 Subject: [PATCH 073/117] Reenable huge pages for arrow host copying (#17097) It is unclear whether the performance gains here are entirely from huge pages themselves or whether invoking madvise with huge pages is primarily serving to trigger an eager population of the pages (huge or not). We attempted to provide alternate flags to `madvise` like `MADV_WILLNEED` and that was not sufficient to recover performance, so either huge pages themselves are doing something special or specifying huge pages is causing `madvise` to trigger a page migration that no other flag does. In any case, this change returns us to the performance before the switch to the C data interface, and this code is lifted straight out of our old implementation so I am comfortable making use of it and knowing that it is not problematic. We should explore further optimizations in this direction, though. Resolves #17075. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/17097 --- cpp/src/interop/to_arrow_host.cu | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/cpp/src/interop/to_arrow_host.cu b/cpp/src/interop/to_arrow_host.cu index 79fb7550044..8ec0904f1ba 100644 --- a/cpp/src/interop/to_arrow_host.cu +++ b/cpp/src/interop/to_arrow_host.cu @@ -44,6 +44,7 @@ #include #include #include +#include #include @@ -52,6 +53,30 @@ namespace detail { namespace { +/* + Enable Transparent Huge Pages (THP) for large (>4MB) allocations. + `buf` is returned untouched. + Enabling THP can improve performance of device-host memory transfers + significantly, see . 
+*/
+void enable_hugepage(ArrowBuffer* buffer)
+{
+  if (buffer->size_bytes < (1u << 22u)) {  // Smaller than 4 MB
+    return;
+  }
+
+#ifdef MADV_HUGEPAGE
+  auto const pagesize = sysconf(_SC_PAGESIZE);
+  void* addr          = const_cast<uint8_t*>(buffer->data);
+  auto length{static_cast<std::size_t>(buffer->size_bytes)};
+  if (std::align(pagesize, pagesize, addr, length)) {
+    // Intentionally not checking for errors that may be returned by older kernel versions;
+    // optimistically tries enabling huge pages.
+    madvise(addr, length, MADV_HUGEPAGE);
+  }
+#endif
+}
+
 struct dispatch_to_arrow_host {
   cudf::column_view column;
   rmm::cuda_stream_view stream;
@@ -62,6 +87,7 @@ struct dispatch_to_arrow_host {
     if (!column.has_nulls()) { return NANOARROW_OK; }
 
     NANOARROW_RETURN_NOT_OK(ArrowBitmapResize(bitmap, static_cast<int64_t>(column.size()), 0));
+    enable_hugepage(&bitmap->buffer);
     CUDF_CUDA_TRY(cudaMemcpyAsync(bitmap->buffer.data,
                                   (column.offset() > 0)
                                     ? cudf::detail::copy_bitmask(column, stream, mr).data()
@@ -76,6 +102,7 @@ struct dispatch_to_arrow_host {
   int populate_data_buffer(device_span<T const> input, ArrowBuffer* buffer) const
   {
     NANOARROW_RETURN_NOT_OK(ArrowBufferResize(buffer, input.size_bytes(), 1));
+    enable_hugepage(buffer);
     CUDF_CUDA_TRY(cudaMemcpyAsync(
       buffer->data, input.data(), input.size_bytes(), cudaMemcpyDefault, stream.value()));
     return NANOARROW_OK;

From b513df8a0e077f89fb50ee6e1b3a11d316b8a63a Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Wed, 16 Oct 2024 16:08:22 -0500
Subject: [PATCH 074/117] Include timezone file path in error message (#17102)

Resolves https://github.com/rapidsai/cudf/issues/8795. Also needed for
https://github.com/rapidsai/cudf/pull/16998.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17102
---
 cpp/src/datetime/timezone.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp
index a6b6cbbf0b5..2196ee97fee 100644
--- a/cpp/src/datetime/timezone.cpp
+++ b/cpp/src/datetime/timezone.cpp
@@ -138,7 +138,7 @@ struct timezone_file {
     std::filesystem::path{tzif_dir.value_or(tzif_system_directory)} / timezone_name;
   std::ifstream fin;
   fin.open(tz_filename, ios_base::in | ios_base::binary | ios_base::ate);
-  CUDF_EXPECTS(fin, "Failed to open the timezone file.");
+  CUDF_EXPECTS(fin, "Failed to open the timezone file '" + tz_filename.string() + "'");
   auto const file_size = fin.tellg();
   fin.seekg(0);

From c9202a0797c1b23f02edbdef34d292ebfd74117f Mon Sep 17 00:00:00 2001
From: Hirota Akio <33370421+a-hirota@users.noreply.github.com>
Date: Thu, 17 Oct 2024 07:38:26 +0900
Subject: [PATCH 075/117] bug fix: use `self.ck_consumer` in `poll` method of
 kafka.py to align with `__init__` (#17044)

Updated the `poll` method in `kafka.py` to use `self.ck_consumer.poll(timeout)`
instead of `self.ck.poll(timeout)`. This change ensures consistency with the
`__init__` method where `self.ck_consumer` is initialized.
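For illustration, a minimal sketch of the failure mode (the class body is
abbreviated from `kafka.py`; only the attribute binding and `poll` are shown):

    import confluent_kafka as ck

    class Consumer:
        def __init__(self, kafka_configs):
            # The consumer object is bound under the name `ck_consumer` ...
            self.ck_consumer = ck.Consumer(kafka_configs)

        def poll(self, timeout=None):
            # ... so reading `self.ck` here raised AttributeError at call time;
            # the fix reads the attribute under the name bound in __init__.
            return self.ck_consumer.poll(timeout)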
Authors:
  - Hirota Akio (https://github.com/a-hirota)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17044
---
 python/custreamz/custreamz/kafka.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/custreamz/custreamz/kafka.py b/python/custreamz/custreamz/kafka.py
index 0def0ba746e..4cbd7244751 100644
--- a/python/custreamz/custreamz/kafka.py
+++ b/python/custreamz/custreamz/kafka.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 import confluent_kafka as ck
 from cudf_kafka._lib.kafka import KafkaDatasource
@@ -288,4 +288,4 @@ def poll(self, timeout=None):
             (default: infinite (None translated into -1 in the library)).
             (Seconds)
         """
-        return self.ck.poll(timeout)
+        return self.ck_consumer.poll(timeout)

From 5f863a52910e61722ca0146cf6e513d7632a612f Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Wed, 16 Oct 2024 20:56:13 -0700
Subject: [PATCH 076/117] Implement batch construction for strings columns
 (#17035)

This implements batch construction of strings columns, allowing a large number
of strings columns to be created at once with minimal kernel-launch and
stream-synchronization overhead. There should be only one stream sync in the
entire column construction process.

Benchmark: https://github.com/rapidsai/cudf/pull/17035#issuecomment-2406522808

Closes https://github.com/rapidsai/cudf/issues/16486.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/17035
---
 cpp/benchmarks/CMakeLists.txt                |   1 +
 cpp/benchmarks/string/make_strings_column.cu | 100 +++++++++
 cpp/include/cudf/column/column_factories.hpp |  20 ++
 .../cudf/strings/detail/strings_children.cuh |  58 +++++
 .../detail/strings_column_factories.cuh      |  46 +---
 cpp/src/strings/strings_column_factories.cu  | 183 +++++++++++++--
 cpp/tests/CMakeLists.txt                     |   1 +
 cpp/tests/streams/strings/factory_test.cpp   |  67 ++++++
 cpp/tests/strings/factories_test.cu          | 209 +++++++++++++++++-
 9 files changed, 622 insertions(+), 63 deletions(-)
 create mode 100644 cpp/tests/streams/strings/factory_test.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index d6fc5dc6039..e61a8e6e1e6 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -384,6 +384,7 @@ ConfigureNVBench(
   string/join_strings.cpp
   string/lengths.cpp
   string/like.cpp
+  string/make_strings_column.cu
   string/replace_re.cpp
   string/reverse.cpp
   string/slice.cpp

diff --git a/cpp/benchmarks/string/make_strings_column.cu b/cpp/benchmarks/string/make_strings_column.cu
new file mode 100644
index 00000000000..e86824b9f40
--- /dev/null
+++ b/cpp/benchmarks/string/make_strings_column.cu
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include +#include + +#include +#include + +#include + +#include + +namespace { + +constexpr int min_row_width = 0; +constexpr int max_row_width = 50; + +using string_index_pair = thrust::pair; + +template +std::vector> make_strings_columns( + std::vector> const& input, + rmm::cuda_stream_view stream) +{ + if constexpr (batch_construction) { + return cudf::make_strings_column_batch(input, stream); + } else { + std::vector> output; + output.reserve(input.size()); + for (auto const& column_input : input) { + output.emplace_back(cudf::make_strings_column(column_input, stream)); + } + return output; + } +} + +} // namespace + +static void BM_make_strings_column_batch(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const batch_size = static_cast(state.get_int64("batch_size")); + auto const has_nulls = true; + + data_profile const table_profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_row_width, max_row_width) + .null_probability(has_nulls ? std::optional{0.1} : std::nullopt); + auto const data_table = create_random_table( + cycle_dtypes({cudf::type_id::STRING}, batch_size), row_count{num_rows}, table_profile); + + auto const stream = cudf::get_default_stream(); + auto input_data = std::vector>{}; + auto input = std::vector>{}; + input_data.reserve(batch_size); + input.reserve(batch_size); + for (auto const& cv : data_table->view()) { + auto const d_data_ptr = cudf::column_device_view::create(cv, stream); + auto batch_input = rmm::device_uvector(cv.size(), stream); + thrust::tabulate(rmm::exec_policy(stream), + batch_input.begin(), + batch_input.end(), + [data_col = *d_data_ptr] __device__(auto const idx) { + if (data_col.is_null(idx)) { return string_index_pair{nullptr, 0}; } + auto const row = data_col.element(idx); + return string_index_pair{row.data(), row.size_bytes()}; + }); + input_data.emplace_back(std::move(batch_input)); + input.emplace_back(input_data.back()); + } + + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + [[maybe_unused]] auto const output = make_strings_columns(input, stream); + }); +} + +NVBENCH_BENCH(BM_make_strings_column_batch) + .set_name("make_strings_column_batch") + .add_int64_axis("num_rows", {100'000, 500'000, 1'000'000, 2'000'000}) + .add_int64_axis("batch_size", {10, 20, 50, 100}); diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index c3b68b52c36..6bbe32de134 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -378,6 +378,26 @@ std::unique_ptr make_strings_column( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Construct a batch of STRING type columns given an array of device spans of pointer/size + * pairs. + * + * This function has input/output expectation similar to the `make_strings_column()` API that + * accepts only one device span of pointer/size pairs. The difference is that, this is designed to + * create many strings columns at once with minimal overhead of multiple kernel launches and + * stream synchronizations. 
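+ *
+ * A brief usage sketch (modeled on the unit tests added later in this patch;
+ * `d_input` is assumed to be an already-populated device vector of pointer/size
+ * pairs, and `num_columns` is illustrative):
+ * @code
+ * std::vector<cudf::device_span<thrust::pair<char const*, size_type> const>> input(
+ *   num_columns,
+ *   cudf::device_span<thrust::pair<char const*, size_type> const>{d_input.data(), d_input.size()});
+ * auto columns = cudf::make_strings_column_batch(input, stream);
+ * @endcode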
+ *
+ * @param input Array of device spans of pointer/size pairs, where each pointer is a device memory
+ * address or `nullptr` (indicating a null string), and size is string length (in bytes)
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used for memory allocation of the output columns
+ * @return Array of constructed strings columns
+ */
+std::vector<std::unique_ptr<column>> make_strings_column_batch(
+  std::vector<cudf::device_span<thrust::pair<char const*, size_type> const>> const& input,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /**
  * @brief Construct a STRING type column given a device span of string_view.
  *

diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh
index 1283226879b..fb0b25cf9f1 100644
--- a/cpp/include/cudf/strings/detail/strings_children.cuh
+++ b/cpp/include/cudf/strings/detail/strings_children.cuh
@@ -17,6 +17,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -29,6 +30,8 @@
 #include
 #include
+#include
+#include
 #include
 #include

@@ -38,6 +41,61 @@ namespace cudf {
 namespace strings {
 namespace detail {

+/**
+ * @brief Gather characters to create a strings column using the given string-index pair iterator
+ *
+ * @tparam IndexPairIterator iterator over type `pair<char const*, size_type>` values
+ *
+ * @param offsets The offsets for the output strings column
+ * @param chars_size The size (in bytes) of the chars data
+ * @param begin Iterator to the first string-index pair
+ * @param strings_count The number of strings
+ * @param stream CUDA stream used for device memory operations
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return An array of chars gathered from the input string-index pair iterator
+ */
+template <typename IndexPairIterator>
+rmm::device_uvector<char> make_chars_buffer(column_view const& offsets,
+                                            int64_t chars_size,
+                                            IndexPairIterator begin,
+                                            size_type strings_count,
+                                            rmm::cuda_stream_view stream,
+                                            rmm::device_async_resource_ref mr)
+{
+  auto chars_data      = rmm::device_uvector<char>(chars_size, stream, mr);
+  auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets);
+
+  auto const src_ptrs = cudf::detail::make_counting_transform_iterator(
+    0u, cuda::proclaim_return_type<void*>([begin] __device__(uint32_t idx) {
+      // Due to a bug in cub (https://github.com/NVIDIA/cccl/issues/586),
+      // we have to use `const_cast` to remove `const` qualifier from the source pointer.
+      // This should be fine as long as we only read but not write anything to the source.
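+      // These per-row source pointers, together with `dst_ptrs` and `src_sizes` below,
+      // feed a single cub::DeviceMemcpy::Batched call in place of one copy per row.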
+ return reinterpret_cast(const_cast(begin[idx].first)); + })); + auto const src_sizes = cudf::detail::make_counting_transform_iterator( + 0u, cuda::proclaim_return_type([begin] __device__(uint32_t idx) { + return begin[idx].second; + })); + auto const dst_ptrs = cudf::detail::make_counting_transform_iterator( + 0u, + cuda::proclaim_return_type([offsets = d_offsets, output = chars_data.data()] __device__( + uint32_t idx) { return output + offsets[idx]; })); + + size_t temp_storage_bytes = 0; + CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched( + nullptr, temp_storage_bytes, src_ptrs, dst_ptrs, src_sizes, strings_count, stream.value())); + rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); + CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched(d_temp_storage.data(), + temp_storage_bytes, + src_ptrs, + dst_ptrs, + src_sizes, + strings_count, + stream.value())); + + return chars_data; +} + /** * @brief Create an offsets column to be a child of a compound column * diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index 6b1b453a752..03240f418fe 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -49,16 +49,6 @@ namespace detail { */ using string_index_pair = thrust::pair; -/** - * @brief Average string byte-length threshold for deciding character-level - * vs. row-level parallel algorithm. - * - * This value was determined by running the factory_benchmark against different - * string lengths and observing the point where the performance is faster for - * long strings. - */ -constexpr size_type FACTORY_BYTES_PER_ROW_THRESHOLD = 64; - /** * @brief Create a strings-type column from iterators of pointer/size pairs * @@ -88,8 +78,6 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer); auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column( offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto const d_offsets = - cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); // create null mask auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; @@ -99,38 +87,8 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, (null_count > 0) ? 
std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr}; // build chars column - auto chars_data = [d_offsets, bytes = bytes, begin, strings_count, null_count, stream, mr] { - auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1); - // use a character-parallel kernel for long string lengths - if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) { - auto const str_begin = thrust::make_transform_iterator( - begin, cuda::proclaim_return_type([] __device__(auto ip) { - return string_view{ip.first, ip.second}; - })); - - return gather_chars(str_begin, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_offsets, - bytes, - stream, - mr); - } else { - // this approach is 2-3x faster for a large number of smaller string lengths - auto chars_data = rmm::device_uvector(bytes, stream, mr); - auto d_chars = chars_data.data(); - auto copy_chars = [d_chars] __device__(auto item) { - string_index_pair const str = thrust::get<0>(item); - int64_t const offset = thrust::get<1>(item); - if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second); - }; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_zip_iterator(thrust::make_tuple(begin, d_offsets)), - strings_count, - copy_chars); - return chars_data; - } - }(); + auto chars_data = + make_chars_buffer(offsets_column->view(), bytes, begin, strings_count, stream, mr); return make_strings_column(strings_count, std::move(offsets_column), diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index 07516f91dcf..8e00a29f8e9 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -16,36 +16,171 @@ #include #include +#include #include +#include +#include #include -#include -#include #include -#include #include #include +#include #include +#include #include #include +#include +#include namespace cudf { +namespace strings::detail { + namespace { -struct string_view_to_pair { - string_view null_placeholder; - string_view_to_pair(string_view n) : null_placeholder(n) {} - __device__ thrust::pair operator()(string_view const& i) - { - return (i.data() == null_placeholder.data()) - ? thrust::pair{nullptr, 0} - : thrust::pair{i.data(), i.size_bytes()}; + +using column_string_pairs = cudf::device_span; + +template +std::pair>, rmm::device_uvector> +make_offsets_child_column_batch_async(std::vector const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const num_columns = input.size(); + std::vector> offsets_columns(num_columns); + rmm::device_uvector chars_sizes(num_columns, stream); + for (std::size_t idx = 0; idx < num_columns; ++idx) { + auto const string_pairs = input[idx]; + auto const string_count = static_cast(string_pairs.size()); + auto offsets = make_numeric_column( + data_type{type_to_id()}, string_count + 1, mask_state::UNALLOCATED, stream, mr); + + auto const offsets_transformer = cuda::proclaim_return_type( + [string_count, string_pairs = string_pairs.data()] __device__(size_type idx) -> size_type { + return idx < string_count ? 
string_pairs[idx].second : size_type{0}; + }); + auto const input_it = cudf::detail::make_counting_transform_iterator(0, offsets_transformer); + auto const d_offsets = offsets->mutable_view().template data(); + auto const output_it = cudf::detail::make_sizes_to_offsets_iterator( + d_offsets, d_offsets + string_count + 1, chars_sizes.data() + idx); + thrust::exclusive_scan(rmm::exec_policy_nosync(stream), + input_it, + input_it + string_count + 1, + output_it, + int64_t{0}); + offsets_columns[idx] = std::move(offsets); } -}; + + return {std::move(offsets_columns), std::move(chars_sizes)}; +} } // namespace +std::vector> make_strings_column_batch( + std::vector const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const num_columns = input.size(); + + auto [offsets_cols, d_chars_sizes] = + make_offsets_child_column_batch_async(input, stream, mr); + + std::vector null_masks; + null_masks.reserve(num_columns); + + rmm::device_uvector d_valid_counts(num_columns, stream, mr); + thrust::uninitialized_fill( + rmm::exec_policy_nosync(stream), d_valid_counts.begin(), d_valid_counts.end(), 0); + + for (std::size_t idx = 0; idx < num_columns; ++idx) { + auto const& string_pairs = input[idx]; + auto const string_count = static_cast(string_pairs.size()); + null_masks.emplace_back( + cudf::create_null_mask(string_count, mask_state::UNINITIALIZED, stream, mr)); + + if (string_count == 0) { continue; } + + constexpr size_type block_size{256}; + auto const grid = + cudf::detail::grid_1d{static_cast(string_count), block_size}; + cudf::detail::valid_if_kernel + <<>>( + reinterpret_cast(null_masks.back().data()), + string_pairs.data(), + string_count, + [] __device__(string_index_pair const pair) -> bool { return pair.first != nullptr; }, + d_valid_counts.data() + idx); + } + + auto const chars_sizes = cudf::detail::make_std_vector_async(d_chars_sizes, stream); + auto const valid_counts = cudf::detail::make_std_vector_async(d_valid_counts, stream); + + // Except for other stream syncs in `CUB` that we cannot control, + // this should be the only stream sync we need in the entire API. + stream.synchronize(); + + auto const threshold = cudf::strings::get_offset64_threshold(); + auto const overflow_count = + std::count_if(chars_sizes.begin(), chars_sizes.end(), [threshold](auto const chars_size) { + return chars_size >= threshold; + }); + CUDF_EXPECTS(cudf::strings::is_large_strings_enabled() || overflow_count == 0, + "Size of output exceeds the column size limit", + std::overflow_error); + + if (overflow_count > 0) { + std::vector long_string_input; + std::vector long_string_col_idx; + long_string_input.reserve(overflow_count); + long_string_col_idx.reserve(overflow_count); + for (std::size_t idx = 0; idx < num_columns; ++idx) { + if (chars_sizes[idx] >= threshold) { + long_string_input.push_back(input[idx]); + long_string_col_idx.push_back(idx); + } + } + + [[maybe_unused]] auto [new_offsets_cols, d_new_chars_sizes] = + make_offsets_child_column_batch_async(long_string_input, stream, mr); + + // Update the new offsets columns. + // The new chars sizes should be the same as before, thus we don't need to update them. 
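+    // Only the columns whose char sizes reached `get_offset64_threshold()` are revisited
+    // here: the loop below swaps their recomputed (large-strings) offsets into place,
+    // while all other columns keep the offsets built in the first pass.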
+ for (std::size_t idx = 0; idx < long_string_col_idx.size(); ++idx) { + offsets_cols[long_string_col_idx[idx]] = std::move(new_offsets_cols[idx]); + } + } + + std::vector> output(num_columns); + for (std::size_t idx = 0; idx < num_columns; ++idx) { + auto const strings_count = static_cast(input[idx].size()); + if (strings_count == 0) { + output[idx] = make_empty_column(type_id::STRING); + continue; + } + + auto const chars_size = chars_sizes[idx]; + auto const valid_count = valid_counts[idx]; + + auto chars_data = make_chars_buffer( + offsets_cols[idx]->view(), chars_size, input[idx].data(), strings_count, stream, mr); + + auto const null_count = strings_count - valid_count; + output[idx] = make_strings_column( + strings_count, + std::move(offsets_cols[idx]), + chars_data.release(), + null_count, + null_count ? std::move(null_masks[idx]) : rmm::device_buffer{0, stream, mr}); + } + + return output; +} + +} // namespace strings::detail + // Create a strings-type column from vector of pointer/size pairs std::unique_ptr make_strings_column( device_span const> strings, @@ -53,10 +188,32 @@ std::unique_ptr make_strings_column( rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return cudf::strings::detail::make_strings_column(strings.begin(), strings.end(), stream, mr); } +std::vector> make_strings_column_batch( + std::vector const>> const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return cudf::strings::detail::make_strings_column_batch(input, stream, mr); +} + +namespace { +struct string_view_to_pair { + string_view null_placeholder; + string_view_to_pair(string_view n) : null_placeholder(n) {} + __device__ thrust::pair operator()(string_view const& i) + { + return (i.data() == null_placeholder.data()) + ? thrust::pair{nullptr, 0} + : thrust::pair{i.data(), i.size_bytes()}; + } +}; + +} // namespace + std::unique_ptr make_strings_column(device_span string_views, string_view null_placeholder, rmm::cuda_stream_view stream, diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 799a84cbc37..a4213dcbe94 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -726,6 +726,7 @@ ConfigureTest( streams/strings/contains_test.cpp streams/strings/convert_test.cpp streams/strings/extract_test.cpp + streams/strings/factory_test.cpp streams/strings/filter_test.cpp streams/strings/find_test.cpp streams/strings/replace_test.cpp diff --git a/cpp/tests/streams/strings/factory_test.cpp b/cpp/tests/streams/strings/factory_test.cpp new file mode 100644 index 00000000000..36e595ab9fa --- /dev/null +++ b/cpp/tests/streams/strings/factory_test.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include + +#include + +#include +#include + +class StringsFactoryTest : public cudf::test::BaseFixture {}; + +using string_pair = thrust::pair; + +TEST_F(StringsFactoryTest, StringConstructionFromPairs) +{ + auto const stream = cudf::test::get_default_stream(); + + auto const h_data = std::vector{'a', 'b', 'c'}; + auto const d_data = cudf::detail::make_device_uvector_async( + h_data, stream, cudf::get_current_device_resource_ref()); + + auto const h_input = + std::vector{{d_data.data(), 1}, {d_data.data() + 1, 1}, {d_data.data() + 2, 1}}; + auto const d_input = cudf::detail::make_device_uvector_async( + h_input, stream, cudf::get_current_device_resource_ref()); + auto const input = cudf::device_span{d_input.data(), d_input.size()}; + cudf::make_strings_column(input, stream); +} + +TEST_F(StringsFactoryTest, StringBatchConstruction) +{ + auto const stream = cudf::test::get_default_stream(); + + auto const h_data = std::vector{'a', 'b', 'c'}; + auto const d_data = cudf::detail::make_device_uvector_async( + h_data, stream, cudf::get_current_device_resource_ref()); + + auto const h_input = + std::vector{{d_data.data(), 1}, {d_data.data() + 1, 1}, {d_data.data() + 2, 1}}; + auto const d_input = cudf::detail::make_device_uvector_async( + h_input, stream, cudf::get_current_device_resource_ref()); + + std::vector> input( + 10, cudf::device_span{d_input.data(), d_input.size()}); + cudf::make_strings_column_batch(input, stream); +} diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index 90054e41d36..7eb429da7d9 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -44,6 +45,8 @@ struct StringsFactoriesTest : public cudf::test::BaseFixture {}; +using string_pair = thrust::pair; + TEST_F(StringsFactoriesTest, CreateColumnFromPair) { std::vector h_test_strings{"the quick brown fox jumps over the lazy dog", @@ -61,7 +64,7 @@ TEST_F(StringsFactoriesTest, CreateColumnFromPair) cudf::size_type count = (cudf::size_type)h_test_strings.size(); thrust::host_vector h_buffer(memsize); rmm::device_uvector d_buffer(memsize, cudf::get_default_stream()); - thrust::host_vector> strings(count); + thrust::host_vector strings(count); thrust::host_vector h_offsets(count + 1); cudf::size_type offset = 0; cudf::size_type nulls = 0; @@ -69,12 +72,12 @@ TEST_F(StringsFactoriesTest, CreateColumnFromPair) for (cudf::size_type idx = 0; idx < count; ++idx) { char const* str = h_test_strings[idx]; if (!str) { - strings[idx] = thrust::pair{nullptr, 0}; + strings[idx] = string_pair{nullptr, 0}; nulls++; } else { auto length = (cudf::size_type)strlen(str); memcpy(h_buffer.data() + offset, str, length); - strings[idx] = thrust::pair{d_buffer.data() + offset, length}; + strings[idx] = string_pair{d_buffer.data() + offset, length}; offset += length; } h_offsets[idx + 1] = offset; @@ -201,14 +204,13 @@ TEST_F(StringsFactoriesTest, EmptyStringsColumn) cudf::make_strings_column(0, std::move(d_offsets), d_chars.release(), 0, d_nulls.release()); cudf::test::expect_column_empty(results->view()); - rmm::device_uvector> d_strings{ - 0, cudf::get_default_stream()}; + rmm::device_uvector d_strings{0, cudf::get_default_stream()}; results = cudf::make_strings_column(d_strings); cudf::test::expect_column_empty(results->view()); } namespace { -using string_pair = thrust::pair; + struct string_view_to_pair { __device__ string_pair 
operator()(thrust::pair const& p) { @@ -234,3 +236,198 @@ TEST_F(StringsFactoriesTest, StringPairWithNullsAndEmpty) auto result = cudf::make_strings_column(pairs); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), data); } + +struct StringsBatchConstructionTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsBatchConstructionTest, EmptyColumns) +{ + auto constexpr num_columns = 10; + auto const stream = cudf::get_default_stream(); + + auto const d_string_pairs = rmm::device_uvector{0, stream}; + auto const input = std::vector>( + num_columns, {d_string_pairs.data(), d_string_pairs.size()}); + auto const output = cudf::make_strings_column_batch(input, stream); + + auto const expected_col = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + for (auto const& col : output) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col->view(), col->view()); + } +} + +TEST_F(StringsBatchConstructionTest, AllNullsColumns) +{ + auto constexpr num_columns = 10; + auto constexpr num_rows = 100; + auto const stream = cudf::get_default_stream(); + + auto d_string_pairs = rmm::device_uvector{num_rows, stream}; + thrust::uninitialized_fill_n(rmm::exec_policy(stream), + d_string_pairs.data(), + d_string_pairs.size(), + string_pair{nullptr, 0}); + auto const input = std::vector>( + num_columns, {d_string_pairs.data(), d_string_pairs.size()}); + auto const output = cudf::make_strings_column_batch(input, stream); + + auto const expected_col = cudf::make_strings_column(d_string_pairs); + for (auto const& col : output) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col->view(), col->view()); + } +} + +namespace { + +struct index_to_pair { + int const num_test_strings; + char const* d_chars; + std::size_t const* d_offsets; + int const* is_null; + + __device__ string_pair operator()(cudf::size_type idx) + { + auto const data_idx = idx % num_test_strings; + return {is_null[data_idx] ? nullptr : d_chars + d_offsets[data_idx], + static_cast(d_offsets[data_idx + 1] - d_offsets[data_idx])}; + } +}; + +} // namespace + +TEST_F(StringsBatchConstructionTest, CreateColumnsFromPairs) +{ + auto constexpr num_columns = 10; + auto constexpr max_num_rows = 1000; + auto const stream = cudf::get_default_stream(); + auto const mr = cudf::get_current_device_resource_ref(); + + std::vector h_test_strings{"the quick brown fox jumps over the lazy dog", + "the fat cat lays next to the other accénted cat", + "a slow moving turtlé cannot catch the bird", + "which can be composéd together to form a more complete", + "thé result does not include the value in the sum in", + "", + nullptr, + "absent stop words"}; + auto const num_test_strings = static_cast(h_test_strings.size()); + + std::vector h_offsets(num_test_strings + 1, 0); + for (int i = 0; i < num_test_strings; ++i) { + h_offsets[i + 1] = h_offsets[i] + (h_test_strings[i] ? 
strlen(h_test_strings[i]) : 0); + } + + std::vector h_chars(h_offsets.back()); + std::vector is_null(num_test_strings, 0); + for (int i = 0; i < num_test_strings; ++i) { + if (h_test_strings[i]) { + memcpy(h_chars.data() + h_offsets[i], h_test_strings[i], strlen(h_test_strings[i])); + } else { + is_null[i] = 1; + } + } + + auto const d_offsets = cudf::detail::make_device_uvector_async(h_offsets, stream, mr); + auto const d_chars = cudf::detail::make_device_uvector_async(h_chars, stream, mr); + auto const d_is_null = cudf::detail::make_device_uvector_async(is_null, stream, mr); + + std::vector> d_input; + std::vector> input; + d_input.reserve(num_columns); + input.reserve(num_columns); + + for (int col_idx = 0; col_idx < num_columns; ++col_idx) { + // Columns have sizes increase from `max_num_rows / num_columns` to `max_num_rows`. + auto const num_rows = + static_cast(static_cast(col_idx + 1) / num_columns * max_num_rows); + + auto string_pairs = rmm::device_uvector(num_rows, stream); + thrust::tabulate( + rmm::exec_policy_nosync(stream), + string_pairs.begin(), + string_pairs.end(), + index_to_pair{num_test_strings, d_chars.begin(), d_offsets.begin(), d_is_null.begin()}); + + d_input.emplace_back(std::move(string_pairs)); + input.emplace_back(d_input.back()); + } + + auto const output = cudf::make_strings_column_batch(input, stream, mr); + + for (std::size_t i = 0; i < num_columns; ++i) { + auto const string_pairs = input[i]; + auto const expected = cudf::make_strings_column(string_pairs, stream, mr); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), output[i]->view()); + } +} + +// The test below requires a huge amount of memory, thus it is disabled by default. +TEST_F(StringsBatchConstructionTest, DISABLED_CreateLongStringsColumns) +{ + auto constexpr num_columns = 2; + auto const stream = cudf::get_default_stream(); + auto const mr = cudf::get_current_device_resource_ref(); + + std::vector h_test_strings{"the quick brown fox jumps over the lazy dog", + "the fat cat lays next to the other accénted cat", + "a slow moving turtlé cannot catch the bird", + "which can be composéd together to form a more complete", + "thé result does not include the value in the sum in", + "", + nullptr, + "absent stop words"}; + auto const num_test_strings = static_cast(h_test_strings.size()); + + std::vector h_offsets(num_test_strings + 1, 0); + for (int i = 0; i < num_test_strings; ++i) { + h_offsets[i + 1] = h_offsets[i] + (h_test_strings[i] ? strlen(h_test_strings[i]) : 0); + } + + std::vector h_chars(h_offsets.back()); + std::vector is_null(num_test_strings, 0); + for (int i = 0; i < num_test_strings; ++i) { + if (h_test_strings[i]) { + memcpy(h_chars.data() + h_offsets[i], h_test_strings[i], strlen(h_test_strings[i])); + } else { + is_null[i] = 1; + } + } + + auto const d_offsets = cudf::detail::make_device_uvector_async(h_offsets, stream, mr); + auto const d_chars = cudf::detail::make_device_uvector_async(h_chars, stream, mr); + auto const d_is_null = cudf::detail::make_device_uvector_async(is_null, stream, mr); + + // If we create a column by repeating h_test_strings by `max_cycles` times, + // we will have it size around (1.5*INT_MAX) bytes. 
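+  // That is, max_cycles ~= 1.5 * INT_MAX / h_offsets.back(), so the
+  // concatenated chars data exceeds the 32-bit size_type limit and the
+  // large-strings (64-bit offsets) code path is exercised.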
+ auto const max_cycles = static_cast(static_cast(std::numeric_limits::max()) * + 1.5 / h_offsets.back()); + + std::vector> d_input; + std::vector> input; + d_input.reserve(num_columns); + input.reserve(num_columns); + + for (int col_idx = 0; col_idx < num_columns; ++col_idx) { + // Columns have sizes increase from `max_cycles * num_test_strings / num_columns` to + // `max_cycles * num_test_strings`. + auto const num_rows = static_cast(static_cast(col_idx + 1) / num_columns * + max_cycles * num_test_strings); + + auto string_pairs = rmm::device_uvector(num_rows, stream); + thrust::tabulate( + rmm::exec_policy_nosync(stream), + string_pairs.begin(), + string_pairs.end(), + index_to_pair{num_test_strings, d_chars.begin(), d_offsets.begin(), d_is_null.begin()}); + + d_input.emplace_back(std::move(string_pairs)); + input.emplace_back(d_input.back()); + } + + auto const output = cudf::make_strings_column_batch(input, stream, mr); + + for (std::size_t i = 0; i < num_columns; ++i) { + auto const string_pairs = input[i]; + auto const expected = cudf::make_strings_column(string_pairs, stream, mr); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), output[i]->view()); + } +} From 3683e4685ff0f0bc8122fe654742f708bf9fdbcc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 16 Oct 2024 18:13:32 -1000 Subject: [PATCH 077/117] Add strings.combine APIs to pylibcudf (#16790) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Matthew Murray (https://github.com/Matt711) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/16790 --- .../api_docs/pylibcudf/strings/combine.rst | 6 + .../api_docs/pylibcudf/strings/index.rst | 1 + python/cudf/cudf/_lib/strings/combine.pyx | 130 +++------- .../pylibcudf/libcudf/strings/CMakeLists.txt | 2 +- .../pylibcudf/libcudf/strings/combine.pxd | 27 ++- .../pylibcudf/libcudf/strings/combine.pyx | 0 .../pylibcudf/strings/CMakeLists.txt | 1 + .../pylibcudf/pylibcudf/strings/__init__.pxd | 1 + .../pylibcudf/pylibcudf/strings/__init__.py | 1 + .../pylibcudf/pylibcudf/strings/combine.pxd | 33 +++ .../pylibcudf/pylibcudf/strings/combine.pyx | 223 ++++++++++++++++++ .../pylibcudf/tests/test_string_combine.py | 83 +++++++ 12 files changed, 397 insertions(+), 111 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst create mode 100644 python/pylibcudf/pylibcudf/libcudf/strings/combine.pyx create mode 100644 python/pylibcudf/pylibcudf/strings/combine.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/combine.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_combine.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst new file mode 100644 index 00000000000..38a46641200 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst @@ -0,0 +1,6 @@ +======= +combine +======= + +.. 
automodule:: pylibcudf.strings.combine + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 65dc5d2d1c3..c8c0016126d 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -6,6 +6,7 @@ strings capitalize char_types + combine contains extract find diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 76cc13db0da..0f7b27d85d7 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -2,24 +2,11 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +from cudf._lib.column cimport Column -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.combine cimport ( - concatenate as cpp_concatenate, - join_list_elements as cpp_join_list_elements, - join_strings as cpp_join_strings, - output_if_empty_list, - separator_on_nulls, -) -from pylibcudf.libcudf.table.table_view cimport table_view +import pylibcudf as plc -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport table_view_from_columns +import cudf @acquire_spill_lock() @@ -31,26 +18,12 @@ def concatenate(list source_strings, with the specified `sep` between each column and `na`/`None` values are replaced by `na_rep` """ - cdef DeviceScalar separator = sep.device_value - cdef DeviceScalar narep = na_rep.device_value - - cdef unique_ptr[column] c_result - cdef table_view source_view = table_view_from_columns(source_strings) - - cdef const string_scalar* scalar_separator = \ - (separator.get_raw_ptr()) - cdef const string_scalar* scalar_narep = ( - narep.get_raw_ptr() + plc_column = plc.strings.combine.concatenate( + plc.Table([col.to_pylibcudf(mode="read") for col in source_strings]), + sep.device_value.c_value, + na_rep.device_value.c_value, ) - - with nogil: - c_result = move(cpp_concatenate( - source_view, - scalar_separator[0], - scalar_narep[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -62,27 +35,12 @@ def join(Column source_strings, with the specified `sep` between each column and `na`/`None` values are replaced by `na_rep` """ - - cdef DeviceScalar separator = sep.device_value - cdef DeviceScalar narep = na_rep.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_separator = \ - (separator.get_raw_ptr()) - cdef const string_scalar* scalar_narep = ( - narep.get_raw_ptr() + plc_column = plc.strings.combine.join_strings( + source_strings.to_pylibcudf(mode="read"), + sep.device_value.c_value, + na_rep.device_value.c_value, ) - - with nogil: - c_result = move(cpp_join_strings( - source_view, - scalar_separator[0], - scalar_narep[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -96,29 +54,15 @@ def join_lists_with_scalar( between each string in lists and ``/`None` values are replaced by `py_narep` """ - - cdef DeviceScalar separator = py_separator.device_value - cdef DeviceScalar narep = py_narep.device_value - - cdef 
unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_separator = \ - (separator.get_raw_ptr()) - cdef const string_scalar* scalar_narep = ( - narep.get_raw_ptr() + plc_column = plc.strings.combine.join_list_elements( + source_strings.to_pylibcudf(mode="read"), + py_separator.device_value.c_value, + py_narep.device_value.c_value, + cudf._lib.scalar.DeviceScalar("", cudf.dtype("object")).c_value, + plc.strings.combine.SeparatorOnNulls.YES, + plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) - - with nogil: - c_result = move(cpp_join_list_elements( - source_view, - scalar_separator[0], - scalar_narep[0], - separator_on_nulls.YES, - output_if_empty_list.NULL_ELEMENT - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -135,28 +79,12 @@ def join_lists_with_column( ``/`None` values in `separator_strings` are replaced by `py_separator_narep` """ - - cdef DeviceScalar source_narep = py_source_narep.device_value - cdef DeviceScalar separator_narep = py_separator_narep.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view separator_view = separator_strings.view() - - cdef const string_scalar* scalar_source_narep = \ - (source_narep.get_raw_ptr()) - cdef const string_scalar* scalar_separator_narep = ( - separator_narep.get_raw_ptr() + plc_column = plc.strings.combine.join_list_elements( + source_strings.to_pylibcudf(mode="read"), + separator_strings.to_pylibcudf(mode="read"), + py_separator_narep.device_value.c_value, + py_source_narep.device_value.c_value, + plc.strings.combine.SeparatorOnNulls.YES, + plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) - - with nogil: - c_result = move(cpp_join_list_elements( - source_view, - separator_view, - scalar_separator_narep[0], - scalar_source_narep[0], - separator_on_nulls.YES, - output_if_empty_list.NULL_ELEMENT - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt index b8b4343173e..f5f2113332a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx translate.pyx) +set(cython_sources char_types.pyx combine.pyx regex_flags.pyx side_type.pyx translate.pyx) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd index e4c9fa5817a..e659993b834 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+from libcpp cimport int from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -9,21 +10,29 @@ from pylibcudf.libcudf.table.table_view cimport table_view cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil: - ctypedef enum separator_on_nulls: - YES 'cudf::strings::separator_on_nulls::YES' - NO 'cudf::strings::separator_on_nulls::NO' + cpdef enum class separator_on_nulls(int): + YES + NO - ctypedef enum output_if_empty_list: - EMPTY_STRING 'cudf::strings::output_if_empty_list::EMPTY_STRING' - NULL_ELEMENT 'cudf::strings::output_if_empty_list::NULL_ELEMENT' + cpdef enum class output_if_empty_list(int): + EMPTY_STRING + NULL_ELEMENT cdef unique_ptr[column] concatenate( - table_view source_strings, + table_view strings_columns, string_scalar separator, - string_scalar narep) except + + string_scalar narep, + separator_on_nulls separate_nulls) except + + + cdef unique_ptr[column] concatenate( + table_view strings_columns, + column_view separators, + string_scalar separator_narep, + string_scalar col_narep, + separator_on_nulls separate_nulls) except + cdef unique_ptr[column] join_strings( - column_view source_strings, + column_view input, string_scalar separator, string_scalar narep) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index eeb44d19333..04dd131cd75 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -18,6 +18,7 @@ set(cython_sources case.pyx char_types.pyx contains.pyx + combine.pyx extract.pyx find.pyx find_multiple.pyx diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index e45048a500f..93c61f3f72c 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -5,6 +5,7 @@ from . cimport ( capitalize, case, char_types, + combine, contains, convert, extract, diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index c6253d94b40..d52b0405f1e 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -5,6 +5,7 @@ capitalize, case, char_types, + combine, contains, convert, extract, diff --git a/python/pylibcudf/pylibcudf/strings/combine.pxd b/python/pylibcudf/pylibcudf/strings/combine.pxd new file mode 100644 index 00000000000..ea22f626973 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/combine.pxd @@ -0,0 +1,33 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column cimport Column +from pylibcudf.libcudf.strings.combine cimport ( + output_if_empty_list, + separator_on_nulls, +) +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + +ctypedef fused ColumnOrScalar: + Column + Scalar + +cpdef Column concatenate( + Table strings_columns, + ColumnOrScalar separator, + Scalar narep=*, + Scalar col_narep=*, + separator_on_nulls separate_nulls=*, +) + +cpdef Column join_strings(Column input, Scalar separator, Scalar narep) + + +cpdef Column join_list_elements( + Column source_strings, + ColumnOrScalar separator, + Scalar separator_narep, + Scalar string_narep, + separator_on_nulls separate_nulls, + output_if_empty_list empty_list_policy, +) diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/strings/combine.pyx new file mode 100644 index 00000000000..f17d5265ab4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/combine.pyx @@ -0,0 +1,223 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.strings cimport combine as cpp_combine +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + +from cython.operator import dereference +from pylibcudf.libcudf.strings.combine import \ + output_if_empty_list as OutputIfEmptyList # no-cython-lint +from pylibcudf.libcudf.strings.combine import \ + separator_on_nulls as SeparatorOnNulls # no-cython-lint + + +cpdef Column concatenate( + Table strings_columns, + ColumnOrScalar separator, + Scalar narep=None, + Scalar col_narep=None, + separator_on_nulls separate_nulls=separator_on_nulls.YES, +): + """ + Concatenate all columns in the table horizontally into one new string + delimited by an optional separator string. + + Parameters + ---------- + strings_columns : Table + Strings for this operation + + separator : Column or Scalar + Separator(s) for a given row + + narep : Scalar + String to replace a null separator for a given row. + + col_narep : Scalar + String that should be used in place of any null strings found in any column. + An exception is raised when separator is a Scalar. + + separate_nulls : SeparatorOnNulls + If YES, then the separator is included for null rows. 
+
+    Returns
+    -------
+    Column
+        New column with concatenated results
+    """
+    cdef unique_ptr[column] c_result
+    cdef const string_scalar* c_col_narep
+    cdef const string_scalar* c_separator
+
+    if narep is None:
+        narep = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+    cdef const string_scalar* c_narep = <const string_scalar*>(
+        narep.c_obj.get()
+    )
+
+    if ColumnOrScalar is Column:
+        if col_narep is None:
+            col_narep = Scalar.from_libcudf(
+                cpp_make_string_scalar("".encode())
+            )
+        c_col_narep = <const string_scalar*>(
+            col_narep.c_obj.get()
+        )
+        with nogil:
+            c_result = move(
+                cpp_combine.concatenate(
+                    strings_columns.view(),
+                    separator.view(),
+                    dereference(c_narep),
+                    dereference(c_col_narep),
+                    separate_nulls
+                )
+            )
+    elif ColumnOrScalar is Scalar:
+        if col_narep is not None:
+            raise ValueError(
+                "col_narep cannot be specified when separator is a Scalar"
+            )
+        c_separator = <const string_scalar*>(separator.c_obj.get())
+        with nogil:
+            c_result = move(
+                cpp_combine.concatenate(
+                    strings_columns.view(),
+                    dereference(c_separator),
+                    dereference(c_narep),
+                    separate_nulls
+                )
+            )
+    else:
+        raise ValueError("separator must be a Column or a Scalar")
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column join_strings(Column input, Scalar separator, Scalar narep):
+    """
+    Concatenates all strings in the column into one new string delimited
+    by an optional separator string.
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to concatenate into a single string
+
+    separator : Scalar
+        String to insert between each string in the column
+
+    narep : Scalar
+        String to replace any null strings found.
+
+    Returns
+    -------
+    Column
+        New column containing one string
+    """
+    cdef unique_ptr[column] c_result
+    cdef const string_scalar* c_separator = <const string_scalar*>(
+        separator.c_obj.get()
+    )
+    cdef const string_scalar* c_narep = <const string_scalar*>(
+        narep.c_obj.get()
+    )
+    with nogil:
+        c_result = move(
+            cpp_combine.join_strings(
+                input.view(),
+                dereference(c_separator),
+                dereference(c_narep),
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column join_list_elements(
+    Column lists_strings_column,
+    ColumnOrScalar separator,
+    Scalar separator_narep,
+    Scalar string_narep,
+    separator_on_nulls separate_nulls,
+    output_if_empty_list empty_list_policy,
+):
+    """
+    Given a lists column of strings (each row is a list of strings),
+    concatenates the strings within each row and returns a single strings
+    column result.
+
+    Parameters
+    ----------
+    lists_strings_column : Column
+        Column containing lists of strings to concatenate
+
+    separator : Column or Scalar
+        String(s) that should be inserted between each string from each row.
+
+    separator_narep : Scalar
+        String that should be used to replace a null separator.
+
+    string_narep : Scalar
+        String to replace null strings in any non-null list row.
+        Ignored if separator is a Scalar.
+
+    separate_nulls : SeparatorOnNulls
+        If YES, then the separator is included for null rows
+        if `narep` is valid
+
+    empty_list_policy : OutputIfEmptyList
+        If set to EMPTY_STRING, any input row that is an empty
+        list will result in an empty string. Otherwise, it will
+        result in a null. 
+
+
+    Returns
+    -------
+    Column
+        New strings column with concatenated results
+    """
+    cdef unique_ptr[column] c_result
+    cdef const string_scalar* c_separator_narep = <const string_scalar*>(
+        separator_narep.c_obj.get()
+    )
+    cdef const string_scalar* c_string_narep = <const string_scalar*>(
+        string_narep.c_obj.get()
+    )
+    cdef const string_scalar* c_separator
+
+    if ColumnOrScalar is Column:
+        with nogil:
+            c_result = move(
+                cpp_combine.join_list_elements(
+                    lists_strings_column.view(),
+                    separator.view(),
+                    dereference(c_separator_narep),
+                    dereference(c_string_narep),
+                    separate_nulls,
+                    empty_list_policy,
+                )
+            )
+    elif ColumnOrScalar is Scalar:
+        c_separator = <const string_scalar*>(separator.c_obj.get())
+        with nogil:
+            c_result = move(
+                cpp_combine.join_list_elements(
+                    lists_strings_column.view(),
+                    dereference(c_separator),
+                    dereference(c_separator_narep),
+                    separate_nulls,
+                    empty_list_policy,
+                )
+            )
+    else:
+        raise ValueError("separator must be a Column or a Scalar")
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_combine.py b/python/pylibcudf/pylibcudf/tests/test_string_combine.py
new file mode 100644
index 00000000000..4a7007a0d6b
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_combine.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
+import pytest
+from utils import assert_column_eq
+
+
+def test_concatenate_scalar_separator():
+    plc_table = plc.interop.from_arrow(
+        pa.table({"a": ["a", None, "c"], "b": ["a", "b", None]})
+    )
+    sep = plc.interop.from_arrow(pa.scalar("-"))
+    result = plc.strings.combine.concatenate(
+        plc_table,
+        sep,
+    )
+    expected = pa.array(["a-a", "-b", "c-"])
+    assert_column_eq(result, expected)
+
+    result = plc.strings.combine.concatenate(
+        plc_table, sep, narep=plc.interop.from_arrow(pa.scalar("!"))
+    )
+    expected = pa.array(["a-a", "!-b", "c-!"])
+    assert_column_eq(result, expected)
+
+    with pytest.raises(ValueError):
+        plc.strings.combine.concatenate(
+            plc_table,
+            sep,
+            narep=plc.interop.from_arrow(pa.scalar("!")),
+            col_narep=plc.interop.from_arrow(pa.scalar("?")),
+        )
+
+
+def test_concatenate_column_separator():
+    plc_table = plc.interop.from_arrow(
+        pa.table({"a": ["a", None, "c"], "b": ["a", "b", None]})
+    )
+    sep = plc.interop.from_arrow(pa.array(["-", "?", ","]))
+    result = plc.strings.combine.concatenate(
+        plc_table,
+        sep,
+    )
+    expected = pa.array(["a-a", "?b", "c,"])
+    assert_column_eq(result, expected)
+
+    result = plc.strings.combine.concatenate(
+        plc_table,
+        plc.interop.from_arrow(pa.array([None, "?", ","])),
+        narep=plc.interop.from_arrow(pa.scalar("1")),
+        col_narep=plc.interop.from_arrow(pa.scalar("*")),
+    )
+    expected = pa.array(["a1a", "*?b", "c,*"])
+    assert_column_eq(result, expected)
+
+
+def test_join_strings():
+    pa_arr = pa.array(list("abc"))
+    sep = pa.scalar("")
+    result = plc.strings.combine.join_strings(
+        plc.interop.from_arrow(pa_arr),
+        plc.interop.from_arrow(sep),
+        plc.interop.from_arrow(pa.scalar("")),
+    )
+    expected = pa.array(["abc"])
+    assert_column_eq(result, expected)
+
+
+def test_join_list_elements():
+    pa_arr = pa.array([["a", "a"], ["b", "b"]])
+    sep = pa.scalar("")
+    result = plc.strings.combine.join_list_elements(
+        plc.interop.from_arrow(pa_arr),
+        plc.interop.from_arrow(sep),
+        plc.interop.from_arrow(pa.scalar("")),
+        plc.interop.from_arrow(pa.scalar("")),
+        plc.strings.combine.SeparatorOnNulls.YES,
+        plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT,
+    )
+    expected = 
pc.binary_join(pa.array([["a", "a"], ["b", "b"]]), sep) + assert_column_eq(result, expected) From e49334093b05ed318110734ba73aeaa903ac63eb Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 17 Oct 2024 08:20:02 -0500 Subject: [PATCH 078/117] Make tests more deterministic (#17008) Fixes #17045 This PR removes randomness in our pytests and switches from using `np.random.seed` to `np.random.default_rng` in all of the codebase. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Jake Awe (https://github.com/AyodeAwe) - Lawrence Mitchell (https://github.com/wence-) - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/cudf/pull/17008 --- .pre-commit-config.yaml | 10 + pyproject.toml | 2 + python/cudf/benchmarks/API/bench_functions.py | 7 +- .../cudf/benchmarks/API/bench_multiindex.py | 12 +- python/cudf/cudf/_fuzz_testing/avro.py | 15 +- python/cudf/cudf/_fuzz_testing/csv.py | 40 +- python/cudf/cudf/_fuzz_testing/io.py | 5 +- python/cudf/cudf/_fuzz_testing/json.py | 14 +- python/cudf/cudf/_fuzz_testing/orc.py | 28 +- python/cudf/cudf/_fuzz_testing/parquet.py | 14 +- .../_fuzz_testing/tests/fuzz_test_parquet.py | 6 +- python/cudf/cudf/_fuzz_testing/utils.py | 15 +- python/cudf/cudf/core/column/struct.py | 7 +- python/cudf/cudf/core/dataframe.py | 9 +- python/cudf/cudf/testing/_utils.py | 24 +- python/cudf/cudf/testing/dataset_generator.py | 172 +- .../bert_base_cased_sampled/vocab-hash.txt | 8580 ++++++++--------- .../groupby/test_ordering_pandas_compat.py | 5 +- python/cudf/cudf/tests/test_array_function.py | 12 +- .../test_avro_reader_fastavro_integration.py | 4 +- python/cudf/cudf/tests/test_binops.py | 80 +- python/cudf/cudf/tests/test_categorical.py | 10 +- python/cudf/cudf/tests/test_column.py | 7 +- python/cudf/cudf/tests/test_concat.py | 15 +- python/cudf/cudf/tests/test_copying.py | 14 +- python/cudf/cudf/tests/test_csv.py | 8 +- python/cudf/cudf/tests/test_dataframe.py | 234 +- python/cudf/cudf/tests/test_dataframe_copy.py | 34 +- python/cudf/cudf/tests/test_datetime.py | 30 +- python/cudf/cudf/tests/test_dlpack.py | 10 +- python/cudf/cudf/tests/test_dropna.py | 6 +- python/cudf/cudf/tests/test_duplicates.py | 13 +- python/cudf/cudf/tests/test_factorize.py | 18 +- python/cudf/cudf/tests/test_feather.py | 5 +- python/cudf/cudf/tests/test_groupby.py | 97 +- python/cudf/cudf/tests/test_index.py | 54 +- python/cudf/cudf/tests/test_indexing.py | 109 +- python/cudf/cudf/tests/test_joining.py | 56 +- python/cudf/cudf/tests/test_json.py | 3 +- python/cudf/cudf/tests/test_monotonic.py | 3 +- python/cudf/cudf/tests/test_multiindex.py | 35 +- python/cudf/cudf/tests/test_orc.py | 38 +- python/cudf/cudf/tests/test_pack.py | 68 +- python/cudf/cudf/tests/test_parquet.py | 34 +- python/cudf/cudf/tests/test_pickling.py | 18 +- python/cudf/cudf/tests/test_query.py | 12 +- python/cudf/cudf/tests/test_rank.py | 28 +- python/cudf/cudf/tests/test_reductions.py | 16 +- python/cudf/cudf/tests/test_repr.py | 26 +- python/cudf/cudf/tests/test_resampling.py | 32 +- python/cudf/cudf/tests/test_reshape.py | 86 +- python/cudf/cudf/tests/test_serialize.py | 47 +- python/cudf/cudf/tests/test_series.py | 43 +- python/cudf/cudf/tests/test_seriesmap.py | 4 +- python/cudf/cudf/tests/test_sorting.py | 69 +- python/cudf/cudf/tests/test_sparse_df.py | 5 +- python/cudf/cudf/tests/test_stats.py | 50 +- python/cudf/cudf/tests/test_string.py | 15 +- python/cudf/cudf/tests/test_struct.py | 12 +- python/cudf/cudf/tests/test_transform.py | 4 +- 
python/cudf/cudf/tests/test_unaops.py | 13 +- python/cudf/cudf/tests/test_unique.py | 4 +- python/cudf/cudf/utils/hash_vocab_utils.py | 22 +- .../data/repr_slow_down_test.ipynb | 4 +- .../cudf_pandas_tests/test_cudf_pandas.py | 4 +- .../cudf/cudf_pandas_tests/test_profiler.py | 8 +- .../tests/test_cuml.py | 2 +- .../tests/test_stumpy_distributed.py | 8 +- .../dask_cudf/io/tests/test_parquet.py | 8 +- .../dask_cudf/tests/test_accessor.py | 2 +- .../dask_cudf/dask_cudf/tests/test_binops.py | 13 +- python/dask_cudf/dask_cudf/tests/test_core.py | 110 +- .../dask_cudf/tests/test_delayed_io.py | 37 +- .../dask_cudf/tests/test_dispatch.py | 6 +- .../dask_cudf/dask_cudf/tests/test_groupby.py | 51 +- python/dask_cudf/dask_cudf/tests/test_join.py | 50 +- .../dask_cudf/tests/test_reductions.py | 6 +- python/dask_cudf/dask_cudf/tests/test_sort.py | 4 +- python/dask_cudf/dask_cudf/tests/utils.py | 3 +- 79 files changed, 5496 insertions(+), 5288 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f861fb57916..174dc72bf02 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -95,6 +95,16 @@ repos: entry: 'pytest\.xfail' language: pygrep types: [python] + - id: no-unseeded-default-rng + name: no-unseeded-default-rng + description: 'Enforce that no non-seeded default_rng is used and default_rng is used instead of np.random.seed' + entry: | + # Check for usage of default_rng without seeding + default_rng\(\)| + # Check for usage of np.random.seed + np.random.seed\( + language: pygrep + types: [python] - id: cmake-format name: cmake-format entry: ./cpp/scripts/run-cmake-format.sh cmake-format diff --git a/pyproject.toml b/pyproject.toml index 8f9aa165e5a..661c68ee62e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -90,6 +90,8 @@ select = [ "UP007", # Import from `collections.abc` instead: `Callable` "UP035", + # usage of legacy `np.random` function calls + "NPY002", ] ignore = [ # whitespace before : diff --git a/python/cudf/benchmarks/API/bench_functions.py b/python/cudf/benchmarks/API/bench_functions.py index 93109838900..f902111b0db 100644 --- a/python/cudf/benchmarks/API/bench_functions.py +++ b/python/cudf/benchmarks/API/bench_functions.py @@ -72,12 +72,13 @@ def bench_pivot_table_simple(benchmark, dataframe): @pytest_cases.parametrize("nr", NUM_ROWS) def bench_crosstab_simple(benchmark, nr): + rng = np.random.default_rng(seed=0) series_a = np.array(["foo", "bar"] * nr) series_b = np.array(["one", "two"] * nr) series_c = np.array(["dull", "shiny"] * nr) - np.random.shuffle(series_a) - np.random.shuffle(series_b) - np.random.shuffle(series_c) + rng.shuffle(series_a) + rng.shuffle(series_b) + rng.shuffle(series_c) series_a = cudf.Series(series_a) series_b = cudf.Series(series_b) series_c = cudf.Series(series_c) diff --git a/python/cudf/benchmarks/API/bench_multiindex.py b/python/cudf/benchmarks/API/bench_multiindex.py index 6268bcc4267..77004c3313e 100644 --- a/python/cudf/benchmarks/API/bench_multiindex.py +++ b/python/cudf/benchmarks/API/bench_multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
"""Benchmarks of MultiIndex methods.""" @@ -11,16 +11,18 @@ @pytest.fixture def pidx(): num_elements = int(1e3) - a = np.random.randint(0, num_elements // 10, num_elements) - b = np.random.randint(0, num_elements // 10, num_elements) + rng = np.random.default_rng(seed=0) + a = rng.integers(0, num_elements // 10, num_elements) + b = rng.integers(0, num_elements // 10, num_elements) return pd.MultiIndex.from_arrays([a, b], names=("a", "b")) @pytest.fixture def midx(pidx): num_elements = int(1e3) - a = np.random.randint(0, num_elements // 10, num_elements) - b = np.random.randint(0, num_elements // 10, num_elements) + rng = np.random.default_rng(seed=0) + a = rng.integers(0, num_elements // 10, num_elements) + b = rng.integers(0, num_elements // 10, num_elements) df = cudf.DataFrame({"a": a, "b": b}) return cudf.MultiIndex.from_frame(df) diff --git a/python/cudf/cudf/_fuzz_testing/avro.py b/python/cudf/cudf/_fuzz_testing/avro.py index d9974037daa..172193aa672 100644 --- a/python/cudf/cudf/_fuzz_testing/avro.py +++ b/python/cudf/cudf/_fuzz_testing/avro.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import copy import io @@ -68,12 +68,12 @@ def generate_input(self): # https://github.com/rapidsai/cudf/issues/6604 - cudf.utils.dtypes.TIMEDELTA_TYPES ) - + seed = random.randint(0, 2**32 - 1) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta - seed = random.randint(0, 2**32 - 1) + self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols @@ -100,17 +100,18 @@ def write_data(self, file_name): def set_rand_params(self, params): params_dict = {} + rng = np.random.default_rng(seed=None) for param, values in params.items(): if values == ALL_POSSIBLE_VALUES: if param == "columns": col_size = self._rand(len(self._df.columns)) params_dict[param] = list( - np.unique(np.random.choice(self._df.columns, col_size)) + np.unique(rng.choice(self._df.columns, col_size)) ) elif param in ("skiprows", "num_rows"): - params_dict[param] = np.random.choice( + params_dict[param] = rng.choice( [None, self._rand(len(self._df))] ) else: - params_dict[param] = np.random.choice(values) + params_dict[param] = rng.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) diff --git a/python/cudf/cudf/_fuzz_testing/csv.py b/python/cudf/cudf/_fuzz_testing/csv.py index 67211a1c4bf..fa3ed40ce91 100644 --- a/python/cudf/cudf/_fuzz_testing/csv.py +++ b/python/cudf/cudf/_fuzz_testing/csv.py @@ -54,7 +54,7 @@ def generate_input(self): random.seed(seed) dtypes_list = list(cudf.utils.dtypes.ALL_TYPES) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed @@ -77,25 +77,22 @@ def write_data(self, file_name): def set_rand_params(self, params): params_dict = {} + rng = np.random.default_rng(seed=None) for param, values in params.items(): if values == ALL_POSSIBLE_VALUES: if param == "usecols": col_size = self._rand(len(self._df.columns)) - col_val = np.random.choice( + col_val = rng.choice( [ None, - np.unique( - np.random.choice(self._df.columns, col_size) - ), + np.unique(rng.choice(self._df.columns, col_size)), ] ) params_dict[param] = ( col_val if col_val is None else list(col_val) ) elif param == "dtype": - dtype_val = np.random.choice( - [None, 
self._df.dtypes.to_dict()] - ) + dtype_val = rng.choice([None, self._df.dtypes.to_dict()]) if dtype_val is not None: dtype_val = { col_name: "category" @@ -105,25 +102,25 @@ def set_rand_params(self, params): } params_dict[param] = dtype_val elif param == "header": - header_val = np.random.choice( - ["infer", np.random.randint(low=0, high=len(self._df))] + header_val = rng.choice( + ["infer", rng.integers(low=0, high=len(self._df))] ) params_dict[param] = header_val elif param == "skiprows": - params_dict[param] = np.random.randint( + params_dict[param] = rng.integers( low=0, high=len(self._df) ) elif param == "skipfooter": - params_dict[param] = np.random.randint( + params_dict[param] = rng.integers( low=0, high=len(self._df) ) elif param == "nrows": - nrows_val = np.random.choice( - [None, np.random.randint(low=0, high=len(self._df))] + nrows_val = rng.choice( + [None, rng.integers(low=0, high=len(self._df))] ) params_dict[param] = nrows_val else: - params_dict[param] = np.random.choice(values) + params_dict[param] = rng.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) @@ -159,7 +156,7 @@ def generate_input(self): random.seed(seed) dtypes_list = list(cudf.utils.dtypes.ALL_TYPES) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed @@ -182,26 +179,25 @@ def write_data(self, file_name): def set_rand_params(self, params): params_dict = {} + rng = np.random.default_rng(seed=None) for param, values in params.items(): if values == ALL_POSSIBLE_VALUES: if param == "columns": col_size = self._rand(len(self._current_buffer.columns)) params_dict[param] = list( np.unique( - np.random.choice( - self._current_buffer.columns, col_size - ) + rng.choice(self._current_buffer.columns, col_size) ) ) elif param == "chunksize": - params_dict[param] = np.random.choice( + params_dict[param] = rng.choice( [ None, - np.random.randint( + rng.integers( low=1, high=max(1, len(self._current_buffer)) ), ] ) else: - params_dict[param] = np.random.choice(values) + params_dict[param] = rng.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) diff --git a/python/cudf/cudf/_fuzz_testing/io.py b/python/cudf/cudf/_fuzz_testing/io.py index ffb7171a855..a4b8e18d8b4 100644 --- a/python/cudf/cudf/_fuzz_testing/io.py +++ b/python/cudf/cudf/_fuzz_testing/io.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import copy import json @@ -91,8 +91,9 @@ def get_next_regression_params(self): return dtypes_meta, num_rows, num_cols, seed def set_rand_params(self, params): + rng = np.random.default_rng(seed=None) params_dict = { - param: np.random.choice(values) for param, values in params.items() + param: rng.choice(values) for param, values in params.items() } self._current_params["test_kwargs"] = self.process_kwargs( params_dict=params_dict diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py index e987529c8ba..45d2c8d8cf0 100644 --- a/python/cudf/cudf/_fuzz_testing/json.py +++ b/python/cudf/cudf/_fuzz_testing/json.py @@ -80,7 +80,7 @@ def generate_input(self): # https://github.com/rapidsai/cudf/issues/7086 # dtypes_list.extend(["list"]) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed @@ -105,14 +105,15 @@ def write_data(self, file_name): def set_rand_params(self, params): params_dict = {} + rng = np.random.default_rng(seed=None) for param, values in params.items(): if param == "dtype" and values == ALL_POSSIBLE_VALUES: - dtype_val = np.random.choice( + dtype_val = rng.choice( [True, self._current_buffer.dtypes.to_dict()] ) params_dict[param] = _get_dtype_param_value(dtype_val) else: - params_dict[param] = np.random.choice(values) + params_dict[param] = rng.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) @@ -155,7 +156,7 @@ def generate_input(self): # https://github.com/rapidsai/cudf/issues/7086 # dtypes_list.extend(["list"]) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed @@ -180,12 +181,13 @@ def write_data(self, file_name): def set_rand_params(self, params): params_dict = {} + rng = np.random.default_rng(seed=None) for param, values in params.items(): if param == "dtype" and values == ALL_POSSIBLE_VALUES: - dtype_val = np.random.choice( + dtype_val = rng.choice( [True, self._current_buffer.dtypes.to_dict()] ) params_dict[param] = _get_dtype_param_value(dtype_val) else: - params_dict[param] = np.random.choice(values) + params_dict[param] = rng.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py index ecddc72fa85..4d9e4abb09e 100644 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ b/python/cudf/cudf/_fuzz_testing/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import copy import io @@ -62,13 +62,11 @@ def generate_input(self): - cudf.utils.dtypes.UNSIGNED_TYPES - {"datetime64[ns]"} ) - + seed = random.randint(0, 2**32 - 1) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) - self._current_params["dtypes_meta"] = dtypes_meta - seed = random.randint(0, 2**32 - 1) self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols @@ -94,42 +92,41 @@ def write_data(self, file_name): def set_rand_params(self, params): params_dict = {} + rng = np.random.default_rng(seed=None) for param, values in params.items(): if values == ALL_POSSIBLE_VALUES: if param == "columns": col_size = self._rand(len(self._df.columns)) params_dict[param] = list( - np.unique(np.random.choice(self._df.columns, col_size)) + np.unique(rng.choice(self._df.columns, col_size)) ) elif param == "stripes": f = io.BytesIO(self._current_buffer) orcFile = pa.orc.ORCFile(f) stripes = list(range(orcFile.nstripes)) - params_dict[param] = np.random.choice( + params_dict[param] = rng.choice( [ None, list( map( int, np.unique( - np.random.choice( - stripes, orcFile.nstripes - ) + rng.choice(stripes, orcFile.nstripes) ), ) ), ] ) elif param == "use_index": - params_dict[param] = np.random.choice([True, False]) + params_dict[param] = rng.choice([True, False]) elif param in ("skiprows", "num_rows"): - params_dict[param] = np.random.choice( + params_dict[param] = rng.choice( [None, self._rand(len(self._df))] ) else: if not isinstance(values, list): raise TypeError("values must be of type list") - params_dict[param] = np.random.choice(values) + params_dict[param] = rng.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) @@ -177,12 +174,11 @@ def generate_input(self): # https://github.com/rapidsai/cudf/issues/7355 - cudf.utils.dtypes.DATETIME_TYPES ) - + seed = random.randint(0, 2**32 - 1) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta - seed = random.randint(0, 2**32 - 1) self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols diff --git a/python/cudf/cudf/_fuzz_testing/parquet.py b/python/cudf/cudf/_fuzz_testing/parquet.py index 2d934e4816d..bd3df1b0847 100644 --- a/python/cudf/cudf/_fuzz_testing/parquet.py +++ b/python/cudf/cudf/_fuzz_testing/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import logging import random @@ -59,12 +59,11 @@ def generate_input(self): - {"uint32"} | {"list", "decimal64"} ) - + seed = random.randint(0, 2**32 - 1) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta - seed = random.randint(0, 2**32 - 1) self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols @@ -96,14 +95,15 @@ def write_data(self, file_name): def set_rand_params(self, params): params_dict = {} + rng = np.random.default_rng(seed=None) for param, values in params.items(): if param == "columns" and values == ALL_POSSIBLE_VALUES: col_size = self._rand(len(self._df.columns)) params_dict[param] = list( - np.unique(np.random.choice(self._df.columns, col_size)) + np.unique(rng.choice(self._df.columns, col_size)) ) else: - params_dict[param] = np.random.choice(values) + params_dict[param] = rng.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) @@ -146,7 +146,7 @@ def generate_input(self): | {"list", "decimal64"} ) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py index 3d070576a12..bbc19dce1a4 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import sys @@ -68,7 +68,9 @@ def parquet_writer_test(pdf): @pythonfuzz( data_handle=ParquetWriter, params={ - "row_group_size": np.random.random_integers(1, 10000, 100), + "row_group_size": np.random.default_rng(seed=0).integers( + 1, 10000, 100 + ), "compression": ["snappy", None], }, ) diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 8ce92e1c0f6..4cadb3a109c 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -40,8 +40,11 @@ } -def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): +def _generate_rand_meta( + obj, dtypes_list, null_frequency_override=None, seed=0 +): obj._current_params = {} + rng = np.random.default_rng(seed=seed) num_rows = obj._rand(obj._max_rows) num_cols = obj._rand(obj._max_columns) @@ -69,12 +72,12 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): meta["max_string_length"] = obj._max_string_length elif dtype == "list": if obj._max_lists_length is None: - meta["lists_max_length"] = np.random.randint(0, 2000000000) + meta["lists_max_length"] = rng.integers(0, 2000000000) else: meta["lists_max_length"] = obj._max_lists_length if obj._max_lists_nesting_depth is None: - meta["nesting_max_depth"] = np.random.randint( + meta["nesting_max_depth"] = rng.integers( 1, np.iinfo("int64").max ) else: @@ -85,7 +88,7 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): ) elif dtype == "struct": if obj._max_lists_nesting_depth is None: - meta["nesting_max_depth"] = np.random.randint(2, 10) + meta["nesting_max_depth"] = rng.integers(2, 10) else: meta["nesting_max_depth"] = obj._max_lists_nesting_depth @@ -95,9 +98,7 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): meta["max_null_frequency"] = 
obj._max_struct_null_frequency if obj._max_struct_types_at_each_level is None: - meta["max_types_at_each_level"] = np.random.randint( - low=1, high=10 - ) + meta["max_types_at_each_level"] = rng.integers(low=1, high=10) else: meta["max_types_at_each_level"] = ( obj._max_struct_types_at_each_level diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 2fda3b2c434..8f16ba4e15b 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -68,12 +68,7 @@ def base_size(self): return self.size + self.offset def to_arrow(self) -> pa.Array: - children = [ - pa.nulls(len(child)) - if len(child) == child.null_count - else child.to_arrow() - for child in self.children - ] + children = [child.to_arrow() for child in self.children] pa_type = pa.struct( { diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 79ed5a0e187..f8d544e04b1 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5118,11 +5118,12 @@ def info( useful for big DataFrames and fine-tune memory optimization: >>> import numpy as np - >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) + >>> rng = np.random.default_rng(seed=0) + >>> random_strings_array = rng.choice(['a', 'b', 'c'], 10 ** 6) >>> df = cudf.DataFrame({ - ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), - ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6), - ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6) + ... 'column_1': rng.choice(['a', 'b', 'c'], 10 ** 6), + ... 'column_2': rng.choice(['a', 'b', 'c'], 10 ** 6), + ... 'column_3': rng.choice(['a', 'b', 'c'], 10 ** 6) ... }) >>> df.info(memory_usage='deep') diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 8cb9efa873c..a5dc8a5498c 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -92,7 +92,8 @@ def random_bitmask(size): number of bits """ sz = bitmask_allocation_size_bytes(size) - data = np.random.randint(0, 255, dtype="u1", size=sz) + rng = np.random.default_rng(seed=0) + data = rng.integers(0, 255, dtype="u1", size=sz) return data.view("i1") @@ -209,9 +210,10 @@ def _get_args_kwars_for_assert_exceptions(func_args_and_kwargs): def gen_rand(dtype, size, **kwargs): + rng = np.random.default_rng(seed=kwargs.get("seed", 0)) dtype = cudf.dtype(dtype) if dtype.kind == "f": - res = np.random.random(size=size).astype(dtype) + res = rng.random(size=size).astype(dtype) if kwargs.get("positive_only", False): return res else: @@ -219,25 +221,23 @@ def gen_rand(dtype, size, **kwargs): elif dtype == np.int8 or dtype == np.int16: low = kwargs.get("low", -32) high = kwargs.get("high", 32) - return np.random.randint(low=low, high=high, size=size).astype(dtype) + return rng.integers(low=low, high=high, size=size).astype(dtype) elif dtype.kind == "i": low = kwargs.get("low", -10000) high = kwargs.get("high", 10000) - return np.random.randint(low=low, high=high, size=size).astype(dtype) + return rng.integers(low=low, high=high, size=size).astype(dtype) elif dtype == np.uint8 or dtype == np.uint16: low = kwargs.get("low", 0) high = kwargs.get("high", 32) - return np.random.randint(low=low, high=high, size=size).astype(dtype) + return rng.integers(low=low, high=high, size=size).astype(dtype) elif dtype.kind == "u": low = kwargs.get("low", 0) high = kwargs.get("high", 128) - return np.random.randint(low=low, high=high, size=size).astype(dtype) + return 
rng.integers(low=low, high=high, size=size).astype(dtype) elif dtype.kind == "b": low = kwargs.get("low", 0) high = kwargs.get("high", 2) - return np.random.randint(low=low, high=high, size=size).astype( - np.bool_ - ) + return rng.integers(low=low, high=high, size=size).astype(np.bool_) elif dtype.kind == "M": low = kwargs.get("low", 0) time_unit, _ = np.datetime_data(dtype) @@ -246,14 +246,14 @@ def gen_rand(dtype, size, **kwargs): int(1e18) / _unit_to_nanoseconds_conversion[time_unit], ) return pd.to_datetime( - np.random.randint(low=low, high=high, size=size), unit=time_unit + rng.integers(low=low, high=high, size=size), unit=time_unit ) elif dtype.kind in ("O", "U"): low = kwargs.get("low", 10) high = kwargs.get("high", 11) - nchars = np.random.randint(low=low, high=high, size=1)[0] + nchars = rng.integers(low=low, high=high, size=1)[0] char_options = np.array(list(string.ascii_letters + string.digits)) - all_chars = "".join(np.random.choice(char_options, nchars * size)) + all_chars = "".join(rng.choice(char_options, nchars * size)) return np.array( [all_chars[nchars * i : nchars * (i + 1)] for i in range(size)] ) diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index 13c194d6be0..99b686406fb 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -48,16 +48,22 @@ def __init__( self, cardinality=100, null_frequency=0.1, - generator=lambda: [ - _generate_string(string.ascii_letters, random.randint(4, 8)) - for _ in range(100) - ], + generator=None, is_sorted=True, dtype=None, ): self.cardinality = cardinality self.null_frequency = null_frequency - self.generator = generator + if generator is None: + rng = np.random.default_rng(seed=0) + self.generator = lambda: [ + _generate_string( + string.ascii_letters, rng, rng.integers(4, 8).item() + ) + for _ in range(100) + ] + else: + self.generator = generator self.is_sorted = is_sorted self.dtype = dtype @@ -96,7 +102,7 @@ def _write(tbl, path, format): tbl.to_parquet(path, row_group_size=format["row_group_size"]) -def _generate_column(column_params, num_rows): +def _generate_column(column_params, num_rows, rng): # If cardinality is specified, we create a set to sample from. # Otherwise, we simply use the given generator to generate each value. 
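A note on the ColumnParameters change above: the default generator is now built lazily, only when the caller passes generator=None, as a closure over a seeded numpy.random.Generator. Unlike the np.random.random_integers call removed from fuzz_test_parquet.py earlier in this patch (inclusive upper bound), Generator.integers excludes its upper bound by default, so rng.integers(4, 8) yields lengths 4 through 7. A minimal standalone sketch of the pattern, with _make_string as a hypothetical stand-in for this module's _generate_string:

    import string

    import numpy as np

    rng = np.random.default_rng(seed=0)

    def _make_string(alphabet, rng, length):
        # hypothetical stand-in for _generate_string in this module
        return "".join(rng.choice(list(alphabet), size=length))

    # Closure over one seeded Generator; integers(4, 8) draws from the
    # half-open interval [4, 8), i.e. string lengths 4 through 7.
    generator = lambda: [
        _make_string(string.ascii_letters, rng, int(rng.integers(4, 8)))
        for _ in range(100)
    ]

    assert all(4 <= len(s) <= 7 for s in generator())

Binding one Generator instance instead of mutating global np.random state is what lets a recorded seed reproduce the generated columns exactly.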
@@ -115,10 +121,8 @@ def _generate_column(column_params, num_rows): ) return pa.DictionaryArray.from_arrays( dictionary=vals, - indices=np.random.randint( - low=0, high=len(vals), size=num_rows - ), - mask=np.random.choice( + indices=rng.integers(low=0, high=len(vals), size=num_rows), + mask=rng.choice( [True, False], size=num_rows, p=[ @@ -142,7 +146,7 @@ def _generate_column(column_params, num_rows): column_params.generator, names=column_params.dtype.fields.keys(), mask=pa.array( - np.random.choice( + rng.choice( [True, False], size=num_rows, p=[ @@ -163,10 +167,10 @@ def _generate_column(column_params, num_rows): type=arrow_type, ) vals = pa.array( - np.random.choice(column_params.generator, size=num_rows) + rng.choice(column_params.generator, size=num_rows) if isinstance(arrow_type, pa.lib.Decimal128Type) - else np.random.choice(vals, size=num_rows), - mask=np.random.choice( + else rng.choice(vals, size=num_rows), + mask=rng.choice( [True, False], size=num_rows, p=[ @@ -189,7 +193,7 @@ def _generate_column(column_params, num_rows): # Generate data for current column return pa.array( column_params.generator, - mask=np.random.choice( + mask=rng.choice( [True, False], size=num_rows, p=[ @@ -233,7 +237,9 @@ def generate( def get_dataframe(parameters, use_threads): # Initialize seeds if parameters.seed is not None: - np.random.seed(parameters.seed) + rng = np.random.default_rng(seed=parameters.seed) # noqa: F841 + else: + rng = np.random.default_rng(seed=0) # noqa: F841 # For each column, invoke the data generator for column_params in parameters.column_parameters: @@ -281,14 +287,16 @@ def get_dataframe(parameters, use_threads): if not use_threads: for i, column_params in enumerate(parameters.column_parameters): column_data[i] = _generate_column( - column_params, parameters.num_rows + column_params, + parameters.num_rows, + rng, ) else: pool = Pool(pa.cpu_count()) column_data = pool.starmap( _generate_column, [ - (column_params, parameters.num_rows) + (column_params, parameters.num_rows, rng) for i, column_params in enumerate(parameters.column_parameters) ], ) @@ -336,7 +344,7 @@ def rand_dataframe( """ # Apply seed random.seed(seed) - np.random.seed(seed) + rng = np.random.default_rng(seed=seed) column_params = [] for meta in dtypes_meta: @@ -348,7 +356,7 @@ def rand_dataframe( lists_max_length = meta["lists_max_length"] nesting_max_depth = meta["nesting_max_depth"] value_type = meta["value_type"] - nesting_depth = np.random.randint(1, nesting_max_depth) + nesting_depth = rng.integers(1, nesting_max_depth) dtype = cudf.core.dtypes.ListDtype(value_type) @@ -368,6 +376,7 @@ def rand_dataframe( size=cardinality, nesting_depth=nesting_depth, lists_max_length=lists_max_length, + rng=rng, ), is_sorted=False, dtype=dtype, @@ -377,10 +386,11 @@ def rand_dataframe( nesting_max_depth = meta["nesting_max_depth"] max_types_at_each_level = meta["max_types_at_each_level"] max_null_frequency = meta["max_null_frequency"] - nesting_depth = np.random.randint(1, nesting_max_depth) + nesting_depth = rng.integers(1, nesting_max_depth) structDtype = create_nested_struct_type( max_types_at_each_level=max_types_at_each_level, nesting_level=nesting_depth, + rng=rng, ) column_params.append( @@ -392,6 +402,7 @@ def rand_dataframe( cardinality=cardinality, size=rows, max_null_frequency=max_null_frequency, + rng=rng, ), is_sorted=False, dtype=structDtype, @@ -401,14 +412,16 @@ def rand_dataframe( max_precision = meta.get( "max_precision", cudf.Decimal64Dtype.MAX_PRECISION ) - precision = np.random.randint(1, 
max_precision) - scale = np.random.randint(0, precision) + precision = rng.integers(1, max_precision) + scale = rng.integers(0, precision) dtype = cudf.Decimal64Dtype(precision=precision, scale=scale) column_params.append( ColumnParameters( cardinality=cardinality, null_frequency=null_frequency, - generator=decimal_generator(dtype=dtype, size=cardinality), + generator=decimal_generator( + dtype=dtype, size=cardinality, rng=rng + ), is_sorted=False, dtype=dtype, ) @@ -417,14 +430,16 @@ def rand_dataframe( max_precision = meta.get( "max_precision", cudf.Decimal32Dtype.MAX_PRECISION ) - precision = np.random.randint(1, max_precision) - scale = np.random.randint(0, precision) + precision = rng.integers(1, max_precision) + scale = rng.integers(0, precision) dtype = cudf.Decimal32Dtype(precision=precision, scale=scale) column_params.append( ColumnParameters( cardinality=cardinality, null_frequency=null_frequency, - generator=decimal_generator(dtype=dtype, size=cardinality), + generator=decimal_generator( + dtype=dtype, size=cardinality, rng=rng + ), is_sorted=False, dtype=dtype, ) @@ -433,14 +448,16 @@ def rand_dataframe( max_precision = meta.get( "max_precision", cudf.Decimal128Dtype.MAX_PRECISION ) - precision = np.random.randint(1, max_precision) - scale = np.random.randint(0, precision) + precision = rng.integers(1, max_precision) + scale = rng.integers(0, precision) dtype = cudf.Decimal128Dtype(precision=precision, scale=scale) column_params.append( ColumnParameters( cardinality=cardinality, null_frequency=null_frequency, - generator=decimal_generator(dtype=dtype, size=cardinality), + generator=decimal_generator( + dtype=dtype, size=cardinality, rng=rng + ), is_sorted=False, dtype=dtype, ) @@ -469,6 +486,7 @@ def rand_dataframe( size=cardinality, min_bound=meta.get("min_bound", None), max_bound=meta.get("max_bound", None), + rng=rng, ), is_sorted=False, dtype=dtype, @@ -484,6 +502,7 @@ def rand_dataframe( size=cardinality, min_bound=meta.get("min_bound", None), max_bound=meta.get("max_bound", None), + rng=rng, ), is_sorted=False, dtype=dtype, @@ -497,7 +516,8 @@ def rand_dataframe( generator=lambda cardinality=cardinality: [ _generate_string( string.printable, - np.random.randint( + rng, + rng.integers( low=0, high=meta.get("max_string_length", 1000), size=1, @@ -519,6 +539,7 @@ def rand_dataframe( size=cardinality, min_bound=meta.get("min_bound", None), max_bound=meta.get("max_bound", None), + rng=rng, ), is_sorted=False, dtype=cudf.dtype(dtype), @@ -534,6 +555,7 @@ def rand_dataframe( size=cardinality, min_bound=meta.get("min_bound", None), max_bound=meta.get("max_bound", None), + rng=rng, ), is_sorted=False, dtype=cudf.dtype(dtype), @@ -544,7 +566,7 @@ def rand_dataframe( ColumnParameters( cardinality=cardinality, null_frequency=null_frequency, - generator=boolean_generator(cardinality), + generator=boolean_generator(cardinality, rng), is_sorted=False, dtype=cudf.dtype(dtype), ) @@ -567,7 +589,7 @@ def rand_dataframe( return df -def int_generator(dtype, size, min_bound=None, max_bound=None): +def int_generator(dtype, size, rng, min_bound=None, max_bound=None): """ Generator for int data """ @@ -577,7 +599,7 @@ def int_generator(dtype, size, min_bound=None, max_bound=None): iinfo = np.iinfo(dtype) low, high = iinfo.min, iinfo.max - return lambda: np.random.randint( + return lambda: rng.integers( low=low, high=high, size=size, @@ -585,13 +607,13 @@ def int_generator(dtype, size, min_bound=None, max_bound=None): ) -def float_generator(dtype, size, min_bound=None, max_bound=None): +def 
float_generator(dtype, size, rng, min_bound=None, max_bound=None): """ Generator for float data """ if min_bound is not None and max_bound is not None: low, high = min_bound, max_bound - return lambda: np.random.uniform( + return lambda: rng.uniform( low=low, high=high, size=size, @@ -599,7 +621,7 @@ def float_generator(dtype, size, min_bound=None, max_bound=None): else: finfo = np.finfo(dtype) return ( - lambda: np.random.uniform( + lambda: rng.uniform( low=finfo.min / 2, high=finfo.max / 2, size=size, @@ -608,7 +630,7 @@ def float_generator(dtype, size, min_bound=None, max_bound=None): ) -def datetime_generator(dtype, size, min_bound=None, max_bound=None): +def datetime_generator(dtype, size, rng, min_bound=None, max_bound=None): """ Generator for datetime data """ @@ -618,14 +640,14 @@ def datetime_generator(dtype, size, min_bound=None, max_bound=None): iinfo = np.iinfo("int64") low, high = iinfo.min + 1, iinfo.max - return lambda: np.random.randint( + return lambda: rng.integers( low=np.datetime64(low, "ns").astype(dtype).astype("int"), high=np.datetime64(high, "ns").astype(dtype).astype("int"), size=size, ) -def timedelta_generator(dtype, size, min_bound=None, max_bound=None): +def timedelta_generator(dtype, size, rng, min_bound=None, max_bound=None): """ Generator for timedelta data """ @@ -635,25 +657,25 @@ def timedelta_generator(dtype, size, min_bound=None, max_bound=None): iinfo = np.iinfo("int64") low, high = iinfo.min + 1, iinfo.max - return lambda: np.random.randint( + return lambda: rng.integers( low=np.timedelta64(low, "ns").astype(dtype).astype("int"), high=np.timedelta64(high, "ns").astype(dtype).astype("int"), size=size, ) -def boolean_generator(size): +def boolean_generator(size, rng): """ Generator for bool data """ - return lambda: np.random.choice(a=[False, True], size=size) + return lambda: rng.choice(a=[False, True], size=size) -def decimal_generator(dtype, size): +def decimal_generator(dtype, size, rng): max_integral = 10 ** (dtype.precision - dtype.scale) - 1 max_float = (10**dtype.scale - 1) if dtype.scale != 0 else 0 return lambda: ( - np.random.uniform( + rng.uniform( low=-max_integral, high=max_integral + (max_float / 10**dtype.scale), size=size, @@ -661,32 +683,33 @@ def decimal_generator(dtype, size): ) -def get_values_for_nested_data(dtype, lists_max_length=None, size=None): +def get_values_for_nested_data(dtype, rng, lists_max_length=None, size=None): """ Returns list of values based on dtype. 
""" if size is None: - cardinality = np.random.randint(0, lists_max_length) + cardinality = rng.integers(0, lists_max_length) else: cardinality = size dtype = cudf.dtype(dtype) if dtype.kind in ("i", "u"): - values = int_generator(dtype=dtype, size=cardinality)() + values = int_generator(dtype=dtype, size=cardinality, rng=rng)() elif dtype.kind == "f": - values = float_generator(dtype=dtype, size=cardinality)() + values = float_generator(dtype=dtype, size=cardinality, rng=rng)() elif dtype.kind in ("U", "O"): values = [ _generate_string( string.printable, + rng, 100, ) for _ in range(cardinality) ] elif dtype.kind == "M": - values = datetime_generator(dtype=dtype, size=cardinality)().astype( - dtype - ) + values = datetime_generator( + dtype=dtype, size=cardinality, rng=rng + )().astype(dtype) elif dtype.kind == "m": values = timedelta_generator(dtype=dtype, size=cardinality)().astype( dtype @@ -699,14 +722,14 @@ def get_values_for_nested_data(dtype, lists_max_length=None, size=None): return values -def make_lists(dtype, lists_max_length, nesting_depth, top_level_list): +def make_lists(dtype, lists_max_length, nesting_depth, top_level_list, rng): """ Helper to create random list of lists with `nesting_depth` and specified value type `dtype`. """ nesting_depth -= 1 if nesting_depth >= 0: - L = np.random.randint(1, lists_max_length) + L = rng.integers(1, lists_max_length) for i in range(L): top_level_list.append( make_lists( @@ -714,11 +737,14 @@ def make_lists(dtype, lists_max_length, nesting_depth, top_level_list): lists_max_length=lists_max_length, nesting_depth=nesting_depth, top_level_list=[], + rng=rng, ) ) else: top_level_list = get_values_for_nested_data( - dtype=dtype, lists_max_length=lists_max_length + dtype=dtype, + lists_max_length=lists_max_length, + rng=rng, ) # To ensure numpy arrays are not passed as input to # list constructor, returning a python list object here. @@ -728,22 +754,22 @@ def make_lists(dtype, lists_max_length, nesting_depth, top_level_list): return top_level_list -def make_array_for_struct(dtype, cardinality, size, max_null_frequency): +def make_array_for_struct(dtype, cardinality, size, max_null_frequency, rng): """ Helper to create a pa.array with `size` and `dtype` for a `StructArray`. """ - null_frequency = np.random.uniform(low=0, high=max_null_frequency) - local_cardinality = max(np.random.randint(low=0, high=cardinality), 1) + null_frequency = rng.uniform(low=0, high=max_null_frequency) + local_cardinality = max(rng.integers(low=0, high=cardinality), 1) data = get_values_for_nested_data( - dtype=dtype.type.to_pandas_dtype(), size=local_cardinality + dtype=dtype.type.to_pandas_dtype(), size=local_cardinality, rng=rng ) - vals = np.random.choice(data, size=size) + vals = rng.choice(data, size=size) return pa.array( vals, - mask=np.random.choice( + mask=rng.choice( [True, False], size=size, p=[null_frequency, 1 - null_frequency], @@ -756,7 +782,7 @@ def make_array_for_struct(dtype, cardinality, size, max_null_frequency): ) -def get_nested_lists(dtype, size, nesting_depth, lists_max_length): +def get_nested_lists(dtype, size, nesting_depth, lists_max_length, rng): """ Returns a list of nested lists with random nesting depth and random nested lists length. 
@@ -770,13 +796,14 @@ def get_nested_lists(dtype, size, nesting_depth, lists_max_length): lists_max_length=lists_max_length, nesting_depth=nesting_depth, top_level_list=[], + rng=rng, ) ) return list_of_lists -def get_nested_structs(dtype, cardinality, size, max_null_frequency): +def get_nested_structs(dtype, cardinality, size, max_null_frequency, rng): """ Returns a list of arrays with random data corresponding to the dtype provided. @@ -787,7 +814,7 @@ def get_nested_structs(dtype, cardinality, size, max_null_frequency): for name, col_dtype in dtype.fields.items(): if isinstance(col_dtype, cudf.StructDtype): result_arrays = get_nested_structs( - col_dtype, cardinality, size, max_null_frequency + col_dtype, cardinality, size, max_null_frequency, rng ) result_arrays = pa.StructArray.from_arrays( result_arrays, names=col_dtype.fields.keys() @@ -798,13 +825,14 @@ def get_nested_structs(dtype, cardinality, size, max_null_frequency): cardinality=cardinality, size=size, max_null_frequency=max_null_frequency, + rng=rng, ) list_of_arrays.append(result_arrays) return list_of_arrays -def list_generator(dtype, size, nesting_depth, lists_max_length): +def list_generator(dtype, size, nesting_depth, lists_max_length, rng): """ Generator for list data """ @@ -813,10 +841,11 @@ def list_generator(dtype, size, nesting_depth, lists_max_length): size=size, nesting_depth=nesting_depth, lists_max_length=lists_max_length, + rng=rng, ) -def struct_generator(dtype, cardinality, size, max_null_frequency): +def struct_generator(dtype, cardinality, size, max_null_frequency, rng): """ Generator for struct data """ @@ -825,25 +854,26 @@ def struct_generator(dtype, cardinality, size, max_null_frequency): cardinality=cardinality, size=size, max_null_frequency=max_null_frequency, + rng=rng, ) -def create_nested_struct_type(max_types_at_each_level, nesting_level): +def create_nested_struct_type(max_types_at_each_level, nesting_level, rng): dtypes_list = cudf.utils.dtypes.ALL_TYPES - picked_types = np.random.choice(list(dtypes_list), max_types_at_each_level) + picked_types = rng.choice(list(dtypes_list), max_types_at_each_level) type_dict = {} for name, type_ in enumerate(picked_types): if type_ == "struct": type_dict[str(name)] = create_nested_struct_type( - max_types_at_each_level, nesting_level - 1 + max_types_at_each_level, nesting_level - 1, rng ) else: type_dict[str(name)] = cudf.dtype(type_) return cudf.StructDtype(type_dict) -def _generate_string(str_seq: str, length: int = 10) -> str: - return "".join(random.choices(str_seq, k=length)) +def _generate_string(str_seq: str, rng, length: int = 10) -> str: + return "".join(rng.choice(list(str_seq), size=length)) def _unique_string() -> str: diff --git a/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/vocab-hash.txt b/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/vocab-hash.txt index 84b13c9d946..566ac2c337d 100644 --- a/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/vocab-hash.txt +++ b/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/vocab-hash.txt @@ -1,4382 +1,4382 @@
[4382 hash-table lines of this test fixture regenerated; the numeric payload is omitted here]
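Returning to the dataset_generator.py changes above: make_lists draws one random fan-out per nesting level, and make_array_for_struct draws a Bernoulli null mask via rng.choice with probability weights. A condensed standalone sketch of both patterns (simplified stand-ins, not the module's code):

    import numpy as np
    import pyarrow as pa

    rng = np.random.default_rng(seed=0)

    def nested_lists(nesting_depth, lists_max_length, rng):
        # simplified make_lists: integer leaves once depth is exhausted;
        # integers(1, n) keeps every fan-out in the half-open range [1, n)
        if nesting_depth <= 0:
            return [int(v) for v in rng.integers(0, 10, size=lists_max_length)]
        return [
            nested_lists(nesting_depth - 1, lists_max_length, rng)
            for _ in range(int(rng.integers(1, lists_max_length)))
        ]

    ragged = nested_lists(nesting_depth=2, lists_max_length=4, rng=rng)

    # Bernoulli null mask: True marks a null slot in the pyarrow array.
    null_frequency = float(rng.uniform(low=0, high=0.3))
    mask = rng.choice(
        [True, False], size=8, p=[null_frequency, 1 - null_frequency]
    )
    arr = pa.array(rng.integers(0, 100, size=8), mask=mask)
    assert arr.null_count == int(mask.sum())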
-17855463731522440261 -240752528220 -27920040887059601 -11078361536363433136 -12517601 -15885957841278600403 -518718202 -805438326 -2621553 -1550910461 -2411070513 -59965836 -13012951802392676509 -97518103 -2625321602295859611 -30277976 -546374457 +3701601059573925529 +16166203682344470241 +6101795981361546864 +15289397371128186695 +7569568047215545466 +18411981910273949729 16759426304739641933 -259654328 -27356063970624739 -1873618458944931675 -6209987959894902621 -5728764444739437994 -18413109988782047308 -13885455448020813663 +48431492 +24535874148371011 +14024943 +59900299 +105775699 +10770155859627543824 +71369196 +9870724570219682 +163119765 +2530739313276357975 +5052785364214352114 +805372789 +5652457623480305518 +644809585 +816841645 +2556016 +4501477955215362649 +4502324021619918399 +2150364451440035988 +6156455943246842659 +1873618497637649718 +12309852946450942075 +3660444556051220001 +11103300151687644832 +8714520725396523830 +5461104765611607541 +27356033875641745 +5352348805862394041 +2012415014 +5151629580948802356 +5374107 +154468975 +108593749 +62718382 +16843031 +28311895 +1107456968073808590 +11490081257974859839 +16633695840000739887 +9386257335747873389 +4959080478982475006 +11408348231855703653 13464164481390611573 -5514354709969504081 -6364097374632348674 -2676033351739376985 -1136798196293306910 -5299098874403555921 -2120987217453057458 -17306856587979066781 -1873618532028844481 -5572365145471912335 -18412263926676652075 -105382480 -5303047039553965447 -9881712940254169714 -152830562 -8610102806501591788 -15524263781940136850 -14282671233461718187 -2857298572705729021 -29330122900898936 -10554335258691243263 -8453377129057749572 -18411417864571256842 -811271050 -1873618489038604579 -4657106642463886071 -2676033356038145381 -514654951 -10757572347027851837 -4237766514325588729 -571999061 -9821766011288487605 -7230168968130792223 -2704904949959166469 -1823671323 -103350839 -46006654 -2755882956846859930 -15289397371128186695 -12662636664722033563 -16318735 -18411417894664929297 -5462796894122411284 -9950019064427710530 -6981729909914862956 -1992588707391932346 -63766972 -6422699 -23407808536904833 -15394822466617412826 -16881139139804531782 -14312300901618944289 -2625321593698061230 -9870724570679212 -5780604289886653255 -3870997034531752803 -2531021389865944442 -10908568553618343357 -1860700038481053299 -196215461 -1801847830 -24183115 -18424247431471827427 -14287090 -417019855960 -71631344 -4391052 -61735328 -18413674012989259870 -2625321597996829544 -17957750408840481687 -9870724568648556 -41943405 -2789363542978135882 -18412827950883864637 -548143940 -22151483 -17257283845880874759 -899112529018292807 -538247952 -69599701 -8510664359869943178 -27356081165698156 -27638084672359236 -12255453 -11400819049620310987 -1321272283 -16881139122607162703 -2359405 -3101815889301670444 -518456056 -9232147856523987724 -3758799212073651272 -3591160524196219107 -154600049 -17946608694533885076 -11500631658516907905 -825323275339564903 -9870724566615620 -39911783 -12318365723907459763 -546112310 -18412827980977537092 -536216330 -2676033351739114988 -11069796553860646809 -7880043043777809442 -451412296787 -18411981918872141859 -11678577273375754735 -8856014234050823647 -105120332 -1309344723 -162464400 -681145240220010584 -2626514825137096412 -6589396841525218018 -356832249381 -6156738032733324876 -11202456151687629452 -27638041680086900 -11243723090649876783 -5726358144768542273 -12498251711624252784 -13702827714901707594 -811008904 +15494005608834598990 +1407386597 8192198 
-8714520725396523830 -514392806 -9960543895307946415 -15287141235608259625 -5727354401416546168 +219257244681 +42598769 +811008904 +2573543610120276856 +5356297048398365877 +7595953279435999504 +5726226297114658480 +2723374776553770162 +1543385872365455415 +11535686880442518166 +15289397379726773461 +5565348488711963913 +504169174 +9870724567205432 +14212253575230457510 +5831598111619679502 +2625321602295990612 +572982104 +813826970 +279448324634 +538575636 +11010253 +68354499 +11243723090649876783 +18331491793766525 +15292781563660995825 +5991347884505304103 +9409295256684857617 +3759645248384009814 +5832726134240118664 +14312300901618944289 +20305615210743190 +13001845694847518363 +2652485274356286816 +6151097653090126690 +2203332276215481610 +18412545964574834746 1808894516123993997 -3686437022462641529 +518456056 +2359405 +1321272283 +71172585 +417019398489 +18895516000586505 +162923155 +9870724570023121 +13828334 +2625321864544389907 +816645035 +8453377129057749572 +11949535972653271176 +1873618467543321286 5249797181178709209 -2625321589399030850 -103088691 -3062219857732765097 -830399540494469985 -530117487457144076 -12454108019635062383 -197984938 -8930986418384079868 -818873277 -16056587 -11526999220155450649 -6160551 -63504826 -7621890105505615217 -11847668763332905754 -10377426660276898779 -1873618519132015281 -18092519415945890646 -15882855708139391266 -7993599274919922706 -2789363538679106064 -2150364451440035988 -9870724570416301 -2625321593697799226 -91161094 -1410073577 -23920969 -7513578521803359945 -22279798815198594 -15520597512816297356 -1023125932615797552 -540017436 -8910392170935354895 -195953314 -644809585 -14024943 -71369196 -1873618476141774348 -816841645 -10906583479868327250 -1454041666728626384 -4128904 -18413392005184749654 -108921430 -468609401971 -16204201012116260706 -99025451 -9870724568385196 -18412545943079354421 -11878630053446878902 +5567604589840172352 +3707523343842937215 +17088205463377873568 +2169005683868174908 +9568723490388248888 +6103488088376871190 +4025969582498383295 +62521771 +18276979644936029994 +154272366 +16646420 +544211744 +28766107292140894 +5177496 +509805280 +1873618519132801026 +1873618544926132491 +7676326001635166459 +7676326031729298383 +869984510486186619 +13146357072728951328 +2000487899013646903 +2449021711964768402 +6155298010574883251 +6098975770044401989 +3189961199463959445 +2676033351739376985 +7995587 +19464489 +547029825 +219257046468 +2021331689141374237 +15288269301218674108 +11705421198335413148 +2508194873 +2625321610894575340 +6097847713031849822 +16064731596255856452 +13701595356116683915 +6364097396127827248 +18413391987988365394 +16364556117061994922 +10296839827164892306 +5403008449516603011 +15858116883009440274 +5833854255738587405 +45220217 +194314911 +10813643 +68157888 +56689033 +114033243 +4287350266942457603 +987047180239768912 +813630359 +18411417886066737167 +18413109997380239438 +11548493110908749415 +6364097387529046615 +5561348123192067576 +5835546388547569431 +5246976935469649046 +13884327378110449525 18204249488608200784 -5566476545725367766 -17951898368652543383 -7558005371879033601 -16542141154387102177 -6316393479032998553 -11694336983993944146 -11427331956784106382 -4662073785906890031 -1873618454645640429 -537985804 -12999620585941961275 -2295119206548507606 -11993306 -1597536180772867045 -5299098844309358384 -8294669686619703163 -69337553 -1873618506235448739 -518193910 -5406444726343502428 -16765215479188031591 -5460499803636172954 -3431717683755289915 -28202117477106938 
-5249797172580910311 -5745384143842643344 -14065038233622153931 -14311172801615955497 -16758489844492275047 -5510538272098551989 -11065487220741573048 -9870724566353399 -5679882735784101879 -259130038 -87097857 -3491703471172619422 -545850164 -18271599167641487963 -5991347923196709309 -1873618458944406678 -7033448275620070919 -812778389 -434977997061097911 -3445982126355516078 -2676033351738852867 -3545799512027105927 -1873618484739311861 -12749251354825264418 -14836382508930370955 -2625321585100000596 -21997756618246082 -8716776809328151764 -15580874176502892132 -3332575624131774585 -4445946672738010859 -5780604328577598853 -2848264744227112681 -1873618441749072804 -257098416 -4930631980557601532 -6877319166685482198 -1005889956380019628 -820642761 -17826079 -23125779236849772 -810746758 -7930050 -8929320279979198383 -9654763076979264499 -11949535972653271176 -1873618514832984063 -514130660 -18066207382028748450 -2573543666009114673 -18613585580197092 -1427238547443354327 -2625321589398768544 -102826544 -5903884228619468800 -4279043148 -7036226112429884975 -818611132 -15794439 -3324580943442478547 -1903640920853056624 -5898403 -1873618497637649718 -1133620887485417426 -10156853965084755435 -63242678 -282723005 -13586095437453200186 -9082058141968173941 -1987794462939089941 -13237708531286474753 -5240852582657493474 -1915314009235720841 -9870724570154139 -90898949 -17090754651615726815 -492307151 -195691169 -11050161621988804687 -23658823 -11623400942792738969 -9304480456320748248 -71107048 -816579498 -23971751058934778 -17869638717220195611 -1873618476141513316 -361675971417279818 -61211034 -1873618501936418049 -3866756 -567411536 -5302201063430292982 -8486888319115725460 -12406930521299355297 -9870724568123690 -11034422950646711803 -4287350254045103750 -5566476545725106758 -1923875870 -547619651 -6366353527348595732 +70975974 +9870724569826462 +816448424 +4211213383 +2162794 +12974919760129952993 +105382480 +5459976661309982295 +21433723812579518 +32432320527074663 +1873618497637255436 +9305858029919208637 +10225919154718574351 8597156797828894009 -13590665243542948895 -13237708561380147208 -4254959725487523541 -2907303882175415846 -1873618454645376983 -9230753948926543533 -11731158 -527827717 -5511666307614640107 -1330643932 -69075405 -28202091681942395 -4727296740454696303 -1992881785902860007 -18301216972081072101 -4076606659425995504 -9870724566091296 +12461042340477994821 +1455946274504313841 +9538952396691934382 +927164962728314711 +5782296426993943791 +9714916684781063078 +16449809 +4980885 +819266496 +2625321589399030850 +10907429529076434052 +257295025 39387493 154075756 -5459976644113468289 -545588016 -12461042340477994821 -223556406340 -32432337723721245 -19595563 -2573543610120276856 -24535874149025753 -5196265237615086368 +62325160 +1495925747 +288043895627 +4504298205224635444 +14835085562484362568 +16881139122607162703 +1839046019115124804 +11923578915473263059 +9388513449772451585 +5247593352907982888 +5153885686374731086 +12020808312486431384 +14848239906707278405 +5405598728725530322 +3653991426073234491 +5566476498435442740 +4333982245204396969 +17007720368052373541 +14458654042895551171 +16885259953617962521 +2676033351739180486 +6877309693745106245 +21997713627284659 +7562235540534921217 +2625321610894378836 +5458848587099997499 +1647838213 +288046714075 +1454859013759438228 +1133620887485417426 +237175467 +810615685 +1418462186 +12162857194684744950 +88080898 +19267879 +7798976 +546833214 +6206321690771522709 +21433680821684597 +1873618480439692390 
+3932922014897081298 +2549492329236335496 +5249797112394286460 +12294570438877711433 +2324121364801391676 +3315661715940248009 +8971880411373045432 +5461104782808583112 +18411981918872141859 +15371922320578972378 +361675971417279818 +90898949 +13390152586296232130 +492307151 +13522668389390157414 +538182415 +10617033 +12498251711624252784 +22085946 +1987794462939089941 +425617786716 +1730937871 +5356297014005859746 +5569296739846327213 +16881139139804531782 +4196703391028741586 +1873618476141710425 +821147663836514852 +3158171969379764633 +30176223702288623 17735566651085687884 -6204347601746593065 -1873618484739049815 -812516243 -6152225714402428442 -15291935501556190620 -15505670362359531298 -451411772583 -9484411285755463284 -161940107 -15292499508566297469 -563348302 -506004186 -11238431078799509026 -18323667541285735009 -2625321610894640833 -103179363763488430 -503001580666 -12769025487284210679 -17785259844527786731 -29612147900877606 -15290243377345399572 -17563932 -7667902 -3186488476490139978 -810484612 -1192315333980326167 -1873618514832721746 -15292499491370961900 -513868514 -5347351719937377689 -45220217 -11775490430040476325 -12240192446106372977 -35324256 -2396555433535145871 -7409502855497715015 -7888341864134085054 -4278781002 -1732546121802517809 -2374936041605498895 -21433680820701635 -12189960762281954023 -869984510486186619 -3598203394278688718 -6103488079777762245 -72876542 -16990917635978692369 -818348984 -15532291 -1146796961722731823 -17761874897365304540 -62980530 -4534407021717882867 -5636255 -32714379920409891 -12552846396214610071 -6262673798361580735 -2528483177756102046 -9870724569894177 -9297735470756268616 -5831598115918776853 -32432303331018178 -6064762127302393958 -6156455943246842659 -23396678 -13500652 -16916327697533962956 -70844900 -816317351 -18411699885273055253 -5884848047378859255 -5837238405281154301 -14311736903207619026 -5141736951422061236 -3604608 -31022281504523376 -3599049409094225259 -577045344 -2974323816123992770 -8021450341214588326 -3577503648415550265 -509805280 -9870724567861628 -11098517635487303139 -7462549834646555859 -98501157 -5779476207078475458 -219257375260 -490013379 -4222974949961697922 +1427238547443354327 +10223260478367337870 +10720606758114626648 +70779363 +105185869 +162529937 +9870724569630759 +24904017 +2681814701524780811 +1320879066 +1584661506 +644219759 +13435115 +6097847786116483627 +12477949191893683608 +6925759835249836137 +27920040887322186 +10003084053115964048 +16253198 +153879145 +2625321589398833886 +257098416 +4784274 +9103100569952650951 +12474564753552836994 +1495729137 +62128549 +9774054990929462949 +5356296971014964874 +6153353870293665804 +9568883447315500158 +1915314009235720841 +16655465042802838677 +14866462842593414402 +2676033351738984017 +546636604 +535167753 +42008942 +30540122 +6365225483234117329 +7602365 +282854078 +2625321610894182276 +13307798926833551183 +10913926882465549337 +15906307047154976446 +6104586261131037638 +8483828720841721486 +15287423226215073909 +17785259896117529586 +2785415278947600352 +9000175594581527004 +14425661002709010016 +5513226652957347114 +805679481429165719 +17859691850682797212 +9181555677596944971 +1363739614 +9870724566615620 +537985804 +572392279 +15175534989820758889 +1873618476141513316 +2152780467316001469 +12601357272775920269 +16765215479188031591 +6534429686359852912 6366353553143235674 -3158171969379764633 -21365044 -27638058876667848 -29330140097217635 -1873618454645114642 -2703776923039566000 -68813257 -279448782049 -814285726 
-12237654319976351671 -517669620 -5779476284463187670 -10375505326587315831 -18411699915366727708 -6205475624366966000 -3307734082 -39125348 -1087507565178193378 -545325868 -15986098390340470919 -223556143025 -19177592590632702 -8865366478519731984 -19333416 -32432337723461001 -812254097 -11305519054433421356 -1873618484738787248 -5105416417023100899 -572982104 -505742040 -563086155 -104333894 -8070528080642443989 -11327137566841769230 -2625321610894378836 -16377260960560187819 -15586729198848181726 -1873618441748546884 -18413109971585663048 -4825924017323379312 -5915592292141435844 +12689613402799605860 +9138963602338286574 +104989258 +644023149 +361131345578 +816055205 +9870724569433729 +70582752 +1309213649 +17634738504986593825 +5639662680184522626 +6316393479032998553 +16340493341965880015 +5344573059048999857 +34124461934314600 +5994450030541998229 +2625321589398637514 +2676819007 +15515140772745448064 +498702419026 +227855238971 +4587663 +16893851890367073119 +14264208198271043974 +555090760 +818873277 +61931938 +16056587 +8821966780582857359 +18411699885273055253 +4861149623842704773 +18413391996586557524 +18115578910873816258 5832726151436896491 -17247780946628644032 +365262179507571896 +16896582888638318388 +4445946672738929841 +17186370630874106258 810222466 7405754 -11549275701007551889 -10161648502327149991 -570950482 -1873618514832459339 -313841222762 -4452458274095237609 -1445774942907271091 -6101795934071424788 -92406286 -5293539447540681024 -18331491793766525 -197198505 -11199980773228349986 -32432320526091507 -818086838 -1997667722089860216 -2524806027085153844 -1964966944 -15270143 -1370042529145686776 -5565348523104797810 -18331539082773742 -62718382 -2012415014 -18413110001679335503 -5374107 -14282027259104724924 -10375505339483621145 -9887461037680036022 -1873618544926132491 -4662355883991631380 -18412263939573940270 -157614716 -3295137431799204142 -9870724569630759 -491782859 -214958343888 -16875205763331852041 -7241607903360452069 -5408471212899110030 -23134531 -18411417877468545037 -27356081166681957 -644023149 -70582752 -816055205 -3342460 -5246976952665638015 -14212253575230457510 -576783198 -1842511416005692464 -806159226 -5566476498435574920 -15292217517958891614 -13516735047310051359 -5728764487730398405 -468608617008 -4025969582498383295 -16044698410490725659 -1519546451849645365 -9870724567599405 -5566476545724581156 -5619444426388998007 -98239009 -547095362 -27356033875641745 -219257112483 -8140646021471143544 -4713167439824750602 -16357059045845960667 -5462796881224795644 -9138963602338286574 -21102898 -10905173367761798655 -13701595356116683915 -2477484405147109478 -1880166538706292058 -11206864 -1283692271244348427 -68551110 -5885543833259674054 -18413673995792875610 -2352415791 -14947075702982868 -5299098870103476096 -681145240220994278 -163447447 -331038328206 -38863202 -96207382 -153551462 -2625321606595348609 -5461104757014004985 -10744889200825601240 -1988559907 -258343605 -6517011693716180143 -535167753 -2530175340657839273 -811991951 -15291935475760762248 -4397798264919820154 -18413674025886548065 -12109395139072755174 -475082778886408323 -104071746 -161415815 -8697110475982376165 -15584540329550678645 -13669583335851559254 -2625321610894116800 -1873618441748286746 -18412827963781152832 -819856323 -6209141854797957852 -1783548230307677653 -18411981901675757599 -637928298 -7143606 -15855332315905657597 -2625321864544389907 -12020808312486431384 -3076135121411313050 -10139438201185111279 -6152225744495577231 -33560368941368890 -210659313158 
-4278256712 -27638024483702949 -24904017 -32432320525830439 -13263754581809432790 -817824692 -15007995 -359800716494834349 -18613516794268696 -9839328478246341893 -62456234 -5111959 -18411981931769430054 -16219982623696489082 -6261827792145090364 -7692717626264324682 -42664306 -13806855580317125108 -9870724569368358 -16269555352897260337 -214958081659 -11214563466575480865 -15636771529559117046 -13271165719268362246 -2652485274356286816 -538968856 -3784724792312663401 -18263821886743185772 -1986666427421953426 -5565348480114297669 -5352348827359053328 -12976359 -1873618476140725820 -421319345246 -70320604 -11703165067112811597 -21715697223994697 -3757107087862401328 -60424594 -3080312 -10697899350700788395 -1873618527730534170 -468608354196 -509280991 -50528646 -1193603335023233930 -16635669954819197974 -15426482629288462533 -5460499803637156023 -2625321602296318353 -9870724567336570 -97976862 -8818864638845060491 -14288223544298637564 -88080898 -6996745855548787140 -5566476571519223063 -546833214 -220421203678071202 -31022238513759415 -1873618458945389823 -6406389097441592980 -20840752 -813761433 -27356085465188671 -68288962 -5865888353649363875 -109394696450803010 -12213481117926952067 -18413391987988365394 -10944716 -517145329 -5723537903358642458 -21715753112570631 -7758478083289188556 -10675690836223986039 -153289315 -95945236 -11547019543992076059 -9649086479758069023 -2625321606595086582 -258081459 -544801575 -5887799994573980828 -2845029447323880298 -18809125 -8510103668314541335 -6205475701751155414 -1990332636357069057 -429916882098 -2673382969485886910 -1873618489039064439 -18413392018082037849 -10914208898869168291 -3773122177597967623 -161153669 -103809598 -14107087915135404740 -6366071515245381876 -18412545955976642616 -15289397371128645360 -5462796868327967227 -1402930148 -28202057290482949 -797695489810761887 -16777494 -18116142943679220675 -5142301044413893172 -17219576355390295334 -5249797112394286460 -13735950183222348532 -6881458 -29048192479791616 -16896582888638318388 -14517406836956661503 -5458848655886518922 -313840698753 -5197393273133271298 -3861350810962691992 -6375653898722412075 -16885380374869314205 -361129707266 -210659050964 -29048123694646491 -3017170418691476659 -1873618450347593089 -15290243360149277503 -14745847 -72090103 -14546784569801180959 -7431889721301470079 -6364097387529111599 -2435475427475262665 -1873618497636600365 -6151097734773868363 -62194086 -17083693200934636558 -32150372909516328 -4849811 -3172873313800750756 -2150364429944620611 -3862478902367620470 -9305858029919208637 -2625321597997287853 -2508194873 -491258567 -1408762855 -5015996636573993090 -2414921941537785811 -538706709 -5734260728554980678 -22610237 -12714212 -70058456 -6208295882974168451 -32714336929384395 -16643035121679272213 -20023641798084435 -4770547828131824981 -2818164 -1930668198955452820 -13726068529822894439 -468608091255 -5569296714050766113 -17490170188584258190 -8694008299851745161 -7073102484926630551 -155058804 -97714714 -40370537 -2625321602296056238 -1703347206 -15895039144349470066 -5352348805862656188 -3068049059797011246 -5880738612678821404 -12309852946450942075 -33560429128451329 -15289397384024950845 -4767727591019973374 -10682570 -10233718743719545342 -850088361543927300 -2792183694107936667 -1107456968073808590 -5759560470823897206 -162923155 -29612216687004362 -5875369269012203157 -95683088 -294416195335096411 -22279760122415532 -5639662680184522626 -17619012653768771484 -13237708544183762948 -8550520059753138843 -27356042474686002 -249849483538007723 
-544539427 -13390152586296232130 -10906513561824594910 -18546980 -1873618489038801706 -2676033356038342054 -6313103561496791450 -2063139881 -6848542126596623056 -160891523 -103547450 -14101293042239958 -6151097653090126690 -1584595969 -12424382439595706534 -17698252132056434004 -4129856573689694799 -16885259953617962521 -12393440069873436875 -32432320527338097 -21433680821684597 -8617826180017097033 -1413046597527668667 -3973491001936446780 -819332033 -17305802226190387588 -1873618467542665344 -16515346 -6619310 -6206321690771522709 -4089771542585346905 -1223976962194278208 -13487493291780736605 -2487491354099451134 -8854886172739175692 -9870724570875039 -2625321593698257851 -1535116279 -6262673798362565305 -91619849 -493028049 -5352348797264856883 -8143564249694210398 -6151097683183797493 -9386257309953099582 -196412070 -3865299044899163405 -71827955 -18613366323088485 -18157949162008873831 -7562235583526800081 -817300400 -4618470194090937269 -4587663 -3932922014897081298 -61931938 -1873618497636337289 -2522831856378710008 -6364097413323754682 -6053028402293443390 -42140016 -12287601267178473523 -2625321597997025900 -538444562 -15991329612793777185 -15291089478142986477 -12452064 -2676033644081056812 -2556016 -16508579235574254010 -805372789 -59900299 -14787093348585572176 -2575517759332551933 -2412665810316625225 -7730749911729375728 -6155298010574883251 -10488220504998020326 -1311572948 -883931539946605906 -5352348805862394041 -2786543383251193103 -546308920 -3346269252 -5782296426993943791 -4469799173763958889 -6205475671656957491 -7872981661881076049 -18116424960081923281 -2676033351739311464 -516621038 -1465168459078698840 -5677488692584514734 -105316943 -4562124351240801677 -5245848874158263187 -16432982289349543214 -162661010 -3971798877726246151 -4787251587800828866 -5875369294806846690 -12217235256243064050 -95420943 -5354604868299326678 -4502324021619918399 -544277281 -5940918086979029952 -2014710471177341259 -2140013610 -1873618463243635741 -18284834 -2676033356038079832 -10531295876509927029 -5458848625792321791 -18411699898170343448 -7410231625909407077 -3478039985316562895 -6204347606046083061 -31586254122912349 -6829167320236755019 -27920101074341046 -13165236096819726043 -32432389312220424 -571933524 -5727354401416743090 -10225919154718574351 -4127600472563058730 -160629376 -103285302 -8483828720842049762 -15740334315622960494 -206359759935 -9813006656186419950 -9319686106503382840 -5515085278788979157 -232154663489 -26149204 -6208295848581203181 -3094190453106412515 -6520986101609793850 -32432320527074663 -5245848925746038203 -5942328186188203485 -1873618467542403595 -16253198 -15881445561639371975 -6357162 -63701435 -15515478115209971466 -5833854247140395797 -283181761 -19177532404009207 -16567374854657149772 -684134257893509654 -9870724570613070 -15680489859993767209 -12826571498698443033 -2625321593697995819 -10329316755526125416 -10754752208794748192 -10758418391935812957 -12105446909435186010 -3143159678306028631 -236453432350 -540214046 -14848239906707278405 -29330157293274228 -684134210602468610 -817038254 -4977791693940394179 -71565807 -1873618497636075077 -807142269 -61669791 -11287403619712895066 -4325515 -13819298136066198 -7734678113259293802 -6098975847429179176 -99222062 -18056758355458722638 -9870724568582655 -16224960573811657069 -2625321597996763849 -4078298757842341053 -17625510063045740642 -10528906628815718922 -490734276 -5412367062202975465 -22085946 -12751507524739009261 -538182415 -12189916 -18413109984482951243 -2541195915421354200 
-6671860954713623381 -2893509029140760671 -69534164 -747829823970020707 -6770804071406897080 -2293868 -5566476498434524382 -6534429686359852912 -18412263922377556010 -164430493 -9870724566550039 -154534512 -10167299845199168903 -12754891682880490747 -5250413516934022944 -3315661715940248009 -451651625195343029 -32432333423379563 -5941764217869305943 -2141783083 -283748271730 -10161648493728303880 -5240846595623881868 -67502526 -15618641120352995308 -2676033351739049517 -6205475697451599682 -4023356732265137752 -14986955239351847842 -31304272112126853 -516358893 -2207492698791414354 -477207135345 -1309279186 -105054795 -17859691850682797212 -162398863 -4238330517036600601 -152502880 -18412263952471228465 -257295025 -10905173350565414454 -17498716255300421272 -8881019260503721949 -18022689 -534119176 -18411417890365833232 -6293435910568086045 -9374458755688828226 -820839372 -6153071780807051278 -5909364179964069981 -8126661 -3735453693364143828 -6155045908522469290 -745740842898098858 -2625321589398965240 -12142525752872799042 -160367231 -17958290734101235336 -9523554809025136564 -16892239439269464715 -15289397371127860096 -1736311827 -15991050 -63439289 -6095014 -12484855343804124176 -9658025172156550406 -18067928153034001057 -292345808939 -16572875051796793000 -10542598463376395267 -12772641161582545873 -18413674008690163805 -1544487931 -14737352740221028816 -282919615 -12808641794728789765 -2625321593697733840 -17128487303121020 -1706624008 -14101026494875963 -11214563466576463780 -18412827946584768572 -11966722661119888545 -6156455943247300775 -5300226909920168653 -6004915412369541960 -816776108 -4223816177647290930 -71303659 -1873618476141710425 -12477949191893683608 -417019528294 -9511403338599564690 -4063367 -61407645 -2543805385922512178 -9870724578216632 -5407707525201267705 -9870724568320021 -2564752444 -98959914 -15494005608834598990 -15140097999495498431 -21823800 -12734096628671909131 -537920267 -18412827976678441027 -11927769 -69272016 -18411981914573045794 -2571498445011814318 -10592171188278987146 -2057911839619745748 -9870724566287831 -154272366 -545784627 -17616192489740896443 -21715680027609308 -16886908734816455284 -583336804 -2246313005 -516096747 -2625321585099935141 -620888934 -162136717 -331037018572 -477206873177 -503001777494 -15592058013925444099 -1652810939277510396 -10531295803425490030 -3205882223899445065 -31304323701671300 -28484129580057898 -1873618441749006513 -16893851890367073119 -820577224 -16904712944498838074 -1394017249 -17760542 -4160689491693538063 -4047541379259827663 -7864513 -14219872676477209184 -504169174 -17244622751296785814 -2625321589398702921 -4278977611 -7239633818635733091 -5462796868326918190 -1334641629 -73073152 -7460569593843485201 -15287141188316891641 -818545595 -9339868219275806468 -15728902 -5382561551670903978 -9373330690077689939 -18413392000885653589 -5832866 -63177141 -438515402871 -2373415502940997016 -2148672322930150296 -168849237244054062 -12339564610979564477 -8327325764367420682 -7630443591734791098 -12608147700378373379 -9870724570088730 -2150364451439708714 -18412545938780258356 -13221120945827219803 -492241614 -4129856608083381232 -15740733274947783803 -15858116883009440274 -1873618476141446514 -816513961 -17564225130023161250 -13697261 -10668197763104573447 -71041511 -5357143003026951378 -31022281504720056 -1873618501936351339 -3801219 -442814170389 -5701610621477129021 -8520914754064026558 -15289397306641222853 -108593749 -98697768 -9870724568058057 -5780604294184830225 -156041850 -5192881006389626514 
-32150304123324262 -219257572663 -18412545968873930811 -5249797099496672683 -11127945220196076778 -9103100569952650951 -11665621 -421318034537 -17619012718254098754 -14443179094226111164 -1873618480440216958 -69009868 -10594427319499622429 -814482337 -13968724050119231192 -28202091681875145 -27638110466671725 -16166203682344470241 -1712194570 -472907842721 -507970270 -15580874172203795679 -23689855033805297 -154010219 -17092164759424403479 -12893049762838873864 -6877309693745106245 -545522479 -5887800020369606783 -14977809576148535095 -19530026 -14105033451515939293 -6795216411027442152 -2543452128325209336 -1385890784 -114426460 -6444189713816225654 -6152225714402364510 -524384476410219715 -17953567922355439196 -17113993018971653874 -573178715 -515834601 -17090754617222956318 -161874570 -1538130937 -47186305 -30458188512103543 -2449021711964768402 -2414448843017751282 -5214737420442796133 -505938649 -2625321610894575340 -13965057806789381527 -970700105235760464 -15223822230290106035 -16285378285009240167 -16940455997476965252 -2601013084734032090 -5248157445900799208 -1580068669843704469 -15043322265989680207 -29048166685607288 -3863606942184311140 -820315079 -17045009756596405420 -29048192480512516 -11510172448171493799 -5885976160280708469 -7602365 -17785259896117529586 -8856014216854897981 -14477731067643038195 -1873618514832657292 -2578187325 -15292499491370895395 -33560368941827284 -13146357072728951328 -17353152791227993245 -159842942 -15530553734630409457 -5569296726948055802 -494159375523777824 -1812923415 -6366353518750729401 -4278715465 -17097308613030775025 -35258719 -1899651063193471062 -12103109825679658143 -6364338522051512284 -2429880031182916564 -11621189233770302317 -72811005 -15466754 -3880024017885400135 -818283447 -62914993 -4076606625033226775 -1873618497637320883 -7746405201714873917 -5570718 -10859426818132543221 -6925759835249836137 -3506237898852665380 -23407812836853915 -1873618523432225060 -17166316876055971050 -18008952305986046279 -43123062 -9870724569826462 -7410173966093388838 -33560399035500221 -511599051947 -214958540605 -13237708557081051143 -20587696099952690 -15339421027537585423 -6104586261132347910 -11103300151687644832 -1456931819 -1873618450346281005 -9181531069949872018 -14650572868605052119 -17783567759008991682 -575239712866634722 -15288269284022357372 -6206321673575138470 -644219759 -13435115 -399811749952817933 -145335345147610979 -70779363 -6366071455058494624 -7529998377695250462 -519635711 -3539071 -576979807 -9568723490388248888 -634323816 -13012951802393594980 -853643387796785445 -98435620 -28766107292140894 -9181555677596944971 -5195701200510977145 -5129024196560096606 -5831598124518278362 -4844858457232050089 -219257310372 -7569568047215545466 -5461104800004441485 -1518418407735101149 -814220189 -11403474 -18005251247539029895 -10333839787251271664 -1836516380 -8054758354584013306 -507708124 -163644058 -9001701177466488459 -2625321606595545096 -153748072 -4787251587801811388 -39059811 -545260331 -2036204584 -5356296971014964874 -19267879 -9714916684781063078 -3055188874828713383 -14576212124415364447 -2150364417046743283 -4662355849599126556 -1372824966366170355 -1318388695 -15289397293744393060 -8423108281783224429 -505676503 -104268357 -477206348880 -5831598081526006949 -4625631396377398109 -2625321610894313322 -6206321759557388696 -12237654281284815334 -17236251 -9391897711091583990 -3891732840317912522 -8856014216854636141 -5758903550139959418 -7340217 -638124907 -810156929 -6206321690772243584 -112132697 -15287987228927658628 
-339636063086 -7721139320100816372 -684134305183500639 -22279768720672168 -5831598111619679502 -14814059355306855043 -4211213383 -15290243360149735302 -18411699880973959188 -15204606 -11507341268100646834 -62652845 -6365225483234117329 -5308570 -3491703531359374171 -17791918762976347730 -4127600455366674792 -11130039777759856047 -13951205954302381098 -18115578910873816258 -8659114857360722535 -6153353844499089111 -157549179 -9870724569564298 -16327183209838150989 -491717322 -214958278120 -32432303330691092 -17684252729367202593 -16965951797418331227 -23068994 -2272905061487347697 -1873618450346019367 -7515799761807542411 -815989668 -2576363817137867614 -70517215 -17763448248357489818 -13172970 -3276923 -806093689 -17621268802185464283 -60621205 -18411699911067631643 -576717661 -1685722535145180234 -23689824939607125 -17256155806064642777 -5516892801706297876 -12982659022915898414 -9870724567533791 -15515140725455259155 -547029825 -219257046468 -4180850416920431050 -21037361 -68485573 -11141327 -813958043 -189614828176542708 -1873618480439692390 -279448454880 -16253215886083360174 -572110149897422243 -9896616181508082455 -153485925 -8021450371307931626 +2507605048 +17607182983838566334 +546439994 +2679637056 +41812332 +99156525 +9140909979694467183 +11742834345080916462 +9950583114428779661 +18411417894664929297 +17160329975787685834 +1518418407735101149 +18331556279224644 +15289397293744393060 +13950077918785243724 +15287141235606883137 +2789363555875490728 +491913932 +90505732 +214958474959 +21692726 +2063139881 +9870724566418979 +339635799228 +11740202404159819072 +12769623874203027091 +7171706723174648069 +16156684754098128751 +6208295835683456476 +1873618476141316771 +5882159696614657105 +3431717683755289915 +1873618506235448739 +17166316876055971050 +1023125932615797552 +22279798815198594 +12346762090311779845 +162136717 +331037018572 +13041896 +1733362705 +643826540 +2306802734 +477206873177 +17309267256018536307 +2625321597997222413 +517669620 +620888934 +70386141 +31022281504720056 +7409502855497715015 +6155045934318095559 +18412263918078459945 +5458848625792321791 38797665 -19177566795402134 -27356016680241600 669582195 -2625321606595283106 +27328854 554894151 -5512098557251945790 -9568883447315500158 -1440671446449589035 -4502324021620638916 -3249068390006196153 -15292781563660995825 -821822415 -27356063969248337 -18413109967286566983 -10911952793442192048 -6064503826171693679 -11161692095903435283 -1004761907965660269 -2207210695286917386 -6388664954993575829 -46662016 -5885976061401368013 -104006209 -5572809636517250553 -2625321610894051277 -17955470565775510239 -4661227814082512385 -6368045642960996241 -5463642874544129714 -16974104 -533070599 -809894783 -18413109997380239438 -7078069 -637862761 -6288511205539515238 +468608091255 +15859976 +4287350254045103750 +61735328 +4391052 +6520986101609793850 +153485925 +8510664359869943178 +11050161621988804687 +20869691006257762 +5196265237615086368 +3491703531359374171 +1873618489037883086 +11633356565151157114 +16633695839999756254 +23407812836853915 +1873618519132015281 +12074216556992400767 +6153071832396467338 +16120716880803858984 +5299098848608717979 +17149817495305717193 +18411981927470333989 +3308118261903721908 +5831598124518278362 +7209143 +810025856 +797977540607610221 +98959914 +7470284155521404014 +2564752444 +1727529996 +12318365723907459763 +5884848047378859255 +13222096480399396057 +6314795724398267312 +4397798316509039896 3974700764184054454 -18613559784442970 -2791055594105669609 -4504298205224635444 
-18412263935274844205 -2605266760616185153 -15287987228927396675 -339635799228 -92078603 -8501910827968825512 -5991347884504386492 -210659247559 -17284241873202253123 -16893851873170950707 -651404368114879038 -18411417873169448972 -24838480 -5726226344404977639 -10259573046193883986 -2676958769323838072 -72286714 -6886936648282539655 -14942458 -521143041 -5046422 -13980703149896829784 -1495991284 -62390697 -18199185222634702635 -8834282535679560676 -15925946803693423456 -42598769 -9870724569302153 -5459976661309982295 -11084138473134491150 -5303047078245827995 -214958016090 -12451287838412704489 -5509410202188647833 -2681814701524780811 -10628953736434486617 -9774054990929462949 +5514354709969504081 +2893509029140760671 +1873618514834032208 +5516046791188875281 +1223976962194278208 +14737352740221028816 +6368045642960996241 +3489447322753895731 +21496117 +9870724566222277 +514654951 +189614828176542708 +214958278120 +491717322 +571999061 +6367324841362000185 +10375505339483621145 +8070938101082033295 +5569296709751605280 +1316357237551401394 +12020684574736647824 +15991329612793777185 +10697899350700788395 +16739161091967945306 +3891732840317912522 +1899651063193471062 +161940107 +24314188 +2224234517276395067 +17082847207615301902 +2625321597997025900 +6152225714402364510 +12845285 +506004186 +1733166096 +70189530 +10906583445476542150 +563348302 +31022281504523376 +1873618527730601376 +2530175340657839273 +1873618497636468806 +1873618441749006513 +18412827950883864637 +6366353518750729401 +1413046597527668667 +4078298775038527628 +5565348505908348542 +4022831255438034250 +153289315 +4194441 +15663365 +11700344903086704893 +73007615 +818480058 +296644709200 +95945236 +2150364417046743283 +30740204914804024 +15290525359355331959 +4237766475632413481 +16758489844492275047 +5408700909154273182 +5153885660580611393 +1873618519131818153 +13951205954302051853 +3597357340772992368 +7432602967203907143 +1880166538706292058 +399811749952817933 +10381427610856195682 +4644563108626631009 +14665351642484132 +7012532 +5141736951422061236 +3344434243 +16330874579771459902 +1873618484739705502 +8550520059753138843 +4645667113710455153 +5885976069999102359 +15501473033754248808 +9896616181508082455 +5462796868327967227 +14585410861575638263 +214958081659 +339635406848 +11818157132458100908 +11526999220155450649 +18613533990193666 +1873618450347593089 +3861350810962691992 +684134305183500639 +18413673995792875610 +15530271666638163275 +17621561863500597513 +4238330517036600601 +22279768720672168 +4502606072414800438 +10655291933708585535 +161743496 +517276402 +2625321597996829544 +12648675 +563151692 +1308623824 +104399431 +236453432350 +4279043148 +540214046 +1744643526947965735 +2065251783916127931 +18411699893871247383 +5459976691403131036 +21715680027609308 +5726226344404977639 +15292499491370895395 +18413392005184749654 +1873618497636273356 +32432320526091507 +31304323701802109 +2576363817137867614 +1631882651478526261 +5995296157134227520 +7558005371879033601 +61342108 +95748625 +520094464 +15466754 +3997830 +5240846595623881868 +5887800020369606783 +15288833278135765839 +818283447 +72811005 +5459935770830768809 +9355193091131312182 18411417903263121427 -3865299049198390675 -12910822 -5356297009705911966 -2421359666 -70255067 -2248112069177510680 -3493395634074945822 -60359057 -12654580528992553525 -519111421 -3808100888100343209 -3014775 -13513632858283052077 -15289397310941235057 -8861613698626554738 -9697577994188492052 -155255415 -10381427610856195682 -9870724567271440 -2625321602296252770 
-14512708438227029368 -97911325 -489423554 -4022831255438034250 -30671195 -1873618458945324208 -20775215 -5459976691403654584 -813695896 -12665415616966166285 -5645056620059298667 -68223425 +8430474765433179073 +5247593352907000017 +27638110466671725 +32714414313178248 +9234597529149245860 +3229097897723824621 +7449919019336600171 +2413828615876118471 +2414448843017751282 +6101795942670075923 +7697026996393675938 +31304285008889193 +15777957523720635747 +3143159678306028631 +11065487220741573048 +6815921 +2140013610 +14282671233461718187 +9230753948926543533 +98566694 +2625321585100065781 +5382561551670903978 +259130038 +155910777 +87097857 +18284834 +282067642 +545850164 +33278352538406314 +21433680820701635 +5625593289148533238 +10512763733196344280 +3784125305237867347 +1873618514833639109 +32432337723461001 +1873618454645376983 +15292499534361593441 +5133829485925763690 +16904712944497920326 +5511666277521099409 +5622264654903379074 +571605843 +514261733 +491324104 +2625321606595414132 +21102898 +1385890784 +524384476410219715 +17257283880273643533 +5195701200510977145 +10280579133800254203 +200191596416994196 +1873618476140725820 +13117263636444153530 +15096032016463431129 +6729754102968812754 +18412263926676652075 +31304272112126853 +12118995007347033900 +1996555300 +9870724568648556 +540017436 1319896024 -2390363305266056430 -17634738504986593825 -20305632407192782 -17462509665872383079 +4236074403011431549 1606616067 +195953314 +23920969 +104202820 305243098454 -163119765 -48431492 -10590197086357423689 -2787671431665157349 -6366353484357502971 -18413674021587452000 -17620986833073014515 -105775699 -20869665212206112 -4445946672738929841 -95879699 -2625321606595021110 -10906583445476542150 -18412827959482056767 -17205553309096938840 -12294570438877711433 -5461104782808583112 -544736038 -9950019055828534995 -5991347927496394467 -811664269 -5403008449516603011 -18411981897376661534 -572392279 -7677136701370927115 -6155045908523191668 -18067928196024961188 -20587511236070012 -103744061 -161088132 -335336768790 -6155045934318095559 -13322381941750499717 -15291371425760087333 -30740222110467489 -5245848925746498573 -5349308051975768286 -4548309565419816229 -255984301 -5461104787107351969 -16711957 -10906583475570214623 -6365225453139920066 -6177363118375897150 -6815921 -7032232753418799293 -5558136817694803400 -4030203865610717075 -12718336251608304605 -18411981927470333989 -1545208828 -15287141235606883137 -5837238474067478018 -11705421198335413148 -5524868651610213131 -210658985303 -6098975770044925746 -24576334 -13151687854617134836 -4662073803102881076 -72024566 -817497011 +12452064 +5248157445900799208 +31022281504130688 +1873618497636075077 +1454041666728626384 +1873618441748613384 +15289397310941170468 +12999620585941961275 +5875369294806846690 +18142877345697171235 +2789363542978135882 +18411981936068526119 +12057284352529139627 +356832576945 +17092164759424403479 +1460339693 +3801219 +256115374 +1987904546 +1964966944 +15270143 +33278386930321618 +442814170389 +818086838 +17462509665872383079 +6206321759557388696 +5408471212899110030 +1873618523432225060 +17353152791227993245 +6261827792145090364 +15223822230290106035 +15287141218411547437 +7576852735584046364 +9714916620293637762 +31586271318836879 +41025899 +2625321585099869531 +223556471268 +545653553 +4023356732265137752 +98370083 +1531970550 +6619310 +23689824939607125 +9950019064427710530 +12424382439595706534 +1873618484739311861 +18331530485106038 +23407864425745813 +797695489810761887 +15289397353931868100 
29330157293733695 -17096567568145714575 -1454859013759438228 -14680310 -4784274 -62128549 -1493907215600323645 -6364097387529046615 -12583654612056476062 -12851509922494416016 -1495729137 -15287141218411547437 -828143439367899804 -2523959969279970191 -3919394969679695174 -7595953279435999504 -2625321597997222413 -491193030 -1839046019115124804 -7241043922144659849 -18613499598604650 -18413391983689269329 -10594427319500605883 -12648675 -4861149623842704773 -5782296448490276391 -5516046782590617836 -518849275 -10015828607276288922 -15662612681012938353 -2752627 -60096910 -5133829485924779401 +6101795994259359091 +7692717626264324682 +5299098870103476096 +9813006656186419950 +3591160524196219107 +4129856608083381232 +2755882956846859930 +5352348797264856883 +812254097 +5524868651610213131 +11124082762859154482 +2857298572705729021 +19177566795402134 +18301216972081072101 +2625321606595217579 +6152225753094686522 +548471621 +5512098595943810546 +6584302816815089825 +11092808190891527547 +5941764144784016877 +18412827959482056767 +14428481252717692252 +5301355009924597043 +12284124821458521275 +3577503648415550265 +9870724568450966 +69599701 +7431889721301470079 +46662016 +35193182 +104006209 +12255453 +17877509356004052233 +11069796553860646809 +4347925833116091776 +10590197086357423689 +5570988786672405753 +9297735470756268616 +14637903957824965363 +539325825270679628 +15584540329550678645 +17247780946628644032 +15073532 +1574831104 +72417787 +152699489 +496763604 +577045344 +817890229 +3604608 +6204347606046083061 +12771845711499103316 +15290243377345399572 +11127945220196076778 +6208295882974168451 +11846037961957312985 +13106160036035691955 +1906986264008723742 +4657106642463886071 +9198943117821938833 +15270695523793439937 +5246976952665638015 7003516464553396964 -12903069678853164419 -2625321602295990612 -97649177 -259785401 -5464488953846367762 -546505531 -30409049 -374027977988 -1396769762 -21715680028329254 -5637072609524124450 -7731877951544692100 -1873618458945062288 -6767393152337644543 -9467310877347154547 -5429433323061448040 -10617033 -1730937871 -107356700000258304 -425617786716 -451412690018 +10956724774926813288 +820708298 +545456942 +63766972 +4585257123188181630 +6422699 +17891616 +9117971362513815455 +18413674004391067740 +14597841535548131734 +9772926989806013640 +1873618458945784014 +4522983015860209939 +13806855580317125108 +15426482629288462533 +3506237898852665380 +5787083718919720300 +13322381941750499717 +13237708531286474753 +32178524 +490930885 +2625321606595021110 +20709678 +1706624008 +513868514 +245051624454 +525337347 +16483453074211931840 +12217235256243064050 +4794173760728861833 +5347351719937377689 +18411699902469439513 +30458205707308380 +10750398672578676906 +13351948495571782591 18413392013782941784 -12020684574736647824 -105513554 -3541851256594893702 -16038494049631274933 -497025749 -4661227783988316231 -18412545951677546551 -5565348467217401524 -14428481252717692252 -544473890 -3344434243 -2169005683868174908 -5993603989931887912 -12972952285742288 -13117263636444153530 -811402123 -2676033356038276482 -1873618514833639109 -514786024 -572130134 -160825986 -1938490399 -10280579133800254203 -285938493736356261 -6425213859614951480 -103481913 -11364576519499679975 -1881294612915292853 -15739206202722094240 -4397798316509039896 -17011915733784398286 -1873618446048496233 -14383326641327005 +17088031212435737557 +5105416417023100899 +11427331956784106382 +3698377064120454404 +69403090 +1629160452 +161153669 +17153706162049649384 +103809598 
+15580874133512784221 +7872981661881076049 +2544766567488424310 +8818864638845060491 +1597536180772867045 +17631161371525515669 +4977791693940394179 +29048123694646491 +15288269284022357372 +806224763 26345813 -6156455960443095577 -14975681650483333306 -819266496 -16449809 -15288269301218674108 +72221177 +14876921 +60752278 +3407997 +152502880 +2625321593698257851 +5675796551176948530 +5337229686545450161 +10649664295314066054 +18271599167641487963 +5741055947585816645 +1873618523431831649 +9763237054719266042 +5778348175860436286 +11906248855590275274 +145335345147610979 +27356063970624739 +2676033356038407648 +6226088 +7285785859232107205 +2036204584 +109445719 +5779476228575003910 +13117159209037203716 +97976862 +545260331 +1839046014815569788 +23407778443758207 +13885455448020813663 +17091318705916150977 +14749580058850363919 +32714379920475611 +5511666307614640107 +5780604362970367638 +18412263935274844205 +4767727591020628301 +5885976160280708469 +1396769762 +811860878 +5996988225456246061 +9887461037680036022 +490734276 +295544243748734701 +30458205707109873 +9950019055828534995 +17090754617222956318 +5566476545725367766 +17648172271756969893 +15289115277341755866 +30176189305784773 +6205475701751155414 +9940375093616053431 +5728764444739437994 +15140097999495498431 +6523880655054768550 +5727354401416743090 +210659313158 +1456931819 +11862232 +527958790 +103612987 +4278256712 +9870724568058057 +69206479 +1997667722089860216 +12339564610979564477 +5243108696682924272 +23125779236849772 +1873618501936285414 +16044698410490725659 +13669583335851559254 +14425661019905001872 +13681696359428851594 +16161820259100396848 +72024566 +24535844054893958 +3211386 +817497011 +9870724570875039 +60555668 +26149204 +24535874149025753 +806028152 +232154663489 +507839197 +14680310 +2625321593698061230 +15273884660262766886 +6633286351216577743 1873618493337504776 -5782296461386581535 -12162857194684744950 -16633695839999756254 -6553773 -6206321690771457172 -5411573444917201071 -14273081993166850387 -17297538988880889355 -9870724570810095 -339635275824 -101450287 -2625321593698192308 -91554312 -3812049113439014303 -492962512 -15289397349632182266 -342928503145892901 -9257009393629660721 -13674941621707869313 -17952462371364276975 -24314188 -7676326001635166459 -12622921449567619867 -14471968401314024391 -14418163 -71762418 -4522126 -1873618497636273356 -1873618523431177265 -31304285008889193 -2625321597996960522 -42074479 -18895601982637667 -14883032307819284131 -32178524 -490930885 -5459976661309458015 -194314911 -1873618454646032908 -9386257314251803173 -13950077918785243724 -5831598146013367591 -5882159627828332650 -69730775 -6100103913039400051 -15744000533156660854 -12386527 -518587129 -59834762 -9231865831523354279 -2490479 +3094190453106412515 +1087507565178193378 +8481290603310483360 +16885380374869314205 +5412367062202975465 +1372824966366170355 +2543805385922512178 +27356033876298083 +2676033356038210923 +820315079 +63373752 +6813843502384089093 +97780251 +258343605 +155124341 2148672331528407961 -2908260051937332390 -16876615841046071902 -9950583114428779661 -154731123 +6029477 +17632571483632043275 +18349504802666319185 +12133908888583014197 +18412827968080248897 +6100103913039400051 +5249797202674912471 +1873618458945389823 +16271603835638319461 +605557034314828631 +5881866613803452649 +3544107379218385465 +7406761759861377295 +811664269 +3234866947851553000 +43254135 +10675690836223986039 +3346269252 +17946608694533885076 +5459976644113468289 +15290525376551717170 +5946696443085589628 
+3477193953305167424 +5353476789790574588 +28202091681942395 +5944020284604679310 +8143564249694210398 +31304332299601191 +8856014216854636141 +160760449 +28766090095429261 +5727354401416546168 +421318034537 +814482337 +103416376 +210659116497 +15505670362359531298 +16893851873170950707 +15515478115209971466 +46072191 +11665621 +9870724567861628 +23134531 +69009868 +14105033451515939293 +1784112314702629632 +32714336929384395 +853643387796785445 +4713167439824750602 +3812049126336301758 +16130159995999161574 +15289397371128645360 +3910652327921846506 +519111421 +71827955 +9870724570679212 +2625321593697864817 +60359057 +326737659857 +163578521 +17219576355390295334 +197984938 +817300400 +3014775 +18413674012989259870 +27638084672359236 +6156455943247300775 +18115860927276518423 +18323667541285735009 +5572809636518234139 +2581508047683782932 13237708539884666883 -30458205708158447 -2964529530791004471 +2524806027085153844 +2676033356038014277 +31243224015702806 +12982659022915898414 +510460641 +464308734399 +2219404100148201532 +544867112 +438515402871 +258146996 +5832866 +97583640 +63177141 +21715697223994697 +14657391675119111356 +18411699911067631643 +1873618514832657292 +15339421027537585423 +3545799490531822688 +16916327697533962956 +5299098844309358384 +4127600472562141528 +8920676095134336910 +5133829485924779401 +6151097665987347595 +6152225697206437786 +1706034183 +15897859265386515143 +43057525 +536216330 +943759021439519007 +10939233345785957508 +1746899701160150410 +1384869222252743071 +5881302580997523790 +107356700000258304 +11287403619712895066 +814285726 +68813257 +279448782049 +539034393 +22937920 +210658919829 +493159123 +91750922 +5831598081526006949 +103219765 +45875581 +516096747 +9870724567665640 +2625321602296449309 +5568582435113995179 +6154199872212897041 +5356297009705911966 +9001701177466488459 +6425213859614951480 +5129024196560096606 +3971798877726246151 +6471725260172231870 +18412545934481162291 +6155045908523191668 +27356042474686002 +11226550812567014265 +9870724570481756 +417019855960 +12512221777814685432 +2625321593697668091 +18116424960081923281 +14287090 +2818164 +71631344 +18895601982637667 +18066207382028748450 +5353476806987745228 +6100103891543657570 +6996745855548787140 +10594427319500605883 +575239712866634722 +3870997034531752803 +7239633818635733091 +29612147900877606 +8865366478519731984 +5352348805862656188 +8709732826087622782 +18412263943873036335 40042856 -2933734509745341832 -5459976691403131036 -1730675726 -1873618484739705502 -2676033351739245930 -15215179494928287321 -14866462842593414402 -5463642917535614049 -631243623 +5245789596497153220 +819921860 +15580874176502892132 +154731123 +544670501 +62980530 +17105177 +4254959725487523541 +5636255 +30458188511970924 +3973491001936446780 +7410173966093388838 +8704274751914773568 5885261859847867262 -11391362031143292020 -506659547 -105251406 -5778348197355914873 -16324853745603185849 -5509410163496651347 -152699489 -15292499534361856724 -496763604 -544211744 -4078298792234977417 -5461104782808057591 -14648423506775771515 -10504814416598927327 -8709732826087622782 -2544766567488424310 -811139977 -17088205463377873568 -15798241638577276499 -2676033356038014277 -2785415326238639918 -12562453432512743836 -12350988444867431112 -1873618514833377412 -16940553195690134509 -45875581 -103219765 -8854886168440079511 -5941764153383128192 -2625321589399162008 -11818157132458100908 -2785415278947600352 -15257764832492062794 -232154598652 -819004351 -16187661 -4644563108626631009 -4000515045253449269 
-16872667624306444468 -1873618493337242815 -6291625 -6156737968247080128 -292346005443 -283116224 -3220426554520570467 -12356593998396393868 +13590665243542948895 +2412665810316625225 +18613516794268696 +1873618514832459339 +11623400942792738969 684134257893444250 -17175427809786595961 -9870724570547380 +9126223058424237061 +10530167802300925091 +14267039342724252928 +7042731044489660311 +811271050 +219257507157 +16204201012116260706 +19923244 +2248112069177510680 +19741625396299098 +14311172801615955497 +313840698753 +157549179 +4662073785906890031 +9658025172156550406 +6364338522051512284 +3599049409094225259 +6177363118375897150 +1801912319444323213 +11272401 +91554312 +2625321602296252770 +68616647 +538837783 +1411918553413126104 +22741311 +492962512 +451411772583 +160367231 +9870724567468020 +814089116 +2528483177756102046 +10370417990725208293 +6829167367527204602 +10167299845199168903 +14284319595218536933 +18413109967286566983 +9881712940254169714 +13819298136066198 +10906513561824594910 +9486667438472824267 +10215782083327823122 +3685635809078676834 +518718202 1992881803100621054 -2625321593697930351 -9450798976826149302 -16655465042802838677 -6474545510181176536 -11740202404159819072 -15289397349631921063 -9714916620293637762 -6098975770044401989 -16364556117061994922 -196084388 -540148509 -24052042 -11065179658016983681 -12480382642832672298 -71500270 -7285785859232107205 -14156017 -17632571483632043275 -61604254 -4259978 -17750109864738752812 -1873618523430913566 -9830100417878166271 -14425661002709010016 -4794173760728861833 -464308734399 -510460641 -2507605048 -41812332 -2679637056 -99156525 -16044698410491643447 -9870724568517151 -5516046735301085409 -6261263733545503259 -3759645248384009814 -538116878 -5779476232874035736 -6104586261131037638 -10531295842117158093 -12124379 -69468627 -5565348505908348542 -814941090 -5299098870104394759 -14322284629040564382 -10440328872292254866 +14090480 +59965836 +1550910461 +17063697102470777209 +71434733 +2411070513 +17639632887023929831 +805438326 +2621553 +9870724570285675 +1192315333980326167 +6928358494714596151 +5512098613140196086 +15611911946388048179 +5214737420442796133 +5778348150066318224 +27638024483702949 +18412827976678441027 +8881019260503721949 +5837238405281154301 +5461104765611674055 +4181978486829943864 +154534512 +5439644 +755605599842665887 +62783919 +292345154665 +544473890 +567411536 +15257764832492062794 +15122676476569913436 +5835546375651263675 +5516046795487905107 +10758418391935682553 +27356085465188671 +11400819049620310987 +5245848947241584851 +1728578573 +42664306 +219257310372 +8257735 +524354306 +429916226796 +4076606625033226775 +10162787574158199843 +6064503826171693679 +1873618450346019367 +5566476545724581156 +2704904932763174902 +4548309565419816229 +12484855343804124176 +879419277503368073 +6153917830015027393 +573047641 +68420036 +9870724567271440 +2625321602296056238 +2531021389865944442 +11075790 +102826544 +6064762127302393958 +5249797159683425776 +18413674021587452000 +31586340104504804 +4469799173763958889 +13237708548482859013 +31586254122912349 +9870724570088730 +331037869860 +417019463453 +48300418 +71238122 +1539245050 +644678512 +2424942 +11305519054433421356 +15739206202722094240 +18411699919665823773 +4787251587801811388 +32432320527338097 +27920126869375123 +18008952305986046279 +4661227783988316231 +2543452128325209336 +12826571498698443033 +16711957 +4160546838840674266 +5245848925746038203 +62587308 +5784522635265967958 +5243033 +544277281 +6211116016906930204 +16253215886083360174 
+23407808536904833 +9821766011288487605 +2676033351739442523 +8061124 +13012951802392676509 +219257112483 +547095362 +1992881785902860007 +19530026 +810877831 +2625321610894640833 +4825924017323379312 +8854886168440079511 +3808100888100343209 +3493395634074945822 +12591757708863407913 +5349308051975768286 +13361048879788721990 +4074350485214726972 +1626046306171619904 +8617826180017097033 +13513914891881875478 +29330183088506125 +18412545943079354421 +6405261049027955879 +17939922315943150958 +1410073577 +7837634943338155161 +85348920764927349 +2625321602295859611 +813695896 +538444562 +29048192480512516 +45285754 +68223425 +91161094 +10184384893691038634 +17542946455516547053 +4180850416920431050 +1928986057181235415 +6364097387529111599 +15289397371127860096 +2522831856378710008 +5885976061401368013 +21997756618246082 +18412263952471228465 +816513961 +13678484518713821516 2228331 +2531021385567766485 +197198505 518324983 -16872385650894636566 -6284197438710222140 -8098722631875955846 -5727354392818878727 -9870724566484489 -154468975 -2292825785040636736 -3172873343893834792 -14418466534433295118 -2707725182771857350 -15293345523383077603 -259261111 -19988781 -15371922320578972378 -19741625396299098 -18411699893871247383 -12818875419963886521 -2676033351738984017 -14268291611706526293 -1309213649 -104989258 -6367324841362000185 -7432602967203907143 -11331649863678691999 -15292499534361593441 -1815413785 -5778348223150556659 -5572809636518234139 -11408348231855703653 -2446197814 -13001682102565734253 -17186370630874106258 -2785415274648570354 +71041511 +9870724569894177 +105448017 +5779476207078475458 +13980703149896829784 +13697261 +12769025487284210679 +2150364451439708714 +4162099616697747321 +1873618497637320883 +1873618523430651633 +12716605781588708278 +5248951136269502637 +11703165067112811597 +62390697 +5046422 +521143041 +16515346 +2625321589399096275 +154141293 +1495991284 +165610142 +10528906628815718922 +555549513 +819332033 +567018319 +1095239150174145285 +1873618458944406678 +6364097374632348674 +10811668319096800853 +1465168459078698840 +17269866578628118199 +2676033351739245930 +7864513 +19333416 +11034422950646711803 +283116224 +2625321610894444316 +546898751 +18411417864571256842 +2372569380647405649 +12754891682880490747 +18413109975884759113 +3332575624131774585 +13536993689470216 +15291935475760762248 +6153353775713551670 +15289397349632312454 +9373330690077689939 +29330183088310857 14264783202905229777 -7171706723174648069 -820773835 -4645667113710455153 -16425638839461284611 -5353476806987745228 -1840738151924108521 -6153071806601889790 -810877831 -8061124 -5356297048398365877 -4770547841029572913 -12804866717273491655 -15580874133512784221 -514261733 -571605843 -12346762090311779845 -102957618 -10907429529076434052 +5995296139937712949 +12269921124804135698 +5885976155981547843 +4129856573689694799 +538247952 +3598203394278688718 +159777405 +22151483 +1409876968 +45089144 +10682570 +3172873343893834792 +15290243360149277503 +18412827985276633157 +28202117477106938 +2057911839619745748 +1193603335023233930 +1410790466305853323 +1873618476141774348 +16643035121679272213 +8716776878113885241 +14581169549127125939 +6375653898722412075 +8694008299851745161 +21997756618049241 +13500652 +816317351 +9870724569695611 +105251406 +70844900 +4279895118 +197001895 +24969554 +15855332315905657597 +151126621 +506659547 +5142301044413893172 +1917322269 +3137288237381257087 +14409153078548760239 +14633483086300318059 +5780604315680310424 +5572809636517250553 +15592058013925444099 
+17244622751296785814 +27356063969248337 +33560399034844286 +62194086 +153944682 +4849811 +16318735 2625321589398899121 -5354604872597767596 -4279174221 -27638024484621167 -8483828720841721486 -1459422188 +9374458755688828226 +15882855708139391266 +10225919150420460684 +15292499508566297469 +14065038233622153931 +17943317531893566468 +7031431794891230329 +16385074671182940805 +6205475624366966000 +6103488079777762245 +18411981897376661534 +9796067026192895123 +7996312456522828750 +13487493291780736605 +17245750799710423012 +2676033351739049517 +546702141 +7667902 +42074479 +282919615 +5515085278788979157 +810484612 +2625321610894247837 +1544487931 +9511403338599564690 +5192881006389626514 +5778348223150556659 +32432363517642192 +14046027923586223423 +18413674030185644130 +10771469979967884371 +2229804632046437624 +17480593072628501093 +6368045642961454306 +13237708557081051143 +13331692090551241408 +7677136701370927115 +339636063086 +538051341 +21954873 +492176077 +9870724566681132 +1398211555 +8807886331112851129 +5516892801706297876 +16546579671803956126 +13217980679449939250 +8503320935775669540 +32150372909516328 +1675756474409617568 +7721139320100816372 +2541195915421354200 +24772943 +1309279186 +105054795 +9870724569498781 +162398863 +70648289 +477207135345 +3452050537366752044 +5460499803637156023 +18199185222634702635 +5512098557251945790 +21715680028265805 +249849483538007723 +3554388240579364748 +9386257314251803173 +10594427319499622429 +15289397306641222853 +29330140097217635 +14311736903207619026 +8856014234050823647 +17855463731522440261 +15291089478142986477 +6365225461738636619 +2625321589398702921 +39059811 23689889426704296 -17648172271756969893 -232154335723 -15925513 -10811668319096800853 -6365225478934037607 -9763237054719266042 -11633356565151157114 -63373752 -1873618493336979326 -6029477 -3580814869236944221 -5199085482290645376 -282854078 -2625321593697668091 -9870724570285675 -7449919019336600171 -1839046014815569788 -23789896 -9131616131521448314 -5779476228575003910 -5511666277521099409 -13940760354079114484 -18413109980183855178 -644678512 -71238122 -417019463453 -15131353489256221185 -447360420122266222 -520094464 -3997830 -15096032016463431129 -1873618501936549084 -61342108 -1873618523430651633 -18412263918078459945 -5344573059048999857 -5155859771100236117 -5405598659939206416 -27356033876298083 -2146416200305806198 -5303893093062347743 +2474797953316292924 +61997475 +818938814 +153748072 +4653200 +50528646 +256967343 +468608354196 +509280991 +5463642917535614049 +6209141923583494372 +16122124 +10375505326587315831 +4397798264919820154 +8054758354584013306 +1873618489038145001 +830399540494469985 +5637072609524124450 +13913370016116443160 +18412545951677546551 +2676033351738852867 +99222062 +546505531 +18940198 +5411521012994540776 +2625321610894051277 +374027977988 +282723005 +30409049 +7471291 +259785401 +821756878 +11229884474259868248 +16357059045845960667 +1873618454646032908 +10542598463376395267 +5887104182899575287 +12393440069873436875 +7730749911729375728 +15289397349631921063 +7621890105505615217 +18263821886743185772 +18058417591402760409 +1317733333 +511599051947 +214958540605 +67633599 +9870724566484489 21758263 -3189961199463959445 -527958790 -69206479 -11862232 -6364097396127827248 -1320879066 -365262179507571896 -23689855034002659 -1473119215 -18412263948172132400 -31243224015702806 -39518566 -9870724566222277 -545719090 -5301355009924597043 -9391897706793274792 -11514789185312918199 -18411417886066737167 -5299098848607995194 
-2284412389694637269 -10530167802300925091 -10427987387505837891 -14322803714593785119 -2625321585099869531 -6829167367527204602 -6013889919468112625 -4181978486829943864 -8698802578697685482 -1654120425802828663 +1873618532028844481 +8328637003859953646 +5509410163496651347 +15289397375428069113 +16436648502584675667 +6205475697451599682 +8929320279979198383 +4974759074281293673 +9870724569302153 +13107433 +815924131 +2524775482 +28484129580057898 +206359759935 +2625321597997287853 +24576334 +70451678 +16432982289349543214 +5459976661309458015 +4237766514325588729 +14322284629040564382 +9697577994188492052 +5759560470823897206 +1873618527730862181 +6153071780807051278 +15586729198848181726 +5558136817694803400 +10694515171064744788 +1988559907 +5302201063430292982 +4456589 +1713308683 +96207382 +15925513 +13650504529854072320 +5458848655886518922 +61800865 +5779476232874035736 +153551462 +38863202 +8099682237308012832 +18411417873169448972 +2787671431665157349 +651404368114879038 +17955470565775510239 +18413109984482951243 +14911447610171655366 +8858552325786961082 +13840095604897025041 +5940918086979029952 +5723537903358642458 +11238431078799509026 +7758478083289188556 +2014710471177341259 +12454108019635062383 +99025451 +3862478902367620470 +17621268784989013395 +7274680 +546308920 +27638041680086900 +5991347923197297866 +6208295874376239521 +34124418943289166 +27356081166223293 +5461104782808057591 +5357143003026951378 +113312346 +7885152524183078888 +214958343888 +309541536868 +2395604016 +9870724566287831 +491782859 +2414921941537785811 +7528330190111771360 +30458205708158447 +2707725182771857350 +23407795639356361 +5991347884504386492 +14541340640523322842 +14576212124415364447 +525512494730316455 +6795216411027442152 +1160107295621122785 +11391362031143292020 +2421359666 +162005644 +517538547 +196412070 +6152225714402428442 +447112610911 +620757861 +2625321597997091447 +12910822 +9467310877347154547 +70255067 +8926052536804313893 +1873618467542403595 +1873618441749072804 +18116142943679220675 +4787251587800828866 +18411981905974853664 +17175427809786595961 +153354852 +818545595 +5462796881224795644 +15728902 +4259978 +96010773 +1334641629 +73073152 +61604254 +7570985824906512457 +20305632407192782 +14288223544298637564 +12772641161582545873 +13237708565679243273 +15394822466617412826 +18430745393540238720 +282329787 +98828841 +809894783 +17011915733784398286 +7078069 +637862761 +18546980 +7993599274919922706 +546112310 5569296748444387676 -1873618441748940565 -256967343 -5245848947241584851 -15862817677379702068 -14633483086300318059 -288046714075 -2203332276215481610 -7798976 -810615685 -237175467 -11340219378265033230 -313841615983 -513999587 -18413674004391067740 -2116750858326574509 -8070938101082033295 -2625321589398637514 -25099937047839912 -5245848878456439955 -12118995007347033900 -4562124381333884039 -31586327206235137 -16436648502583690678 -9181481831755875838 -5516046752497929091 +5429433323061448040 +1873618454645640429 +16425638839461284611 +32432337723721245 +2785415309041207732 +6156455960443095577 +15292499534361856724 +4727296740454696303 +15289115311734721621 +10914208898869168291 +583336804 +214958147231 +21365044 +491586249 +9870724566091296 +2246313005 +3901910077209972079 4183106466458307862 -1991460714865167155 -17082847207615301902 -818480058 -15663365 -73007615 -3701600990787603378 -63111604 -5767329 -579208034 -1493907215601306869 -11535686880442518166 -3313969578832561394 -2704904932763174902 -6570315963541227654 -282591932 -5726226297114658480 
-17160329975787685834 -8843457619279611284 -18413674034484740195 -9870724570023121 -492176077 -30740204914083091 -21433663625497129 -1629160452 -1873618450346477252 -18412827972379344962 -5243108696682924272 -7260902865540482639 -816448424 -70975974 -15287423196122254433 -1873618501936285414 -5151629580948802356 -3735682 -61079961 -18411981910273949729 -7837634943338155161 -3597357340772992368 -5133829485925763690 -51184007 -10956724774926813288 -98632231 -17309267256018536307 -9870724567992379 -29048106498198701 -3544107379218385465 -14386655907412249373 -219257507157 -21496117 -68944331 -16330874579771459902 -11600084 -11124082762859154482 -5459935770830768809 -814416800 -347984565637089693 -11923578915473263059 -575144796 -517800693 -3297856681506178941 -326737923180 -16038494049632258844 -15104099179857577674 -32996413518841137 -153944682 -2152780467316001469 -8722536002903082945 -10646954815923686447 -545456942 -14458654042895551171 -3935742187522887052 -16064731596255856452 -19464489 -17648172288953812474 -6213874949885069218 -14851060135220743194 -6471725260172231870 -4504298175131421894 -573113178 -11701191021079496730 -12314601354656483126 -13957562954616997312 -161809033 -563217229 -104464968 +2989761681675848960 +681145240220994278 +4089771542585346905 +9960543895307946415 +1801847830 1366033375 -1133620930477295468 -6209141923583494372 -2625321610894509848 -5052785364214352114 -6155298040667702671 -5246977012853376412 -4074350485214726972 -27328854 -1873618441748677997 -2000487899013646903 -7465404271946632160 -7239351853821397993 -11742834345080916462 -6368045642961454306 -5516046795487905107 -434216307724 -3493677603186412637 -810353539 -16633695840000739887 -821147663836514852 -18413391996586557524 -7536828 -4151361015346562251 -14540810596246030644 -5995296139937712949 -159777405 -8816997369364548341 -45089144 -18412545934481162291 -9298403582666148514 -15108492788614827244 -35193182 -5568582435113995179 -5570988833963444820 -15289397375428069113 -15401217 -8430474765433179073 -10750398672578676906 -72745468 -5405598728725859379 -9250794030848869727 -62849456 -17422075106091075868 -5505181 -1873618497637255436 -578945889 -13106160036035691955 -282329787 -5570988786672405753 -9870724569761068 -7031431794891230329 -43057525 -1706034183 -491913932 -214958474959 -90505732 -18412545964574834746 -32432303330887118 -846140170598090257 -5458848587099997499 -17607182983838566334 -195297952 -539362075 -5460499872422693597 -23265605 -943759021439519007 -70713826 -816186278 -2207492642904016905 -644154222 -60817815 -806290300 -3473534 -1873618501936022824 -13307798926833551183 -1873618527730926929 -11349795265056081195 -567018319 -9388513449772451585 -165610142 -2625321576501808484 -7290339324003420579 +70058456 +2625321597996894992 +104464968 +563217229 +9391897711091583990 +12714212 +161809033 +24183115 +196215461 +4662073803102881076 +5564423899624572258 +1930668198955452820 +1873618497636337289 +2785415326238575484 +14418466534433295118 +15292499491370961900 +17305802226190387588 +1413046597527538184 +11547019543992076059 +18412545960275738681 +72876542 +6671860954713623381 +818348984 +7073102484926630551 +4844858457232050089 +4063367 +1436145094782551981 +95814162 +8697110475982376165 +15532291 +61407645 +23971751058934778 +13418544886826534789 +17616192489740896443 +8486888319115725460 +5941764217869305943 +5566476498434524382 +17083693235326683482 +12583654612056476062 15287141244205140113 -41025899 -9870724567730368 -5569296739846327213 -98370083 -1531970550 -219257244681 
-2065251783916127931 -6151097665987347595 -1407386597 -3973490993339565383 -12463417266756127924 -17631161371525515669 -21233971 -3232498753 -4767727591020628301 -8972557000702888938 -1873618458945784014 -15290525376551717170 -1559626750 -68682184 -12689613402799605860 -527434500 -517538547 -3542979343701772038 -447112610911 -163578521 -326737659857 -30458205707109873 -2625321606595479619 -498702419026 -555090760 -11846037961957312985 +10329316755526125416 +545915701 +6881458 +23689855034002659 +12734096628671909131 +11199980773228349986 +98632231 +12818875419963886521 +6848542126596623056 +5941764183477191834 +3186488476490139978 +3875793754648151904 +14319322726341872694 +17090754651615726815 +3221822110943152678 +5516046735301085409 2286775792223980496 -2676819007 -11599686562536949325 -3968978683605551949 -5831598103022077418 -15175534989820758889 -3812049126336301758 -545194794 -12348736218027264207 -12743882002561631754 -12318365723906541324 -8882845388820581451 -12769623874203027091 -1732546160493595960 -10430737389551487761 -9512531412808567772 -21433723812579518 -812123024 -9140909979694467183 -4025048830681353606 -1873618489039455401 -18331530485106038 -5516046791188875281 -6156456003434055463 -12474564753552836994 -17621561863500597513 -104202820 -29612220986426501 -1996555300 -2625321610894247837 -17489156252859434801 -103179363763095696 -15920335005095365860 -13112992413209136128 -2034107431 -17291573824845253535 -9772926989806013640 -819987397 -17170714 -1873618467543321286 -16156684754098128751 -6925759830950740072 -7274680 -16161820259100396848 -3698377064120454404 -10296839827164892306 -13913370016116443160 -1363739614 -92275213 -210659444315 -1784112314702629632 -5461104765611674055 -507299956084 -13237708552781955078 -197067432 -4211147846 -14657391675119111356 -25035091 -1735459858 -15139069 -14426056237756189706 -12771845711499103316 -9940375093616053431 -6523880655054768550 -62587308 -10967349376607587326 -1873618497636993704 -15290807392954681807 -5243033 -1133620917580466754 -1873618523431898109 -11613165301442872555 -282067642 -9870724569498781 -2141513421469058406 -14318336791419094928 -5885976069999102359 -6153917830015027393 -214958212644 -548995910 -90243587 -16101055855214332856 -9409295256684857617 -539099930 -30458248699119542 -23003457 -252379820 -6173800107753209956 -70451678 -13107433 -815924131 -1873618476140856959 -3188833133853148985 -3211386 -60555668 -5514354727165429372 -18430745393540238720 -5566476498435442740 -8821966780582857359 -806028152 -31022281504130688 -15273884660262766886 -17153706162049649384 -15568274631689570656 -98107936 -9870724567468020 -2625321602296449309 +20587696099952690 +6886936648282539655 +6013889919468112625 +2625321606595479619 +1370042529145686776 +365428606574 +339635275824 +101450287 +17625510063045740642 +214957950334 +6213874949885069218 +812516243 +5463642874544129714 +10906583479868327250 +15744000533156660854 +5885543833259674054 +18413391983689269329 +1873618476140792146 +6177363174264606048 +827351178595273968 +10488220504998020326 +12517601 +5354604868299326678 +7734678113259293802 +3055188874828713383 +477206348880 +196018851 +517145329 +1801651221 +104268357 +505676503 +9870724568714358 +10531295842117158093 +15215179494928287321 +18411417881767641102 +11678577273375754735 +50011031689890353 +1873618441748677997 +9839328478246341893 +2704904949959166469 +18413109993081143373 +15289397310941235057 +31304323701671300 +61211034 +15335680 +164430493 +1884114794138700522 +497025749 +3866756 +4131548702199581670 
+17761874897365304540 +17619012653768771484 +16542141154387102177 +4451048243670025981 +2973047420892220660 +6155298040667702671 +6684847 5250413516934940017 -10377197347619277484 -546964288 +98435620 +2625321585099935141 +15293345523383077603 +3324580943442478547 +545719090 +23689855033805297 +6829167320236755019 +5991347923196709309 +12903069678853164419 +8098722631875955846 +17142588721119759321 +6151097721875664077 +5352348827359053328 +491193030 +812319634 +2295119206548507606 +514130660 2429420595 -68420036 -13840095604897025041 -11075790 +2625321606595283106 +10757572347027851837 +10668197763104573447 +1873618476140594617 +18411981914573045794 +14847634415788362123 +451651625195343029 +4278715465 +746324852 +69665238 +15288551330518206781 +35258719 +23789896 +12320990 +161415815 +1491796976 +1812923415 +9870724568517151 +104071746 +5460499803636172954 +10592171188278987146 +9388513432576593267 +2704904949958969710 +16038494049632258844 +28202057290482949 +5303893093062347743 +5780604350074062990 +13674941621707869313 +15881445561639371975 +11405246090117384413 +61014424 +3670145 +1685722535145180234 +8834282535679560676 +95420943 +15139069 +3205882223899445065 +255984301 +1735459858 +1192315303887243384 +20869665212206112 +6925759830950740072 +883931539946605906 +5299098848607995194 +1445774942907271091 +4445946672738010859 +63832509 +17620986833073014515 +545522479 +98239009 +17045009756596405420 +6488236 +7536133444401891169 +820773835 +13968724050119231192 +5701610621477129021 +3068049059797011246 +20775215 +2625321606595086582 +29048166685607288 +15618641120352995308 +5831598115918776853 +812123024 +313841551493 +5880738612678821404 1873618506234530930 -517276402 +32432303331018178 +16219982623696489082 +12258209558853782455 +16436648502583690678 +9387385384162626351 +4151361015346562251 +3542979343701772038 +18412545968873930811 +6741138607599781046 +1709507593 +12124379 +451412624470 +516752111 +161219206 +9523554809025136564 +17951898368652543383 +69468627 +814941090 +92406286 +103875135 +9870724568320021 +2523959969279970191 +5144036663261072615 +30740239306197230 +2089265852158774334 +18331517588211470 +1873618501936549084 +6364097443417820330 +1873618441748286746 +27638058876667848 +828143439367899804 31304293607146613 -10225919150420460684 -32714392818354350 -163316374 -17480593072628501093 -3653991426073234491 -28202143271093720 -2625321606595217579 -669516658 -11075097734987253589 -544932649 -5248951136269502637 -24535874148371011 -5247593352907000017 -13750803869111880047 -821756878 -5565348488711963913 -18940198 -23407778443822783 -811860878 -3910652327921846506 -2372569380647405649 -6151097721875664077 -8481290603310483360 -15289115311734721621 -5197393238738928914 -8858552325786961082 -15270695523793439937 -103940672 +1580068669843704469 +5565348480114297669 +72286714 +164037275 +3473534 +14942458 +356832249381 +806290300 +60817815 6206603741566403719 -151388766 -2531021385567766485 -7563081637033018620 -13044533461222491710 -6154199872212897041 -9126223058424237061 -1160107295621122785 -32714349826081871 -6152225697206437786 -4333982245204396969 -7012532 -5411521012994803182 -5249797159683425776 -570557265 -17619527108083517000 -3758799224970808644 -11069796609748044689 -210659181949 -14926165161459649868 -7570985824906512457 -3234866947851553000 -1906986264008723742 -24772943 -1873618446046923526 +12240192446106372977 +5942328186188203485 +1493907215601306869 +30740204914083091 +1873618463243635741 +1873618523431898109 +2341393787733871788 +1873618549225229533 
+129168850403198609 +5728764487730398405 +10015828607276288922 +12057002331826554305 +10159392401199270768 +12142525752872799042 +2676033356038473537 +494403323033 +292346005443 +223556143025 +10911952793442192048 +820577224 +6291625 +1394017249 +98042399 +12163381674616883890 +1992588707391932346 +109394696450803010 +17760542 +545325868 +12318365723906541324 7516607870825792868 -14876921 -72221177 -18411699906768535578 -1495925747 -62325160 -288043895627 -31304259214443724 -3685635809078676834 -4980885 -313838798363 -13951205954302051853 -464309454125 -7151957518376504179 -6153353870293665804 -365428606574 -14319322726341872694 -3493083035910933027 -214957950334 -13222096480399396057 -22741311 -538837783 -12845285 -1675756474409617568 -7676326031729298383 -1873618476140594617 -70189530 -2861086850442987769 -12590629664748537952 -15501473033754248808 -1733166096 -2949238 -5833854255738587405 -6405261049027955879 -60293520 -6364097417622914469 -50397573 -15289397310941170468 -1436145094782551981 -9870724567205432 -155189878 -7996312456522828750 -2413828615876118471 -1818166298 -97845788 -2625321602296187261 +18411699880973959188 +2396555433535145871 +4074350515307087985 +18172096623373846790 +23407778443822783 +18413391992287461459 +6284197438710222140 +13819010092172884 +15636771529559117046 +530117487457144076 +7241607903360452069 +17113993018971653874 +6155045921420412650 +17869638717220195611 +8695136395555507435 +548143940 +1992881785903908578 +15741861301866530650 +18411417890365833232 +5726358144768542273 +18413110001679335503 +16149105318148965958 +6366353527348595732 +6155045912821630127 +18067928153034001057 +7888341864134085054 +8856014216854897981 +11202456151687629452 +6152225744495577231 +69272016 +631243623 +210659378559 +3541851256594893702 +23396678 +11927769 +9870724568123690 +103678524 +92209676 +17297538988880889355 +1873618501936351339 +1519546451849645365 +5438906422044199526 +2626514825137096412 +15740334315622960494 +8912366315847026281 +5461104800004441485 +326737923180 +6388664954993575829 +14264208172476401284 +14745847 +3276923 +576717661 +806093689 +1815348248 +152371807 +2625321593698126810 +9870724570940976 +72090103 +60621205 +5569296726948055802 +4502324021620638916 +2599235317101562153 +12665415616966166285 +5197393238738928914 +3701600990787603378 +9654763076979264499 +12808641794728789765 +2676033356038276482 +18412263913779363880 +3865299044899163405 +6877319166685482198 4451323549999957434 -3544953467117898450 +15287141188316891641 +17563932 +155189878 +292345808939 40501610 -6364097443417820330 -1543385872365455415 -12606726616442537392 -16436379939763522008 -7562235540534921217 -546702141 -20709678 -18413109962987470918 -10939233345785957508 -1384869222252743071 +97845788 +6095014 +63439289 +2791055594105669609 +28484146776247610 14383042897579063 -245051624454 -813630359 -5881866613803452649 -1455946274504313841 -68157888 -10813643 -4502606072414800438 -9388513432576593267 -517014256 -16739161091967945306 -6203168539198949844 -20305658202031811 -15122676476569913436 -48365955 -5941764144784016877 -12601357272775920269 -5900805793554762144 -163054228 -6155327937823509637 -95814162 -2625321606594955469 -544670501 -11092808190891527547 -6365225423046182853 -3545799490531822688 -5991347927496329957 -2676033356038473537 -6928358494714596151 -18895516000586505 -18413109993081143373 -1317798870 -3242943116712479419 -8468495303965871404 -10215782083327823122 -295544243748734701 -7536133444401891169 -13880529192106527090 -18412263930975748140 -103678524 
-8816997365064994109 -5513226652957347114 -13427220419978791304 -4279895118 -2581508047683782932 -151126621 -16436648502584675667 -5245789596497153220 -18411417868870352907 -1574831104 -5512098613140196086 -16646420 -16881311723980129501 -580191075 -6750384 -460010423829 -17142588721119759321 -5411521012994540776 -13331692090551241408 -2236213724530672835 -10512763733196344280 -91750922 -493159123 -210658919829 -5353476789791099071 -2973047420892220660 -102615266471184862 -817431474 -71959029 -14614773 -29330157293667421 -18411417898964025362 -8854886129749066875 -62063012 -1631882651478526261 -1873618497636468806 -1626046306171619904 -4718737 -6971710725545264615 -15463390673086056969 -5996988225456246061 -2625321597997156982 -1258091056198584472 -2365498112266798670 -12258209558853782455 -548471621 -200191596416994196 +5199085482290645376 +1818166298 +8843457619279611284 +4076606659425995504 +10441809166173275876 +17306856587979066781 +918014392040164136 +1873618458945456185 +12237654281284815334 +1873618484738787248 +3101815889301670444 +20023641798084435 +17404805271631431757 +10554335258691243263 +7239351853821397993 +12012735189037878155 +12893049762838873864 +5565348523104797810 +4078298792234977417 +18411981923171237924 +15636380174699072146 +1440671446449589035 +112132697 +429916882098 +3822179681402096264 +1881294612915292853 +16508579235574254010 +32432389312220424 +11911353550167017696 +3493395603981665996 +15287423196122254433 +6104586261132347910 +1873618450346674286 +2848264744227112681 +681145240220010584 +6366353527348399255 +437798118096833445 +16872667624306444468 +516358893 +15515140725455259155 +527827717 +103481913 +210659181949 +1938490399 +23200068 +160825986 +69075405 +7460569593843485201 +3172873313800750756 +9870724567926898 +14383326641327005 +4452458274095237609 +1330643932 +11731158 +4025048830681353606 +4530582984922301900 +10233718743719545342 +1873618471842024407 +2487491354099451134 +15292499491369977714 +11078361536363433136 +7513578521803359945 +15662612681012938353 +2146416200305806198 +342928503145892901 +7746405201714873917 5565348480113903112 -10159392401199270768 -538575636 +60424594 +14549236 +15897908900500866947 +33560403333875446 +3080312 +1005889956380019628 +817365937 +163644058 +507708124 +71893492 +2625321593697930351 +9870724570745030 +17750109864738752812 +17419818910382754661 +15475002888547338265 +16224960573811657069 +18412827946584768572 +7880043043777809442 +27638084672424186 +5246977012853376412 +6366353484357502971 +5407707525201267705 +359800716494834349 +447360420122266222 +2676033356038079832 +12463417266756127924 +6053028402293443390 +820184006 +97649177 +63242678 +28484176870181368 +5898403 +33278412725750974 +5293539447540681024 +4504298175131421894 +6313103561496791450 +544932649 +2597848126 +17783567759008991682 +9195012299735698785 +3491703471172619422 +8722536002903082945 +1873618514832721746 +2676958769323838072 +4000515045253449269 +9298403582666148514 +5780604337176709161 +11701191021079496730 +17943317531894613402 +5299098874403555921 5782296448490211725 -15289115277341755866 -12583138 -4959080478982475006 -4237766475632413481 -2687090 -60031373 -11241814380293784908 -18413674017288355935 -10162787574158199843 -5625593289148533238 -605557034314828631 -2625321602295925195 -97583640 -16546579671803956126 -546439994 -13513914891881875478 -18412827955182960702 -18142877345697171235 -8716776878113885241 -5991347923197297866 -21715680028265805 -5299098848608717979 -2686971790050919863 -10551496 -2676033351739442523 
-5246976935469649046 -4236074403011431549 -5561348123192067576 -516752111 -13525196865559988902 -451412624470 -6813843502384089093 -3452050537366752044 -2723374776553770162 -105448017 -14284319595218536933 -356832576945 -1987904546 +13965057806789381527 +3880024017885400135 +43123062 +811533196 +1133620917580466754 +17096567568145714575 +7462549834646555859 +4223816177647290930 +1873618450346477252 +9232147856523987724 +6517011693716180143 +6219166245997316001 +29330122900898936 +17782439637510195701 +16892239439269464715 +8072726556923202784 +15862817677379702068 2789363555876800106 -17063697102470777209 -6584302816815089825 -5727354422913010657 -13944415416121166662 -28311895 -11906248855590275274 -3707523343842937215 -18412827985276633157 -821232589 -18415907 -2676033356038210923 -17257283880273643533 -18331556279224644 -9117971362513815455 -18411981923171237924 -309541536868 -113312346 -46072191 -103416376 -27920126869375123 -160760449 -361131345578 -9234597529149245860 -14835085562484362568 -4585257123188181630 -1413046597527538184 -6208295874376239521 -13217980679449939250 -1966081057 -6101795981361546864 -16384272 -10370417990725208293 -4196703391028741586 -6488236 -63832509 -5153885660580611393 -6155045912821630127 -5197393273132877515 -2625321593698126810 -10720606758114626648 -9870724570745030 -30740204914804024 -91488775 -7792373120121047026 -3579577413 -5458848587100981064 -755605599842665887 -17404805271631431757 +11534547 +279448847671 +210658985303 +252379820 +539099930 +814351263 +160629376 +9870724567730368 +103285302 +68878794 +14268291611706526293 +23003457 +16876615841046071902 +16269555352897260337 +1873618446048496233 +10161648493728303880 +10430737389551487761 +13735950183222348532 +1991460714865167155 +9870724570547380 417019921504 -9386257335747873389 +572110149897422243 +2883701 817169327 -18413391979390173264 +5240852582657493474 +518980348 +60227983 +4618470194090937269 +1459422188 +232154335723 +3773122177597967623 71696881 -8328637003859953646 -14665059300281706 -6101796011455220816 -4456589 -13070886371126478108 -8733200714257204941 -10913926882465549337 -29330183088310857 -61800865 +2625321593697733840 +331038328206 +163447447 +18411699889572151318 +13044533461223476582 +18413392000885653589 +16436379939763522008 +12314601354656483126 +5539270545646881104 +11507341268100646834 +102615266471184862 +31304259214443724 +7731878007432940921 +7401639761433855789 +9450798976826149302 +5701792 +5245848925746498573 +15104099179857577674 +108921430 +17170714 +5411573444917201071 +544736038 +468609401971 +3758799224970808644 +63046067 +819987397 +18411417898964025362 +14322803714593785119 +3062219857732765097 +2207210695286917386 +10532987970627045461 +1873618458945062288 +684134257893509654 +14101293042239958 +5015996636573993090 +6098975770044925746 +850088361543927300 +157614716 +111739480 +219257572663 +19988781 +30740222110861163 +8618954206935976415 +6206321690772243584 +1873618450346281005 +5405598659939206416 +15872788267758849077 +5572365145471912335 +4625631396377398109 +17619527108083517000 +493028049 +9870724567533791 +1559626750 +19177592590632702 +91619849 +2625321602296318353 +515965674 +103088691 +68682184 +527434500 +1990332636357069057 +899112529018292807 +6570315963541227654 +16038494049631274933 +15920335005095365860 +28202143271093720 14949273699027977966 +18412263922377556010 +5782296461386581535 +13001682102565734253 +12972952285742288 +2703776923039566000 +2687090 +11775490430040476325 +14156017 +2686971790050919863 +17489156252859434801 
+105906772 +60031373 +11241814380293784908 +71500270 +5734260728554980678 +5197393273133271298 +17648172288953812474 +1873618493336979326 +683965395648908415 +18331539082773742 +32150304123324262 +17422075106091075868 1873618523431110190 -3573803894998305775 -5569296709751605280 -5835546375651263675 -9870724568714358 -42008942 -1746899701160150410 -9664889374910385451 -7406761759861377295 -2625321597996894992 -365428082633 +10628953736434486617 +14512708438227029368 +18411981931769430054 +16886908734816455284 +62849456 +9131616131521448314 +5505181 +11364576519499679975 +33560368941368890 +544539427 +39911783 +33560399035500221 +533070599 +578945889 +8097653480026868144 +154600049 +16974104 +17957750408840481687 +285938493736356261 +5991347927496329957 +10377197347619277484 +15131353489256221185 +15287987228927396675 +14282027259104724924 +8070528080642443989 +17128487303121020 +219257375260 +42729843 +811139977 +6365225478934037607 +460010423829 +490013379 +13850612818404510827 +2207492698791414354 +7515799761807542411 +1873618480440216958 11888218815508973537 -6311975551774360856 -1408369638 -6101795942670075923 -15515140772745448064 -27638058877519937 -13361048879788721990 -2430665780 -22217020 -538313489 -927164962728314711 -69665238 -27638084672424186 -2573543627316201844 -12320990 -2424942 -18413392009483845719 -3660444556051220001 -18412545947378450486 -154665586 -9870724566681132 -546177847 -2229804632046437624 -5245848917148372136 -15906307047154976446 -827351178595273968 -5780604350074062990 -6350640494756627870 -9198943117821938833 -2676033351739180486 -1192315303887243384 -67633599 -6205475723246636047 -17419818910382754661 -162529937 -17083693235326683482 -105185869 -8912366315847026281 -5249797202674912471 -2446394423 -1461650414 -257426098 -17299513133793348673 -4451048243670025981 -14597841535548131734 -14130457194541352666 -15290525359355331959 -9195012299735698785 -524354306 -429916226796 -6153353788611431303 -1728578573 -6153071806602085789 -2676033356037948725 -8257735 -2785415326238575484 -1873618489038408278 -8072726556923202784 -7731878007432940921 -16271603835638319461 -11229884474259868248 -5835546388547569431 -2704904949958969710 -103154228 -2625321589399096275 -6887529782530082437 -45810044 -16365628939578247566 -4408861808311732424 -3554388240579364748 -3431353251379022211 -4131548706499659810 -3229097897723824621 -818938814 -16122124 -10831084194895235709 -6226088 -6366071472254485645 -10441809166173275876 -9538952396691934382 -5994450030541998229 -6835382734606174906 -4397798273518472097 -2625321593697864817 -9870724570481756 -17782439637510195701 -31304332299601191 -4074350515307087985 -10758418391935682553 -11405246090117384413 -196018851 -17943317531894613402 -15289397375426759758 -1801651221 -12716605781588708278 -5353476789790574588 -1873618450346936800 -14462121002204464918 -2785415309041207732 -71434733 -10770155859627543824 -1873618476141841211 -5780604362970367638 -2530739313276357975 -14090480 -5567604589840172352 -296644709200 -11266915032714840583 -4194441 -2200512120787569683 -2549492329236335496 -6211116016906930204 -99090988 -9625506809262378259 -13237708535585570818 -490103571663 -14541340640523322842 -9870724568450966 -1793158821936040552 -9486667438472824267 -21954873 -538051341 -1398211555 -5408700909154273182 -5356297014005859746 -8444237263823374707 -69403090 -2599235317101562153 -15897859265386515143 -6097847713031849822 -2162794 -9796067026192895123 -13117159209037203716 -164299420 -17088031212435737557 -8099682237308012832 
-8971880411373045432 -3099205763721988894 -9870724566418979 -545915701 -13237708565679243273 -4449074137450482853 -18115860927276518423 -5247593352907982888 +10754752208794748192 +4770547841029572913 +2236213724530672835 +12085352355710699032 +2625321602296121720 +11141327 +68485573 +22610237 +279448454880 +573113178 +9870724567336570 +17097308613030775025 +45547899 +102892081 +538706709 +813958043 +2150364429944620611 +15290243360149735302 +1004761907965660269 +13427220419978791304 +18412827955182960702 +2116750858326574509 +11847668763332905754 +15530553734630409457 +6366353492955694732 +5875369269012203157 +7528870385169533979 +71303659 +816776108 +417019528294 +48365955 +2490479 +518587129 +59834762 +163054228 +9870724570154139 +5300226909920168653 +1493907215600323645 +1873618523430913566 +6293435910568086045 +4661227814082512385 +9231865831523354279 +4562124381333884039 +5994450030542718433 +8468495303965871404 +6151097734773868363 +5308570 +51184007 +16777494 +62652845 +7410231625909407077 +8336200085500660803 +10377426660276898779 +15108492788614827244 +5405598728725859379 +5349367342193968413 +1873618489038801706 +12851509922494416016 +3812049113439014303 +14151987057237756511 +10744889200825601240 +9304480456320748248 +19595563 +219257178788 +15798241638577276499 +3017170418691476659 +8857706336766199103 +13957562954616997312 +8126661 +30740222110467489 +4662355883991631380 +18413674000091971675 +6203168539198949844 +12480382642832672298 +14814059355306855043 +10229733804179524009 +6887529782530082437 +10905173350565414454 16533468055605152863 +3758799212073651272 +240752528220 +68288962 +1318388695 +6350640494756627870 +5462796894122411284 +813761433 +10944716 +2625321602295925195 +538510099 +18411699898170343448 +13880529192106527090 +18413392009483845719 +25099937047839912 +10440328872292254866 +6835382734606174906 +3686437022462641529 +816579498 +541328159 +2293868 +71107048 +105513554 +151388766 +9870724569957729 +5197393273132877515 +5882159627828332650 +7851156332894489575 +11510172448171493799 +9250794030848869727 +8327325764367420682 +16778219695595128445 +8733200714257204941 +16580883 +13513632858283052077 +1461650414 +2625321589399162008 +819397570 +8423108281783224429 +5111959 +1473119215 +257426098 +62456234 +39518566 +12622921449567619867 +10531295803425490030 +5566476498435574920 1873618458944474091 -19923244 -3188833116656765520 -2676033351738918494 -4501477955215362649 -17621268784989013395 -14581169549127125939 -6206321707968234614 -33278352538406314 -516227820 +1873618489038604579 +18613366323088485 +3008657884776564221 +11621189233770302317 +5464488953846367762 +475082778886408323 +6155327937823509637 +9956242218935388443 +11065487246536017596 +2676033351739311464 +14361095630442006439 +810746758 +283181761 +2625321610894509848 +546964288 +6365225423046182853 +7930050 +6262673798362565305 +1840738151924108521 +8483828720842049762 +9013091190501541709 +8294669686619703163 +9288263741171697848 +15520597512816297356 +2792183694107936667 +9227353569080969220 +2429880031182916564 +5833854255738521265 +18412263930975748140 +2430665780 +22217020 +18301216972082383618 +11964228788262537512 +159842942 +28766150282773591 +538313489 +813564822 +7032232753418799293 +12348736218027264207 +15290243360149343057 +6406389097441592980 +2964529530791004471 +18613559784442970 +1873618476141841211 +5991347884505041337 +6101796011455220816 +6366071455058494624 +6155045908522469290 +8412057600244518110 +3478039985316562895 +12718336251608304605 +70910437 +4211147846 +197067432 
+14443179094226111164 +2192639020 +9870724569761068 +105316943 +25035091 +162661010 +518193910 +5303047078245827995 +1903640920853056624 +18092519415945890646 +4127600455366674792 +6474545510181176536 +7731877951544692100 +11084138473134491150 +2625321589398965240 +1495860210 +154010219 +16384272 +15043322265989680207 +6204347601746593065 +4915348 +62259623 +468608617008 +1966081057 +1192315299587689576 +17256155806064642777 +1873618489038408278 +12662636664722033563 +1654120425802828663 +25099894056749168 +5299098874402571932 +2676033351739114988 +489423554 +30671195 +5411521012994803182 +42140016 +7733439 +2625321610894313322 +7329667560521271617 +6206321690771457172 +5967447917778832244 +2284412389694637269 +2572415553107265488 +18412827963781152832 +16904712944498838074 +15289397349632182266 +29330122899915877 +27356081166681957 +6173800107753209956 +538116878 +10551496 +3919394969679695174 +9870724578216632 +492241614 +8816997369364548341 +4662355849599126556 +16567374854657149772 +12884708026702235763 +6364097417622914469 +1873618532029106835 +8861613698626554738 6890349946557761313 -1411918553413126104 -162267790 -2474797953316292924 -1694703987789596868 -18172096623373846790 -28766090095429261 -1223976979390989739 -3221822110943152678 -104923721 -15185362616787929146 -10003084053115964048 -2625321585100065781 -437798118096833445 -1815348248 -31304323701802109 -152371807 -14046027923586223423 -2021331689141374237 -20869691006257762 -13044533461223476582 -16778219695595128445 -12057002331826554305 -17465760298758178660 -7576852735584046364 -129168850403198609 -820708298 -17891616 -1873618489038145001 -7995587 -11911353550167017696 -4522983015860209939 -12612941966326959190 -102892081 -2625321589398833886 -45547899 -11548493110908749415 -4076606693818764590 -7851156332894489575 -12779163922391107832 -5991347884505304103 -1095239150174145285 -3863606920688567965 -10771469979967884371 -15859976 -14312864964518020808 -17245750799710423012 -5963940 -10655291933708585535 -4162099616697747321 -63308215 -1873618519131818153 -30176189305784773 +5837238474067478018 +5780604294184830225 +11214563466576463780 +29612216687004362 +5516046782590617836 +10156853965084755435 +6151097683183797493 +11613165301442872555 +1986666427421953426 +6155045882728942511 +7033448275620070919 +2907303882175415846 +1320813529 +1584595969 +105120332 +7465404271946632160 +70713826 +24838480 +162464400 +12451287838412704489 +816186278 +644154222 +3735453693364143828 +9870724569564298 +1309344723 +21715680028329254 +13044533461222491710 +1873618497636993704 +3445982126355516078 +7529998377695250462 +12237654319976351671 +4534407021717882867 +3431353251379022211 +494159375523777824 +1136798196293306910 +16426766960960540026 +819004351 +12356593998396393868 +16187661 +3307734082 +14273081993166850387 +4718737 +434977997061097911 +62063012 +2625321589398768544 +39125348 +30458248699315998 +17858552114457937219 +5903884228619468800 +16872385650894636566 +10504814416598927327 +12213481117926952067 +18413674008690163805 +14101026494875963 +4709060078846741586 +2676033351738918494 +9714916620294556051 +13237708535585570818 +810353539 +2625321610894116800 53412232 +434216307724 +7536828 +41943405 +6770804071406897080 +821822415 318140582948 -15611911946388048179 +6365225453139920066 +4502324038816629924 +4030203865610717075 +18411699906768535578 +15290807392954681807 +11966722661119888545 +8618954206934993224 +12189960762281954023 +32432333423379563 +18413392018082037849 +6004915412369541960 +14546784569801180959 
+745740842898098858 +15289397293744523027 +5299098870104394759 +9257009393629660721 +5900805793554762144 +6155045917120857525 +21823800 +1317798870 +537920267 +1730675726 +1535706104 +9870724566550039 +14648423506775771515 +10531295876509927029 +3973490993339565383 +14312864964518020808 +14824583848163281869 +16940553195690134509 +1873618476141446514 +5778348218852443426 +5758903550139959418 +27356016680241600 +13940760354079114484 +5645056620059298667 +347984565637089693 +815989668 +9870724569368358 +5887799994573980828 +162267790 +517800693 +70517215 +15925946803693423456 +2625321597997353452 +16572875051796793000 +575144796 +104923721 +13172970 +14426056237756189706 +5909364179964069981 +5459976691403654584 +4397798273518472097 +27920040887059601 +1873618527730926929 +1873618467542665344 +18613585580197092 +32714392818354350 +18613499598604650 +5780604289886653255 +3865299049198390675 +22279760122415532 +18412545930182066226 +50397573 +153616999 +2625321589398571980 +1736311827 +15991050 +14665059300281706 +4522126 +7792373120121047026 +30458248699119542 +13951205954302381098 +17785259844527786731 +6444189713816225654 +747829823970020707 +8698802578697685482 +14477731067643038195 +18412263939573940270 +14318336791419094928 +15291371425760087333 +12109395139072755174 +30277976 +99090988 +282591932 +546374457 +490103571663 +15580874172203795679 +810156929 +7340217 +638124907 +259654328 +18809125 +18056758355458722638 +5679882735784101879 +7563081637033018620 +8520914754064026558 +283748271730 +67502526 +9870724566353399 +7242736046355514492 +572130134 +514786024 +214958409445 +29048192479791616 +2625321576501808484 +5354604872597767596 +29048106498198701 +2575517759332551933 +6311975551774360856 +14036340911856223966 +32150286927595340 +17291573824845253535 +14926165161459649868 12640696470018459947 -30176223702288623 -9870724570219682 -33278412725750974 -1409876968 -28766150282773591 -1873618450346674286 +17498716255300421272 +3968978683605551949 +16377260960560187819 +19177532404009207 +2625321597997156982 +24445261 +5245848878456439955 +421319345246 +5510538272098551989 +70320604 +3249068390006196153 +5888081980883929307 +1836516380 +12976359 +236453760381 +2141513421469058406 +1873618497636600365 +11878630053446878902 +6156456003434055463 +27638058877519937 +18413109962987470918 +6288511205539515238 +4770547828131824981 +4160689491693538063 +14836382508930370955 +12751507524739009261 +10427987387505837891 +2605266760616185153 +2524806001290315567 +33560429128451329 +4325515 +669516658 +15794439 +807142269 +5303047104041388600 +818611132 +61669791 +12644080653952551280 +6045857707735386835 +11229983338076703492 +2845029447323880298 +18412827972379344962 +6767393152337644543 +2673382969485886910 +15185362616787929146 +17490170188584258190 +4047541379259827663 +15680489859993767209 +546177847 +7143606 +637928298 +7276444624641068235 +12287601267178473523 +31022238513759415 +17698252132056434004 +1732546160493595960 +7036226112429884975 +2676033644081056812 +548995910 +90243587 +571933524 +812778389 +9870724566156739 +214958212644 +1873618446046923526 +3493083035910933027 +15291935501556190620 +14650572868605052119 +6971710725545264615 +17302333254828493968 +6098975847429179176 +4504298213822565083 +505938649 +3579577413 +2786543383251193103 +70123993 +47186305 +2352415791 +4279174221 +2625321597996960522 +1538130937 +161874570 +17082847207615236134 +6206321707968234614 +8854886129749066875 +10908568553618343357 +2785415326238639918 +1873618527730534170 +1873618441748940565 
+5745384143842643344 +18413674017288355935 +16044698410491643447 +9181531069949872018 +10905173367761798655 +13237708544183762948 +3757107087862401328 +1311572948 +2034107431 +15597828 +2538734651 +5727354392818878727 +4128904 +818414521 +95879699 +5727354422913010657 +5245848874158263187 +9664889374910385451 +18411699915366727708 +14851060135220743194 +17958290734101235336 +9319686106503382840 +89657146100418951 +11349795265056081195 +14540810596246030644 +5779476284463187670 +18415907 +156041850 +259261111 +821232589 +809763710 +98697768 +6946995 +5941764153383128192 +17684252729367202593 +10233694917695638297 +970700105235760464 +21715753112570631 +17953636526298302297 +6262673798361580735 +5847102830955857465 +3313969578832561394 +2974323816123992770 +13271165719268362246 +17083693200934636558 +6101795934071424788 +16990917635978692369 +812581780 +16327183209838150989 +21233971 +1535116279 +214958016090 +2625321606595545096 +3232498753 +1500709877 +514392806 +5831598146013367591 +4502324004423927097 +3099205763721988894 15290243360148359553 -14036340911856223966 -6365225461738636619 -816645035 -417019398489 -6206321673575531611 -12057284352529139627 -71172585 -13828334 -7528870385169533979 -5832726134240118664 -2785415334835848520 -2572415553107265488 +1873618476140856959 +3295137431799204142 +14130457194541352666 +8910392170935354895 +3967850626592737364 +18412545938780258356 +12583138 +505742040 +4278977611 +540148509 +24052042 +196084388 +563086155 +104333894 +2625321597996763849 +16324853745603185849 +13586095437453200186 +15804734059994287439 +18005251247539029895 +13516735047310051359 +3493677603186412637 +10159956468397444373 +5249797099496672683 +17763448248357489818 +18412263948172132400 61276571 +7630443591734791098 3932293 -9870724568188981 +72745468 +95683088 +15401217 +4076606693818764590 +15986098390340470919 +1873618519131556994 +9386257309953099582 +8501910827968825512 +168849237244054062 +6750384 +545784627 +2625321585100000596 +1652810939277510396 +580191075 +98501157 +5198803303557629187 +3297856681506178941 +3935742187522887052 +2601013084734032090 +11500631658516907905 +8021450341214588326 +14977809576148535095 +4127600472563058730 +16965951797418331227 +27356081165698156 +491258567 +12804866717273491655 +1408762855 +2573543666009114673 +2200512120787569683 +2625321606595348609 +21037361 +14462121002204464918 +5619444426388998007 +3973491023432910866 +12103109825679658143 +7260902865540482639 +5566476571519223063 +18413109971585663048 +17791918762976347730 +16365628939578247566 +4449074137450482853 +11214563466575480865 +7239069803025663720 +17952462371364276975 +9512531412808567772 +11075097734987253589 +2373415502940997016 +16874702537456224943 +517014256 +2573543627316201844 +4278781002 +69730775 +9870724568582655 +12386527 +12743882002561631754 +10906583475570214623 +104137283 +35324256 +10167863869407233224 +18412827980977537092 +363084051790629688 +11694336983993944146 +1873618441748546884 +32432320525830439 +12654580528992553525 +7241043922144659849 +9391897706793274792 +152830562 +1402930148 +164299420 +5303047073946667464 +3735682 +61079961 +15204606 1873618549225491555 -2360543918673038210 -98828841 -12512221777814685432 -17939922315943150958 -6045857707735386835 -21692726 -4502324038816629924 -11490081257974859839 -17639632887023929831 -1316357237551401394 -6101795994259359091 -11796695 -69140942 -18411699889572151318 -12074216556992400767 -1320813529 -8618954206934993224 -164037275 -4160546838840674266 -12591757708863407913 -555549513 -9870724566156739 
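The test-suite hunks that follow replace NumPy's legacy global-state random API with the `numpy.random.Generator` interface, so each test seeds its own local generator instead of mutating process-wide state through `np.random.seed`. As a reading aid only (this sketch is not part of the patch; the seed value is arbitrary, the tests mostly use 0 or 12), the mapping applied throughout is:

    import numpy as np

    rng = np.random.default_rng(seed=0)   # replaces np.random.seed(0)
    ints = rng.integers(0, 100, size=10)  # replaces np.random.randint
    floats = rng.random(10)               # replaces np.random.random / rand / sample
    normal = rng.standard_normal(10)      # replaces np.random.randn
    picks = rng.choice(floats, size=3)    # replaces np.random.choice / random.choice
    rng.shuffle(ints)                     # replaces np.random.shuffle (in place)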
diff --git a/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py
index a009802bab0..a8c5ae3b6a3 100644
--- a/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py
+++ b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py
@@ -14,9 +14,10 @@ def with_nulls(request):
 @pytest.mark.parametrize("nrows", [30, 300, 300_000])
 @pytest.mark.parametrize("nkeys", [1, 2, 4])
 def test_groupby_maintain_order_random(nrows, nkeys, with_nulls):
+    rng = np.random.default_rng(seed=0)
     key_names = [f"key{key}" for key in range(nkeys)]
-    key_values = [np.random.randint(100, size=nrows) for _ in key_names]
-    value = np.random.randint(-100, 100, size=nrows)
+    key_values = [rng.integers(100, size=nrows) for _ in key_names]
+    value = rng.integers(-100, 100, size=nrows)
     df = cudf.DataFrame(dict(zip(key_names, key_values), value=value))
     if with_nulls:
         for key in key_names:
diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py
index 979c936a182..af9a6c7e696 100644
--- a/python/cudf/cudf/tests/test_array_function.py
+++ b/python/cudf/cudf/tests/test_array_function.py
@@ -33,7 +33,7 @@ def __array_function__(self, *args, **kwargs):
 missing_arrfunc_reason = "NEP-18 support is not available in NumPy"

-np.random.seed(0)
+rng = np.random.default_rng(seed=0)


 @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason)
@@ -49,7 +49,7 @@ def __array_function__(self, *args, **kwargs):
     ],
 )
 def test_array_func_cudf_series(func):
-    np_ar = np.random.random(100)
+    np_ar = rng.random(100)
     cudf_ser = cudf.Series(np_ar)
     expect = func(np_ar)
     got = func(cudf_ser)
@@ -74,7 +74,7 @@ def test_array_func_cudf_series(func):
     ],
 )
 def test_array_func_cudf_dataframe(func):
-    pd_df = pd.DataFrame(np.random.uniform(size=(100, 10)))
+    pd_df = pd.DataFrame(rng.uniform(size=(100, 10)))
     cudf_df = cudf.from_pandas(pd_df)
     expect = func(pd_df)
     got = func(cudf_df)
@@ -91,7 +91,7 @@ def test_array_func_cudf_dataframe(func):
     ],
 )
 def test_array_func_missing_cudf_dataframe(func):
-    pd_df = pd.DataFrame(np.random.uniform(size=(100, 10)))
+    pd_df = pd.DataFrame(rng.uniform(size=(100, 10)))
     cudf_df = cudf.from_pandas(pd_df)
     with pytest.raises(TypeError):
         func(cudf_df)
@@ -105,7 +105,7 @@ def test_array_func_missing_cudf_dataframe(func):
     ],
 )
 def test_array_func_cudf_index(func):
-    np_ar = np.random.random(100)
+    np_ar = rng.random(100)
     cudf_index = cudf.Index(cudf.Series(np_ar))
     expect = func(np_ar)
     got = func(cudf_index)
@@ -125,7 +125,7 @@ def test_array_func_cudf_index(func):
     ],
 )
 def test_array_func_missing_cudf_index(func):
-    np_ar = np.random.random(100)
+    np_ar = rng.random(100)
     cudf_index = cudf.Index(cudf.Series(np_ar))
     with pytest.raises(TypeError):
         func(cudf_index)
diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
index 5acdf36de80..17ef033ea9e 100644
--- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
+++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
@@ -600,12 +600,12 @@ def test_avro_reader_multiblock(
     else:
         assert dtype in ("float32", "float64")
         avro_type = "float" if dtype == "float32" else "double"
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)
     # We don't use rand_dataframe() here, because it increases the
     # execution time of each test by a factor of 10 or more (it appears
     # to use a very costly approach to generating random data).
     # See also: https://github.com/rapidsai/cudf/issues/13128
-    values = np.random.rand(total_rows).astype(dtype)
+    values = rng.random(total_rows).astype(dtype)
     bytes_per_row = values.dtype.itemsize

     # The sync_interval is the number of bytes between sync blocks. We know
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index 2e8519509e2..949fa909b5b 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -2,7 +2,6 @@

 import decimal
 import operator
-import random
 import warnings
 from itertools import combinations_with_replacement, product

@@ -179,7 +178,13 @@

 @pytest.mark.parametrize("obj_class", ["Series", "Index"])
 @pytest.mark.parametrize("binop", _binops)
-def test_series_binop(binop, obj_class):
+def test_series_binop(request, binop, obj_class):
+    request.applymarker(
+        pytest.mark.xfail(
+            binop is operator.floordiv,
+            reason="https://github.com/rapidsai/cudf/issues/17073",
+        )
+    )
     nelem = 1000
     arr1 = utils.gen_rand("float64", nelem) * 10000
     # Keeping a low value because CUDA 'pow' has 2 full range error
@@ -187,13 +192,15 @@ def test_series_binop(binop, obj_class):

     sr1 = Series(arr1)
     sr2 = Series(arr2)
+    psr1 = sr1.to_pandas()
+    psr2 = sr2.to_pandas()

     if obj_class == "Index":
         sr1 = Index(sr1)
         sr2 = Index(sr2)

+    expect = binop(psr1, psr2)
     result = binop(sr1, sr2)
-    expect = binop(pd.Series(arr1), pd.Series(arr2))

     if obj_class == "Index":
         result = Series(result)
@@ -204,7 +211,8 @@ def test_series_binop(binop, obj_class):
 @pytest.mark.parametrize("binop", _binops)
 def test_series_binop_concurrent(binop):
     def func(index):
-        arr = np.random.random(100) * 10
+        rng = np.random.default_rng(seed=0)
+        arr = rng.random(100) * 10
         sr = Series(arr)

         result = binop(sr.astype("int32"), sr)
@@ -223,8 +231,9 @@ def func(index):
 @pytest.mark.parametrize("obj_class", ["Series", "Index"])
 @pytest.mark.parametrize("nelem,binop", list(product([1, 2, 100], _binops)))
 def test_series_binop_scalar(nelem, binop, obj_class, use_cudf_scalar):
-    arr = np.random.random(nelem)
-    rhs = random.choice(arr).item()
+    rng = np.random.default_rng(seed=0)
+    arr = rng.random(nelem)
+    rhs = rng.choice(arr).item()

     sr = Series(arr)
     if obj_class == "Index":
@@ -247,10 +256,11 @@ def test_series_binop_scalar(nelem, binop, obj_class, use_cudf_scalar):
     "lhs_dtype,rhs_dtype", list(product(_int_types, _int_types))
 )
 def test_series_bitwise_binop(binop, obj_class, lhs_dtype, rhs_dtype):
-    arr1 = (np.random.random(100) * 100).astype(lhs_dtype)
+    rng = np.random.default_rng(seed=0)
+    arr1 = (rng.random(100) * 100).astype(lhs_dtype)
     sr1 = Series(arr1)

-    arr2 = (np.random.random(100) * 100).astype(rhs_dtype)
+    arr2 = (rng.random(100) * 100).astype(rhs_dtype)
     sr2 = Series(arr2)

     if obj_class == "Index":
@@ -271,8 +281,9 @@ def test_series_bitwise_binop(binop, obj_class, lhs_dtype, rhs_dtype):
     "dtype", ["int8", "int32", "int64", "float32", "float64", "datetime64[ms]"]
 )
 def test_series_compare(cmpop, obj_class, dtype):
-    arr1 = np.random.randint(0, 100, 100).astype(dtype)
-    arr2 = np.random.randint(0, 100, 100).astype(dtype)
+    rng = np.random.default_rng(seed=0)
+    arr1 = rng.integers(0, 100, 100).astype(dtype)
+    arr2 = rng.integers(0, 100, 100).astype(dtype)

     sr1 = Series(arr1)
     sr2 = Series(arr2)
@@ -438,9 +449,10 @@ def test_str_series_compare_num_reflected(
 def test_series_compare_scalar(
     nelem, cmpop, obj_class, dtype, use_cudf_scalar
 ):
-    arr1 = np.random.randint(0, 100, 100).astype(dtype)
+    rng = np.random.default_rng(seed=0)
+    arr1 = rng.integers(0, 100, 100).astype(dtype)
     sr1 = Series(arr1)
-    rhs = random.choice(arr1).item()
+    rhs = rng.choice(arr1).item()

     if use_cudf_scalar:
         rhs = cudf.Scalar(rhs)
@@ -465,9 +477,9 @@ def test_series_compare_scalar(
 @pytest.mark.parametrize("nelem", [1, 7, 8, 9, 32, 64, 128])
 @pytest.mark.parametrize("lhs_nulls,rhs_nulls", list(product(_nulls, _nulls)))
 def test_validity_add(nelem, lhs_nulls, rhs_nulls):
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)
     # LHS
-    lhs_data = np.random.random(nelem)
+    lhs_data = rng.random(nelem)
     if lhs_nulls == "some":
         lhs_mask = utils.random_bitmask(nelem)
         lhs_bitmask = utils.expand_bits_to_bytes(lhs_mask)[:nelem]
@@ -478,7 +490,7 @@ def test_validity_add(nelem, lhs_nulls, rhs_nulls):
     else:
         lhs = Series(lhs_data)
     # RHS
-    rhs_data = np.random.random(nelem)
+    rhs_data = rng.random(nelem)
     if rhs_nulls == "some":
         rhs_mask = utils.random_bitmask(nelem)
         rhs_bitmask = utils.expand_bits_to_bytes(rhs_mask)[:nelem]
@@ -525,8 +537,9 @@ def test_validity_add(nelem, lhs_nulls, rhs_nulls):
 )
 def test_series_binop_mixed_dtype(binop, lhs_dtype, rhs_dtype, obj_class):
     nelem = 10
-    lhs = (np.random.random(nelem) * nelem).astype(lhs_dtype)
-    rhs = (np.random.random(nelem) * nelem).astype(rhs_dtype)
+    rng = np.random.default_rng(seed=0)
+    lhs = (rng.random(nelem) * nelem).astype(lhs_dtype)
+    rhs = (rng.random(nelem) * nelem).astype(rhs_dtype)

     sr1 = Series(lhs)
     sr2 = Series(rhs)
@@ -550,8 +563,9 @@ def test_series_binop_mixed_dtype(binop, lhs_dtype, rhs_dtype, obj_class):
 )
 def test_series_cmpop_mixed_dtype(cmpop, lhs_dtype, rhs_dtype, obj_class):
     nelem = 5
-    lhs = (np.random.random(nelem) * nelem).astype(lhs_dtype)
-    rhs = (np.random.random(nelem) * nelem).astype(rhs_dtype)
+    rng = np.random.default_rng(seed=0)
+    lhs = (rng.random(nelem) * nelem).astype(lhs_dtype)
+    rhs = (rng.random(nelem) * nelem).astype(rhs_dtype)

     sr1 = Series(lhs)
     sr2 = Series(rhs)
@@ -574,8 +588,7 @@ def test_series_cmpop_mixed_dtype(cmpop, lhs_dtype, rhs_dtype, obj_class):
 )
 def test_series_reflected_ops_scalar(func, dtype, obj_class):
     # create random series
-    np.random.seed(12)
-    random_series = utils.gen_rand(dtype, 100, low=10)
+    random_series = utils.gen_rand(dtype, 100, low=10, seed=12)

     # gpu series
     gs = Series(random_series)
@@ -631,8 +644,7 @@ def test_series_reflected_ops_cudf_scalar(funcs, dtype, obj_class):
     cpu_func, gpu_func = funcs

     # create random series
-    np.random.seed(12)
-    random_series = utils.gen_rand(dtype, 100, low=10)
+    random_series = utils.gen_rand(dtype, 100, low=10, seed=12)

     # gpu series
     gs = Series(random_series)
@@ -774,7 +786,8 @@ def test_df_different_index_shape(df2, binop):
 @pytest.mark.parametrize("op", [operator.eq, operator.ne])
 def test_boolean_scalar_binop(op):
-    psr = pd.Series(np.random.choice([True, False], 10))
+    rng = np.random.default_rng(seed=0)
+    psr = pd.Series(rng.choice([True, False], 10))
     gsr = cudf.from_pandas(psr)
     assert_eq(op(psr, True), op(gsr, True))
     assert_eq(op(psr, False), op(gsr, False))
@@ -923,16 +936,17 @@ def test_operator_func_dataframe(func, nulls, fill_value, other):
     num_cols = 3

     def gen_df():
+        rng = np.random.default_rng(seed=0)
         pdf = pd.DataFrame()
         from string import ascii_lowercase

-        cols = np.random.choice(num_cols + 5, num_cols, replace=False)
+        cols = rng.choice(num_cols + 5, num_cols, replace=False)
         for i in range(num_cols):
             colname = ascii_lowercase[cols[i]]
             data = utils.gen_rand("float64", num_rows) * 10000
             if nulls == "some":
-                idx = np.random.choice(
+                idx = rng.choice(
                     num_rows, size=int(num_rows / 2), replace=False
                 )
                 data[idx] = np.nan
@@ -954,21 +968,21 @@ def gen_df():
 @pytest.mark.parametrize("nulls", _nulls)
 @pytest.mark.parametrize("other", ["df", "scalar"])
 def test_logical_operator_func_dataframe(func, nulls, other):
-    np.random.seed(0)
     num_rows = 100
     num_cols = 3

     def gen_df():
+        rng = np.random.default_rng(seed=0)
         pdf = pd.DataFrame()
         from string import ascii_lowercase

-        cols = np.random.choice(num_cols + 5, num_cols, replace=False)
+        cols = rng.choice(num_cols + 5, num_cols, replace=False)
         for i in range(num_cols):
             colname = ascii_lowercase[cols[i]]
             data = utils.gen_rand("float64", num_rows) * 10000
             if nulls == "some":
-                idx = np.random.choice(
+                idx = rng.choice(
                     num_rows, size=int(num_rows / 2), replace=False
                 )
                 data[idx] = np.nan
@@ -977,8 +991,12 @@ def gen_df():

     pdf1 = gen_df()
     pdf2 = gen_df() if other == "df" else 59.0
-    gdf1 = cudf.DataFrame.from_pandas(pdf1)
-    gdf2 = cudf.DataFrame.from_pandas(pdf2) if other == "df" else 59.0
+    gdf1 = cudf.DataFrame.from_pandas(pdf1, nan_as_null=False)
+    gdf2 = (
+        cudf.DataFrame.from_pandas(pdf2, nan_as_null=False)
+        if other == "df"
+        else 59.0
+    )

     got = getattr(gdf1, func)(gdf2)
     expect = getattr(pdf1, func)(pdf2)[list(got._data)]
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index cd1ad21ae59..db41f689255 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -252,10 +252,10 @@ def test_cat_series_binop_error():
 @pytest.mark.parametrize("num_elements", [10, 100, 1000])
 def test_categorical_unique(num_elements):
     # create categorical series
-    np.random.seed(12)
+    rng = np.random.default_rng(seed=12)
     pd_cat = pd.Categorical(
         pd.Series(
-            np.random.choice(
+            rng.choice(
                 list(string.ascii_letters + string.digits), num_elements
             ),
             dtype="category",
@@ -279,12 +279,10 @@ def test_categorical_unique(num_elements):
 @pytest.mark.parametrize("nelem", [20, 50, 100])
 def test_categorical_unique_count(nelem):
     # create categorical series
-    np.random.seed(12)
+    rng = np.random.default_rng(seed=0)
     pd_cat = pd.Categorical(
         pd.Series(
-            np.random.choice(
-                list(string.ascii_letters + string.digits), nelem
-            ),
+            rng.choice(list(string.ascii_letters + string.digits), nelem),
             dtype="category",
         )
     )
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index 4aa7fb27c9b..65947efc2df 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -31,12 +31,13 @@
 @pytest.fixture(params=dtypes, ids=dtypes)
 def pandas_input(request):
     dtype = request.param
-    rng = np.random.default_rng()
+    rng = np.random.default_rng(seed=0)
     size = 100

     def random_ints(dtype, size):
         dtype_min = np.iinfo(dtype).min
         dtype_max = np.iinfo(dtype).max
+        rng = np.random.default_rng(seed=0)
         return rng.integers(dtype_min, dtype_max, size=size, dtype=dtype)

     try:
@@ -154,7 +155,9 @@ def test_column_slicing(pandas_input, offset, size):
     [cudf.Decimal128Dtype, cudf.Decimal64Dtype, cudf.Decimal32Dtype],
 )
 def test_decimal_column_slicing(offset, size, precision, scale, decimal_type):
-    col = cudf.core.column.as_column(pd.Series(np.random.rand(1000)))
+    col = cudf.core.column.as_column(
+        pd.Series(np.random.default_rng(seed=0).random(1000))
+    )
     col = col.astype(decimal_type(precision, scale))
     column_slicing_test(col, offset, size, True)
diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py
index 8da589ba45b..ab0f1767cd6 100644
--- a/python/cudf/cudf/tests/test_concat.py
+++ b/python/cudf/cudf/tests/test_concat.py
@@ -30,6 +30,7 @@ def _hide_concat_empty_dtype_warning():

 def make_frames(index=None, nulls="none"):
+    rng = np.random.default_rng(seed=0)
     df = pd.DataFrame(
         {
             "x": range(10),
@@ -51,7 +52,7 @@ def make_frames(index=None, nulls="none"):
         df2.y = np.full_like(df2.y, np.nan)
     if nulls == "some":
         mask = np.arange(10)
-        np.random.shuffle(mask)
+        rng.shuffle(mask)
         mask = mask[:5]
         df.loc[mask, "y"] = np.nan
         df2.loc[mask, "y"] = np.nan
@@ -203,10 +204,9 @@ def test_concat_misordered_columns():
 @pytest.mark.parametrize("axis", [1, "columns"])
 def test_concat_columns(axis):
-    pdf1 = pd.DataFrame(np.random.randint(10, size=(5, 3)), columns=[1, 2, 3])
-    pdf2 = pd.DataFrame(
-        np.random.randint(10, size=(5, 4)), columns=[4, 5, 6, 7]
-    )
+    rng = np.random.default_rng(seed=0)
+    pdf1 = pd.DataFrame(rng.integers(10, size=(5, 3)), columns=[1, 2, 3])
+    pdf2 = pd.DataFrame(rng.integers(10, size=(5, 4)), columns=[4, 5, 6, 7])
     gdf1 = cudf.from_pandas(pdf1)
     gdf2 = cudf.from_pandas(pdf2)

@@ -1398,11 +1398,12 @@ def test_concat_single_object(ignore_index, typ):
     ],
 )
 def test_concat_decimal_dataframe(ltype, rtype):
+    rng = np.random.default_rng(seed=0)
     gdf1 = cudf.DataFrame(
-        {"id": np.random.randint(0, 10, 3), "val": ["22.3", "59.5", "81.1"]}
+        {"id": rng.integers(0, 10, 3), "val": ["22.3", "59.5", "81.1"]}
     )
     gdf2 = cudf.DataFrame(
-        {"id": np.random.randint(0, 10, 3), "val": ["2.35", "5.59", "8.14"]}
+        {"id": rng.integers(0, 10, 3), "val": ["2.35", "5.59", "8.14"]}
     )

     gdf1["val"] = gdf1["val"].astype(ltype)
diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py
index 9b6f82ec705..f33cfe268a3 100644
--- a/python/cudf/cudf/tests/test_copying.py
+++ b/python/cudf/cudf/tests/test_copying.py
@@ -16,8 +16,9 @@
 @pytest.mark.parametrize("dtype", NUMERIC_TYPES + OTHER_TYPES)
 def test_repeat(dtype):
-    arr = np.random.rand(10) * 10
-    repeats = np.random.randint(10, size=10)
+    rng = np.random.default_rng(seed=0)
+    arr = rng.random(10) * 10
+    repeats = rng.integers(10, size=10)
     psr = pd.Series(arr).astype(dtype)
     gsr = cudf.from_pandas(psr)

@@ -25,18 +26,20 @@ def test_repeat(dtype):

 def test_repeat_index():
+    rng = np.random.default_rng(seed=0)
     arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
     psr = pd.MultiIndex.from_arrays(arrays, names=("number", "color"))
     gsr = cudf.from_pandas(psr)
-    repeats = np.random.randint(10, size=4)
+    repeats = rng.integers(10, size=4)

     assert_eq(psr.repeat(repeats), gsr.repeat(repeats))


 def test_repeat_dataframe():
+    rng = np.random.default_rng(seed=0)
     psr = pd.DataFrame({"a": [1, 1, 2, 2]})
     gsr = cudf.from_pandas(psr)
-    repeats = np.random.randint(10, size=4)
+    repeats = rng.integers(10, size=4)

     # pd.DataFrame doesn't have repeat() so as a workaround, we are
     # comparing pd.Series.repeat() with cudf.DataFrame.repeat()['a']
@@ -45,7 +48,8 @@ def test_repeat_dataframe():

 @pytest.mark.parametrize("dtype", NUMERIC_TYPES)
 def test_repeat_scalar(dtype):
-    arr = np.random.rand(10) * 10
+    rng = np.random.default_rng(seed=0)
+    arr = rng.random(10) * 10
     repeats = 10
     psr = pd.Series(arr).astype(dtype)
     gsr = cudf.from_pandas(psr)
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index b6efc8ebd88..8800275bf67 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -1764,13 +1764,13 @@ def test_csv_writer_multiindex(tmpdir):
     pdf_df_fname = tmpdir.join("pdf_df_3.csv")
     gdf_df_fname = tmpdir.join("gdf_df_3.csv")

-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)
     gdf = cudf.DataFrame(
         {
-            "a": np.random.randint(0, 5, 20),
-            "b": np.random.randint(0, 5, 20),
+            "a": rng.integers(0, 5, 20),
+            "b": rng.integers(0, 5, 20),
             "c": range(20),
-            "d": np.random.random(20),
+            "d": rng.random(20),
         }
     )
     gdg = gdf.groupby(["a", "b"]).mean()
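A property the converted tests lean on, worth keeping in mind while reading the larger `test_dataframe.py` diff that follows: generators built from the same seed reproduce the same stream, independent of any other generator in the process. A small sketch (plain NumPy, not part of the patch):

    import numpy as np

    # Same seed, same stream -- this is what makes per-test (and even
    # per-helper) generators deterministic without any global state.
    a = np.random.default_rng(seed=0).integers(0, 1000, size=5)
    b = np.random.default_rng(seed=0).integers(0, 1000, size=5)
    assert (a == b).all()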
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 6f88d942746..7c3547c59d6 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -428,7 +428,7 @@ def test_series_init_none():

 def test_dataframe_basic():
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)
     df = cudf.DataFrame()

     # Populate with cuda memory
@@ -437,7 +437,7 @@ def test_dataframe_basic():
     assert len(df) == 10

     # Populate with numpy array
-    rnd_vals = np.random.random(10)
+    rnd_vals = rng.random(10)
     df["vals"] = rnd_vals
     np.testing.assert_equal(df["vals"].to_numpy(), rnd_vals)
     assert len(df) == 10
@@ -1238,8 +1238,9 @@ def test_empty_dataframe_to_cupy():
     df = cudf.DataFrame()

     nelem = 123
+    rng = np.random.default_rng(seed=0)
     for k in "abc":
-        df[k] = np.random.random(nelem)
+        df[k] = rng.random(nelem)

     # Check all columns in empty dataframe.
     mat = df.head(0).to_cupy()
@@ -1250,8 +1251,9 @@ def test_dataframe_to_cupy():
     df = cudf.DataFrame()

     nelem = 123
+    rng = np.random.default_rng(seed=0)
     for k in "abcd":
-        df[k] = np.random.random(nelem)
+        df[k] = rng.random(nelem)

     # Check all columns
     mat = df.to_cupy()
@@ -1279,8 +1281,9 @@ def test_dataframe_to_cupy_null_values():
     na = -10000
     refvalues = {}
+    rng = np.random.default_rng(seed=0)
     for k in "abcd":
-        df[k] = data = np.random.random(nelem)
+        df[k] = data = rng.random(nelem)
         bitmask = utils.random_bitmask(nelem)
         df[k] = df[k]._column.set_mask(bitmask)
         boolmask = np.asarray(
@@ -1321,10 +1324,11 @@ def test_dataframe_append_empty():

 def test_dataframe_setitem_from_masked_object():
-    ary = np.random.randn(100)
+    rng = np.random.default_rng(seed=0)
+    ary = rng.standard_normal(100)
     mask = np.zeros(100, dtype=bool)
     mask[:20] = True
-    np.random.shuffle(mask)
+    rng.shuffle(mask)
     ary[mask] = np.nan

     test1_null = cudf.Series(ary, nan_as_null=True)
@@ -1534,14 +1538,12 @@ def test_dataframe_hash_values_xxhash64():
 @pytest.mark.parametrize("nparts", [1, 2, 8, 13])
 @pytest.mark.parametrize("nkeys", [1, 2])
 def test_dataframe_hash_partition(nrows, nparts, nkeys):
-    np.random.seed(123)
-    gdf = cudf.DataFrame()
-    keycols = []
-    for i in range(nkeys):
-        keyname = f"key{i}"
-        gdf[keyname] = np.random.randint(0, 7 - i, nrows)
-        keycols.append(keyname)
-    gdf["val1"] = np.random.randint(0, nrows * 2, nrows)
+    rng = np.random.default_rng(seed=0)
+    gdf = cudf.DataFrame(
+        {f"key{i}": rng.integers(0, 7 - i, nrows) for i in range(nkeys)}
+    )
+    keycols = gdf.columns.to_list()
+    gdf["val1"] = rng.integers(0, nrows * 2, nrows)

     got = gdf.partition_by_hash(keycols, nparts=nparts)

     # Must return a list
@@ -1751,8 +1753,9 @@ def test_concat_with_axis():

     assert_eq(concat_cdf_s, concat_s, check_index_type=True)

+    rng = np.random.default_rng(seed=0)
     # concat series and dataframes
-    s3 = pd.Series(np.random.random(5))
+    s3 = pd.Series(rng.random(5))
     cs3 = cudf.Series.from_pandas(s3)

     concat_cdf_all = cudf.concat([cdf1, cs3, cdf2], axis=1)
@@ -1787,13 +1790,14 @@ def test_concat_with_axis():
         check_index_type=True,
     )

+    rng = np.random.default_rng(seed=0)
     # concat groupby multi index
     gdf1 = cudf.DataFrame(
         {
-            "x": np.random.randint(0, 10, 10),
-            "y": np.random.randint(0, 10, 10),
-            "z": np.random.randint(0, 10, 10),
-            "v": np.random.randint(0, 10, 10),
+            "x": rng.integers(0, 10, 10),
+            "y": rng.integers(0, 10, 10),
+            "z": rng.integers(0, 10, 10),
+            "v": rng.integers(0, 10, 10),
         }
     )
     gdf2 = gdf1[5:]
@@ -1833,14 +1837,14 @@ def test_concat_with_axis():

 @pytest.mark.parametrize("nrows", [0, 3, 10, 100, 1000])
 def test_nonmatching_index_setitem(nrows):
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)

     gdf = cudf.DataFrame()
-    gdf["a"] = np.random.randint(2147483647, size=nrows)
-    gdf["b"] = np.random.randint(2147483647, size=nrows)
+    gdf["a"] = rng.integers(2147483647, size=nrows)
+    gdf["b"] = rng.integers(2147483647, size=nrows)
     gdf = gdf.set_index("b")

-    test_values = np.random.randint(2147483647, size=nrows)
+    test_values = rng.integers(2147483647, size=nrows)
     gdf["c"] = test_values
     assert len(test_values) == len(gdf["c"])
     gdf_series = cudf.Series(test_values, index=gdf.index, name="c")
@@ -1974,10 +1978,11 @@ def test_index_in_dataframe_constructor():
 @pytest.mark.parametrize("nelem", [0, 2, 3, 100, 1000])
 @pytest.mark.parametrize("data_type", dtypes)
 def test_from_arrow(nelem, data_type):
+    rng = np.random.default_rng(seed=0)
     df = pd.DataFrame(
         {
-            "a": np.random.randint(0, 1000, nelem).astype(data_type),
-            "b": np.random.randint(0, 1000, nelem).astype(data_type),
+            "a": rng.integers(0, 1000, nelem).astype(data_type),
+            "b": rng.integers(0, 1000, nelem).astype(data_type),
         }
     )
     padf = pa.Table.from_pandas(
@@ -2012,10 +2017,11 @@ def test_from_arrow_chunked_categories():
 @pytest.mark.parametrize("nelem", [0, 2, 3, 100, 1000])
 @pytest.mark.parametrize("data_type", dtypes)
 def test_to_arrow(nelem, data_type):
+    rng = np.random.default_rng(seed=0)
     df = pd.DataFrame(
         {
-            "a": np.random.randint(0, 1000, nelem).astype(data_type),
-            "b": np.random.randint(0, 1000, nelem).astype(data_type),
+            "a": rng.integers(0, 1000, nelem).astype(data_type),
+            "b": rng.integers(0, 1000, nelem).astype(data_type),
         }
     )
     gdf = cudf.DataFrame.from_pandas(df)
@@ -2119,17 +2125,16 @@ def test_to_arrow_missing_categorical():
 @pytest.mark.parametrize("data_type", dtypes)
 def test_from_scalar_typing(data_type):
+    rng = np.random.default_rng(seed=0)
     if data_type == "datetime64[ms]":
         scalar = (
-            np.dtype("int64")
-            .type(np.random.randint(0, 5))
-            .astype("datetime64[ms]")
+            np.dtype("int64").type(rng.integers(0, 5)).astype("datetime64[ms]")
         )
     elif data_type.startswith("datetime64"):
         scalar = np.datetime64(datetime.date.today()).astype("datetime64[ms]")
         data_type = "datetime64[ms]"
     else:
-        scalar = np.dtype(data_type).type(np.random.randint(0, 5))
+        scalar = np.dtype(data_type).type(rng.integers(0, 5))

     gdf = cudf.DataFrame()
     gdf["a"] = [1, 2, 3, 4, 5]
@@ -2140,7 +2145,8 @@ def test_from_scalar_typing(data_type):

 @pytest.mark.parametrize("data_type", NUMERIC_TYPES)
 def test_from_python_array(data_type):
-    np_arr = np.random.randint(0, 100, 10).astype(data_type)
+    rng = np.random.default_rng(seed=0)
+    np_arr = rng.integers(0, 100, 10).astype(data_type)
     data = memoryview(np_arr)
     data = arr.array(data.format, data)
@@ -2220,7 +2226,7 @@ def test_dataframe_transpose(nulls, num_cols, num_rows, dtype):
     # against pandas nullable types as they are the ones that closely
     # resemble `cudf` dtypes behavior.
     pdf = pd.DataFrame()
-
+    rng = np.random.default_rng(seed=0)
     null_rep = np.nan if dtype in ["float32", "float64"] else None
     np_dtype = dtype
     dtype = np.dtype(dtype)
@@ -2228,13 +2234,11 @@ def test_dataframe_transpose(nulls, num_cols, num_rows, dtype):
     for i in range(num_cols):
         colname = string.ascii_lowercase[i]
         data = pd.Series(
-            np.random.randint(0, 26, num_rows).astype(np_dtype),
+            rng.integers(0, 26, num_rows).astype(np_dtype),
             dtype=dtype,
         )
         if nulls == "some":
-            idx = np.random.choice(
-                num_rows, size=int(num_rows / 2), replace=False
-            )
+            idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False)
             if len(idx):
                 data[idx] = null_rep
         elif nulls == "all":
@@ -2652,8 +2656,8 @@ def test_unaryops_df(pdf, unaryop, col_name, assign_col_name):

 def test_df_abs(pdf):
-    np.random.seed(0)
-    disturbance = pd.Series(np.random.rand(10))
+    rng = np.random.default_rng(seed=0)
+    disturbance = pd.Series(rng.random(10))
     pdf = pdf - 5 + disturbance
     d = pdf.apply(np.abs)
     g = cudf.from_pandas(pdf).abs()
@@ -2706,8 +2710,9 @@ def test_iteritems(gdf):
 def test_quantile(q, numeric_only):
     ts = pd.date_range("2018-08-24", periods=5, freq="D")
     td = pd.to_timedelta(np.arange(5), unit="h")
+    rng = np.random.default_rng(seed=0)
     pdf = pd.DataFrame(
-        {"date": ts, "delta": td, "val": np.random.randn(len(ts))}
+        {"date": ts, "delta": td, "val": rng.standard_normal(len(ts))}
     )
     gdf = cudf.DataFrame.from_pandas(pdf)

@@ -2729,9 +2734,10 @@ def test_quantile(q, numeric_only):
     [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype],
 )
 def test_decimal_quantile(q, interpolation, decimal_type):
+    rng = np.random.default_rng(seed=0)
     data = ["244.8", "32.24", "2.22", "98.14", "453.23", "5.45"]
     gdf = cudf.DataFrame(
-        {"id": np.random.randint(0, 10, size=len(data)), "val": data}
+        {"id": rng.integers(0, 10, size=len(data)), "val": data}
     )
     gdf["id"] = gdf["id"].astype("float64")
     gdf["val"] = gdf["val"].astype(decimal_type(7, 2))
@@ -2843,9 +2849,9 @@ def test_cuda_array_interface(dtype):
 @pytest.mark.parametrize("nchunks", [1, 2, 5, 10])
 @pytest.mark.parametrize("data_type", dtypes)
 def test_from_arrow_chunked_arrays(nelem, nchunks, data_type):
+    rng = np.random.default_rng(seed=0)
     np_list_data = [
-        np.random.randint(0, 100, nelem).astype(data_type)
-        for i in range(nchunks)
+        rng.integers(0, 100, nelem).astype(data_type) for i in range(nchunks)
     ]
     pa_chunk_array = pa.chunked_array(np_list_data)

@@ -2855,8 +2861,7 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type):
     assert_eq(expect, got)

     np_list_data2 = [
-        np.random.randint(0, 100, nelem).astype(data_type)
-        for i in range(nchunks)
+        rng.integers(0, 100, nelem).astype(data_type) for i in range(nchunks)
     ]
     pa_chunk_array2 = pa.chunked_array(np_list_data2)
     pa_table = pa.Table.from_arrays(
@@ -2881,11 +2886,13 @@ def query_GPU_memory(note=""):
     cuda.current_context().deallocations.clear()
     nRows = int(1e8)
     nCols = 2
-    dataNumpy = np.asfortranarray(np.random.rand(nRows, nCols))
+    rng = np.random.default_rng(seed=0)
+    dataNumpy = np.asfortranarray(rng.random(size=(nRows, nCols)))
     colNames = ["col" + str(iCol) for iCol in range(nCols)]
     pandasDF = pd.DataFrame(data=dataNumpy, columns=colNames, dtype=np.float32)
     cudaDF = cudf.core.DataFrame.from_pandas(pandasDF)
-    boolmask = cudf.Series(np.random.randint(1, 2, len(cudaDF)).astype("bool"))
+    rng = np.random.default_rng(seed=0)
+    boolmask = cudf.Series(rng.integers(1, 2, len(cudaDF)).astype("bool"))

     memory_used = query_GPU_memory()
     cudaDF = cudaDF[boolmask]
@@ -2903,7 +2910,8 @@ def query_GPU_memory(note=""):

 def test_boolmask(pdf, gdf):
-    boolmask = np.random.randint(0, 2, len(pdf)) > 0
+    rng = np.random.default_rng(seed=0)
+    boolmask = rng.integers(0, 2, len(pdf)) > 0
     gdf = gdf[boolmask]
     pdf = pdf[boolmask]
     assert_eq(pdf, gdf)
@@ -2922,12 +2930,11 @@ def test_boolmask(pdf, gdf):
     ],
 )
 def test_dataframe_boolmask(mask_shape):
-    pdf = pd.DataFrame()
-    for col in "abc":
-        pdf[col] = np.random.randint(0, 10, 3)
-    pdf_mask = pd.DataFrame()
-    for col in mask_shape[1]:
-        pdf_mask[col] = np.random.randint(0, 2, mask_shape[0]) > 0
+    rng = np.random.default_rng(seed=0)
+    pdf = pd.DataFrame({col: rng.integers(0, 10, 3) for col in "abc"})
+    pdf_mask = pd.DataFrame(
+        {col: rng.integers(0, 2, mask_shape[0]) > 0 for col in mask_shape[1]}
+    )
     gdf = cudf.DataFrame.from_pandas(pdf)
     gdf_mask = cudf.DataFrame.from_pandas(pdf_mask)
     gdf = gdf[gdf_mask]
@@ -2992,7 +2999,8 @@ def test_arrow_handle_no_index_name(pdf, gdf):

 def test_pandas_non_contiguious():
-    arr1 = np.random.sample([5000, 10])
+    rng = np.random.default_rng(seed=0)
+    arr1 = rng.random(size=(5000, 10))
     assert arr1.flags["C_CONTIGUOUS"] is True
     df = pd.DataFrame(arr1)
     for col in df.columns:
@@ -3052,10 +3060,11 @@ def test_series_rename():
 @pytest.mark.parametrize("data_type", dtypes)
 @pytest.mark.parametrize("nelem", [0, 100])
 def test_head_tail(nelem, data_type):
+    rng = np.random.default_rng(seed=0)
     pdf = pd.DataFrame(
         {
-            "a": np.random.randint(0, 1000, nelem).astype(data_type),
-            "b": np.random.randint(0, 1000, nelem).astype(data_type),
+            "a": rng.integers(0, 1000, nelem).astype(data_type),
+            "b": rng.integers(0, 1000, nelem).astype(data_type),
         }
     )
     gdf = cudf.from_pandas(pdf)
@@ -3308,15 +3317,15 @@ def test_set_index_verify_integrity(data, index, verify_integrity):
 @pytest.mark.parametrize("drop", [True, False])
 @pytest.mark.parametrize("nelem", [10, 200, 1333])
 def test_set_index_multi(drop, nelem):
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)
     a = np.arange(nelem)
-    np.random.shuffle(a)
+    rng.shuffle(a)
     df = pd.DataFrame(
         {
             "a": a,
-            "b": np.random.randint(0, 4, size=nelem),
-            "c": np.random.uniform(low=0, high=4, size=nelem),
-            "d": np.random.choice(["green", "black", "white"], nelem),
+            "b": rng.integers(0, 4, size=nelem),
+            "c": rng.uniform(low=0, high=4, size=nelem),
+            "d": rng.choice(["green", "black", "white"], nelem),
         }
     )
     df["e"] = df["d"].astype("category")
@@ -3894,13 +3903,13 @@ def test_select_dtype_datetime_with_frequency():

 def test_dataframe_describe_exclude():
-    np.random.seed(12)
+    rng = np.random.default_rng(seed=12)
     data_length = 10000

     df = cudf.DataFrame()
-    df["x"] = np.random.normal(10, 1, data_length)
+    df["x"] = rng.normal(10, 1, data_length)
     df["x"] = df.x.astype("int64")
-    df["y"] = np.random.normal(10, 1, data_length)
+    df["y"] = rng.normal(10, 1, data_length)
     pdf = df.to_pandas()

     gdf_results = df.describe(exclude=["float"])
@@ -3910,13 +3919,13 @@ def test_dataframe_describe_exclude():

 def test_dataframe_describe_include():
-    np.random.seed(12)
+    rng = np.random.default_rng(seed=12)
     data_length = 10000

     df = cudf.DataFrame()
-    df["x"] = np.random.normal(10, 1, data_length)
+    df["x"] = rng.normal(10, 1, data_length)
     df["x"] = df.x.astype("int64")
-    df["y"] = np.random.normal(10, 1, data_length)
+    df["y"] = rng.normal(10, 1, data_length)
     pdf = df.to_pandas()
     gdf_results = df.describe(include=["int"])
     pdf_results = pdf.describe(include=["int"])
@@ -3925,12 +3934,12 @@ def test_dataframe_describe_include():

 def test_dataframe_describe_default():
-    np.random.seed(12)
+    rng = np.random.default_rng(seed=12)
     data_length = 10000

     df = cudf.DataFrame()
-    df["x"] = np.random.normal(10, 1, data_length)
-    df["y"] = np.random.normal(10, 1, data_length)
+    df["x"] = rng.normal(10, 1, data_length)
+    df["y"] = rng.normal(10, 1, data_length)
     pdf = df.to_pandas()
     gdf_results = df.describe()
     pdf_results = pdf.describe()
@@ -3939,14 +3948,14 @@ def test_dataframe_describe_default():

 def test_series_describe_include_all():
-    np.random.seed(12)
+    rng = np.random.default_rng(seed=12)
     data_length = 10000

     df = cudf.DataFrame()
-    df["x"] = np.random.normal(10, 1, data_length)
+    df["x"] = rng.normal(10, 1, data_length)
     df["x"] = df.x.astype("int64")
-    df["y"] = np.random.normal(10, 1, data_length)
-    df["animal"] = np.random.choice(["dog", "cat", "bird"], data_length)
+    df["y"] = rng.normal(10, 1, data_length)
+    df["animal"] = rng.choice(["dog", "cat", "bird"], data_length)

     pdf = df.to_pandas()
     gdf_results = df.describe(include="all")
@@ -3962,13 +3971,13 @@ def test_series_describe_include_all():

 def test_dataframe_describe_percentiles():
-    np.random.seed(12)
+    rng = np.random.default_rng(seed=12)
     data_length = 10000
     sample_percentiles = [0.0, 0.1, 0.33, 0.84, 0.4, 0.99]

     df = cudf.DataFrame()
-    df["x"] = np.random.normal(10, 1, data_length)
-    df["y"] = np.random.normal(10, 1, data_length)
+    df["x"] = rng.normal(10, 1, data_length)
+    df["y"] = rng.normal(10, 1, data_length)
     pdf = df.to_pandas()
     gdf_results = df.describe(percentiles=sample_percentiles)
     pdf_results = pdf.describe(percentiles=sample_percentiles)
@@ -4098,10 +4107,11 @@ def test_ndim():
     ],
 )
 def test_dataframe_round(decimals):
+    rng = np.random.default_rng(seed=0)
     gdf = cudf.DataFrame(
         {
             "floats": np.arange(0.5, 10.5, 1),
-            "ints": np.random.normal(-100, 100, 10),
+            "ints": rng.normal(-100, 100, 10),
             "floats_with_na": np.array(
                 [
                     14.123,
@@ -4117,9 +4127,9 @@ def test_dataframe_round(decimals):
                 ]
             ),
             "floats_same": np.repeat([-0.6459412758761901], 10),
-            "bools": np.random.choice([True, None, False], 10),
-            "strings": np.random.choice(["abc", "xyz", None], 10),
-            "struct": np.random.choice([{"abc": 1}, {"xyz": 2}, None], 10),
+            "bools": rng.choice([True, None, False], 10),
+            "strings": rng.choice(["abc", "xyz", None], 10),
+            "struct": rng.choice([{"abc": 1}, {"xyz": 2}, None], 10),
             "list": [[1], [2], None, [4], [3]] * 2,
         }
     )
@@ -5811,10 +5821,11 @@ def test_memory_usage(deep, index, set_index):
 @pytest_xfail
 def test_memory_usage_string():
     rows = int(100)
+    rng = np.random.default_rng(seed=0)
     df = pd.DataFrame(
         {
             "A": np.arange(rows, dtype="int32"),
-            "B": np.random.choice(["apple", "banana", "orange"], rows),
+            "B": rng.choice(["apple", "banana", "orange"], rows),
         }
     )
     gdf = cudf.from_pandas(df)
@@ -5837,10 +5848,11 @@ def test_memory_usage_string():

 def test_memory_usage_cat():
     rows = int(100)
+    rng = np.random.default_rng(seed=0)
     df = pd.DataFrame(
         {
             "A": np.arange(rows, dtype="int32"),
-            "B": np.random.choice(["apple", "banana", "orange"], rows),
+            "B": rng.choice(["apple", "banana", "orange"], rows),
         }
     )
     df["B"] = df.B.astype("category")
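The `test_memory_usage_multi` hunk below keeps its MultiIndex level sizes stable by sampling without replacement: drawing `rows` values out of `rows` candidates is just a random permutation, so every value occurs exactly once. A short sketch of that property (plain NumPy; `rows` is illustrative):

    import numpy as np

    rng = np.random.default_rng(seed=0)
    rows = 8
    col = rng.choice(np.arange(rows, dtype="int64"), rows, replace=False)

    # Every candidate appears exactly once, i.e. the draw is a permutation.
    assert sorted(col.tolist()) == list(range(rows))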
@@ -5870,13 +5882,14 @@ def test_memory_usage_list():
 def test_memory_usage_multi(rows):
     # We need to sample without replacement to guarantee that the size of the
     # levels are always the same.
+    rng = np.random.default_rng(seed=0)
     df = pd.DataFrame(
         {
             "A": np.arange(rows, dtype="int32"),
-            "B": np.random.choice(
+            "B": rng.choice(
                 np.arange(rows, dtype="int64"), rows, replace=False
             ),
-            "C": np.random.choice(
+            "C": rng.choice(
                 np.arange(rows, dtype="float64"), rows, replace=False
             ),
         }
     )
@@ -6698,8 +6711,16 @@ def test_dataframe_init_1d_list(data, columns):
     (cupy.array([11, 123, -2342, 232]), ["z"], [0, 1, 1, 0]),
     (cupy.array([11, 123, -2342, 232]), ["z"], [1, 2, 3, 4]),
     (cupy.array([11, 123, -2342, 232]), ["z"], ["a", "z", "d", "e"]),
-    (np.random.randn(2, 4), ["a", "b", "c", "d"], ["a", "b"]),
-    (np.random.randn(2, 4), ["a", "b", "c", "d"], [1, 0]),
+    (
+        np.random.default_rng(seed=0).standard_normal(size=(2, 4)),
+        ["a", "b", "c", "d"],
+        ["a", "b"],
+    ),
+    (
+        np.random.default_rng(seed=0).standard_normal(size=(2, 4)),
+        ["a", "b", "c", "d"],
+        [1, 0],
+    ),
     (cupy.random.randn(2, 4), ["a", "b", "c", "d"], ["a", "b"]),
     (cupy.random.randn(2, 4), ["a", "b", "c", "d"], [1, 0]),
     ],
 )
@@ -6873,8 +6894,9 @@ def test_dataframe_info_basic():
     memory usage: 859.0+ bytes
     """
     )
+    rng = np.random.default_rng(seed=0)
     df = pd.DataFrame(
-        np.random.randn(10, 10),
+        rng.standard_normal(size=(10, 10)),
         index=["a", "2", "3", "4", "5", "6", "7", "8", "100", "1111"],
     )
     cudf.from_pandas(df).info(buf=buffer, verbose=True)
@@ -9374,8 +9396,8 @@ def test_dataframe_roundtrip_arrow_struct_dtype(gdf):

 def test_dataframe_setitem_cupy_array():
-    np.random.seed(0)
-    pdf = pd.DataFrame(np.random.randn(10, 2))
+    rng = np.random.default_rng(seed=0)
+    pdf = pd.DataFrame(rng.standard_normal(size=(10, 2)))
     gdf = cudf.from_pandas(pdf)

     gpu_array = cupy.array([True, False] * 5)
@@ -10161,7 +10183,7 @@ def df_eval(request):
         }
     )
     int_max = 10
-    rng = cupy.random.default_rng(0)
+    rng = cupy.random.default_rng(seed=0)
     return cudf.DataFrame(
         {
             "a": rng.integers(N, size=int_max),
@@ -10529,11 +10551,12 @@ def test_dataframe_init_length_error(data, index):

 def test_dataframe_binop_with_mixed_date_types():
+    rng = np.random.default_rng(seed=0)
     df = pd.DataFrame(
-        np.random.rand(2, 2),
+        rng.random(size=(2, 2)),
        columns=pd.Index(["2000-01-03", "2000-01-04"], dtype="datetime64[ns]"),
     )
-    ser = pd.Series(np.random.rand(3), index=[0, 1, 2])
+    ser = pd.Series(rng.random(size=3), index=[0, 1, 2])
     gdf = cudf.from_pandas(df)
     gser = cudf.from_pandas(ser)
     expected = df - ser
@@ -10542,9 +10565,10 @@ def test_dataframe_binop_with_mixed_date_types():

 def test_dataframe_binop_with_mixed_string_types():
-    df1 = pd.DataFrame(np.random.rand(3, 3), columns=pd.Index([0, 1, 2]))
+    rng = np.random.default_rng(seed=0)
+    df1 = pd.DataFrame(rng.random(size=(3, 3)), columns=pd.Index([0, 1, 2]))
     df2 = pd.DataFrame(
-        np.random.rand(6, 6),
+        rng.random(size=(6, 6)),
         columns=pd.Index([0, 1, 2, "VhDoHxRaqt", "X0NNHBIPfA", "5FbhPtS0D1"]),
     )
     gdf1 = cudf.from_pandas(df1)
@@ -10557,7 +10581,8 @@ def test_dataframe_binop_with_mixed_string_types():

 def test_dataframe_binop_and_where():
-    df = pd.DataFrame(np.random.rand(2, 2), columns=pd.Index([True, False]))
+    rng = np.random.default_rng(seed=0)
+    df = pd.DataFrame(rng.random(size=(2, 2)), columns=pd.Index([True, False]))
     gdf = cudf.from_pandas(df)

     expected = df > 1
@@ -10572,12 +10597,13 @@ def test_dataframe_binop_and_where():

 def test_dataframe_binop_with_datetime_index():
+    rng = np.random.default_rng(seed=0)
     df = pd.DataFrame(
-        np.random.rand(2, 2),
+        rng.random(size=(2, 2)),
         columns=pd.Index(["2000-01-03", "2000-01-04"], dtype="datetime64[ns]"),
     )
     ser = pd.Series(
-        np.random.rand(2),
+        rng.random(2),
         index=pd.Index(
             [
                 "2000-01-04",
@@ -10615,8 +10641,8 @@ def test_dataframe_dict_like_with_columns(columns, index):

 def test_dataframe_init_columns_named_multiindex():
-    np.random.seed(0)
-    data = np.random.randn(2, 2)
+    rng = np.random.default_rng(seed=0)
+    data = rng.standard_normal(size=(2, 2))
     columns = cudf.MultiIndex.from_tuples(
         [("A", "one"), ("A", "two")], names=["y", "z"]
     )
@@ -10627,8 +10653,8 @@ def test_dataframe_init_columns_named_multiindex():

 def test_dataframe_init_columns_named_index():
-    np.random.seed(0)
-    data = np.random.randn(2, 2)
+    rng = np.random.default_rng(seed=0)
+    data = rng.standard_normal(size=(2, 2))
     columns = pd.Index(["a", "b"], name="custom_name")
     gdf = cudf.DataFrame(data, columns=columns)
     pdf = pd.DataFrame(data, columns=columns)
diff --git a/python/cudf/cudf/tests/test_dataframe_copy.py b/python/cudf/cudf/tests/test_dataframe_copy.py
index 45bd31ef58e..3aedbf8365b 100644
--- a/python/cudf/cudf/tests/test_dataframe_copy.py
+++ b/python/cudf/cudf/tests/test_dataframe_copy.py
@@ -93,11 +93,15 @@ def test_dataframe_deep_copy_and_insert(copy_parameters):
 @pytest.mark.parametrize("ncols", [0, 1, 10])
 @pytest.mark.parametrize("data_type", ALL_TYPES)
 def test_cudf_dataframe_copy(copy_fn, ncols, data_type):
-    pdf = pd.DataFrame()
-    for i in range(ncols):
-        pdf[chr(i + ord("a"))] = pd.Series(
-            np.random.randint(0, 1000, 20)
-        ).astype(data_type)
+    rng = np.random.default_rng(seed=0)
+    pdf = pd.DataFrame(
+        {
+            chr(i + ord("a")): pd.Series(rng.integers(0, 1000, 20)).astype(
+                data_type
+            )
+            for i in range(ncols)
+        }
+    )
     df = DataFrame.from_pandas(pdf)
     copy_df = copy_fn(df)
     assert_eq(df, copy_df)
@@ -116,18 +120,20 @@ def test_cudf_dataframe_copy(copy_fn, ncols, data_type):
 @pytest.mark.parametrize("ncols", [0, 1, 10])
 @pytest.mark.parametrize("data_type", ALL_TYPES)
 def test_cudf_dataframe_copy_then_insert(copy_fn, ncols, data_type):
-    pdf = pd.DataFrame()
-    for i in range(ncols):
-        pdf[chr(i + ord("a"))] = pd.Series(
-            np.random.randint(0, 1000, 20)
-        ).astype(data_type)
+    rng = np.random.default_rng(seed=0)
+    pdf = pd.DataFrame(
+        {
+            chr(i + ord("a")): pd.Series(rng.integers(0, 1000, 20)).astype(
+                data_type
+            )
+            for i in range(ncols)
+        }
+    )
     df = DataFrame.from_pandas(pdf)
     copy_df = copy_fn(df)
     copy_pdf = copy_fn(pdf)
-    copy_df["aa"] = pd.Series(np.random.randint(0, 1000, 20)).astype(data_type)
-    copy_pdf["aa"] = pd.Series(np.random.randint(0, 1000, 20)).astype(
-        data_type
-    )
+    copy_df["aa"] = pd.Series(rng.integers(0, 1000, 20)).astype(data_type)
+    copy_pdf["aa"] = pd.Series(rng.integers(0, 1000, 20)).astype(data_type)
     assert not copy_pdf.to_string().split() == pdf.to_string().split()
     assert not copy_df.to_string().split() == df.to_string().split()
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index 976b12a9ab5..b7403c12bcd 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -216,17 +216,21 @@ def test_setitem_datetime():

 def test_sort_datetime():
-    df = pd.DataFrame()
-    df["date"] = np.array(
-        [
-            np.datetime64("2016-11-20"),
-            np.datetime64("2020-11-20"),
-            np.datetime64("2019-11-20"),
-            np.datetime64("1918-11-20"),
-            np.datetime64("2118-11-20"),
-        ]
+    rng = np.random.default_rng(seed=0)
+    df = pd.DataFrame(
+        {
+            "date": np.array(
+                [
+                    np.datetime64("2016-11-20"),
+                    np.datetime64("2020-11-20"),
+                    np.datetime64("2019-11-20"),
+                    np.datetime64("1918-11-20"),
+                    np.datetime64("2118-11-20"),
+                ]
+            ),
+            "vals": rng.random(5),
+        }
     )
-    df["vals"] = np.random.sample(len(df["date"]))

     gdf = cudf.from_pandas(df)
@@ -432,11 +436,12 @@ def test_datetime_to_arrow(dtype):
 )
 @pytest.mark.parametrize("nulls", ["none", "some"])
 def test_datetime_unique(data, nulls):
+    rng = np.random.default_rng(seed=0)
     psr = data.copy()

     if len(data) > 0:
         if nulls == "some":
-            p = np.random.randint(0, len(data), 2)
+            p = rng.integers(0, len(data), 2)
             psr[p] = None

     gsr = cudf.from_pandas(psr)
@@ -461,10 +466,11 @@ def test_datetime_unique(data, nulls):
 @pytest.mark.parametrize("nulls", ["none", "some"])
 def test_datetime_nunique(data, nulls):
     psr = data.copy()
+    rng = np.random.default_rng(seed=0)

     if len(data) > 0:
         if nulls == "some":
-            p = np.random.randint(0, len(data), 2)
+            p = rng.integers(0, len(data), 2)
             psr[p] = None

     gsr = cudf.from_pandas(psr)
diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py
index ebcc35784ee..20c24bd7564 100644
--- a/python/cudf/cudf/tests/test_dlpack.py
+++ b/python/cudf/cudf/tests/test_dlpack.py
@@ -42,9 +42,10 @@ def data_1d(request):
     nelems = request.param[0]
     dtype = request.param[1]
     nulls = request.param[2]
-    a = np.random.randint(10, size=nelems).astype(dtype)
+    rng = np.random.default_rng(seed=0)
+    a = rng.integers(10, size=nelems).astype(dtype)
     if nulls == "some" and a.size != 0 and np.issubdtype(dtype, np.floating):
-        idx = np.random.choice(a.size, size=int(a.size * 0.2), replace=False)
+        idx = rng.choice(a.size, size=int(a.size * 0.2), replace=False)
         a[idx] = np.nan
     return a

@@ -55,9 +56,10 @@ def data_2d(request):
     nrows = request.param[1]
     dtype = request.param[2]
     nulls = request.param[3]
-    a = np.random.randint(10, size=(nrows, ncols)).astype(dtype)
+    rng = np.random.default_rng(seed=0)
+    a = rng.integers(10, size=(nrows, ncols)).astype(dtype)
     if nulls == "some" and a.size != 0 and np.issubdtype(dtype, np.floating):
-        idx = np.random.choice(a.size, size=int(a.size * 0.2), replace=False)
+        idx = rng.choice(a.size, size=int(a.size * 0.2), replace=False)
         a.ravel()[idx] = np.nan
     return np.ascontiguousarray(a)
diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py
index 5b1ee0ffac6..eeac78dbebc 100644
--- a/python/cudf/cudf/tests/test_dropna.py
+++ b/python/cudf/cudf/tests/test_dropna.py
@@ -22,13 +22,13 @@
 @pytest.mark.parametrize("inplace", [True, False])
 def test_dropna_series(data, nulls, inplace):
     psr = pd.Series(data)
-
+    rng = np.random.default_rng(seed=0)
     if len(data) > 0:
         if nulls == "one":
-            p = np.random.randint(0, 4)
+            p = rng.integers(0, 4)
             psr[p] = None
         elif nulls == "some":
-            p1, p2 = np.random.randint(0, 4, (2,))
+            p1, p2 = rng.integers(0, 4, (2,))
             psr[p1] = None
             psr[p2] = None
         elif nulls == "all":
diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py
index 0b4ed52ba96..67dd7a8388b 100644
--- a/python/cudf/cudf/tests/test_duplicates.py
+++ b/python/cudf/cudf/tests/test_duplicates.py
@@ -368,9 +368,13 @@ def test_dataframe_drop_duplicates_method():

 def test_datetime_drop_duplicates():
-    date_df = cudf.DataFrame()
-    date_df["date"] = pd.date_range("11/20/2018", periods=6, freq="D")
-    date_df["value"] = np.random.sample(len(date_df))
+    rng = np.random.default_rng(seed=0)
+    date_df = cudf.DataFrame(
+        {
+            "date": pd.date_range("11/20/2018", periods=6, freq="D"),
+            "value": rng.random(6),
+        }
+    )

     df = concat([date_df, date_df[:4]])
     assert_eq(df[:-4], df.drop_duplicates())
@@ -585,7 +589,8 @@ def test_drop_duplicates_multi_index():
     ]

     idx = pd.MultiIndex.from_tuples(list(zip(*arrays)), names=["a", "b"])
-    pdf = pd.DataFrame(np.random.randint(0, 2, (8, 4)), index=idx)
+    rng = np.random.default_rng(seed=0)
+    pdf = pd.DataFrame(rng.integers(0, 2, (8, 4)), index=idx)
     gdf = cudf.DataFrame.from_pandas(pdf)

     expected = pdf.drop_duplicates()
diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py
index 47f9180dcb1..cfb4ae2c0f8 100644
--- a/python/cudf/cudf/tests/test_factorize.py
+++ b/python/cudf/cudf/tests/test_factorize.py
@@ -13,13 +13,16 @@
 @pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)])
 def test_factorize_series_obj(ncats, nelem):
     df = DataFrame()
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)

     # initialize data frame
-    df["cats"] = arr = np.random.randint(2, size=10, dtype=np.int32)
+    df["cats"] = arr = rng.integers(2, size=10, dtype=np.int32)

     uvals, labels = df["cats"].factorize()
-    np.testing.assert_array_equal(labels.to_numpy(), sorted(set(arr)))
+    unique_values, indices = np.unique(arr, return_index=True)
+    expected_values = unique_values[np.argsort(indices)]
+
+    np.testing.assert_array_equal(labels.to_numpy(), expected_values)
     assert isinstance(uvals, cp.ndarray)
     assert isinstance(labels, Index)

@@ -31,14 +34,17 @@ def test_factorize_series_obj(ncats, nelem):
 @pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)])
 def test_factorize_index_obj(ncats, nelem):
     df = DataFrame()
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)

     # initialize data frame
-    df["cats"] = arr = np.random.randint(2, size=10, dtype=np.int32)
+    df["cats"] = arr = rng.integers(2, size=10, dtype=np.int32)
     df = df.set_index("cats")

     uvals, labels = df.index.factorize()
-    np.testing.assert_array_equal(labels.values.get(), sorted(set(arr)))
+    unique_values, indices = np.unique(arr, return_index=True)
+    expected_values = unique_values[np.argsort(indices)]
+
+    np.testing.assert_array_equal(labels.values.get(), expected_values)
     assert isinstance(uvals, cp.ndarray)
     assert isinstance(labels, Index)
diff --git a/python/cudf/cudf/tests/test_feather.py b/python/cudf/cudf/tests/test_feather.py
index 7e5523bb8c7..f93bd2c5d32 100644
--- a/python/cudf/cudf/tests/test_feather.py
+++ b/python/cudf/cudf/tests/test_feather.py
@@ -15,13 +15,14 @@
 @pytest.fixture(params=[0, 1, 10, 100])
 def pdf(request):
+    rng = np.random.default_rng(seed=0)
     types = NUMERIC_TYPES + ["bool"]
     nrows = request.param

     # Create a pandas dataframe with random data of mixed types
     test_pdf = pd.DataFrame(
         {
-            f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ)
+            f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ)
             for typ in types
         }
     )
@@ -30,7 +31,7 @@ def pdf(request):
     test_pdf.index.name = "index"

     # Create non-numeric categorical data otherwise may get typecasted
-    data = [ascii_letters[np.random.randint(0, 52)] for i in range(nrows)]
+    data = [ascii_letters[rng.integers(0, 52)] for i in range(nrows)]
     test_pdf["col_category"] = pd.Series(data, dtype="category")

     # Feather can't handle indexes properly
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 14ba9894fd3..6b222841622 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -77,21 +77,21 @@ def make_frame(
     extra_vals=(),
     with_datetime=False,
 ):
-    np.random.seed(seed)
+    rng = np.random.default_rng(seed=seed)

     df = dataframe_class()

-    df["x"] = np.random.randint(0, 5, nelem)
-    df["y"] = np.random.randint(0, 3, nelem)
+    df["x"] = rng.integers(0, 5, nelem)
+    df["y"] = rng.integers(0, 3, nelem)
     for lvl in extra_levels:
-        df[lvl] = np.random.randint(0, 2, nelem)
+        df[lvl] = rng.integers(0, 2, nelem)

-    df["val"] = np.random.random(nelem)
+    df["val"] = rng.random(nelem)
     for val in extra_vals:
-        df[val] = np.random.random(nelem)
+        df[val] = rng.random(nelem)

     if with_datetime:
-        df["datetime"] = np.random.randint(
+        df["datetime"] = rng.integers(
             _now, _tomorrow, nelem, dtype=np.int64
         ).astype("datetime64[ns]")

@@ -266,9 +266,10 @@ def test_groupby_getitem_getattr(as_index):

 def test_groupby_cats():
-    df = DataFrame()
-    df["cats"] = pd.Categorical(list("aabaacaab"))
-    df["vals"] = np.random.random(len(df))
+    rng = np.random.default_rng(seed=0)
+    df = DataFrame(
+        {"cats": pd.Categorical(list("aabaacaab")), "vals": rng.random(9)}
+    )

     cats = df["cats"].values_host
     vals = df["vals"].to_numpy()
@@ -285,13 +286,16 @@ def test_groupby_cats():

 def test_groupby_iterate_groups():
-    np.random.seed(0)
-    df = DataFrame()
+    rng = np.random.default_rng(seed=0)
     nelem = 20
-    df["key1"] = np.random.randint(0, 3, nelem)
-    df["key2"] = np.random.randint(0, 2, nelem)
-    df["val1"] = np.random.random(nelem)
-    df["val2"] = np.random.random(nelem)
+    df = DataFrame(
+        {
+            "key1": rng.integers(0, 3, nelem),
+            "key2": rng.integers(0, 2, nelem),
+            "val1": rng.random(nelem),
+            "val2": rng.random(nelem),
+        }
+    )

     def assert_values_equal(arr):
         np.testing.assert_array_equal(arr[0], arr)
@@ -307,13 +311,16 @@ def assert_values_equal(arr):
     reason="Fails in older versions of pandas",
 )
 def test_groupby_apply():
-    np.random.seed(0)
-    df = DataFrame()
+    rng = np.random.default_rng(seed=0)
     nelem = 20
-    df["key1"] = np.random.randint(0, 3, nelem)
-    df["key2"] = np.random.randint(0, 2, nelem)
-    df["val1"] = np.random.random(nelem)
-    df["val2"] = np.random.random(nelem)
+    df = DataFrame(
+        {
+            "key1": rng.integers(0, 3, nelem),
+            "key2": rng.integers(0, 2, nelem),
+            "val1": rng.random(nelem),
+            "val2": rng.random(nelem),
+        }
+    )

     expect_grpby = df.to_pandas().groupby(
         ["key1", "key2"], as_index=False, group_keys=False
@@ -351,13 +358,16 @@ def f3(df, k, L, m):
     reason="Fails in older versions of pandas",
 )
 def test_groupby_apply_args(func, args):
-    np.random.seed(0)
-    df = DataFrame()
+    rng = np.random.default_rng(seed=0)
     nelem = 20
-    df["key1"] = np.random.randint(0, 3, nelem)
-    df["key2"] = np.random.randint(0, 2, nelem)
-    df["val1"] = np.random.random(nelem)
-    df["val2"] = np.random.random(nelem)
+    df = DataFrame(
+        {
+            "key1": rng.integers(0, 3, nelem),
+            "key2": rng.integers(0, 2, nelem),
+            "val1": rng.random(nelem),
+            "val2": rng.random(nelem),
+        }
+    )

     expect_grpby = df.to_pandas().groupby(
         ["key1", "key2"], as_index=False, group_keys=False
@@ -369,7 +379,6 @@ def test_groupby_apply_args(func, args):

 def test_groupby_apply_grouped():
-    np.random.seed(0)
     df = DataFrame()
     nelem = 20
     df["key1"] = range(nelem)
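The decimal groupby hunk below builds its inputs so they stay exactly representable in the test's decimal dtype: scale a [0, 1) draw into the whole-digit range, round to the fractional-digit budget, and deduplicate. A minimal sketch of that construction (the digit budgets mirror the test; the array length is illustrative):

    import numpy as np

    rng = np.random.default_rng(seed=0)
    decimal_digits = 2           # digits after the decimal point
    whole_digits = 2             # digits before the decimal point
    scale = 10**whole_digits

    # Rounding keeps each value within the fractional-digit budget;
    # np.unique sorts and drops duplicates so the grouped values are distinct.
    x = np.unique((rng.random(10) * scale).round(decimal_digits))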
- x = np.unique((np.random.rand(nelem) * scale).round(decimal_digits)) - y = np.unique((np.random.rand(nelem) * scale).round(decimal_digits)) + x = np.unique((rng.random(nelem) * scale).round(decimal_digits)) + y = np.unique((rng.random(nelem) * scale).round(decimal_digits)) if x.size < y.size: total_elements = x.size @@ -1313,9 +1323,9 @@ def test_empty_groupby(func): def test_groupby_unsupported_columns(): - np.random.seed(12) + rng = np.random.default_rng(seed=12) pd_cat = pd.Categorical( - pd.Series(np.random.choice(["a", "b", 1], 3), dtype="category") + pd.Series(rng.choice(["a", "b", 1], 3), dtype="category") ) pdf = pd.DataFrame( { @@ -1421,10 +1431,11 @@ def test_groupby_apply_basic_agg_single_column(): def test_groupby_multi_agg_single_groupby_series(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } ) gdf = cudf.from_pandas(pdf) @@ -1435,12 +1446,13 @@ def test_groupby_multi_agg_single_groupby_series(): def test_groupby_multi_agg_multi_groupby(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "a": np.random.randint(0, 5, 10), - "b": np.random.randint(0, 5, 10), - "c": np.random.randint(0, 5, 10), - "d": np.random.randint(0, 5, 10), + "a": rng.integers(0, 5, 10), + "b": rng.integers(0, 5, 10), + "c": rng.integers(0, 5, 10), + "d": rng.integers(0, 5, 10), } ) gdf = cudf.from_pandas(pdf) @@ -1450,6 +1462,7 @@ def test_groupby_multi_agg_multi_groupby(): def test_groupby_datetime_multi_agg_multi_groupby(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { "a": pd.date_range( @@ -1457,9 +1470,9 @@ def test_groupby_datetime_multi_agg_multi_groupby(): datetime.datetime.now() + datetime.timedelta(9), freq="D", ), - "b": np.random.randint(0, 5, 10), - "c": np.random.randint(0, 5, 10), - "d": np.random.randint(0, 5, 10), + "b": rng.integers(0, 5, 10), + "c": rng.integers(0, 5, 10), + "d": rng.integers(0, 5, 10), } ) gdf = cudf.from_pandas(pdf) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 3f483219423..24d42d9eb4c 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2645,21 +2645,20 @@ def test_isin_multiindex(data, values, level, err): ) -range_data = [ - range(np.random.randint(0, 100)), - range(9, 12, 2), - range(20, 30), - range(100, 1000, 10), - range(0, 10, -2), - range(0, -10, 2), - range(0, -10, -2), -] - - -@pytest.fixture(params=range_data) +@pytest.fixture( + params=[ + range(np.random.default_rng(seed=0).integers(0, 100)), + range(9, 12, 2), + range(20, 30), + range(100, 1000, 10), + range(0, 10, -2), + range(0, -10, 2), + range(0, -10, -2), + ] +) def rangeindex(request): """Create a cudf RangeIndex of different `nrows`""" - return RangeIndex(request.param) + return cudf.RangeIndex(request.param) @pytest.mark.parametrize( @@ -2830,21 +2829,20 @@ def test_rangeindex_append_return_rangeindex(): assert_eq(result, expected) -index_data = [ - range(np.random.randint(0, 100)), - range(0, 10, -2), - range(0, -10, 2), - range(0, -10, -2), - range(0, 1), - [1, 2, 3, 1, None, None], - [None, None, 3.2, 1, None, None], - [None, "a", "3.2", "z", None, None], - pd.Series(["a", "b", None], dtype="category"), - np.array([1, 2, 3, None], dtype="datetime64[s]"), -] - - -@pytest.fixture(params=index_data) +@pytest.fixture( + params=[ + range(np.random.default_rng(seed=0).integers(0, 100)), + range(0, 10, -2), + 
range(0, -10, 2), + range(0, -10, -2), + range(0, 1), + [1, 2, 3, 1, None, None], + [None, None, 3.2, 1, None, None], + [None, "a", "3.2", "z", None, None], + pd.Series(["a", "b", None], dtype="category"), + np.array([1, 2, 3, None], dtype="datetime64[s]"), + ] +) def index(request): """Create a cudf Index of different dtypes""" return cudf.Index(request.param) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 00ae99466bb..421bc0c298b 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -32,7 +32,8 @@ def pdf_gdf(): @pytest.fixture def pdf_gdf_multi(): - pdf = pd.DataFrame(np.random.rand(7, 5)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(7, 5))) pdfIndex = pd.MultiIndex( [ ["a", "b", "c"], @@ -212,12 +213,17 @@ def test_dataframe_column_name_indexing(): df[1].to_numpy(), np.asarray(range(10), dtype=np.int32) ) + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame() nelem = 10 - pdf["key1"] = np.random.randint(0, 5, nelem) - pdf["key2"] = np.random.randint(0, 3, nelem) - pdf[1] = np.arange(1, 1 + nelem) - pdf[2] = np.random.random(nelem) + pdf = pd.DataFrame( + { + "key1": rng.integers(0, 5, nelem), + "key2": rng.integers(0, 3, nelem), + 1: np.arange(1, 1 + nelem), + 2: rng.random(nelem), + } + ) df = cudf.from_pandas(pdf) assert_eq(df[df.columns], df) @@ -239,16 +245,13 @@ def test_dataframe_column_name_indexing(): def test_dataframe_slicing(): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame() size = 123 - df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype( - np.int32 - ) - df["b"] = hb = np.random.random(size).astype(np.float32) - df["c"] = hc = np.random.randint(low=0, high=100, size=size).astype( - np.int64 - ) - df["d"] = hd = np.random.random(size).astype(np.float64) + df["a"] = ha = rng.integers(low=0, high=100, size=size).astype(np.int32) + df["b"] = hb = rng.random(size).astype(np.float32) + df["c"] = hc = rng.integers(low=0, high=100, size=size).astype(np.int64) + df["d"] = hd = rng.random(size).astype(np.float64) # Row slice first 10 first_10 = df[:10] @@ -287,12 +290,13 @@ def test_dataframe_slicing(): @pytest.mark.parametrize("scalar", [0, 20, 100]) def test_dataframe_loc(scalar, step): size = 123 + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "a": np.random.randint(low=0, high=100, size=size), - "b": np.random.random(size).astype(np.float32), - "c": np.random.random(size).astype(np.float64), - "d": np.random.random(size).astype(np.float64), + "a": rng.integers(low=0, high=100, size=size), + "b": rng.random(size).astype(np.float32), + "c": rng.random(size).astype(np.float64), + "d": rng.random(size).astype(np.float64), } ) pdf.index.name = "index" @@ -392,12 +396,11 @@ def test_dataframe_loc_mask(mask, arg): def test_dataframe_loc_outbound(): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame() size = 10 - df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype( - np.int32 - ) - df["b"] = hb = np.random.random(size).astype(np.float32) + df["a"] = ha = rng.integers(low=0, high=100, size=size).astype(np.int32) + df["b"] = hb = rng.random(size).astype(np.float32) pdf = pd.DataFrame() pdf["a"] = ha @@ -590,8 +593,8 @@ def test_dataframe_series_loc_multiindex(obj): @pytest.mark.parametrize("nelem", [2, 5, 20, 100]) def test_series_iloc(nelem): # create random cudf.Series - np.random.seed(12) - ps = pd.Series(np.random.sample(nelem)) + rng = np.random.default_rng(seed=0) + ps = 
pd.Series(rng.random(nelem)) # gpu cudf.Series gs = cudf.Series(ps) @@ -625,12 +628,11 @@ def test_series_iloc(nelem): @pytest.mark.parametrize("nelem", [2, 5, 20, 100]) def test_dataframe_iloc(nelem): + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame() - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( - np.int32 - ) - gdf["b"] = hb = np.random.random(nelem).astype(np.float32) + gdf["a"] = ha = rng.integers(low=0, high=100, size=nelem).astype(np.int32) + gdf["b"] = hb = rng.random(nelem).astype(np.float32) pdf = pd.DataFrame() pdf["a"] = ha @@ -679,12 +681,11 @@ def test_dataframe_iloc(nelem): def test_dataframe_iloc_tuple(): + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame() nelem = 123 - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( - np.int32 - ) - gdf["b"] = hb = np.random.random(nelem).astype(np.float32) + gdf["a"] = ha = rng.integers(low=0, high=100, size=nelem).astype(np.int32) + gdf["b"] = hb = rng.random(nelem).astype(np.float32) pdf = pd.DataFrame() pdf["a"] = ha @@ -695,12 +696,11 @@ def test_dataframe_iloc_tuple(): def test_dataframe_iloc_index_error(): + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame() nelem = 123 - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( - np.int32 - ) - gdf["b"] = hb = np.random.random(nelem).astype(np.float32) + gdf["a"] = ha = rng.integers(low=0, high=100, size=nelem).astype(np.int32) + gdf["b"] = hb = rng.random(nelem).astype(np.float32) pdf = pd.DataFrame() pdf["a"] = ha @@ -714,14 +714,16 @@ def test_dataframe_iloc_index_error(): @pytest.mark.parametrize("ntake", [0, 1, 10, 123, 122, 200]) def test_dataframe_take(ntake): - np.random.seed(0) - df = cudf.DataFrame() - + rng = np.random.default_rng(seed=0) nelem = 123 - df["ii"] = np.random.randint(0, 20, nelem) - df["ff"] = np.random.random(nelem) + df = cudf.DataFrame( + { + "ii": rng.integers(0, 20, nelem), + "ff": rng.random(nelem), + } + ) - take_indices = np.random.randint(0, len(df), ntake) + take_indices = rng.integers(0, len(df), ntake) actual = df.take(take_indices) expected = df.to_pandas().take(take_indices) @@ -733,7 +735,7 @@ def test_dataframe_take(ntake): @pytest.mark.parametrize("ntake", [1, 2, 8, 9]) def test_dataframe_take_with_multiindex(ntake): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = cudf.DataFrame( index=cudf.MultiIndex( levels=[["lama", "cow", "falcon"], ["speed", "weight", "length"]], @@ -742,10 +744,10 @@ def test_dataframe_take_with_multiindex(ntake): ) nelem = 9 - df["ii"] = np.random.randint(0, 20, nelem) - df["ff"] = np.random.random(nelem) + df["ii"] = rng.integers(0, 20, nelem) + df["ff"] = rng.random(nelem) - take_indices = np.random.randint(0, len(df), ntake) + take_indices = rng.integers(0, len(df), ntake) actual = df.take(take_indices) expected = df.to_pandas().take(take_indices) @@ -755,13 +757,13 @@ def test_dataframe_take_with_multiindex(ntake): @pytest.mark.parametrize("ntake", [0, 1, 10, 123, 122, 200]) def test_series_take(ntake): - np.random.seed(0) + rng = np.random.default_rng(seed=0) nelem = 123 - psr = pd.Series(np.random.randint(0, 20, nelem)) + psr = pd.Series(rng.integers(0, 20, nelem)) gsr = cudf.Series(psr) - take_indices = np.random.randint(0, len(gsr), ntake) + take_indices = rng.integers(0, len(gsr), ntake) actual = gsr.take(take_indices) expected = psr.take(take_indices) @@ -841,14 +843,15 @@ def test_empty_boolean_mask(dtype): ) @pytest.mark.parametrize("nulls", ["one", "some", "all", "none"]) def 
test_series_apply_boolean_mask(data, mask, nulls): + rng = np.random.default_rng(seed=0) psr = pd.Series(data) if len(data) > 0: if nulls == "one": - p = np.random.randint(0, 4) + p = rng.integers(0, 4) psr[p] = None elif nulls == "some": - p1, p2 = np.random.randint(0, 4, (2,)) + p1, p2 = rng.integers(0, 4, (2,)) psr[p1] = None psr[p2] = None elif nulls == "all": @@ -1810,13 +1813,14 @@ def test_boolean_mask_columns_iloc_series(): @pytest.mark.parametrize("index_type", ["single", "slice"]) def test_loc_timestamp_issue_8585(index_type): + rng = np.random.default_rng(seed=0) # https://github.com/rapidsai/cudf/issues/8585 start = pd.Timestamp( datetime.strptime("2021-03-12 00:00", "%Y-%m-%d %H:%M") ) end = pd.Timestamp(datetime.strptime("2021-03-12 11:00", "%Y-%m-%d %H:%M")) timestamps = pd.date_range(start, end, periods=12) - value = np.random.normal(size=12) + value = rng.normal(size=12) df = pd.DataFrame(value, index=timestamps, columns=["value"]) cdf = cudf.from_pandas(df) if index_type == "single": @@ -1851,6 +1855,7 @@ def test_loc_timestamp_issue_8585(index_type): ], ) def test_loc_multiindex_timestamp_issue_8585(index_type): + rng = np.random.default_rng(seed=0) # https://github.com/rapidsai/cudf/issues/8585 start = pd.Timestamp( datetime.strptime("2021-03-12 00:00", "%Y-%m-%d %H:%M") @@ -1861,7 +1866,7 @@ def test_loc_multiindex_timestamp_issue_8585(index_type): index = pd.MultiIndex.from_product( [timestamps, labels], names=["timestamp", "label"] ) - value = np.random.normal(size=12) + value = rng.normal(size=12) df = pd.DataFrame(value, index=index, columns=["value"]) cdf = cudf.from_pandas(df) start = pd.Timestamp( diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index b1ce69e58ef..f6941ce7fae 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -22,7 +22,7 @@ def make_params(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) hows = _JOIN_TYPES @@ -39,14 +39,14 @@ def make_params(): yield (aa, bb, how) # Test large random integer inputs - aa = np.random.randint(0, 50, 100) - bb = np.random.randint(0, 50, 100) + aa = rng.integers(0, 50, 100) + bb = rng.integers(0, 50, 100) for how in hows: yield (aa, bb, how) # Test floating point inputs - aa = np.random.random(50) - bb = np.random.random(50) + aa = rng.random(50) + bb = rng.random(50) for how in hows: yield (aa, bb, how) @@ -162,9 +162,9 @@ def _check_series(expect, got): reason="bug in older version of pandas", ) def test_dataframe_join_suffix(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) - df = cudf.DataFrame(np.random.randint(0, 5, (5, 3)), columns=list("abc")) + df = cudf.DataFrame(rng.integers(0, 5, (5, 3)), columns=list("abc")) left = df.set_index("a") right = df.set_index("c") @@ -281,19 +281,19 @@ def test_dataframe_join_mismatch_cats(how): @pytest.mark.parametrize("on", ["key1", ["key1", "key2"], None]) def test_dataframe_merge_on(on): - np.random.seed(0) + rng = np.random.default_rng(seed=0) # Make cuDF df_left = cudf.DataFrame() nelem = 500 - df_left["key1"] = np.random.randint(0, 40, nelem) - df_left["key2"] = np.random.randint(0, 50, nelem) + df_left["key1"] = rng.integers(0, 40, nelem) + df_left["key2"] = rng.integers(0, 50, nelem) df_left["left_val"] = np.arange(nelem) df_right = cudf.DataFrame() nelem = 500 - df_right["key1"] = np.random.randint(0, 30, nelem) - df_right["key2"] = np.random.randint(0, 50, nelem) + df_right["key1"] = rng.integers(0, 30, nelem) + df_right["key2"] = rng.integers(0, 50, 
nelem) df_right["right_val"] = np.arange(nelem) # Make pandas DF @@ -347,19 +347,19 @@ def test_dataframe_merge_on(on): def test_dataframe_merge_on_unknown_column(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) # Make cuDF df_left = cudf.DataFrame() nelem = 500 - df_left["key1"] = np.random.randint(0, 40, nelem) - df_left["key2"] = np.random.randint(0, 50, nelem) + df_left["key1"] = rng.integers(0, 40, nelem) + df_left["key2"] = rng.integers(0, 50, nelem) df_left["left_val"] = np.arange(nelem) df_right = cudf.DataFrame() nelem = 500 - df_right["key1"] = np.random.randint(0, 30, nelem) - df_right["key2"] = np.random.randint(0, 50, nelem) + df_right["key1"] = rng.integers(0, 30, nelem) + df_right["key2"] = rng.integers(0, 50, nelem) df_right["right_val"] = np.arange(nelem) with pytest.raises(KeyError) as raises: @@ -368,19 +368,19 @@ def test_dataframe_merge_on_unknown_column(): def test_dataframe_merge_no_common_column(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) # Make cuDF df_left = cudf.DataFrame() nelem = 500 - df_left["key1"] = np.random.randint(0, 40, nelem) - df_left["key2"] = np.random.randint(0, 50, nelem) + df_left["key1"] = rng.integers(0, 40, nelem) + df_left["key2"] = rng.integers(0, 50, nelem) df_left["left_val"] = np.arange(nelem) df_right = cudf.DataFrame() nelem = 500 - df_right["key3"] = np.random.randint(0, 30, nelem) - df_right["key4"] = np.random.randint(0, 50, nelem) + df_right["key3"] = rng.integers(0, 30, nelem) + df_right["key4"] = rng.integers(0, 50, nelem) df_right["right_val"] = np.arange(nelem) with pytest.raises(ValueError) as raises: @@ -460,14 +460,14 @@ def test_dataframe_merge_order(): @pytest.mark.parametrize("rows", [1, 5, 100]) @pytest.mark.parametrize("how", ["left", "inner", "outer"]) def test_dataframe_pairs_of_triples(pairs, max, rows, how): - np.random.seed(0) + rng = np.random.default_rng(seed=0) pdf_left = pd.DataFrame() pdf_right = pd.DataFrame() for left_column in pairs[0]: - pdf_left[left_column] = np.random.randint(0, max, rows) + pdf_left[left_column] = rng.integers(0, max, rows) for right_column in pairs[1]: - pdf_right[right_column] = np.random.randint(0, max, rows) + pdf_right[right_column] = rng.integers(0, max, rows) gdf_left = cudf.from_pandas(pdf_left) gdf_right = cudf.from_pandas(pdf_right) if not set(pdf_left.columns).intersection(pdf_right.columns): @@ -504,15 +504,15 @@ def test_dataframe_pairs_of_triples(pairs, max, rows, how): def test_safe_merging_with_left_empty(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) pairs = ("bcd", "b") pdf_left = pd.DataFrame() pdf_right = pd.DataFrame() for left_column in pairs[0]: - pdf_left[left_column] = np.random.randint(0, 10, 0) + pdf_left[left_column] = rng.integers(0, 10, 0) for right_column in pairs[1]: - pdf_right[right_column] = np.random.randint(0, 10, 5) + pdf_right[right_column] = rng.integers(0, 10, 5) gdf_left = cudf.from_pandas(pdf_left) gdf_right = cudf.from_pandas(pdf_right) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index c81c2d1d94b..47976fc4bac 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -32,13 +32,14 @@ def make_numeric_dataframe(nrows, dtype): @pytest.fixture(params=[0, 1, 10, 100]) def pdf(request): + rng = np.random.default_rng(seed=0) types = NUMERIC_TYPES + DATETIME_TYPES + ["bool"] nrows = request.param # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( { - f"col_{typ}": np.random.randint(0, nrows, 
nrows).astype(typ) + f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ) for typ in types } ) diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py index 790e84559a9..a34c89f55d3 100644 --- a/python/cudf/cudf/tests/test_monotonic.py +++ b/python/cudf/cudf/tests/test_monotonic.py @@ -164,7 +164,8 @@ def test_series(testlist): def test_multiindex(): - pdf = pd.DataFrame(np.random.rand(7, 5)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(7, 5))) pdf.index = pd.MultiIndex( [ ["a", "b", "c"], diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index c41be3e4428..ad0e0858c43 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -153,7 +153,8 @@ def test_multiindex_swaplevel(): def test_string_index(): - pdf = pd.DataFrame(np.random.rand(5, 5)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(5, 5))) gdf = cudf.from_pandas(pdf) stringIndex = ["a", "b", "c", "d", "e"] pdf.index = stringIndex @@ -176,7 +177,8 @@ def test_string_index(): def test_multiindex_row_shape(): - pdf = pd.DataFrame(np.random.rand(0, 5)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(0, 5))) gdf = cudf.from_pandas(pdf) pdfIndex = pd.MultiIndex([["a", "b", "c"]], [[0]]) pdfIndex.names = ["alpha"] @@ -193,7 +195,8 @@ def test_multiindex_row_shape(): @pytest.fixture def pdf(): - return pd.DataFrame(np.random.rand(7, 5)) + rng = np.random.default_rng(seed=0) + return pd.DataFrame(rng.random(size=(7, 5))) @pytest.fixture @@ -271,7 +274,8 @@ def test_from_pandas_series(): def test_series_multiindex(pdfIndex): - ps = pd.Series(np.random.rand(7)) + rng = np.random.default_rng(seed=0) + ps = pd.Series(rng.random(7)) gs = cudf.from_pandas(ps) ps.index = pdfIndex gs.index = cudf.from_pandas(pdfIndex) @@ -439,7 +443,8 @@ def test_multiindex_loc_rows_1_1_key(pdf, gdf, pdfIndex): def test_multiindex_column_shape(): - pdf = pd.DataFrame(np.random.rand(5, 0)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(5, 0))) gdf = cudf.from_pandas(pdf) pdfIndex = pd.MultiIndex([["a", "b", "c"]], [[0]]) pdfIndex.names = ["alpha"] @@ -522,9 +527,13 @@ def test_multiindex_from_product(arrays): def test_multiindex_index_and_columns(): - gdf = cudf.DataFrame() - gdf["x"] = np.random.randint(0, 5, 5) - gdf["y"] = np.random.randint(0, 5, 5) + rng = np.random.default_rng(seed=0) + gdf = cudf.DataFrame( + { + "x": rng.integers(0, 5, 5), + "y": rng.integers(0, 5, 5), + } + ) pdf = gdf.to_pandas() mi = cudf.MultiIndex( levels=[[0, 1, 2], [3, 4]], @@ -542,11 +551,12 @@ def test_multiindex_index_and_columns(): def test_multiindex_multiple_groupby(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { "a": [4, 17, 4, 9, 5], "b": [1, 4, 4, 3, 2], - "x": np.random.normal(size=5), + "x": rng.normal(size=5), } ) gdf = cudf.DataFrame.from_pandas(pdf) @@ -566,11 +576,12 @@ def test_multiindex_multiple_groupby(): ], ) def test_multi_column(func): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "x": np.random.randint(0, 5, size=1000), - "y": np.random.randint(0, 10, size=1000), - "z": np.random.normal(size=1000), + "x": rng.integers(0, 5, size=1000), + "y": rng.integers(0, 10, size=1000), + "z": rng.normal(size=1000), } ) gdf = cudf.DataFrame.from_pandas(pdf) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 1dd732c7191..41c1c3ccb20 100644 --- 
a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -681,7 +681,6 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): from pyarrow import orc - np.random.seed(0) supported_stat_types = supported_numpy_dtypes + ["str"] # Writing bool columns to multiple row groups is disabled # until #6763 is fixed @@ -704,6 +703,7 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): has_nulls=True, low=0, high=max_char_length, + seed=0, ) for dtype in supported_stat_types } @@ -845,7 +845,6 @@ def test_orc_reader_gmt_timestamps(datadir): def test_orc_bool_encode_fail(): - np.random.seed(0) buffer = BytesIO() # Generate a boolean column longer than a single row group @@ -927,7 +926,6 @@ def test_empty_string_columns(data): [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype], ) def test_orc_writer_decimal(tmpdir, scale, decimal_type): - np.random.seed(0) fname = tmpdir / "decimal.orc" expected = cudf.DataFrame({"dec_val": gen_rand_series("i", 100)}) @@ -988,7 +986,7 @@ def test_orc_string_stream_offset_issue(): def generate_list_struct_buff(size=100_000): rd = random.Random(1) - np.random.seed(seed=1) + rng = np.random.default_rng(seed=1) buff = BytesIO() @@ -999,12 +997,12 @@ def generate_list_struct_buff(size=100_000): [ [ [ - rd.choice([None, np.random.randint(1, 3)]) - for _ in range(np.random.randint(1, 3)) + rd.choice([None, rng.integers(1, 3)]) + for _ in range(rng.integers(1, 3)) ] - for _ in range(np.random.randint(0, 3)) + for _ in range(rng.integers(0, 3)) ] - for _ in range(np.random.randint(0, 3)) + for _ in range(rng.integers(0, 3)) ], ] ) @@ -1012,8 +1010,8 @@ def generate_list_struct_buff(size=100_000): ] lvl1_list = [ [ - rd.choice([None, np.random.randint(0, 3)]) - for _ in range(np.random.randint(1, 4)) + rd.choice([None, rng.integers(0, 3)]) + for _ in range(rng.integers(1, 4)) ] for _ in range(size) ] @@ -1021,7 +1019,7 @@ def generate_list_struct_buff(size=100_000): rd.choice( [ None, - {"a": np.random.randint(0, 3), "b": np.random.randint(0, 3)}, + {"a": rng.integers(0, 3), "b": rng.integers(0, 3)}, ] ) for _ in range(size) @@ -1030,11 +1028,11 @@ def generate_list_struct_buff(size=100_000): rd.choice( [ None, - {"a": rd.choice([None, np.random.randint(0, 3)])}, + {"a": rd.choice([None, rng.integers(0, 3)])}, { "lvl1_struct": { - "c": rd.choice([None, np.random.randint(0, 3)]), - "d": np.random.randint(0, 3), + "c": rd.choice([None, rng.integers(0, 3)]), + "d": rng.integers(0, 3), }, }, ] @@ -1044,7 +1042,7 @@ def generate_list_struct_buff(size=100_000): list_nests_struct = [ [ {"a": rd.choice(lvl1_struct), "b": rd.choice(lvl1_struct)} - for _ in range(np.random.randint(1, 4)) + for _ in range(rng.integers(1, 4)) ] for _ in range(size) ] @@ -1135,7 +1133,7 @@ def gen_map_buff(size): from pyarrow import orc rd = random.Random(1) - np.random.seed(seed=1) + rng = np.random.default_rng(seed=1) buff = BytesIO() @@ -1146,7 +1144,7 @@ def gen_map_buff(size): None, { rd.choice(al): rd.choice( - [None, np.random.randint(1, 1500)] + [None, rng.integers(1, 1500)] ), }, ] @@ -1167,7 +1165,7 @@ def gen_map_buff(size): None, [ rd.choice( - [None, np.random.randint(1, 1500)] + [None, rng.integers(1, 1500)] ) for _ in range(5) ], @@ -1194,10 +1192,10 @@ def gen_map_buff(size): None, { "a": rd.choice( - [None, np.random.randint(1, 1500)] + [None, rng.integers(1, 1500)] ), "b": rd.choice( - [None, np.random.randint(1, 1500)] + [None, 
rng.integers(1, 1500)] ), }, ] diff --git a/python/cudf/cudf/tests/test_pack.py b/python/cudf/cudf/tests/test_pack.py index ad78621c5fa..b474bbe9bd8 100644 --- a/python/cudf/cudf/tests/test_pack.py +++ b/python/cudf/cudf/tests/test_pack.py @@ -24,11 +24,11 @@ def test_sizeof_packed_dataframe(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 1000 df["keys"] = hkeys = np.arange(nelem, dtype=np.float64) - df["vals"] = hvals = np.random.random(nelem) + df["vals"] = hvals = rng.random(nelem) packed = pack(df) nbytes = hkeys.nbytes + hvals.nbytes @@ -67,46 +67,46 @@ def assert_packed_frame_equality(df): def test_packed_dataframe_equality_numeric(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 10 df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) + df["vals"] = rng.random(nelem) check_packed_equality(df) def test_packed_dataframe_equality_categorical(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = pd.Categorical( ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_equality(df) def test_packed_dataframe_equality_list(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_equality(df) def test_packed_dataframe_equality_struct(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series( list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_equality(df) @@ -135,46 +135,46 @@ def assert_packed_frame_unique_pointers(df): def test_packed_dataframe_unique_pointers_numeric(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 10 df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) + df["vals"] = rng.random(nelem) check_packed_unique_pointers(df) def test_packed_dataframe_unique_pointers_categorical(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = pd.Categorical( ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_unique_pointers(df) def test_packed_dataframe_unique_pointers_list(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_unique_pointers(df) def test_packed_dataframe_unique_pointers_struct(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series( list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_unique_pointers(df) @@ -208,46 +208,46 @@ def assert_packed_frame_picklable(df): def test_pickle_packed_dataframe_numeric(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 10 df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) + df["vals"] = rng.random(nelem) check_packed_pickled_equality(df) def test_pickle_packed_dataframe_categorical(): - np.random.seed(0) + rng = 
np.random.default_rng(seed=0) df = DataFrame() df["keys"] = pd.Categorical( ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_pickled_equality(df) def test_pickle_packed_dataframe_list(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_pickled_equality(df) def test_pickle_packed_dataframe_struct(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series( list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_pickled_equality(df) @@ -273,45 +273,45 @@ def assert_packed_frame_serializable(df): def test_serialize_packed_dataframe_numeric(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 10 df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) + df["vals"] = rng.random(nelem) check_packed_serialized_equality(df) def test_serialize_packed_dataframe_categorical(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = pd.Categorical( ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_serialized_equality(df) def test_serialize_packed_dataframe_list(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_serialized_equality(df) def test_serialize_packed_dataframe_struct(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series( list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_serialized_equality(df) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 7f1b0b1cd46..c9ce24d2a5b 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -53,6 +53,7 @@ def datadir(datadir): @pytest.fixture(params=[1, 5, 10, 100000]) def simple_pdf(request): + rng = np.random.default_rng(seed=0) types = [ "bool", "int8", @@ -72,7 +73,7 @@ def simple_pdf(request): # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( { - f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) + f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ) for typ in types }, # Need to ensure that this index is not a RangeIndex to get the @@ -92,6 +93,7 @@ def simple_gdf(simple_pdf): def build_pdf(num_columns, day_resolution_timestamps): + rng = np.random.default_rng(seed=0) types = [ "bool", "int8", @@ -114,7 +116,7 @@ def build_pdf(num_columns, day_resolution_timestamps): # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( { - f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) + f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ) for typ in types }, # Need to ensure that this index is not a RangeIndex to get the @@ -142,7 +144,7 @@ def build_pdf(num_columns, day_resolution_timestamps): }, ]: data = [ - np.random.randint(0, (0x7FFFFFFFFFFFFFFF / t["nsDivisor"])) + rng.integers(0, 
(0x7FFFFFFFFFFFFFFF / t["nsDivisor"])) for i in range(nrows) ] if day_resolution_timestamps: @@ -152,11 +154,11 @@ def build_pdf(num_columns, day_resolution_timestamps): ) # Create non-numeric categorical data otherwise parquet may typecast it - data = [ascii_letters[np.random.randint(0, 52)] for i in range(nrows)] + data = [ascii_letters[rng.integers(0, 52)] for i in range(nrows)] test_pdf["col_category"] = pd.Series(data, dtype="category") # Create non-numeric str data - data = [ascii_letters[np.random.randint(0, 52)] for i in range(nrows)] + data = [ascii_letters[rng.integers(0, 52)] for i in range(nrows)] test_pdf["col_str"] = pd.Series(data, dtype="str") return test_pdf @@ -453,7 +455,9 @@ def test_parquet_read_filtered(tmpdir, rdg_seed): dg.ColumnParameters( 40, 0.2, - lambda: np.random.default_rng().integers(0, 100, size=40), + lambda: np.random.default_rng(seed=None).integers( + 0, 100, size=40 + ), True, ), ], @@ -1909,6 +1913,7 @@ def test_parquet_writer_dictionary_setting(use_dict, max_dict_size): @pytest.mark.parametrize("filename", ["myfile.parquet", None]) @pytest.mark.parametrize("cols", [["b"], ["c", "b"]]) def test_parquet_partitioned(tmpdir_factory, cols, filename): + rng = np.random.default_rng(seed=0) # Checks that write_to_dataset is wrapping to_parquet # as expected gdf_dir = str(tmpdir_factory.mktemp("gdf_dir")) @@ -1917,8 +1922,8 @@ def test_parquet_partitioned(tmpdir_factory, cols, filename): pdf = pd.DataFrame( { "a": np.arange(0, stop=size, dtype="int64"), - "b": np.random.choice(list("abcd"), size=size), - "c": np.random.choice(np.arange(4), size=size), + "b": rng.choice(list("abcd"), size=size), + "c": rng.choice(np.arange(4), size=size), } ) pdf.to_parquet(pdf_dir, index=False, partition_cols=cols) @@ -1954,6 +1959,7 @@ def test_parquet_partitioned(tmpdir_factory, cols, filename): @pytest.mark.parametrize("kwargs", [{"nrows": 1}, {"skip_rows": 1}]) def test_parquet_partitioned_notimplemented(tmpdir_factory, kwargs): + rng = np.random.default_rng(seed=0) # Checks that write_to_dataset is wrapping to_parquet # as expected pdf_dir = str(tmpdir_factory.mktemp("pdf_dir")) @@ -1961,8 +1967,8 @@ def test_parquet_partitioned_notimplemented(tmpdir_factory, kwargs): pdf = pd.DataFrame( { "a": np.arange(0, stop=size, dtype="int64"), - "b": np.random.choice(list("abcd"), size=size), - "c": np.random.choice(np.arange(4), size=size), + "b": rng.choice(list("abcd"), size=size), + "c": rng.choice(np.arange(4), size=size), } ) pdf.to_parquet(pdf_dir, index=False, partition_cols=["b"]) @@ -2127,6 +2133,7 @@ def test_parquet_writer_chunked_partitioned_context(tmpdir_factory): @pytest.mark.parametrize("cols", [None, ["b"]]) @pytest.mark.parametrize("store_schema", [True, False]) def test_parquet_write_to_dataset(tmpdir_factory, cols, store_schema): + rng = np.random.default_rng(seed=0) dir1 = tmpdir_factory.mktemp("dir1") dir2 = tmpdir_factory.mktemp("dir2") if cols is None: @@ -2139,7 +2146,7 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols, store_schema): gdf = cudf.DataFrame( { "a": np.arange(0, stop=size), - "b": np.random.choice(np.arange(4), size=size), + "b": rng.choice(np.arange(4), size=size), } ) gdf.to_parquet(dir1, partition_cols=cols, store_schema=store_schema) @@ -3214,11 +3221,12 @@ def test_parquet_nested_struct_list(): def test_parquet_writer_zstd(): size = 12345 + rng = np.random.default_rng(seed=0) expected = cudf.DataFrame( { "a": np.arange(0, stop=size, dtype="float64"), - "b": np.random.choice(list("abcd"), size=size), - "c": 
np.random.choice(np.arange(4), size=size), + "b": rng.choice(list("abcd"), size=size), + "c": rng.choice(np.arange(4), size=size), } ) diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index 0f13a9e173a..2f10a5dfd74 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -40,33 +40,33 @@ def assert_frame_picklable(df): def test_pickle_dataframe_numeric(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 10 df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) + df["vals"] = rng.random(nelem) check_serialization(df) def test_pickle_dataframe_categorical(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = pd.Categorical( ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_serialization(df) def test_memory_usage_dataframe(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 1000 df["keys"] = hkeys = np.arange(nelem, dtype=np.float64) - df["vals"] = hvals = np.random.random(nelem) + df["vals"] = hvals = rng.random(nelem) nbytes = hkeys.nbytes + hvals.nbytes sizeof = df.memory_usage().sum() @@ -98,11 +98,11 @@ def test_pickle_buffer(): @pytest.mark.parametrize("named", [True, False]) def test_pickle_series(named): - np.random.seed(0) + rng = np.random.default_rng(seed=0) if named: - ser = Series(np.random.random(10), name="a") + ser = Series(rng.random(10), name="a") else: - ser = Series(np.random.random(10)) + ser = Series(rng.random(10)) pickled = pickle.dumps(ser) out = pickle.loads(pickled) diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py index b12209fd3b9..7685d09203e 100644 --- a/python/cudf/cudf/tests/test_query.py +++ b/python/cudf/cudf/tests/test_query.py @@ -45,10 +45,10 @@ def test_query(data, fn, nulls): # prepare nelem, seed = data expect_fn, query_expr = fn - np.random.seed(seed) + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame() pdf["a"] = np.arange(nelem) - pdf["b"] = np.random.random(nelem) * nelem + pdf["b"] = rng.random(nelem) * nelem if nulls: pdf.loc[::2, "a"] = None gdf = cudf.from_pandas(pdf) @@ -71,10 +71,10 @@ def test_query_ref_env(data, fn): # prepare nelem, seed = data expect_fn, query_expr = fn - np.random.seed(seed) + rng = np.random.default_rng(seed=0) df = DataFrame() df["a"] = aa = np.arange(nelem) - df["b"] = bb = np.random.random(nelem) * nelem + df["b"] = bb = rng.random(nelem) * nelem c = 2.3 d = 1.2 # udt @@ -121,9 +121,9 @@ def test_query_local_dict(): def test_query_splitted_combine(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=10), "y": np.random.normal(size=10)} + {"x": rng.integers(0, 5, size=10), "y": rng.normal(size=10)} ) gdf = DataFrame.from_pandas(df) diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/test_rank.py index 4c1d8ce92ae..1d9c6690f14 100644 --- a/python/cudf/cudf/tests/test_rank.py +++ b/python/cudf/cudf/tests/test_rank.py @@ -125,32 +125,28 @@ def test_rank_error_arguments(pdf): ) -sort_group_args = [ - np.full((3,), np.nan), - 100 * np.random.random(10), - np.full((3,), np.inf), - np.full((3,), -np.inf), -] -sort_dtype_args = [np.int32, np.int64, np.float32, np.float64] - - @pytest.mark.filterwarnings("ignore:invalid value encountered in cast") @pytest.mark.parametrize( "elem,dtype", 
list( product( - combinations_with_replacement(sort_group_args, 4), - sort_dtype_args, + combinations_with_replacement( + [ + np.full((3,), np.nan), + 100 * np.random.default_rng(seed=0).random(10), + np.full((3,), np.inf), + np.full((3,), -np.inf), + ], + 4, + ), + [np.int32, np.int64, np.float32, np.float64], ) ), ) def test_series_rank_combinations(elem, dtype): - np.random.seed(0) aa = np.fromiter(chain.from_iterable(elem), np.float64).astype(dtype) - gdf = DataFrame() - df = pd.DataFrame() - gdf["a"] = aa - df["a"] = aa + gdf = DataFrame({"a": aa}) + df = pd.DataFrame({"a": aa}) ranked_gs = gdf["a"].rank(method="first") ranked_ps = df["a"].rank(method="first") # Check diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index f276f394cd0..e0bc8f32c9b 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -62,8 +62,7 @@ def test_sum_string(): ) @pytest.mark.parametrize("nelem", params_sizes) def test_sum_decimal(dtype, nelem): - np.random.seed(0) - data = [str(x) for x in gen_rand("int64", nelem) / 100] + data = [str(x) for x in gen_rand("int64", nelem, seed=0) / 100] expected = pd.Series([Decimal(x) for x in data]).sum() got = cudf.Series(data).astype(dtype).sum() @@ -73,15 +72,13 @@ def test_sum_decimal(dtype, nelem): @pytest.mark.parametrize("dtype,nelem", params) def test_product(dtype, nelem): - np.random.seed(0) + rng = np.random.default_rng(seed=0) dtype = cudf.dtype(dtype).type if cudf.dtype(dtype).kind in {"u", "i"}: data = np.ones(nelem, dtype=dtype) # Set at most 30 items to [0..2) to keep the value within 2^32 for _ in range(30): - data[np.random.randint(low=0, high=nelem, size=1)] = ( - np.random.uniform() * 2 - ) + data[rng.integers(low=0, high=nelem, size=1)] = rng.uniform() * 2 else: data = gen_rand(dtype, nelem) @@ -104,7 +101,6 @@ def test_product(dtype, nelem): ], ) def test_product_decimal(dtype): - np.random.seed(0) data = [str(x) for x in gen_rand("int8", 3) / 10] expected = pd.Series([Decimal(x) for x in data]).product() @@ -153,7 +149,6 @@ def test_sum_of_squares(dtype, nelem): ], ) def test_sum_of_squares_decimal(dtype): - np.random.seed(0) data = [str(x) for x in gen_rand("int8", 3) / 10] expected = pd.Series([Decimal(x) for x in data]).pow(2).sum() @@ -186,7 +181,6 @@ def test_min(dtype, nelem): ) @pytest.mark.parametrize("nelem", params_sizes) def test_min_decimal(dtype, nelem): - np.random.seed(0) data = [str(x) for x in gen_rand("int64", nelem) / 100] expected = pd.Series([Decimal(x) for x in data]).min() @@ -219,7 +213,6 @@ def test_max(dtype, nelem): ) @pytest.mark.parametrize("nelem", params_sizes) def test_max_decimal(dtype, nelem): - np.random.seed(0) data = [str(x) for x in gen_rand("int64", nelem) / 100] expected = pd.Series([Decimal(x) for x in data]).max() @@ -256,7 +249,8 @@ def test_sum_boolean(): def test_date_minmax(): - np_data = np.random.normal(size=10**3) + rng = np.random.default_rng(seed=0) + np_data = rng.normal(size=10**3) gdf_data = Series(np_data) np_casted = np_data.astype("datetime64[ms]") diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 95e19fae501..bf0c97adb00 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -25,9 +25,10 @@ @pytest.mark.parametrize("dtype", repr_categories) @pytest.mark.parametrize("nrows", [0, 5, 10]) def test_null_series(nrows, dtype): + rng = np.random.default_rng(seed=0) size = 5 - sr = cudf.Series(np.random.randint(1, 9, 
size)).astype(dtype) - sr[np.random.choice([False, True], size=size)] = None + sr = cudf.Series(rng.integers(1, 9, size)).astype(dtype) + sr[rng.choice([False, True], size=size)] = None if dtype != "category" and cudf.dtype(dtype).kind in {"u", "i"}: ps = pd.Series( sr._column.data_array_view(mode="read").copy_to_host(), @@ -60,11 +61,12 @@ def test_null_series(nrows, dtype): @pytest.mark.parametrize("ncols", [1, 2, 3, 4, 5, 10]) def test_null_dataframe(ncols): + rng = np.random.default_rng(seed=0) size = 20 gdf = cudf.DataFrame() for idx, dtype in enumerate(dtype_categories): - sr = cudf.Series(np.random.randint(0, 128, size)).astype(dtype) - sr[np.random.choice([False, True], size=size)] = None + sr = cudf.Series(rng.integers(0, 128, size)).astype(dtype) + sr[rng.choice([False, True], size=size)] = None gdf[dtype] = sr pdf = gdf.to_pandas() pd.options.display.max_columns = int(ncols) @@ -77,7 +79,8 @@ def test_null_dataframe(ncols): @pytest.mark.parametrize("nrows", [None, 0, 1, 2, 9, 10, 11, 19, 20, 21]) def test_full_series(nrows, dtype): size = 20 - ps = pd.Series(np.random.randint(0, 100, size)).astype(dtype) + rng = np.random.default_rng(seed=0) + ps = pd.Series(rng.integers(0, 100, size)).astype(dtype) sr = cudf.from_pandas(ps) pd.options.display.max_rows = nrows assert repr(ps) == repr(sr) @@ -89,8 +92,9 @@ def test_full_series(nrows, dtype): @pytest.mark.parametrize("size", [20, 21]) @pytest.mark.parametrize("dtype", repr_categories) def test_full_dataframe_20(dtype, size, nrows, ncols): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( - {idx: np.random.randint(0, 100, size) for idx in range(size)} + {idx: rng.integers(0, 100, size) for idx in range(size)} ).astype(dtype) gdf = cudf.from_pandas(pdf) @@ -178,11 +182,12 @@ def test_mixed_series(mixed_pdf, mixed_gdf): def test_MI(): + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame( { - "a": np.random.randint(0, 4, 10), - "b": np.random.randint(0, 4, 10), - "c": np.random.randint(0, 4, 10), + "a": rng.integers(0, 4, 10), + "b": rng.integers(0, 4, 10), + "c": rng.integers(0, 4, 10), } ) levels = [["a", "b", "c", "d"], ["w", "x", "y", "z"], ["m", "n"]] @@ -223,9 +228,10 @@ def test_groupby_MI(nrows, ncols): @pytest.mark.parametrize("dtype", utils.NUMERIC_TYPES) @pytest.mark.parametrize("length", [0, 1, 10, 100, 1000]) def test_generic_index(length, dtype): + rng = np.random.default_rng(seed=0) psr = pd.Series( range(length), - index=np.random.randint(0, high=100, size=length).astype(dtype), + index=rng.integers(0, high=100, size=length).astype(dtype), dtype="float64" if length == 0 else None, ) gsr = cudf.Series.from_pandas(psr) diff --git a/python/cudf/cudf/tests/test_resampling.py b/python/cudf/cudf/tests/test_resampling.py index a61477981f8..5ff0098bcf4 100644 --- a/python/cudf/cudf/tests/test_resampling.py +++ b/python/cudf/cudf/tests/test_resampling.py @@ -50,8 +50,9 @@ def test_series_upsample_simple(): @pytest.mark.parametrize("rule", ["2s", "10s"]) def test_series_resample_ffill(rule): - rng = pd.date_range("1/1/2012", periods=10, freq="5s") - ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) + date_idx = pd.date_range("1/1/2012", periods=10, freq="5s") + rng = np.random.default_rng(seed=0) + ts = pd.Series(rng.integers(0, 500, len(date_idx)), index=date_idx) gts = cudf.from_pandas(ts) assert_resample_results_equal( ts.resample(rule).ffill(), gts.resample(rule).ffill() @@ -60,8 +61,9 @@ def test_series_resample_ffill(rule): @pytest.mark.parametrize("rule", ["2s", "10s"]) def 
test_series_resample_bfill(rule): - rng = pd.date_range("1/1/2012", periods=10, freq="5s") - ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) + date_idx = pd.date_range("1/1/2012", periods=10, freq="5s") + rng = np.random.default_rng(seed=0) + ts = pd.Series(rng.integers(0, 500, len(date_idx)), index=date_idx) gts = cudf.from_pandas(ts) assert_resample_results_equal( ts.resample(rule).bfill(), gts.resample(rule).bfill() @@ -70,8 +72,9 @@ def test_series_resample_bfill(rule): @pytest.mark.parametrize("rule", ["2s", "10s"]) def test_series_resample_asfreq(rule): - rng = pd.date_range("1/1/2012", periods=100, freq="5s") - ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) + date_range = pd.date_range("1/1/2012", periods=100, freq="5s") + rng = np.random.default_rng(seed=0) + ts = pd.Series(rng.integers(0, 500, len(date_range)), index=date_range) gts = cudf.from_pandas(ts) assert_resample_results_equal( ts.resample(rule).asfreq(), gts.resample(rule).asfreq() @@ -79,8 +82,9 @@ def test_series_resample_asfreq(rule): def test_dataframe_resample_aggregation_simple(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( - np.random.randn(1000, 3), + rng.standard_normal(size=(1000, 3)), index=pd.date_range("1/1/2012", freq="s", periods=1000), columns=["A", "B", "C"], ) @@ -91,8 +95,9 @@ def test_dataframe_resample_aggregation_simple(): def test_dataframe_resample_multiagg(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( - np.random.randn(1000, 3), + rng.standard_normal(size=(1000, 3)), index=pd.date_range("1/1/2012", freq="s", periods=1000), columns=["A", "B", "C"], ) @@ -104,10 +109,11 @@ def test_dataframe_resample_multiagg(): def test_dataframe_resample_on(): + rng = np.random.default_rng(seed=0) # test resampling on a specified column pdf = pd.DataFrame( { - "x": np.random.randn(1000), + "x": rng.standard_normal(size=(1000)), "y": pd.date_range("1/1/2012", freq="s", periods=1000), } ) @@ -119,15 +125,16 @@ def test_dataframe_resample_on(): def test_dataframe_resample_level(): + rng = np.random.default_rng(seed=0) # test resampling on a specific level of a MultIndex pdf = pd.DataFrame( { - "x": np.random.randn(1000), + "x": rng.standard_normal(size=1000), "y": pd.date_range("1/1/2012", freq="s", periods=1000), } ) pdi = pd.MultiIndex.from_frame(pdf) - pdf = pd.DataFrame({"a": np.random.randn(1000)}, index=pdi) + pdf = pd.DataFrame({"a": rng.standard_normal(size=1000)}, index=pdi) gdf = cudf.from_pandas(pdf) assert_resample_results_equal( pdf.resample("3min", level="y").mean(), @@ -153,11 +160,12 @@ def test_dataframe_resample_level(): reason="Fails in older versions of pandas", ) def test_resampling_frequency_conversion(in_freq, sampling_freq, out_freq): + rng = np.random.default_rng(seed=0) # test that we cast to the appropriate frequency # when resampling: pdf = pd.DataFrame( { - "x": np.random.randn(100), + "x": rng.standard_normal(size=100), "y": pd.date_range("1/1/2012", freq=in_freq, periods=100), } ) diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 3adbe1d2a74..26386abb05d 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -46,13 +46,12 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): pdf = pd.DataFrame() id_vars = [] + rng = np.random.default_rng(seed=0) for i in range(num_id_vars): colname = "id" + str(i) - data = np.random.randint(0, 26, num_rows).astype(dtype) + data = rng.integers(0, 26, num_rows).astype(dtype) if nulls == 
"some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan elif nulls == "all": data[:] = np.nan @@ -62,11 +61,9 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): value_vars = [] for i in range(num_value_vars): colname = "val" + str(i) - data = np.random.randint(0, 26, num_rows).astype(dtype) + data = rng.integers(0, 26, num_rows).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan elif nulls == "all": data[:] = np.nan @@ -139,13 +136,12 @@ def test_df_stack(nulls, num_cols, num_rows, dtype): pytest.skip(reason="nulls not supported in dtype: " + dtype) pdf = pd.DataFrame() + rng = np.random.default_rng(seed=0) for i in range(num_cols): colname = str(i) - data = np.random.randint(0, 26, num_rows).astype(dtype) + data = rng.integers(0, 26, num_rows).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan pdf[colname] = data @@ -280,8 +276,8 @@ def test_df_stack_multiindex_column_axis_pd_example(level): ], names=["exp", "animal", "hair_length"], ) - - df = pd.DataFrame(np.random.randn(4, 4), columns=columns) + rng = np.random.default_rng(seed=0) + df = pd.DataFrame(rng.standard_normal(size=(4, 4)), columns=columns) with expect_warning_if(PANDAS_GE_220, FutureWarning): expect = df.stack(level=level, future_stack=False) @@ -308,14 +304,13 @@ def test_interleave_columns(nulls, num_cols, num_rows, dtype): pytest.skip(reason="nulls not supported in dtype: " + dtype) pdf = pd.DataFrame(dtype=dtype) + rng = np.random.default_rng(seed=0) for i in range(num_cols): colname = str(i) - data = pd.Series(np.random.randint(0, 26, num_rows)).astype(dtype) + data = pd.Series(rng.integers(0, 26, num_rows)).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan pdf[colname] = data @@ -344,16 +339,13 @@ def test_tile(nulls, num_cols, num_rows, dtype, count): pytest.skip(reason="nulls not supported in dtype: " + dtype) pdf = pd.DataFrame(dtype=dtype) + rng = np.random.default_rng(seed=0) for i in range(num_cols): colname = str(i) - data = pd.Series(np.random.randint(num_cols, 26, num_rows)).astype( - dtype - ) + data = pd.Series(rng.integers(num_cols, 26, num_rows)).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan pdf[colname] = data @@ -724,23 +716,20 @@ def test_pivot_duplicate_error(): @pytest.mark.parametrize( - "data", - [ + "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] +) +@pytest.mark.parametrize("fill_value", [0]) +def test_pivot_table_simple(aggfunc, fill_value): + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame( { "A": ["one", "one", "two", "three"] * 6, "B": ["A", "B", "C"] * 8, "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, - "D": np.random.randn(24), - "E": np.random.randn(24), + "D": rng.standard_normal(size=24), + "E": rng.standard_normal(size=24), } - ], -) -@pytest.mark.parametrize( - "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] -) 
-@pytest.mark.parametrize("fill_value", [0]) -def test_pivot_table_simple(data, aggfunc, fill_value): - pdf = pd.DataFrame(data) + ) expected = pd.pivot_table( pdf, values=["D", "E"], @@ -749,7 +738,7 @@ def test_pivot_table_simple(data, aggfunc, fill_value): aggfunc=aggfunc, fill_value=fill_value, ) - cdf = cudf.DataFrame(data) + cdf = cudf.DataFrame.from_pandas(pdf) actual = cudf.pivot_table( cdf, values=["D", "E"], @@ -762,23 +751,20 @@ def test_pivot_table_simple(data, aggfunc, fill_value): @pytest.mark.parametrize( - "data", - [ + "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] +) +@pytest.mark.parametrize("fill_value", [0]) +def test_dataframe_pivot_table_simple(aggfunc, fill_value): + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame( { "A": ["one", "one", "two", "three"] * 6, "B": ["A", "B", "C"] * 8, "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, - "D": np.random.randn(24), - "E": np.random.randn(24), + "D": rng.standard_normal(size=24), + "E": rng.standard_normal(size=24), } - ], -) -@pytest.mark.parametrize( - "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] -) -@pytest.mark.parametrize("fill_value", [0]) -def test_dataframe_pivot_table_simple(data, aggfunc, fill_value): - pdf = pd.DataFrame(data) + ) expected = pdf.pivot_table( values=["D", "E"], index=["A", "B"], @@ -786,7 +772,7 @@ def test_dataframe_pivot_table_simple(data, aggfunc, fill_value): aggfunc=aggfunc, fill_value=fill_value, ) - cdf = cudf.DataFrame(data) + cdf = cudf.DataFrame.from_pandas(pdf) actual = cdf.pivot_table( values=["D", "E"], index=["A", "B"], diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index 0b892a51895..68f2aaf9cab 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -170,11 +170,15 @@ def test_serialize_dataframe(): def test_serialize_dataframe_with_index(): - df = cudf.DataFrame() - df["a"] = np.arange(100) - df["b"] = np.random.random(100) - df["c"] = pd.Categorical( - ["a", "b", "c", "_", "_"] * 20, categories=["a", "b", "c"] + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + { + "a": np.arange(100), + "b": rng.random(100), + "c": pd.Categorical( + ["a", "b", "c", "_", "_"] * 20, categories=["a", "b", "c"] + ), + } ) df = df.sort_values("b") outdf = cudf.DataFrame.deserialize(*df.serialize()) @@ -200,11 +204,12 @@ def test_serialize_generic_index(): def test_serialize_multi_index(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { "a": [4, 17, 4, 9, 5], "b": [1, 4, 4, 3, 2], - "x": np.random.normal(size=5), + "x": rng.normal(size=5), } ) gdf = cudf.DataFrame.from_pandas(pdf) @@ -218,7 +223,8 @@ def test_serialize_multi_index(): def test_serialize_masked_series(): nelem = 50 - data = np.random.random(nelem) + rng = np.random.default_rng(seed=0) + data = rng.random(nelem) mask = utils.random_bitmask(nelem) bitmask = utils.expand_bits_to_bytes(mask)[:nelem] null_count = utils.count_zero(bitmask) @@ -229,10 +235,14 @@ def test_serialize_masked_series(): def test_serialize_groupby_df(): - df = cudf.DataFrame() - df["key_1"] = np.random.randint(0, 20, 100) - df["key_2"] = np.random.randint(0, 20, 100) - df["val"] = np.arange(100, dtype=np.float32) + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + { + "key_1": rng.integers(0, 20, 100), + "key_2": rng.integers(0, 20, 100), + "val": np.arange(100, dtype=np.float32), + } + ) gb = df.groupby(["key_1", "key_2"], sort=True) outgb = gb.deserialize(*gb.serialize()) expect = gb.mean() @@ -241,9 
+251,9 @@ def test_serialize_groupby_df(): def test_serialize_groupby_external(): - df = cudf.DataFrame() - df["val"] = np.arange(100, dtype=np.float32) - gb = df.groupby(cudf.Series(np.random.randint(0, 20, 100))) + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame({"val": np.arange(100, dtype=np.float32)}) + gb = df.groupby(cudf.Series(rng.integers(0, 20, 100))) outgb = gb.deserialize(*gb.serialize()) expect = gb.mean() got = outgb.mean() @@ -262,7 +272,8 @@ def test_serialize_groupby_level(): def test_serialize_groupby_sr(): - sr = cudf.Series(np.random.randint(0, 20, 100)) + rng = np.random.default_rng(seed=0) + sr = cudf.Series(rng.integers(0, 20, 100)) gb = sr.groupby(sr // 2) outgb = gb.deserialize(*gb.serialize()) got = gb.mean() @@ -271,9 +282,10 @@ def test_serialize_groupby_sr(): def test_serialize_datetime(): + rng = np.random.default_rng(seed=0) # Make frame with datetime column df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} + {"x": rng.integers(0, 5, size=20), "y": rng.normal(size=20)} ) ts = np.arange(0, len(df), dtype=np.dtype("datetime64[ms]")) df["timestamp"] = ts @@ -285,9 +297,10 @@ def test_serialize_datetime(): def test_serialize_string(): + rng = np.random.default_rng(seed=0) # Make frame with string column df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=5), "y": np.random.normal(size=5)} + {"x": rng.integers(0, 5, size=5), "y": rng.normal(size=5)} ) str_data = ["a", "bc", "def", "ghij", "klmno"] df["timestamp"] = str_data diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index a24002dc38e..7f0a4902ed1 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -519,13 +519,13 @@ def test_series_factorize_sort(data, sort): @pytest.mark.parametrize("nulls", ["none", "some"]) def test_series_datetime_value_counts(data, nulls, normalize, dropna): psr = data.copy() - + rng = np.random.default_rng(seed=0) if len(data) > 0: if nulls == "one": - p = np.random.randint(0, len(data)) + p = rng.integers(0, len(data)) psr[p] = None elif nulls == "some": - p = np.random.randint(0, len(data), 2) + p = rng.integers(0, len(data), 2) psr[p] = None gsr = cudf.from_pandas(psr) @@ -546,10 +546,10 @@ def test_series_datetime_value_counts(data, nulls, normalize, dropna): @pytest.mark.parametrize("num_elements", [10, 100, 1000]) def test_categorical_value_counts(dropna, normalize, num_elements): # create categorical series - np.random.seed(12) + rng = np.random.default_rng(seed=12) pd_cat = pd.Categorical( pd.Series( - np.random.choice(list(ascii_letters + digits), num_elements), + rng.choice(list(ascii_letters + digits), num_elements), dtype="category", ) ) @@ -586,8 +586,9 @@ def test_categorical_value_counts(dropna, normalize, num_elements): @pytest.mark.parametrize("dropna", [True, False]) @pytest.mark.parametrize("normalize", [True, False]) def test_series_value_counts(dropna, normalize): + rng = np.random.default_rng(seed=0) for size in [10**x for x in range(5)]: - arr = np.random.randint(low=-1, high=10, size=size) + arr = rng.integers(low=-1, high=10, size=size) mask = arr != -1 sr = cudf.Series.from_masked_array( arr, cudf.Series(mask)._column.as_mask() @@ -714,8 +715,8 @@ def test_series_mode(gs, dropna): @pytest.mark.parametrize( "arr", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), 
np.repeat([-0.6459412758761901], 100), np.repeat(np.nan, 100), @@ -731,12 +732,12 @@ def test_series_round(arr, decimals): expected = pser.round(decimals) assert_eq(result, expected) - + rng = np.random.default_rng(seed=0) # with nulls, maintaining existing null mask arr = arr.astype("float64") # for pandas nulls - arr.ravel()[ - np.random.choice(arr.shape[0], arr.shape[0] // 2, replace=False) - ] = np.nan + arr.ravel()[rng.choice(arr.shape[0], arr.shape[0] // 2, replace=False)] = ( + np.nan + ) pser = pd.Series(arr) ser = cudf.Series(arr) @@ -1726,7 +1727,7 @@ def test_series_truncate_datetimeindex(): [], [0, 12, 14], [0, 14, 12, 12, 3, 10, 12, 14], - np.random.randint(-100, 100, 200), + np.random.default_rng(seed=0).integers(-100, 100, 200), pd.Series([0.0, 1.0, None, 10.0]), [None, None, None, None], [np.nan, None, -1, 2, 3], @@ -1735,7 +1736,7 @@ def test_series_truncate_datetimeindex(): @pytest.mark.parametrize( "values", [ - np.random.randint(-100, 100, 10), + np.random.default_rng(seed=0).integers(-100, 100, 10), [], [np.nan, None, -1, 2, 3], [1.0, 12.0, None, None, 120], @@ -1746,7 +1747,8 @@ def test_series_truncate_datetimeindex(): ], ) def test_isin_numeric(data, values): - index = np.random.randint(0, 100, len(data)) + rng = np.random.default_rng(seed=0) + index = rng.integers(0, 100, len(data)) psr = pd.Series(data, index=index) gsr = cudf.Series.from_pandas(psr, nan_as_null=False) @@ -1943,8 +1945,9 @@ def test_diff_many_dtypes(data): @pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) @pytest.mark.parametrize("series_bins", [True, False]) def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): - data = np.random.randint(0, 100, num_rows).astype(dtype) - bins = np.unique(np.sort(np.random.randint(2, 95, num_bins).astype(dtype))) + rng = np.random.default_rng(seed=0) + data = rng.integers(0, 100, num_rows).astype(dtype) + bins = np.unique(np.sort(rng.integers(2, 95, num_bins).astype(dtype))) s = cudf.Series(data) if series_bins: s_bins = cudf.Series(bins) @@ -1957,7 +1960,8 @@ def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): def test_series_digitize_invalid_bins(): - s = cudf.Series(np.random.randint(0, 30, 80), dtype="int32") + rng = np.random.default_rng(seed=0) + s = cudf.Series(rng.integers(0, 30, 80), dtype="int32") bins = cudf.Series([2, None, None, 50, 90], dtype="int32") with pytest.raises( @@ -2038,7 +2042,8 @@ def test_default_float_bitwidth_construction(default_float_bitwidth, data): def test_series_ordered_dedup(): # part of https://github.com/rapidsai/cudf/issues/11486 - sr = cudf.Series(np.random.randint(0, 100, 1000)) + rng = np.random.default_rng(seed=0) + sr = cudf.Series(rng.integers(0, 100, 1000)) # pandas unique() preserves order expect = pd.Series(sr.to_pandas().unique()) got = cudf.Series._from_column(sr._column.unique()) diff --git a/python/cudf/cudf/tests/test_seriesmap.py b/python/cudf/cudf/tests/test_seriesmap.py index 3d8b6a79d2a..db1de7d0cf4 100644 --- a/python/cudf/cudf/tests/test_seriesmap.py +++ b/python/cudf/cudf/tests/test_seriesmap.py @@ -47,8 +47,8 @@ def test_series_map_callable_numeric_basic(): @pytest.mark.parametrize("nelem", list(product([2, 10, 100, 1000]))) def test_series_map_callable_numeric_random(nelem): # Generate data - np.random.seed(0) - data = np.random.random(nelem) * 100 + rng = np.random.default_rng(seed=0) + data = rng.random(nelem) * 100 sr = Series(data) pdsr = pd.Series(data) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py 
index 2cf2259d9ec..7e5ce713c7e 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -34,10 +34,10 @@ "nelem,dtype", list(product(sort_nelem_args, sort_dtype_args)) ) def test_dataframe_sort_values(nelem, dtype): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() - df["a"] = aa = (100 * np.random.random(nelem)).astype(dtype) - df["b"] = bb = (100 * np.random.random(nelem)).astype(dtype) + df["a"] = aa = (100 * rng.random(nelem)).astype(dtype) + df["b"] = bb = (100 * rng.random(nelem)).astype(dtype) sorted_df = df.sort_values(by="a") # Check sorted_index = np.argsort(aa, kind="mergesort") @@ -85,9 +85,9 @@ def test_series_sort_values_ignore_index(ignore_index): "nelem,sliceobj", list(product([10, 100], sort_slice_args)) ) def test_dataframe_sort_values_sliced(nelem, sliceobj): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame() - df["a"] = np.random.random(nelem) + df["a"] = rng.random(nelem) expect = df[sliceobj]["a"].sort_values() gdf = DataFrame.from_pandas(df) @@ -100,8 +100,8 @@ def test_dataframe_sort_values_sliced(nelem, sliceobj): list(product(sort_nelem_args, sort_dtype_args, [True, False])), ) def test_series_argsort(nelem, dtype, asc): - np.random.seed(0) - sr = Series((100 * np.random.random(nelem)).astype(dtype)) + rng = np.random.default_rng(seed=0) + sr = Series((100 * rng.random(nelem)).astype(dtype)) res = sr.argsort(ascending=asc) if asc: @@ -116,8 +116,8 @@ def test_series_argsort(nelem, dtype, asc): "nelem,asc", list(product(sort_nelem_args, [True, False])) ) def test_series_sort_index(nelem, asc): - np.random.seed(0) - sr = Series(100 * np.random.random(nelem)) + rng = np.random.default_rng(seed=0) + sr = Series(100 * rng.random(nelem)) psr = sr.to_pandas() expected = psr.sort_index(ascending=asc) @@ -167,9 +167,9 @@ def test_series_nsmallest(data, n): @pytest.mark.parametrize("op", ["nsmallest", "nlargest"]) @pytest.mark.parametrize("columns", ["a", ["b", "a"]]) def test_dataframe_nlargest_nsmallest(nelem, n, op, columns): - np.random.seed(0) - aa = np.random.random(nelem) - bb = np.random.random(nelem) + rng = np.random.default_rng(seed=0) + aa = rng.random(nelem) + bb = rng.random(nelem) df = DataFrame({"a": aa, "b": bb}) pdf = df.to_pandas() @@ -181,10 +181,10 @@ def test_dataframe_nlargest_nsmallest(nelem, n, op, columns): ) def test_dataframe_nlargest_sliced(counts, sliceobj): nelem, n = counts - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame() - df["a"] = np.random.random(nelem) - df["b"] = np.random.random(nelem) + df["a"] = rng.random(nelem) + df["b"] = rng.random(nelem) expect = df[sliceobj].nlargest(n, "a") gdf = DataFrame.from_pandas(df) @@ -197,10 +197,10 @@ def test_dataframe_nlargest_sliced(counts, sliceobj): ) def test_dataframe_nsmallest_sliced(counts, sliceobj): nelem, n = counts - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame() - df["a"] = np.random.random(nelem) - df["b"] = np.random.random(nelem) + df["a"] = rng.random(nelem) + df["b"] = rng.random(nelem) expect = df[sliceobj].nsmallest(n, "a") gdf = DataFrame.from_pandas(df) @@ -216,13 +216,13 @@ def test_dataframe_nsmallest_sliced(counts, sliceobj): def test_dataframe_multi_column( num_cols, num_rows, dtype, ascending, na_position ): - np.random.seed(0) + rng = np.random.default_rng(seed=0) by = list(string.ascii_lowercase[:num_cols]) pdf = pd.DataFrame() for i in range(5): colname = string.ascii_lowercase[i] - data = np.random.randint(0, 26, 
num_rows).astype(dtype) + data = rng.integers(0, 26, num_rows).astype(dtype) pdf[colname] = data gdf = DataFrame.from_pandas(pdf) @@ -244,17 +244,17 @@ def test_dataframe_multi_column( def test_dataframe_multi_column_nulls( num_cols, num_rows, dtype, nulls, ascending, na_position ): - np.random.seed(0) + rng = np.random.default_rng(seed=0) by = list(string.ascii_lowercase[:num_cols]) pdf = pd.DataFrame() for i in range(3): colname = string.ascii_lowercase[i] - data = np.random.randint(0, 26, num_rows).astype(dtype) + data = rng.integers(0, 26, num_rows).astype(dtype) if nulls == "some": idx = np.array([], dtype="int64") if num_rows > 0: - idx = np.random.choice( + idx = rng.choice( num_rows, size=int(num_rows / 4), replace=False ) data[idx] = np.nan @@ -295,8 +295,8 @@ def test_dataframe_multi_column_nulls_multiple_ascending( @pytest.mark.parametrize("nelem", [1, 100]) def test_series_nlargest_nelem(nelem): - np.random.seed(0) - elems = np.random.random(nelem) + rng = np.random.default_rng(seed=0) + elems = rng.random(nelem) gds = Series(elems).nlargest(nelem) pds = pd.Series(elems).nlargest(nelem) @@ -308,11 +308,14 @@ def test_series_nlargest_nelem(nelem): @pytest.mark.parametrize("keep", [True, False]) def test_dataframe_scatter_by_map(map_size, nelem, keep): strlist = ["dog", "cat", "fish", "bird", "pig", "fox", "cow", "goat"] - np.random.seed(0) - df = DataFrame() - df["a"] = np.random.choice(strlist[:map_size], nelem) - df["b"] = np.random.uniform(low=0, high=map_size, size=nelem) - df["c"] = np.random.randint(map_size, size=nelem) + rng = np.random.default_rng(seed=0) + df = DataFrame( + { + "a": rng.choice(strlist[:map_size], nelem), + "b": rng.uniform(low=0, high=map_size, size=nelem), + "c": rng.integers(map_size, size=nelem), + } + ) df["d"] = df["a"].astype("category") def _check_scatter_by_map(dfs, col): @@ -381,10 +384,10 @@ def _check_scatter_by_map(dfs, col): "kind", ["quicksort", "mergesort", "heapsort", "stable"] ) def test_dataframe_sort_values_kind(nelem, dtype, kind): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() - df["a"] = aa = (100 * np.random.random(nelem)).astype(dtype) - df["b"] = bb = (100 * np.random.random(nelem)).astype(dtype) + df["a"] = aa = (100 * rng.random(nelem)).astype(dtype) + df["b"] = bb = (100 * rng.random(nelem)).astype(dtype) with expect_warning_if(kind != "quicksort", UserWarning): sorted_df = df.sort_values(by="a", kind=kind) # Check diff --git a/python/cudf/cudf/tests/test_sparse_df.py b/python/cudf/cudf/tests/test_sparse_df.py index 3248e7f72c0..8b68ae6480b 100644 --- a/python/cudf/cudf/tests/test_sparse_df.py +++ b/python/cudf/cudf/tests/test_sparse_df.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
import numpy as np @@ -6,7 +6,8 @@ def test_to_dense_array(): - data = np.random.random(8) + rng = np.random.default_rng(seed=0) + data = rng.random(8) mask = np.asarray([0b11010110]).astype(np.byte) sr = Series.from_masked_array(data=data, mask=mask, null_count=3) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index f952cea07f8..27de0ed42e5 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -24,8 +24,8 @@ @pytest.mark.parametrize("dtype", params_dtypes) @pytest.mark.parametrize("skipna", [True, False]) def test_series_reductions(method, dtype, skipna): - np.random.seed(0) - arr = np.random.random(100) + rng = np.random.default_rng(seed=0) + arr = rng.random(100) if np.issubdtype(dtype, np.integer): arr *= 100 mask = arr > 10 @@ -56,8 +56,8 @@ def call_test(sr, skipna): def test_series_reductions_concurrency(method): e = ThreadPoolExecutor(10) - np.random.seed(0) - srs = [cudf.Series(np.random.random(10000)) for _ in range(1)] + rng = np.random.default_rng(seed=0) + srs = [cudf.Series(rng.random(10000)) for _ in range(1)] def call_test(sr): fn = getattr(sr, method) @@ -74,8 +74,8 @@ def f(sr): @pytest.mark.parametrize("ddof", range(3)) def test_series_std(ddof): - np.random.seed(0) - arr = np.random.random(100) - 0.5 + rng = np.random.default_rng(seed=0) + arr = rng.random(100) - 0.5 sr = cudf.Series(arr) pd = sr.to_pandas() got = sr.std(ddof=ddof) @@ -84,8 +84,9 @@ def test_series_std(ddof): def test_series_unique(): + rng = np.random.default_rng(seed=0) for size in [10**x for x in range(5)]: - arr = np.random.randint(low=-1, high=10, size=size) + arr = rng.integers(low=-1, high=10, size=size) mask = arr != -1 sr = cudf.Series(arr) sr[~mask] = None @@ -129,7 +130,8 @@ def test_series_nunique(nan_as_null, dropna): def test_series_scale(): - arr = pd.Series(np.random.randint(low=-10, high=10, size=100)) + rng = np.random.default_rng(seed=0) + arr = pd.Series(rng.integers(low=-10, high=10, size=100)) sr = cudf.Series(arr) vmin = arr.min() @@ -229,8 +231,8 @@ def test_misc_quantiles(data, q): @pytest.mark.parametrize( "data", [ - {"data": np.random.normal(-100, 100, 1000)}, - {"data": np.random.randint(-50, 50, 1000)}, + {"data": np.random.default_rng(seed=0).normal(-100, 100, 1000)}, + {"data": np.random.default_rng(seed=0).integers(-50, 50, 1000)}, {"data": (np.zeros(100))}, {"data": np.repeat(np.nan, 100)}, {"data": np.array([1.123, 2.343, np.nan, 0.0])}, @@ -280,8 +282,8 @@ def test_kurt_skew_error(op): @pytest.mark.parametrize( "data", [ - cudf.Series(np.random.normal(-100, 100, 1000)), - cudf.Series(np.random.randint(-50, 50, 1000)), + cudf.Series(np.random.default_rng(seed=0).normal(-100, 100, 1000)), + cudf.Series(np.random.default_rng(seed=0).integers(-50, 50, 1000)), cudf.Series(np.zeros(100)), cudf.Series(np.repeat(np.nan, 100)), cudf.Series(np.array([1.123, 2.343, np.nan, 0.0])), @@ -311,8 +313,8 @@ def test_skew_series(data, null_flag, numeric_only): @pytest.mark.parametrize("dtype", params_dtypes) @pytest.mark.parametrize("num_na", [0, 1, 50, 99, 100]) def test_series_median(dtype, num_na): - np.random.seed(0) - arr = np.random.random(100) + rng = np.random.default_rng(seed=0) + arr = rng.random(100) if np.issubdtype(dtype, np.integer): arr *= 100 mask = np.arange(100) >= num_na @@ -344,8 +346,8 @@ def test_series_median(dtype, num_na): @pytest.mark.parametrize( "data", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 
1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), np.array([1.123, 2.343, np.nan, 0.0]), np.array([-2, 3.75, 6, None, None, None, -8.5, None, 4.2]), @@ -379,8 +381,8 @@ def test_series_pct_change(data, periods, fill_method): @pytest.mark.parametrize( "data1", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), @@ -393,8 +395,8 @@ def test_series_pct_change(data, periods, fill_method): @pytest.mark.parametrize( "data2", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), @@ -423,8 +425,8 @@ def test_cov1d(data1, data2): @pytest.mark.parametrize( "data1", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), @@ -437,8 +439,8 @@ def test_cov1d(data1, data2): @pytest.mark.parametrize( "data2", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 45143211a11..e25f99d7bee 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -36,6 +36,7 @@ idx_list = [None, [10, 11, 12, 13, 14]] idx_id_list = ["None_index", "Set_index"] +rng = np.random.default_rng(seed=0) def raise_builder(flags, exceptions): @@ -132,9 +133,14 @@ def test_string_get_item(ps_gs, item): np.array([False] * 5), cupy.asarray(np.array([True] * 5)), cupy.asarray(np.array([False] * 5)), - np.random.randint(0, 2, 5).astype("bool").tolist(), - np.random.randint(0, 2, 5).astype("bool"), - cupy.asarray(np.random.randint(0, 2, 5).astype("bool")), + np.random.default_rng(seed=0) + .integers(0, 2, 5) + .astype("bool") + .tolist(), + np.random.default_rng(seed=0).integers(0, 2, 5).astype("bool"), + cupy.asarray( + np.random.default_rng(seed=0).integers(0, 2, 5).astype("bool") + ), ], ) def test_string_bool_mask(ps_gs, item): @@ -1078,7 +1084,8 @@ def test_string_set_scalar(scalar): def test_string_index(): - pdf = pd.DataFrame(np.random.rand(5, 5)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(5, 5))) gdf = cudf.DataFrame.from_pandas(pdf) stringIndex = ["a", "b", "c", "d", "e"] pdf.index = stringIndex diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index e91edc9eec6..899d78c999b 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -50,10 +50,14 @@ def test_struct_for_field(key, expect): assert_eq(expect, got) -@pytest.mark.parametrize("input_obj", [[{"a": 1, "b": cudf.NA, "c": 3}]]) -def test_series_construction_with_nulls(input_obj): - expect = pa.array(input_obj, from_pandas=True) - got = cudf.Series(input_obj).to_arrow() +def test_series_construction_with_nulls(): + fields = [ + pa.array([1], type=pa.int64()), + 
pa.array([None], type=pa.int64()), + pa.array([3], type=pa.int64()), + ] + expect = pa.StructArray.from_arrays(fields, ["a", "b", "c"]) + got = cudf.Series(expect).to_arrow() assert expect == got diff --git a/python/cudf/cudf/tests/test_transform.py b/python/cudf/cudf/tests/test_transform.py index 88938457545..1305022d7fa 100644 --- a/python/cudf/cudf/tests/test_transform.py +++ b/python/cudf/cudf/tests/test_transform.py @@ -24,8 +24,8 @@ def _generic_function(a): ) def test_apply_python_lambda(dtype, udf, testfunc): size = 500 - - lhs_arr = np.random.random(size).astype(dtype) + rng = np.random.default_rng(seed=0) + lhs_arr = rng.random(size).astype(dtype) lhs_ser = Series(lhs_arr) out_ser = lhs_ser.apply(udf) diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index 5f5d79c1dce..b714beb0069 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -17,7 +17,8 @@ @pytest.mark.parametrize("dtype", utils.NUMERIC_TYPES) def test_series_abs(dtype): - arr = (np.random.random(1000) * 100).astype(dtype) + rng = np.random.default_rng(seed=0) + arr = (rng.random(1000) * 100).astype(dtype) sr = Series(arr) np.testing.assert_equal(sr.abs().to_numpy(), np.abs(arr)) np.testing.assert_equal(abs(sr).to_numpy(), abs(arr)) @@ -25,22 +26,24 @@ def test_series_abs(dtype): @pytest.mark.parametrize("dtype", utils.INTEGER_TYPES) def test_series_invert(dtype): - arr = (np.random.random(1000) * 100).astype(dtype) + rng = np.random.default_rng(seed=0) + arr = (rng.random(1000) * 100).astype(dtype) sr = Series(arr) np.testing.assert_equal((~sr).to_numpy(), np.invert(arr)) np.testing.assert_equal((~sr).to_numpy(), ~arr) def test_series_neg(): - arr = np.random.random(100) * 100 + rng = np.random.default_rng(seed=0) + arr = rng.random(100) * 100 sr = Series(arr) np.testing.assert_equal((-sr).to_numpy(), -arr) @pytest.mark.parametrize("mth", ["min", "max", "sum", "product"]) def test_series_pandas_methods(mth): - np.random.seed(0) - arr = (1 + np.random.random(5) * 100).astype(np.int64) + rng = np.random.default_rng(seed=0) + arr = (1 + rng.random(5) * 100).astype(np.int64) sr = Series(arr) psr = pd.Series(arr) np.testing.assert_equal(getattr(sr, mth)(), getattr(psr, mth)()) diff --git a/python/cudf/cudf/tests/test_unique.py b/python/cudf/cudf/tests/test_unique.py index 699b3340521..9a1c3b213b8 100644 --- a/python/cudf/cudf/tests/test_unique.py +++ b/python/cudf/cudf/tests/test_unique.py @@ -12,9 +12,9 @@ @pytest.fixture def df(): df = cudf.DataFrame() - np.random.seed(0) + rng = np.random.default_rng(seed=0) - arr = np.random.randint(2, size=10, dtype=np.int64) + arr = rng.integers(2, size=10, dtype=np.int64) df["foo"] = arr df["bar"] = cudf.Series([pd.Timestamp(x) for x in arr]) diff --git a/python/cudf/cudf/utils/hash_vocab_utils.py b/python/cudf/cudf/utils/hash_vocab_utils.py index babe4be2715..896a3809c67 100644 --- a/python/cudf/cudf/utils/hash_vocab_utils.py +++ b/python/cudf/cudf/utils/hash_vocab_utils.py @@ -69,10 +69,10 @@ def _get_space_util(bins, init_bins): return sum(_new_bin_length(len(b)) for b in bins) + 2 * init_bins -def _pick_initial_a_b(data, max_constant, init_bins): +def _pick_initial_a_b(data, max_constant, init_bins, rng): while True: - a = np.random.randint(2**12, 2**15) - b = np.random.randint(2**12, 2**15) + a = rng.integers(2**12, 2**15) + b = rng.integers(2**12, 2**15) bins = _make_bins(data, init_bins, a, b) score = _get_space_util(bins, init_bins) / len(data) @@ -86,18 +86,18 @@ def _pick_initial_a_b(data, 
max_constant, init_bins): return bins, a, b -def _find_hash_for_internal(hash_bin): +def _find_hash_for_internal(hash_bin, rng): if not hash_bin: return [[], 0, 0] new_length = _new_bin_length(len(hash_bin)) while True: - a = np.random.randint( + a = rng.integers( A_LBOUND_SECOND_LEVEL_HASH, A_HBOUND_SECOND_LEVEL_HASH, ) - b = np.random.randint( + b = rng.integers( B_LBOUND_SECOND_LEVEL_HASH, B_HBOUND_SECOND_LEVEL_HASH ) bins = _make_bins(hash_bin, new_length, a, b) @@ -108,11 +108,11 @@ def _find_hash_for_internal(hash_bin): return bins, a, b -def _perfect_hash(integers, max_constant): +def _perfect_hash(integers, max_constant, rng): num_top_level_bins = len(integers) // 4 init_bins, init_a, init_b = _pick_initial_a_b( - integers, max_constant, num_top_level_bins + integers, max_constant, num_top_level_bins, rng ) flattened_bins = [] @@ -127,7 +127,7 @@ def _perfect_hash(integers, max_constant): for i, b in enumerate(init_bins): if i % 500 == 0: print(f"Processing bin {i} / {len(init_bins)} of size = {len(b)}") - internal_table, coeff_a, coeff_b = _find_hash_for_internal(b) + internal_table, coeff_a, coeff_b = _find_hash_for_internal(b, rng) bin_length = len(internal_table) max_bin_length = max(bin_length, max_bin_length) internal_table_coeffs[i] = ( @@ -245,7 +245,7 @@ def hash_vocab( """ Write the vocab vocabulary hashtable to the output_path """ - np.random.seed(1243342) + rng = np.random.default_rng(seed=1243342) vocab = _load_vocab_dict(vocab_path) keys = list(map(_sdbm_hash, vocab.keys())) @@ -264,7 +264,7 @@ def hash_vocab( hash_table, inner_table_coeffs, offsets_into_ht, - ) = _perfect_hash(keys, 10) + ) = _perfect_hash(keys, 10, rng) _pack_keys_and_values(hash_table, hashed_vocab) _store_func( diff --git a/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb b/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb index c7d39b78810..94904fd83d4 100644 --- a/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb +++ b/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb @@ -18,13 +18,13 @@ "import numpy as np\n", "import pandas as pd\n", "\n", - "np.random.seed(0)\n", + "rng = np.random.default_rng(seed=0)\n", "\n", "num_rows = 25_000_000\n", "num_columns = 12\n", "\n", "# Create a DataFrame with random data\n", - "df = pd.DataFrame(np.random.randint(0, 100, size=(num_rows, num_columns)),\n", + "df = pd.DataFrame(rng.integers(0, 100, size=(num_rows, num_columns)),\n", " columns=[f'Column_{i}' for i in range(1, num_columns + 1)])" ] }, diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index a74b7148c00..7aefdc386bb 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1142,8 +1142,8 @@ def test_private_method_result_wrapped(): def test_numpy_var(): - np.random.seed(42) - data = np.random.rand(1000) + rng = np.random.default_rng(seed=42) + data = rng.random(1000) psr = pd.Series(data) sr = xpd.Series(data) diff --git a/python/cudf/cudf_pandas_tests/test_profiler.py b/python/cudf/cudf_pandas_tests/test_profiler.py index 5b7bde06d1d..a5c29bd93a2 100644 --- a/python/cudf/cudf_pandas_tests/test_profiler.py +++ b/python/cudf/cudf_pandas_tests/test_profiler.py @@ -23,12 +23,12 @@ reason="function names change across versions of pandas, so making sure it only runs on latest version of pandas", ) def test_profiler(): - np.random.seed(42) + rng = np.random.default_rng(seed=42) with Profiler() as profiler: df = pd.DataFrame( { - 
"idx": np.random.randint(0, 10, 1000), - "data": np.random.rand(1000), + "idx": rng.integers(0, 10, 1000), + "data": rng.random(1000), } ) sums = df.groupby("idx").sum() @@ -58,7 +58,7 @@ def test_profiler(): calls = [ "pd.DataFrame", "", - "np.random.randint", + "rng.integers", "np.random.rand", 'df.groupby("idx").sum', 'df.sum()["data"]', diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py index 892d0886596..27eaff87ba0 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py @@ -102,7 +102,7 @@ def test_random_forest(binary_classification_data): def test_clustering(): rng = np.random.default_rng(42) nsamps = 300 - X = rng.random((nsamps, 2)) + X = rng.random(size=(nsamps, 2)) data = pd.DataFrame(X, columns=["x", "y"]) kmeans = KMeans(n_clusters=3, random_state=42) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py index 37e3cc34856..0777d982ac2 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py @@ -31,17 +31,17 @@ def dask_client(): def test_1d_distributed(dask_client): - np.random.seed(42) - ts = pd.Series(np.random.rand(100)) + rng = np.random.default_rng(seed=42) + ts = pd.Series(rng.random(100)) m = 10 return stumpy.stumped(dask_client, ts, m) def test_multidimensional_distributed_timeseries(dask_client): - np.random.seed(42) + rng = np.random.default_rng(seed=42) # Each row represents data from a different dimension while each column represents # data from the same dimension - your_time_series = np.random.rand(3, 1000) + your_time_series = rng.random(3, 1000) # Approximately, how many data points might be found in a pattern window_size = 50 diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 620a917109e..896c4169f5b 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -371,12 +371,12 @@ def test_split_row_groups(tmpdir, row_groups, index): row_group_size = 5 file_row_groups = 10 # Known apriori npartitions_expected = math.ceil(file_row_groups / row_groups) * 2 - + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "a": np.random.choice(["apple", "banana", "carrot"], size=df_size), - "b": np.random.random(size=df_size), - "c": np.random.randint(1, 5, size=df_size), + "a": rng.choice(["apple", "banana", "carrot"], size=df_size), + "b": rng.random(size=df_size), + "c": rng.integers(1, 5, size=df_size), "index": np.arange(0, df_size), } ) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 6f04b5737da..3fbb2aacd2c 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -25,7 +25,7 @@ def data_dt_1(): def data_dt_2(): - return np.random.randn(100) + return np.random.default_rng(seed=0).standard_normal(size=100) dt_fields = ["year", "month", "day", "hour", "minute", "second"] diff --git a/python/dask_cudf/dask_cudf/tests/test_binops.py 
b/python/dask_cudf/dask_cudf/tests/test_binops.py index 87bd401accd..8c51f950765 100644 --- a/python/dask_cudf/dask_cudf/tests/test_binops.py +++ b/python/dask_cudf/dask_cudf/tests/test_binops.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. import operator @@ -21,10 +21,11 @@ def _make_empty_frame(npartitions=2): def _make_random_frame_float(nelem, npartitions=2): + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": np.random.randint(0, 5, size=nelem), - "y": np.random.normal(size=nelem) + 1, + "x": rng.integers(0, 5, size=nelem), + "y": rng.normal(size=nelem) + 1, } ) gdf = cudf.from_pandas(df) @@ -51,7 +52,6 @@ def _make_random_frame_float(nelem, npartitions=2): @pytest.mark.parametrize("binop", _binops) def test_series_binops_integer(binop): - np.random.seed(0) size = 1000 lhs_df, lhs_gdf = _make_random_frame(size) rhs_df, rhs_gdf = _make_random_frame(size) @@ -62,7 +62,6 @@ def test_series_binops_integer(binop): @pytest.mark.parametrize("binop", _binops) def test_series_binops_float(binop): - np.random.seed(0) size = 1000 lhs_df, lhs_gdf = _make_random_frame_float(size) rhs_df, rhs_gdf = _make_random_frame_float(size) @@ -73,10 +72,10 @@ def test_series_binops_float(binop): @pytest.mark.parametrize("operator", _binops) def test_df_series_bind_ops(operator): - np.random.seed(0) + rng = np.random.default_rng(seed=0) size = 1000 lhs_df, lhs_gdf = _make_random_frame_float(size) - rhs = np.random.rand() + rhs = rng.random() for col in lhs_gdf.columns: got = getattr(lhs_gdf[col], operator.__name__)(rhs) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 5f0fae86691..8e42c847ddf 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -22,13 +22,15 @@ xfail_dask_expr, ) +rng = np.random.default_rng(seed=0) + def test_from_dict_backend_dispatch(): # Test ddf.from_dict cudf-backend dispatch - np.random.seed(0) + rng = np.random.default_rng(seed=0) data = { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } expect = cudf.DataFrame(data) with dask.config.set({"dataframe.backend": "cudf"}): @@ -62,10 +64,10 @@ def test_from_dask_dataframe_deprecated(): def test_to_backend(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) data = { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } with dask.config.set({"dataframe.backend": "pandas"}): ddf = dd.from_dict(data, npartitions=2) @@ -114,12 +116,12 @@ def test_to_backend_kwargs(): def test_from_pandas(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } ) @@ -169,10 +171,10 @@ def _fragmented_gdf(df, nsplit): def test_query(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=10), "y": np.random.normal(size=10)} + {"x": rng.integers(0, 5, size=10), "y": rng.normal(size=10)} ) gdf = cudf.DataFrame.from_pandas(df) expr = "x > 2" @@ -188,9 +190,9 @@ def test_query(): def test_query_local_dict(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=10), 
"y": np.random.normal(size=10)} + {"x": rng.integers(0, 5, size=10), "y": rng.normal(size=10)} ) gdf = cudf.DataFrame.from_pandas(df) ddf = dask_cudf.from_cudf(gdf, npartitions=2) @@ -204,11 +206,11 @@ def test_query_local_dict(): def test_head(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": np.random.randint(0, 5, size=100), - "y": np.random.normal(size=100), + "x": rng.integers(0, 5, size=100), + "y": rng.normal(size=100), } ) gdf = cudf.DataFrame.from_pandas(df) @@ -220,13 +222,11 @@ def test_head(): @pytest.mark.parametrize("nelem", [10, 200, 1333]) def test_set_index(nelem): with dask.config.set(scheduler="single-threaded"): - np.random.seed(0) + rng = np.random.default_rng(seed=0) # Use unique index range as the sort may not be stable-ordering x = np.arange(nelem) - np.random.shuffle(x) - df = pd.DataFrame( - {"x": x, "y": np.random.randint(0, nelem, size=nelem)} - ) + rng.shuffle(x) + df = pd.DataFrame({"x": x, "y": rng.integers(0, nelem, size=nelem)}) ddf = dd.from_pandas(df, npartitions=2) ddf2 = ddf.to_backend("cudf") @@ -242,7 +242,7 @@ def test_set_index(nelem): def test_set_index_quantile(nelem, nparts, by): df = cudf.DataFrame() df["a"] = np.ascontiguousarray(np.arange(nelem)[::-1]) - df["b"] = np.random.choice(cudf.datasets.names, size=nelem) + df["b"] = rng.choice(cudf.datasets.names, size=nelem) ddf = dd.from_pandas(df, npartitions=nparts) with pytest.warns(FutureWarning, match="deprecated"): @@ -270,11 +270,11 @@ def assert_frame_equal_by_index_group(expect, got): @pytest.mark.parametrize("nelem", [10, 200, 1333]) def test_set_index_2(nelem): with dask.config.set(scheduler="single-threaded"): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": 100 + np.random.randint(0, nelem // 2, size=nelem), - "y": np.random.normal(size=nelem), + "x": 100 + rng.integers(0, nelem // 2, size=nelem), + "y": rng.normal(size=nelem), } ) expect = df.set_index("x").sort_index() @@ -289,11 +289,11 @@ def test_set_index_2(nelem): def test_set_index_w_series(): with dask.config.set(scheduler="single-threaded"): nelem = 20 - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": 100 + np.random.randint(0, nelem // 2, size=nelem), - "y": np.random.normal(size=nelem), + "x": 100 + rng.integers(0, nelem // 2, size=nelem), + "y": rng.normal(size=nelem), } ) expect = df.set_index(df.x).sort_index() @@ -327,12 +327,12 @@ def test_set_index_sorted(): @pytest.mark.parametrize("index", [None, "myindex"]) def test_rearrange_by_divisions(nelem, index): with dask.config.set(scheduler="single-threaded"): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": np.random.randint(0, 20, size=nelem), - "y": np.random.normal(size=nelem), - "z": np.random.choice(["dog", "cat", "bird"], nelem), + "x": rng.integers(0, 20, size=nelem), + "y": rng.normal(size=nelem), + "z": rng.choice(["dog", "cat", "bird"], nelem), } ) df["z"] = df["z"].astype("category") @@ -355,9 +355,9 @@ def test_rearrange_by_divisions(nelem, index): def test_assign(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} + {"x": rng.integers(0, 5, size=20), "y": rng.normal(size=20)} ) dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) @@ -372,10 +372,10 @@ def test_assign(): @pytest.mark.parametrize("data_type", ["int8", "int16", "int32", "int64"]) def test_setitem_scalar_integer(data_type): - 
np.random.seed(0) - scalar = np.random.randint(0, 100, dtype=data_type) + rng = np.random.default_rng(seed=0) + scalar = rng.integers(0, 100, dtype=data_type) df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} + {"x": rng.integers(0, 5, size=20), "y": rng.normal(size=20)} ) dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) @@ -388,10 +388,10 @@ def test_setitem_scalar_integer(data_type): @pytest.mark.parametrize("data_type", ["float32", "float64"]) def test_setitem_scalar_float(data_type): - np.random.seed(0) - scalar = np.random.randn(1).astype(data_type)[0] + rng = np.random.default_rng(seed=0) + scalar = rng.standard_normal(size=1).astype(data_type)[0] df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} + {"x": rng.integers(0, 5, size=20), "y": rng.normal(size=20)} ) dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) @@ -403,10 +403,10 @@ def test_setitem_scalar_float(data_type): def test_setitem_scalar_datetime(): - np.random.seed(0) - scalar = np.int64(np.random.randint(0, 100)).astype("datetime64[ms]") + rng = np.random.default_rng(seed=0) + scalar = np.int64(rng.integers(0, 100)).astype("datetime64[ms]") df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} + {"x": rng.integers(0, 5, size=20), "y": rng.normal(size=20)} ) dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) @@ -422,12 +422,12 @@ def test_setitem_scalar_datetime(): "func", [ lambda: pd.DataFrame( - {"A": np.random.rand(10), "B": np.random.rand(10)}, + {"A": rng.random(10), "B": rng.random(10)}, index=list("abcdefghij"), ), lambda: pd.DataFrame( { - "A": np.random.rand(10), + "A": rng.random(10), "B": list("a" * 10), "C": pd.Series( [str(20090101 + i) for i in range(10)], @@ -438,7 +438,7 @@ def test_setitem_scalar_datetime(): ), lambda: pd.Series(list("abcdefghijklmnop")), lambda: pd.Series( - np.random.rand(10), + rng.random(10), index=pd.Index( [str(20090101 + i) for i in range(10)], dtype="datetime64[ns]" ), @@ -497,10 +497,11 @@ def test_repartition_hash_staged(npartitions): by = ["b"] datarange = 35 size = 100 + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame( { "a": np.arange(size, dtype="int64"), - "b": np.random.randint(datarange, size=size), + "b": rng.integers(datarange, size=size), } ) # WARNING: Specific npartitions-max_branch combination @@ -537,12 +538,13 @@ def test_repartition_hash(by, npartitions, max_branch): npartitions_i = 4 datarange = 26 size = 100 + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame( { "a": np.arange(0, stop=size, dtype="int64"), - "b": np.random.randint(datarange, size=size), - "c": np.random.choice(list("abcdefgh"), size=size), - "d": np.random.choice(np.arange(26), size=size), + "b": rng.integers(datarange, size=size), + "c": rng.choice(list("abcdefgh"), size=size), + "d": rng.choice(np.arange(26), size=size), } ) gdf.d = gdf.d.astype("datetime64[ms]") @@ -768,6 +770,7 @@ def test_dataframe_series_replace(data): def test_dataframe_assign_col(): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame(list(range(100))) pdf = pd.DataFrame(list(range(100))) @@ -780,7 +783,7 @@ def test_dataframe_assign_col(): pddf = dd.from_pandas(pdf, npartitions=4) pddf["fold"] = 0 pddf["fold"] = pddf["fold"].map_partitions( - lambda p_df: pd.Series(np.random.randint(0, 4, len(p_df))) + lambda p_df: pd.Series(rng.integers(0, 4, len(p_df))) ) dd.assert_eq(ddf[0], pddf[0]) @@ -1015,10 +1018,11 @@ def 
test_to_backend_simplify(): @pytest.mark.parametrize("numeric_only", [True, False]) @pytest.mark.parametrize("op", ["corr", "cov"]) def test_cov_corr(op, numeric_only): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame.from_dict( { - "x": np.random.randint(0, 5, size=10), - "y": np.random.normal(size=10), + "x": rng.integers(0, 5, size=10), + "y": rng.normal(size=10), } ) ddf = dd.from_pandas(df, npartitions=2) diff --git a/python/dask_cudf/dask_cudf/tests/test_delayed_io.py b/python/dask_cudf/dask_cudf/tests/test_delayed_io.py index e6fb58ad6df..84ed3b46598 100644 --- a/python/dask_cudf/dask_cudf/tests/test_delayed_io.py +++ b/python/dask_cudf/dask_cudf/tests/test_delayed_io.py @@ -51,9 +51,13 @@ def test_series_from_delayed(): def test_dataframe_to_delayed(): nelem = 100 - df = cudf.DataFrame() - df["x"] = np.arange(nelem) - df["y"] = np.random.randint(nelem, size=nelem) + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + { + "x": np.arange(nelem), + "y": rng.integers(nelem, size=nelem), + } + ) ddf = dask_cudf.from_cudf(df, npartitions=5) @@ -80,8 +84,8 @@ def test_dataframe_to_delayed(): def test_series_to_delayed(): nelem = 100 - - sr = cudf.Series(np.random.randint(nelem, size=nelem)) + rng = np.random.default_rng(seed=0) + sr = cudf.Series(rng.integers(nelem, size=nelem)) dsr = dask_cudf.from_cudf(sr, npartitions=5) @@ -109,11 +113,13 @@ def test_series_to_delayed(): def test_mixing_series_frame_error(): nelem = 20 - - df = cudf.DataFrame() - df["x"] = np.arange(nelem) - df["y"] = np.random.randint(nelem, size=nelem) - + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + { + "x": np.arange(nelem), + "y": rng.integers(nelem, size=nelem), + } + ) ddf = dask_cudf.from_cudf(df, npartitions=5) delay_frame = ddf.to_delayed() @@ -128,10 +134,13 @@ def test_mixing_series_frame_error(): def test_frame_extra_columns_error(): nelem = 20 - - df = cudf.DataFrame() - df["x"] = np.arange(nelem) - df["y"] = np.random.randint(nelem, size=nelem) + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + { + "x": np.arange(nelem), + "y": rng.integers(nelem, size=nelem), + } + ) ddf1 = dask_cudf.from_cudf(df, npartitions=5) df["z"] = np.arange(nelem) diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py index a12481a7bb4..fe57d4a4f00 100644 --- a/python/dask_cudf/dask_cudf/tests/test_dispatch.py +++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py @@ -32,8 +32,9 @@ def test_pyarrow_conversion_dispatch(preserve_index, index): to_pyarrow_table_dispatch, ) + rng = np.random.default_rng(seed=0) df1 = cudf.DataFrame( - np.random.randn(10, 3), columns=list("abc"), index=index + rng.standard_normal(size=(10, 3)), columns=list("abc"), index=index ) df2 = from_pyarrow_table_dispatch( df1, to_pyarrow_table_dispatch(df1, preserve_index=preserve_index) @@ -108,7 +109,8 @@ def test_pyarrow_schema_dispatch(preserve_index): to_pyarrow_table_dispatch, ) - df = cudf.DataFrame(np.random.randn(10, 3), columns=list("abc")) + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame(rng.standard_normal(size=(10, 3)), columns=list("abc")) df["d"] = cudf.Series(["cat", "dog"] * 5) table = to_pyarrow_table_dispatch(df, preserve_index=preserve_index) schema = pyarrow_schema_dispatch(df, preserve_index=preserve_index) diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 7b9f0ca328a..e30474f6b94 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ 
b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -30,21 +30,21 @@ def assert_cudf_groupby_layers(ddf): @pytest.fixture(params=["non_null", "null"]) def pdf(request): - np.random.seed(0) + rng = np.random.default_rng(seed=0) # note that column name "x" is a substring of the groupby key; # this gives us coverage for cudf#10829 pdf = pd.DataFrame( { - "xx": np.random.randint(0, 5, size=10000), - "x": np.random.normal(size=10000), - "y": np.random.normal(size=10000), + "xx": rng.integers(0, 5, size=10000), + "x": rng.normal(size=10000), + "y": rng.normal(size=10000), } ) # insert nulls into dataframe at random if request.param == "null": - pdf = pdf.mask(np.random.choice([True, False], size=pdf.shape)) + pdf = pdf.mask(rng.choice([True, False], size=pdf.shape)) return pdf @@ -173,11 +173,12 @@ def test_groupby_agg_empty_partition(tmpdir, split_out): ], ) def test_groupby_multi_column(func): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "a": np.random.randint(0, 20, size=1000), - "b": np.random.randint(0, 5, size=1000), - "x": np.random.normal(size=1000), + "a": rng.integers(0, 20, size=1000), + "b": rng.integers(0, 5, size=1000), + "x": rng.normal(size=1000), } ) @@ -371,11 +372,12 @@ def test_groupby_string_index_name(myindex): ], ) def test_groupby_split_out_multiindex(agg_func): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame( { - "a": np.random.randint(0, 10, 100), - "b": np.random.randint(0, 5, 100), - "c": np.random.random(100), + "a": rng.integers(0, 10, 100), + "b": rng.integers(0, 5, 100), + "c": rng.random(100), } ) ddf = dask_cudf.from_cudf(df, 5) @@ -419,12 +421,13 @@ def test_groupby_multiindex_reset_index(npartitions): ], ) def test_groupby_reset_index_multiindex(groupby_keys, agg_func): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame( { - "a": np.random.randint(0, 10, 10), - "b": np.random.randint(0, 5, 10), - "c": np.random.randint(0, 5, 10), - "dd": np.random.randint(0, 5, 10), + "a": rng.integers(0, 10, 10), + "b": rng.integers(0, 5, 10), + "c": rng.integers(0, 5, 10), + "dd": rng.integers(0, 5, 10), } ) ddf = dask_cudf.from_cudf(df, 5) @@ -437,8 +440,9 @@ def test_groupby_reset_index_multiindex(groupby_keys, agg_func): def test_groupby_reset_index_drop_True(): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame( - {"a": np.random.randint(0, 10, 10), "b": np.random.randint(0, 5, 10)} + {"a": rng.integers(0, 10, 10), "b": rng.integers(0, 5, 10)} ) ddf = dask_cudf.from_cudf(df, 5) pddf = dd.from_pandas(df.to_pandas(), 5) @@ -653,10 +657,11 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): "aggregations", [(sum, "sum"), (max, "max"), (min, "min")] ) def test_groupby_agg_redirect(aggregations): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } ) @@ -758,10 +763,11 @@ def test_groupby_with_list_of_series(): ], ) def test_groupby_nested_dict(func): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } ) @@ -794,10 +800,11 @@ def test_groupby_nested_dict(func): ], ) def test_groupby_all_columns(func): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": 
rng.normal(size=10000),
         }
     )
 
diff --git a/python/dask_cudf/dask_cudf/tests/test_join.py b/python/dask_cudf/dask_cudf/tests/test_join.py
index 3e078c47cdd..61d0f8d7eb9 100644
--- a/python/dask_cudf/dask_cudf/tests/test_join.py
+++ b/python/dask_cudf/dask_cudf/tests/test_join.py
@@ -22,18 +22,18 @@ def test_join_inner(left_nrows, right_nrows, left_nkeys, right_nkeys):
     chunksize = 50
 
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)
 
     # cuDF
     left = cudf.DataFrame(
         {
-            "x": np.random.randint(0, left_nkeys, size=left_nrows),
+            "x": rng.integers(0, left_nkeys, size=left_nrows),
             "a": np.arange(left_nrows),
         }
     )
     right = cudf.DataFrame(
         {
-            "x": np.random.randint(0, right_nkeys, size=right_nrows),
+            "x": rng.integers(0, right_nkeys, size=right_nrows),
             "a": 1000 * np.arange(right_nrows),
         }
     )
@@ -84,18 +84,18 @@ def gather(df, grows):
 def test_join_left(left_nrows, right_nrows, left_nkeys, right_nkeys, how):
     chunksize = 50
 
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)
 
     # cuDF
     left = cudf.DataFrame(
         {
-            "x": np.random.randint(0, left_nkeys, size=left_nrows),
+            "x": rng.integers(0, left_nkeys, size=left_nrows),
             "a": np.arange(left_nrows, dtype=np.float64),
         }
     )
     right = cudf.DataFrame(
         {
-            "x": np.random.randint(0, right_nkeys, size=right_nrows),
+            "x": rng.integers(0, right_nkeys, size=right_nrows),
             "a": 1000 * np.arange(right_nrows, dtype=np.float64),
         }
     )
@@ -153,20 +153,20 @@ def test_merge_left(
 ):
     chunksize = 3
 
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=42)
 
     # cuDF
     left = cudf.DataFrame(
         {
-            "x": np.random.randint(0, left_nkeys, size=left_nrows),
-            "y": np.random.randint(0, left_nkeys, size=left_nrows),
+            "x": rng.integers(0, left_nkeys, size=left_nrows),
+            "y": rng.integers(0, left_nkeys, size=left_nrows),
             "a": np.arange(left_nrows, dtype=np.float64),
         }
     )
     right = cudf.DataFrame(
         {
-            "x": np.random.randint(0, right_nkeys, size=right_nrows),
-            "y": np.random.randint(0, right_nkeys, size=right_nrows),
+            "x": rng.integers(0, right_nkeys, size=right_nrows),
+            "y": rng.integers(0, right_nkeys, size=right_nrows),
             "a": 1000 * np.arange(right_nrows, dtype=np.float64),
         }
     )
@@ -200,18 +200,18 @@ def test_merge_1col_left(
 ):
     chunksize = 3
 
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)
 
     # cuDF
     left = cudf.DataFrame(
         {
-            "x": np.random.randint(0, left_nkeys, size=left_nrows),
+            "x": rng.integers(0, left_nkeys, size=left_nrows),
             "a": np.arange(left_nrows, dtype=np.float64),
         }
     )
     right = cudf.DataFrame(
         {
-            "x": np.random.randint(0, right_nkeys, size=right_nrows),
+            "x": rng.integers(0, right_nkeys, size=right_nrows),
             "a": 1000 * np.arange(right_nrows, dtype=np.float64),
         }
     )
@@ -238,13 +238,19 @@ def test_merge_1col_left(
 
 def test_merge_should_fail():
     # Expected failure cases described in #2694
-    df1 = cudf.DataFrame()
-    df1["a"] = [1, 2, 3, 4, 5, 6] * 2
-    df1["b"] = np.random.randint(0, 12, 12)
-
-    df2 = cudf.DataFrame()
-    df2["a"] = [7, 2, 3, 8, 5, 9] * 2
-    df2["c"] = np.random.randint(0, 12, 12)
+    rng = np.random.default_rng(seed=0)
+    df1 = cudf.DataFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6] * 2,
+            "b": rng.integers(0, 12, 12),
+        }
+    )
+    df2 = cudf.DataFrame(
+        {
+            "a": [7, 2, 3, 8, 5, 9] * 2,
+            "c": rng.integers(0, 12, 12),
+        }
+    )
 
     left = dask_cudf.from_cudf(df1, 1).groupby("a").b.min().to_frame()
     right = dask_cudf.from_cudf(df2, 1).groupby("a").c.min().to_frame()
@@ -257,7 +263,7 @@ def test_merge_should_fail():
         left.merge(right, how="left", on=["c"])
 
     # Same column names
-    df2["b"] = np.random.randint(0, 12, 12)
+    df2["b"] = rng.integers(0, 12, 12)
+
     right =
dask_cudf.from_cudf(df2, 1).groupby("a").b.min().to_frame() with pytest.raises(KeyError): diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py index d03e92319be..4351b672151 100644 --- a/python/dask_cudf/dask_cudf/tests/test_reductions.py +++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py @@ -13,11 +13,11 @@ def _make_random_frame(nelem, npartitions=2): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": np.random.randint(0, 5, size=nelem), - "y": np.random.normal(size=nelem) + 1, + "x": rng.integers(0, 5, size=nelem), + "y": rng.normal(loc=1.0, scale=1.0, size=nelem), } ) gdf = cudf.DataFrame.from_pandas(df) diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py index 9bbbbc79561..02c815427f3 100644 --- a/python/dask_cudf/dask_cudf/tests/test_sort.py +++ b/python/dask_cudf/dask_cudf/tests/test_sort.py @@ -28,7 +28,7 @@ @pytest.mark.parametrize("nelem", [10, 500]) @pytest.mark.parametrize("nparts", [1, 10]) def test_sort_values(nelem, nparts, by, ascending): - np.random.seed(0) + _ = np.random.default_rng(seed=0) df = cudf.DataFrame() df["a"] = np.ascontiguousarray(np.arange(nelem)[::-1]) df["b"] = np.arange(100, nelem + 100) @@ -82,7 +82,7 @@ def test_sort_repartition(): ], ) def test_sort_values_with_nulls(data, by, ascending, na_position): - np.random.seed(0) + _ = np.random.default_rng(seed=0) cp.random.seed(0) df = cudf.DataFrame(data) ddf = dd.from_pandas(df, npartitions=5) diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py index cc0c6899804..9aaf6dc8420 100644 --- a/python/dask_cudf/dask_cudf/tests/utils.py +++ b/python/dask_cudf/dask_cudf/tests/utils.py @@ -19,8 +19,9 @@ def _make_random_frame(nelem, npartitions=2, include_na=False): + rng = np.random.default_rng(seed=None) df = pd.DataFrame( - {"x": np.random.random(size=nelem), "y": np.random.random(size=nelem)} + {"x": rng.random(size=nelem), "y": rng.random(size=nelem)} ) if include_na: From 9980997b92179c55dc08076fce7cb7f224551ad1 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 17 Oct 2024 10:39:54 -0500 Subject: [PATCH 079/117] Add conda recipe for cudf-polars (#17037) This PR adds conda recipes for `cudf-polars`. This is needed to get `cudf-polars` into RAPIDS Docker containers and the `rapids` metapackage. Closes https://github.com/rapidsai/cudf/issues/16816. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Matthew Murray (https://github.com/Matt711) - James Lamb (https://github.com/jameslamb) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17037 --- ci/build_python.sh | 5 ++ ci/test_python_other.sh | 19 +++++- conda/recipes/cudf-polars/build.sh | 4 ++ conda/recipes/cudf-polars/meta.yaml | 61 +++++++++++++++++++ .../cudf_polars/tests/expressions/test_agg.py | 2 +- 5 files changed, 88 insertions(+), 3 deletions(-) create mode 100644 conda/recipes/cudf-polars/build.sh create mode 100644 conda/recipes/cudf-polars/meta.yaml diff --git a/ci/build_python.sh b/ci/build_python.sh index 2e3f70ba767..823d7f62290 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -52,5 +52,10 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/custreamz +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ + --no-test \ + --channel "${CPP_CHANNEL}" \ + --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ + conda/recipes/cudf-polars rapids-upload-conda-to-s3 python diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index 21a59fa1494..db86721755d 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -14,7 +14,8 @@ rapids-mamba-retry install \ --channel "${PYTHON_CHANNEL}" \ "dask-cudf=${RAPIDS_VERSION}" \ "cudf_kafka=${RAPIDS_VERSION}" \ - "custreamz=${RAPIDS_VERSION}" + "custreamz=${RAPIDS_VERSION}" \ + "cudf-polars=${RAPIDS_VERSION}" rapids-logger "Check GPU usage" nvidia-smi @@ -37,7 +38,7 @@ rapids-logger "pytest dask_cudf (legacy)" DASK_DATAFRAME__QUERY_PLANNING=False ./ci/run_dask_cudf_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ --numprocesses=8 \ - --dist=loadscope \ + --dist=worksteal \ . rapids-logger "pytest cudf_kafka" @@ -54,5 +55,19 @@ rapids-logger "pytest custreamz" --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/custreamz-coverage.xml" \ --cov-report=term +# Note that cudf-polars uses rmm.mr.CudaAsyncMemoryResource() which allocates +# half the available memory. This doesn't play well with multiple workers, so +# we keep --numprocesses=1 for now. This should be resolved by +# https://github.com/rapidsai/cudf/issues/16723. +rapids-logger "pytest cudf-polars" +./ci/run_cudf_polars_pytests.sh \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-polars.xml" \ + --numprocesses=1 \ + --dist=worksteal \ + --cov-config=./pyproject.toml \ + --cov=cudf_polars \ + --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-polars-coverage.xml" \ + --cov-report=term + rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/conda/recipes/cudf-polars/build.sh b/conda/recipes/cudf-polars/build.sh new file mode 100644 index 00000000000..06e2f1bcb99 --- /dev/null +++ b/conda/recipes/cudf-polars/build.sh @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +# This assumes the script is executed from the root of the repo directory +./build.sh cudf_polars diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml new file mode 100644 index 00000000000..e8fef715c60 --- /dev/null +++ b/conda/recipes/cudf-polars/meta.yaml @@ -0,0 +1,61 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} +{% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %}
+{% set py_version = environ['CONDA_PY'] %}
+{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %}
+{% set cuda_major = cuda_version.split('.')[0] %}
+{% set date_string = environ['RAPIDS_DATE_STRING'] %}
+
+package:
+  name: cudf-polars
+  version: {{ version }}
+
+source:
+  path: ../../..
+
+build:
+  number: {{ GIT_DESCRIBE_NUMBER }}
+  string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
+  script_env:
+    - AWS_ACCESS_KEY_ID
+    - AWS_SECRET_ACCESS_KEY
+    - AWS_SESSION_TOKEN
+    - CMAKE_C_COMPILER_LAUNCHER
+    - CMAKE_CUDA_COMPILER_LAUNCHER
+    - CMAKE_CXX_COMPILER_LAUNCHER
+    - CMAKE_GENERATOR
+    - PARALLEL_LEVEL
+    - SCCACHE_BUCKET
+    - SCCACHE_IDLE_TIMEOUT
+    - SCCACHE_REGION
+    - SCCACHE_S3_KEY_PREFIX=cudf-polars-aarch64 # [aarch64]
+    - SCCACHE_S3_KEY_PREFIX=cudf-polars-linux64 # [linux64]
+    - SCCACHE_S3_USE_SSL
+    - SCCACHE_S3_NO_CREDENTIALS
+
+requirements:
+  host:
+    - python
+    - rapids-build-backend >=0.3.0,<0.4.0.dev0
+    - setuptools
+    - cuda-version ={{ cuda_version }}
+  run:
+    - python
+    - pylibcudf ={{ version }}
+    - polars >=1.8,<1.9
+    - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
+
+test:
+  requires:
+    - cuda-version ={{ cuda_version }}
+  imports:
+    - cudf_polars
+
+
+about:
+  home: https://rapids.ai/
+  license: Apache-2.0
+  license_family: APACHE
+  license_file: LICENSE
+  summary: cudf-polars library
diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py
index 3001a61101a..86cb2352dcc 100644
--- a/python/cudf_polars/tests/expressions/test_agg.py
+++ b/python/cudf_polars/tests/expressions/test_agg.py
@@ -96,7 +96,7 @@ def test_bool_agg(agg, request):
     assert_gpu_result_equal(q, check_exact=False)
 
 
-@pytest.mark.parametrize("cum_agg", expr.UnaryFunction._supported_cum_aggs)
+@pytest.mark.parametrize("cum_agg", sorted(expr.UnaryFunction._supported_cum_aggs))
 def test_cum_agg_reverse_unsupported(cum_agg):
     df = pl.LazyFrame({"a": [1, 2, 3]})
     expr = getattr(pl.col("a"), cum_agg)(reverse=True)

From 6eeb7d6bda9bd952cb2553261b3dfc25da4ad6e5 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Thu, 17 Oct 2024 11:11:11 -0500
Subject: [PATCH 080/117] Fix `DataFrame._from_arrays` and introduce
 validations (#17112)

Fixes: #17111

This PR fixes `DataFrame._from_arrays` to properly access the `ndim` attribute and corrects two validations in the `Series` & `DataFrame` constructors.
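For reference, a minimal sketch of the corrected behavior, modeled on the `test_dataframe_init_column` test added in this PR; the `Series._from_column` line assumes the factory named in the new `Series` error message:

```python
import cudf

s = cudf.Series([1, 2, 3])

# Passing an internal ColumnBase to the public constructors now raises a
# TypeError that points at the appropriate factory method.
try:
    cudf.DataFrame(s._column)
except TypeError as err:
    print(err)

# The dedicated factories remain the supported path for internal columns.
df = cudf.DataFrame._from_arrays(s._column, columns=["a"])
sr = cudf.Series._from_column(s._column)
```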
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17112 --- python/cudf/cudf/core/dataframe.py | 10 ++++++++-- python/cudf/cudf/core/series.py | 9 +++++++-- python/cudf/cudf/tests/test_dataframe.py | 9 +++++++++ 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f8d544e04b1..7d4d34f5b04 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -781,9 +781,15 @@ def __init__( ) elif isinstance(data, ColumnAccessor): raise TypeError( - "Use cudf.Series._from_data for constructing a Series from " + "Use cudf.DataFrame._from_data for constructing a DataFrame from " "ColumnAccessor" ) + elif isinstance(data, ColumnBase): + raise TypeError( + "Use cudf.DataFrame._from_arrays for constructing a DataFrame from " + "ColumnBase or Use cudf.DataFrame._from_data by passing a dict " + "of column name and column as key-value pair." + ) elif hasattr(data, "__cuda_array_interface__"): arr_interface = data.__cuda_array_interface__ # descr is an optional field of the _cuda_ary_iface_ @@ -5884,7 +5890,7 @@ def _from_arrays( f"records dimension expected 1 or 2 but found: {array_data.ndim}" ) - if data.ndim == 2: + if array_data.ndim == 2: num_cols = array_data.shape[1] else: # Since we validate ndim to be either 1 or 2 above, diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 41ee94b72c8..29ed18ac0ce 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -637,10 +637,15 @@ def __init__( column = as_column(data, nan_as_null=nan_as_null, dtype=dtype) if isinstance(data, (pd.Series, Series)): index_from_data = ensure_index(data.index) - elif isinstance(data, (ColumnAccessor, ColumnBase)): + elif isinstance(data, ColumnAccessor): raise TypeError( "Use cudf.Series._from_data for constructing a Series from " - "ColumnAccessor or a ColumnBase" + "ColumnAccessor" + ) + elif isinstance(data, ColumnBase): + raise TypeError( + "Use cudf.Series._from_column for constructing a Series from " + "a ColumnBase" ) elif isinstance(data, dict): if not data: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 7c3547c59d6..0f2b41888fa 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -11172,3 +11172,12 @@ def test_from_pandas_preserve_column_dtype(): df = pd.DataFrame([[1, 2]], columns=pd.Index([1, 2], dtype="int8")) result = cudf.DataFrame.from_pandas(df) pd.testing.assert_index_equal(result.columns, df.columns, exact=True) + + +def test_dataframe_init_column(): + s = cudf.Series([1, 2, 3]) + with pytest.raises(TypeError): + cudf.DataFrame(s._column) + expect = cudf.DataFrame({"a": s}) + actual = cudf.DataFrame._from_arrays(s._column, columns=["a"]) + assert_eq(expect, actual) From 14209c1962f1615f82f2c5be1cdbf58a6ed05789 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 17 Oct 2024 09:52:06 -0700 Subject: [PATCH 081/117] Correctly set `is_device_accesible` when creating `host_span`s from other container/span types (#17079) Discovered that the way `host_span`s are created from `hostdevice_vector`, `hostdevice_span`, `hostdevice_2dvector` and `host_2dspan` (yes, these are all real types!) does not propagate the `is_device_accesible` flag. 
In most of the cases these spans use pinned memory, so we're incorrect most of the time. This PR fixed the way these conversions work. Adjusted some APIs to make it a bit harder to avoid passing the `is_device_accesible` flag. Removed a few unused functions in `span.hpp` to keep the file as light as possible (it's included EVERYWHERE). Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Nghia Truong (https://github.com/ttnghia) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/17079 --- cpp/include/cudf/utilities/span.hpp | 131 +++++++----------- cpp/src/io/orc/reader_impl_chunking.cu | 4 +- cpp/src/io/orc/reader_impl_decode.cu | 44 +++--- cpp/src/io/orc/writer_impl.cu | 15 +- cpp/src/io/utilities/hostdevice_span.hpp | 94 ++++++------- cpp/src/io/utilities/hostdevice_vector.hpp | 42 +++--- cpp/src/strings/extract/extract.cu | 5 +- .../utilities_tests/pinned_memory_tests.cpp | 53 ++++++- cpp/tests/utilities_tests/span_tests.cu | 37 ++--- 9 files changed, 205 insertions(+), 220 deletions(-) diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index f3e1a61d075..d558cfb5e85 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -180,18 +180,6 @@ class span_base { return Derived(_data + _size - count, count); } - /** - * @brief Obtains a span that is a view over the `count` elements of this span starting at offset - * - * @param offset The offset of the first element in the subspan - * @param count The number of elements in the subspan - * @return A subspan of the sequence, of requested count and offset - */ - [[nodiscard]] constexpr Derived subspan(size_type offset, size_type count) const noexcept - { - return Derived(_data + offset, count); - } - private: pointer _data{nullptr}; size_type _size{0}; @@ -234,6 +222,15 @@ struct host_span : public cudf::detail::span_basedata() + offset, count, _is_device_accessible}; + } + private: bool _is_device_accessible{false}; }; @@ -368,6 +378,19 @@ struct device_span : public cudf::detail::span_basedata() + offset, count}; + } }; /** @} */ // end of group @@ -386,42 +409,38 @@ class base_2dspan { constexpr base_2dspan() noexcept = default; /** - * @brief Constructor a 2D span + * @brief Constructor from a span and number of elements in each row. * - * @param data Pointer to the data - * @param rows Number of rows + * @param flat_view The flattened 2D span * @param columns Number of columns */ - constexpr base_2dspan(T* data, size_t rows, size_t columns) noexcept - : _data{data}, _size{rows, columns} + constexpr base_2dspan(RowType flat_view, size_t columns) + : _flat{flat_view}, _size{columns == 0 ? 0 : flat_view.size() / columns, columns} { + CUDF_EXPECTS(_size.first * _size.second == flat_view.size(), "Invalid 2D span size"); } - /** - * @brief Constructor a 2D span - * - * @param data Pointer to the data - * @param size Size of the 2D span as pair - */ - base_2dspan(T* data, size_type size) noexcept : _data{data}, _size{std::move(size)} {} /** * @brief Returns a pointer to the beginning of the sequence. * * @return A pointer to the first element of the span */ - constexpr auto data() const noexcept { return _data; } + constexpr auto data() const noexcept { return _flat.data(); } + /** * @brief Returns the size in the span as pair. * * @return pair representing rows and columns size of the span */ constexpr auto size() const noexcept { return _size; } + /** * @brief Returns the number of elements in the span. 
* * @return Number of elements in the span */ - constexpr auto count() const noexcept { return size().first * size().second; } + constexpr auto count() const noexcept { return _flat.size(); } + /** * @brief Checks if the span is empty. * @@ -429,19 +448,6 @@ class base_2dspan { */ [[nodiscard]] constexpr bool is_empty() const noexcept { return count() == 0; } - /** - * @brief Returns flattened index of the element at the specified 2D position. - * - * @param row The row index - * @param column The column index - * @param size The size of the 2D span as pair - * @return The flattened index of the element at the specified 2D position - */ - static constexpr size_t flatten_index(size_t row, size_t column, size_type size) noexcept - { - return row * size.second + column; - } - /** * @brief Returns a reference to the row-th element of the sequence. * @@ -453,41 +459,7 @@ class base_2dspan { */ constexpr RowType operator[](size_t row) const { - return {this->data() + flatten_index(row, 0, this->size()), this->size().second}; - } - - /** - * @brief Returns a reference to the first element in the span. - * - * Calling front() on an empty span results in undefined behavior. - * - * @return Reference to the first element in the span - */ - [[nodiscard]] constexpr RowType front() const { return (*this)[0]; } - /** - * @brief Returns a reference to the last element in the span. - * - * Calling back() on an empty span results in undefined behavior. - * - * @return Reference to the last element in the span - */ - [[nodiscard]] constexpr RowType back() const - { - return (*this)[size().first - 1]; - } - - /** - * @brief Obtains a 2D span that is a view over the `num_rows` rows of this span starting at - * `first_row` - * - * @param first_row The first row in the subspan - * @param num_rows The number of rows in the subspan - * @return A subspan of the sequence, of requested starting `first_row` and `num_rows` - */ - constexpr base_2dspan subspan(size_t first_row, size_t num_rows) const noexcept - { - return base_2dspan( - _data + flatten_index(first_row, 0, this->size()), num_rows, this->size().second); + return _flat.subspan(row * _size.second, _size.second); } /** @@ -495,10 +467,7 @@ class base_2dspan { * * @return A flattened span of the 2D span */ - constexpr RowType flat_view() - { - return {this->data(), this->size().first * this->size().second}; - } + constexpr RowType flat_view() const { return _flat; } /** * @brief Construct a 2D span from another 2D span of convertible type @@ -514,13 +483,13 @@ class base_2dspan { RowType>, void>* = nullptr> constexpr base_2dspan(base_2dspan const& other) noexcept - : _data{other.data()}, _size{other.size()} + : _flat{other.flat_view()}, _size{other.size()} { } protected: - T* _data = nullptr; ///< pointer to the first element - size_type _size{0, 0}; ///< rows, columns + RowType _flat; ///< flattened 2D span + size_type _size{0, 0}; ///< num rows, num columns }; /** diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index ecf319e75ab..9cc77e8e738 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -668,8 +668,8 @@ void reader_impl::load_next_stripe_data(read_mode mode) if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; - auto compinfo = cudf::detail::hostdevice_span( - hd_compinfo.begin(), hd_compinfo.d_begin(), stream_range.size()); + auto compinfo = 
cudf::detail::hostdevice_span{hd_compinfo}.subspan( + 0, stream_range.size()); for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { auto const& info = stream_info[stream_idx]; auto const dst_base = diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index d628e936cb1..a1e4aa65dcf 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -508,21 +508,20 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector const& auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async( prefix_sums_to_update, stream, cudf::get_current_device_resource_ref()); - thrust::for_each( - rmm::exec_policy_nosync(stream), - d_prefix_sums_to_update.begin(), - d_prefix_sums_to_update.end(), - [num_stripes, chunks = cudf::detail::device_2dspan{chunks}] __device__( - auto const& idx_psums) { - auto const col_idx = idx_psums.first; - auto const psums = idx_psums.second; - thrust::transform(thrust::seq, - thrust::make_counting_iterator(0ul), - thrust::make_counting_iterator(num_stripes), - psums, - [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; }); - thrust::inclusive_scan(thrust::seq, psums, psums + num_stripes, psums); - }); + thrust::for_each(rmm::exec_policy_nosync(stream), + d_prefix_sums_to_update.begin(), + d_prefix_sums_to_update.end(), + [num_stripes, chunks = chunks.device_view()] __device__(auto const& idx_psums) { + auto const col_idx = idx_psums.first; + auto const psums = idx_psums.second; + thrust::transform( + thrust::seq, + thrust::make_counting_iterator(0ul), + thrust::make_counting_iterator(num_stripes), + psums, + [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; }); + thrust::inclusive_scan(thrust::seq, psums, psums + num_stripes, psums); + }); // `prefix_sums_to_update` goes out of scope, copy has to be done before we return stream.synchronize(); } @@ -554,12 +553,12 @@ void aggregate_child_meta(std::size_t level, col_meta.num_child_rows_per_stripe.resize(number_of_child_chunks); col_meta.rwgrp_meta.resize(num_of_rowgroups * num_child_cols); - auto child_start_row = cudf::detail::host_2dspan( - col_meta.child_start_row.data(), num_of_stripes, num_child_cols); - auto num_child_rows_per_stripe = cudf::detail::host_2dspan( - col_meta.num_child_rows_per_stripe.data(), num_of_stripes, num_child_cols); + auto child_start_row = + cudf::detail::host_2dspan(col_meta.child_start_row, num_child_cols); + auto num_child_rows_per_stripe = + cudf::detail::host_2dspan(col_meta.num_child_rows_per_stripe, num_child_cols); auto rwgrp_meta = cudf::detail::host_2dspan( - col_meta.rwgrp_meta.data(), num_of_rowgroups, num_child_cols); + col_meta.rwgrp_meta, num_child_cols); int index = 0; // number of child column processed @@ -951,8 +950,9 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) // Setup row group descriptors if using indexes. 
if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { - auto compinfo = cudf::detail::hostdevice_span( - hd_compinfo.begin(), hd_compinfo.d_begin(), stream_range.size()); + auto const compinfo = + cudf::detail::hostdevice_span{hd_compinfo}.subspan( + 0, stream_range.size()); auto decomp_data = decompress_stripe_data(load_stripe_range, stream_range, stripe_count, diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index b09062f700e..03020eb649f 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -718,8 +718,8 @@ std::vector> calculate_aligned_rowgroup_bounds( auto d_pd_set_counts_data = rmm::device_uvector( orc_table.num_columns() * segmentation.num_rowgroups(), stream); - auto const d_pd_set_counts = device_2dspan{ - d_pd_set_counts_data.data(), segmentation.num_rowgroups(), orc_table.num_columns()}; + auto const d_pd_set_counts = + device_2dspan{d_pd_set_counts_data, orc_table.num_columns()}; gpu::reduce_pushdown_masks(orc_table.d_columns, segmentation.rowgroups, d_pd_set_counts, stream); auto aligned_rgs = hostdevice_2dvector( @@ -740,7 +740,7 @@ std::vector> calculate_aligned_rowgroup_bounds( [columns = device_span{orc_table.d_columns}, stripes = device_span{d_stripes}, d_pd_set_counts, - out_rowgroups = device_2dspan{aligned_rgs}] __device__(auto& idx) { + out_rowgroups = aligned_rgs.device_view()] __device__(auto& idx) { uint32_t const col_idx = idx / stripes.size(); // No alignment needed for root columns if (not columns[col_idx].parent_index.has_value()) return; @@ -912,7 +912,7 @@ encoded_data encode_columns(orc_table_view const& orc_table, rmm::exec_policy(stream), thrust::make_counting_iterator(0ul), chunks.count(), - [chunks = device_2dspan{chunks}, + [chunks = chunks.device_view(), cols = device_span{orc_table.d_columns}] __device__(auto& idx) { auto const col_idx = idx / chunks.size().second; auto const rg_idx = idx % chunks.size().second; @@ -1898,7 +1898,7 @@ hostdevice_2dvector calculate_rowgroup_bounds(orc_table_view cons thrust::make_counting_iterator(0ul), num_rowgroups, [cols = device_span{orc_table.d_columns}, - rg_bounds = device_2dspan{rowgroup_bounds}, + rg_bounds = rowgroup_bounds.device_view(), rowgroup_size] __device__(auto rg_idx) mutable { thrust::transform( thrust::seq, cols.begin(), cols.end(), rg_bounds[rg_idx].begin(), [&](auto const& col) { @@ -1988,8 +1988,7 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table, d_tmp_rowgroup_sizes.end(), [src = esizes.data(), col_idx = col_idx, - rg_bounds = device_2dspan{ - segmentation.rowgroups}] __device__(auto idx) { + rg_bounds = segmentation.rowgroups.device_view()] __device__(auto idx) { return src[rg_bounds[idx][col_idx].end - 1]; }); @@ -2051,7 +2050,7 @@ auto set_rowgroup_char_counts(orc_table_view& orc_table, auto const num_str_cols = orc_table.num_string_columns(); auto counts = rmm::device_uvector(num_str_cols * num_rowgroups, stream); - auto counts_2d_view = device_2dspan(counts.data(), num_str_cols, num_rowgroups); + auto counts_2d_view = device_2dspan(counts, num_rowgroups); gpu::rowgroup_char_counts(counts_2d_view, orc_table.d_columns, rowgroup_bounds, diff --git a/cpp/src/io/utilities/hostdevice_span.hpp b/cpp/src/io/utilities/hostdevice_span.hpp index 1d8b34addbd..a3ddef52dd8 100644 --- a/cpp/src/io/utilities/hostdevice_span.hpp +++ b/cpp/src/io/utilities/hostdevice_span.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -33,31 +34,18 @@ class hostdevice_span { hostdevice_span(hostdevice_span const&) = 
default; ///< Copy constructor hostdevice_span(hostdevice_span&&) = default; ///< Move constructor - hostdevice_span(T* cpu_data, T* gpu_data, size_t size) - : _size(size), _device_data(gpu_data), _host_data(cpu_data) + hostdevice_span(host_span host_data, T* device_data) + : _host_data{host_data}, _device_data{device_data} { } - /// Constructor from container - /// @param in The container to construct the span from - template ().host_ptr())> (*)[], // NOLINT - T (*)[]>>* = nullptr> // NOLINT - constexpr hostdevice_span(C& in) : hostdevice_span(in.host_ptr(), in.device_ptr(), in.size()) - { - } - - /// Constructor from const container - /// @param in The container to construct the span from - template ().host_ptr())> (*)[], // NOLINT - T (*)[]>>* = nullptr> // NOLINT - constexpr hostdevice_span(C const& in) - : hostdevice_span(in.host_ptr(), in.device_ptr(), in.size()) + // Copy construction to support const conversion + /// @param other The span to copy + template , // NOLINT + void>* = nullptr> + constexpr hostdevice_span(hostdevice_span const& other) noexcept + : _host_data{host_span{other}}, _device_data{other.device_ptr()} { } @@ -74,15 +62,13 @@ class hostdevice_span { * @tparam T The device span type. * @return A typed device span of the hostdevice view's data. */ - [[nodiscard]] operator cudf::device_span() { return {_device_data, size()}; } - - /** - * @brief Converts a hostdevice view into a device span of const data. - * - * @tparam T The device span type. - * @return A const typed device span of the hostdevice view's data. - */ - [[nodiscard]] operator cudf::device_span() const { return {_device_data, size()}; } + template , // NOLINT + void>* = nullptr> + [[nodiscard]] operator cudf::device_span() const noexcept + { + return {_device_data, size()}; + } /** * @brief Returns the underlying device data. @@ -114,9 +100,12 @@ class hostdevice_span { * @tparam T The host span type. * @return A typed host span of the hostdevice_span's data. */ - [[nodiscard]] operator cudf::host_span() const noexcept + template , // NOLINT + void>* = nullptr> + [[nodiscard]] operator host_span() const noexcept { - return cudf::host_span(_host_data, size()); + return {_host_data}; } /** @@ -125,7 +114,7 @@ class hostdevice_span { * @tparam T The type to cast to * @return T* Typed pointer to underlying data */ - [[nodiscard]] T* host_ptr(size_t offset = 0) const noexcept { return _host_data + offset; } + [[nodiscard]] T* host_ptr(size_t offset = 0) const noexcept { return _host_data.data() + offset; } /** * @brief Return first element in host data. @@ -136,19 +125,19 @@ class hostdevice_span { [[nodiscard]] T* host_begin() const noexcept { return host_ptr(); } /** - * @brief Return one past the last elementin host data. + * @brief Return one past the last element in host data. 
* * @tparam T The desired type * @return T const* Pointer to one past the last element */ - [[nodiscard]] T* host_end() const noexcept { return host_begin() + size(); } + [[nodiscard]] T* host_end() const noexcept { return _host_data.end(); } /** * @brief Returns the number of elements in the view * * @return The number of elements in the view */ - [[nodiscard]] std::size_t size() const noexcept { return _size; } + [[nodiscard]] std::size_t size() const noexcept { return _host_data.size(); } /** * @brief Returns true if `size()` returns zero, or false otherwise @@ -159,12 +148,11 @@ class hostdevice_span { [[nodiscard]] size_t size_bytes() const noexcept { return sizeof(T) * size(); } - [[nodiscard]] T& operator[](size_t i) { return _host_data[i]; } - [[nodiscard]] T const& operator[](size_t i) const { return _host_data[i]; } + [[nodiscard]] T& operator[](size_t i) const { return _host_data[i]; } /** - * @brief Obtains a hostdevice_span that is a view over the `count` elements of this - * hostdevice_span starting at offset + * @brief Obtains a `hostdevice_span` that is a view over the `count` elements of this + * hostdevice_span starting at `offset` * * @param offset The offset of the first element in the subspan * @param count The number of elements in the subspan @@ -172,37 +160,37 @@ class hostdevice_span { */ [[nodiscard]] constexpr hostdevice_span subspan(size_t offset, size_t count) const noexcept { - return hostdevice_span(_host_data + offset, _device_data + offset, count); + return hostdevice_span(_host_data.subspan(offset, count), device_ptr(offset)); } - void host_to_device_async(rmm::cuda_stream_view stream) + void host_to_device_async(rmm::cuda_stream_view stream) const { - CUDF_CUDA_TRY( - cudaMemcpyAsync(device_ptr(), host_ptr(), size_bytes(), cudaMemcpyDefault, stream.value())); + static_assert(not std::is_const_v, "Cannot copy to const device memory"); + cudf::detail::cuda_memcpy_async(device_span{device_ptr(), size()}, _host_data, stream); } - void host_to_device_sync(rmm::cuda_stream_view stream) + void host_to_device_sync(rmm::cuda_stream_view stream) const { host_to_device_async(stream); stream.synchronize(); } - void device_to_host_async(rmm::cuda_stream_view stream) + void device_to_host_async(rmm::cuda_stream_view stream) const { - CUDF_CUDA_TRY( - cudaMemcpyAsync(host_ptr(), device_ptr(), size_bytes(), cudaMemcpyDefault, stream.value())); + static_assert(not std::is_const_v, "Cannot copy to const host memory"); + cudf::detail::cuda_memcpy_async( + _host_data, device_span{device_ptr(), size()}, stream); } - void device_to_host_sync(rmm::cuda_stream_view stream) + void device_to_host_sync(rmm::cuda_stream_view stream) const { device_to_host_async(stream); stream.synchronize(); } private: - size_t _size{}; ///< Number of elements - T* _device_data{}; ///< Pointer to device memory containing elements - T* _host_data{}; ///< Pointer to host memory containing elements + host_span _host_data; + T* _device_data{nullptr}; }; } // namespace cudf::detail diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index 634e6d78ebc..af1ba16a424 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -117,8 +117,11 @@ class hostdevice_vector { return d_data.element(element_index, stream); } - operator cudf::host_span() { return {host_ptr(), size()}; } - operator cudf::host_span() const { return {host_ptr(), size()}; } + operator cudf::host_span() { return host_span{h_data}.subspan(0, 
size()); } + operator cudf::host_span() const + { + return host_span{h_data}.subspan(0, size()); + } operator cudf::device_span() { return {device_ptr(), size()}; } operator cudf::device_span() const { return {device_ptr(), size()}; } @@ -142,24 +145,11 @@ class hostdevice_vector { * * @return A typed hostdevice_span of the hostdevice_vector's data */ - [[nodiscard]] operator hostdevice_span() - { - return hostdevice_span{h_data.data(), d_data.data(), size()}; - } + [[nodiscard]] operator hostdevice_span() { return {host_span{h_data}, device_ptr()}; } - /** - * @brief Converts a part of a hostdevice_vector into a hostdevice_span. - * - * @param offset The offset of the first element in the subspan - * @param count The number of elements in the subspan - * @return A typed hostdevice_span of the hostdevice_vector's data - */ - [[nodiscard]] hostdevice_span subspan(size_t offset, size_t count) + [[nodiscard]] operator hostdevice_span() const { - CUDF_EXPECTS(offset < d_data.size(), "Offset is out of bounds."); - CUDF_EXPECTS(count <= d_data.size() - offset, - "The span with given offset and count is out of bounds."); - return hostdevice_span{h_data.data() + offset, d_data.data() + offset, count}; + return {host_span{h_data}, device_ptr()}; } private: @@ -182,26 +172,26 @@ class hostdevice_2dvector { { } - operator device_2dspan() { return {_data.device_ptr(), _size}; } - operator device_2dspan() const { return {_data.device_ptr(), _size}; } + operator device_2dspan() { return {device_span{_data}, _size.second}; } + operator device_2dspan() const { return {device_span{_data}, _size.second}; } device_2dspan device_view() { return static_cast>(*this); } - device_2dspan device_view() const { return static_cast>(*this); } + device_2dspan device_view() const { return static_cast>(*this); } - operator host_2dspan() { return {_data.host_ptr(), _size}; } - operator host_2dspan() const { return {_data.host_ptr(), _size}; } + operator host_2dspan() { return {host_span{_data}, _size.second}; } + operator host_2dspan() const { return {host_span{_data}, _size.second}; } host_2dspan host_view() { return static_cast>(*this); } - host_2dspan host_view() const { return static_cast>(*this); } + host_2dspan host_view() const { return static_cast>(*this); } host_span operator[](size_t row) { - return {_data.host_ptr() + host_2dspan::flatten_index(row, 0, _size), _size.second}; + return host_span{_data}.subspan(row * _size.second, _size.second); } host_span operator[](size_t row) const { - return {_data.host_ptr() + host_2dspan::flatten_index(row, 0, _size), _size.second}; + return host_span{_data}.subspan(row * _size.second, _size.second); } auto size() const noexcept { return _size; } diff --git a/cpp/src/strings/extract/extract.cu b/cpp/src/strings/extract/extract.cu index 7323918dcff..8683a9bdfbe 100644 --- a/cpp/src/strings/extract/extract.cu +++ b/cpp/src/strings/extract/extract.cu @@ -100,9 +100,8 @@ std::unique_ptr
extract(strings_column_view const& input, auto const groups = d_prog->group_counts(); CUDF_EXPECTS(groups > 0, "Group indicators not found in regex pattern"); - auto indices = rmm::device_uvector(input.size() * groups, stream); - auto d_indices = - cudf::detail::device_2dspan(indices.data(), input.size(), groups); + auto indices = rmm::device_uvector(input.size() * groups, stream); + auto d_indices = cudf::detail::device_2dspan(indices, groups); auto const d_strings = column_device_view::create(input.parent(), stream); diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp index 1e1e21fe18a..7b8ee840da4 100644 --- a/cpp/tests/utilities_tests/pinned_memory_tests.cpp +++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "io/utilities/hostdevice_vector.hpp" + #include #include #include @@ -27,6 +29,12 @@ #include #include +using cudf::host_span; +using cudf::detail::host_2dspan; +using cudf::detail::hostdevice_2dvector; +using cudf::detail::hostdevice_span; +using cudf::detail::hostdevice_vector; + class PinnedMemoryTest : public cudf::test::BaseFixture { size_t prev_copy_threshold; size_t prev_alloc_threshold; @@ -132,10 +140,10 @@ TEST_F(PinnedMemoryTest, HostSpan) auto test_ctors = [](auto&& vec) { auto const is_vec_device_accessible = vec.get_allocator().is_device_accessible(); // Test conversion from a vector - auto const span = cudf::host_span{vec}; + auto const span = host_span{vec}; EXPECT_EQ(span.is_device_accessible(), is_vec_device_accessible); // Test conversion from host_span with different type - auto const span_converted = cudf::host_span{span}; + auto const span_converted = host_span{span}; EXPECT_EQ(span_converted.is_device_accessible(), is_vec_device_accessible); }; @@ -144,4 +152,45 @@ TEST_F(PinnedMemoryTest, HostSpan) // some iterations will use pinned memory, some will not test_ctors(cudf::detail::make_host_vector(i, cudf::get_default_stream())); } + + auto stream{cudf::get_default_stream()}; + + // hostdevice vectors use pinned memory for the host side; test that host_span can be constructed + // from a hostdevice_vector with correct device accessibility + + hostdevice_vector hd_vec(10, stream); + auto const span = host_span{hd_vec}; + EXPECT_TRUE(span.is_device_accessible()); + + // test host_view and operator[] + { + hostdevice_2dvector hd_2dvec(10, 10, stream); + auto const span2d = hd_2dvec.host_view().flat_view(); + EXPECT_TRUE(span2d.is_device_accessible()); + + auto const span2d_from_cast = host_2dspan{hd_2dvec}; + EXPECT_TRUE(span2d_from_cast.flat_view().is_device_accessible()); + + auto const row_span = hd_2dvec[0]; + EXPECT_TRUE(row_span.is_device_accessible()); + } + + // test const versions of host_view and operator[] + { + hostdevice_2dvector const const_hd_2dvec(10, 10, stream); + auto const const_span2d = const_hd_2dvec.host_view().flat_view(); + EXPECT_TRUE(const_span2d.is_device_accessible()); + + auto const const_span2d_from_cast = host_2dspan{const_hd_2dvec}; + EXPECT_TRUE(const_span2d_from_cast.flat_view().is_device_accessible()); + + auto const const_row_span = const_hd_2dvec[0]; + EXPECT_TRUE(const_row_span.is_device_accessible()); + } + + // test hostdevice_span + { + hostdevice_span hd_span(hd_vec); + EXPECT_TRUE(host_span{hd_span}.is_device_accessible()); + } } diff --git a/cpp/tests/utilities_tests/span_tests.cu b/cpp/tests/utilities_tests/span_tests.cu index 019d6adc007..5389e1c069d 100644 --- 
a/cpp/tests/utilities_tests/span_tests.cu +++ b/cpp/tests/utilities_tests/span_tests.cu @@ -336,58 +336,50 @@ auto get_test_hostdevice_vector() TEST(HostDeviceSpanTest, CanCreateFullSubspan) { - auto message = get_test_hostdevice_vector(); - auto const message_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); + auto message = get_test_hostdevice_vector(); + auto const message_span = cudf::detail::hostdevice_span{message}; - expect_equivalent(message_span, message.subspan(0, message_span.size())); + expect_equivalent(message_span.subspan(0, message_span.size()), message_span); } TEST(HostDeviceSpanTest, CanCreateHostSpan) { auto message = get_test_hostdevice_vector(); auto const message_span = host_span(message.host_ptr(), message.size()); - auto const hd_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); + auto const hd_span = cudf::detail::hostdevice_span{message}; expect_equivalent(message_span, cudf::host_span(hd_span)); } TEST(HostDeviceSpanTest, CanTakeSubspanFull) { - auto message = get_test_hostdevice_vector(); - auto const message_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); + auto message = get_test_hostdevice_vector(); + auto const message_span = cudf::detail::hostdevice_span{message}; - expect_match("hello world", message.subspan(0, 11)); expect_match("hello world", message_span.subspan(0, 11)); } TEST(HostDeviceSpanTest, CanTakeSubspanPartial) { - auto message = get_test_hostdevice_vector(); - auto const message_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); + auto message = get_test_hostdevice_vector(); + auto const message_span = cudf::detail::hostdevice_span{message}; - expect_match("lo w", message.subspan(3, 4)); expect_match("lo w", message_span.subspan(3, 4)); } TEST(HostDeviceSpanTest, CanGetData) { - auto message = get_test_hostdevice_vector(); - auto const message_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); + auto message = get_test_hostdevice_vector(); + auto const message_span = cudf::detail::hostdevice_span{message}; EXPECT_EQ(message.host_ptr(), message_span.host_ptr()); } TEST(HostDeviceSpanTest, CanGetSize) { - auto message = get_test_hostdevice_vector(); - auto const message_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); - auto const empty_span = cudf::detail::hostdevice_span(); + auto message = get_test_hostdevice_vector(); + auto const message_span = cudf::detail::hostdevice_span{message}; + auto const empty_span = cudf::detail::hostdevice_span(); EXPECT_EQ(static_cast(11), message_span.size()); EXPECT_EQ(static_cast(0), empty_span.size()); @@ -413,8 +405,7 @@ TEST(HostDeviceSpanTest, CanCopySpan) cudf::detail::hostdevice_span message_span_copy; { - auto const message_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); + auto const message_span = cudf::detail::hostdevice_span{message}; message_span_copy = message_span; } From 920a5f6a1b0a423dda2b7c2d73aae4ab0df62053 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 17 Oct 2024 15:28:08 -0400 Subject: [PATCH 082/117] Remove the additional host register calls initially intended for performance improvement on Grace Hopper (#17092) On Grace Hopper, file I/O takes a special path that calls `cudaHostRegister` to circumvent a performance issue. 
A recent benchmark shows that this workaround is no longer necessary. This PR cleans it up. Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17092 --- cpp/include/cudf/io/datasource.hpp | 19 +++---- cpp/src/io/functions.cpp | 12 ++--- cpp/src/io/utilities/datasource.cpp | 84 ++--------------------------- 3 files changed, 14 insertions(+), 101 deletions(-) diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index dc14802adc1..7d2cc4ad493 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -86,28 +86,21 @@ class datasource { /** * @brief Creates a source from a file path. * - * @note Parameters `offset`, `max_size_estimate` and `min_size_estimate` are hints to the - * `datasource` implementation about the expected range of the data that will be read. The - * implementation may use these hints to optimize the read operation. These parameters are usually - * based on the byte range option. In this case, `min_size_estimate` should be no greater than the - * byte range to avoid potential issues when reading adjacent ranges. `max_size_estimate` can - * include padding after the byte range, to include additional data that may be needed for - * processing. - * - @throws cudf::logic_error if the minimum size estimate is greater than the maximum size estimate + * Parameters `offset` and `max_size_estimate` are hints to the `datasource` implementation about + * the expected range of the data that will be read. The implementation may use these hints to + * optimize the read operation. These parameters are usually based on the byte range option. In + * this case, `max_size_estimate` can include padding after the byte range, to include additional + * data that may be needed for processing. * * @param[in] filepath Path to the file to use * @param[in] offset Starting byte offset from which data will be read (the default is zero) * @param[in] max_size_estimate Upper estimate of the data range that will be read (the default is * zero, which means the whole file after `offset`) - * @param[in] min_size_estimate Lower estimate of the data range that will be read (the default is - * zero, which means the whole file after `offset`) * @return Constructed datasource object */ static std::unique_ptr create(std::string const& filepath, size_t offset = 0, - size_t max_size_estimate = 0, - size_t min_size_estimate = 0); + size_t max_size_estimate = 0); /** * @brief Creates a source from a host memory buffer.
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 5a060902eb2..a8682e6a760 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -123,15 +123,13 @@ namespace { std::vector> make_datasources(source_info const& info, size_t offset = 0, - size_t max_size_estimate = 0, - size_t min_size_estimate = 0) + size_t max_size_estimate = 0) { switch (info.type()) { case io_type::FILEPATH: { auto sources = std::vector>(); for (auto const& filepath : info.filepaths()) { - sources.emplace_back( - cudf::io::datasource::create(filepath, offset, max_size_estimate, min_size_estimate)); + sources.emplace_back(cudf::io::datasource::create(filepath, offset, max_size_estimate)); } return sources; } @@ -213,8 +211,7 @@ table_with_metadata read_json(json_reader_options options, auto datasources = make_datasources(options.get_source(), options.get_byte_range_offset(), - options.get_byte_range_size_with_padding(), - options.get_byte_range_size()); + options.get_byte_range_size_with_padding()); return json::detail::read_json(datasources, options, stream, mr); } @@ -241,8 +238,7 @@ table_with_metadata read_csv(csv_reader_options options, auto datasources = make_datasources(options.get_source(), options.get_byte_range_offset(), - options.get_byte_range_size_with_padding(), - options.get_byte_range_size()); + options.get_byte_range_size_with_padding()); CUDF_EXPECTS(datasources.size() == 1, "Only a single source is currently supported."); diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 0be976b6144..28f7f08521e 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -134,27 +134,6 @@ class file_source : public datasource { static constexpr size_t _gds_read_preferred_threshold = 128 << 10; // 128KB }; -/** - * @brief Memoized pageableMemoryAccessUsesHostPageTables device property. - */ -[[nodiscard]] bool pageableMemoryAccessUsesHostPageTables() -{ - static std::unordered_map result_cache{}; - - int deviceId{}; - CUDF_CUDA_TRY(cudaGetDevice(&deviceId)); - - if (result_cache.find(deviceId) == result_cache.end()) { - cudaDeviceProp props{}; - CUDF_CUDA_TRY(cudaGetDeviceProperties(&props, deviceId)); - result_cache[deviceId] = (props.pageableMemoryAccessUsesHostPageTables == 1); - CUDF_LOG_INFO( - "Device {} pageableMemoryAccessUsesHostPageTables: {}", deviceId, result_cache[deviceId]); - } - - return result_cache[deviceId]; -} - /** * @brief Implementation class for reading from a file using memory mapped access. 
* @@ -163,28 +142,18 @@ class file_source : public datasource { */ class memory_mapped_source : public file_source { public: - explicit memory_mapped_source(char const* filepath, - size_t offset, - size_t max_size_estimate, - size_t min_size_estimate) + explicit memory_mapped_source(char const* filepath, size_t offset, size_t max_size_estimate) : file_source(filepath) { if (_file.size() != 0) { // Memory mapping is not exclusive, so we can include the whole region we expect to read map(_file.desc(), offset, max_size_estimate); - // Buffer registration is exclusive (can't overlap with other registered buffers) so we - // register the lower estimate; this avoids issues when reading adjacent ranges from the same - // file from multiple threads - register_mmap_buffer(offset, min_size_estimate); } } ~memory_mapped_source() override { - if (_map_addr != nullptr) { - unmap(); - unregister_mmap_buffer(); - } + if (_map_addr != nullptr) { unmap(); } } std::unique_ptr host_read(size_t offset, size_t size) override @@ -227,46 +196,6 @@ class memory_mapped_source : public file_source { } private: - /** - * @brief Page-locks (registers) the memory range of the mapped file. - * - * Fixes nvbugs/4215160 - */ - void register_mmap_buffer(size_t offset, size_t size) - { - if (_map_addr == nullptr or not pageableMemoryAccessUsesHostPageTables()) { return; } - - // Registered region must be within the mapped region - _reg_offset = std::max(offset, _map_offset); - _reg_size = std::min(size != 0 ? size : _map_size, (_map_offset + _map_size) - _reg_offset); - - _reg_addr = static_cast(_map_addr) - _map_offset + _reg_offset; - auto const result = cudaHostRegister(_reg_addr, _reg_size, cudaHostRegisterReadOnly); - if (result != cudaSuccess) { - _reg_addr = nullptr; - CUDF_LOG_WARN("cudaHostRegister failed with {} ({})", - static_cast(result), - cudaGetErrorString(result)); - } - } - - /** - * @brief Unregisters the memory range of the mapped file. 
- */ - void unregister_mmap_buffer() - { - if (_reg_addr == nullptr) { return; } - - auto const result = cudaHostUnregister(_reg_addr); - if (result == cudaSuccess) { - _reg_addr = nullptr; - } else { - CUDF_LOG_WARN("cudaHostUnregister failed with {} ({})", - static_cast(result), - cudaGetErrorString(result)); - } - } - void map(int fd, size_t offset, size_t size) { CUDF_EXPECTS(offset < _file.size(), "Offset is past end of file", std::overflow_error); @@ -461,12 +390,8 @@ class user_datasource_wrapper : public datasource { std::unique_ptr datasource::create(std::string const& filepath, size_t offset, - size_t max_size_estimate, - size_t min_size_estimate) + size_t max_size_estimate) { - CUDF_EXPECTS(max_size_estimate == 0 or min_size_estimate <= max_size_estimate, - "Invalid min/max size estimates for datasource creation"); - #ifdef CUFILE_FOUND if (cufile_integration::is_always_enabled()) { // avoid mmap as GDS is expected to be used for most reads @@ -474,8 +399,7 @@ std::unique_ptr datasource::create(std::string const& filepath, } #endif // Use our own memory mapping implementation for direct file reads - return std::make_unique( - filepath.c_str(), offset, max_size_estimate, min_size_estimate); + return std::make_unique(filepath.c_str(), offset, max_size_estimate); } std::unique_ptr datasource::create(host_buffer const& buffer) From 00feb82cbda10bf65343e08d54ed9e893ff4aa71 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 17 Oct 2024 13:02:00 -0700 Subject: [PATCH 083/117] Limit the number of keys to calculate column sizes and page starts in PQ reader to 1B (#17059) This PR limits the number of keys to use at a time to calculate column `sizes` and `page_start_values` to 1B averting possible OOM and UB from implicit typecasting of `size_t` iterator to `size_type` iterators in `thrust::reduce_by_key`. Closes #16985 Closes #17086 ## Resolved - [x] Add tests - [x] Debug with fingerprinting structs table for a possible bug in PQ writer (nothing seems wrong with the writer as pyarrow is able to read the written parquet files). Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Bradley Dice (https://github.com/bdice) - Vukasin Milovanovic (https://github.com/vuule) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/17059 --- cpp/src/io/parquet/reader_impl_preprocess.cu | 89 ++++++++++++++------ cpp/tests/io/parquet_reader_test.cpp | 35 ++++++++ 2 files changed, 96 insertions(+), 28 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 8cab68ea721..5138a92ac14 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -44,6 +44,7 @@ #include #include +#include #include namespace cudf::io::parquet::detail { @@ -1592,36 +1593,68 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num auto const d_cols_info = cudf::detail::make_device_uvector_async( h_cols_info, _stream, cudf::get_current_device_resource_ref()); - auto const num_keys = _input_columns.size() * max_depth * subpass.pages.size(); - // size iterator. 
indexes pages by sorted order - rmm::device_uvector size_input{num_keys, _stream}; - thrust::transform( - rmm::exec_policy(_stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_keys), - size_input.begin(), - get_page_nesting_size{ - d_cols_info.data(), max_depth, subpass.pages.size(), subpass.pages.device_begin()}); - auto const reduction_keys = - cudf::detail::make_counting_transform_iterator(0, get_reduction_key{subpass.pages.size()}); + // Vector to store page sizes for each column at each depth cudf::detail::hostdevice_vector sizes{_input_columns.size() * max_depth, _stream}; - // find the size of each column - thrust::reduce_by_key(rmm::exec_policy(_stream), - reduction_keys, - reduction_keys + num_keys, - size_input.cbegin(), - thrust::make_discard_iterator(), - sizes.d_begin()); - - // for nested hierarchies, compute per-page start offset - thrust::exclusive_scan_by_key( - rmm::exec_policy(_stream), - reduction_keys, - reduction_keys + num_keys, - size_input.cbegin(), - start_offset_output_iterator{ - subpass.pages.device_begin(), 0, d_cols_info.data(), max_depth, subpass.pages.size()}); + // Total number of keys to process + auto const num_keys = _input_columns.size() * max_depth * subpass.pages.size(); + + // Maximum 1 billion keys processed per iteration + auto constexpr max_keys_per_iter = + static_cast(std::numeric_limits::max() / 2); + + // Number of keys for per each column + auto const num_keys_per_col = max_depth * subpass.pages.size(); + + // The largest multiple of `num_keys_per_col` that is <= `num_keys` + auto const num_keys_per_iter = + num_keys <= max_keys_per_iter + ? num_keys + : num_keys_per_col * std::max(1, max_keys_per_iter / num_keys_per_col); + + // Size iterator. Indexes pages by sorted order + rmm::device_uvector size_input{num_keys_per_iter, _stream}; + + // To keep track of the starting key of an iteration + size_t key_start = 0; + // Loop until all keys are processed + while (key_start < num_keys) { + // Number of keys processed in this iteration + auto const num_keys_this_iter = std::min(num_keys_per_iter, num_keys - key_start); + thrust::transform( + rmm::exec_policy_nosync(_stream), + thrust::make_counting_iterator(key_start), + thrust::make_counting_iterator(key_start + num_keys_this_iter), + size_input.begin(), + get_page_nesting_size{ + d_cols_info.data(), max_depth, subpass.pages.size(), subpass.pages.device_begin()}); + + // Manually create a int64_t `key_start` compatible counting_transform_iterator to avoid + // implicit casting to size_type. 
+ auto const reduction_keys = thrust::make_transform_iterator( + thrust::make_counting_iterator(key_start), get_reduction_key{subpass.pages.size()}); + + // Find the size of each column + thrust::reduce_by_key(rmm::exec_policy_nosync(_stream), + reduction_keys, + reduction_keys + num_keys_this_iter, + size_input.cbegin(), + thrust::make_discard_iterator(), + sizes.d_begin() + (key_start / subpass.pages.size())); + + // For nested hierarchies, compute per-page start offset + thrust::exclusive_scan_by_key(rmm::exec_policy_nosync(_stream), + reduction_keys, + reduction_keys + num_keys_this_iter, + size_input.cbegin(), + start_offset_output_iterator{subpass.pages.device_begin(), + key_start, + d_cols_info.data(), + max_depth, + subpass.pages.size()}); + // Increment the key_start + key_start += num_keys_this_iter; + } sizes.device_to_host_sync(_stream); for (size_type idx = 0; idx < static_cast(_input_columns.size()); idx++) { diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index 4a5309f3ba7..ab4645c2e25 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -2724,3 +2724,38 @@ TYPED_TEST(ParquetReaderPredicatePushdownTest, FilterTyped) EXPECT_EQ(result_table.num_columns(), expected->num_columns()); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result_table); } + +TEST_F(ParquetReaderTest, ListsWideTable) +{ + auto constexpr num_rows = 2; + auto constexpr num_cols = 26'755; // for slightly over 2B keys + auto constexpr seed = 0xceed; + + std::mt19937 engine{seed}; + + auto list_list = make_parquet_list_list_col(0, num_rows, 1, 1, false); + auto list_list_nulls = make_parquet_list_list_col(0, num_rows, 1, 1, true); + + // switch between nullable and non-nullable + std::vector cols(num_cols); + bool with_nulls = false; + std::generate_n(cols.begin(), num_cols, [&]() { + auto const view = with_nulls ? list_list_nulls->view() : list_list->view(); + with_nulls = not with_nulls; + return view; + }); + + cudf::table_view expected(cols); + + // Use a host buffer for faster I/O + std::vector buffer; + auto const out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&buffer}, expected).build(); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options default_in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info(buffer.data(), buffer.size())); + auto const [result, _] = cudf::io::read_parquet(default_in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result->view()); +} From ce93c366c451e27a49583cbb809bf5579a4bcf15 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 17 Oct 2024 19:12:56 -0400 Subject: [PATCH 084/117] Migrate NVText Normalizing APIs to Pylibcudf (#17072) A part of #15162.
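A usage sketch mirroring the new pylibcudf tests (the input strings here are illustrative):

```python
import pyarrow as pa
import pylibcudf as plc

col = plc.interop.from_arrow(pa.array(["a  b", " c  d\n", "e \t f "]))

# Collapse and trim the whitespace around the tokens in each string.
spaces_normalized = plc.nvtext.normalize.normalize_spaces(col)

# Prepare strings for tokenizing; do_lower_case=True also lower-cases
# characters and strips accents.
chars_normalized = plc.nvtext.normalize.normalize_characters(col, True)
```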
Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17072 --- .../api_docs/pylibcudf/nvtext/index.rst | 1 + .../api_docs/pylibcudf/nvtext/normalize.rst | 6 ++ .../cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx | 13 ++-- python/cudf/cudf/_lib/nvtext/normalize.pyx | 38 ++++------- .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt | 2 +- .../pylibcudf/pylibcudf/nvtext/__init__.pxd | 4 +- python/pylibcudf/pylibcudf/nvtext/__init__.py | 10 ++- .../pylibcudf/pylibcudf/nvtext/normalize.pxd | 9 +++ .../pylibcudf/pylibcudf/nvtext/normalize.pyx | 64 +++++++++++++++++++ .../pylibcudf/tests/test_nvtext_normalize.py | 42 ++++++++++++ 10 files changed, 155 insertions(+), 34 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst create mode 100644 python/pylibcudf/pylibcudf/nvtext/normalize.pxd create mode 100644 python/pylibcudf/pylibcudf/nvtext/normalize.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst index 58303356336..3a79c869971 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -9,3 +9,4 @@ nvtext jaccard minhash ngrams_tokenize + normalize diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst new file mode 100644 index 00000000000..e496f6a45da --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst @@ -0,0 +1,6 @@ +========= +normalize +========= + +.. 
automodule:: pylibcudf.nvtext.normalize + :members: diff --git a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx index 6521116eafe..c125d92a24e 100644 --- a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx +++ b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx @@ -14,10 +14,11 @@ def ngrams_tokenize( object py_delimiter, object py_separator ): - result = nvtext.ngrams_tokenize.ngrams_tokenize( - input.to_pylibcudf(mode="read"), - ngrams, - py_delimiter.device_value.c_value, - py_separator.device_value.c_value + return Column.from_pylibcudf( + nvtext.ngrams_tokenize.ngrams_tokenize( + input.to_pylibcudf(mode="read"), + ngrams, + py_delimiter.device_value.c_value, + py_separator.device_value.c_value + ) ) - return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/normalize.pyx b/python/cudf/cudf/_lib/nvtext/normalize.pyx index 5e86a9ce959..633bc902db1 100644 --- a/python/cudf/cudf/_lib/nvtext/normalize.pyx +++ b/python/cudf/cudf/_lib/nvtext/normalize.pyx @@ -3,36 +3,24 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.normalize cimport ( - normalize_characters as cpp_normalize_characters, - normalize_spaces as cpp_normalize_spaces, -) from cudf._lib.column cimport Column - -@acquire_spill_lock() -def normalize_spaces(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_normalize_spaces(c_strings)) - - return Column.from_unique_ptr(move(c_result)) +from pylibcudf import nvtext @acquire_spill_lock() -def normalize_characters(Column strings, bool do_lower=True): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result +def normalize_spaces(Column input): + result = nvtext.normalize.normalize_spaces( + input.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(result) - with nogil: - c_result = move(cpp_normalize_characters(c_strings, do_lower)) - return Column.from_unique_ptr(move(c_result)) +@acquire_spill_lock() +def normalize_characters(Column input, bool do_lower=True): + result = nvtext.normalize.normalize_characters( + input.to_pylibcudf(mode="read"), + do_lower, + ) + return Column.from_pylibcudf(result) diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index 94df9bbbebb..e01ca3fbdd3 100644 --- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx - ngrams_tokenize.pyx + ngrams_tokenize.pyx normalize.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd index b6659827688..08dbec84090 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -6,6 +6,7 @@ from . 
cimport ( jaccard, minhash, ngrams_tokenize, + normalize, ) __all__ = [ @@ -13,5 +14,6 @@ __all__ = [ "generate_ngrams", "jaccard", "minhash", - "ngrams_tokenize" + "ngrams_tokenize", + "normalize", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index f74633a3521..6dccf3dd9cf 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -1,6 +1,13 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import edit_distance, generate_ngrams, jaccard, minhash, ngrams_tokenize +from . import ( + edit_distance, + generate_ngrams, + jaccard, + minhash, + ngrams_tokenize, + normalize, +) __all__ = [ "edit_distance", @@ -8,4 +15,5 @@ "jaccard", "minhash", "ngrams_tokenize", + "normalize", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd new file mode 100644 index 00000000000..90676145afa --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from pylibcudf.column cimport Column + + +cpdef Column normalize_spaces(Column input) + +cpdef Column normalize_characters(Column input, bool do_lower_case) diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx new file mode 100644 index 00000000000..637d900b659 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx @@ -0,0 +1,64 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.normalize cimport ( + normalize_characters as cpp_normalize_characters, + normalize_spaces as cpp_normalize_spaces, +) + + +cpdef Column normalize_spaces(Column input): + """ + Returns a new strings column by normalizing the whitespace in + each string in the input column. + + For details, see :cpp:func:`normalize_spaces` + + Parameters + ---------- + input : Column + Input strings + + Returns + ------- + Column + New strings columns of normalized strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_normalize_spaces(input.view()) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column normalize_characters(Column input, bool do_lower_case): + """ + Normalizes strings characters for tokenizing. + + For details, see :cpp:func:`normalize_characters` + + Parameters + ---------- + input : Column + Input strings + do_lower_case : bool + If true, upper-case characters are converted to lower-case + and accents are stripped from those characters. If false, + accented and upper-case characters are not transformed. + + Returns + ------- + Column + Normalized strings column + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_normalize_characters(input.view(), do_lower_case) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py new file mode 100644 index 00000000000..fe28b83c09a --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py @@ -0,0 +1,42 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture(scope="module") +def norm_spaces_input_data(): + arr = ["a b", " c d\n", "e \t f "] + return pa.array(arr) + + +@pytest.fixture(scope="module") +def norm_chars_input_data(): + arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"] + return pa.array(arr) + + +def test_normalize_spaces(norm_spaces_input_data): + result = plc.nvtext.normalize.normalize_spaces( + plc.interop.from_arrow(norm_spaces_input_data) + ) + expected = pa.array(["a b", "c d", "e f"]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("do_lower", [True, False]) +def test_normalize_characters(norm_chars_input_data, do_lower): + result = plc.nvtext.normalize.normalize_characters( + plc.interop.from_arrow(norm_chars_input_data), + do_lower, + ) + expected = pa.array( + ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "] + ) + if not do_lower: + expected = pa.array( + ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "] + ) + assert_column_eq(result, expected) From 8ebf0d40669014e42b9c9079cdc1cc3ab5c4f7a8 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 17 Oct 2024 18:27:58 -0700 Subject: [PATCH 085/117] Add device aggregators used by shared memory groupby (#17031) This work is part of splitting the original bulk shared memory groupby PR #16619. It introduces two device-side element aggregators: - `shmem_element_aggregator`: aggregates data from global memory sources to shared memory targets, - `gmem_element_aggregator`: aggregates from shared memory sources to global memory targets. These two aggregators are similar to the `elementwise_aggregator` functionality. Follow-up work is tracked via #17032. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17031 --- .../detail/aggregation/device_aggregators.cuh | 63 ++-- .../groupby/hash/global_memory_aggregator.cuh | 277 ++++++++++++++++++ .../groupby/hash/shared_memory_aggregator.cuh | 263 +++++++++++++++++ 3 files changed, 570 insertions(+), 33 deletions(-) create mode 100644 cpp/src/groupby/hash/global_memory_aggregator.cuh create mode 100644 cpp/src/groupby/hash/shared_memory_aggregator.cuh diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh index 10be5e1d36f..204eee49a2a 100644 --- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh +++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - #pragma once #include @@ -29,12 +28,31 @@ #include namespace cudf::detail { +/// Checks if an aggregation kind needs to operate on the underlying storage type +template +__device__ constexpr bool uses_underlying_type() +{ + return k == aggregation::MIN or k == aggregation::MAX or k == aggregation::SUM; +} + +/// Gets the underlying target type for the given source type and aggregation kind +template +using underlying_target_t = + cuda::std::conditional_t(), + cudf::device_storage_type_t>, + cudf::detail::target_type_t>; + +/// Gets the underlying source type for the given source type and aggregation kind +template +using underlying_source_t = + cuda::std::conditional_t(), cudf::device_storage_type_t, Source>; + template struct update_target_element { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept + __device__ void operator()(mutable_column_device_view, + size_type, + column_device_view, + size_type) const noexcept { CUDF_UNREACHABLE("Invalid source type and aggregation combination."); } @@ -51,8 +69,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; cudf::detail::atomic_min(&target.element(target_index), static_cast(source.element(source_index))); @@ -72,8 +88,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; using DeviceTarget = device_storage_type_t; using DeviceSource = device_storage_type_t; @@ -96,8 +110,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; cudf::detail::atomic_max(&target.element(target_index), static_cast(source.element(source_index))); @@ -117,8 +129,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; using DeviceTarget = device_storage_type_t; using DeviceSource = device_storage_type_t; @@ -141,8 +151,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; cudf::detail::atomic_add(&target.element(target_index), static_cast(source.element(source_index))); @@ -162,8 +170,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; using DeviceTarget = device_storage_type_t; using DeviceSource = device_storage_type_t; @@ -197,10 +203,10 @@ struct update_target_from_dictionary { template ()>* = nullptr> - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept + __device__ void operator()(mutable_column_device_view, + size_type, + column_device_view, + size_type) const noexcept { } }; @@ -227,8 +233,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - dispatch_type_and_aggregation( source.child(cudf::dictionary_column_view::keys_column_index).type(), k, @@ -249,8 +253,6 @@ struct 
update_target_element; auto value = static_cast(source.element(source_index)); cudf::detail::atomic_add(&target.element(target_index), value * value); @@ -267,8 +269,6 @@ struct update_target_element; cudf::detail::atomic_mul(&target.element(target_index), static_cast(source.element(source_index))); @@ -286,8 +286,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; cudf::detail::atomic_add(&target.element(target_index), Target{1}); @@ -323,8 +321,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; auto old = cudf::detail::atomic_cas( &target.element(target_index), ARGMAX_SENTINEL, source_index); @@ -349,8 +345,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; auto old = cudf::detail::atomic_cas( &target.element(target_index), ARGMIN_SENTINEL, source_index); @@ -376,6 +370,9 @@ struct elementwise_aggregator { column_device_view source, size_type source_index) const noexcept { + if constexpr (k != cudf::aggregation::COUNT_ALL) { + if (source.is_null(source_index)) { return; } + } update_target_element{}(target, target_index, source, source_index); } }; diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh new file mode 100644 index 00000000000..50e89c727ff --- /dev/null +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -0,0 +1,277 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +struct update_target_element_gmem { + __device__ void operator()(cudf::mutable_column_device_view, + cudf::size_type, + cudf::column_device_view, + cuda::std::byte*, + cudf::size_type) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MIN, + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using DeviceType = cudf::detail::underlying_target_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_min(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MAX, + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using DeviceType = cudf::detail::underlying_target_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_max(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::SUM, + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_timestamp()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using DeviceType = cudf::detail::underlying_target_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +// The shared memory will already have it squared +template +struct update_target_element_gmem< + Source, + cudf::aggregation::SUM_OF_SQUARES, + cuda::std::enable_if_t()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + Target value = static_cast(source_casted[source_index]); + + cudf::detail::atomic_add(&target.element(target_index), value); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::PRODUCT, + cuda::std::enable_if_t()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + + 
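+ // The shared-memory partial result arrives as raw bytes; reinterpret it as the target type before the atomic multiply into the global-memory target.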
Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_mul(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +// Assuming that the target column of COUNT_VALID, COUNT_ALL would be using fixed_width column and +// non-fixed point column +template +struct update_target_element_gmem< + Source, + cudf::aggregation::COUNT_VALID, + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + // It is assumed the output for COUNT_VALID is initialized to be all valid + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::COUNT_ALL, + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + // It is assumed the output for COUNT_ALL is initialized to be all valid + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::ARGMAX, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* source_casted = reinterpret_cast(source); + auto source_argmax_index = source_casted[source_index]; + auto old = cudf::detail::atomic_cas( + &target.element(target_index), cudf::detail::ARGMAX_SENTINEL, source_argmax_index); + if (old != cudf::detail::ARGMAX_SENTINEL) { + while (source_column.element(source_argmax_index) > + source_column.element(old)) { + old = + cudf::detail::atomic_cas(&target.element(target_index), old, source_argmax_index); + } + } + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; +template +struct update_target_element_gmem< + Source, + cudf::aggregation::ARGMIN, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* source_casted = reinterpret_cast(source); + auto source_argmin_index = source_casted[source_index]; + auto old = cudf::detail::atomic_cas( + &target.element(target_index), cudf::detail::ARGMIN_SENTINEL, source_argmin_index); + if (old != cudf::detail::ARGMIN_SENTINEL) { + while (source_column.element(source_argmin_index) < + source_column.element(old)) { + old = + cudf::detail::atomic_cas(&target.element(target_index), old, 
source_argmin_index); + } + } + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +/** + * @brief A functor that updates a single element in the target column stored in global memory by + * applying an aggregation operation to a corresponding element from a source column in shared + * memory. + * + * This functor can NOT be used for dictionary columns. + * + * This is a redundant copy replicating the behavior of `elementwise_aggregator` from + * `cudf/detail/aggregation/device_aggregators.cuh`. The key difference is that this functor accepts + * a pointer to raw bytes as the source, as `column_device_view` cannot yet be constructed from + * shared memory. + */ +struct gmem_element_aggregator { + template + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + bool* source_mask, + cudf::size_type source_index) const noexcept + { + // Early exit for all aggregation kinds since shared memory aggregation of + // `COUNT_ALL` is always valid + if (!source_mask[source_index]) { return; } + + update_target_element_gmem{}( + target, target_index, source_column, source, source_index); + } +}; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh new file mode 100644 index 00000000000..9cbeeb34b86 --- /dev/null +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +struct update_target_element_shmem { + __device__ void operator()( + cuda::std::byte*, bool*, cudf::size_type, cudf::column_device_view, cudf::size_type) const + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MIN, + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_min(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MAX, + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_max(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::SUM, + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_timestamp()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::SUM_OF_SQUARES, + cuda::std::enable_if_t()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + auto value = static_cast(source.element(source_index)); + cudf::detail::atomic_add(&target_casted[target_index], value * value); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::PRODUCT, + cuda::std::enable_if_t()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = 
reinterpret_cast(target); + cudf::detail::atomic_mul(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::COUNT_VALID, + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + // The nullability was checked prior to this call in the `shmem_element_aggregator` functor + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], Target{1}); + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::COUNT_ALL, + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], Target{1}); + + // Assumes target is already set to be valid + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::ARGMAX, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + auto old = cudf::detail::atomic_cas( + &target_casted[target_index], cudf::detail::ARGMAX_SENTINEL, source_index); + if (old != cudf::detail::ARGMAX_SENTINEL) { + while (source.element(source_index) > source.element(old)) { + old = cudf::detail::atomic_cas(&target_casted[target_index], old, source_index); + } + } + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::ARGMIN, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + auto old = cudf::detail::atomic_cas( + &target_casted[target_index], cudf::detail::ARGMIN_SENTINEL, source_index); + if (old != cudf::detail::ARGMIN_SENTINEL) { + while (source.element(source_index) < source.element(old)) { + old = cudf::detail::atomic_cas(&target_casted[target_index], old, source_index); + } + } + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +/** + * @brief A functor that updates a single element in the target column stored in shared memory by + * applying an aggregation operation to a corresponding element from a source column in global + * memory. + * + * This functor can NOT be used for dictionary columns. + * + * This is a redundant copy replicating the behavior of `elementwise_aggregator` from + * `cudf/detail/aggregation/device_aggregators.cuh`. 
The key difference is that this functor accepts + * a pointer to raw bytes as the source, as `column_device_view` cannot yet be constructed from + * shared memory. + */ +struct shmem_element_aggregator { + template + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + // Check nullability for all aggregation kinds but `COUNT_ALL` + if constexpr (k != cudf::aggregation::COUNT_ALL) { + if (source.is_null(source_index)) { return; } + } + update_target_element_shmem{}( + target, target_mask, target_index, source, source_index); + } +}; +} // namespace cudf::groupby::detail::hash From b8917229f8a2446c7e5f697475f76743a05e6856 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 17 Oct 2024 19:02:30 -0700 Subject: [PATCH 086/117] Control whether a file data source memory-maps the file with an environment variable (#17004) Adds an environment variable, `LIBCUDF_MMAP_ENABLED`, to control whether we memory map the input file in the data source. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Nghia Truong (https://github.com/ttnghia) - Tianyu Liu (https://github.com/kingcrimsontianyu) URL: https://github.com/rapidsai/cudf/pull/17004 --- cpp/src/io/utilities/datasource.cpp | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 28f7f08521e..2daaecadca6 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -15,6 +15,7 @@ */ #include "file_io_utilities.hpp" +#include "getenv_or.hpp" #include #include @@ -392,14 +393,21 @@ std::unique_ptr datasource::create(std::string const& filepath, size_t offset, size_t max_size_estimate) { -#ifdef CUFILE_FOUND - if (cufile_integration::is_always_enabled()) { - // avoid mmap as GDS is expected to be used for most reads + auto const use_memory_mapping = [] { + auto const policy = getenv_or("LIBCUDF_MMAP_ENABLED", std::string{"ON"}); + + if (policy == "ON") { return true; } + if (policy == "OFF") { return false; } + + CUDF_FAIL("Invalid LIBCUDF_MMAP_ENABLED value: " + policy); + }(); + + if (use_memory_mapping) { + return std::make_unique(filepath.c_str(), offset, max_size_estimate); + } else { + // `file_source` reads the file directly, without memory mapping return std::make_unique(filepath.c_str()); } -#endif - // Use our own memory mapping implementation for direct file reads - return std::make_unique(filepath.c_str(), offset, max_size_estimate); } std::unique_ptr datasource::create(host_buffer const& buffer) From 6ca721ca95925328f4d7f2ae473c69a4a0d38af7 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 18 Oct 2024 13:24:41 -0400 Subject: [PATCH 087/117] Fix the GDS read/write segfault/bus error when the cuFile policy is set to GDS or ALWAYS (#17122) When `LIBCUDF_CUFILE_POLICY` is set to `GDS` or `ALWAYS`, cuDF uses an internal implementation to call the cuFile API and harness the GDS feature. Recent tests with these two settings were unsuccessful due to program crash. Specifically, for the `PARQUET_READER_NVBENCH`'s `parquet_read_io_compression` benchmark: - GDS write randomly crashed with segmentation fault (SIGSEGV). - GDS read randomly crashed with bus error (SIGBUS). - At the time of crash, stack frame is randomly corrupted. 
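Such crashes are the classic signature of reading through a stale stack reference. Below is a minimal, self-contained sketch of the failure mode (a hypothetical illustration, not the actual cudf code; `tasks` stands in for the thread pool's task queue):

```cpp
#include <functional>
#include <iostream>
#include <vector>

int main()
{
  std::vector<std::function<int()>> tasks;  // stands in for the thread pool's task queue
  auto submit = [&tasks](int slice) {
    // BUG: `[&]` stores a reference to `slice`, a parameter that lives in
    // this lambda's stack frame and is destroyed as soon as `submit` returns.
    tasks.push_back([&] { return slice * 2; });
    // FIX (the 1-char change): capture by value instead, i.e. `[=]`.
  };
  submit(21);
  std::cout << tasks.front()() << "\n";  // undefined behavior: reads through a dangling reference
}
```

With a by-value capture, each deferred task owns its own copy of the captured state, so its lifetime no longer depends on the enclosing stack frame.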
The root cause is the use of a dangling reference, which occurs when a variable is captured by reference by nested lambdas. This PR performs a hotfix that turns out to be a 1-char change. Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - David Wendt (https://github.com/davidwendt) - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17122 --- cpp/src/io/utilities/file_io_utilities.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index d7b54399f8d..98ed9b28f0a 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -239,7 +239,7 @@ std::vector> make_sliced_tasks( std::vector> slice_tasks; std::transform(slices.cbegin(), slices.cend(), std::back_inserter(slice_tasks), [&](auto& slice) { return pool.submit_task( - [&] { return function(ptr + slice.offset, slice.size, offset + slice.offset); }); + [=] { return function(ptr + slice.offset, slice.size, offset + slice.offset); }); }); return slice_tasks; } From e1c9a5a524f33c2f429f6efaf4dc6c20a0e764e4 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 18 Oct 2024 14:22:18 -0400 Subject: [PATCH 088/117] Fix clang-tidy violations for span.hpp and hostdevice_vector.hpp (#17124) Errors reported here: https://github.com/rapidsai/cudf/actions/runs/11398977412/job/31716929242 Just adding `[[nodiscard]]` to a few member functions. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/17124 --- cpp/include/cudf/utilities/span.hpp | 8 ++++---- cpp/src/io/utilities/hostdevice_vector.hpp | 23 +++++++++++++++------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index d558cfb5e85..21ee4fa9e9b 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -425,21 +425,21 @@ class base_2dspan { * * @return A pointer to the first element of the span */ - constexpr auto data() const noexcept { return _flat.data(); } + [[nodiscard]] constexpr auto data() const noexcept { return _flat.data(); } /** * @brief Returns the size in the span as pair. * * @return pair representing rows and columns size of the span */ - constexpr auto size() const noexcept { return _size; } + [[nodiscard]] constexpr auto size() const noexcept { return _size; } /** * @brief Returns the number of elements in the span. * * @return Number of elements in the span */ - constexpr auto count() const noexcept { return _flat.size(); } + [[nodiscard]] constexpr auto count() const noexcept { return _flat.size(); } /** * @brief Checks if the span is empty.
@@ -467,7 +467,7 @@ class base_2dspan { * * @return A flattened span of the 2D span */ - constexpr RowType flat_view() const { return _flat; } + [[nodiscard]] constexpr RowType flat_view() const { return _flat; } /** * @brief Construct a 2D span from another 2D span of convertible type diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index af1ba16a424..f969b45727b 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -176,13 +176,19 @@ class hostdevice_2dvector { operator device_2dspan() const { return {device_span{_data}, _size.second}; } device_2dspan device_view() { return static_cast>(*this); } - device_2dspan device_view() const { return static_cast>(*this); } + [[nodiscard]] device_2dspan device_view() const + { + return static_cast>(*this); + } operator host_2dspan() { return {host_span{_data}, _size.second}; } operator host_2dspan() const { return {host_span{_data}, _size.second}; } host_2dspan host_view() { return static_cast>(*this); } - host_2dspan host_view() const { return static_cast>(*this); } + [[nodiscard]] host_2dspan host_view() const + { + return static_cast>(*this); + } host_span operator[](size_t row) { @@ -194,16 +200,19 @@ class hostdevice_2dvector { return host_span{_data}.subspan(row * _size.second, _size.second); } - auto size() const noexcept { return _size; } - auto count() const noexcept { return _size.first * _size.second; } - auto is_empty() const noexcept { return count() == 0; } + [[nodiscard]] auto size() const noexcept { return _size; } + [[nodiscard]] auto count() const noexcept { return _size.first * _size.second; } + [[nodiscard]] auto is_empty() const noexcept { return count() == 0; } T* base_host_ptr(size_t offset = 0) { return _data.host_ptr(offset); } T* base_device_ptr(size_t offset = 0) { return _data.device_ptr(offset); } - T const* base_host_ptr(size_t offset = 0) const { return _data.host_ptr(offset); } + [[nodiscard]] T const* base_host_ptr(size_t offset = 0) const { return _data.host_ptr(offset); } - T const* base_device_ptr(size_t offset = 0) const { return _data.device_ptr(offset); } + [[nodiscard]] T const* base_device_ptr(size_t offset = 0) const + { + return _data.device_ptr(offset); + } [[nodiscard]] size_t size_bytes() const noexcept { return _data.size_bytes(); } From e242dce9331a544188b591688949fc7d43cc362d Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 18 Oct 2024 12:24:00 -0700 Subject: [PATCH 089/117] Disable the Parquet reader's wide lists tables GTest by default (#17120) This PR disables Parquet reader's wide lists table gtest by default as it takes several minutes to complete with memcheck. See the discussion on PR #17059 (this [comment](https://github.com/rapidsai/cudf/pull/17059#discussion_r1805342332)) for more context. 
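As an aside, a test disabled with the `DISABLED_` prefix can still be run on demand via GoogleTest's standard flags, e.g. (the test binary name here is illustrative):

```
./PARQUET_TEST --gtest_also_run_disabled_tests --gtest_filter='*ListsWideTable*'
```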
Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17120 --- cpp/tests/io/parquet_reader_test.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index ab4645c2e25..7986a3c6d70 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -2725,7 +2725,9 @@ TYPED_TEST(ParquetReaderPredicatePushdownTest, FilterTyped) CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result_table); } -TEST_F(ParquetReaderTest, ListsWideTable) +// The test below requires several minutes to complete with memcheck, thus it is disabled by +// default. +TEST_F(ParquetReaderTest, DISABLED_ListsWideTable) { auto constexpr num_rows = 2; auto constexpr num_cols = 26'755; // for slightly over 2B keys From 6ad90742f5a1efa5eecbbad25dddc46c1ed5c801 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Fri, 18 Oct 2024 15:10:09 -0500 Subject: [PATCH 090/117] Add custom "fused" groupby aggregation to Dask cuDF (#17009) The legacy Dask cuDF implementation uses a custom code path for GroupBy aggregations. However, when query-planning is enabled (the default), we use the same algorithm as the pandas backend. This PR ports the custom "fused aggregation" code path over to the dask-expr version of Dask cuDF. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17009 --- python/dask_cudf/dask_cudf/expr/_expr.py | 215 +++++++++++++++++- python/dask_cudf/dask_cudf/expr/_groupby.py | 27 ++- .../dask_cudf/dask_cudf/tests/test_groupby.py | 23 +- 3 files changed, 257 insertions(+), 8 deletions(-) diff --git a/python/dask_cudf/dask_cudf/expr/_expr.py b/python/dask_cudf/dask_cudf/expr/_expr.py index af83a01da98..4a9f4de8b9c 100644 --- a/python/dask_cudf/dask_cudf/expr/_expr.py +++ b/python/dask_cudf/dask_cudf/expr/_expr.py @@ -6,11 +6,20 @@ from dask_expr import new_collection from dask_expr._cumulative import CumulativeBlockwise from dask_expr._expr import Elemwise, Expr, RenameAxis, VarColumns +from dask_expr._groupby import ( + DecomposableGroupbyAggregation, + GroupbyAggregation, +) from dask_expr._reductions import Reduction, Var from dask_expr.io.io import FusedParquetIO from dask_expr.io.parquet import ReadParquetPyarrowFS -from dask.dataframe.core import is_dataframe_like, make_meta, meta_nonempty +from dask.dataframe.core import ( + _concat, + is_dataframe_like, + make_meta, + meta_nonempty, +) from dask.dataframe.dispatch import is_categorical_dtype from dask.typing import no_default @@ -21,6 +30,210 @@ ## +def _get_spec_info(gb): + if isinstance(gb.arg, (dict, list)): + aggs = gb.arg.copy() + else: + aggs = gb.arg + + if gb._slice and not isinstance(aggs, dict): + aggs = {gb._slice: aggs} + + gb_cols = gb._by_columns + if isinstance(gb_cols, str): + gb_cols = [gb_cols] + columns = [c for c in gb.frame.columns if c not in gb_cols] + if not isinstance(aggs, dict): + aggs = {col: aggs for col in columns} + + # Assert if our output will have a MultiIndex; this will be the case if + # any value in the `aggs` dict is not a string (i.e. 
multiple/named + # aggregations per column) + str_cols_out = True + aggs_renames = {} + for col in aggs: + if isinstance(aggs[col], str) or callable(aggs[col]): + aggs[col] = [aggs[col]] + elif isinstance(aggs[col], dict): + str_cols_out = False + col_aggs = [] + for k, v in aggs[col].items(): + aggs_renames[col, v] = k + col_aggs.append(v) + aggs[col] = col_aggs + else: + str_cols_out = False + if col in gb_cols: + columns.append(col) + + return { + "aggs": aggs, + "columns": columns, + "str_cols_out": str_cols_out, + "aggs_renames": aggs_renames, + } + + +def _get_meta(gb): + spec_info = gb.spec_info + gb_cols = gb._by_columns + aggs = spec_info["aggs"].copy() + aggs_renames = spec_info["aggs_renames"] + if spec_info["str_cols_out"]: + # Metadata should use `str` for dict values if that is + # what the user originally specified (column names will + # be str, rather than tuples). + for col in aggs: + aggs[col] = aggs[col][0] + _meta = gb.frame._meta.groupby(gb_cols).agg(aggs) + if aggs_renames: + col_array = [] + agg_array = [] + for col, agg in _meta.columns: + col_array.append(col) + agg_array.append(aggs_renames.get((col, agg), agg)) + _meta.columns = pd.MultiIndex.from_arrays([col_array, agg_array]) + return _meta + + +class DecomposableCudfGroupbyAgg(DecomposableGroupbyAggregation): + sep = "___" + + @functools.cached_property + def spec_info(self): + return _get_spec_info(self) + + @functools.cached_property + def _meta(self): + return _get_meta(self) + + @property + def shuffle_by_index(self): + return False # We always group by column(s) + + @classmethod + def chunk(cls, df, *by, **kwargs): + from dask_cudf.groupby import _groupby_partition_agg + + return _groupby_partition_agg(df, **kwargs) + + @classmethod + def combine(cls, inputs, **kwargs): + from dask_cudf.groupby import _tree_node_agg + + return _tree_node_agg(_concat(inputs), **kwargs) + + @classmethod + def aggregate(cls, inputs, **kwargs): + from dask_cudf.groupby import _finalize_gb_agg + + return _finalize_gb_agg(_concat(inputs), **kwargs) + + @property + def chunk_kwargs(self) -> dict: + dropna = True if self.dropna is None else self.dropna + return { + "gb_cols": self._by_columns, + "aggs": self.spec_info["aggs"], + "columns": self.spec_info["columns"], + "dropna": dropna, + "sort": self.sort, + "sep": self.sep, + } + + @property + def combine_kwargs(self) -> dict: + dropna = True if self.dropna is None else self.dropna + return { + "gb_cols": self._by_columns, + "dropna": dropna, + "sort": self.sort, + "sep": self.sep, + } + + @property + def aggregate_kwargs(self) -> dict: + dropna = True if self.dropna is None else self.dropna + final_columns = self._slice or self._meta.columns + return { + "gb_cols": self._by_columns, + "aggs": self.spec_info["aggs"], + "columns": self.spec_info["columns"], + "final_columns": final_columns, + "as_index": True, + "dropna": dropna, + "sort": self.sort, + "sep": self.sep, + "str_cols_out": self.spec_info["str_cols_out"], + "aggs_renames": self.spec_info["aggs_renames"], + } + + +class CudfGroupbyAgg(GroupbyAggregation): + @functools.cached_property + def spec_info(self): + return _get_spec_info(self) + + @functools.cached_property + def _meta(self): + return _get_meta(self) + + def _lower(self): + return DecomposableCudfGroupbyAgg( + self.frame, + self.arg, + self.observed, + self.dropna, + self.split_every, + self.split_out, + self.sort, + self.shuffle_method, + self._slice, + *self.by, + ) + + +def _maybe_get_custom_expr( + gb, + aggs, + split_every=None, + split_out=None, + 
shuffle_method=None, + **kwargs, +): + from dask_cudf.groupby import ( + OPTIMIZED_AGGS, + _aggs_optimized, + _redirect_aggs, + ) + + if kwargs: + # Unsupported key-word arguments + return None + + if not hasattr(gb.obj._meta, "to_pandas"): + # Not cuDF-backed data + return None + + _aggs = _redirect_aggs(aggs) + if not _aggs_optimized(_aggs, OPTIMIZED_AGGS): + # One or more aggregations are unsupported + return None + + return CudfGroupbyAgg( + gb.obj.expr, + _aggs, + gb.observed, + gb.dropna, + split_every, + split_out, + gb.sort, + shuffle_method, + gb._slice, + *gb.by, + ) + + class CudfFusedParquetIO(FusedParquetIO): @staticmethod def _load_multiple_files( diff --git a/python/dask_cudf/dask_cudf/expr/_groupby.py b/python/dask_cudf/dask_cudf/expr/_groupby.py index 65688115b59..8a16fe7615d 100644 --- a/python/dask_cudf/dask_cudf/expr/_groupby.py +++ b/python/dask_cudf/dask_cudf/expr/_groupby.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from dask_expr._collection import new_collection from dask_expr._groupby import ( GroupBy as DXGroupBy, SeriesGroupBy as DXSeriesGroupBy, @@ -11,6 +12,8 @@ from cudf.core.groupby.groupby import _deprecate_collect +from dask_cudf.expr._expr import _maybe_get_custom_expr + ## ## Custom groupby classes ## @@ -54,9 +57,16 @@ def _translate_arg(arg): return arg -# TODO: These classes are mostly a work-around for missing -# `observed=False` support. -# See: https://github.com/rapidsai/cudf/issues/15173 +# We define our own GroupBy classes in Dask cuDF for +# the following reasons: +# (1) We want to use a custom `aggregate` algorithm +# that performs multiple aggregations on the +# same dataframe partition at once. The upstream +# algorithm breaks distinct aggregations into +# separate tasks. +# (2) We need to work around missing `observed=False` +# support: +# https://github.com/rapidsai/cudf/issues/15173 class GroupBy(DXGroupBy): @@ -89,8 +99,15 @@ def collect(self, **kwargs): _deprecate_collect() return self._single_agg(ListAgg, **kwargs) - def aggregate(self, arg, **kwargs): - return super().aggregate(_translate_arg(arg), **kwargs) + def aggregate(self, arg, fused=True, **kwargs): + if ( + fused + and (expr := _maybe_get_custom_expr(self, arg, **kwargs)) + is not None + ): + return new_collection(expr) + else: + return super().aggregate(_translate_arg(arg), **kwargs) class SeriesGroupBy(DXSeriesGroupBy): diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index e30474f6b94..042e69d86f4 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -14,7 +14,11 @@ import dask_cudf from dask_cudf.groupby import OPTIMIZED_AGGS, _aggs_optimized -from dask_cudf.tests.utils import QUERY_PLANNING_ON, xfail_dask_expr +from dask_cudf.tests.utils import ( + QUERY_PLANNING_ON, + require_dask_expr, + xfail_dask_expr, +) def assert_cudf_groupby_layers(ddf): @@ -556,10 +560,22 @@ def test_groupby_categorical_key(): ), ], ) +@pytest.mark.parametrize( + "fused", + [ + True, + pytest.param( + False, + marks=require_dask_expr("Not supported by legacy API"), + ), + ], +) @pytest.mark.parametrize("split_out", ["use_dask_default", 1, 2]) @pytest.mark.parametrize("split_every", [False, 4]) @pytest.mark.parametrize("npartitions", [1, 10]) -def test_groupby_agg_params(npartitions, split_every, split_out, as_index): +def test_groupby_agg_params( + npartitions, split_every, split_out, fused, as_index +): df = cudf.datasets.randomdata( nrows=150, 
dtypes={"name": str, "a": int, "b": int, "c": float}, @@ -574,6 +590,7 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): "c": ["mean", "std", "var"], } + fused_kwarg = {"fused": fused} if QUERY_PLANNING_ON else {} split_kwargs = {"split_every": split_every, "split_out": split_out} if split_out == "use_dask_default": split_kwargs.pop("split_out") @@ -593,6 +610,7 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): ddf.groupby(["name", "a"], sort=True, **maybe_as_index) .aggregate( agg_dict, + **fused_kwarg, **split_kwargs, ) .compute() @@ -614,6 +632,7 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): # Full check (`sort=False`) gr = ddf.groupby(["name", "a"], sort=False, **maybe_as_index).aggregate( agg_dict, + **fused_kwarg, **split_kwargs, ) pr = pddf.groupby(["name", "a"], sort=False).agg( From 98eef67d12670bd592022201b3c9dcc12374a34a Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 18 Oct 2024 14:55:41 -0700 Subject: [PATCH 091/117] Extend `device_scalar` to optionally use pinned bounce buffer (#16947) Depends on https://github.com/rapidsai/cudf/pull/16945 Added `cudf::detail::device_scalar`, derived from `rmm::device_scalar`. The new class overrides function members that perform copies between host and device. New implementation uses a `cudf::detail::host_vector` as a bounce buffer to avoid performing a pageable copy. Replaced `rmm::device_scalar` with `cudf::detail::device_scalar` across libcudf. Authors: - Vukasin Milovanovic (https://github.com/vuule) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Basit Ayantunde (https://github.com/lamarrr) - Vyas Ramasubramani (https://github.com/vyasr) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/16947 --- cpp/include/cudf/detail/copy_if.cuh | 4 +- cpp/include/cudf/detail/copy_if_else.cuh | 5 +- cpp/include/cudf/detail/copy_range.cuh | 4 +- cpp/include/cudf/detail/device_scalar.hpp | 103 ++++++++++++++++++++++ cpp/include/cudf/detail/null_mask.cuh | 4 +- cpp/include/cudf/detail/valid_if.cuh | 4 +- cpp/include/cudf/scalar/scalar.hpp | 5 +- cpp/src/bitmask/null_mask.cu | 4 +- cpp/src/copying/concatenate.cu | 5 +- cpp/src/copying/get_element.cu | 7 +- cpp/src/groupby/sort/group_quantiles.cu | 3 +- cpp/src/groupby/sort/group_std.cu | 4 +- cpp/src/interop/to_arrow_device.cu | 12 +-- cpp/src/io/json/json_normalization.cu | 6 +- cpp/src/io/json/nested_json_gpu.cu | 8 +- cpp/src/io/orc/reader_impl_decode.cu | 4 +- cpp/src/io/parquet/error.hpp | 2 +- cpp/src/io/parquet/parquet_gpu.hpp | 3 +- cpp/src/io/utilities/data_casting.cu | 5 +- cpp/src/io/utilities/type_inference.cu | 5 +- cpp/src/join/conditional_join.cu | 11 +-- cpp/src/join/distinct_hash_join.cu | 1 - cpp/src/join/mixed_join_size_kernel.cuh | 3 +- cpp/src/json/json_path.cu | 3 +- cpp/src/reductions/all.cu | 4 +- cpp/src/reductions/any.cu | 4 +- cpp/src/reductions/minmax.cu | 13 +-- cpp/src/replace/nulls.cu | 3 +- cpp/src/replace/replace.cu | 4 +- cpp/src/rolling/detail/rolling.cuh | 6 +- cpp/src/strings/case.cu | 3 +- cpp/src/strings/copying/concatenate.cu | 4 +- cpp/src/strings/replace/find_replace.cu | 2 +- cpp/src/strings/replace/multi.cu | 2 +- cpp/src/strings/replace/replace.cu | 2 +- cpp/src/strings/split/split.cuh | 3 +- cpp/src/text/tokenize.cu | 3 +- 37 files changed, 192 insertions(+), 76 deletions(-) create mode 100644 cpp/include/cudf/detail/device_scalar.hpp diff --git a/cpp/include/cudf/detail/copy_if.cuh 
b/cpp/include/cudf/detail/copy_if.cuh index dfb646c66c4..4159e324472 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -36,7 +37,6 @@ #include #include -#include #include #include @@ -256,7 +256,7 @@ struct scatter_gather_functor { cudf::detail::grid_1d grid{input.size(), block_size, per_thread}; - rmm::device_scalar null_count{0, stream}; + cudf::detail::device_scalar null_count{0, stream}; if (output.nullable()) { // Have to initialize the output mask to all zeros because we may update // it with atomicOr(). diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index a70cd5a0661..5dc75b1a3fb 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -19,12 +19,11 @@ #include #include #include +#include #include #include #include -#include - #include #include @@ -171,7 +170,7 @@ std::unique_ptr copy_if_else(bool nullable, // if we have validity in the output if (nullable) { - rmm::device_scalar valid_count{0, stream}; + cudf::detail::device_scalar valid_count{0, stream}; // call the kernel copy_if_else_kernel diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index 3aa136d630b..fcb80fe45f7 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -27,7 +28,6 @@ #include #include -#include #include #include @@ -154,7 +154,7 @@ void copy_range(SourceValueIterator source_value_begin, auto grid = cudf::detail::grid_1d{num_items, block_size, 1}; if (target.nullable()) { - rmm::device_scalar null_count(target.null_count(), stream); + cudf::detail::device_scalar null_count(target.null_count(), stream); auto kernel = copy_range_kernel; diff --git a/cpp/include/cudf/detail/device_scalar.hpp b/cpp/include/cudf/detail/device_scalar.hpp new file mode 100644 index 00000000000..16ca06c6561 --- /dev/null +++ b/cpp/include/cudf/detail/device_scalar.hpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +#include +#include +#include + +namespace CUDF_EXPORT cudf { +namespace detail { + +template +class device_scalar : public rmm::device_scalar { + public: +#ifdef __CUDACC__ +#pragma nv_exec_check_disable +#endif + ~device_scalar() = default; + +// Implementation is the same as what compiler should generate +// Could not use default move constructor as 11.8 compiler fails to generate it +#ifdef __CUDACC__ +#pragma nv_exec_check_disable +#endif + device_scalar(device_scalar&& other) noexcept + : rmm::device_scalar{std::move(other)}, bounce_buffer{std::move(other.bounce_buffer)} + { + } + device_scalar& operator=(device_scalar&&) noexcept = default; + + device_scalar(device_scalar const&) = delete; + device_scalar& operator=(device_scalar const&) = delete; + + device_scalar() = delete; + + explicit device_scalar( + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) + : rmm::device_scalar(stream, mr), bounce_buffer{make_host_vector(1, stream)} + { + } + + explicit device_scalar( + T const& initial_value, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) + : rmm::device_scalar(stream, mr), bounce_buffer{make_host_vector(1, stream)} + { + bounce_buffer[0] = initial_value; + cuda_memcpy_async(device_span{this->data(), 1}, bounce_buffer, stream); + } + + device_scalar(device_scalar const& other, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) + : rmm::device_scalar(other, stream, mr), bounce_buffer{make_host_vector(1, stream)} + { + } + + [[nodiscard]] T value(rmm::cuda_stream_view stream) const + { + cuda_memcpy(bounce_buffer, device_span{this->data(), 1}, stream); + return bounce_buffer[0]; + } + + void set_value_async(T const& value, rmm::cuda_stream_view stream) + { + bounce_buffer[0] = value; + cuda_memcpy_async(device_span{this->data(), 1}, bounce_buffer, stream); + } + + void set_value_async(T&& value, rmm::cuda_stream_view stream) + { + bounce_buffer[0] = std::move(value); + cuda_memcpy_async(device_span{this->data(), 1}, bounce_buffer, stream); + } + + void set_value_to_zero_async(rmm::cuda_stream_view stream) { set_value_async(T{}, stream); } + + private: + mutable cudf::detail::host_vector bounce_buffer; +}; + +} // namespace detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 327c732716c..482265d633e 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -25,7 +26,6 @@ #include #include -#include #include #include @@ -165,7 +165,7 @@ size_type inplace_bitmask_binop(Binop op, "Mask pointer cannot be null"); rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref(); - rmm::device_scalar d_counter{0, stream, mr}; + cudf::detail::device_scalar d_counter{0, stream, mr}; rmm::device_uvector d_masks(masks.size(), stream, mr); rmm::device_uvector d_begin_bits(masks_begin_bits.size(), stream, mr); diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index cfb2e70bfed..af182b69c3a 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -25,7 +26,6 @@ #include #include -#include #include @@ -101,7 
+101,7 @@ std::pair valid_if(InputIterator begin, size_type null_count{0}; if (size > 0) { - rmm::device_scalar valid_count{0, stream}; + cudf::detail::device_scalar valid_count{0, stream}; constexpr size_type block_size{256}; grid_1d grid{size, block_size}; diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index 66be2a12fbe..360dde11fc0 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -94,8 +95,8 @@ class scalar { [[nodiscard]] bool const* validity_data() const; protected: - data_type _type{type_id::EMPTY}; ///< Logical type of value in the scalar - rmm::device_scalar _is_valid; ///< Device bool signifying validity + data_type _type{type_id::EMPTY}; ///< Logical type of value in the scalar + cudf::detail::device_scalar _is_valid; ///< Device bool signifying validity /** * @brief Move constructor for scalar. diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 4ca05f9c335..e6659f76c7c 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -32,7 +33,6 @@ #include #include -#include #include #include @@ -329,7 +329,7 @@ cudf::size_type count_set_bits(bitmask_type const* bitmask, cudf::detail::grid_1d grid(num_words, block_size); - rmm::device_scalar non_zero_count(0, stream); + cudf::detail::device_scalar non_zero_count(0, stream); count_set_bits_kernel <<>>( diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index b8e140f1fa5..d8419760120 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -162,7 +163,7 @@ size_type concatenate_masks(device_span d_views, size_type output_size, rmm::cuda_stream_view stream) { - rmm::device_scalar d_valid_count(0, stream); + cudf::detail::device_scalar d_valid_count(0, stream); constexpr size_type block_size{256}; cudf::detail::grid_1d config(output_size, block_size); concatenate_masks_kernel @@ -265,7 +266,7 @@ std::unique_ptr fused_concatenate(host_span views, auto out_view = out_col->mutable_view(); auto d_out_view = mutable_column_device_view::create(out_view, stream); - rmm::device_scalar d_valid_count(0, stream); + cudf::detail::device_scalar d_valid_count(0, stream); // Launch kernel constexpr size_type block_size{256}; diff --git a/cpp/src/copying/get_element.cu b/cpp/src/copying/get_element.cu index 29a28f81d1a..80b0bd5242f 100644 --- a/cpp/src/copying/get_element.cu +++ b/cpp/src/copying/get_element.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -71,7 +72,7 @@ struct get_element_functor { auto device_col = column_device_view::create(input, stream); rmm::device_scalar temp_data(stream, mr); - rmm::device_scalar temp_valid(stream, mr); + cudf::detail::device_scalar temp_valid(stream, mr); device_single_thread( [buffer = temp_data.data(), @@ -155,8 +156,8 @@ struct get_element_functor { auto device_col = column_device_view::create(input, stream); - rmm::device_scalar temp_data(stream, mr); - rmm::device_scalar temp_valid(stream, mr); + cudf::detail::device_scalar temp_data(stream, mr); + cudf::detail::device_scalar temp_valid(stream, mr); device_single_thread( [buffer = temp_data.data(), diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu index 
82d557b9f7e..d6c900fb689 100644 --- a/cpp/src/groupby/sort/group_quantiles.cu +++ b/cpp/src/groupby/sort/group_quantiles.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -108,7 +109,7 @@ struct quantiles_functor { auto values_view = column_device_view::create(values, stream); auto group_size_view = column_device_view::create(group_sizes, stream); auto result_view = mutable_column_device_view::create(result->mutable_view(), stream); - auto null_count = rmm::device_scalar(0, stream, mr); + auto null_count = cudf::detail::device_scalar(0, stream, mr); // For each group, calculate quantile if (!cudf::is_dictionary(values.type())) { diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu index 86ee20dbbe2..c3dfac46502 100644 --- a/cpp/src/groupby/sort/group_std.cu +++ b/cpp/src/groupby/sort/group_std.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -27,7 +28,6 @@ #include #include -#include #include #include @@ -134,7 +134,7 @@ struct var_functor { // set nulls auto result_view = mutable_column_device_view::create(*result, stream); - auto null_count = rmm::device_scalar(0, stream, mr); + auto null_count = cudf::detail::device_scalar(0, stream, mr); auto d_null_count = null_count.data(); thrust::for_each_n( rmm::exec_policy(stream), diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu index a2874b46b06..fc1b0226a48 100644 --- a/cpp/src/interop/to_arrow_device.cu +++ b/cpp/src/interop/to_arrow_device.cu @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -35,7 +36,6 @@ #include #include -#include #include #include @@ -60,7 +60,7 @@ template struct is_device_scalar : public std::false_type {}; template -struct is_device_scalar> : public std::true_type {}; +struct is_device_scalar> : public std::true_type {}; template struct is_device_uvector : public std::false_type {}; @@ -232,10 +232,10 @@ int dispatch_to_arrow_device::operator()(cudf::column&& colum // in the offsets buffer. While some arrow implementations may accept a zero-sized // offsets buffer, best practices would be to allocate the buffer with the single value. 
if (nanoarrow_type == NANOARROW_TYPE_STRING) { - auto zero = std::make_unique>(0, stream, mr); + auto zero = std::make_unique>(0, stream, mr); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); } else { - auto zero = std::make_unique>(0, stream, mr); + auto zero = std::make_unique>(0, stream, mr); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); } @@ -466,10 +466,10 @@ int dispatch_to_arrow_device_view::operator()(ArrowArray* out if (column.size() == 0) { // https://github.com/rapidsai/cudf/pull/15047#discussion_r1546528552 if (nanoarrow_type == NANOARROW_TYPE_LARGE_STRING) { - auto zero = std::make_unique>(0, stream, mr); + auto zero = std::make_unique>(0, stream, mr); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); } else { - auto zero = std::make_unique>(0, stream, mr); + auto zero = std::make_unique>(0, stream, mr); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); } diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 2d435dc8e1a..34a87918e57 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -16,6 +16,7 @@ #include "io/fst/lookup_tables.cuh" +#include #include #include #include @@ -24,7 +25,6 @@ #include #include -#include #include #include @@ -316,7 +316,7 @@ void normalize_single_quotes(datasource::owning_buffer& inda stream); rmm::device_buffer outbuf(indata.size() * 2, stream, mr); - rmm::device_scalar outbuf_size(stream, mr); + cudf::detail::device_scalar outbuf_size(stream, mr); parser.Transduce(reinterpret_cast(indata.data()), static_cast(indata.size()), static_cast(outbuf.data()), @@ -401,7 +401,7 @@ std:: stream); rmm::device_uvector outbuf_indices(inbuf.size(), stream, mr); - rmm::device_scalar outbuf_indices_size(stream, mr); + cudf::detail::device_scalar outbuf_indices_size(stream, mr); parser.Transduce(inbuf.data(), static_cast(inbuf.size()), thrust::make_discard_iterator(), diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 69a51fab5dc..534b30a6089 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -21,6 +21,7 @@ #include "nested_json.hpp" #include +#include #include #include #include @@ -34,7 +35,6 @@ #include #include -#include #include #include @@ -1446,7 +1446,7 @@ void get_stack_context(device_span json_in, constexpr StackSymbolT read_symbol = 'x'; // Number of stack operations in the input (i.e., number of '{', '}', '[', ']' outside of quotes) - rmm::device_scalar d_num_stack_ops(stream); + cudf::detail::device_scalar d_num_stack_ops(stream); // Sequence of stack symbols and their position in the original input (sparse representation) rmm::device_uvector stack_ops{json_in.size(), stream}; @@ -1519,7 +1519,7 @@ std::pair, rmm::device_uvector> pr stream); auto const mr = cudf::get_current_device_resource_ref(); - rmm::device_scalar d_num_selected_tokens(stream, mr); + cudf::detail::device_scalar d_num_selected_tokens(stream, mr); rmm::device_uvector filtered_tokens_out{tokens.size(), stream, mr}; rmm::device_uvector filtered_token_indices_out{tokens.size(), stream, mr}; @@ -1638,7 +1638,7 @@ std::pair, rmm::device_uvector> ge std::size_t constexpr max_tokens_per_struct = 6; auto const max_token_out_count = cudf::util::div_rounding_up_safe(json_in.size(), min_chars_per_struct) * max_tokens_per_struct; - rmm::device_scalar 
num_written_tokens{stream}; + cudf::detail::device_scalar num_written_tokens{stream}; // In case we're recovering on invalid JSON lines, post-processing the token stream requires to // see a JSON-line delimiter as the very first item SymbolOffsetT const delimiter_offset = diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index a1e4aa65dcf..c42348a165f 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -22,6 +22,7 @@ #include "io/utilities/hostdevice_span.hpp" #include +#include #include #include #include @@ -32,7 +33,6 @@ #include #include -#include #include #include @@ -451,7 +451,7 @@ void decode_stream_data(int64_t num_dicts, update_null_mask(chunks, out_buffers, stream, mr); } - rmm::device_scalar error_count(0, stream); + cudf::detail::device_scalar error_count(0, stream); gpu::DecodeOrcColumnData(chunks.base_device_ptr(), global_dict.data(), row_groups, diff --git a/cpp/src/io/parquet/error.hpp b/cpp/src/io/parquet/error.hpp index f0fc9fab3ab..8b3d1d7a6c3 100644 --- a/cpp/src/io/parquet/error.hpp +++ b/cpp/src/io/parquet/error.hpp @@ -26,7 +26,7 @@ namespace cudf::io::parquet { /** - * @brief Wrapper around a `rmm::device_scalar` for use in reporting errors that occur in + * @brief Specialized device scalar for use in reporting errors that occur in * kernel calls. * * The `kernel_error` object is created with a `rmm::cuda_stream_view` which is used throughout diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 4f6d41a97da..be502b581af 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -22,14 +22,13 @@ #include "io/parquet/parquet_common.hpp" #include "io/statistics/statistics.cuh" #include "io/utilities/column_buffer.hpp" -#include "io/utilities/hostdevice_vector.hpp" +#include #include #include #include #include -#include #include #include diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index f70171eef68..0c49b2e5d78 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -800,7 +801,7 @@ template static std::unique_ptr parse_string(string_view_pair_it str_tuples, size_type col_size, rmm::device_buffer&& null_mask, - rmm::device_scalar& d_null_count, + cudf::detail::device_scalar& d_null_count, cudf::io::parse_options_view const& options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) @@ -930,7 +931,7 @@ std::unique_ptr parse_data( CUDF_FUNC_RANGE(); if (col_size == 0) { return make_empty_column(col_type); } - auto d_null_count = rmm::device_scalar(null_count, stream); + auto d_null_count = cudf::detail::device_scalar(null_count, stream); auto null_count_data = d_null_count.data(); if (null_mask.is_empty()) { null_mask = cudf::create_null_mask(col_size, mask_state::ALL_VALID, stream, mr); diff --git a/cpp/src/io/utilities/type_inference.cu b/cpp/src/io/utilities/type_inference.cu index 43dc38c4ac6..af32b207d20 100644 --- a/cpp/src/io/utilities/type_inference.cu +++ b/cpp/src/io/utilities/type_inference.cu @@ -18,11 +18,10 @@ #include "io/utilities/string_parsing.hpp" #include "io/utilities/trie.cuh" +#include #include #include -#include - #include #include @@ -242,7 +241,7 @@ cudf::io::column_type_histogram infer_column_type(OptionsView const& options, constexpr int block_size = 128; auto const grid_size = (size + block_size - 1) / block_size; 
-  auto d_column_info = rmm::device_scalar<cudf::io::column_type_histogram>(stream);
+  auto d_column_info = cudf::detail::device_scalar<cudf::io::column_type_histogram>(stream);
   CUDF_CUDA_TRY(cudaMemsetAsync(
     d_column_info.data(), 0, sizeof(cudf::io::column_type_histogram), stream.value()));
 
diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu
index 2ec23e0dc6d..40d1c925889 100644
--- a/cpp/src/join/conditional_join.cu
+++ b/cpp/src/join/conditional_join.cu
@@ -21,6 +21,7 @@
 #include 
 #include 
+#include <cudf/detail/device_scalar.hpp>
 #include 
 #include 
 #include 
@@ -81,7 +82,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_join_anti_semi(
     join_size = *output_size;
   } else {
     // Allocate storage for the counter used to get the size of the join output
-    rmm::device_scalar<std::size_t> size(0, stream, mr);
+    cudf::detail::device_scalar<std::size_t> size(0, stream, mr);
     if (has_nulls) {
       compute_conditional_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, true>
        <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
@@ -94,7 +95,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_join_anti_semi(
     join_size = size.value(stream);
   }
 
-  rmm::device_scalar<size_type> write_index(0, stream);
+  cudf::detail::device_scalar<size_type> write_index(0, stream);
 
   auto left_indices = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);
 
@@ -197,7 +198,7 @@ conditional_join(table_view const& left,
     join_size = *output_size;
   } else {
     // Allocate storage for the counter used to get the size of the join output
-    rmm::device_scalar<std::size_t> size(0, stream, mr);
+    cudf::detail::device_scalar<std::size_t> size(0, stream, mr);
     if (has_nulls) {
       compute_conditional_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, true>
        <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
@@ -231,7 +232,7 @@ conditional_join(table_view const& left,
                           std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
   }
 
-  rmm::device_scalar<size_type> write_index(0, stream);
+  cudf::detail::device_scalar<size_type> write_index(0, stream);
 
   auto left_indices  = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);
   auto right_indices = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);
@@ -342,7 +343,7 @@ std::size_t compute_conditional_join_output_size(table_view const& left,
   auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block;
 
   // Allocate storage for the counter used to get the size of the join output
-  rmm::device_scalar<std::size_t> size(0, stream, mr);
+  cudf::detail::device_scalar<std::size_t> size(0, stream, mr);
 
   // Determine number of output rows without actually building the output to simply
   // find what the size of the output will be.
diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu index c7294152982..515d28201e8 100644 --- a/cpp/src/join/distinct_hash_join.cu +++ b/cpp/src/join/distinct_hash_join.cu @@ -27,7 +27,6 @@ #include #include -#include #include #include diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index 84e9be45030..4049ccf35e1 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -122,7 +123,7 @@ std::size_t launch_compute_mixed_join_output_size( rmm::device_async_resource_ref mr) { // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); + cudf::detail::device_scalar size(0, stream, mr); compute_mixed_join_output_size <<>>( diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu index 59fdbedf089..fb5cf66dd60 100644 --- a/cpp/src/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -1031,7 +1032,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c cudf::detail::create_null_mask(col.size(), mask_state::UNINITIALIZED, stream, mr); // compute results - rmm::device_scalar d_valid_count{0, stream}; + cudf::detail::device_scalar d_valid_count{0, stream}; get_json_object_kernel <<>>( diff --git a/cpp/src/reductions/all.cu b/cpp/src/reductions/all.cu index 67ea29a2cb1..890625830a5 100644 --- a/cpp/src/reductions/all.cu +++ b/cpp/src/reductions/all.cu @@ -16,6 +16,7 @@ #include "simple.cuh" +#include #include #include #include @@ -65,7 +66,8 @@ struct all_fn { cudf::dictionary::detail::make_dictionary_pair_iterator(*d_dict, input.has_nulls()); return thrust::make_transform_iterator(pair_iter, null_iter); }(); - auto d_result = rmm::device_scalar(1, stream, cudf::get_current_device_resource_ref()); + auto d_result = + cudf::detail::device_scalar(1, stream, cudf::get_current_device_resource_ref()); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), input.size(), diff --git a/cpp/src/reductions/any.cu b/cpp/src/reductions/any.cu index 057f038c622..d70da369d72 100644 --- a/cpp/src/reductions/any.cu +++ b/cpp/src/reductions/any.cu @@ -16,6 +16,7 @@ #include "simple.cuh" +#include #include #include #include @@ -65,7 +66,8 @@ struct any_fn { cudf::dictionary::detail::make_dictionary_pair_iterator(*d_dict, input.has_nulls()); return thrust::make_transform_iterator(pair_iter, null_iter); }(); - auto d_result = rmm::device_scalar(0, stream, cudf::get_current_device_resource_ref()); + auto d_result = + cudf::detail::device_scalar(0, stream, cudf::get_current_device_resource_ref()); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), input.size(), diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu index 139de068050..4f6eb23ce5b 100644 --- a/cpp/src/reductions/minmax.cu +++ b/cpp/src/reductions/minmax.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -69,18 +70,18 @@ struct minmax_pair { * @param num_items number of items to reduce * @param binary_op binary operator used to reduce * @param stream CUDA stream to run kernels on. 
- * @return rmm::device_scalar + * @return cudf::detail::device_scalar */ template ::type> -rmm::device_scalar reduce_device(InputIterator d_in, - size_type num_items, - Op binary_op, - rmm::cuda_stream_view stream) +auto reduce_device(InputIterator d_in, + size_type num_items, + Op binary_op, + rmm::cuda_stream_view stream) { OutputType identity{}; - rmm::device_scalar result{identity, stream}; + cudf::detail::device_scalar result{identity, stream}; // Allocate temporary storage size_t storage_bytes = 0; diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 1df1549432f..d0e3358cc34 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -137,7 +138,7 @@ struct replace_nulls_column_kernel_forwarder { auto device_out = cudf::mutable_column_device_view::create(output_view, stream); auto device_replacement = cudf::column_device_view::create(replacement, stream); - rmm::device_scalar valid_counter(0, stream); + cudf::detail::device_scalar valid_counter(0, stream); cudf::size_type* valid_count = valid_counter.data(); replace<<>>( diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 86ec8cfc91e..0cc97ca05e0 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -53,7 +54,6 @@ #include #include -#include #include #include @@ -182,7 +182,7 @@ struct replace_kernel_forwarder { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - rmm::device_scalar valid_counter(0, stream); + cudf::detail::device_scalar valid_counter(0, stream); cudf::size_type* valid_count = valid_counter.data(); auto replace = [&] { diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh index 528700137bf..bc0ee2eb519 100644 --- a/cpp/src/rolling/detail/rolling.cuh +++ b/cpp/src/rolling/detail/rolling.cuh @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -49,7 +50,6 @@ #include #include -#include #include #include @@ -1105,7 +1105,7 @@ struct rolling_window_launcher { auto const d_inp_ptr = column_device_view::create(input, stream); auto const d_default_out_ptr = column_device_view::create(default_outputs, stream); auto const d_out_ptr = mutable_column_device_view::create(output->mutable_view(), stream); - auto d_valid_count = rmm::device_scalar{0, stream}; + auto d_valid_count = cudf::detail::device_scalar{0, stream}; auto constexpr block_size = 256; auto const grid = cudf::detail::grid_1d(input.size(), block_size); @@ -1271,7 +1271,7 @@ std::unique_ptr rolling_window_udf(column_view const& input, udf_agg._output_type, input.size(), cudf::mask_state::UNINITIALIZED, stream, mr); auto output_view = output->mutable_view(); - rmm::device_scalar device_valid_count{0, stream}; + cudf::detail::device_scalar device_valid_count{0, stream}; std::string kernel_name = jitify2::reflection::Template("cudf::rolling::jit::gpu_rolling_new") // diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index 4c015f3cbed..6a7c8ea45e9 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -348,7 +349,7 @@ std::unique_ptr convert_case(strings_column_view const& input, // This check incurs ~20% performance hit for smaller strings and so we only use it // after the threshold check above. 
The check makes very little impact for long strings // but results in a large performance gain when the input contains only single-byte characters. - rmm::device_scalar mb_count(0, stream); + cudf::detail::device_scalar mb_count(0, stream); // cudf::detail::grid_1d is limited to size_type elements auto const num_blocks = util::div_rounding_up_safe(chars_size / bytes_per_thread, block_size); // we only need to check every other byte since either will contain high bit diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 1d9d12686eb..9e4ef47ff79 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -27,7 +28,6 @@ #include #include -#include #include #include @@ -242,7 +242,7 @@ std::unique_ptr concatenate(host_span columns, } { // Copy offsets columns with single kernel launch - rmm::device_scalar d_valid_count(0, stream); + cudf::detail::device_scalar d_valid_count(0, stream); constexpr size_type block_size{256}; cudf::detail::grid_1d config(offsets_count, block_size); diff --git a/cpp/src/strings/replace/find_replace.cu b/cpp/src/strings/replace/find_replace.cu index 8a8001dd81a..957075017ba 100644 --- a/cpp/src/strings/replace/find_replace.cu +++ b/cpp/src/strings/replace/find_replace.cu @@ -14,6 +14,7 @@ * limitations under the License. */ #include +#include #include #include #include @@ -21,7 +22,6 @@ #include #include -#include #include #include diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 352d883bdc5..88f343926c9 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -334,7 +334,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in // Count the number of targets in the entire column. // Note this may over-count in the case where a target spans adjacent strings. - rmm::device_scalar d_count(0, stream); + cudf::detail::device_scalar d_count(0, stream); auto const num_blocks = util::div_rounding_up_safe( util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), block_size); count_targets<<>>(fn, chars_bytes, d_count.data()); diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index 16df0dbabdf..52ddef76c1a 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -285,7 +285,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in // Count the number of targets in the entire column. // Note this may over-count in the case where a target spans adjacent strings. 
- rmm::device_scalar d_target_count(0, stream); + cudf::detail::device_scalar d_target_count(0, stream); constexpr int64_t block_size = 512; constexpr size_type bytes_per_thread = 4; auto const num_blocks = util::div_rounding_up_safe( diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh index 81aca001d53..4b777be9d5b 100644 --- a/cpp/src/strings/split/split.cuh +++ b/cpp/src/strings/split/split.cuh @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -361,7 +362,7 @@ std::pair, rmm::device_uvector> split cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); // count the number of delimiters in the entire column - rmm::device_scalar d_count(0, stream); + cudf::detail::device_scalar d_count(0, stream); if (chars_bytes > 0) { constexpr int64_t block_size = 512; constexpr size_type bytes_per_thread = 4; diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index df25950e6d5..89ca8a089d6 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -221,7 +222,7 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const // To minimize memory, count the number of characters so we can // build the output offsets without an intermediate buffer. // In the worst case each byte is a character so the output is 4x the input. - rmm::device_scalar d_count(0, stream); + cudf::detail::device_scalar d_count(0, stream); auto const num_blocks = cudf::util::div_rounding_up_safe( cudf::util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), block_size); From fdd2b262aa76400d3d57018461eba37892445a4b Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 18 Oct 2024 21:03:45 -0400 Subject: [PATCH 092/117] Changing developer guide int_64_t to int64_t (#17130) Fixes #17129 Authors: - Mike Wilson (https://github.com/hyperbolic2346) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) - Alessandro Bellina (https://github.com/abellina) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/17130 --- cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index fce8adb4c06..311539efbfc 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -370,7 +370,7 @@ any type that cudf supports. 
 For example, a `list_scalar` representing a list of
 
 |Value type|Scalar class|Notes|
 |-|-|-|
 |fixed-width|`fixed_width_scalar<T>`| `T` can be any fixed-width type|
-|numeric|`numeric_scalar<T>` | `T` can be `int8_t`, `int16_t`, `int32_t`, `int_64_t`, `float` or `double`|
+|numeric|`numeric_scalar<T>` | `T` can be `int8_t`, `int16_t`, `int32_t`, `int64_t`, `float` or `double`|
 |fixed-point|`fixed_point_scalar<T>` | `T` can be `numeric::decimal32` or `numeric::decimal64`|
 |timestamp|`timestamp_scalar<T>` | `T` can be `timestamp_D`, `timestamp_s`, etc.|
 |duration|`duration_scalar<T>` | `T` can be `duration_D`, `duration_s`, etc.|

From 1ce2526bde7f77d2da7d0927a052fd9ccf69b9f2 Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Sat, 19 Oct 2024 10:40:25 -0500
Subject: [PATCH 093/117] Replace old host tree algorithm with new algorithm in
 JSON reader (#17019)

This PR replaces the old tree algorithm in the JSON reader with the
experimental algorithm and removes the experimental namespace. The changes
are: removal of the old tree algorithm code, removal of the experimental
namespace, relocation of the `scatter_offsets` code, and an unconditional
call to the new tree algorithm. No functional change is made in this PR;
all unit tests should pass with this change.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/17019
---
 cpp/src/io/json/host_tree_algorithms.cu | 857 ++++++------------------
 cpp/src/io/json/json_column.cu          |  29 +-
 cpp/src/io/json/nested_json.hpp         |  15 -
 3 files changed, 224 insertions(+), 677 deletions(-)

diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu
index f7e8134b68d..7ee652e0239 100644
--- a/cpp/src/io/json/host_tree_algorithms.cu
+++ b/cpp/src/io/json/host_tree_algorithms.cu
@@ -134,12 +134,13 @@ std::vector<std::string> copy_strings_to_host_sync(
     // build std::string vector from chars and offsets
     std::vector<std::string> host_data;
     host_data.reserve(col.size());
-    std::transform(
-      std::begin(h_offsets),
-      std::end(h_offsets) - 1,
-      std::begin(h_offsets) + 1,
-      std::back_inserter(host_data),
-      [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); });
+    std::transform(std::begin(h_offsets),
+                   std::end(h_offsets) - 1,
+                   std::begin(h_offsets) + 1,
+                   std::back_inserter(host_data),
+                   [&h_chars](auto start, auto end) {
+                     return std::string(h_chars.data() + start, end - start);
+                   });
     return host_data;
   };
   return to_host(d_column_names->view());
@@ -173,633 +174,79 @@ rmm::device_uvector<uint8_t> is_all_nulls_each_column(device_span<SymbolT const>
   auto parse_opt = parsing_options(options, stream);
   thrust::for_each_n(
     rmm::exec_policy_nosync(stream),
-    thrust::counting_iterator<size_type>(0),
-    num_nodes,
-    [options = parse_opt.view(),
-     data = input.data(),
-     column_categories = d_column_tree.node_categories.begin(),
-     col_ids = col_ids.begin(),
-     range_begin = tree.node_range_begin.begin(),
-     range_end = tree.node_range_end.begin(),
-     is_all_nulls = is_all_nulls.begin()] __device__(size_type i) {
-      auto const node_category = column_categories[col_ids[i]];
-      if (node_category == NC_STR or node_category == NC_VAL) {
-        auto const is_null_literal = serialized_trie_contains(
-          options.trie_na,
-          {data + range_begin[i], static_cast<size_t>(range_end[i] - range_begin[i])});
-        if (!is_null_literal) is_all_nulls[col_ids[i]] = false;
-      }
-    });
-  return is_all_nulls;
-}
-
-NodeIndexT get_row_array_parent_col_id(device_span<NodeIndexT const> col_ids,
-                                       bool is_enabled_lines,
rmm::cuda_stream_view stream) -{ - NodeIndexT value = parent_node_sentinel; - if (!col_ids.empty()) { - auto const list_node_index = is_enabled_lines ? 0 : 1; - CUDF_CUDA_TRY(cudaMemcpyAsync(&value, - col_ids.data() + list_node_index, - sizeof(NodeIndexT), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - } - return value; -} -/** - * @brief Holds member data pointers of `d_json_column` - * - */ -struct json_column_data { - using row_offset_t = json_column::row_offset_t; - row_offset_t* string_offsets; - row_offset_t* string_lengths; - row_offset_t* child_offsets; - bitmask_type* validity; -}; - -using hashmap_of_device_columns = - std::unordered_map>; - -std::pair, hashmap_of_device_columns> build_tree( - device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); -void scatter_offsets(tree_meta_t const& tree, - device_span col_ids, - device_span row_offsets, - device_span node_ids, - device_span sorted_col_ids, // Reuse this for parent_col_ids - tree_meta_t const& d_column_tree, - host_span ignore_vals, - hashmap_of_device_columns const& columns, - rmm::cuda_stream_view stream); - -/** - * @brief Constructs `d_json_column` from node tree representation - * Newly constructed columns are inserted into `root`'s children. - * `root` must be a list type. - * - * @param input Input JSON string device data - * @param tree Node tree representation of the JSON string - * @param col_ids Column ids of the nodes in the tree - * @param row_offsets Row offsets of the nodes in the tree - * @param root Root node of the `d_json_column` tree - * @param is_array_of_arrays Whether the tree is an array of arrays - * @param options Parsing options specifying the parsing behaviour - * options affecting behaviour are - * is_enabled_lines: Whether the input is a line-delimited JSON - * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the device memory - * of child_offets and validity members of `d_json_column` - */ -void make_device_json_column(device_span input, - tree_meta_t const& tree, - device_span col_ids, - device_span row_offsets, - device_json_column& root, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - bool const is_enabled_lines = options.is_enabled_lines(); - bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); - // make a copy - auto sorted_col_ids = cudf::detail::make_device_uvector_async( - col_ids, stream, cudf::get_current_device_resource_ref()); - - // sort by {col_id} on {node_ids} stable - rmm::device_uvector node_ids(col_ids.size(), stream); - thrust::sequence(rmm::exec_policy_nosync(stream), node_ids.begin(), node_ids.end()); - thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), - sorted_col_ids.begin(), - sorted_col_ids.end(), - node_ids.begin()); - - NodeIndexT const row_array_parent_col_id = - get_row_array_parent_col_id(col_ids, is_enabled_lines, stream); - - // 1. gather column information. 
- auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = - reduce_to_column_tree(tree, - col_ids, - sorted_col_ids, - node_ids, - row_offsets, - is_array_of_arrays, - row_array_parent_col_id, - stream); - auto num_columns = d_unique_col_ids.size(); - std::vector column_names = copy_strings_to_host_sync( - input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); - // array of arrays column names - if (is_array_of_arrays) { - auto const unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); - auto const column_parent_ids = - cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); - TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2; - auto values_column_indices = - get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); - auto h_values_column_indices = - cudf::detail::make_host_vector_sync(values_column_indices, stream); - std::transform(unique_col_ids.begin(), - unique_col_ids.end(), - column_names.cbegin(), - column_names.begin(), - [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( - auto col_id, auto name) mutable { - return column_parent_ids[col_id] == row_array_parent_col_id - ? std::to_string(h_values_column_indices[col_id]) - : name; - }); - } - - auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() { - if (is_enabled_mixed_types_as_string) { - return cudf::detail::make_std_vector_sync( - is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream); - } - return std::vector(); - }(); - auto const [ignore_vals, columns] = build_tree(root, - is_str_column_all_nulls, - d_column_tree, - d_unique_col_ids, - d_max_row_offsets, - column_names, - row_array_parent_col_id, - is_array_of_arrays, - options, - stream, - mr); - - scatter_offsets(tree, - col_ids, - row_offsets, - node_ids, - sorted_col_ids, - d_column_tree, - ignore_vals, - columns, - stream); -} - -std::pair, hashmap_of_device_columns> build_tree( - device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); - auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); - auto column_categories = - cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream); - auto const column_parent_ids = - cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); - auto column_range_beg = - cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream); - auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream); - auto num_columns = d_unique_col_ids.size(); - stream.synchronize(); - - auto to_json_col_type = [](auto category) { - switch (category) { - case NC_STRUCT: return json_col_t::StructColumn; - case NC_LIST: return json_col_t::ListColumn; - case NC_STR: [[fallthrough]]; - case NC_VAL: return json_col_t::StringColumn; - default: return json_col_t::Unknown; - } - }; - auto init_to_zero = [stream](auto& v) { - thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); - }; - - auto initialize_json_columns = [&](auto i, 
auto& col, auto column_category) { - if (column_category == NC_ERR || column_category == NC_FN) { - return; - } else if (column_category == NC_VAL || column_category == NC_STR) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - } else if (column_category == NC_LIST) { - col.child_offsets.resize(max_row_offsets[i] + 2, stream); - init_to_zero(col.child_offsets); - } - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = to_json_col_type(column_category); - }; - - auto reinitialize_as_string = [&](auto i, auto& col) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = json_col_t::StringColumn; - // destroy references of all child columns after this step, by calling remove_child_columns - }; - - path_from_tree tree_path{column_categories, - column_parent_ids, - column_names, - is_array_of_arrays, - row_array_parent_col_id}; - - // 2. generate nested columns tree and its device_memory - // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. - auto h_range_col_id_it = - thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin()); - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<0>(a) < thrust::get<0>(b); - }); - - // use hash map because we may skip field name's col_ids - hashmap_of_device_columns columns; - // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking - std::map, NodeIndexT> mapped_columns; - // find column_ids which are values, but should be ignored in validity - auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); - std::fill(ignore_vals.begin(), ignore_vals.end(), false); - std::vector is_mixed_type_column(num_columns, 0); - std::vector is_pruned(num_columns, 0); - // for columns that are not mixed type but have been forced as string - std::vector forced_as_string_column(num_columns); - columns.try_emplace(parent_node_sentinel, std::ref(root)); - - std::function remove_child_columns = - [&](NodeIndexT this_col_id, device_json_column& col) { - for (auto const& col_name : col.column_order) { - auto child_id = mapped_columns[{this_col_id, col_name}]; - is_mixed_type_column[child_id] = 1; - remove_child_columns(child_id, col.child_columns.at(col_name)); - mapped_columns.erase({this_col_id, col_name}); - columns.erase(child_id); - } - col.child_columns.clear(); // their references are deleted above. 
- col.column_order.clear(); - }; - - auto name_and_parent_index = [&is_array_of_arrays, - &row_array_parent_col_id, - &column_parent_ids, - &column_categories, - &column_names](auto this_col_id) { - std::string name = ""; - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { - if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { - name = column_names[this_col_id]; - } else { - name = list_child_name; - } - } else if (column_categories[parent_col_id] == NC_FN) { - auto field_name_col_id = parent_col_id; - parent_col_id = column_parent_ids[parent_col_id]; - name = column_names[field_name_col_id]; - } else { - CUDF_FAIL("Unexpected parent column category"); - } - return std::pair{name, parent_col_id}; - }; - - // Prune columns that are not required to be parsed. - if (options.is_enabled_prune_columns()) { - for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - // get path of this column, and get its dtype if present in options - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) { - is_pruned[this_col_id] = 1; - continue; - } else { - // make sure all its parents are not pruned. - while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) { - is_pruned[parent_col_id] = 0; - parent_col_id = column_parent_ids[parent_col_id]; - } - } - } - } - - // Build the column tree, also, handles mixed types. - for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - - // if parent is mixed type column or this column is pruned or if parent - // has been forced as string, ignore this column. - if (parent_col_id != parent_node_sentinel && - (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) || - forced_as_string_column[parent_col_id]) { - ignore_vals[this_col_id] = true; - if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } - if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; } - continue; - } - - // If the child is already found, - // replace if this column is a nested column and the existing was a value column - // ignore this column if this column is a value column and the existing was a nested column - auto it = columns.find(parent_col_id); - CUDF_EXPECTS(it != columns.end(), "Parent column not found"); - auto& parent_col = it->second.get(); - bool replaced = false; - if (mapped_columns.count({parent_col_id, name}) > 0) { - auto const old_col_id = mapped_columns[{parent_col_id, name}]; - // If mixed type as string is enabled, make both of them strings and merge them. - // All child columns will be ignored when parsing. - if (is_enabled_mixed_types_as_string) { - bool const is_mixed_type = [&]() { - // If new or old is STR and they are all not null, make it mixed type, else ignore. 
- if (column_categories[this_col_id] == NC_VAL || - column_categories[this_col_id] == NC_STR) { - if (is_str_column_all_nulls[this_col_id]) return false; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - if (is_str_column_all_nulls[old_col_id]) return false; - } - return true; - }(); - if (is_mixed_type) { - is_mixed_type_column[this_col_id] = 1; - is_mixed_type_column[old_col_id] = 1; - // if old col type (not cat) is list or struct, replace with string. - auto& col = columns.at(old_col_id).get(); - if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { - reinitialize_as_string(old_col_id, col); - remove_child_columns(old_col_id, col); - // all its children (which are already inserted) are ignored later. - } - col.forced_as_string_column = true; - columns.try_emplace(this_col_id, columns.at(old_col_id)); - continue; - } - } - - if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { - ignore_vals[this_col_id] = true; - continue; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - // remap - ignore_vals[old_col_id] = true; - mapped_columns.erase({parent_col_id, name}); - columns.erase(old_col_id); - parent_col.child_columns.erase(name); - replaced = true; // to skip duplicate name in column_order - } else { - // If this is a nested column but we're trying to insert either (a) a list node into a - // struct column or (b) a struct node into a list column, we fail - CUDF_EXPECTS(not((column_categories[old_col_id] == NC_LIST and - column_categories[this_col_id] == NC_STRUCT) or - (column_categories[old_col_id] == NC_STRUCT and - column_categories[this_col_id] == NC_LIST)), - "A mix of lists and structs within the same column is not supported"); - } - } - - auto this_column_category = column_categories[this_col_id]; - // get path of this column, check if it is a struct/list forced as string, and enforce it - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if ((column_categories[this_col_id] == NC_STRUCT or - column_categories[this_col_id] == NC_LIST) and - user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { - this_column_category = NC_STR; - } - - CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); - // move into parent - device_json_column col(stream, mr); - initialize_json_columns(this_col_id, col, this_column_category); - if ((column_categories[this_col_id] == NC_STRUCT or - column_categories[this_col_id] == NC_LIST) and - user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { - col.forced_as_string_column = true; - forced_as_string_column[this_col_id] = true; - } - - auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; - CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); - if (not replaced) parent_col.column_order.push_back(name); - columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); - mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); - } - - if (is_enabled_mixed_types_as_string) { - // ignore all children of mixed type columns - for (auto const this_col_id : unique_col_ids) { - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { - is_mixed_type_column[this_col_id] = 1; - 
ignore_vals[this_col_id] = true; - columns.erase(this_col_id); - } - // Convert only mixed type columns as string (so to copy), but not its children - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and - is_mixed_type_column[this_col_id] == 1) - column_categories[this_col_id] = NC_STR; - } - cudf::detail::cuda_memcpy_async( - d_column_tree.node_categories, column_categories, stream); - } - - // ignore all children of columns forced as string - for (auto const this_col_id : unique_col_ids) { - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) { - forced_as_string_column[this_col_id] = true; - ignore_vals[this_col_id] = true; - } - // Convert only mixed type columns as string (so to copy), but not its children - if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and - forced_as_string_column[this_col_id]) - column_categories[this_col_id] = NC_STR; - } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories, column_categories, stream); - - // restore unique_col_ids order - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<1>(a) < thrust::get<1>(b); - }); - return {ignore_vals, columns}; -} - -void scatter_offsets(tree_meta_t const& tree, - device_span col_ids, - device_span row_offsets, - device_span node_ids, - device_span sorted_col_ids, // Reuse this for parent_col_ids - tree_meta_t const& d_column_tree, - host_span ignore_vals, - hashmap_of_device_columns const& columns, - rmm::cuda_stream_view stream) -{ - auto const num_nodes = col_ids.size(); - auto const num_columns = d_column_tree.node_categories.size(); - // move columns data to device. - auto columns_data = cudf::detail::make_host_vector(num_columns, stream); - for (auto& [col_id, col_ref] : columns) { - if (col_id == parent_node_sentinel) continue; - auto& col = col_ref.get(); - columns_data[col_id] = json_column_data{col.string_offsets.data(), - col.string_lengths.data(), - col.child_offsets.data(), - static_cast(col.validity.data())}; - } - - auto d_ignore_vals = cudf::detail::make_device_uvector_async( - ignore_vals, stream, cudf::get_current_device_resource_ref()); - auto d_columns_data = cudf::detail::make_device_uvector_async( - columns_data, stream, cudf::get_current_device_resource_ref()); - - // 3. 
scatter string offsets to respective columns, set validity bits - thrust::for_each_n( - rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - num_nodes, - [column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin(), - row_offsets = row_offsets.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - d_ignore_vals = d_ignore_vals.begin(), - d_columns_data = d_columns_data.begin()] __device__(size_type i) { - if (d_ignore_vals[col_ids[i]]) return; - auto const node_category = column_categories[col_ids[i]]; - switch (node_category) { - case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; - case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; - case NC_STR: [[fallthrough]]; - case NC_VAL: - if (d_ignore_vals[col_ids[i]]) break; - set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); - d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i]; - d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i]; - break; - default: break; - } - }); - - // 4. scatter List offset - // copy_if only node's whose parent is list, (node_id, parent_col_id) - // stable_sort by parent_col_id of {node_id}. - // For all unique parent_node_id of (i==0, i-1!=i), write start offset. - // (i==last, i+1!=i), write end offset. - // unique_copy_by_key {parent_node_id} {row_offset} to - // col[parent_col_id].child_offsets[row_offset[parent_node_id]] - - auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids - auto parent_col_id = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - cuda::proclaim_return_type( - [col_ids = col_ids.begin(), - parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { - return parent_node_ids[node_id] == parent_node_sentinel ? 
parent_node_sentinel - : col_ids[parent_node_ids[node_id]]; - })); - auto const list_children_end = thrust::copy_if( - rmm::exec_policy_nosync(stream), - thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), - thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + - num_nodes, - thrust::make_counting_iterator(0), - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), - [d_ignore_vals = d_ignore_vals.begin(), - parent_node_ids = tree.parent_node_ids.begin(), - column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin()] __device__(size_type node_id) { - auto parent_node_id = parent_node_ids[node_id]; - return parent_node_id != parent_node_sentinel and - column_categories[col_ids[parent_node_id]] == NC_LIST and - (!d_ignore_vals[col_ids[parent_node_id]]); - }); - - auto const num_list_children = - list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); - thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), - parent_col_ids.begin(), - parent_col_ids.begin() + num_list_children, - node_ids.begin()); - thrust::for_each_n( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - num_list_children, - [node_ids = node_ids.begin(), - parent_node_ids = tree.parent_node_ids.begin(), - parent_col_ids = parent_col_ids.begin(), - row_offsets = row_offsets.begin(), - d_columns_data = d_columns_data.begin(), - num_list_children] __device__(size_type i) { - auto const node_id = node_ids[i]; - auto const parent_node_id = parent_node_ids[node_id]; - // scatter to list_offset - if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { - d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = - row_offsets[node_id]; - } - // last value of list child_offset is its size. - if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { - d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = - row_offsets[node_id] + 1; + thrust::counting_iterator(0), + num_nodes, + [options = parse_opt.view(), + data = input.data(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { + auto const node_category = column_categories[col_ids[i]]; + if (node_category == NC_STR or node_category == NC_VAL) { + auto const is_null_literal = serialized_trie_contains( + options.trie_na, + {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); + if (!is_null_literal) is_all_nulls[col_ids[i]] = false; } }); + return is_all_nulls; +} - // 5. scan on offsets. - for (auto& [id, col_ref] : columns) { - auto& col = col_ref.get(); - if (col.type == json_col_t::StringColumn) { - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - col.string_offsets.begin(), - col.string_offsets.end(), - col.string_offsets.begin(), - thrust::maximum{}); - } else if (col.type == json_col_t::ListColumn) { - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - col.child_offsets.begin(), - col.child_offsets.end(), - col.child_offsets.begin(), - thrust::maximum{}); - } +NodeIndexT get_row_array_parent_col_id(device_span col_ids, + bool is_enabled_lines, + rmm::cuda_stream_view stream) +{ + NodeIndexT value = parent_node_sentinel; + if (!col_ids.empty()) { + auto const list_node_index = is_enabled_lines ? 
0 : 1; + CUDF_CUDA_TRY(cudaMemcpyAsync(&value, + col_ids.data() + list_node_index, + sizeof(NodeIndexT), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); } - stream.synchronize(); + return value; } +/** + * @brief Holds member data pointers of `d_json_column` + * + */ +struct json_column_data { + using row_offset_t = json_column::row_offset_t; + row_offset_t* string_offsets; + row_offset_t* string_lengths; + row_offset_t* child_offsets; + bitmask_type* validity; +}; + +using hashmap_of_device_columns = + std::unordered_map>; + +std::pair, hashmap_of_device_columns> build_tree( + device_json_column& root, + host_span is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); -namespace experimental { +void scatter_offsets(tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t const& d_column_tree, + host_span ignore_vals, + hashmap_of_device_columns const& columns, + rmm::cuda_stream_view stream); std::map unified_schema(cudf::io::json_reader_options const& options) { @@ -829,19 +276,6 @@ std::map unified_schema(cudf::io::json_reader_optio options.get_dtypes()); } -std::pair, hashmap_of_device_columns> build_tree( - device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - /** * @brief Constructs `d_json_column` from node tree representation * Newly constructed columns are inserted into `root`'s children. @@ -1033,7 +467,7 @@ std::pair, hashmap_of_device_columns> build_tree std::fill_n(is_pruned.begin(), num_columns, options.is_enabled_prune_columns()); // prune all children of a column, but not self. - auto ignore_all_children = [&](auto parent_col_id) { + auto ignore_all_children = [&adj, &is_pruned](auto parent_col_id) { std::deque offspring; if (adj.count(parent_col_id)) { for (auto const& child : adj[parent_col_id]) { @@ -1392,6 +826,145 @@ std::pair, hashmap_of_device_columns> build_tree return {is_pruned, columns}; } -} // namespace experimental + +void scatter_offsets(tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t const& d_column_tree, + host_span ignore_vals, + hashmap_of_device_columns const& columns, + rmm::cuda_stream_view stream) +{ + auto const num_nodes = col_ids.size(); + auto const num_columns = d_column_tree.node_categories.size(); + // move columns data to device. 
+ auto columns_data = cudf::detail::make_host_vector(num_columns, stream); + for (auto& [col_id, col_ref] : columns) { + if (col_id == parent_node_sentinel) continue; + auto& col = col_ref.get(); + columns_data[col_id] = json_column_data{col.string_offsets.data(), + col.string_lengths.data(), + col.child_offsets.data(), + static_cast(col.validity.data())}; + } + + auto d_ignore_vals = cudf::detail::make_device_uvector_async( + ignore_vals, stream, cudf::get_current_device_resource_ref()); + auto d_columns_data = cudf::detail::make_device_uvector_async( + columns_data, stream, cudf::get_current_device_resource_ref()); + + // 3. scatter string offsets to respective columns, set validity bits + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + num_nodes, + [column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + row_offsets = row_offsets.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + d_ignore_vals = d_ignore_vals.begin(), + d_columns_data = d_columns_data.begin()] __device__(size_type i) { + if (d_ignore_vals[col_ids[i]]) return; + auto const node_category = column_categories[col_ids[i]]; + switch (node_category) { + case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; + case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; + case NC_STR: [[fallthrough]]; + case NC_VAL: + if (d_ignore_vals[col_ids[i]]) break; + set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); + d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i]; + d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i]; + break; + default: break; + } + }); + + // 4. scatter List offset + // copy_if only node's whose parent is list, (node_id, parent_col_id) + // stable_sort by parent_col_id of {node_id}. + // For all unique parent_node_id of (i==0, i-1!=i), write start offset. + // (i==last, i+1!=i), write end offset. + // unique_copy_by_key {parent_node_id} {row_offset} to + // col[parent_col_id].child_offsets[row_offset[parent_node_id]] + + auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids + auto parent_col_id = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [col_ids = col_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { + return parent_node_ids[node_id] == parent_node_sentinel ? 
parent_node_sentinel + : col_ids[parent_node_ids[node_id]]; + })); + auto const list_children_end = thrust::copy_if( + rmm::exec_policy_nosync(stream), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + + num_nodes, + thrust::make_counting_iterator(0), + thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), + [d_ignore_vals = d_ignore_vals.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin()] __device__(size_type node_id) { + auto parent_node_id = parent_node_ids[node_id]; + return parent_node_id != parent_node_sentinel and + column_categories[col_ids[parent_node_id]] == NC_LIST and + (!d_ignore_vals[col_ids[parent_node_id]]); + }); + + auto const num_list_children = + list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); + thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), + parent_col_ids.begin(), + parent_col_ids.begin() + num_list_children, + node_ids.begin()); + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + num_list_children, + [node_ids = node_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + parent_col_ids = parent_col_ids.begin(), + row_offsets = row_offsets.begin(), + d_columns_data = d_columns_data.begin(), + num_list_children] __device__(size_type i) { + auto const node_id = node_ids[i]; + auto const parent_node_id = parent_node_ids[node_id]; + // scatter to list_offset + if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = + row_offsets[node_id]; + } + // last value of list child_offset is its size. + if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = + row_offsets[node_id] + 1; + } + }); + + // 5. scan on offsets. + for (auto& [id, col_ref] : columns) { + auto& col = col_ref.get(); + if (col.type == json_col_t::StringColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.string_offsets.begin(), + col.string_offsets.end(), + col.string_offsets.begin(), + thrust::maximum{}); + } else if (col.type == json_col_t::ListColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.child_offsets.begin(), + col.child_offsets.end(), + col.child_offsets.begin(), + thrust::maximum{}); + } + } + stream.synchronize(); +} } // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 912e93d52ae..4584f71775f 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -485,16 +485,6 @@ std::pair, std::vector> device_json_co } } -template -auto make_device_json_column_dispatch(bool experimental, Args&&... 
args) -{ - if (experimental) { - return experimental::make_device_json_column(std::forward(args)...); - } else { - return make_device_json_column(std::forward(args)...); - } -} - table_with_metadata device_parse_nested_json(device_span d_input, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, @@ -553,16 +543,15 @@ table_with_metadata device_parse_nested_json(device_span d_input, 0); // Get internal JSON column - make_device_json_column_dispatch(options.is_enabled_experimental(), - d_input, - gpu_tree, - gpu_col_id, - gpu_row_offsets, - root_column, - is_array_of_arrays, - options, - stream, - mr); + make_device_json_column(d_input, + gpu_tree, + gpu_col_id, + gpu_row_offsets, + root_column, + is_array_of_arrays, + options, + stream, + mr); // data_root refers to the root column of the data represented by the given JSON string auto& data_root = diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 3d9a51833e0..f6be4539d7f 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -405,21 +405,6 @@ void make_device_json_column(device_span input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -namespace experimental { -/** - * @copydoc cudf::io::json::detail::make_device_json_column - */ -void make_device_json_column(device_span input, - tree_meta_t const& tree, - device_span col_ids, - device_span row_offsets, - device_json_column& root, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); -} // namespace experimental - /** * @brief Retrieves the parse_options to be used for type inference and type casting * From 074ab749531aa136c546afc7837fec0b404fe022 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sat, 19 Oct 2024 14:34:24 -0700 Subject: [PATCH 094/117] Split hash-based groupby into multiple smaller files to reduce build time (#17089) This work is part of splitting the original bulk shared memory groupby PR https://github.com/rapidsai/cudf/pull/16619. This PR splits the hash-based groupby file into multiple translation units and uses explicit template instantiations to help reduce build time. It also includes some minor cleanups without significant functional changes. 
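As a rough illustration of the technique (placeholder names below, not the actual cudf symbols): the template definition moves into a single .cu file, other translation units include only a declaration, and the .cu file explicitly instantiates the handful of types callers actually use, so each heavy instantiation is compiled exactly once.

// thing.hpp -- other translation units see only the declaration
template <typename T>
void compute_thing(T const& input);

// thing.cu -- the templated body is compiled exactly once, here
#include "thing.hpp"

template <typename T>
void compute_thing(T const& input)
{
  // ... expensive templated device code ...
}

// explicit instantiation definitions for the only types used by callers
template void compute_thing<int>(int const&);
template void compute_thing<double>(double const&);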
Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/17089 --- cpp/CMakeLists.txt | 5 + cpp/src/groupby/hash/compute_groupby.cu | 142 ++++++ cpp/src/groupby/hash/compute_groupby.hpp | 95 ++++ .../groupby/hash/compute_single_pass_aggs.cu | 99 ++++ .../groupby/hash/compute_single_pass_aggs.hpp | 38 ++ .../hash/create_sparse_results_table.cu | 67 +++ .../hash/create_sparse_results_table.hpp | 32 ++ cpp/src/groupby/hash/groupby.cu | 459 +----------------- .../hash/hash_compound_agg_finalizer.cu | 199 ++++++++ .../hash/hash_compound_agg_finalizer.hpp | 69 +++ cpp/src/groupby/hash/helpers.cuh | 116 +++++ ...y_kernels.cuh => single_pass_functors.cuh} | 14 +- .../groupby/hash/sparse_to_dense_results.cu | 72 +++ .../groupby/hash/sparse_to_dense_results.hpp | 51 ++ 14 files changed, 1012 insertions(+), 446 deletions(-) create mode 100644 cpp/src/groupby/hash/compute_groupby.cu create mode 100644 cpp/src/groupby/hash/compute_groupby.hpp create mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs.cu create mode 100644 cpp/src/groupby/hash/compute_single_pass_aggs.hpp create mode 100644 cpp/src/groupby/hash/create_sparse_results_table.cu create mode 100644 cpp/src/groupby/hash/create_sparse_results_table.hpp create mode 100644 cpp/src/groupby/hash/hash_compound_agg_finalizer.cu create mode 100644 cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp create mode 100644 cpp/src/groupby/hash/helpers.cuh rename cpp/src/groupby/hash/{groupby_kernels.cuh => single_pass_functors.cuh} (95%) create mode 100644 cpp/src/groupby/hash/sparse_to_dense_results.cu create mode 100644 cpp/src/groupby/hash/sparse_to_dense_results.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 32a753c9f40..e4b9cbf8921 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -368,8 +368,13 @@ add_library( src/filling/repeat.cu src/filling/sequence.cu src/groupby/groupby.cu + src/groupby/hash/compute_groupby.cu + src/groupby/hash/compute_single_pass_aggs.cu + src/groupby/hash/create_sparse_results_table.cu src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu + src/groupby/hash/hash_compound_agg_finalizer.cu + src/groupby/hash/sparse_to_dense_results.cu src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu src/groupby/sort/group_argmin.cu diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu new file mode 100644 index 00000000000..59457bea694 --- /dev/null +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "compute_groupby.hpp" +#include "compute_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "sparse_to_dense_results.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +rmm::device_uvector extract_populated_keys(SetType const& key_set, + size_type num_keys, + rmm::cuda_stream_view stream) +{ + rmm::device_uvector populated_keys(num_keys, stream); + auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); + + populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); + return populated_keys; +} + +template +std::unique_ptr
compute_groupby(table_view const& keys, + host_span requests, + bool skip_rows_with_nulls, + Equal const& d_row_equal, + Hash const& d_row_hash, + cudf::detail::result_cache* cache, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + // convert to int64_t to avoid potential overflow with large `keys` + auto const num_keys = static_cast(keys.num_rows()); + + // Cache of sparse results where the location of aggregate value in each + // column is indexed by the hash set + cudf::detail::result_cache sparse_results(requests.size()); + + auto const set = cuco::static_set{ + num_keys, + cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% load factor + cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + d_row_equal, + probing_scheme_t{d_row_hash}, + cuco::thread_scope_device, + cuco::storage{}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; + + auto row_bitmask = + skip_rows_with_nulls + ? cudf::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first + : rmm::device_buffer{}; + + // Compute all single pass aggs first + compute_single_pass_aggs(num_keys, + skip_rows_with_nulls, + static_cast(row_bitmask.data()), + set.ref(cuco::insert_and_find), + requests, + &sparse_results, + stream); + + // Extract the populated indices from the hash set and create a gather map. + // Gathering using this map from sparse results will give dense results. + auto gather_map = extract_populated_keys(set, keys.num_rows(), stream); + + // Compact all results from sparse_results and insert into cache + sparse_to_dense_results(requests, + &sparse_results, + cache, + gather_map, + set.ref(cuco::find), + static_cast(row_bitmask.data()), + stream, + mr); + + return cudf::detail::gather(keys, + gather_map, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); +} + +template rmm::device_uvector extract_populated_keys( + global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream); + +template rmm::device_uvector extract_populated_keys( + nullable_global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream); + +template std::unique_ptr
compute_groupby( + table_view const& keys, + host_span requests, + bool skip_rows_with_nulls, + row_comparator_t const& d_row_equal, + row_hash_t const& d_row_hash, + cudf::detail::result_cache* cache, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +template std::unique_ptr
compute_groupby( + table_view const& keys, + host_span requests, + bool skip_rows_with_nulls, + nullable_row_comparator_t const& d_row_equal, + row_hash_t const& d_row_hash, + cudf::detail::result_cache* cache, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_groupby.hpp b/cpp/src/groupby/hash/compute_groupby.hpp new file mode 100644 index 00000000000..7bb3a60ff07 --- /dev/null +++ b/cpp/src/groupby/hash/compute_groupby.hpp @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +/** + * @brief Computes and returns a device vector containing all populated keys in + * `key_set`. + * + * @tparam SetType Type of key hash set + * + * @param key_set Key hash set + * @param num_keys Number of input keys + * @param stream CUDA stream used for device memory operations and kernel launches + * @return An array of unique keys contained in `key_set` + */ +template +rmm::device_uvector extract_populated_keys(SetType const& key_set, + size_type num_keys, + rmm::cuda_stream_view stream); + +/** + * @brief Computes groupby using hash table. + * + * First, we create a hash table that stores the indices of unique rows in + * `keys`. The upper limit on the number of values in this map is the number + * of rows in `keys`. + * + * To store the results of aggregations, we create temporary sparse columns + * which have the same size as input value columns. Using the hash map, we + * determine the location within the sparse column to write the result of the + * aggregation into. + * + * The sparse column results of all aggregations are stored into the cache + * `sparse_results`. This enables the use of previously calculated results in + * other aggregations. + * + * All the aggregations which can be computed in a single pass are computed + * first, in a combined kernel. Then using these results, aggregations that + * require multiple passes, will be computed. + * + * Finally, using the hash map, we generate a vector of indices of populated + * values in sparse result columns. Then, for each aggregation originally + * requested in `requests`, we gather sparse results into a column of dense + * results using the aforementioned index vector. Dense results are stored into + * the in/out parameter `cache`. 
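+ *
+ * In rough pseudocode (illustrative only; see the definitions in this file
+ * and in compute_single_pass_aggs.hpp / sparse_to_dense_results.hpp):
+ * @code{.cpp}
+ * // 1. the hash set maps each distinct key row to its first occurrence
+ * compute_single_pass_aggs(num_keys, ..., set.ref(cuco::insert_and_find),
+ *                          requests, &sparse_results, stream);
+ * // 2. indices of populated slots form the sparse-to-dense gather map
+ * auto gather_map = extract_populated_keys(set, num_keys, stream);
+ * // 3. multi-pass aggs are finalized and everything is compacted into `cache`
+ * sparse_to_dense_results(requests, &sparse_results, cache, gather_map, ...);
+ * // 4. gathering the keys themselves yields the unique-keys table
+ * return cudf::detail::gather(keys, gather_map, ...);
+ * @endcode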
+ * + * @tparam Equal Device row comparator type + * @tparam Hash Device row hasher type + * + * @param keys Table whose rows act as the groupby keys + * @param requests The set of columns to aggregate and the aggregations to perform + * @param skip_rows_with_nulls Flag indicating whether to ignore nulls or not + * @param d_row_equal Device row comparator + * @param d_row_hash Device row hasher + * @param cache Dense aggregation results + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table + * @return Table of unique keys + */ +template +std::unique_ptr compute_groupby(table_view const& keys, + host_span requests, + bool skip_rows_with_nulls, + Equal const& d_row_equal, + Hash const& d_row_hash, + cudf::detail::result_cache* cache, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu new file mode 100644 index 00000000000..e292543e6e9 --- /dev/null +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cu @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "compute_single_pass_aggs.hpp" +#include "create_sparse_results_table.hpp" +#include "flatten_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" +#include "var_hash_functor.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace cudf::groupby::detail::hash { +/** + * @brief Computes all aggregations from `requests` that require a single pass + * over the data and stores the results in `sparse_results` + */ +template +void compute_single_pass_aggs(int64_t num_keys, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + SetType set, + host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream) +{ + // flatten the aggs to a table that can be operated on by aggregate_row + auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); + + // make table that will hold sparse results + table sparse_table = create_sparse_results_table(flattened_values, agg_kinds, stream); + // prepare to launch kernel to do the actual aggregation + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto d_values = table_device_view::create(flattened_values, stream); + auto const d_aggs = cudf::detail::make_device_uvector_async( + agg_kinds, stream, cudf::get_current_device_resource_ref()); + + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + num_keys, + hash::compute_single_pass_aggs_fn{ + set, *d_values, *d_sparse_table, d_aggs.data(), row_bitmask, skip_rows_with_nulls}); + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggs.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + sparse_results->add_result( + flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); + } +} + +template void compute_single_pass_aggs>( + int64_t num_keys, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + hash_set_ref_t set, + host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); + +template void compute_single_pass_aggs>( + int64_t num_keys, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + nullable_hash_set_ref_t set, + host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp new file mode 100644 index 00000000000..a7434bdf61a --- /dev/null +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +/** + * @brief Computes all aggregations from `requests` that require a single pass + * over the data and stores the results in `sparse_results` + */ +template +void compute_single_pass_aggs(int64_t num_keys, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + SetType set, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu new file mode 100644 index 00000000000..22fa4fc584c --- /dev/null +++ b/cpp/src/groupby/hash/create_sparse_results_table.cu @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "create_sparse_results_table.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace cudf::groupby::detail::hash { +// make table that will hold sparse results +cudf::table create_sparse_results_table(table_view const& flattened_values, + std::vector aggs, + rmm::cuda_stream_view stream) +{ + // TODO single allocation - room for performance improvement + std::vector> sparse_columns; + sparse_columns.reserve(flattened_values.num_columns()); + std::transform( + flattened_values.begin(), + flattened_values.end(), + aggs.begin(), + std::back_inserter(sparse_columns), + [stream](auto const& col, auto const& agg) { + bool nullable = + (agg == aggregation::COUNT_VALID or agg == aggregation::COUNT_ALL) + ? false + : (col.has_nulls() or agg == aggregation::VARIANCE or agg == aggregation::STD); + auto mask_flag = (nullable) ? mask_state::ALL_NULL : mask_state::UNALLOCATED; + + auto col_type = cudf::is_dictionary(col.type()) + ? cudf::dictionary_column_view(col).keys().type() + : col.type(); + + return make_fixed_width_column( + cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); + }); + + table sparse_table(std::move(sparse_columns)); + mutable_table_view table_view = sparse_table.mutable_view(); + cudf::detail::initialize_with_identity(table_view, aggs, stream); + return sparse_table; +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp new file mode 100644 index 00000000000..c1d4e0d3f20 --- /dev/null +++ b/cpp/src/groupby/hash/create_sparse_results_table.hpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include + +#include + +namespace cudf::groupby::detail::hash { +// make table that will hold sparse results +cudf::table create_sparse_results_table(table_view const& flattened_values, + std::vector aggs_kinds, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 0432b9d120a..30e1d52fdbf 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -14,60 +14,32 @@ * limitations under the License. */ -#include "flatten_single_pass_aggs.hpp" +#include "compute_groupby.hpp" #include "groupby/common/utils.hpp" -#include "groupby_kernels.cuh" -#include "var_hash_functor.cuh" +#include "helpers.cuh" #include -#include -#include -#include -#include -#include #include -#include -#include -#include #include -#include -#include -#include +#include #include #include -#include #include #include -#include #include #include -#include #include #include #include -#include -#include -#include - +#include #include -#include #include +#include -namespace cudf { -namespace groupby { -namespace detail { -namespace hash { +namespace cudf::groupby::detail::hash { namespace { - -// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested -// types and `cg_size = 1`for flat data to improve performance -using probing_scheme_type = cuco::linear_probing< - 1, ///< Number of threads used to handle each input key - cudf::experimental::row::hash::device_row_hasher>; - /** * @brief List of aggregation operations that can be computed with a hash-based * implementation. @@ -112,413 +84,33 @@ bool constexpr is_hash_aggregation(aggregation::Kind t) return array_contains(hash_aggregations, t); } -template -class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { - column_view col; - data_type result_type; - cudf::detail::result_cache* sparse_results; - cudf::detail::result_cache* dense_results; - device_span gather_map; - SetType set; - bitmask_type const* __restrict__ row_bitmask; - rmm::cuda_stream_view stream; - rmm::device_async_resource_ref mr; - - public: - using cudf::detail::aggregation_finalizer::visit; - - hash_compound_agg_finalizer(column_view col, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - SetType set, - bitmask_type const* row_bitmask, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - : col(col), - sparse_results(sparse_results), - dense_results(dense_results), - gather_map(gather_map), - set(set), - row_bitmask(row_bitmask), - stream(stream), - mr(mr) - { - result_type = cudf::is_dictionary(col.type()) ? 
cudf::dictionary_column_view(col).keys().type() - : col.type(); - } - - auto to_dense_agg_result(cudf::aggregation const& agg) - { - auto s = sparse_results->get_result(col, agg); - auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), - gather_map, - out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(dense_result_table->release()[0]); - } - - // Enables conversion of ARGMIN/ARGMAX into MIN/MAX - auto gather_argminmax(aggregation const& agg) - { - auto arg_result = to_dense_agg_result(agg); - // We make a view of ARG(MIN/MAX) result without a null mask and gather - // using this map. The values in data buffer of ARG(MIN/MAX) result - // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL - // which is an out of bounds index value (-1) and causes the gathered - // value to be null. - column_view null_removed_map( - data_type(type_to_id()), - arg_result->size(), - static_cast(arg_result->view().template data()), - nullptr, - 0); - auto gather_argminmax = - cudf::detail::gather(table_view({col}), - null_removed_map, - arg_result->nullable() ? cudf::out_of_bounds_policy::NULLIFY - : cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(gather_argminmax->release()[0]); - } - - // Declare overloads for each kind of aggregation to dispatch - void visit(cudf::aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - - void visit(cudf::detail::min_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - if (result_type.id() == type_id::STRING) { - auto transformed_agg = make_argmin_aggregation(); - dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); - } else { - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - } - - void visit(cudf::detail::max_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - if (result_type.id() == type_id::STRING) { - auto transformed_agg = make_argmax_aggregation(); - dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); - } else { - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - } - - void visit(cudf::detail::mean_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); - this->visit(*sum_agg); - this->visit(*count_agg); - column_view sum_result = dense_results->get_result(col, *sum_agg); - column_view count_result = dense_results->get_result(col, *count_agg); - - auto result = - cudf::detail::binary_operation(sum_result, - count_result, - binary_operator::DIV, - cudf::detail::target_type(result_type, aggregation::MEAN), - stream, - mr); - dense_results->add_result(col, agg, std::move(result)); - } - - void visit(cudf::detail::var_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); - this->visit(*sum_agg); - this->visit(*count_agg); - column_view sum_result = sparse_results->get_result(col, *sum_agg); - column_view count_result = sparse_results->get_result(col, *count_agg); - - auto values_view = column_device_view::create(col, stream); - auto sum_view = column_device_view::create(sum_result, stream); - 
auto count_view = column_device_view::create(count_result, stream); - - auto var_result = make_fixed_width_column( - cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); - auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); - mutable_table_view var_table_view{{var_result->mutable_view()}}; - cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col.size(), - var_hash_functor{ - set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); - sparse_results->add_result(col, agg, std::move(var_result)); - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - - void visit(cudf::detail::std_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - auto var_agg = make_variance_aggregation(agg._ddof); - this->visit(*dynamic_cast(var_agg.get())); - column_view variance = dense_results->get_result(col, *var_agg); - - auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, stream, mr); - dense_results->add_result(col, agg, std::move(result)); - } -}; - -/** - * @brief Gather sparse results into dense using `gather_map` and add to - * `dense_cache` - * - * @see groupby_null_templated() - */ -template -void sparse_to_dense_results(table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - SetType set, - bool keys_have_nulls, - null_policy include_null_keys, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto row_bitmask = - cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first; - bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; - bitmask_type const* row_bitmask_ptr = - skip_key_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; - - for (auto const& request : requests) { - auto const& agg_v = request.aggregations; - auto const& col = request.values; - - // Given an aggregation, this will get the result from sparse_results and - // convert and return dense, compacted result - auto finalizer = hash_compound_agg_finalizer( - col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr); - for (auto&& agg : agg_v) { - agg->finalize(finalizer); - } - } -} - -// make table that will hold sparse results -auto create_sparse_results_table(table_view const& flattened_values, - std::vector aggs, - rmm::cuda_stream_view stream) -{ - // TODO single allocation - room for performance improvement - std::vector> sparse_columns; - std::transform( - flattened_values.begin(), - flattened_values.end(), - aggs.begin(), - std::back_inserter(sparse_columns), - [stream](auto const& col, auto const& agg) { - bool nullable = - (agg == aggregation::COUNT_VALID or agg == aggregation::COUNT_ALL) - ? false - : (col.has_nulls() or agg == aggregation::VARIANCE or agg == aggregation::STD); - auto mask_flag = (nullable) ? mask_state::ALL_NULL : mask_state::UNALLOCATED; - - auto col_type = cudf::is_dictionary(col.type()) - ? 
cudf::dictionary_column_view(col).keys().type() - : col.type(); - - return make_fixed_width_column( - cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); - }); - - table sparse_table(std::move(sparse_columns)); - mutable_table_view table_view = sparse_table.mutable_view(); - cudf::detail::initialize_with_identity(table_view, aggs, stream); - return sparse_table; -} - -/** - * @brief Computes all aggregations from `requests` that require a single pass - * over the data and stores the results in `sparse_results` - */ -template -void compute_single_pass_aggs(table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - SetType set, - bool keys_have_nulls, - null_policy include_null_keys, - rmm::cuda_stream_view stream) -{ - // flatten the aggs to a table that can be operated on by aggregate_row - auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); - - // make table that will hold sparse results - table sparse_table = create_sparse_results_table(flattened_values, agg_kinds, stream); - // prepare to launch kernel to do the actual aggregation - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto d_values = table_device_view::create(flattened_values, stream); - auto const d_aggs = cudf::detail::make_device_uvector_async( - agg_kinds, stream, cudf::get_current_device_resource_ref()); - auto const skip_key_rows_with_nulls = - keys_have_nulls and include_null_keys == null_policy::EXCLUDE; - - auto row_bitmask = - skip_key_rows_with_nulls - ? cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first - : rmm::device_buffer{}; - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - hash::compute_single_pass_aggs_fn{set, - *d_values, - *d_sparse_table, - d_aggs.data(), - static_cast(row_bitmask.data()), - skip_key_rows_with_nulls}); - // Add results back to sparse_results cache - auto sparse_result_cols = sparse_table.release(); - for (size_t i = 0; i < aggs.size(); i++) { - // Note that the cache will make a copy of this temporary aggregation - sparse_results->add_result( - flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); - } -} - -/** - * @brief Computes and returns a device vector containing all populated keys in - * `map`. - */ -template -rmm::device_uvector extract_populated_keys(SetType const& key_set, - size_type num_keys, - rmm::cuda_stream_view stream) -{ - rmm::device_uvector populated_keys(num_keys, stream); - auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); - - populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); - return populated_keys; -} - -/** - * @brief Computes groupby using hash table. - * - * First, we create a hash table that stores the indices of unique rows in - * `keys`. The upper limit on the number of values in this map is the number - * of rows in `keys`. - * - * To store the results of aggregations, we create temporary sparse columns - * which have the same size as input value columns. Using the hash map, we - * determine the location within the sparse column to write the result of the - * aggregation into. - * - * The sparse column results of all aggregations are stored into the cache - * `sparse_results`. This enables the use of previously calculated results in - * other aggregations. 
- * - * All the aggregations which can be computed in a single pass are computed - * first, in a combined kernel. Then using these results, aggregations that - * require multiple passes, will be computed. - * - * Finally, using the hash map, we generate a vector of indices of populated - * values in sparse result columns. Then, for each aggregation originally - * requested in `requests`, we gather sparse results into a column of dense - * results using the aforementioned index vector. Dense results are stored into - * the in/out parameter `cache`. - */ -std::unique_ptr
groupby(table_view const& keys, - host_span requests, - cudf::detail::result_cache* cache, - bool const keys_have_nulls, - null_policy const include_null_keys, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr
dispatch_groupby(table_view const& keys, + host_span requests, + cudf::detail::result_cache* cache, + bool const keys_have_nulls, + null_policy const include_null_keys, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { - // convert to int64_t to avoid potential overflow with large `keys` - auto const num_keys = static_cast(keys.num_rows()); - auto const null_keys_are_equal = null_equality::EQUAL; - auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys)}; + auto const null_keys_are_equal = null_equality::EQUAL; + auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys)}; + auto const skip_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; auto preprocessed_keys = cudf::experimental::row::hash::preprocessed_table::create(keys, stream); auto const comparator = cudf::experimental::row::equality::self_comparator{preprocessed_keys}; auto const row_hash = cudf::experimental::row::hash::row_hasher{std::move(preprocessed_keys)}; auto const d_row_hash = row_hash.device_hasher(has_null); - // Cache of sparse results where the location of aggregate value in each - // column is indexed by the hash set - cudf::detail::result_cache sparse_results(requests.size()); - - auto const comparator_helper = [&](auto const d_key_equal) { - auto const set = cuco::static_set{ - num_keys, - 0.5, // desired load factor - cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - d_key_equal, - probing_scheme_type{d_row_hash}, - cuco::thread_scope_device, - cuco::storage<1>{}, - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - stream.value()}; - - // Compute all single pass aggs first - compute_single_pass_aggs(keys, - requests, - &sparse_results, - set.ref(cuco::insert_and_find), - keys_have_nulls, - include_null_keys, - stream); - - // Extract the populated indices from the hash set and create a gather map. - // Gathering using this map from sparse results will give dense results. - auto gather_map = extract_populated_keys(set, keys.num_rows(), stream); - - // Compact all results from sparse_results and insert into cache - sparse_to_dense_results(keys, - requests, - &sparse_results, - cache, - gather_map, - set.ref(cuco::find), - keys_have_nulls, - include_null_keys, - stream, - mr); - - return cudf::detail::gather(keys, - gather_map, - out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - }; - if (cudf::detail::has_nested_columns(keys)) { - auto const d_key_equal = comparator.equal_to(has_null, null_keys_are_equal); - return comparator_helper(d_key_equal); + auto const d_row_equal = comparator.equal_to(has_null, null_keys_are_equal); + return compute_groupby( + keys, requests, skip_rows_with_nulls, d_row_equal, d_row_hash, cache, stream, mr); } else { - auto const d_key_equal = comparator.equal_to(has_null, null_keys_are_equal); - return comparator_helper(d_key_equal); + auto const d_row_equal = comparator.equal_to(has_null, null_keys_are_equal); + return compute_groupby( + keys, requests, skip_rows_with_nulls, d_row_equal, d_row_hash, cache, stream, mr); } } - } // namespace /** @@ -559,11 +151,8 @@ std::pair, std::vector> groupby( cudf::detail::result_cache cache(requests.size()); std::unique_ptr
unique_keys = - groupby(keys, requests, &cache, cudf::has_nulls(keys), include_null_keys, stream, mr); + dispatch_groupby(keys, requests, &cache, cudf::has_nulls(keys), include_null_keys, stream, mr); return std::pair(std::move(unique_keys), extract_results(requests, cache, stream, mr)); } -} // namespace hash -} // namespace detail -} // namespace groupby -} // namespace cudf +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu new file mode 100644 index 00000000000..37a61c1a22c --- /dev/null +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "hash_compound_agg_finalizer.hpp" +#include "helpers.cuh" +#include "var_hash_functor.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +template +hash_compound_agg_finalizer::hash_compound_agg_finalizer( + column_view col, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetType set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + : col(col), + sparse_results(sparse_results), + dense_results(dense_results), + gather_map(gather_map), + set(set), + row_bitmask(row_bitmask), + stream(stream), + mr(mr) +{ + result_type = + cudf::is_dictionary(col.type()) ? cudf::dictionary_column_view(col).keys().type() : col.type(); +} + +template +auto hash_compound_agg_finalizer::to_dense_agg_result(cudf::aggregation const& agg) +{ + auto s = sparse_results->get_result(col, agg); + auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), + gather_map, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(dense_result_table->release()[0]); +} + +template +auto hash_compound_agg_finalizer::gather_argminmax(aggregation const& agg) +{ + auto arg_result = to_dense_agg_result(agg); + // We make a view of ARG(MIN/MAX) result without a null mask and gather + // using this map. The values in data buffer of ARG(MIN/MAX) result + // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL + // which is an out of bounds index value (-1) and causes the gathered + // value to be null. + column_view null_removed_map( + data_type(type_to_id()), + arg_result->size(), + static_cast(arg_result->view().template data()), + nullptr, + 0); + auto gather_argminmax = + cudf::detail::gather(table_view({col}), + null_removed_map, + arg_result->nullable() ? 
cudf::out_of_bounds_policy::NULLIFY + : cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(gather_argminmax->release()[0]); +} + +template +void hash_compound_agg_finalizer::visit(cudf::aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + dense_results->add_result(col, agg, to_dense_agg_result(agg)); +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::min_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmin_aggregation(); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); + } else { + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::max_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmax_aggregation(); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); + } else { + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::mean_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + + auto sum_agg = make_sum_aggregation(); + auto count_agg = make_count_aggregation(); + this->visit(*sum_agg); + this->visit(*count_agg); + column_view sum_result = dense_results->get_result(col, *sum_agg); + column_view count_result = dense_results->get_result(col, *count_agg); + + auto result = + cudf::detail::binary_operation(sum_result, + count_result, + binary_operator::DIV, + cudf::detail::target_type(result_type, aggregation::MEAN), + stream, + mr); + dense_results->add_result(col, agg, std::move(result)); +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::var_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + + auto sum_agg = make_sum_aggregation(); + auto count_agg = make_count_aggregation(); + this->visit(*sum_agg); + this->visit(*count_agg); + column_view sum_result = sparse_results->get_result(col, *sum_agg); + column_view count_result = sparse_results->get_result(col, *count_agg); + + auto values_view = column_device_view::create(col, stream); + auto sum_view = column_device_view::create(sum_result, stream); + auto count_view = column_device_view::create(count_result, stream); + + auto var_result = make_fixed_width_column( + cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); + auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); + mutable_table_view var_table_view{{var_result->mutable_view()}}; + cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); + + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + col.size(), + var_hash_functor{ + set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); + sparse_results->add_result(col, agg, std::move(var_result)); + dense_results->add_result(col, agg, to_dense_agg_result(agg)); +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::std_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + auto var_agg = make_variance_aggregation(agg._ddof); + this->visit(*dynamic_cast(var_agg.get())); + column_view variance = 
dense_results->get_result(col, *var_agg); + + auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, stream, mr); + dense_results->add_result(col, agg, std::move(result)); +} + +template class hash_compound_agg_finalizer>; +template class hash_compound_agg_finalizer>; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp new file mode 100644 index 00000000000..8bee1a92c40 --- /dev/null +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { + column_view col; + data_type result_type; + cudf::detail::result_cache* sparse_results; + cudf::detail::result_cache* dense_results; + device_span gather_map; + SetType set; + bitmask_type const* __restrict__ row_bitmask; + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + public: + using cudf::detail::aggregation_finalizer::visit; + + hash_compound_agg_finalizer(column_view col, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetType set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + + auto to_dense_agg_result(cudf::aggregation const& agg); + + // Enables conversion of ARGMIN/ARGMAX into MIN/MAX + auto gather_argminmax(cudf::aggregation const& agg); + + // Declare overloads for each kind of aggregation to dispatch + void visit(cudf::aggregation const& agg) override; + + void visit(cudf::detail::min_aggregation const& agg) override; + + void visit(cudf::detail::max_aggregation const& agg) override; + + void visit(cudf::detail::mean_aggregation const& agg) override; + + void visit(cudf::detail::var_aggregation const& agg) override; + + void visit(cudf::detail::std_aggregation const& agg) override; +}; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh new file mode 100644 index 00000000000..0d117ca35b3 --- /dev/null +++ b/cpp/src/groupby/hash/helpers.cuh @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested +// types and `cg_size = 1`for flat data to improve performance +/// Number of threads to handle each input element +CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1; + +/// Number of slots per thread +CUDF_HOST_DEVICE auto constexpr GROUPBY_WINDOW_SIZE = 1; + +/// Thread block size +CUDF_HOST_DEVICE auto constexpr GROUPBY_BLOCK_SIZE = 128; + +/// Threshold cardinality to switch between shared memory aggregations and global memory +/// aggregations +CUDF_HOST_DEVICE auto constexpr GROUPBY_CARDINALITY_THRESHOLD = 128; + +// We add additional `block_size`, because after the number of elements in the local hash set +// exceeds the threshold, all threads in the thread block can still insert one more element. +/// The maximum number of elements handled per block +CUDF_HOST_DEVICE auto constexpr GROUPBY_SHM_MAX_ELEMENTS = + GROUPBY_CARDINALITY_THRESHOLD + GROUPBY_BLOCK_SIZE; + +// GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy +/// Shared memory hash set extent type +using shmem_extent_t = + cuco::extent(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43)>; + +/// Number of windows needed by each shared memory hash set +CUDF_HOST_DEVICE auto constexpr window_extent = + cuco::make_window_extent(shmem_extent_t{}); + +/** + * @brief Returns the smallest multiple of 8 that is greater than or equal to the given integer. + */ +CUDF_HOST_DEVICE constexpr std::size_t round_to_multiple_of_8(std::size_t num) +{ + std::size_t constexpr base = 8; + return cudf::util::div_rounding_up_safe(num, base) * base; +} + +using row_hash_t = + cudf::experimental::row::hash::device_row_hasher; + +/// Probing scheme type used by groupby hash table +using probing_scheme_t = cuco::linear_probing; + +using row_comparator_t = cudf::experimental::row::equality::device_row_comparator< + false, + cudf::nullate::DYNAMIC, + cudf::experimental::row::equality::nan_equal_physical_equality_comparator>; + +using nullable_row_comparator_t = cudf::experimental::row::equality::device_row_comparator< + true, + cudf::nullate::DYNAMIC, + cudf::experimental::row::equality::nan_equal_physical_equality_comparator>; + +using global_set_t = cuco::static_set, + cuda::thread_scope_device, + row_comparator_t, + probing_scheme_t, + cudf::detail::cuco_allocator, + cuco::storage>; + +using nullable_global_set_t = cuco::static_set, + cuda::thread_scope_device, + nullable_row_comparator_t, + probing_scheme_t, + cudf::detail::cuco_allocator, + cuco::storage>; + +template +using hash_set_ref_t = cuco::static_set_ref< + cudf::size_type, + cuda::thread_scope_device, + row_comparator_t, + probing_scheme_t, + cuco::aow_storage_ref>, + Op>; + +template +using nullable_hash_set_ref_t = cuco::static_set_ref< + cudf::size_type, + cuda::thread_scope_device, + nullable_row_comparator_t, + probing_scheme_t, + cuco::aow_storage_ref>, + Op>; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh similarity index 95% rename from cpp/src/groupby/hash/groupby_kernels.cuh rename to cpp/src/groupby/hash/single_pass_functors.cuh index 86f4d76487f..73791b3aa71 100644 --- a/cpp/src/groupby/hash/groupby_kernels.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -13,7 +13,6 @@ * See 
the License for the specific language governing permissions and * limitations under the License. */ - #pragma once #include @@ -21,12 +20,9 @@ #include #include -#include +#include -namespace cudf { -namespace groupby { -namespace detail { -namespace hash { +namespace cudf::groupby::detail::hash { /** * @brief Computes single-pass aggregations and store results into a sparse `output_values` table, * and populate `set` with indices of unique keys @@ -102,8 +98,4 @@ struct compute_single_pass_aggs_fn { } } }; - -} // namespace hash -} // namespace detail -} // namespace groupby -} // namespace cudf +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu new file mode 100644 index 00000000000..e1c2cd22309 --- /dev/null +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "hash_compound_agg_finalizer.hpp" +#include "helpers.cuh" + +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +void sparse_to_dense_results(host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetRef set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + for (auto const& request : requests) { + auto const& agg_v = request.aggregations; + auto const& col = request.values; + + // Given an aggregation, this will get the result from sparse_results and + // convert and return dense, compacted result + auto finalizer = hash_compound_agg_finalizer( + col, sparse_results, dense_results, gather_map, set, row_bitmask, stream, mr); + for (auto&& agg : agg_v) { + agg->finalize(finalizer); + } + } +} + +template void sparse_to_dense_results>( + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + hash_set_ref_t set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +template void sparse_to_dense_results>( + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + nullable_hash_set_ref_t set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.hpp b/cpp/src/groupby/hash/sparse_to_dense_results.hpp new file mode 100644 index 00000000000..3a2b3090b99 --- /dev/null +++ b/cpp/src/groupby/hash/sparse_to_dense_results.hpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+namespace cudf::groupby::detail::hash {
+/**
+ * @brief Gather sparse aggregation results into dense using `gather_map` and add to
+ * `dense_results`
+ *
+ * @tparam SetRef Device hash set ref type
+ *
+ * @param[in] requests The set of columns to aggregate and the aggregations to perform
+ * @param[in] sparse_results Sparse aggregation results
+ * @param[out] dense_results Dense aggregation results
+ * @param[in] gather_map Gather map indicating valid elements in `sparse_results`
+ * @param[in] set Device hash set ref
+ * @param[in] row_bitmask Bitmask indicating the validity of input keys
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches
+ * @param[in] mr Device memory resource used to allocate the returned table
+ */
+template
+void sparse_to_dense_results(host_span requests,
+                             cudf::detail::result_cache* sparse_results,
+                             cudf::detail::result_cache* dense_results,
+                             device_span gather_map,
+                             SetRef set,
+                             bitmask_type const* row_bitmask,
+                             rmm::cuda_stream_view stream,
+                             rmm::device_async_resource_ref mr);
+} // namespace cudf::groupby::detail::hash

From 69ca3874b97e9cce6efb71e3e33ec598b57908a3 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Mon, 21 Oct 2024 18:56:07 -0500
Subject: [PATCH 095/117] Ignore loud dask warnings about legacy dataframe implementation (#17137)

This PR ignores the loud dask warnings that the legacy dask dataframe
implementation is going to be removed soon:
https://github.com/dask/dask/pull/11437

Note: We only see this error for `DASK_DATAFRAME__QUERY_PLANNING=False`
cases; the `DASK_DATAFRAME__QUERY_PLANNING=True` cases are passing fine.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Peter Andreas Entschev (https://github.com/pentschev)
  - Richard (Rick) Zamora (https://github.com/rjzamora)

URL: https://github.com/rapidsai/cudf/pull/17137
---
 python/dask_cudf/pyproject.toml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index fbcd7ae5dfb..705865d083b 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -126,5 +126,8 @@ filterwarnings = [
     # https://github.com/dask/partd/blob/main/partd/pandas.py#L198
     "ignore:Passing a BlockManager to DataFrame is deprecated and will raise in a future version. Use public APIs instead.:DeprecationWarning",
     "ignore:String support for `aggregate_files` is experimental. Behavior may change in the future.:FutureWarning:dask",
+    # Dask now loudly throws warnings: https://github.com/dask/dask/pull/11437
+    # When the legacy implementation is removed we can remove this warning and stop running pytests with `DASK_DATAFRAME__QUERY_PLANNING=False`
+    "ignore:The legacy Dask DataFrame implementation is deprecated and will be removed in a future version.*:FutureWarning",
 ]
 xfail_strict = true

From 13de3c1f61b49ade0d7283442209d82c9f59ae02 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Tue, 22 Oct 2024 02:24:03 -0700
Subject: [PATCH 096/117] Add compile time check to ensure the `counting_iterator` type in `counting_transform_iterator` fits in `size_type` (#17118)

This PR adds a compile-time check to enforce that the `start` argument to
`cudf::detail::counting_transform_iterator`, which is used to determine the
type of the `counting_iterator`, is of a type that fits in `int32_t` (aka
`size_type`). The PR also modifies the instances of
`counting_transform_iterator` that need to work with `counting_iterator`s of
a type larger than `int32_t` so that they use manually created
`counting_transform_iterator`s via thrust. More context in this
[comment](https://github.com/rapidsai/cudf/pull/17059/files#r1803925659).

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Yunsong Wang (https://github.com/PointKernel)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Tianyu Liu (https://github.com/kingcrimsontianyu)

URL: https://github.com/rapidsai/cudf/pull/17118
---
 cpp/include/cudf/detail/iterator.cuh          | 20 +++++++++++++------
 .../cudf/detail/utilities/batched_memset.hpp  |  4 ++--
 .../cudf/strings/detail/strings_children.cuh  | 17 ++++++++--------
 cpp/src/io/parquet/page_data.cu               |  7 ++++---
 cpp/src/io/parquet/reader_impl_preprocess.cu  |  8 ++++----
 cpp/tests/io/orc_chunked_reader_test.cu       |  9 +++++----
 6 files changed, 38 insertions(+), 27 deletions(-)

diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh
index 4349e1b70fd..30f36d6a5da 100644
--- a/cpp/include/cudf/detail/iterator.cuh
+++ b/cpp/include/cudf/detail/iterator.cuh
@@ -38,18 +38,19 @@
 #include
 #include
+#include
+#include
 #include
 #include
 #include
 #include
-#include
-
 namespace cudf {
 namespace detail {
 /**
  * @brief Convenience wrapper for creating a `thrust::transform_iterator` over a
- * `thrust::counting_iterator`.
+ * `thrust::counting_iterator` within the range [0, INT_MAX].
+ *
 *
 * Example:
 * @code{.cpp}
 * auto iter = make_counting_transform_iterator(0, [](auto i) { return (i * i);});
 * iter[0] == 0
 * iter[1] == 1
 * iter[2] == 4
 * ...
 * iter[n] == n * n
 * @endcode
 *
- * @param start The starting value of the counting iterator
+ * @param start The starting value of the counting iterator (must be size_type or smaller type).
 * @param f The unary function to apply to the counting iterator.
* @return A transform iterator that applies `f` to a counting iterator */ -template -CUDF_HOST_DEVICE inline auto make_counting_transform_iterator(cudf::size_type start, +template +CUDF_HOST_DEVICE inline auto make_counting_transform_iterator(CountingIterType start, UnaryFunction f) { + // Check if the `start` for counting_iterator is of size_type or a smaller integral type + static_assert( + cuda::std::is_integral_v and + cuda::std::numeric_limits::digits <= + cuda::std::numeric_limits::digits, + "The `start` for the counting_transform_iterator must be size_type or smaller type"); + return thrust::make_transform_iterator(thrust::make_counting_iterator(start), f); } diff --git a/cpp/include/cudf/detail/utilities/batched_memset.hpp b/cpp/include/cudf/detail/utilities/batched_memset.hpp index 75f738f7529..78be5b91248 100644 --- a/cpp/include/cudf/detail/utilities/batched_memset.hpp +++ b/cpp/include/cudf/detail/utilities/batched_memset.hpp @@ -53,8 +53,8 @@ void batched_memset(std::vector> const& bufs, cudf::detail::make_device_uvector_async(bufs, stream, cudf::get_current_device_resource_ref()); // get a vector with the sizes of all buffers - auto sizes = cudf::detail::make_counting_transform_iterator( - static_cast(0), + auto sizes = thrust::make_transform_iterator( + thrust::counting_iterator(0), cuda::proclaim_return_type( [gpu_bufs = gpu_bufs.data()] __device__(std::size_t i) { return gpu_bufs[i].size(); })); diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index fb0b25cf9f1..de2f1770e28 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -65,19 +65,20 @@ rmm::device_uvector make_chars_buffer(column_view const& offsets, auto chars_data = rmm::device_uvector(chars_size, stream, mr); auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets); - auto const src_ptrs = cudf::detail::make_counting_transform_iterator( - 0u, cuda::proclaim_return_type([begin] __device__(uint32_t idx) { + auto const src_ptrs = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type([begin] __device__(uint32_t idx) { // Due to a bug in cub (https://github.com/NVIDIA/cccl/issues/586), // we have to use `const_cast` to remove `const` qualifier from the source pointer. // This should be fine as long as we only read but not write anything to the source. 
return reinterpret_cast(const_cast(begin[idx].first)); })); - auto const src_sizes = cudf::detail::make_counting_transform_iterator( - 0u, cuda::proclaim_return_type([begin] __device__(uint32_t idx) { - return begin[idx].second; - })); - auto const dst_ptrs = cudf::detail::make_counting_transform_iterator( - 0u, + auto const src_sizes = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [begin] __device__(uint32_t idx) { return begin[idx].second; })); + auto const dst_ptrs = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), cuda::proclaim_return_type([offsets = d_offsets, output = chars_data.data()] __device__( uint32_t idx) { return output + offsets[idx]; })); diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index b3276c81c1f..0d24fa4236f 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -21,6 +21,7 @@ #include +#include #include namespace cudf::io::parquet::detail { @@ -476,9 +477,9 @@ void WriteFinalOffsets(host_span offsets, auto d_src_data = cudf::detail::make_device_uvector_async( offsets, stream, cudf::get_current_device_resource_ref()); // Iterator for the source (scalar) data - auto src_iter = cudf::detail::make_counting_transform_iterator( - static_cast(0), - cuda::proclaim_return_type( + auto src_iter = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( [src = d_src_data.begin()] __device__(std::size_t i) { return src + i; })); // Copy buffer addresses to device and create an iterator diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 5138a92ac14..60e35465793 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -1629,10 +1629,10 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num get_page_nesting_size{ d_cols_info.data(), max_depth, subpass.pages.size(), subpass.pages.device_begin()}); - // Manually create a int64_t `key_start` compatible counting_transform_iterator to avoid - // implicit casting to size_type. - auto const reduction_keys = thrust::make_transform_iterator( - thrust::make_counting_iterator(key_start), get_reduction_key{subpass.pages.size()}); + // Manually create a size_t `key_start` compatible counting_transform_iterator. 
+ auto const reduction_keys = + thrust::make_transform_iterator(thrust::make_counting_iterator(key_start), + get_reduction_key{subpass.pages.size()}); // Find the size of each column thrust::reduce_by_key(rmm::exec_policy_nosync(_stream), diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 8ad1fea649d..5f1aea71f73 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1358,10 +1358,11 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) int64_t constexpr total_rows = num_rows * num_reps; static_assert(total_rows > std::numeric_limits::max()); - auto const it = cudf::detail::make_counting_transform_iterator(0l, [num_rows](int64_t i) { - return (i % num_rows) % static_cast(std::numeric_limits::max() / 2); - }); - auto const col = data_col(it, it + num_rows); + auto const it = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [num_rows](int64_t i) { + return (i % num_rows) % static_cast(std::numeric_limits::max() / 2); + }); + auto const col = data_col(it, it + num_rows); auto const chunk_table = cudf::table_view{{col}}; std::vector data_buffer; From 637e3206a4656bd38636f3fadf3c4573c7bc906a Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 22 Oct 2024 10:28:07 +0100 Subject: [PATCH 097/117] Unify treatment of `Expr` and `IR` nodes in cudf-polars DSL (#17016) As part of in-progress multi-GPU work, we will likely want to: 1. Introduce additional nodes into the `IR` namespace; 2. Implement rewrite rules for `IR` trees to express needed communication patterns; 3. Write visitors that translate expressions into an appropriate description for whichever multi-GPU approach we end up taking. It was already straightforward to write generic visitors for `Expr` nodes, since those uniformly have a `.children` property for their dependents. In contrast, the `IR` nodes were more ad-hoc. To solve this, pull out the generic implementation from `Expr` into an abstract `Node` class. Now `Expr` nodes just inherit from this, and `IR` nodes do so similarly. Redoing the `IR` nodes is a little painful because we want to make them hashable, so we have to provide a bunch of custom `get_hashable` implementations (the schema dict, for example, is not hashable). With these generic facilities in place, we can now implement traversal and visitor infrastructure. Specifically, we provide: - a mechanism for pre-order traversal of an expression DAG, yielding each unique node exactly once. This is useful if one wants to know if an expression contains some particular node; - a mechanism for writing recursive visitors and then wrapping a caching scheme around the outside. This is useful for rewrites. Some example usages are shown in tests. 
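For instance (a minimal sketch, not part of the changes in this PR): assuming the new `cudf_polars.dsl.traversal` module exposes the `traversal` generator and `CachingVisitor` wrapper described above, a containment check and a column-renaming rewrite could look like the following; the `rename` function and the column names are purely illustrative.

```python
# Illustrative sketch only: `traversal` and `CachingVisitor` are assumed to
# be the helpers added in this PR; the rewrite itself is hypothetical.
import pylibcudf as plc

from cudf_polars.dsl import expr
from cudf_polars.dsl.traversal import CachingVisitor, traversal

dt = plc.DataType(plc.TypeId.INT8)
e = expr.BinOp(
    dt, plc.binaryop.BinaryOperator.ADD, expr.Col(dt, "a"), expr.Col(dt, "b")
)

# Pre-order traversal yields each unique node exactly once, so checking
# whether an expression references a given column is a single pass.
uses_a = any(node == expr.Col(dt, "a") for node in traversal(e))


def rename(node: expr.Expr, fn) -> expr.Expr:
    # Replace references to column "a" with "c"; every other node is
    # rebuilt generically via Node.reconstruct with rewritten children.
    if isinstance(node, expr.Col) and node.name == "a":
        return expr.Col(node.dtype, "c")
    return node.reconstruct([fn(child) for child in node.children])


# CachingVisitor memoises the recursive rewrite, so shared subexpressions
# in the DAG are visited and rebuilt only once.
renamed = CachingVisitor(rename)(e)
```

Since `IR` nodes now derive from the same `Node` base class, the identical traversal and rewrite machinery applies to plan trees as well as expression trees.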
Authors: - Lawrence Mitchell (https://github.com/wence-) - Richard (Rick) Zamora (https://github.com/rjzamora) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Murray (https://github.com/Matt711) - Richard (Rick) Zamora (https://github.com/rjzamora) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17016 --- .../dsl/expressions/aggregation.py | 5 +- .../cudf_polars/dsl/expressions/base.py | 97 +--- .../cudf_polars/dsl/expressions/binaryop.py | 5 +- .../cudf_polars/dsl/expressions/boolean.py | 5 +- .../cudf_polars/dsl/expressions/datetime.py | 5 +- .../cudf_polars/dsl/expressions/literal.py | 14 +- .../cudf_polars/dsl/expressions/rolling.py | 10 +- .../cudf_polars/dsl/expressions/selection.py | 10 +- .../cudf_polars/dsl/expressions/sorting.py | 10 +- .../cudf_polars/dsl/expressions/string.py | 5 +- .../cudf_polars/dsl/expressions/ternary.py | 5 +- .../cudf_polars/dsl/expressions/unary.py | 14 +- python/cudf_polars/cudf_polars/dsl/ir.py | 527 ++++++++++++------ .../cudf_polars/cudf_polars/dsl/nodebase.py | 152 +++++ .../cudf_polars/cudf_polars/dsl/translate.py | 40 +- .../cudf_polars/cudf_polars/dsl/traversal.py | 175 ++++++ .../cudf_polars/typing/__init__.py | 50 +- python/cudf_polars/docs/overview.md | 229 +++++++- python/cudf_polars/pyproject.toml | 2 +- python/cudf_polars/tests/dsl/test_expr.py | 21 + .../cudf_polars/tests/dsl/test_traversal.py | 229 ++++++++ python/cudf_polars/tests/test_config.py | 6 +- 22 files changed, 1253 insertions(+), 363 deletions(-) create mode 100644 python/cudf_polars/cudf_polars/dsl/nodebase.py create mode 100644 python/cudf_polars/cudf_polars/dsl/traversal.py create mode 100644 python/cudf_polars/tests/dsl/test_traversal.py diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py index b8b18ec5039..41b1defab39 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py @@ -30,14 +30,13 @@ class Agg(Expr): - __slots__ = ("name", "options", "op", "request", "children") + __slots__ = ("name", "options", "op", "request") _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] def __init__( self, dtype: plc.DataType, name: str, options: Any, *children: Expr ) -> None: - super().__init__(dtype) + self.dtype = dtype self.name = name self.options = options self.children = children diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/base.py b/python/cudf_polars/cudf_polars/dsl/expressions/base.py index 8d021b0231d..effe8cb2378 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/base.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/base.py @@ -13,9 +13,10 @@ import pylibcudf as plc from cudf_polars.containers import Column +from cudf_polars.dsl.nodebase import Node if TYPE_CHECKING: - from collections.abc import Mapping, Sequence + from collections.abc import Mapping from cudf_polars.containers import Column, DataFrame @@ -32,100 +33,16 @@ class ExecutionContext(IntEnum): ROLLING = enum.auto() -class Expr: - """ - An abstract expression object. +class Expr(Node["Expr"]): + """An abstract expression object.""" - This contains a (potentially empty) tuple of child expressions, - along with non-child data. For uniform reconstruction and - implementation of hashing and equality schemes, child classes need - to provide a certain amount of metadata when they are defined. 
- Specifically, the ``_non_child`` attribute must list, in-order, - the names of the slots that are passed to the constructor. The - constructor must take arguments in the order ``(*_non_child, - *children).`` - """ - - __slots__ = ("dtype", "_hash_value", "_repr_value") + __slots__ = ("dtype",) dtype: plc.DataType """Data type of the expression.""" - _hash_value: int - """Caching slot for the hash of the expression.""" - _repr_value: str - """Caching slot for repr of the expression.""" - children: tuple[Expr, ...] = () - """Children of the expression.""" + # This annotation is needed because of https://github.com/python/mypy/issues/17981 _non_child: ClassVar[tuple[str, ...]] = ("dtype",) """Names of non-child data (not Exprs) for reconstruction.""" - # Constructor must take arguments in order (*_non_child, *children) - def __init__(self, dtype: plc.DataType) -> None: - self.dtype = dtype - - def _ctor_arguments(self, children: Sequence[Expr]) -> Sequence: - return (*(getattr(self, attr) for attr in self._non_child), *children) - - def get_hash(self) -> int: - """ - Return the hash of this expr. - - Override this in subclasses, rather than __hash__. - - Returns - ------- - The integer hash value. - """ - return hash((type(self), self._ctor_arguments(self.children))) - - def __hash__(self) -> int: - """Hash of an expression with caching.""" - try: - return self._hash_value - except AttributeError: - self._hash_value = self.get_hash() - return self._hash_value - - def is_equal(self, other: Any) -> bool: - """ - Equality of two expressions. - - Override this in subclasses, rather than __eq__. - - Parameter - --------- - other - object to compare to - - Returns - ------- - True if the two expressions are equal, false otherwise. - """ - if type(self) is not type(other): - return False # pragma: no cover; __eq__ trips first - return self._ctor_arguments(self.children) == other._ctor_arguments( - other.children - ) - - def __eq__(self, other: Any) -> bool: - """Equality of expressions.""" - if type(self) is not type(other) or hash(self) != hash(other): - return False - else: - return self.is_equal(other) - - def __ne__(self, other: Any) -> bool: - """Inequality of expressions.""" - return not self.__eq__(other) - - def __repr__(self) -> str: - """String representation of an expression with caching.""" - try: - return self._repr_value - except AttributeError: - args = ", ".join(f"{arg!r}" for arg in self._ctor_arguments(self.children)) - self._repr_value = f"{type(self).__name__}({args})" - return self._repr_value - def do_evaluate( self, df: DataFrame, @@ -311,11 +228,11 @@ class Col(Expr): __slots__ = ("name",) _non_child = ("dtype", "name") name: str - children: tuple[()] def __init__(self, dtype: plc.DataType, name: str) -> None: self.dtype = dtype self.name = name + self.children = () def do_evaluate( self, diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py index 19baae3611d..11a47e7ea51 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py @@ -24,9 +24,8 @@ class BinOp(Expr): - __slots__ = ("op", "children") + __slots__ = ("op",) _non_child = ("dtype", "op") - children: tuple[Expr, Expr] def __init__( self, @@ -35,7 +34,7 @@ def __init__( left: Expr, right: Expr, ) -> None: - super().__init__(dtype) + self.dtype = dtype if plc.traits.is_boolean(self.dtype): # For boolean output types, bitand and bitor implement # boolean logic, 
so translate. bitxor also does, but the diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py index ff9973a47d5..9c14a8386f3 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py @@ -31,9 +31,8 @@ class BooleanFunction(Expr): - __slots__ = ("name", "options", "children") + __slots__ = ("name", "options") _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] def __init__( self, @@ -42,7 +41,7 @@ def __init__( options: tuple[Any, ...], *children: Expr, ) -> None: - super().__init__(dtype) + self.dtype = dtype self.options = options self.name = name self.children = children diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py index f752a23b628..596e193d8fe 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py @@ -25,7 +25,7 @@ class TemporalFunction(Expr): - __slots__ = ("name", "options", "children") + __slots__ = ("name", "options") _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = { pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR, pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH, @@ -39,7 +39,6 @@ class TemporalFunction(Expr): pl_expr.TemporalFunction.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND, } _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] def __init__( self, @@ -48,7 +47,7 @@ def __init__( options: tuple[Any, ...], *children: Expr, ) -> None: - super().__init__(dtype) + self.dtype = dtype self.options = options self.name = name self.children = children diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py index 562a2255033..c8aa993b994 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py @@ -16,7 +16,7 @@ from cudf_polars.utils import dtypes if TYPE_CHECKING: - from collections.abc import Mapping + from collections.abc import Hashable, Mapping import pyarrow as pa @@ -31,12 +31,12 @@ class Literal(Expr): __slots__ = ("value",) _non_child = ("dtype", "value") value: pa.Scalar[Any] - children: tuple[()] def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None: - super().__init__(dtype) + self.dtype = dtype assert value.type == plc.interop.to_arrow(dtype) self.value = value + self.children = () def do_evaluate( self, @@ -58,19 +58,19 @@ class LiteralColumn(Expr): __slots__ = ("value",) _non_child = ("dtype", "value") value: pa.Array[Any, Any] - children: tuple[()] def __init__(self, dtype: plc.DataType, value: pl.Series) -> None: - super().__init__(dtype) + self.dtype = dtype data = value.to_arrow() self.value = data.cast(dtypes.downcast_arrow_lists(data.type)) + self.children = () - def get_hash(self) -> int: + def get_hashable(self) -> Hashable: """Compute a hash of the column.""" # This is stricter than necessary, but we only need this hash # for identity in groupby replacements so it's OK. And this # way we avoid doing potentially expensive compute. 
- return hash((type(self), self.dtype, id(self.value))) + return (type(self), self.dtype, id(self.value)) def do_evaluate( self, diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py index f7dcc3c542c..fa68bcb9426 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py @@ -17,24 +17,22 @@ class RollingWindow(Expr): - __slots__ = ("options", "children") + __slots__ = ("options",) _non_child = ("dtype", "options") - children: tuple[Expr] def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None: - super().__init__(dtype) + self.dtype = dtype self.options = options self.children = (agg,) raise NotImplementedError("Rolling window not implemented") class GroupedRollingWindow(Expr): - __slots__ = ("options", "children") + __slots__ = ("options",) _non_child = ("dtype", "options") - children: tuple[Expr, ...] def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr) -> None: - super().__init__(dtype) + self.dtype = dtype self.options = options self.children = (agg, *by) raise NotImplementedError("Grouped rolling window not implemented") diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py index a7a3e68a28c..0247256e507 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py @@ -23,12 +23,11 @@ class Gather(Expr): - __slots__ = ("children",) + __slots__ = () _non_child = ("dtype",) - children: tuple[Expr, Expr] def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr) -> None: - super().__init__(dtype) + self.dtype = dtype self.children = (values, indices) def do_evaluate( @@ -65,12 +64,11 @@ def do_evaluate( class Filter(Expr): - __slots__ = ("children",) + __slots__ = () _non_child = ("dtype",) - children: tuple[Expr, Expr] def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): - super().__init__(dtype) + self.dtype = dtype self.children = (values, indices) def do_evaluate( diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py b/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py index 861b73ce6a0..99512e2ef52 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py @@ -23,14 +23,13 @@ class Sort(Expr): - __slots__ = ("options", "children") + __slots__ = ("options",) _non_child = ("dtype", "options") - children: tuple[Expr] def __init__( self, dtype: plc.DataType, options: tuple[bool, bool, bool], column: Expr ) -> None: - super().__init__(dtype) + self.dtype = dtype self.options = options self.children = (column,) @@ -59,9 +58,8 @@ def do_evaluate( class SortBy(Expr): - __slots__ = ("options", "children") + __slots__ = ("options",) _non_child = ("dtype", "options") - children: tuple[Expr, ...] 
def __init__( self, @@ -70,7 +68,7 @@ def __init__( column: Expr, *by: Expr, ) -> None: - super().__init__(dtype) + self.dtype = dtype self.options = options self.children = (column, *by) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py index 6669669aadc..62b54c63a8d 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -28,9 +28,8 @@ class StringFunction(Expr): - __slots__ = ("name", "options", "children", "_regex_program") + __slots__ = ("name", "options", "_regex_program") _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] def __init__( self, @@ -39,7 +38,7 @@ def __init__( options: tuple[Any, ...], *children: Expr, ) -> None: - super().__init__(dtype) + self.dtype = dtype self.options = options self.name = name self.children = children diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py index c7d7a802ded..d2b5d6bae29 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py @@ -26,14 +26,13 @@ class Ternary(Expr): - __slots__ = ("children",) + __slots__ = () _non_child = ("dtype",) - children: tuple[Expr, Expr, Expr] def __init__( self, dtype: plc.DataType, when: Expr, then: Expr, otherwise: Expr ) -> None: - super().__init__(dtype) + self.dtype = dtype self.children = (when, then, otherwise) def do_evaluate( diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py index 3d4d15be1ce..53f6ed29239 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -26,12 +26,11 @@ class Cast(Expr): """Class representing a cast of an expression.""" - __slots__ = ("children",) + __slots__ = () _non_child = ("dtype",) - children: tuple[Expr] def __init__(self, dtype: plc.DataType, value: Expr) -> None: - super().__init__(dtype) + self.dtype = dtype self.children = (value,) if not dtypes.can_cast(value.dtype, self.dtype): raise NotImplementedError( @@ -60,7 +59,9 @@ def collect_agg(self, *, depth: int) -> AggInfo: class Len(Expr): """Class representing the length of an expression.""" - children: tuple[()] + def __init__(self, dtype: plc.DataType) -> None: + self.dtype = dtype + self.children = () def do_evaluate( self, @@ -90,9 +91,8 @@ def collect_agg(self, *, depth: int) -> AggInfo: class UnaryFunction(Expr): """Class representing unary functions of an expression.""" - __slots__ = ("name", "options", "children") + __slots__ = ("name", "options") _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] 
# Note: log, and pow are handled via translation to binops _OP_MAPPING: ClassVar[dict[str, plc.unary.UnaryOperator]] = { @@ -142,7 +142,7 @@ class UnaryFunction(Expr): def __init__( self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr ) -> None: - super().__init__(dtype) + self.dtype = dtype self.name = name self.options = options self.children = children diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index e319c363a23..eb93929cf61 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -13,8 +13,8 @@ from __future__ import annotations -import dataclasses import itertools +import json from functools import cache from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar @@ -27,10 +27,11 @@ import cudf_polars.dsl.expr as expr from cudf_polars.containers import Column, DataFrame -from cudf_polars.utils import dtypes, sorting +from cudf_polars.dsl.nodebase import Node +from cudf_polars.utils import dtypes if TYPE_CHECKING: - from collections.abc import Callable, MutableMapping + from collections.abc import Callable, Hashable, MutableMapping, Sequence from typing import Literal from cudf_polars.typing import Schema @@ -121,16 +122,27 @@ def broadcast(*columns: Column, target_length: int | None = None) -> list[Column ] -@dataclasses.dataclass -class IR: +class IR(Node["IR"]): """Abstract plan node, representing an unevaluated dataframe.""" + __slots__ = ("schema",) + # This annotation is needed because of https://github.com/python/mypy/issues/17981 + _non_child: ClassVar[tuple[str, ...]] = ("schema",) schema: Schema """Mapping from column names to their data types.""" - def __post_init__(self): - """Validate preconditions.""" - pass # noqa: PIE790 + def get_hashable(self) -> Hashable: + """ + Hashable representation of node, treating schema dictionary. + + Since the schema is a dictionary, even though it is morally + immutable, it is not hashable. We therefore convert it to + tuples for hashing purposes. + """ + # Schema is the first constructor argument + args = self._ctor_arguments(self.children)[1:] + schema_hash = tuple(self.schema.items()) + return (type(self), schema_hash, args) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """ @@ -159,24 +171,50 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ) # pragma: no cover -@dataclasses.dataclass class PythonScan(IR): """Representation of input from a python function.""" + __slots__ = ("options", "predicate") + _non_child = ("schema", "options", "predicate") options: Any """Arbitrary options.""" predicate: expr.NamedExpr | None """Filter to apply to the constructed dataframe before returning it.""" - def __post_init__(self): - """Validate preconditions.""" + def __init__(self, schema: Schema, options: Any, predicate: expr.NamedExpr | None): + self.schema = schema + self.options = options + self.predicate = predicate + self.children = () raise NotImplementedError("PythonScan not implemented") -@dataclasses.dataclass class Scan(IR): """Input from files.""" + __slots__ = ( + "typ", + "reader_options", + "cloud_options", + "paths", + "with_columns", + "skip_rows", + "n_rows", + "row_index", + "predicate", + ) + _non_child = ( + "schema", + "typ", + "reader_options", + "cloud_options", + "paths", + "with_columns", + "skip_rows", + "n_rows", + "row_index", + "predicate", + ) typ: str """What type of file are we reading? 
Parquet, CSV, etc...""" reader_options: dict[str, Any] @@ -185,7 +223,7 @@ class Scan(IR): """Cloud-related authentication options, currently ignored.""" paths: list[str] """List of paths to read from.""" - with_columns: list[str] + with_columns: list[str] | None """Projected columns to return.""" skip_rows: int """Rows to skip at the start when reading.""" @@ -196,9 +234,30 @@ class Scan(IR): predicate: expr.NamedExpr | None """Mask to apply to the read dataframe.""" - def __post_init__(self) -> None: - """Validate preconditions.""" - super().__post_init__() + def __init__( + self, + schema: Schema, + typ: str, + reader_options: dict[str, Any], + cloud_options: dict[str, Any] | None, + paths: list[str], + with_columns: list[str] | None, + skip_rows: int, + n_rows: int, + row_index: tuple[str, int] | None, + predicate: expr.NamedExpr | None, + ): + self.schema = schema + self.typ = typ + self.reader_options = reader_options + self.cloud_options = cloud_options + self.paths = paths + self.with_columns = with_columns + self.skip_rows = skip_rows + self.n_rows = n_rows + self.row_index = row_index + self.predicate = predicate + self.children = () if self.typ not in ("csv", "parquet", "ndjson"): # pragma: no cover # This line is unhittable ATM since IPC/Anonymous scan raise # on the polars side @@ -258,6 +317,28 @@ def __post_init__(self) -> None: "Reading only parquet metadata to produce row index." ) + def get_hashable(self) -> Hashable: + """ + Hashable representation of the node. + + The options dictionaries are serialised for hashing purposes + as json strings. + """ + schema_hash = tuple(self.schema.items()) + return ( + type(self), + schema_hash, + self.typ, + json.dumps(self.reader_options), + json.dumps(self.cloud_options), + tuple(self.paths), + tuple(self.with_columns) if self.with_columns is not None else None, + self.skip_rows, + self.n_rows, + self.row_index, + self.predicate, + ) + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" with_columns = self.with_columns @@ -401,7 +482,6 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return df.filter(mask) -@dataclasses.dataclass class Cache(IR): """ Return a cached plan node. @@ -409,20 +489,25 @@ class Cache(IR): Used for CSE at the plan level. """ + __slots__ = ("key",) + _non_child = ("schema", "key") key: int """The cache key.""" - value: IR - """The unevaluated node to cache.""" + + def __init__(self, schema: Schema, key: int, value: IR): + self.schema = schema + self.key = key + self.children = (value,) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" try: return cache[self.key] except KeyError: - return cache.setdefault(self.key, self.value.evaluate(cache=cache)) + (value,) = self.children + return cache.setdefault(self.key, value.evaluate(cache=cache)) -@dataclasses.dataclass class DataFrameScan(IR): """ Input from an existing polars DataFrame. @@ -430,13 +515,38 @@ class DataFrameScan(IR): This typically arises from ``q.collect().lazy()`` """ + __slots__ = ("df", "projection", "predicate") + _non_child = ("schema", "df", "projection", "predicate") df: Any """Polars LazyFrame object.""" - projection: list[str] + projection: tuple[str, ...] 
| None """List of columns to project out.""" predicate: expr.NamedExpr | None """Mask to apply.""" + def __init__( + self, + schema: Schema, + df: Any, + projection: Sequence[str] | None, + predicate: expr.NamedExpr | None, + ): + self.schema = schema + self.df = df + self.projection = tuple(projection) if projection is not None else None + self.predicate = predicate + self.children = () + + def get_hashable(self) -> Hashable: + """ + Hashable representation of the node. + + The (heavy) dataframe object is hashed as its id, so this is + not stable across runs, or repeat instances of the same equal dataframes. + """ + schema_hash = tuple(self.schema.items()) + return (type(self), schema_hash, id(self.df), self.projection, self.predicate) + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" pdf = pl.DataFrame._from_pydf(self.df) @@ -454,28 +564,39 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return df -@dataclasses.dataclass class Select(IR): """Produce a new dataframe selecting given expressions from an input.""" - df: IR - """Input dataframe.""" - expr: list[expr.NamedExpr] + __slots__ = ("exprs", "should_broadcast") + _non_child = ("schema", "exprs", "should_broadcast") + exprs: tuple[expr.NamedExpr, ...] """List of expressions to evaluate to form the new dataframe.""" should_broadcast: bool """Should columns be broadcast?""" + def __init__( + self, + schema: Schema, + exprs: Sequence[expr.NamedExpr], + should_broadcast: bool, # noqa: FBT001 + df: IR, + ): + self.schema = schema + self.exprs = tuple(exprs) + self.should_broadcast = should_broadcast + self.children = (df,) + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) # Handle any broadcasting - columns = [e.evaluate(df) for e in self.expr] + columns = [e.evaluate(df) for e in self.exprs] if self.should_broadcast: columns = broadcast(*columns) return DataFrame(columns) -@dataclasses.dataclass class Reduce(IR): """ Produce a new dataframe selecting given expressions from an input. @@ -483,36 +604,73 @@ class Reduce(IR): This is a special case of :class:`Select` where all outputs are a single row. """ - df: IR - """Input dataframe.""" - expr: list[expr.NamedExpr] + __slots__ = ("exprs",) + _non_child = ("schema", "exprs") + exprs: tuple[expr.NamedExpr, ...] 
"""List of expressions to evaluate to form the new dataframe.""" + def __init__( + self, schema: Schema, exprs: Sequence[expr.NamedExpr], df: IR + ): # pragma: no cover; polars doesn't emit this node yet + self.schema = schema + self.exprs = tuple(exprs) + self.children = (df,) + def evaluate( self, *, cache: MutableMapping[int, DataFrame] ) -> DataFrame: # pragma: no cover; polars doesn't emit this node yet """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) - columns = broadcast(*(e.evaluate(df) for e in self.expr)) + (child,) = self.children + df = child.evaluate(cache=cache) + columns = broadcast(*(e.evaluate(df) for e in self.exprs)) assert all(column.obj.size() == 1 for column in columns) return DataFrame(columns) -@dataclasses.dataclass class GroupBy(IR): """Perform a groupby.""" - df: IR - """Input dataframe.""" - agg_requests: list[expr.NamedExpr] - """List of expressions to evaluate groupwise.""" - keys: list[expr.NamedExpr] - """List of expressions forming the keys.""" + __slots__ = ( + "agg_requests", + "keys", + "maintain_order", + "options", + "agg_infos", + ) + _non_child = ("schema", "keys", "agg_requests", "maintain_order", "options") + keys: tuple[expr.NamedExpr, ...] + """Grouping keys.""" + agg_requests: tuple[expr.NamedExpr, ...] + """Aggregation expressions.""" maintain_order: bool - """Should the order of the input dataframe be maintained?""" + """Preserve order in groupby.""" options: Any - """Options controlling style of groupby.""" - agg_infos: list[expr.AggInfo] = dataclasses.field(init=False) + """Arbitrary options.""" + + def __init__( + self, + schema: Schema, + keys: Sequence[expr.NamedExpr], + agg_requests: Sequence[expr.NamedExpr], + maintain_order: bool, # noqa: FBT001 + options: Any, + df: IR, + ): + self.schema = schema + self.keys = tuple(keys) + self.agg_requests = tuple(agg_requests) + self.maintain_order = maintain_order + self.options = options + self.children = (df,) + if self.options.rolling: + raise NotImplementedError( + "rolling window/groupby" + ) # pragma: no cover; rollingwindow constructor has already raised + if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests): + raise NotImplementedError("Nested aggregations in groupby") + self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] + if len(self.keys) == 0: + raise NotImplementedError("dynamic groupby") @staticmethod def check_agg(agg: expr.Expr) -> int: @@ -542,22 +700,10 @@ def check_agg(agg: expr.Expr) -> int: else: raise NotImplementedError(f"No handler for {agg=}") - def __post_init__(self) -> None: - """Check whether all the aggregations are implemented.""" - super().__post_init__() - if self.options.rolling: - raise NotImplementedError( - "rolling window/groupby" - ) # pragma: no cover; rollingwindow constructor has already raised - if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests): - raise NotImplementedError("Nested aggregations in groupby") - self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] - if len(self.keys) == 0: - raise NotImplementedError("dynamic groupby") - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) keys = broadcast( *(k.evaluate(df) for k in self.keys), target_length=df.num_rows ) @@ -646,17 +792,14 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return 
DataFrame(broadcasted).slice(self.options.slice) -@dataclasses.dataclass class Join(IR): """A join of two dataframes.""" - left: IR - """Left frame.""" - right: IR - """Right frame.""" - left_on: list[expr.NamedExpr] + __slots__ = ("left_on", "right_on", "options") + _non_child = ("schema", "left_on", "right_on", "options") + left_on: tuple[expr.NamedExpr, ...] """List of expressions used as keys in the left frame.""" - right_on: list[expr.NamedExpr] + right_on: tuple[expr.NamedExpr, ...] """List of expressions used as keys in the right frame.""" options: tuple[ Literal["inner", "left", "right", "full", "leftsemi", "leftanti", "cross"], @@ -674,9 +817,20 @@ class Join(IR): - coalesce: should key columns be coalesced (only makes sense for outer joins) """ - def __post_init__(self) -> None: - """Validate preconditions.""" - super().__post_init__() + def __init__( + self, + schema: Schema, + left_on: Sequence[expr.NamedExpr], + right_on: Sequence[expr.NamedExpr], + options: Any, + left: IR, + right: IR, + ): + self.schema = schema + self.left_on = tuple(left_on) + self.right_on = tuple(right_on) + self.options = options + self.children = (left, right) if any( isinstance(e.value, expr.Literal) for e in itertools.chain(self.left_on, self.right_on) @@ -777,8 +931,7 @@ def _reorder_maps( def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - left = self.left.evaluate(cache=cache) - right = self.right.evaluate(cache=cache) + left, right = (c.evaluate(cache=cache) for c in self.children) how, join_nulls, zlice, suffix, coalesce = self.options suffix = "_right" if suffix is None else suffix if how == "cross": @@ -866,20 +1019,30 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return result.slice(zlice) -@dataclasses.dataclass class HStack(IR): """Add new columns to a dataframe.""" - df: IR - """Input dataframe.""" - columns: list[expr.NamedExpr] - """List of expressions to produce new columns.""" + __slots__ = ("columns", "should_broadcast") + _non_child = ("schema", "columns", "should_broadcast") should_broadcast: bool - """Should columns be broadcast?""" + """Should the resulting evaluated columns be broadcast to the same length.""" + + def __init__( + self, + schema: Schema, + columns: Sequence[expr.NamedExpr], + should_broadcast: bool, # noqa: FBT001 + df: IR, + ): + self.schema = schema + self.columns = tuple(columns) + self.should_broadcast = should_broadcast + self.children = (df,) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) columns = [c.evaluate(df) for c in self.columns] if self.should_broadcast: columns = broadcast(*columns, target_length=df.num_rows) @@ -895,20 +1058,36 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return df.with_columns(columns) -@dataclasses.dataclass class Distinct(IR): """Produce a new dataframe with distinct rows.""" - df: IR - """Input dataframe.""" + __slots__ = ("keep", "subset", "zlice", "stable") + _non_child = ("schema", "keep", "subset", "zlice", "stable") keep: plc.stream_compaction.DuplicateKeepOption - """Which rows to keep.""" - subset: set[str] | None - """Which columns to inspect when computing distinct rows.""" + """Which distinct value to keep.""" + subset: frozenset[str] | None + """Which columns should be used to define distinctness. 
If None, + then all columns are used.""" zlice: tuple[int, int] | None - """Optional slice to perform after compaction.""" + """Optional slice to apply to the result.""" stable: bool - """Should order be preserved?""" + """Should the result maintain ordering.""" + + def __init__( + self, + schema: Schema, + keep: plc.stream_compaction.DuplicateKeepOption, + subset: frozenset[str] | None, + zlice: tuple[int, int] | None, + stable: bool, # noqa: FBT001 + df: IR, + ): + self.schema = schema + self.keep = keep + self.subset = subset + self.zlice = zlice + self.stable = stable + self.children = (df,) _KEEP_MAP: ClassVar[dict[str, plc.stream_compaction.DuplicateKeepOption]] = { "first": plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, @@ -917,18 +1096,10 @@ class Distinct(IR): "any": plc.stream_compaction.DuplicateKeepOption.KEEP_ANY, } - def __init__(self, schema: Schema, df: IR, options: Any) -> None: - self.schema = schema - self.df = df - (keep, subset, maintain_order, zlice) = options - self.keep = Distinct._KEEP_MAP[keep] - self.subset = set(subset) if subset is not None else None - self.stable = maintain_order - self.zlice = zlice - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) if self.subset is None: indices = list(range(df.num_columns)) keys_sorted = all(c.is_sorted for c in df.column_map.values()) @@ -967,46 +1138,44 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return result.slice(self.zlice) -@dataclasses.dataclass class Sort(IR): """Sort a dataframe.""" - df: IR - """Input.""" - by: list[expr.NamedExpr] - """List of expressions to produce sort keys.""" - do_sort: Callable[..., plc.Table] - """pylibcudf sorting function.""" + __slots__ = ("by", "order", "null_order", "stable", "zlice") + _non_child = ("schema", "by", "order", "null_order", "stable", "zlice") + by: tuple[expr.NamedExpr, ...] + """Sort keys.""" + order: tuple[plc.types.Order, ...] + """Sort order for each sort key.""" + null_order: tuple[plc.types.NullOrder, ...] 
+ """Null sorting location for each sort key.""" + stable: bool + """Should the sort be stable?""" zlice: tuple[int, int] | None - """Optional slice to apply after sorting.""" - order: list[plc.types.Order] - """Order keys should be sorted in.""" - null_order: list[plc.types.NullOrder] - """Where nulls sort to.""" + """Optional slice to apply to the result.""" def __init__( self, schema: Schema, - df: IR, - by: list[expr.NamedExpr], - options: Any, + by: Sequence[expr.NamedExpr], + order: Sequence[plc.types.Order], + null_order: Sequence[plc.types.NullOrder], + stable: bool, # noqa: FBT001 zlice: tuple[int, int] | None, - ) -> None: + df: IR, + ): self.schema = schema - self.df = df - self.by = by + self.by = tuple(by) + self.order = tuple(order) + self.null_order = tuple(null_order) + self.stable = stable self.zlice = zlice - stable, nulls_last, descending = options - self.order, self.null_order = sorting.sort_order( - descending, nulls_last=nulls_last, num_keys=len(by) - ) - self.do_sort = ( - plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key - ) + self.children = (df,) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) sort_keys = broadcast( *(k.evaluate(df) for k in self.by), target_length=df.num_rows ) @@ -1016,11 +1185,14 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: for i, k in enumerate(sort_keys) if k.name in df.column_map and k.obj is df.column_map[k.name].obj } - table = self.do_sort( + do_sort = ( + plc.sorting.stable_sort_by_key if self.stable else plc.sorting.sort_by_key + ) + table = do_sort( df.table, plc.Table([k.obj for k in sort_keys]), - self.order, - self.null_order, + list(self.order), + list(self.null_order), ) columns: list[Column] = [] for name, c in zip(df.column_map, table.columns(), strict=True): @@ -1037,49 +1209,64 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return DataFrame(columns).slice(self.zlice) -@dataclasses.dataclass class Slice(IR): """Slice a dataframe.""" - df: IR - """Input.""" + __slots__ = ("offset", "length") + _non_child = ("schema", "offset", "length") offset: int """Start of the slice.""" length: int """Length of the slice.""" + def __init__(self, schema: Schema, offset: int, length: int, df: IR): + self.schema = schema + self.offset = offset + self.length = length + self.children = (df,) + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) return df.slice((self.offset, self.length)) -@dataclasses.dataclass class Filter(IR): """Filter a dataframe with a boolean mask.""" - df: IR - """Input.""" + __slots__ = ("mask",) + _non_child = ("schema", "mask") mask: expr.NamedExpr - """Expression evaluating to a mask.""" + """Expression to produce the filter mask.""" + + def __init__(self, schema: Schema, mask: expr.NamedExpr, df: IR): + self.schema = schema + self.mask = mask + self.children = (df,) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) (mask,) = broadcast(self.mask.evaluate(df), target_length=df.num_rows) return df.filter(mask) -@dataclasses.dataclass class Projection(IR): 
"""Select a subset of columns from a dataframe.""" - df: IR - """Input.""" + __slots__ = () + _non_child = ("schema",) + + def __init__(self, schema: Schema, df: IR): + self.schema = schema + self.children = (df,) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) # This can reorder things. columns = broadcast( *(df.column_map[name] for name in self.schema), target_length=df.num_rows @@ -1087,16 +1274,15 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return DataFrame(columns) -@dataclasses.dataclass class MapFunction(IR): """Apply some function to a dataframe.""" - df: IR - """Input.""" + __slots__ = ("name", "options") + _non_child = ("schema", "name", "options") name: str - """Function name.""" + """Name of the function to apply""" options: Any - """Arbitrary options, interpreted per function.""" + """Arbitrary name-specific options""" _NAMES: ClassVar[frozenset[str]] = frozenset( [ @@ -1111,9 +1297,11 @@ class MapFunction(IR): ] ) - def __post_init__(self) -> None: - """Validate preconditions.""" - super().__post_init__() + def __init__(self, schema: Schema, name: str, options: Any, df: IR): + self.schema = schema + self.name = name + self.options = options + self.children = (df,) if self.name not in MapFunction._NAMES: raise NotImplementedError(f"Unhandled map function {self.name}") if self.name == "explode": @@ -1127,7 +1315,7 @@ def __post_init__(self) -> None: old, new, _ = self.options # TODO: perhaps polars should validate renaming in the IR? if len(new) != len(set(new)) or ( - set(new) & (set(self.df.schema.keys()) - set(old)) + set(new) & (set(df.schema.keys()) - set(old)) ): raise NotImplementedError("Duplicate new names in rename.") elif self.name == "unpivot": @@ -1136,31 +1324,31 @@ def __post_init__(self) -> None: variable_name = "variable" if variable_name is None else variable_name if len(pivotees) == 0: index = frozenset(indices) - pivotees = [name for name in self.df.schema if name not in index] + pivotees = [name for name in df.schema if name not in index] if not all( - dtypes.can_cast(self.df.schema[p], self.schema[value_name]) - for p in pivotees + dtypes.can_cast(df.schema[p], self.schema[value_name]) for p in pivotees ): raise NotImplementedError( "Unpivot cannot cast all input columns to " f"{self.schema[value_name].id()}" ) - self.options = (indices, pivotees, variable_name, value_name) + self.options = (tuple(indices), tuple(pivotees), variable_name, value_name) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" + (child,) = self.children if self.name == "rechunk": # No-op in our data model # Don't think this appears in a plan tree from python - return self.df.evaluate(cache=cache) # pragma: no cover + return child.evaluate(cache=cache) # pragma: no cover elif self.name == "rename": - df = self.df.evaluate(cache=cache) + df = child.evaluate(cache=cache) # final tag is "swapping" which is useful for the # optimiser (it blocks some pushdown operations) old, new, _ = self.options return df.rename_columns(dict(zip(old, new, strict=True))) elif self.name == "explode": - df = self.df.evaluate(cache=cache) + df = child.evaluate(cache=cache) ((to_explode,),) = self.options index = df.column_names.index(to_explode) subset = df.column_names_set - {to_explode} @@ -1170,7 +1358,7 @@ def evaluate(self, *, cache: 
MutableMapping[int, DataFrame]) -> DataFrame: elif self.name == "unpivot": indices, pivotees, variable_name, value_name = self.options npiv = len(pivotees) - df = self.df.evaluate(cache=cache) + df = child.evaluate(cache=cache) index_columns = [ Column(col, name=name) for col, name in zip( @@ -1209,37 +1397,40 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: raise AssertionError("Should never be reached") # pragma: no cover -@dataclasses.dataclass class Union(IR): """Concatenate dataframes vertically.""" - dfs: list[IR] - """List of inputs.""" + __slots__ = ("zlice",) + _non_child = ("schema", "zlice") zlice: tuple[int, int] | None - """Optional slice to apply after concatenation.""" + """Optional slice to apply to the result.""" - def __post_init__(self) -> None: - """Validate preconditions.""" - super().__post_init__() - schema = self.dfs[0].schema - if not all(s.schema == schema for s in self.dfs[1:]): + def __init__(self, schema: Schema, zlice: tuple[int, int] | None, *children: IR): + self.schema = schema + self.zlice = zlice + self.children = children + schema = self.children[0].schema + if not all(s.schema == schema for s in self.children[1:]): raise NotImplementedError("Schema mismatch") def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" # TODO: only evaluate what we need if we have a slice - dfs = [df.evaluate(cache=cache) for df in self.dfs] + dfs = [df.evaluate(cache=cache) for df in self.children] return DataFrame.from_table( plc.concatenate.concatenate([df.table for df in dfs]), dfs[0].column_names ).slice(self.zlice) -@dataclasses.dataclass class HConcat(IR): """Concatenate dataframes horizontally.""" - dfs: list[IR] - """List of inputs.""" + __slots__ = () + _non_child = ("schema",) + + def __init__(self, schema: Schema, *children: IR): + self.schema = schema + self.children = children @staticmethod def _extend_with_nulls(table: plc.Table, *, nrows: int) -> plc.Table: @@ -1271,7 +1462,7 @@ def _extend_with_nulls(table: plc.Table, *, nrows: int) -> plc.Table: def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - dfs = [df.evaluate(cache=cache) for df in self.dfs] + dfs = [df.evaluate(cache=cache) for df in self.children] max_rows = max(df.num_rows for df in dfs) # Horizontal concatenation extends shorter tables with nulls dfs = [ diff --git a/python/cudf_polars/cudf_polars/dsl/nodebase.py b/python/cudf_polars/cudf_polars/dsl/nodebase.py new file mode 100644 index 00000000000..228d300f467 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/nodebase.py @@ -0,0 +1,152 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Base class for IR nodes, and utilities.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar + +if TYPE_CHECKING: + from collections.abc import Hashable, Sequence + + from typing_extensions import Self + + +__all__: list[str] = ["Node"] + +T = TypeVar("T", bound="Node[Any]") + + +class Node(Generic[T]): + """ + An abstract node type. + + Nodes are immutable! + + This contains a (potentially empty) tuple of child nodes, + along with non-child data. For uniform reconstruction and + implementation of hashing and equality schemes, child classes need + to provide a certain amount of metadata when they are defined. 
+    Specifically, the ``_non_child`` attribute must list, in-order,
+    the names of the slots that are passed to the constructor. The
+    constructor must take arguments in the order ``(*_non_child,
+    *children)``.
+    """
+
+    __slots__ = ("_hash_value", "_repr_value", "children")
+    _hash_value: int
+    _repr_value: str
+    children: tuple[T, ...]
+    _non_child: ClassVar[tuple[str, ...]] = ()
+
+    def _ctor_arguments(self, children: Sequence[T]) -> Sequence[Any | T]:
+        return (*(getattr(self, attr) for attr in self._non_child), *children)
+
+    def reconstruct(
+        self, children: Sequence[T]
+    ) -> Self:  # pragma: no cover; not yet used
+        """
+        Rebuild this node with new children.
+
+        Parameters
+        ----------
+        children
+            New children
+
+        Returns
+        -------
+        New node with new children. Non-child data is shared with the input.
+        """
+        return type(self)(*self._ctor_arguments(children))
+
+    def get_hashable(self) -> Hashable:
+        """
+        Return a hashable object for the node.
+
+        Returns
+        -------
+        Hashable object.
+
+        Notes
+        -----
+        This method is used by the :meth:`__hash__` implementation
+        (which does caching). If your node type needs special-case
+        handling for some of its attributes, override this method, not
+        :meth:`__hash__`.
+        """
+        return (type(self), self._ctor_arguments(self.children))
+
+    def __hash__(self) -> int:
+        """
+        Hash of an expression with caching.
+
+        See Also
+        --------
+        get_hashable
+        """
+        try:
+            return self._hash_value
+        except AttributeError:
+            self._hash_value = hash(self.get_hashable())
+            return self._hash_value
+
+    def is_equal(self, other: Self) -> bool:
+        """
+        Equality of two nodes of equal type.
+
+        Override this in subclasses, rather than :meth:`__eq__`.
+
+        Parameters
+        ----------
+        other
+            Object of the same type to compare to.
+
+        Notes
+        -----
+        Since nodes are immutable, this does common subexpression
+        elimination when two nodes are determined to be equal.
+
+        :meth:`__eq__` handles the case where the objects being
+        compared are not of the same type, so in this method, we only
+        need to implement equality of equal types.
+
+        Returns
+        -------
+        True if the two nodes are equal, False otherwise.
+        """
+        if self is other:
+            return True
+        result = self._ctor_arguments(self.children) == other._ctor_arguments(
+            other.children
+        )
+        # Eager CSE for nodes that match.
+        if result:
+            self.children = other.children
+        return result
+
+    def __eq__(self, other: Any) -> bool:
+        """
+        Equality of expressions.
+ + See Also + -------- + is_equal + """ + if type(self) is not type(other) or hash(self) != hash(other): + return False + else: + return self.is_equal(other) + + def __ne__(self, other: Any) -> bool: + """Inequality of expressions.""" + return not self.__eq__(other) + + def __repr__(self) -> str: + """String representation of an expression with caching.""" + try: + return self._repr_value + except AttributeError: + args = ", ".join(f"{arg!r}" for arg in self._ctor_arguments(self.children)) + self._repr_value = f"{type(self).__name__}({args})" + return self._repr_value diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index a0291037f01..522c4a6729c 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -20,7 +20,7 @@ from cudf_polars.dsl import expr, ir from cudf_polars.typing import NodeTraverser -from cudf_polars.utils import dtypes +from cudf_polars.utils import dtypes, sorting __all__ = ["translate_ir", "translate_named_expr"] @@ -148,7 +148,7 @@ def _( with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) exprs = [translate_named_expr(visitor, n=e) for e in node.expr] - return ir.Select(schema, inp, exprs, node.should_broadcast) + return ir.Select(schema, exprs, node.should_broadcast, inp) @_translate_ir.register @@ -161,11 +161,11 @@ def _( keys = [translate_named_expr(visitor, n=e) for e in node.keys] return ir.GroupBy( schema, - inp, - aggs, keys, + aggs, node.maintain_order, node.options, + inp, ) @@ -182,7 +182,7 @@ def _( with set_node(visitor, node.input_right): inp_right = translate_ir(visitor, n=None) right_on = [translate_named_expr(visitor, n=e) for e in node.right_on] - return ir.Join(schema, inp_left, inp_right, left_on, right_on, node.options) + return ir.Join(schema, left_on, right_on, node.options, inp_left, inp_right) @_translate_ir.register @@ -192,7 +192,7 @@ def _( with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) exprs = [translate_named_expr(visitor, n=e) for e in node.exprs] - return ir.HStack(schema, inp, exprs, node.should_broadcast) + return ir.HStack(schema, exprs, node.should_broadcast, inp) @_translate_ir.register @@ -202,17 +202,23 @@ def _( with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) exprs = [translate_named_expr(visitor, n=e) for e in node.expr] - return ir.Reduce(schema, inp, exprs) + return ir.Reduce(schema, exprs, inp) @_translate_ir.register def _( node: pl_ir.Distinct, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: + (keep, subset, maintain_order, zlice) = node.options + keep = ir.Distinct._KEEP_MAP[keep] + subset = frozenset(subset) if subset is not None else None return ir.Distinct( schema, + keep, + subset, + zlice, + maintain_order, translate_ir(visitor, n=node.input), - node.options, ) @@ -223,14 +229,18 @@ def _( with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) by = [translate_named_expr(visitor, n=e) for e in node.by_column] - return ir.Sort(schema, inp, by, node.sort_options, node.slice) + stable, nulls_last, descending = node.sort_options + order, null_order = sorting.sort_order( + descending, nulls_last=nulls_last, num_keys=len(by) + ) + return ir.Sort(schema, by, order, null_order, stable, node.slice, inp) @_translate_ir.register def _( node: pl_ir.Slice, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.Slice(schema, translate_ir(visitor, n=node.input), node.offset, node.len) 
+ return ir.Slice(schema, node.offset, node.len, translate_ir(visitor, n=node.input)) @_translate_ir.register @@ -240,7 +250,7 @@ def _( with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) mask = translate_named_expr(visitor, n=node.predicate) - return ir.Filter(schema, inp, mask) + return ir.Filter(schema, mask, inp) @_translate_ir.register @@ -259,10 +269,10 @@ def _( name, *options = node.function return ir.MapFunction( schema, - # TODO: merge_sorted breaks this pattern - translate_ir(visitor, n=node.input), name, options, + # TODO: merge_sorted breaks this pattern + translate_ir(visitor, n=node.input), ) @@ -271,7 +281,7 @@ def _( node: pl_ir.Union, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: return ir.Union( - schema, [translate_ir(visitor, n=n) for n in node.inputs], node.options + schema, node.options, *(translate_ir(visitor, n=n) for n in node.inputs) ) @@ -279,7 +289,7 @@ def _( def _( node: pl_ir.HConcat, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.HConcat(schema, [translate_ir(visitor, n=n) for n in node.inputs]) + return ir.HConcat(schema, *(translate_ir(visitor, n=n) for n in node.inputs)) def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR: diff --git a/python/cudf_polars/cudf_polars/dsl/traversal.py b/python/cudf_polars/cudf_polars/dsl/traversal.py new file mode 100644 index 00000000000..be8338cb9a9 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/traversal.py @@ -0,0 +1,175 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Traversal and visitor utilities for nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Generic + +from cudf_polars.typing import U_contra, V_co + +if TYPE_CHECKING: + from collections.abc import Callable, Generator, Mapping, MutableMapping + + from cudf_polars.typing import GenericTransformer, NodeT + + +__all__: list[str] = [ + "traversal", + "reuse_if_unchanged", + "make_recursive", + "CachingVisitor", +] + + +def traversal(node: NodeT) -> Generator[NodeT, None, None]: + """ + Pre-order traversal of nodes in an expression. + + Parameters + ---------- + node + Root of expression to traverse. + + Yields + ------ + Unique nodes in the expression, parent before child, children + in-order from left to right. + """ + seen = {node} + lifo = [node] + + while lifo: + node = lifo.pop() + yield node + for child in reversed(node.children): + if child not in seen: + seen.add(child) + lifo.append(child) + + +def reuse_if_unchanged(node: NodeT, fn: GenericTransformer[NodeT, NodeT]) -> NodeT: + """ + Recipe for transforming nodes that returns the old object if unchanged. + + Parameters + ---------- + node + Node to recurse on + fn + Function to transform children + + Notes + ----- + This can be used as a generic "base case" handler when + writing transforms that take nodes and produce new nodes. + + Returns + ------- + Existing node `e` if transformed children are unchanged, otherwise + reconstructed node with new children. + """ + new_children = [fn(c) for c in node.children] + if all(new == old for new, old in zip(new_children, node.children, strict=True)): + return node + return node.reconstruct(new_children) + + +def make_recursive( + fn: Callable[[U_contra, GenericTransformer[U_contra, V_co]], V_co], + *, + state: Mapping[str, Any] | None = None, +) -> GenericTransformer[U_contra, V_co]: + """ + No-op wrapper for recursive visitors. 
+
+    Facilitates using visitors that don't need caching but are written
+    in the same style.
+
+    Parameters
+    ----------
+    fn
+        Function to transform inputs to outputs. Should take as its
+        second argument a callable from input to output.
+    state
+        Arbitrary *immutable* state that should be accessible to the
+        visitor through the `state` property.
+
+    Notes
+    -----
+    All transformation functions *must* be free of side-effects.
+
+    Usually, prefer a :class:`CachingVisitor`, but if we know that we
+    don't need caching in a transformation, then this no-op approach
+    is slightly cheaper.
+
+    Returns
+    -------
+    Recursive function without caching.
+
+    See Also
+    --------
+    CachingVisitor
+    """
+
+    def rec(node: U_contra) -> V_co:
+        return fn(node, rec)  # type: ignore[arg-type]
+
+    rec.state = state if state is not None else {}  # type: ignore[attr-defined]
+    return rec  # type: ignore[return-value]
+
+
+class CachingVisitor(Generic[U_contra, V_co]):
+    """
+    Caching wrapper for recursive visitors.
+
+    Facilitates writing visitors where already computed results should
+    be cached and reused. The cache is managed automatically, and is
+    tied to the lifetime of the wrapper.
+
+    Parameters
+    ----------
+    fn
+        Function to transform inputs to outputs. Should take as its
+        second argument the recursive cache manager.
+    state
+        Arbitrary *immutable* state that should be accessible to the
+        visitor through the `state` property.
+
+    Notes
+    -----
+    All transformation functions *must* be free of side-effects.
+
+    Returns
+    -------
+    Recursive function with caching.
+    """
+
+    def __init__(
+        self,
+        fn: Callable[[U_contra, GenericTransformer[U_contra, V_co]], V_co],
+        *,
+        state: Mapping[str, Any] | None = None,
+    ) -> None:
+        self.fn = fn
+        self.cache: MutableMapping[U_contra, V_co] = {}
+        self.state = state if state is not None else {}
+
+    def __call__(self, value: U_contra) -> V_co:
+        """
+        Apply the function to a value.
+
+        Parameters
+        ----------
+        value
+            The value to transform.
+
+        Returns
+        -------
+        A transformed value.
+        """
+        try:
+            return self.cache[value]
+        except KeyError:
+            return self.cache.setdefault(value, self.fn(value, self))
diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py
index 240b11bdf59..a27a3395c35 100644
--- a/python/cudf_polars/cudf_polars/typing/__init__.py
+++ b/python/cudf_polars/cudf_polars/typing/__init__.py
@@ -5,8 +5,8 @@
 
 from __future__ import annotations
 
-from collections.abc import Mapping
-from typing import TYPE_CHECKING, Literal, Protocol, Union
+from collections.abc import Hashable, Mapping
+from typing import TYPE_CHECKING, Any, Literal, Protocol, TypeVar, Union
 
 import pylibcudf as plc
 
@@ -18,7 +18,19 @@
 
     import polars as pl
 
-IR: TypeAlias = Union[
+    from cudf_polars.dsl import expr, ir, nodebase
+
+__all__: list[str] = [
+    "PolarsIR",
+    "PolarsExpr",
+    "NodeTraverser",
+    "OptimizationArgs",
+    "GenericTransformer",
+    "ExprTransformer",
+    "IRTransformer",
+]
+
+PolarsIR: TypeAlias = Union[
     pl_ir.PythonScan,
     pl_ir.Scan,
     pl_ir.Cache,
@@ -38,7 +50,7 @@
     pl_ir.ExtContext,
 ]
 
-Expr: TypeAlias = Union[
+PolarsExpr: TypeAlias = Union[
     pl_expr.Function,
     pl_expr.Window,
     pl_expr.Literal,
@@ -68,7 +80,7 @@ def set_node(self, n: int) -> None:
         """Set the current plan node to n."""
         ...
 
-    def view_current_node(self) -> IR:
+    def view_current_node(self) -> PolarsIR:
         """Convert current plan node to python rep."""
         ...
@@ -80,7 +92,7 @@ def get_dtype(self, n: int) -> pl.DataType:
         """Get the datatype of the given expression id."""
         ...
 
-    def view_expression(self, n: int) -> Expr:
+    def view_expression(self, n: int) -> PolarsExpr:
         """Convert the given expression to python rep."""
         ...
 
@@ -107,3 +119,29 @@ def set_udf(
     "cluster_with_columns",
     "no_optimization",
 ]
+
+
+U_contra = TypeVar("U_contra", bound=Hashable, contravariant=True)
+V_co = TypeVar("V_co", covariant=True)
+NodeT = TypeVar("NodeT", bound="nodebase.Node[Any]")
+
+
+class GenericTransformer(Protocol[U_contra, V_co]):
+    """Abstract protocol for recursive visitors."""
+
+    def __call__(self, __value: U_contra) -> V_co:
+        """Apply the visitor to the node."""
+        ...
+
+    @property
+    def state(self) -> Mapping[str, Any]:
+        """Arbitrary immutable state."""
+        ...
+
+
+# Quotes to avoid circular import
+ExprTransformer: TypeAlias = GenericTransformer["expr.Expr", "expr.Expr"]
+"""Protocol for transformation of Expr nodes."""
+
+IRTransformer: TypeAlias = GenericTransformer["ir.IR", "ir.IR"]
+"""Protocol for transformation of IR nodes."""
diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md
index 7837a275f20..74b2cd4e5de 100644
--- a/python/cudf_polars/docs/overview.md
+++ b/python/cudf_polars/docs/overview.md
@@ -11,14 +11,17 @@ You will need:
    environment](https://github.com/rapidsai/cudf/blob/branch-24.12/CONTRIBUTING.md#setting-up-your-build-environment).
    The combined devcontainer works, or whatever your favourite approach is.
 
-> ![NOTE] These instructions will get simpler as we merge code in.
+:::{note}
+These instructions will get simpler as we merge code in.
+:::
 
 ## Installing polars
 
-`cudf-polars` works with polars >= 1.3, as long as the internal IR
-version doesn't get a major version bump. So `pip install polars>=1.3`
-should work. For development, if we're adding things to the polars
-side of things, we will need to build polars from source:
+The `cudf-polars` `pyproject.toml` advertises which polars versions it
+works with. So for pure `cudf-polars` development, installing as
+normal and satisfying the dependencies in the repository is
+sufficient. For development, if we're adding things to the polars side
+of things, we will need to build polars from source:
 
 ```sh
 git clone https://github.com/pola-rs/polars
@@ -36,7 +39,9 @@ pip install --upgrade uv
 uv pip install --upgrade -r py-polars/requirements-dev.txt
 ```
 
-> ![NOTE] plain `pip install` works fine, but `uv` is _much_ faster!
+:::{note}
+plain `pip install` works fine, but `uv` is _much_ faster!
+:::
 
 Now we have the necessary machinery to build polars
 ```sh
@@ -83,7 +88,7 @@ representation (IR). Second, an execution phase which executes
 using our IR.
 
 The translation phase receives a low-level Rust `NodeTraverser`
-object which delivers Python representations of the plan nodes (and
+object that delivers Python representations of the plan nodes (and
 expressions) one at a time. During translation, we endeavour to raise
 `NotImplementedError` for any unsupported functionality. This way, if
 we can't execute something, we just don't modify the logical plan at
@@ -126,7 +131,6 @@ arguments, at the moment, `raise_on_fail` is also supported, which
 raises, rather than falling back, during translation:
 
 ```python
-
 result = q.collect(engine=pl.GPUEngine(raise_on_fail=True))
 ```
 
@@ -144,13 +148,73 @@ changes. We can therefore attempt to detect the IR version
 appropriately. This should be done during IR translation in
 `translate.py`.
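+
+For example, a version guard during translation might look like the
+following sketch (the cutoff tuple is purely illustrative, not the
+real supported range):
+
+```python
+def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR:
+    # Refuse to translate plans from an IR version we have not been
+    # adapted to; (3, 0) is a made-up cutoff for illustration.
+    if (version := visitor.version()) >= (3, 0):
+        raise NotImplementedError(f"No support for polars IR {version=}")
+    ...
+```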
-## Adding a handler for a new plan node
+# IR design
+
+As noted, we translate the polars DSL into our own IR. This is both so
+that we can smooth out minor version differences (advertised by
+`NodeTraverser` version changes) within `cudf-polars`, and so that we
+have the freedom to introduce new IR nodes and rewrite rules as might
+be appropriate for GPU execution.
+
+To that end, we provide facilities for definition of nodes as well as
+writing traversals and rewrite rules. The abstract base class `Node`
+in `dsl/nodebase.py` defines the interface for implementing new nodes,
+and provides many useful default methods. See also the docstrings of
+the `Node` class.
+
+:::{note}
+This generic implementation relies on nodes being treated as
+*immutable*. Do not implement in-place modification of nodes; bad
+things will happen.
+:::
+
+## Defining nodes
+
+A concrete node type (`cudf-polars` has expression nodes, `Expr`;
+and plan nodes, `IR`) should inherit from `Node`. Nodes have
+two types of data:
+
+1. `children`: a tuple (possibly empty) of concrete nodes;
+2. non-child: arbitrary data attached to the node that is _not_ a
+   concrete node.
+
+The base `Node` class requires that one advertise the names of the
+non-child attributes in the `_non_child` class variable. The
+constructor of the concrete node should take its arguments in the
+order `*_non_child` (ordered as the class variable does) and then
+`*children`. For example, the `Sort` node, which sorts a column
+generated by an expression, has this definition:
+
+```python
+class Expr(Node):
+    children: tuple[Expr, ...]
+
+class Sort(Expr):
+    _non_child = ("dtype", "options")
+    children: tuple[Expr]
+    def __init__(self, dtype, options, column: Expr):
+        self.dtype = dtype
+        self.options = options
+        self.children = (column,)
+```
+
+By following this pattern, we get an automatic (caching)
+implementation of `__hash__` and `__eq__`, as well as a useful
+`reconstruct` method that will rebuild the node with new children.
+
+If you want to control the behaviour of `__hash__` and `__eq__` for a
+single node, override (respectively) the `get_hashable` and `is_equal`
+methods.
+
+## Adding new translation rules from the polars IR
+
+### Plan nodes
 
-Plan node definitions live in `cudf_polars/dsl/ir.py`, these are
-`dataclasses` that inherit from the base `IR` node. The evaluation of
-a plan node is done by implementing the `evaluate` method.
+Plan node definitions live in `cudf_polars/dsl/ir.py`; these all
+inherit from the base `IR` node. The evaluation of a plan node is done
+by implementing the `evaluate` method.
 
-To translate the plan node, add a case handler in `translate_ir` which
+To translate the plan node, add a case handler in `translate_ir` that
 lives in `cudf_polars/dsl/translate.py`.
 
 As well as child nodes that are plans, most plan nodes contain child
@@ -163,25 +227,12 @@ translating a `Join` node, the left keys (expressions) should be
 translated with the left input active (and right keys with right
 input). To facilitate this, use the `set_node` context manager.
 
-## Adding a handler for a new expression node
+### Expression nodes
 
 Adding a handler for an expression node is very similar to a plan node.
-Expressions are all defined in `cudf_polars/dsl/expr.py` and inherit
-from `Expr`. Unlike plan nodes, these are not `dataclasses`, since it
-is simpler for us to implement efficient hashing, repr, and equality if we
-can write that ourselves.
-
-Every expression consists of two types of data:
-1. child data (other `Expr`s)
-2. non-child data (anything other than an `Expr`)
-The generic implementations of special methods in the base `Expr` base
-class require that the subclasses advertise which arguments to the
-constructor are non-child in a `_non_child` class slot. The
-constructor should then take arguments:
-```python
-def __init__(self, *non_child_data: Any, *children: Expr):
-```
-Read the docstrings in the `Expr` class for more details.
+Expressions are defined in `cudf_polars/dsl/expressions/` and exported
+into the `dsl` namespace via `expr.py`. They inherit
+from `Expr`.
 
 Expressions are evaluated by implementing a `do_evaluate` method that
 takes a `DataFrame` as context (this provides columns) along with an
@@ -198,6 +249,124 @@ To simplify state tracking, all columns should be considered immutable
 on construction. This matches the "functional" description coming from
 the logical plan in any case, so is reasonably natural.
 
+## Traversing and transforming nodes
+
+In addition to representing and evaluating nodes, we also provide
+facilities for traversing a tree of nodes and defining transformation
+rules in `dsl/traversal.py`. The simplest is `traversal`, a
+[pre-order](https://en.wikipedia.org/wiki/Tree_traversal) visit of all
+unique nodes in an expression. Use this if you want to know some
+specific thing about an expression. For example, to determine if an
+expression contains a `Literal` node:
+
+```python
+def has_literal(node: Expr) -> bool:
+    return any(isinstance(e, Literal) for e in traversal(node))
+```
+
+It is often convenient to provide (immutable) state to a visitor, as
+well as some facility to perform DAG-aware rewrites (reusing a
+transformation for an expression if we have already seen it). We
+therefore adopt the following pattern of writing DAG-aware visitors.
+Suppose we want a rewrite rule (`rewrite`) between expressions
+(`Expr`) and some new type `T`. We define our general transformation
+function `rewrite` with type `Expr -> (Expr -> T) -> T`:
+
+```python
+from cudf_polars.typing import GenericTransformer
+
+@singledispatch
+def rewrite(e: Expr, rec: GenericTransformer[Expr, T]) -> T:
+    ...
+```
+
+Note in particular that the function to perform the recursion is
+passed as the second argument. Rather than defining methods on each
+node in turn for a particular rewrite rule, we prefer free functions
+and use `functools.singledispatch` to provide dispatching. We now, in
+the usual fashion, register handlers for different expression types.
+To use this function, we need to be able to provide both the
+expression to convert and the recursive function itself. To do this we
+must convert our `rewrite` function into something that only takes a
+single argument (the expression to rewrite), but carries around
+information about how to perform the recursion. To this end, we have
+two utilities in `traversal.py`:
+
+- `make_recursive` and
+- `CachingVisitor`.
+
+These both implement the `GenericTransformer` protocol, and can be
+wrapped around a transformation function like `rewrite` to provide a
+function `Expr -> T`. They also allow us to attach arbitrary
+*immutable* state to our visitor by passing a `state` dictionary. This
+dictionary can then be inspected by the concrete transformation
+function. `make_recursive` is very simple, and provides no caching of
+intermediate results (so any DAGs that are visited will be viewed as
+trees). `CachingVisitor` provides the same interface, but maintains a
+cache of intermediate results, and reuses them if the same expression
+is seen again.
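+
+As a small sketch of the cache at work (`rename_col` here is a
+hypothetical transformation function, written in the style above,
+that returns a *new* node for column references; the `Col` and
+`BinOp` constructors are those used in the expression tests):
+
+```python
+dtype = plc.DataType(plc.TypeId.INT8)
+shared = Col(dtype, "a")
+e = BinOp(dtype, plc.binaryop.BinaryOperator.ADD, shared, shared)
+
+mapper = CachingVisitor(rename_col)
+new = mapper(e)
+# The second occurrence of ``shared`` is a cache hit, so both children
+# of the result are the same object; with ``make_recursive`` they would
+# be two distinct (but equal) nodes.
+assert new.children[0] is new.children[1]
+```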
+ +Finally, for writing transformations that take nodes and deliver new +nodes (e.g. rewrite rules), we have a final utility +`reuse_if_unchanged` that can be used as a base case transformation +for node to node rewrites. It is a depth-first visit that transforms +children but only returns a new node with new children if the rewrite +of children returned new nodes. + +To see how these pieces fit together, let us consider writing a +`rename` function that takes an expression (potentially with +references to columns) along with a mapping defining a renaming +between (some subset of) column names. The goal is to deliver a new +expression with appropriate columns renamed. + +To start, we define the dispatch function +```python +from collections.abc import Mapping +from functools import singledispatch +from cudf_polars.dsl.traversal import ( + CachingVisitor, make_recursive, reuse_if_unchanged +) +from cudf_polars.dsl.expr import Col, Expr +from cudf_polars.typing import ExprTransformer + + +@singledispatch +def _rename(e: Expr, rec: ExprTransformer) -> Expr: + raise NotImplementedError(f"No handler for {type(e)}") +``` +then we register specific handlers, first for columns: +```python +@_rename.register +def _(e: Col, rec: ExprTransformer) -> Expr: + mapping = rec.state["mapping"] # state set on rec + if e.name in mapping: + # If we have a rename, return a new Col reference + # with a new name + return type(e)(e.dtype, mapping[e.name]) + return e +``` +and then for the remaining expressions +```python +_rename.register(Expr)(reuse_if_unchanged) +``` + +:::{note} +In this case, we could have put the generic handler in the `_rename` +function, however, then we would not get a nice error message if we +accidentally sent in an object of the incorrect type. 
+::: + +Finally we tie everything together with a public function: + +```python +def rename(e: Expr, mapping: Mapping[str, str]) -> Expr: + """Rename column references in an expression.""" + mapper = CachingVisitor(_rename, state={"mapping": mapping}) + # or + # mapper = make_recursive(_rename, state={"mapping": mapping}) + return mapper(e) +``` + # Containers Containers should be constructed as relatively lightweight objects diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 5345fad41a2..a8bb634732f 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -60,7 +60,7 @@ xfail_strict = true [tool.coverage.report] exclude_also = [ "if TYPE_CHECKING:", - "class .*\\bProtocol\\):", + "class .*\\bProtocol(?:\\[[^]]+\\])?\\):", "assert_never\\(" ] # The cudf_polars test suite doesn't exercise the plugin, so we omit diff --git a/python/cudf_polars/tests/dsl/test_expr.py b/python/cudf_polars/tests/dsl/test_expr.py index b7d4672daca..84e33262869 100644 --- a/python/cudf_polars/tests/dsl/test_expr.py +++ b/python/cudf_polars/tests/dsl/test_expr.py @@ -73,3 +73,24 @@ def test_namedexpr_repr_stable(): b2 = expr.NamedExpr("b1", expr.Col(plc.DataType(plc.TypeId.INT8), "a")) assert repr(b1) == repr(b2) + + +def test_equality_cse(): + dt = plc.DataType(plc.TypeId.INT8) + + def make_expr(n1, n2): + a = expr.Col(plc.DataType(plc.TypeId.INT8), n1) + b = expr.Col(plc.DataType(plc.TypeId.INT8), n2) + + return expr.BinOp(dt, plc.binaryop.BinaryOperator.ADD, a, b) + + e1 = make_expr("a", "b") + e2 = make_expr("a", "b") + e3 = make_expr("a", "c") + + assert e1.children is not e2.children + assert e1 == e2 + assert e1.children is e2.children + assert e1 == e2 + assert e1 != e3 + assert e2 != e3 diff --git a/python/cudf_polars/tests/dsl/test_traversal.py b/python/cudf_polars/tests/dsl/test_traversal.py new file mode 100644 index 00000000000..6505a786855 --- /dev/null +++ b/python/cudf_polars/tests/dsl/test_traversal.py @@ -0,0 +1,229 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from functools import singledispatch + +import pylibcudf as plc + +import polars as pl +from polars.testing import assert_frame_equal + +from cudf_polars import translate_ir +from cudf_polars.dsl import expr, ir +from cudf_polars.dsl.traversal import ( + CachingVisitor, + make_recursive, + reuse_if_unchanged, + traversal, +) +from cudf_polars.typing import ExprTransformer, IRTransformer + + +def make_expr(dt, n1, n2): + a1 = expr.Col(dt, n1) + a2 = expr.Col(dt, n2) + + return expr.BinOp(dt, plc.binaryop.BinaryOperator.MUL, a1, a2) + + +def test_traversal_unique(): + dt = plc.DataType(plc.TypeId.INT8) + + e1 = make_expr(dt, "a", "a") + unique_exprs = list(traversal(e1)) + + assert len(unique_exprs) == 2 + assert set(unique_exprs) == {expr.Col(dt, "a"), e1} + assert unique_exprs == [e1, expr.Col(dt, "a")] + + e2 = make_expr(dt, "a", "b") + unique_exprs = list(traversal(e2)) + + assert len(unique_exprs) == 3 + assert set(unique_exprs) == {expr.Col(dt, "a"), expr.Col(dt, "b"), e2} + assert unique_exprs == [e2, expr.Col(dt, "a"), expr.Col(dt, "b")] + + e3 = make_expr(dt, "b", "a") + unique_exprs = list(traversal(e3)) + + assert len(unique_exprs) == 3 + assert set(unique_exprs) == {expr.Col(dt, "a"), expr.Col(dt, "b"), e3} + assert unique_exprs == [e3, expr.Col(dt, "b"), expr.Col(dt, "a")] + + +def rename(e, rec): + mapping = rec.state["mapping"] + if isinstance(e, expr.Col) and e.name in mapping: + return type(e)(e.dtype, mapping[e.name]) + return reuse_if_unchanged(e, rec) + + +def test_caching_visitor(): + dt = plc.DataType(plc.TypeId.INT8) + + e1 = make_expr(dt, "a", "b") + + mapper = CachingVisitor(rename, state={"mapping": {"b": "c"}}) + + renamed = mapper(e1) + assert renamed == make_expr(dt, "a", "c") + assert len(mapper.cache) == 3 + + e2 = make_expr(dt, "a", "a") + mapper = CachingVisitor(rename, state={"mapping": {"b": "c"}}) + + renamed = mapper(e2) + assert renamed == make_expr(dt, "a", "a") + assert len(mapper.cache) == 2 + mapper = CachingVisitor(rename, state={"mapping": {"a": "c"}}) + + renamed = mapper(e2) + assert renamed == make_expr(dt, "c", "c") + assert len(mapper.cache) == 2 + + +def test_noop_visitor(): + dt = plc.DataType(plc.TypeId.INT8) + + e1 = make_expr(dt, "a", "b") + + mapper = make_recursive(rename, state={"mapping": {"b": "c"}}) + + renamed = mapper(e1) + assert renamed == make_expr(dt, "a", "c") + + e2 = make_expr(dt, "a", "a") + mapper = make_recursive(rename, state={"mapping": {"b": "c"}}) + + renamed = mapper(e2) + assert renamed == make_expr(dt, "a", "a") + mapper = make_recursive(rename, state={"mapping": {"a": "c"}}) + + renamed = mapper(e2) + assert renamed == make_expr(dt, "c", "c") + + +def test_rewrite_ir_node(): + df = pl.LazyFrame({"a": [1, 2, 1], "b": [1, 3, 4]}) + q = df.group_by("a").agg(pl.col("b").sum()).sort("b") + + orig = translate_ir(q._ldf.visit()) + + new_df = pl.DataFrame({"a": [1, 1, 2], "b": [-1, -2, -4]}) + + def replace_df(node, rec): + if isinstance(node, ir.DataFrameScan): + return ir.DataFrameScan( + node.schema, new_df._df, node.projection, node.predicate + ) + return reuse_if_unchanged(node, rec) + + mapper = CachingVisitor(replace_df) + + new = mapper(orig) + + result = new.evaluate(cache={}).to_polars() + + expect = pl.DataFrame({"a": [2, 1], "b": [-4, -3]}) + + assert_frame_equal(result, expect) + + +def test_rewrite_scan_node(tmp_path): + left = pl.LazyFrame({"a": [1, 2, 3], "b": [1, 3, 4]}) + right = pl.DataFrame({"a": [1, 4, 2], "c": [1, 2, 
3]}) + + right.write_parquet(tmp_path / "right.pq") + + right_s = pl.scan_parquet(tmp_path / "right.pq") + + q = left.join(right_s, on="a", how="inner") + + def replace_scan(node, rec): + if isinstance(node, ir.Scan): + return ir.DataFrameScan( + node.schema, right._df, node.with_columns, node.predicate + ) + return reuse_if_unchanged(node, rec) + + mapper = CachingVisitor(replace_scan) + + orig = translate_ir(q._ldf.visit()) + new = mapper(orig) + + result = new.evaluate(cache={}).to_polars() + + expect = q.collect() + + assert_frame_equal(result, expect, check_row_order=False) + + +def test_rewrite_names_and_ops(): + df = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5], "c": [5, 6, 7], "d": [7, 9, 8]}) + + q = df.select(pl.col("a") - (pl.col("b") + pl.col("c") * 2), pl.col("d")).sort("d") + + # We will replace a -> d, c -> d, and addition with multiplication + expect = ( + df.select( + (pl.col("d") - (pl.col("b") * pl.col("d") * 2)).alias("a"), pl.col("d") + ) + .sort("d") + .collect() + ) + + qir = translate_ir(q._ldf.visit()) + + @singledispatch + def _transform(e: expr.Expr, fn: ExprTransformer) -> expr.Expr: + raise NotImplementedError("Unhandled") + + @_transform.register + def _(e: expr.Col, fn: ExprTransformer): + mapping = fn.state["mapping"] + if e.name in mapping: + return type(e)(e.dtype, mapping[e.name]) + return e + + @_transform.register + def _(e: expr.BinOp, fn: ExprTransformer): + if e.op == plc.binaryop.BinaryOperator.ADD: + return type(e)( + e.dtype, plc.binaryop.BinaryOperator.MUL, *map(fn, e.children) + ) + return reuse_if_unchanged(e, fn) + + _transform.register(expr.Expr)(reuse_if_unchanged) + + @singledispatch + def _rewrite(node: ir.IR, fn: IRTransformer) -> ir.IR: + raise NotImplementedError("Unhandled") + + @_rewrite.register + def _(node: ir.Select, fn: IRTransformer): + expr_mapper = fn.state["expr_mapper"] + return type(node)( + node.schema, + [expr.NamedExpr(e.name, expr_mapper(e.value)) for e in node.exprs], + node.should_broadcast, + fn(node.children[0]), + ) + + _rewrite.register(ir.IR)(reuse_if_unchanged) + + rewriter = CachingVisitor( + _rewrite, + state={ + "expr_mapper": CachingVisitor( + _transform, state={"mapping": {"a": "d", "c": "d"}} + ) + }, + ) + + new_ir = rewriter(qir) + + got = new_ir.evaluate(cache={}).to_polars() + + assert_frame_equal(expect, got) diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py index 3c3986be19b..9900f598e5f 100644 --- a/python/cudf_polars/tests/test_config.py +++ b/python/cudf_polars/tests/test_config.py @@ -10,7 +10,7 @@ import rmm -from cudf_polars.dsl.ir import IR +from cudf_polars.dsl.ir import DataFrameScan from cudf_polars.testing.asserts import ( assert_gpu_result_equal, assert_ir_translation_raises, @@ -18,10 +18,10 @@ def test_polars_verbose_warns(monkeypatch): - def raise_unimplemented(self): + def raise_unimplemented(self, *args): raise NotImplementedError("We don't support this") - monkeypatch.setattr(IR, "__post_init__", raise_unimplemented) + monkeypatch.setattr(DataFrameScan, "__init__", raise_unimplemented) q = pl.LazyFrame({}) # Ensure that things raise assert_ir_translation_raises(q, NotImplementedError) From 4fe338c0efe0fee2ee69c8207f9f4cbe9aa4d4a2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 22 Oct 2024 04:39:09 -1000 Subject: [PATCH 098/117] Add string.replace_re APIs to pylibcudf (#17023) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke 
(https://github.com/mroeschke)
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Murray (https://github.com/Matt711)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/17023
---
 .../api_docs/pylibcudf/strings/index.rst      |   1 +
 .../api_docs/pylibcudf/strings/replace_re.rst |   6 +
 python/cudf/cudf/_lib/strings/replace_re.pyx  | 104 ++++---------
 python/cudf/cudf/core/column/string.py        |   2 +-
 .../pylibcudf/libcudf/strings/replace_re.pxd  |  24 ++--
 .../pylibcudf/strings/CMakeLists.txt          |   1 +
 .../pylibcudf/pylibcudf/strings/__init__.pxd  |   2 +
 .../pylibcudf/pylibcudf/strings/__init__.py   |   2 +
 .../pylibcudf/strings/replace_re.pxd          |  30 ++++
 .../pylibcudf/strings/replace_re.pyx          | 134 ++++++++++++++++++
 .../pylibcudf/tests/test_string_replace_re.py |  71 ++++++++++
 11 files changed, 289 insertions(+), 88 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace_re.rst
 create mode 100644 python/pylibcudf/pylibcudf/strings/replace_re.pxd
 create mode 100644 python/pylibcudf/pylibcudf/strings/replace_re.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_replace_re.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
index c8c0016126d..ae670b5bd8a 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
@@ -16,6 +16,7 @@ strings
    regex_flags
    regex_program
    repeat
+   replace_re
    replace
    side_type
    slice
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace_re.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace_re.rst
new file mode 100644
index 00000000000..5bf715ef657
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace_re.rst
@@ -0,0 +1,6 @@
+==========
+replace_re
+==========
+
+.. automodule:: pylibcudf.strings.replace_re
+   :members:
diff --git a/python/cudf/cudf/_lib/strings/replace_re.pyx b/python/cudf/cudf/_lib/strings/replace_re.pyx
index fffc8b7c3f6..462d5c903e8 100644
--- a/python/cudf/cudf/_lib/strings/replace_re.pyx
+++ b/python/cudf/cudf/_lib/strings/replace_re.pyx
@@ -1,26 +1,11 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from cython.operator cimport dereference
-from libcpp.memory cimport unique_ptr
-from libcpp.string cimport string
-from libcpp.utility cimport move
-from libcpp.vector cimport vector
+from pylibcudf.libcudf.types cimport size_type
+import pylibcudf as plc
 
 from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
-from pylibcudf.libcudf.strings.regex_program cimport regex_program
-from pylibcudf.libcudf.strings.replace_re cimport (
-    replace_re as cpp_replace_re,
-    replace_with_backrefs as cpp_replace_with_backrefs,
-)
-from pylibcudf.libcudf.types cimport size_type
-
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
 
 
 @acquire_spill_lock()
@@ -34,28 +19,16 @@ def replace_re(Column source_strings,
     `n` indicates the number of replacements to be made from start.
(-1 indicates all) """ - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string pattern_string = str(pattern).encode() - cdef const string_scalar* scalar_repl = \ - (repl.get_raw_ptr()) - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_replace_re( - source_view, - dereference(c_prog), - scalar_repl[0], - n - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.replace_re.replace_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT + ), + py_repl.device_value.c_value, + n + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -68,50 +41,29 @@ def replace_with_backrefs( new string with the extracted elements found using `pattern` regular expression in `source_strings`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string pattern_string = str(pattern).encode() - cdef string repl_string = str(repl).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_replace_with_backrefs( - source_view, - dereference(c_prog), - repl_string - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.replace_re.replace_with_backrefs( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT + ), + repl + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() def replace_multi_re(Column source_strings, - object patterns, + list patterns, Column repl_strings): """ Returns a Column after replacing occurrences of multiple regular expressions `patterns` with their corresponding strings in `repl_strings` in `source_strings`. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view repl_view = repl_strings.view() - - cdef int pattern_size = len(patterns) - cdef vector[string] patterns_vector - patterns_vector.reserve(pattern_size) - - for pattern in patterns: - patterns_vector.push_back(str.encode(pattern)) - - with nogil: - c_result = move(cpp_replace_re( - source_view, - patterns_vector, - repl_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.replace_re.replace_re( + source_strings.to_pylibcudf(mode="read"), + patterns, + repl_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 45d1a8b087b..b25e486d855 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -998,7 +998,7 @@ def replace( return self._return_or_inplace( libstrings.replace_multi_re( self._column, - pat, + list(pat), column.as_column(repl, dtype="str"), ) if regex diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd index 40f0e2fa50c..6b0c90d0acc 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd @@ -6,6 +6,7 @@ from libcpp.vector cimport vector from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.regex_flags cimport regex_flags from pylibcudf.libcudf.strings.regex_program cimport regex_program from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport size_type @@ -14,17 +15,18 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/strings/replace_re.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] replace_re( - column_view source_strings, - regex_program, - string_scalar repl, - size_type maxrepl) except + - - cdef unique_ptr[column] replace_with_backrefs( - column_view source_strings, - regex_program, - string repl) except + + column_view input, + regex_program prog, + string_scalar replacement, + size_type max_replace_count) except + cdef unique_ptr[column] replace_re( - column_view source_strings, + column_view input, vector[string] patterns, - column_view repls) except + + column_view replacements, + regex_flags flags) except + + + cdef unique_ptr[column] replace_with_backrefs( + column_view input, + regex_program prog, + string replacement) except + diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index 04dd131cd75..5d7fbd24b91 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -28,6 +28,7 @@ set(cython_sources regex_program.pyx repeat.pyx replace.pyx + replace_re.pyx side_type.pyx slice.pyx strip.pyx diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index 93c61f3f72c..da1c1c576c0 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -17,6 +17,7 @@ from . 
cimport ( regex_program, repeat, replace, + replace_re, side_type, slice, split, @@ -42,6 +43,7 @@ __all__ = [ "regex_program", "repeat", "replace", + "replace_re", "slice", "strip", "split", diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index d52b0405f1e..40fa8261905 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -17,6 +17,7 @@ regex_program, repeat, replace, + replace_re, side_type, slice, split, @@ -42,6 +43,7 @@ "regex_program", "repeat", "replace", + "replace_re", "slice", "strip", "split", diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pxd b/python/pylibcudf/pylibcudf/strings/replace_re.pxd new file mode 100644 index 00000000000..e27ccd55f7d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/replace_re.pxd @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.regex_flags cimport regex_flags +from pylibcudf.strings.regex_program cimport RegexProgram + +ctypedef fused Replacement: + Column + Scalar + +ctypedef fused Patterns: + RegexProgram + list + + +cpdef Column replace_re( + Column input, + Patterns patterns, + Replacement replacement=*, + size_type max_replace_count=*, + regex_flags flags=* +) + +cpdef Column replace_with_backrefs( + Column input, + RegexProgram prog, + str replacement +) diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pyx b/python/pylibcudf/pylibcudf/strings/replace_re.pyx new file mode 100644 index 00000000000..ccc33fd4425 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/replace_re.pyx @@ -0,0 +1,134 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from cython.operator cimport dereference +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.strings cimport replace_re as cpp_replace_re +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.regex_flags cimport regex_flags +from pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column replace_re( + Column input, + Patterns patterns, + Replacement replacement=None, + size_type max_replace_count=-1, + regex_flags flags=regex_flags.DEFAULT, +): + """ + For each string, replaces any character sequence matching the given patterns + with the provided replacement. + + For details, see :cpp:func:`cudf::strings::replace_re` + + Parameters + ---------- + input : Column + Strings instance for this operation. + patterns: RegexProgram or list[str] + If RegexProgram, the regex to match to each string. + If list[str], a list of regex strings to search within each string. + replacement : Scalar or Column + If Scalar, the string used to replace the matched sequence in each string. + ``patterns`` must be a RegexProgram. + If Column, the strings used for replacement. + ``patterns`` must be a list[str]. + max_replace_count : int + The maximum number of times to replace the matched pattern + within each string. ``patterns`` must be a RegexProgram. 
+ Default replaces every substring that is matched. + flags : RegexFlags + Regex flags for interpreting special characters in the patterns. + ``patterns`` must be a list[str] + + Returns + ------- + Column + New strings column + """ + cdef unique_ptr[column] c_result + cdef vector[string] c_patterns + + if Patterns is RegexProgram and Replacement is Scalar: + if replacement is None: + replacement = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + with nogil: + c_result = move( + cpp_replace_re.replace_re( + input.view(), + patterns.c_obj.get()[0], + dereference((replacement.get())), + max_replace_count + ) + ) + + return Column.from_libcudf(move(c_result)) + elif Patterns is list and Replacement is Column: + c_patterns.reserve(len(patterns)) + for pattern in patterns: + c_patterns.push_back(pattern.encode()) + + with nogil: + c_result = move( + cpp_replace_re.replace_re( + input.view(), + c_patterns, + replacement.view(), + flags, + ) + ) + + return Column.from_libcudf(move(c_result)) + else: + raise TypeError("Must pass either a RegexProgram and a Scalar or a list") + + +cpdef Column replace_with_backrefs( + Column input, + RegexProgram prog, + str replacement +): + """ + For each string, replaces any character sequence matching the given regex + using the replacement template for back-references. + + For details, see :cpp:func:`cudf::strings::replace_with_backrefs` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + prog: RegexProgram + Regex program instance. + + replacement : str + The replacement template for creating the output string. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + cdef string c_replacement = replacement.encode() + + with nogil: + c_result = cpp_replace_re.replace_with_backrefs( + input.view(), + prog.c_obj.get()[0], + c_replacement, + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py b/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py new file mode 100644 index 00000000000..ff2ce348d3b --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py @@ -0,0 +1,71 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.mark.parametrize("max_replace_count", [-1, 1]) +def test_replace_re_regex_program_scalar(max_replace_count): + arr = pa.array(["foo", "fuz", None]) + pat = "f." 
+    repl = "ba"
+    result = plc.strings.replace_re.replace_re(
+        plc.interop.from_arrow(arr),
+        plc.strings.regex_program.RegexProgram.create(
+            pat, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        plc.interop.from_arrow(pa.scalar(repl)),
+        max_replace_count=max_replace_count,
+    )
+    expected = pc.replace_substring_regex(
+        arr,
+        pat,
+        repl,
+        max_replacements=max_replace_count
+        if max_replace_count != -1
+        else None,
+    )
+    assert_column_eq(result, expected)
+
+
+@pytest.mark.parametrize(
+    "flags",
+    [
+        plc.strings.regex_flags.RegexFlags.DEFAULT,
+        plc.strings.regex_flags.RegexFlags.DOTALL,
+    ],
+)
+def test_replace_re_list_str_columns(flags):
+    arr = pa.array(["foo", "fuz", None])
+    pats = ["oo", "uz"]
+    repls = ["a", "b"]
+    result = plc.strings.replace_re.replace_re(
+        plc.interop.from_arrow(arr),
+        pats,
+        plc.interop.from_arrow(pa.array(repls)),
+        flags=flags,
+    )
+    expected = arr
+    for pat, repl in zip(pats, repls):
+        expected = pc.replace_substring_regex(
+            expected,
+            pat,
+            repl,
+        )
+    assert_column_eq(result, expected)
+
+
+def test_replace_with_backrefs():
+    arr = pa.array(["Z756", None])
+    result = plc.strings.replace_re.replace_with_backrefs(
+        plc.interop.from_arrow(arr),
+        plc.strings.regex_program.RegexProgram.create(
+            "(\\d)(\\d)", plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        "V\\2\\1",
+    )
+    expected = pa.array(["ZV576", None])
+    assert_column_eq(result, expected)

From 14cdf53df3b8d9d7aa1bce1e8d9ff0af47998580 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Tue, 22 Oct 2024 19:13:44 -0400
Subject: [PATCH 099/117] Migrate NVText Replacing APIs to pylibcudf (#17084)

Part of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/17084
---
 cpp/include/nvtext/replace.hpp                |   4 +-
 .../api_docs/pylibcudf/nvtext/index.rst       |   1 +
 .../api_docs/pylibcudf/nvtext/replace.rst     |   6 +
 python/cudf/cudf/_lib/nvtext/normalize.pyx    |  16 +--
 python/cudf/cudf/_lib/nvtext/replace.pyx      |  66 +++-------
 .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt |   2 +-
 .../pylibcudf/pylibcudf/nvtext/__init__.pxd   |   2 +
 python/pylibcudf/pylibcudf/nvtext/__init__.py |   2 +
 python/pylibcudf/pylibcudf/nvtext/replace.pxd |  20 ++++
 python/pylibcudf/pylibcudf/nvtext/replace.pyx | 109 ++++++++++++++++++
 .../pylibcudf/tests/test_nvtext_replace.py    |  63 ++++++++++
 11 files changed, 230 insertions(+), 61 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/replace.pxd
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/replace.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py

diff --git a/cpp/include/nvtext/replace.hpp b/cpp/include/nvtext/replace.hpp
index bbd0503379b..822edcbdb43 100644
--- a/cpp/include/nvtext/replace.hpp
+++ b/cpp/include/nvtext/replace.hpp
@@ -82,7 +82,7 @@ namespace CUDF_EXPORT nvtext {
  * The default of empty string will identify tokens using whitespace.
* @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings columns of with replaced strings + * @return New strings column with replaced strings */ std::unique_ptr replace_tokens( cudf::strings_column_view const& input, @@ -131,7 +131,7 @@ std::unique_ptr replace_tokens( * The default of empty string will identify tokens using whitespace. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings columns of with replaced strings + * @return New strings column of filtered strings */ std::unique_ptr filter_tokens( cudf::strings_column_view const& input, diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst index 3a79c869971..c5b9533597a 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -10,3 +10,4 @@ nvtext minhash ngrams_tokenize normalize + replace diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst new file mode 100644 index 00000000000..04cee972dc1 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst @@ -0,0 +1,6 @@ +======= +replace +======= + +.. automodule:: pylibcudf.nvtext.replace + :members: diff --git a/python/cudf/cudf/_lib/nvtext/normalize.pyx b/python/cudf/cudf/_lib/nvtext/normalize.pyx index 633bc902db1..cc45123dd0a 100644 --- a/python/cudf/cudf/_lib/nvtext/normalize.pyx +++ b/python/cudf/cudf/_lib/nvtext/normalize.pyx @@ -11,16 +11,18 @@ from pylibcudf import nvtext @acquire_spill_lock() def normalize_spaces(Column input): - result = nvtext.normalize.normalize_spaces( - input.to_pylibcudf(mode="read") + return Column.from_pylibcudf( + nvtext.normalize.normalize_spaces( + input.to_pylibcudf(mode="read") + ) ) - return Column.from_pylibcudf(result) @acquire_spill_lock() def normalize_characters(Column input, bool do_lower=True): - result = nvtext.normalize.normalize_characters( - input.to_pylibcudf(mode="read"), - do_lower, + return Column.from_pylibcudf( + nvtext.normalize.normalize_characters( + input.to_pylibcudf(mode="read"), + do_lower, + ) ) - return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/replace.pyx b/python/cudf/cudf/_lib/nvtext/replace.pyx index 61ae3da5782..bec56ade83c 100644 --- a/python/cudf/cudf/_lib/nvtext/replace.pyx +++ b/python/cudf/cudf/_lib/nvtext/replace.pyx @@ -2,20 +2,10 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.replace cimport ( - filter_tokens as cpp_filter_tokens, - replace_tokens as cpp_replace_tokens, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar +from pylibcudf import nvtext @acquire_spill_lock() @@ -30,27 +20,14 @@ def replace_tokens(Column strings, provided. 
""" - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef column_view c_strings = strings.view() - cdef column_view c_targets = targets.view() - cdef column_view c_replacements = replacements.view() - - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_replace_tokens( - c_strings, - c_targets, - c_replacements, - c_delimiter[0], - ) + return Column.from_pylibcudf( + nvtext.replace.replace_tokens( + strings.to_pylibcudf(mode="read"), + targets.to_pylibcudf(mode="read"), + replacements.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, ) - - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() @@ -65,24 +42,11 @@ def filter_tokens(Column strings, character provided. """ - cdef DeviceScalar replacement = py_replacement.device_value - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef column_view c_strings = strings.view() - cdef const string_scalar* c_repl = replacement\ - .get_raw_ptr() - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_filter_tokens( - c_strings, - min_token_length, - c_repl[0], - c_delimiter[0], - ) + return Column.from_pylibcudf( + nvtext.replace.filter_tokens( + strings.to_pylibcudf(mode="read"), + min_token_length, + py_replacement.device_value.c_value, + py_delimiter.device_value.c_value, ) - - return Column.from_unique_ptr(move(c_result)) + ) diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index e01ca3fbdd3..7a94490998a 100644 --- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx - ngrams_tokenize.pyx normalize.pyx + ngrams_tokenize.pyx normalize.pyx replace.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd index 08dbec84090..5a5e665d309 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -7,6 +7,7 @@ from . cimport ( minhash, ngrams_tokenize, normalize, + replace, ) __all__ = [ @@ -16,4 +17,5 @@ __all__ = [ "minhash", "ngrams_tokenize", "normalize", + "replace", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index 6dccf3dd9cf..77187f0845d 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -7,6 +7,7 @@ minhash, ngrams_tokenize, normalize, + replace, ) __all__ = [ @@ -16,4 +17,5 @@ "minhash", "ngrams_tokenize", "normalize", + "replace", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pxd b/python/pylibcudf/pylibcudf/nvtext/replace.pxd new file mode 100644 index 00000000000..624f90e7486 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/replace.pxd @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column replace_tokens( + Column input, + Column targets, + Column replacements, + Scalar delimiter=*, +) + +cpdef Column filter_tokens( + Column input, + size_type min_token_length, + Scalar replacement=*, + Scalar delimiter=* +) diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pyx b/python/pylibcudf/pylibcudf/nvtext/replace.pyx new file mode 100644 index 00000000000..b65348ce14d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/replace.pyx @@ -0,0 +1,109 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cython.operator cimport dereference +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.replace cimport ( + filter_tokens as cpp_filter_tokens, + replace_tokens as cpp_replace_tokens, +) +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column replace_tokens( + Column input, + Column targets, + Column replacements, + Scalar delimiter=None, +): + """ + Replaces specified tokens with corresponding replacement strings. + + For details, see :cpp:func:`replace_tokens` + + Parameters + ---------- + input : Column + Strings column to replace + targets : Column + Strings to compare against tokens found in ``input`` + replacements : Column + Replacement strings for each string in ``targets`` + delimiter : Scalar, optional + Characters used to separate each string into tokens. + The default of empty string will identify tokens using whitespace. + + Returns + ------- + Column + New strings column with replaced strings + """ + cdef unique_ptr[column] c_result + if delimiter is None: + delimiter = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + with nogil: + c_result = cpp_replace_tokens( + input.view(), + targets.view(), + replacements.view(), + dereference(delimiter.get()), + ) + return Column.from_libcudf(move(c_result)) + + +cpdef Column filter_tokens( + Column input, + size_type min_token_length, + Scalar replacement=None, + Scalar delimiter=None +): + """ + Removes tokens whose lengths are less than a specified number of characters. + + For details, see :cpp:func:`filter_tokens` + + Parameters + ---------- + input : Column + Strings column to replace + min_token_length : size_type + The minimum number of characters to retain a + token in the output string + replacement : Scalar, optional + Optional replacement string to be used in place of removed tokens + delimiter : Scalar, optional + Characters used to separate each string into tokens. + The default of empty string will identify tokens using whitespace. 
+ Returns + ------- + Column + New strings column of filtered strings + """ + cdef unique_ptr[column] c_result + if delimiter is None: + delimiter = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + if replacement is None: + replacement = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + with nogil: + c_result = cpp_filter_tokens( + input.view(), + min_token_length, + dereference(replacement.get()), + dereference(delimiter.get()), + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py new file mode 100644 index 00000000000..0fb54bb4ee1 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py @@ -0,0 +1,63 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["the quick", "brown fox", "jumps*over the", "lazy dog"] + return pa.array(arr) + + +@pytest.fixture(scope="module") +def targets(): + arr = ["the quick", "brown fox", "jumps*over the", "lazy dog"] + return pa.array(arr) + + +@pytest.mark.parametrize("delim", ["*", None]) +def test_replace_tokens(input_col, targets, delim): + replacements = pa.array(["slow", "cat", "looked", "rat"]) + result = plc.nvtext.replace.replace_tokens( + plc.interop.from_arrow(input_col), + plc.interop.from_arrow(targets), + plc.interop.from_arrow(replacements), + plc.interop.from_arrow(pa.scalar(delim)) if delim else None, + ) + expected = pa.array(["slow", "cat", "jumps*over the", "rat"]) + if not delim: + expected = pa.array( + ["the quick", "brown fox", "jumps*over the", "lazy dog"] + ) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("min_token_length", [4, 5]) +@pytest.mark.parametrize("replace", ["---", None]) +@pytest.mark.parametrize("delim", ["*", None]) +def test_filter_tokens(input_col, min_token_length, replace, delim): + result = plc.nvtext.replace.filter_tokens( + plc.interop.from_arrow(input_col), + min_token_length, + plc.interop.from_arrow(pa.scalar(replace)) if replace else None, + plc.interop.from_arrow(pa.scalar(delim)) if delim else None, + ) + expected = pa.array( + ["the quick", "brown fox", "jumps*over the", "lazy dog"] + ) + if not delim and not replace and min_token_length == 4: + expected = pa.array([" quick", "brown ", "jumps*over ", "lazy "]) + if not delim and not replace and min_token_length == 5: + expected = pa.array([" quick", "brown ", "jumps*over ", " "]) + if not delim and replace == "---" and min_token_length == 4: + expected = pa.array( + ["--- quick", "brown ---", "jumps*over ---", "lazy ---"] + ) + if not delim and replace == "---" and min_token_length == 5: + expected = pa.array( + ["--- quick", "brown ---", "jumps*over ---", "--- ---"] + ) + assert_column_eq(result, expected) From 27c0c9d594c9cc94261ab9a1db968bfc50aa38e0 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Tue, 22 Oct 2024 19:16:06 -0400 Subject: [PATCH 100/117] Set the default number of threads in KvikIO thread pool to 8 (#17126) Recent benchmarks have shown that setting the environment variable `KVIKIO_NTHREADS=8` in cuDF usually leads to optimal I/O performance. This PR internally sets the default KvikIO thread pool size to 8. The env `KVIKIO_NTHREADS` will still be honored if users explicitly set it. Fixes #16718 Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/17126 --- cpp/include/cudf/io/config_utils.hpp | 16 ++++++++++++---- cpp/src/io/utilities/config_utils.cpp | 12 +++++++++++- cpp/src/io/utilities/data_sink.cpp | 1 + cpp/src/io/utilities/datasource.cpp | 1 + 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/io/config_utils.hpp b/cpp/include/cudf/io/config_utils.hpp index 1827ba0e3e6..13a76d50346 100644 --- a/cpp/include/cudf/io/config_utils.hpp +++ b/cpp/include/cudf/io/config_utils.hpp @@ -18,7 +18,8 @@ #include namespace CUDF_EXPORT cudf { -namespace io::cufile_integration { +namespace io { +namespace cufile_integration { /** * @brief Returns true if cuFile and its compatibility mode are enabled. @@ -35,9 +36,15 @@ bool is_gds_enabled(); */ bool is_kvikio_enabled(); -} // namespace io::cufile_integration +/** + * @brief Set kvikIO thread pool size according to the environment variable KVIKIO_NTHREADS. If + * KVIKIO_NTHREADS is not set, use 8 threads by default. + */ +void set_thread_pool_nthreads_from_env(); + +} // namespace cufile_integration -namespace io::nvcomp_integration { +namespace nvcomp_integration { /** * @brief Returns true if all nvCOMP uses are enabled. @@ -49,5 +56,6 @@ bool is_all_enabled(); */ bool is_stable_enabled(); -} // namespace io::nvcomp_integration +} // namespace nvcomp_integration +} // namespace io } // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index a3afbd52896..813743fa7b4 100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -19,7 +19,10 @@ #include #include +#include + #include +#include #include #include @@ -53,6 +56,14 @@ bool is_gds_enabled() { return is_always_enabled() or get_env_policy() == usage_ bool is_kvikio_enabled() { return get_env_policy() == usage_policy::KVIKIO; } +void set_thread_pool_nthreads_from_env() +{ + static std::once_flag flag{}; + std::call_once(flag, [] { + auto nthreads = getenv_or("KVIKIO_NTHREADS", 8U); + kvikio::defaults::thread_pool_nthreads_reset(nthreads); + }); +} } // namespace cufile_integration namespace nvcomp_integration { @@ -81,5 +92,4 @@ bool is_all_enabled() { return get_env_policy() == usage_policy::ALWAYS; } bool is_stable_enabled() { return is_all_enabled() or get_env_policy() == usage_policy::STABLE; } } // namespace nvcomp_integration - } // namespace cudf::io diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 0b76f3d3e8f..a8a275919d8 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -42,6 +42,7 @@ class file_sink : public data_sink { if (!_output_stream.is_open()) { detail::throw_on_file_open_failure(filepath, true); } if (cufile_integration::is_kvikio_enabled()) { + cufile_integration::set_thread_pool_nthreads_from_env(); _kvikio_file = kvikio::FileHandle(filepath, "w"); CUDF_LOG_INFO("Writing a file using kvikIO, with compatibility mode {}.", _kvikio_file.is_compat_mode_on() ? 
"on" : "off"); diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 2daaecadca6..19f2103071e 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -48,6 +48,7 @@ class file_source : public datasource { { detail::force_init_cuda_context(); if (cufile_integration::is_kvikio_enabled()) { + cufile_integration::set_thread_pool_nthreads_from_env(); _kvikio_file = kvikio::FileHandle(filepath); CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode {}.", _kvikio_file.is_compat_mode_on() ? "on" : "off"); From cff1296845aa9a4078dd0d95dd30b7e7c004f2d9 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Tue, 22 Oct 2024 20:28:51 -0400 Subject: [PATCH 101/117] JSON tokenizer memory optimizations (#16978) The full push-down automata that tokenizes the input JSON string, as well as the bracket-brace FST over-estimates the total buffer size required for the translated output and indices. This PR splits the `transduce` calls for both FSTs into two invocations. The first invocation estimates the size of the translated buffer and the translated indices, and the second call performs the DFA run. Authors: - Shruti Shivakumar (https://github.com/shrshi) - Karthikeyan (https://github.com/karthikeyann) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Basit Ayantunde (https://github.com/lamarrr) URL: https://github.com/rapidsai/cudf/pull/16978 --- cpp/src/io/json/nested_json_gpu.cu | 43 +++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 534b30a6089..60e78f4763d 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1448,10 +1448,6 @@ void get_stack_context(device_span json_in, // Number of stack operations in the input (i.e., number of '{', '}', '[', ']' outside of quotes) cudf::detail::device_scalar d_num_stack_ops(stream); - // Sequence of stack symbols and their position in the original input (sparse representation) - rmm::device_uvector stack_ops{json_in.size(), stream}; - rmm::device_uvector stack_op_indices{json_in.size(), stream}; - // Prepare finite-state transducer that only selects '{', '}', '[', ']' outside of quotes constexpr auto max_translation_table_size = to_stack_op::NUM_SYMBOL_GROUPS * to_stack_op::TT_NUM_STATES; @@ -1468,11 +1464,26 @@ void get_stack_context(device_span json_in, // "Search" for relevant occurrence of brackets and braces that indicate the beginning/end // of structs/lists + // Run FST to estimate the sizes of translated buffers + json_to_stack_ops_fst.Transduce(json_in.begin(), + static_cast(json_in.size()), + thrust::make_discard_iterator(), + thrust::make_discard_iterator(), + d_num_stack_ops.data(), + to_stack_op::start_state, + stream); + + auto stack_ops_bufsize = d_num_stack_ops.value(stream); + // Sequence of stack symbols and their position in the original input (sparse representation) + rmm::device_uvector stack_ops{stack_ops_bufsize, stream}; + rmm::device_uvector stack_op_indices{stack_ops_bufsize, stream}; + + // Run bracket-brace FST to retrieve starting positions of structs and lists json_to_stack_ops_fst.Transduce(json_in.begin(), static_cast(json_in.size()), stack_ops.data(), stack_op_indices.data(), - d_num_stack_ops.data(), + thrust::make_discard_iterator(), to_stack_op::start_state, stream); @@ -1508,6 +1519,7 @@ std::pair, rmm::device_uvector> pr device_span token_indices, rmm::cuda_stream_view 
stream) { + CUDF_FUNC_RANGE(); // Instantiate FST for post-processing the token stream to remove all tokens that belong to an // invalid JSON line token_filter::UnwrapTokenFromSymbolOp sgid_op{}; @@ -1643,21 +1655,28 @@ std::pair, rmm::device_uvector> ge // see a JSON-line delimiter as the very first item SymbolOffsetT const delimiter_offset = (format == tokenizer_pda::json_format_cfg_t::JSON_LINES_RECOVER ? 1 : 0); - rmm::device_uvector tokens{max_token_out_count + delimiter_offset, stream, mr}; - rmm::device_uvector tokens_indices{ - max_token_out_count + delimiter_offset, stream, mr}; + // Run FST to estimate the size of output buffers json_to_tokens_fst.Transduce(zip_in, static_cast(json_in.size()), - tokens.data() + delimiter_offset, - tokens_indices.data() + delimiter_offset, + thrust::make_discard_iterator(), + thrust::make_discard_iterator(), num_written_tokens.data(), tokenizer_pda::start_state, stream); auto const num_total_tokens = num_written_tokens.value(stream) + delimiter_offset; - tokens.resize(num_total_tokens, stream); - tokens_indices.resize(num_total_tokens, stream); + rmm::device_uvector tokens{num_total_tokens, stream, mr}; + rmm::device_uvector tokens_indices{num_total_tokens, stream, mr}; + + // Run FST to translate the input JSON string into tokens and indices at which they occur + json_to_tokens_fst.Transduce(zip_in, + static_cast(json_in.size()), + tokens.data() + delimiter_offset, + tokens_indices.data() + delimiter_offset, + thrust::make_discard_iterator(), + tokenizer_pda::start_state, + stream); if (delimiter_offset == 1) { tokens.set_element(0, token_t::LineEnd, stream); From 3126f775c527a8df65df2e2cbc8c2b73da2219bf Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 22 Oct 2024 22:34:30 -0500 Subject: [PATCH 102/117] [Bug] Fix Arrow-FS parquet reader for larger files (#17099) Follow-up to https://github.com/rapidsai/cudf/pull/16684 There is currently a bug in `dask_cudf.read_parquet(..., filesystem="arrow")` when the files are larger than the `"dataframe.parquet.minimum-partition-size"` config. More specifically, when the files are not aggregated together, the output will be `pd.DataFrame` instead of `cudf.DataFrame`. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/cudf/pull/17099 --- python/dask_cudf/dask_cudf/expr/_expr.py | 30 ++++++++++++++---- .../dask_cudf/io/tests/test_parquet.py | 31 ++++++++++++++++++- .../dask_cudf/dask_cudf/io/tests/test_s3.py | 4 +++ 3 files changed, 58 insertions(+), 7 deletions(-) diff --git a/python/dask_cudf/dask_cudf/expr/_expr.py b/python/dask_cudf/dask_cudf/expr/_expr.py index 4a9f4de8b9c..c7cf66fbffd 100644 --- a/python/dask_cudf/dask_cudf/expr/_expr.py +++ b/python/dask_cudf/dask_cudf/expr/_expr.py @@ -12,7 +12,7 @@ ) from dask_expr._reductions import Reduction, Var from dask_expr.io.io import FusedParquetIO -from dask_expr.io.parquet import ReadParquetPyarrowFS +from dask_expr.io.parquet import FragmentWrapper, ReadParquetPyarrowFS from dask.dataframe.core import ( _concat, @@ -302,16 +302,34 @@ def _dataset_info(self): return dataset_info @staticmethod - def _table_to_pandas( - table, - index_name, - *args, - ): + def _table_to_pandas(table, index_name): df = cudf.DataFrame.from_arrow(table) if index_name is not None: df = df.set_index(index_name) return df + def _filtered_task(self, index: int): + columns = self.columns.copy() + index_name = self.index.name + if self.index is not None: + index_name = self.index.name + schema = self._dataset_info["schema"].remove_metadata() + if index_name: + if columns is None: + columns = list(schema.names) + columns.append(index_name) + return ( + self._table_to_pandas, + ( + self._fragment_to_table, + FragmentWrapper(self.fragments[index], filesystem=self.fs), + self.filters, + columns, + schema, + ), + index_name, + ) + def _tune_up(self, parent): if self._fusion_compression_factor >= 1: return diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 896c4169f5b..ae5ca480e31 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -15,7 +15,11 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr, xfail_dask_expr +from dask_cudf.tests.utils import ( + require_dask_expr, + skip_dask_expr, + xfail_dask_expr, +) # Check if create_metadata_file is supported by # the current dask.dataframe version @@ -615,3 +619,28 @@ def test_timezone_column(tmpdir): got = dask_cudf.read_parquet(path) expect = cudf.read_parquet(path) dd.assert_eq(got, expect) + + +@require_dask_expr() +@pytest.mark.skipif( + not dask_cudf.backends.PYARROW_GE_15, + reason="Requires pyarrow 15", +) +@pytest.mark.parametrize("min_part_size", ["1B", "1GB"]) +def test_read_parquet_arrow_filesystem(tmpdir, min_part_size): + tmp_path = str(tmpdir) + with dask.config.set( + { + "dataframe.backend": "cudf", + "dataframe.parquet.minimum-partition-size": min_part_size, + } + ): + dd.from_dict( + {"x": range(1000), "y": ["a", "b", "c", "d"] * 250}, + npartitions=10, + ).to_parquet(tmp_path, write_index=False) + df = cudf.read_parquet(tmp_path) + ddf = dask_cudf.read_parquet(tmp_path, filesystem="arrow") + dd.assert_eq(df, ddf, check_index=False) + assert isinstance(ddf._meta, cudf.DataFrame) + assert isinstance(ddf.compute(), cudf.DataFrame) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py index cf8af82e112..90907f6fb99 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py @@ -11,6 +11,8 @@ from dask.dataframe import assert_eq +import cudf + import dask_cudf from 
dask_cudf.tests.utils import QUERY_PLANNING_ON @@ -168,6 +170,8 @@ def test_read_parquet_filesystem(s3_base, s3so, pdf, filesystem): filesystem=filesystem, ) assert df.b.sum().compute() == 9 + assert isinstance(df._meta, cudf.DataFrame) + assert isinstance(df.compute(), cudf.DataFrame) def test_read_parquet_filesystem_explicit(s3_base, s3so, pdf): From f0c6a04c6b4bd8f09b59adfaeb97435a90898fb0 Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Wed, 23 Oct 2024 07:41:03 -0700 Subject: [PATCH 103/117] Add JNI Support for Multi-line Delimiters and Include Test (#17139) This PR introduces the necessary changes to the cuDF jni to support the issue described in [NVIDIA/spark-rapids#11554](https://github.com/NVIDIA/spark-rapids/issues/11554). For further information, refer to the details in the [comment](https://github.com/NVIDIA/spark-rapids/issues/11554#issuecomment-2425327183). Issue #15961 adds support for handling multiple line delimiters. This PR extends that functionality to JNI, which was previously missing, and also includes a test to validate the changes. Authors: - Suraj Aralihalli (https://github.com/SurajAralihalli) Approvers: - MithunR (https://github.com/mythrocks) - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/17139 --- .../main/java/ai/rapids/cudf/RegexFlag.java | 11 +++++- .../java/ai/rapids/cudf/ColumnVectorTest.java | 38 +++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/java/src/main/java/ai/rapids/cudf/RegexFlag.java b/java/src/main/java/ai/rapids/cudf/RegexFlag.java index 7ed8e0354c9..68a3856f37d 100644 --- a/java/src/main/java/ai/rapids/cudf/RegexFlag.java +++ b/java/src/main/java/ai/rapids/cudf/RegexFlag.java @@ -28,7 +28,16 @@ public enum RegexFlag { DEFAULT(0), // default MULTILINE(8), // the '^' and '$' honor new-line characters DOTALL(16), // the '.' matching includes new-line characters - ASCII(256); // use only ASCII when matching built-in character classes + ASCII(256), // use only ASCII when matching built-in character classes + /** + * EXT_NEWLINE(512): Extends line delimiters to include the following Unicode characters + * - NEXT_LINE ('\u0085') + * - LINE_SEPARATOR ('\u2028') + * - PARAGRAPH_SEPARATOR ('\u2029') + * - CARRIAGE_RETURN ('\r') + * - NEW_LINE ('\n') + */ + EXT_NEWLINE(512); final int nativeId; // Native id, for use with libcudf. 
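  // The flag values above match the bit values of libcudf's
  // cudf::strings::regex_flags (for example ASCII = 0x100 and
  // EXT_NEWLINE = 0x200), so nativeId can be passed across JNI unchanged.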
private RegexFlag(int nativeId) { // Only constant values should be used diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 708744569df..14c290b300a 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -31,6 +31,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.EnumSet; import java.util.List; import java.util.Optional; import java.util.concurrent.atomic.AtomicInteger; @@ -3877,6 +3878,43 @@ void testExtractRe() { } } + @Test +void testExtractReWithMultiLineDelimiters() { + String NEXT_LINE = "\u0085"; + String LINE_SEPARATOR = "\u2028"; + String PARAGRAPH_SEPARATOR = "\u2029"; + String CARRIAGE_RETURN = "\r"; + String NEW_LINE = "\n"; + + try (ColumnVector input = ColumnVector.fromStrings( + "boo:" + NEXT_LINE + "boo::" + LINE_SEPARATOR + "boo:::", + "boo:::" + LINE_SEPARATOR + "zzé" + CARRIAGE_RETURN + "lll", + "boo::", + "", + "boo::" + NEW_LINE, + "boo::" + CARRIAGE_RETURN, + "boo:" + NEXT_LINE + "boo::" + PARAGRAPH_SEPARATOR, + "boo:" + NEW_LINE + "boo::" + LINE_SEPARATOR, + "boo:" + NEXT_LINE + "boo::" + NEXT_LINE); + Table expected_ext_newline = new Table.TestBuilder() + .column("boo:::", null, "boo::", null, "boo::", "boo::", "boo::", "boo::", "boo::") + .build(); + Table expected_default = new Table.TestBuilder() + .column("boo:::", null, "boo::", null, "boo::", null, null, null, null) + .build()) { + + // Regex pattern to match 'boo:' followed by one or more colons at the end of the string + try (Table found = input.extractRe(new RegexProgram("(boo:+)$", EnumSet.of(RegexFlag.EXT_NEWLINE)))) { + assertColumnsAreEqual(expected_ext_newline.getColumns()[0], found.getColumns()[0]); + } + + try (Table found = input.extractRe(new RegexProgram("(boo:+)$", EnumSet.of(RegexFlag.DEFAULT)))) { + assertColumnsAreEqual(expected_default.getColumns()[0], found.getColumns()[0]); + } + } + } + + @Test void testExtractAllRecord() { String pattern = "([ab])(\\d)"; From 02ee8199a7703eafe8aa061213423eecf27757ec Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 23 Oct 2024 09:28:20 -0700 Subject: [PATCH 104/117] Use async execution policy for true_if (#17146) Closes #17117 Related to #12086 This PR replaces the synchronous execution policy with an asynchronous one to eliminate unnecessary synchronization. 
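The practical difference, shown as a minimal sketch (the helper below is
illustrative, not code from this PR): `rmm::exec_policy` can block the host
until the Thrust algorithm finishes on the stream, while
`rmm::exec_policy_nosync` lets algorithms skip that synchronization where
possible, leaving it to the caller.

    #include <rmm/cuda_stream_view.hpp>
    #include <rmm/exec_policy.hpp>

    #include <thrust/transform.h>

    #include <cstddef>

    struct is_positive {
      __device__ bool operator()(int v) const { return v > 0; }
    };

    // Hypothetical helper: fills `out` with (in[i] > 0); `in` and `out`
    // point to device memory.
    void mark_positive(int const* in, bool* out, std::size_t n, rmm::cuda_stream_view stream)
    {
      // Work is enqueued on `stream` and control returns immediately; callers
      // that read the results on the host must synchronize the stream first.
      thrust::transform(rmm::exec_policy_nosync(stream), in, in + n, out, is_positive{});
    }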
Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Shruti Shivakumar (https://github.com/shrshi)
  - Jason Lowe (https://github.com/jlowe)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/17146
---
 cpp/include/cudf/detail/unary.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/detail/unary.hpp b/cpp/include/cudf/detail/unary.hpp
index 18b1e9b2d2e..0f852db0c54 100644
--- a/cpp/include/cudf/detail/unary.hpp
+++ b/cpp/include/cudf/detail/unary.hpp
@@ -59,7 +59,7 @@ std::unique_ptr<column> true_if(InputIterator begin,
   auto output_mutable_view = output->mutable_view();
   auto output_data         = output_mutable_view.data<bool>();

-  thrust::transform(rmm::exec_policy(stream), begin, end, output_data, p);
+  thrust::transform(rmm::exec_policy_nosync(stream), begin, end, output_data, p);

   return output;
 }

From deb9af4df7c9c44a43a1d146d4ba4eb8bad31cbb Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Wed, 23 Oct 2024 11:21:24 -0700
Subject: [PATCH 105/117] Replace direct `cudaMemcpyAsync` calls with utility
 functions (limited to `cudf::io`) (#17132)

Issue https://github.com/rapidsai/cudf/issues/15620

Replaced the calls to `cudaMemcpyAsync` with the new
`cuda_memcpy`/`cuda_memcpy_async` utility, which optionally avoids using the
copy engine. Changes are limited to cuIO to make the PR easier to review
(repetitive enough as-is!).

Also took the opportunity to use `cudf::detail::host_vector` and its
factories to enable wider pinned memory use.

Skipped a few instances of `cudaMemcpyAsync`; few are under `io::comp`, which
we don't want to invest in further (if possible). The other `cudaMemcpyAsync`
instances are D2D copies, which `cuda_memcpy`/`cuda_memcpy_async` don't
support. Perhaps they should, just to make the use ubiquitous.
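The utilities take typed spans instead of a raw pointer, byte count, and copy
kind, which is the calling shape visible throughout the hunks below. A minimal
sketch of a host-to-device copy (buffer names are illustrative):

    #include <cudf/detail/utilities/cuda_memcpy.hpp>
    #include <cudf/utilities/span.hpp>

    #include <rmm/cuda_stream_view.hpp>
    #include <rmm/device_uvector.hpp>

    #include <vector>

    // Hypothetical helper copying a host buffer into a preallocated device buffer.
    void copy_to_device(std::vector<char> const& h_buf,
                        rmm::device_uvector<char>& d_buf,
                        rmm::cuda_stream_view stream)
    {
      // Sizes travel with the spans; internally the utility chooses between
      // cudaMemcpyAsync and a kernel-based path that avoids the copy engine.
      cudf::detail::cuda_memcpy_async(
        cudf::device_span<char>{d_buf.data(), h_buf.size()},
        cudf::host_span<char const>{h_buf.data(), h_buf.size()},
        stream);
    }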
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Paul Mattione (https://github.com/pmattione-nvidia) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/17132 --- cpp/src/io/comp/uncomp.cpp | 6 +- cpp/src/io/csv/reader_impl.cu | 57 ++++++++----------- cpp/src/io/csv/writer_impl.cu | 10 +--- cpp/src/io/json/host_tree_algorithms.cu | 24 +++----- cpp/src/io/json/json_column.cu | 16 +++--- cpp/src/io/json/json_tree.cu | 15 ++--- cpp/src/io/json/read_json.cu | 13 ++--- cpp/src/io/orc/reader_impl_chunking.cu | 8 +-- cpp/src/io/orc/writer_impl.cu | 26 ++++----- cpp/src/io/parquet/predicate_pushdown.cpp | 21 ++++--- cpp/src/io/parquet/reader_impl.cpp | 2 +- cpp/src/io/parquet/reader_impl.hpp | 2 +- cpp/src/io/parquet/reader_impl_chunking.hpp | 2 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 37 +++++------- cpp/src/io/parquet/writer_impl.cu | 29 +++------- cpp/src/io/text/bgzip_data_chunk_source.cu | 4 +- .../io/text/data_chunk_source_factories.cpp | 22 +++---- cpp/src/io/utilities/datasource.cpp | 12 ++-- 18 files changed, 134 insertions(+), 172 deletions(-) diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 1af45b41d8e..d4d6f46b99a 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -538,8 +538,10 @@ size_t decompress_zstd(host_span src, CUDF_EXPECTS(hd_stats[0].status == compression_status::SUCCESS, "ZSTD decompression failed"); // Copy temporary output to `dst` - CUDF_CUDA_TRY(cudaMemcpyAsync( - dst.data(), d_dst.data(), hd_stats[0].bytes_written, cudaMemcpyDefault, stream.value())); + cudf::detail::cuda_memcpy_async( + dst.subspan(0, hd_stats[0].bytes_written), + device_span{d_dst.data(), hd_stats[0].bytes_written}, + stream); return hd_stats[0].bytes_written; } diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 8c32fc85f78..72fca75c56b 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -21,6 +21,7 @@ #include "csv_common.hpp" #include "csv_gpu.hpp" +#include "cudf/detail/utilities/cuda_memcpy.hpp" #include "io/comp/io_uncomp.hpp" #include "io/utilities/column_buffer.hpp" #include "io/utilities/hostdevice_vector.hpp" @@ -275,11 +276,10 @@ std::pair, selected_rows_offsets> load_data_and_gather auto const read_offset = byte_range_offset + input_pos + previous_data_size; auto const read_size = target_pos - input_pos - previous_data_size; if (data.has_value()) { - CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.data() + previous_data_size, - data->data() + read_offset, - target_pos - input_pos - previous_data_size, - cudaMemcpyDefault, - stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{d_data.data() + previous_data_size, read_size}, + data->subspan(read_offset, read_size), + stream); } else { if (source->is_device_read_preferred(read_size)) { source->device_read(read_offset, @@ -288,12 +288,11 @@ std::pair, selected_rows_offsets> load_data_and_gather stream); } else { auto const buffer = source->host_read(read_offset, read_size); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.data() + previous_data_size, - buffer->data(), - buffer->size(), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); // To prevent buffer going out of scope before we copy the data. + // Use sync version to prevent buffer going out of scope before we copy the data. 
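+      // (the synchronous variant returns only after the copy completes, so the
+      // temporary host buffer cannot be freed while the transfer is in flight)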
+ cudf::detail::cuda_memcpy( + device_span{d_data.data() + previous_data_size, read_size}, + host_span{reinterpret_cast(buffer->data()), buffer->size()}, + stream); } } @@ -311,12 +310,10 @@ std::pair, selected_rows_offsets> load_data_and_gather range_end, skip_rows, stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), - row_ctx.device_ptr(), - num_blocks * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + + cudf::detail::cuda_memcpy(host_span{row_ctx}.subspan(0, num_blocks), + device_span{row_ctx}.subspan(0, num_blocks), + stream); // Sum up the rows in each character block, selecting the row count that // corresponds to the current input context. Also stores the now known input @@ -331,11 +328,9 @@ std::pair, selected_rows_offsets> load_data_and_gather // At least one row in range in this batch all_row_offsets.resize(total_rows - skip_rows, stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.device_ptr(), - row_ctx.host_ptr(), - num_blocks * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); + cudf::detail::cuda_memcpy_async(device_span{row_ctx}.subspan(0, num_blocks), + host_span{row_ctx}.subspan(0, num_blocks), + stream); // Pass 2: Output row offsets cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(), @@ -352,12 +347,9 @@ std::pair, selected_rows_offsets> load_data_and_gather stream); // With byte range, we want to keep only one row out of the specified range if (range_end < data_size) { - CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), - row_ctx.device_ptr(), - num_blocks * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + cudf::detail::cuda_memcpy(host_span{row_ctx}.subspan(0, num_blocks), + device_span{row_ctx}.subspan(0, num_blocks), + stream); size_t rows_out_of_range = 0; for (uint32_t i = 0; i < num_blocks; i++) { @@ -401,12 +393,9 @@ std::pair, selected_rows_offsets> load_data_and_gather // Remove header rows and extract header auto const header_row_index = std::max(header_rows, 1) - 1; if (header_row_index + 1 < row_offsets.size()) { - CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), - row_offsets.data() + header_row_index, - 2 * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + cudf::detail::cuda_memcpy(host_span{row_ctx}.subspan(0, 2), + device_span{row_offsets.data() + header_row_index, 2}, + stream); auto const header_start = input_pos + row_ctx[0]; auto const header_end = input_pos + row_ctx[1]; diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index b84446b5f3e..2bbe05ced84 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -405,13 +406,8 @@ void write_chunked(data_sink* out_sink, out_sink->device_write(ptr_all_bytes, total_num_bytes, stream); } else { // copy the bytes to host to write them out - thrust::host_vector h_bytes(total_num_bytes); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_bytes.data(), - ptr_all_bytes, - total_num_bytes * sizeof(char), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + auto const h_bytes = cudf::detail::make_host_vector_sync( + device_span{ptr_all_bytes, total_num_bytes}, stream); out_sink->host_write(h_bytes.data(), total_num_bytes); } diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index 7ee652e0239..d06338c6f69 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -198,17 
+198,13 @@ NodeIndexT get_row_array_parent_col_id(device_span col_ids, bool is_enabled_lines, rmm::cuda_stream_view stream) { - NodeIndexT value = parent_node_sentinel; - if (!col_ids.empty()) { - auto const list_node_index = is_enabled_lines ? 0 : 1; - CUDF_CUDA_TRY(cudaMemcpyAsync(&value, - col_ids.data() + list_node_index, - sizeof(NodeIndexT), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - } - return value; + if (col_ids.empty()) { return parent_node_sentinel; } + + auto const list_node_index = is_enabled_lines ? 0 : 1; + auto const value = cudf::detail::make_host_vector_sync( + device_span{col_ids.data() + list_node_index, 1}, stream); + + return value[0]; } /** * @brief Holds member data pointers of `d_json_column` @@ -818,11 +814,7 @@ std::pair, hashmap_of_device_columns> build_tree column_categories.cbegin(), expected_types.begin(), [](auto exp, auto cat) { return exp == NUM_NODE_CLASSES ? cat : exp; }); - cudaMemcpyAsync(d_column_tree.node_categories.begin(), - expected_types.data(), - expected_types.size() * sizeof(column_categories[0]), - cudaMemcpyDefault, - stream.value()); + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories, expected_types, stream); return {is_pruned, columns}; } diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 4584f71775f..7e4d975e431 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -513,16 +513,14 @@ table_with_metadata device_parse_nested_json(device_span d_input, #endif bool const is_array_of_arrays = [&]() { - std::array h_node_categories = {NC_ERR, NC_ERR}; - auto const size_to_copy = std::min(size_t{2}, gpu_tree.node_categories.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_node_categories.data(), - gpu_tree.node_categories.data(), - sizeof(node_t) * size_to_copy, - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + auto const size_to_copy = std::min(size_t{2}, gpu_tree.node_categories.size()); + if (size_to_copy == 0) return false; + auto const h_node_categories = cudf::detail::make_host_vector_sync( + device_span{gpu_tree.node_categories.data(), size_to_copy}, stream); + if (options.is_enabled_lines()) return h_node_categories[0] == NC_LIST; - return h_node_categories[0] == NC_LIST and h_node_categories[1] == NC_LIST; + return h_node_categories.size() >= 2 and h_node_categories[0] == NC_LIST and + h_node_categories[1] == NC_LIST; }(); auto [gpu_col_id, gpu_row_offsets] = diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index d949635c1cc..e2fe926ea19 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -264,16 +264,13 @@ tree_meta_t get_tree_representation(device_span tokens, error_count > 0) { auto const error_location = thrust::find(rmm::exec_policy(stream), tokens.begin(), tokens.end(), token_t::ErrorBegin); - SymbolOffsetT error_index; - CUDF_CUDA_TRY( - cudaMemcpyAsync(&error_index, - token_indices.data() + thrust::distance(tokens.begin(), error_location), - sizeof(SymbolOffsetT), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + auto error_index = cudf::detail::make_host_vector_sync( + device_span{ + token_indices.data() + thrust::distance(tokens.begin(), error_location), 1}, + stream); + CUDF_FAIL("JSON Parser encountered an invalid format at location " + - std::to_string(error_index)); + std::to_string(error_index[0])); } auto const num_tokens = tokens.size(); diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index c424d2b3b62..8a740ae17ef 
100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -315,13 +315,12 @@ device_span ingest_raw_input(device_span buffer, // Reading to host because decompression of a single block is much faster on the CPU sources[0]->host_read(range_offset, remaining_bytes_to_read, hbuffer.data()); auto uncomp_data = decompress(compression, hbuffer); - CUDF_CUDA_TRY(cudaMemcpyAsync(buffer.data(), - reinterpret_cast(uncomp_data.data()), - uncomp_data.size() * sizeof(char), - cudaMemcpyHostToDevice, - stream.value())); - stream.synchronize(); - return buffer.first(uncomp_data.size()); + auto ret_buffer = buffer.first(uncomp_data.size()); + cudf::detail::cuda_memcpy( + ret_buffer, + host_span{reinterpret_cast(uncomp_data.data()), uncomp_data.size()}, + stream); + return ret_buffer; } table_with_metadata read_json(host_span> sources, diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 9cc77e8e738..fcaee9c548e 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -516,10 +516,10 @@ void reader_impl::load_next_stripe_data(read_mode mode) _stream.synchronize(); stream_synchronized = true; } - device_read_tasks.push_back( - std::pair(source_ptr->device_read_async( - read_info.offset, read_info.length, dst_base + read_info.dst_pos, _stream), - read_info.length)); + device_read_tasks.emplace_back( + source_ptr->device_read_async( + read_info.offset, read_info.length, dst_base + read_info.dst_pos, _stream), + read_info.length); } else { auto buffer = source_ptr->host_read(read_info.offset, read_info.length); diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 03020eb649f..d432deb8e79 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -19,6 +19,7 @@ * @brief cuDF-IO ORC writer class implementation */ +#include "cudf/detail/utilities/cuda_memcpy.hpp" #include "io/comp/nvcomp_adapter.hpp" #include "io/orc/orc_gpu.hpp" #include "io/statistics/column_statistics.cuh" @@ -1408,7 +1409,8 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, num_entries_seen += stripes_per_col; } - std::vector file_stats_merge(num_file_blobs); + auto file_stats_merge = + cudf::detail::make_host_vector(num_file_blobs, stream); for (auto i = 0u; i < num_file_blobs; ++i) { auto col_stats = &file_stats_merge[i]; col_stats->col_dtype = per_chunk_stats.col_types[i]; @@ -1418,11 +1420,10 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, } auto d_file_stats_merge = stats_merge.device_ptr(num_stripe_blobs); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_file_stats_merge, - file_stats_merge.data(), - num_file_blobs * sizeof(statistics_merge_group), - cudaMemcpyDefault, - stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{stats_merge.device_ptr(num_stripe_blobs), num_file_blobs}, + file_stats_merge, + stream); auto file_stat_chunks = stat_chunks.data() + num_stripe_blobs; detail::merge_group_statistics( @@ -1573,7 +1574,7 @@ void write_index_stream(int32_t stripe_id, * @param[in] strm_desc Stream's descriptor * @param[in] enc_stream Chunk's streams * @param[in] compressed_data Compressed stream data - * @param[in,out] stream_out Temporary host output buffer + * @param[in,out] bounce_buffer Pinned memory bounce buffer for D2H data transfer * @param[in,out] stripe Stream's parent stripe * @param[in,out] streams List of all streams * @param[in] compression_kind The compression kind @@ -1584,7 +1585,7 @@ void 
write_index_stream(int32_t stripe_id, std::future write_data_stream(gpu::StripeStream const& strm_desc, gpu::encoder_chunk_streams const& enc_stream, uint8_t const* compressed_data, - uint8_t* stream_out, + host_span bounce_buffer, StripeInformation* stripe, orc_streams* streams, CompressionKind compression_kind, @@ -1604,11 +1605,10 @@ std::future write_data_stream(gpu::StripeStream const& strm_desc, if (out_sink->is_device_write_preferred(length)) { return out_sink->device_write_async(stream_in, length, stream); } else { - CUDF_CUDA_TRY( - cudaMemcpyAsync(stream_out, stream_in, length, cudaMemcpyDefault, stream.value())); - stream.synchronize(); + cudf::detail::cuda_memcpy( + bounce_buffer.subspan(0, length), device_span{stream_in, length}, stream); - out_sink->host_write(stream_out, length); + out_sink->host_write(bounce_buffer.data(), length); return std::async(std::launch::deferred, [] {}); } }(); @@ -2616,7 +2616,7 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data, strm_desc, enc_data.streams[strm_desc.column_id][segmentation.stripes[stripe_id].first], compressed_data.data(), - bounce_buffer.data(), + bounce_buffer, &stripe, &streams, _compression_kind, diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index f0a0bc0b51b..32e922b04bb 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -454,15 +454,18 @@ std::optional>> aggregate_reader_metadata::fi CUDF_EXPECTS(predicate.type().id() == cudf::type_id::BOOL8, "Filter expression must return a boolean column"); - auto num_bitmasks = num_bitmask_words(predicate.size()); - std::vector host_bitmask(num_bitmasks, ~bitmask_type{0}); - if (predicate.nullable()) { - CUDF_CUDA_TRY(cudaMemcpyAsync(host_bitmask.data(), - predicate.null_mask(), - num_bitmasks * sizeof(bitmask_type), - cudaMemcpyDefault, - stream.value())); - } + auto const host_bitmask = [&] { + auto const num_bitmasks = num_bitmask_words(predicate.size()); + if (predicate.nullable()) { + return cudf::detail::make_host_vector_sync( + device_span(predicate.null_mask(), num_bitmasks), stream); + } else { + auto bitmask = cudf::detail::make_host_vector(num_bitmasks, stream); + std::fill(bitmask.begin(), bitmask.end(), ~bitmask_type{0}); + return bitmask; + } + }(); + auto validity_it = cudf::detail::make_counting_transform_iterator( 0, [bitmask = host_bitmask.data()](auto bit_index) { return bit_is_set(bitmask, bit_index); }); diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index f0865c715bc..0705ff6f5cc 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -78,7 +78,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // TODO: This step is somewhat redundant if size info has already been calculated (nested schema, // chunked reader). 
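  // (the make_host_vector factory used below returns a cudf::detail::host_vector
  // that can be backed by pinned host memory, depending on cudf's host
  // allocation settings, unlike the plain std::vector it replaces)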
auto const has_strings = (kernel_mask & STRINGS_MASK) != 0; - std::vector col_string_sizes(_input_columns.size(), 0L); + auto col_string_sizes = cudf::detail::make_host_vector(_input_columns.size(), _stream); if (has_strings) { // need to compute pages bounds/sizes if we lack page indexes or are using custom bounds // TODO: we could probably dummy up size stats for FLBA data since we know the width diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 62ffc4d3077..3aa9b94ed6b 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -284,7 +284,7 @@ class reader::impl { * * @return Vector of total string data sizes for each column */ - std::vector calculate_page_string_offsets(); + cudf::detail::host_vector calculate_page_string_offsets(); /** * @brief Converts the page data and outputs to columns. diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index 3a3cdd34a58..a0c2dbd3e44 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -107,7 +107,7 @@ struct subpass_intermediate_data { * rowgroups may represent less than all of the rowgroups to be read for the file. */ struct pass_intermediate_data { - std::vector> raw_page_data; + std::vector raw_page_data; // rowgroup, chunk and page information for the current pass. bool has_compressed_data{false}; diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 60e35465793..f03f1214b9a 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -218,7 +218,7 @@ void generate_depth_remappings( */ [[nodiscard]] std::future read_column_chunks_async( std::vector> const& sources, - std::vector>& page_data, + cudf::host_span page_data, cudf::detail::hostdevice_vector& chunks, size_t begin_chunk, size_t end_chunk, @@ -251,23 +251,24 @@ void generate_depth_remappings( if (source->is_device_read_preferred(io_size)) { // Buffer needs to be padded. // Required by `gpuDecodePageData`. - auto buffer = + page_data[chunk] = rmm::device_buffer(cudf::util::round_up_safe(io_size, BUFFER_PADDING_MULTIPLE), stream); auto fut_read_size = source->device_read_async( - io_offset, io_size, static_cast(buffer.data()), stream); + io_offset, io_size, static_cast(page_data[chunk].data()), stream); read_tasks.emplace_back(std::move(fut_read_size)); - page_data[chunk] = datasource::buffer::create(std::move(buffer)); } else { auto const read_buffer = source->host_read(io_offset, io_size); // Buffer needs to be padded. // Required by `gpuDecodePageData`. 
- auto tmp_buffer = rmm::device_buffer( + page_data[chunk] = rmm::device_buffer( cudf::util::round_up_safe(read_buffer->size(), BUFFER_PADDING_MULTIPLE), stream); - CUDF_CUDA_TRY(cudaMemcpyAsync( - tmp_buffer.data(), read_buffer->data(), read_buffer->size(), cudaMemcpyDefault, stream)); - page_data[chunk] = datasource::buffer::create(std::move(tmp_buffer)); + CUDF_CUDA_TRY(cudaMemcpyAsync(page_data[chunk].data(), + read_buffer->data(), + read_buffer->size(), + cudaMemcpyDefault, + stream)); } - auto d_compdata = page_data[chunk]->data(); + auto d_compdata = static_cast(page_data[chunk].data()); do { chunks[chunk].compressed_data = d_compdata; d_compdata += chunks[chunk].compressed_size; @@ -980,7 +981,7 @@ std::pair> reader::impl::read_column_chunks() std::vector chunk_source_map(num_chunks); // Tracker for eventually deallocating compressed and uncompressed data - raw_page_data = std::vector>(num_chunks); + raw_page_data = std::vector(num_chunks); // Keep track of column chunk file offsets std::vector column_chunk_offsets(num_chunks); @@ -1695,15 +1696,14 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num nullmask_bufs, std::numeric_limits::max(), _stream); } -std::vector reader::impl::calculate_page_string_offsets() +cudf::detail::host_vector reader::impl::calculate_page_string_offsets() { auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; auto page_keys = make_page_key_iterator(subpass.pages); - std::vector col_sizes(_input_columns.size(), 0L); - rmm::device_uvector d_col_sizes(col_sizes.size(), _stream); + rmm::device_uvector d_col_sizes(_input_columns.size(), _stream); // use page_index to fetch page string sizes in the proper order auto val_iter = thrust::make_transform_iterator(subpass.pages.device_begin(), @@ -1717,7 +1717,7 @@ std::vector reader::impl::calculate_page_string_offsets() page_offset_output_iter{subpass.pages.device_ptr()}); // now sum up page sizes - rmm::device_uvector reduce_keys(col_sizes.size(), _stream); + rmm::device_uvector reduce_keys(d_col_sizes.size(), _stream); thrust::reduce_by_key(rmm::exec_policy_nosync(_stream), page_keys, page_keys + subpass.pages.size(), @@ -1725,14 +1725,7 @@ std::vector reader::impl::calculate_page_string_offsets() reduce_keys.begin(), d_col_sizes.begin()); - cudaMemcpyAsync(col_sizes.data(), - d_col_sizes.data(), - sizeof(size_t) * col_sizes.size(), - cudaMemcpyDeviceToHost, - _stream); - _stream.synchronize(); - - return col_sizes; + return cudf::detail::make_host_vector_sync(d_col_sizes, _stream); } } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 190f13eb688..f865c9a7643 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -183,7 +183,7 @@ struct aggregate_writer_metadata { std::vector row_groups; std::vector key_value_metadata; std::vector offset_indexes; - std::vector> column_indexes; + std::vector> column_indexes; }; std::vector files; std::optional> column_orders = std::nullopt; @@ -1543,12 +1543,7 @@ void encode_pages(hostdevice_2dvector& chunks, d_chunks.flat_view(), {column_stats, pages.size()}, column_index_truncate_length, stream); } - auto h_chunks = chunks.host_view(); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_chunks.data(), - d_chunks.data(), - d_chunks.flat_view().size_bytes(), - cudaMemcpyDefault, - stream.value())); + chunks.device_to_host_async(stream); if (comp_stats.has_value()) { comp_stats.value() += collect_compression_statistics(comp_in, comp_res, stream); 
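The `chunks.device_to_host_async(stream)` call above works because the
hostdevice containers pair a pinned host allocation with its device mirror. A
simplified sketch of the typed device-to-host copy underneath (not the actual
cudf class):

    #include <cudf/detail/utilities/cuda_memcpy.hpp>
    #include <cudf/detail/utilities/host_vector.hpp>
    #include <cudf/utilities/span.hpp>

    #include <rmm/cuda_stream_view.hpp>
    #include <rmm/device_uvector.hpp>

    // One typed call replaces cudaMemcpyAsync plus manual size arithmetic.
    template <typename T>
    void copy_device_to_host_async(cudf::detail::host_vector<T>& host,
                                   rmm::device_uvector<T> const& device,
                                   rmm::cuda_stream_view stream)
    {
      cudf::detail::cuda_memcpy_async(
        cudf::host_span<T>{host.data(), host.size()},
        cudf::device_span<T const>{device.data(), device.size()},
        stream);
    }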
@@ -2559,12 +2554,11 @@ void writer::impl::write_parquet_data_to_sink( } else { CUDF_EXPECTS(bounce_buffer.size() >= ck.compressed_size, "Bounce buffer was not properly initialized."); - CUDF_CUDA_TRY(cudaMemcpyAsync(bounce_buffer.data(), - dev_bfr + ck.ck_stat_size, - ck.compressed_size, - cudaMemcpyDefault, - _stream.value())); - _stream.synchronize(); + cudf::detail::cuda_memcpy( + host_span{bounce_buffer}.subspan(0, ck.compressed_size), + device_span{dev_bfr + ck.ck_stat_size, ck.compressed_size}, + _stream); + _out_sink[p]->host_write(bounce_buffer.data(), ck.compressed_size); } @@ -2600,13 +2594,8 @@ void writer::impl::write_parquet_data_to_sink( auto const& column_chunk_meta = row_group.columns[i].meta_data; // start transfer of the column index - std::vector column_idx; - column_idx.resize(ck.column_index_size); - CUDF_CUDA_TRY(cudaMemcpyAsync(column_idx.data(), - ck.column_index_blob, - ck.column_index_size, - cudaMemcpyDefault, - _stream.value())); + auto column_idx = cudf::detail::make_host_vector_async( + device_span{ck.column_index_blob, ck.column_index_size}, _stream); // calculate offsets while the column index is transferring int64_t curr_pg_offset = column_chunk_meta.data_page_offset; diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index badcd3f58f9..06069630685 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -74,8 +74,8 @@ class bgzip_data_chunk_reader : public data_chunk_reader { // Buffer needs to be padded. // Required by `inflate_kernel`. device.resize(cudf::util::round_up_safe(host.size(), BUFFER_PADDING_MULTIPLE), stream); - CUDF_CUDA_TRY(cudaMemcpyAsync( - device.data(), host.data(), host.size() * sizeof(T), cudaMemcpyDefault, stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{device}.subspan(0, host.size()), host, stream); } struct decompression_blocks { diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 58faa0ebfe4..4baea8655e0 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -87,8 +87,10 @@ class datasource_chunk_reader : public data_chunk_reader { _source->host_read(_offset, read_size, reinterpret_cast(h_ticket.buffer.data())); // copy the host-pinned data on to device - CUDF_CUDA_TRY(cudaMemcpyAsync( - chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyDefault, stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{chunk}.subspan(0, read_size), + host_span{h_ticket.buffer}.subspan(0, read_size), + stream); // record the host-to-device copy. CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); @@ -153,8 +155,10 @@ class istream_data_chunk_reader : public data_chunk_reader { auto chunk = rmm::device_uvector(read_size, stream); // copy the host-pinned data on to device - CUDF_CUDA_TRY(cudaMemcpyAsync( - chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyDefault, stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{chunk}.subspan(0, read_size), + host_span{h_ticket.buffer}.subspan(0, read_size), + stream); // record the host-to-device copy. 
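  // (a later reuse of this ticket's pinned buffer waits on this event, so the
  // buffer is not overwritten while the copy is still in flight)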
CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); @@ -193,12 +197,10 @@ class host_span_data_chunk_reader : public data_chunk_reader { auto chunk = rmm::device_uvector(read_size, stream); // copy the host data to device - CUDF_CUDA_TRY(cudaMemcpyAsync( // - chunk.data(), - _data.data() + _position, - read_size, - cudaMemcpyDefault, - stream.value())); + cudf::detail::cuda_memcpy_async( + cudf::device_span{chunk}.subspan(0, read_size), + cudf::host_span{_data}.subspan(_position, read_size), + stream); _position += read_size; diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 19f2103071e..4e8908a8942 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -18,6 +18,7 @@ #include "getenv_or.hpp" #include +#include #include #include #include @@ -247,17 +248,18 @@ class device_buffer_source final : public datasource { size_t host_read(size_t offset, size_t size, uint8_t* dst) override { auto const count = std::min(size, this->size() - offset); - auto const stream = cudf::get_default_stream(); - CUDF_CUDA_TRY( - cudaMemcpyAsync(dst, _d_buffer.data() + offset, count, cudaMemcpyDefault, stream.value())); - stream.synchronize(); + auto const stream = cudf::detail::global_cuda_stream_pool().get_stream(); + cudf::detail::cuda_memcpy(host_span{dst, count}, + device_span{ + reinterpret_cast(_d_buffer.data() + offset), count}, + stream); return count; } std::unique_ptr host_read(size_t offset, size_t size) override { auto const count = std::min(size, this->size() - offset); - auto const stream = cudf::get_default_stream(); + auto const stream = cudf::detail::global_cuda_stream_pool().get_stream(); auto h_data = cudf::detail::make_host_vector_async( cudf::device_span{_d_buffer.data() + offset, count}, stream); stream.synchronize(); From e7653a70743a76ad3c8ca4b377aa0ec4303e5556 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Wed, 23 Oct 2024 13:23:04 -0500 Subject: [PATCH 106/117] Use managed memory for NDSH benchmarks (#17039) Fixes https://github.com/rapidsai/cudf/issues/16987 Use managed memory to generate the parquet data, and write parquet data to host buffer. 
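As a rough sketch of what "managed memory" means for the generator
(illustrative wiring; the benchmark's actual changes are in the
`utilities.cpp` hunks below), generation can run with
`rmm::mr::managed_memory_resource` installed as the current device resource,
so tables larger than the GPU's memory spill through Unified Memory, with the
previous resource restored afterwards:

    #include <rmm/mr/device/managed_memory_resource.hpp>
    #include <rmm/mr/device/per_device_resource.hpp>

    void generate_under_managed_memory()
    {
      rmm::mr::managed_memory_resource managed_mr;
      auto* previous = rmm::mr::set_current_device_resource(&managed_mr);
      // ... generate the NDSH tables and write the parquet output to a host buffer ...
      rmm::mr::set_current_device_resource(previous);
    }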
Replace use of parquet_device_buffer with cuio_source_sink_pair Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - David Wendt (https://github.com/davidwendt) - Tianyu Liu (https://github.com/kingcrimsontianyu) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/17039 --- cpp/benchmarks/CMakeLists.txt | 2 +- cpp/benchmarks/ndsh/q01.cpp | 8 +- cpp/benchmarks/ndsh/q05.cpp | 16 +-- cpp/benchmarks/ndsh/q06.cpp | 8 +- cpp/benchmarks/ndsh/q09.cpp | 17 ++-- cpp/benchmarks/ndsh/q10.cpp | 13 +-- cpp/benchmarks/ndsh/utilities.cpp | 157 +++++++++++++++++++++++------- cpp/benchmarks/ndsh/utilities.hpp | 19 ++-- 8 files changed, 161 insertions(+), 79 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index e61a8e6e1e6..f013b31b3de 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -49,7 +49,7 @@ target_compile_options( target_link_libraries( ndsh_data_generator - PUBLIC cudf GTest::gmock GTest::gtest cudf::cudftestutil nvtx3::nvtx3-cpp + PUBLIC cudf cudf::cudftestutil nvtx3::nvtx3-cpp PRIVATE $ ) diff --git a/cpp/benchmarks/ndsh/q01.cpp b/cpp/benchmarks/ndsh/q01.cpp index ef709926ae9..485e8e5497c 100644 --- a/cpp/benchmarks/ndsh/q01.cpp +++ b/cpp/benchmarks/ndsh/q01.cpp @@ -104,7 +104,7 @@ } void run_ndsh_q1(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Define the column projections and filter predicate for `lineitem` table std::vector const lineitem_cols = {"l_returnflag", @@ -124,8 +124,8 @@ void run_ndsh_q1(nvbench::state& state, cudf::ast::ast_operator::LESS_EQUAL, shipdate_ref, shipdate_upper_literal); // Read out the `lineitem` table from parquet file - auto lineitem = - read_parquet(sources["lineitem"].make_source_info(), lineitem_cols, std::move(lineitem_pred)); + auto lineitem = read_parquet( + sources.at("lineitem").make_source_info(), lineitem_cols, std::move(lineitem_pred)); // Calculate the discount price and charge columns and append to lineitem table auto disc_price = @@ -170,7 +170,7 @@ void ndsh_q1(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources(scale_factor, {"lineitem"}, sources); auto stream = cudf::get_default_stream(); diff --git a/cpp/benchmarks/ndsh/q05.cpp b/cpp/benchmarks/ndsh/q05.cpp index 522bc4789c2..1c2d657913e 100644 --- a/cpp/benchmarks/ndsh/q05.cpp +++ b/cpp/benchmarks/ndsh/q05.cpp @@ -89,7 +89,7 @@ } void run_ndsh_q5(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Define the column projection and filter predicate for the `orders` table std::vector const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"}; @@ -120,17 +120,17 @@ void run_ndsh_q5(nvbench::state& state, // Read out the tables from parquet files // while pushing down the column projections and filter predicates auto const customer = - read_parquet(sources["customer"].make_source_info(), {"c_custkey", "c_nationkey"}); + read_parquet(sources.at("customer").make_source_info(), {"c_custkey", "c_nationkey"}); auto const orders = - read_parquet(sources["orders"].make_source_info(), orders_cols, std::move(orders_pred)); - auto const lineitem = read_parquet(sources["lineitem"].make_source_info(), + read_parquet(sources.at("orders").make_source_info(), orders_cols, std::move(orders_pred)); + auto const lineitem = 
read_parquet(sources.at("lineitem").make_source_info(), {"l_orderkey", "l_suppkey", "l_extendedprice", "l_discount"}); auto const supplier = - read_parquet(sources["supplier"].make_source_info(), {"s_suppkey", "s_nationkey"}); + read_parquet(sources.at("supplier").make_source_info(), {"s_suppkey", "s_nationkey"}); auto const nation = - read_parquet(sources["nation"].make_source_info(), {"n_nationkey", "n_regionkey", "n_name"}); + read_parquet(sources.at("nation").make_source_info(), {"n_nationkey", "n_regionkey", "n_name"}); auto const region = - read_parquet(sources["region"].make_source_info(), region_cols, std::move(region_pred)); + read_parquet(sources.at("region").make_source_info(), region_cols, std::move(region_pred)); // Perform the joins auto const join_a = apply_inner_join(region, nation, {"r_regionkey"}, {"n_regionkey"}); @@ -165,7 +165,7 @@ void ndsh_q5(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources( scale_factor, {"customer", "orders", "lineitem", "supplier", "nation", "region"}, sources); diff --git a/cpp/benchmarks/ndsh/q06.cpp b/cpp/benchmarks/ndsh/q06.cpp index 04078547973..e1e56c3622e 100644 --- a/cpp/benchmarks/ndsh/q06.cpp +++ b/cpp/benchmarks/ndsh/q06.cpp @@ -64,7 +64,7 @@ } void run_ndsh_q6(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Read out the `lineitem` table from parquet file std::vector const lineitem_cols = { @@ -83,8 +83,8 @@ void run_ndsh_q6(nvbench::state& state, cudf::ast::operation(cudf::ast::ast_operator::LESS, shipdate_ref, shipdate_upper_literal); auto const lineitem_pred = std::make_unique( cudf::ast::ast_operator::LOGICAL_AND, shipdate_pred_a, shipdate_pred_b); - auto lineitem = - read_parquet(sources["lineitem"].make_source_info(), lineitem_cols, std::move(lineitem_pred)); + auto lineitem = read_parquet( + sources.at("lineitem").make_source_info(), lineitem_cols, std::move(lineitem_pred)); // Cast the discount and quantity columns to float32 and append to lineitem table auto discout_float = @@ -134,7 +134,7 @@ void ndsh_q6(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources(scale_factor, {"lineitem"}, sources); auto stream = cudf::get_default_stream(); diff --git a/cpp/benchmarks/ndsh/q09.cpp b/cpp/benchmarks/ndsh/q09.cpp index 59218ab8912..2e9a69d9ee2 100644 --- a/cpp/benchmarks/ndsh/q09.cpp +++ b/cpp/benchmarks/ndsh/q09.cpp @@ -112,20 +112,21 @@ } void run_ndsh_q9(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Read out the table from parquet files auto const lineitem = read_parquet( - sources["lineitem"].make_source_info(), + sources.at("lineitem").make_source_info(), {"l_suppkey", "l_partkey", "l_orderkey", "l_extendedprice", "l_discount", "l_quantity"}); - auto const nation = read_parquet(sources["nation"].make_source_info(), {"n_nationkey", "n_name"}); + auto const nation = + read_parquet(sources.at("nation").make_source_info(), {"n_nationkey", "n_name"}); auto const orders = - read_parquet(sources["orders"].make_source_info(), {"o_orderkey", "o_orderdate"}); - auto const part = read_parquet(sources["part"].make_source_info(), {"p_partkey", "p_name"}); - auto const partsupp = 
read_parquet(sources["partsupp"].make_source_info(), + read_parquet(sources.at("orders").make_source_info(), {"o_orderkey", "o_orderdate"}); + auto const part = read_parquet(sources.at("part").make_source_info(), {"p_partkey", "p_name"}); + auto const partsupp = read_parquet(sources.at("partsupp").make_source_info(), {"ps_suppkey", "ps_partkey", "ps_supplycost"}); auto const supplier = - read_parquet(sources["supplier"].make_source_info(), {"s_suppkey", "s_nationkey"}); + read_parquet(sources.at("supplier").make_source_info(), {"s_suppkey", "s_nationkey"}); // Generating the `profit` table // Filter the part table using `p_name like '%green%'` @@ -178,7 +179,7 @@ void ndsh_q9(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources( scale_factor, {"part", "supplier", "lineitem", "partsupp", "orders", "nation"}, sources); diff --git a/cpp/benchmarks/ndsh/q10.cpp b/cpp/benchmarks/ndsh/q10.cpp index a520480020a..72edd15083d 100644 --- a/cpp/benchmarks/ndsh/q10.cpp +++ b/cpp/benchmarks/ndsh/q10.cpp @@ -94,7 +94,7 @@ } void run_ndsh_q10(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Define the column projection and filter predicate for the `orders` table std::vector const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"}; @@ -122,15 +122,16 @@ void run_ndsh_q10(nvbench::state& state, // Read out the tables from parquet files // while pushing down the column projections and filter predicates auto const customer = read_parquet( - sources["customer"].make_source_info(), + sources.at("customer").make_source_info(), {"c_custkey", "c_name", "c_nationkey", "c_acctbal", "c_address", "c_phone", "c_comment"}); auto const orders = - read_parquet(sources["orders"].make_source_info(), orders_cols, std::move(orders_pred)); + read_parquet(sources.at("orders").make_source_info(), orders_cols, std::move(orders_pred)); auto const lineitem = - read_parquet(sources["lineitem"].make_source_info(), + read_parquet(sources.at("lineitem").make_source_info(), {"l_extendedprice", "l_discount", "l_orderkey", "l_returnflag"}, std::move(lineitem_pred)); - auto const nation = read_parquet(sources["nation"].make_source_info(), {"n_name", "n_nationkey"}); + auto const nation = + read_parquet(sources.at("nation").make_source_info(), {"n_name", "n_nationkey"}); // Perform the joins auto const join_a = apply_inner_join(customer, nation, {"c_nationkey"}, {"n_nationkey"}); @@ -163,7 +164,7 @@ void ndsh_q10(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources( scale_factor, {"customer", "orders", "lineitem", "nation"}, sources); diff --git a/cpp/benchmarks/ndsh/utilities.cpp b/cpp/benchmarks/ndsh/utilities.cpp index 62116ddf661..9f9849860c9 100644 --- a/cpp/benchmarks/ndsh/utilities.cpp +++ b/cpp/benchmarks/ndsh/utilities.cpp @@ -17,6 +17,8 @@ #include "utilities.hpp" #include "common/ndsh_data_generator/ndsh_data_generator.hpp" +#include "common/table_utilities.hpp" +#include "cudf/detail/utilities/integer_utils.hpp" #include #include @@ -30,8 +32,15 @@ #include #include +#include +#include +#include + +#include #include #include +#include +#include namespace { @@ -85,6 +94,15 @@ std::vector const NATION_SCHEMA = { "n_nationkey", 
"n_name", "n_regionkey", "n_comment"}; std::vector const REGION_SCHEMA = {"r_regionkey", "r_name", "r_comment"}; +std::unordered_map const> const SCHEMAS = { + {"orders", ORDERS_SCHEMA}, + {"lineitem", LINEITEM_SCHEMA}, + {"part", PART_SCHEMA}, + {"partsupp", PARTSUPP_SCHEMA}, + {"supplier", SUPPLIER_SCHEMA}, + {"customer", CUSTOMER_SCHEMA}, + {"nation", NATION_SCHEMA}, + {"region", REGION_SCHEMA}}; } // namespace cudf::table_view table_with_names::table() const { return tbl->view(); } @@ -337,7 +355,7 @@ int32_t days_since_epoch(int year, int month, int day) void write_to_parquet_device_buffer(std::unique_ptr const& table, std::vector const& col_names, - parquet_device_buffer& source) + cuio_source_sink_pair& source) { CUDF_FUNC_RANGE(); auto const stream = cudf::get_default_stream(); @@ -351,55 +369,124 @@ void write_to_parquet_device_buffer(std::unique_ptr const& table, metadata.schema_info = col_name_infos; auto const table_input_metadata = cudf::io::table_input_metadata{metadata}; - // Declare a host and device buffer - std::vector h_buffer; - + auto est_size = static_cast(estimate_size(table->view())); + constexpr auto PQ_MAX_TABLE_BYTES = 8ul << 30; // 8GB + // TODO: best to get this limit from percent_of_free_device_memory(50) of device memory resource. + if (est_size > PQ_MAX_TABLE_BYTES) { + auto builder = cudf::io::chunked_parquet_writer_options::builder(source.make_sink_info()); + builder.metadata(table_input_metadata); + auto const options = builder.build(); + auto num_splits = static_cast( + std::ceil(static_cast(est_size) / (PQ_MAX_TABLE_BYTES))); + std::vector splits(num_splits - 1); + auto num_rows = table->num_rows(); + auto num_row_per_chunk = cudf::util::div_rounding_up_safe(num_rows, num_splits); + std::generate_n(splits.begin(), splits.size(), [num_row_per_chunk, i = 0]() mutable { + return (i += num_row_per_chunk); + }); + std::vector split_tables = cudf::split(table->view(), splits, stream); + auto writer = cudf::io::parquet_chunked_writer(options, stream); + for (auto const& chunk_table : split_tables) { + writer.write(chunk_table); + } + writer.close(); + return; + } // Write parquet data to host buffer - auto builder = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&h_buffer), table->view()); + auto builder = cudf::io::parquet_writer_options::builder(source.make_sink_info(), table->view()); builder.metadata(table_input_metadata); auto const options = builder.build(); - cudf::io::write_parquet(options); + cudf::io::write_parquet(options, stream); +} - // Copy host buffer to device buffer - source.d_buffer.resize(h_buffer.size(), stream); - CUDF_CUDA_TRY(cudaMemcpyAsync( - source.d_buffer.data(), h_buffer.data(), h_buffer.size(), cudaMemcpyDefault, stream.value())); +inline auto make_managed_pool() +{ + return rmm::mr::make_owning_wrapper( + std::make_shared(), rmm::percent_of_free_device_memory(50)); } void generate_parquet_data_sources(double scale_factor, std::vector const& table_names, - std::unordered_map& sources) + std::unordered_map& sources) { CUDF_FUNC_RANGE(); - std::for_each(table_names.begin(), table_names.end(), [&](auto const& table_name) { - sources[table_name] = parquet_device_buffer(); - }); - auto [orders, lineitem, part] = cudf::datagen::generate_orders_lineitem_part( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + // Set the memory resource to the managed pool + auto old_mr = cudf::get_current_device_resource(); + // if already managed pool or managed, don't create new one. 
+ using managed_pool_mr_t = decltype(make_managed_pool()); + managed_pool_mr_t managed_pool_mr; + bool const is_managed = + dynamic_cast*>(old_mr) or + dynamic_cast(old_mr); + if (!is_managed) { + std::cout << "Creating managed pool just for data generation\n"; + managed_pool_mr = make_managed_pool(); + cudf::set_current_device_resource(managed_pool_mr.get()); + // drawback: if already pool takes 50% of free memory, we are left with 50% of 50% of free + // memory. + } - auto partsupp = cudf::datagen::generate_partsupp( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + std::unordered_set const requested_table_names = [&table_names]() { + if (table_names.empty()) { + return std::unordered_set{ + "orders", "lineitem", "part", "partsupp", "supplier", "customer", "nation", "region"}; + } + return std::unordered_set(table_names.begin(), table_names.end()); + }(); + std::for_each( + requested_table_names.begin(), requested_table_names.end(), [&](auto const& table_name) { + sources.emplace(table_name, cuio_source_sink_pair(io_type::HOST_BUFFER)); + }); + std::unordered_map> tables; + + if (sources.count("orders") or sources.count("lineitem") or sources.count("part")) { + auto [orders, lineitem, part] = cudf::datagen::generate_orders_lineitem_part( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + if (sources.count("orders")) { + write_to_parquet_device_buffer(orders, SCHEMAS.at("orders"), sources.at("orders")); + orders = {}; + } + if (sources.count("part")) { + write_to_parquet_device_buffer(part, SCHEMAS.at("part"), sources.at("part")); + part = {}; + } + if (sources.count("lineitem")) { + write_to_parquet_device_buffer(lineitem, SCHEMAS.at("lineitem"), sources.at("lineitem")); + lineitem = {}; + } + } + + if (sources.count("partsupp")) { + auto partsupp = cudf::datagen::generate_partsupp( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(partsupp, SCHEMAS.at("partsupp"), sources.at("partsupp")); + } - auto supplier = cudf::datagen::generate_supplier( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + if (sources.count("supplier")) { + auto supplier = cudf::datagen::generate_supplier( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(supplier, SCHEMAS.at("supplier"), sources.at("supplier")); + } - auto customer = cudf::datagen::generate_customer( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + if (sources.count("customer")) { + auto customer = cudf::datagen::generate_customer( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(customer, SCHEMAS.at("customer"), sources.at("customer")); + } - auto nation = cudf::datagen::generate_nation(cudf::get_default_stream(), - cudf::get_current_device_resource_ref()); + if (sources.count("nation")) { + auto nation = cudf::datagen::generate_nation(cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(nation, SCHEMAS.at("nation"), sources.at("nation")); + } - auto region = cudf::datagen::generate_region(cudf::get_default_stream(), - cudf::get_current_device_resource_ref()); + if (sources.count("region")) { + auto region = cudf::datagen::generate_region(cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + 
write_to_parquet_device_buffer(region, SCHEMAS.at("region"), sources.at("region")); + } - write_to_parquet_device_buffer(std::move(orders), ORDERS_SCHEMA, sources["orders"]); - write_to_parquet_device_buffer(std::move(lineitem), LINEITEM_SCHEMA, sources["lineitem"]); - write_to_parquet_device_buffer(std::move(part), PART_SCHEMA, sources["part"]); - write_to_parquet_device_buffer(std::move(partsupp), PARTSUPP_SCHEMA, sources["partsupp"]); - write_to_parquet_device_buffer(std::move(customer), CUSTOMER_SCHEMA, sources["customer"]); - write_to_parquet_device_buffer(std::move(supplier), SUPPLIER_SCHEMA, sources["supplier"]); - write_to_parquet_device_buffer(std::move(nation), NATION_SCHEMA, sources["nation"]); - write_to_parquet_device_buffer(std::move(region), REGION_SCHEMA, sources["region"]); + // Restore the original memory resource + if (!is_managed) { cudf::set_current_device_resource(old_mr); } } diff --git a/cpp/benchmarks/ndsh/utilities.hpp b/cpp/benchmarks/ndsh/utilities.hpp index 762e43deccf..cae07f86a98 100644 --- a/cpp/benchmarks/ndsh/utilities.hpp +++ b/cpp/benchmarks/ndsh/utilities.hpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "io/cuio_common.hpp" + #include #include #include @@ -196,24 +198,15 @@ std::tm make_tm(int year, int month, int day); int32_t days_since_epoch(int year, int month, int day); /** - * @brief Struct representing a parquet device buffer - */ -struct parquet_device_buffer { - parquet_device_buffer() : d_buffer{0, cudf::get_default_stream()} {}; - cudf::io::source_info make_source_info() { return cudf::io::source_info(d_buffer); } - rmm::device_uvector d_buffer; -}; - -/** - * @brief Write a `cudf::table` to a parquet device buffer + * @brief Write a `cudf::table` to a parquet cuio sink * * @param table The `cudf::table` to write * @param col_names The column names of the table - * @param parquet_device_buffer The parquet device buffer to write the table to + * @param source The source sink pair to write the table to */ void write_to_parquet_device_buffer(std::unique_ptr const& table, std::vector const& col_names, - parquet_device_buffer& source); + cuio_source_sink_pair& source); /** * @brief Generate NDS-H tables and write to parquet device buffers @@ -224,4 +217,4 @@ void write_to_parquet_device_buffer(std::unique_ptr const& table, */ void generate_parquet_data_sources(double scale_factor, std::vector const& table_names, - std::unordered_map& sources); + std::unordered_map& sources); From 02879724b3073675ff3384071ef9227427f57b41 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 23 Oct 2024 19:16:59 -0400 Subject: [PATCH 107/117] Use the full ref name of `rmm.DeviceBuffer` in the sphinx config file (#17150) This is an improvement PR that uses the full name of `rmm.DeviceBuffer` in the sphinx config file. Its a follow-up to this [comment](https://github.com/rapidsai/cudf/pull/16913#discussion_r1792283249). 
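For context, each entry being edited pairs the short name that appears in docstrings with its fully qualified import path and the name Sphinx should display. A minimal sketch of the shape, where the dict name `_aliases` is hypothetical but the entry matches the diff below:

```python
# conf.py sketch: short docstring name -> (real import path, displayed name).
# "_aliases" is a hypothetical name for the mapping shown in the diff.
_aliases = {
    "DeviceBuffer": ("rmm.pylibrmm.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"),
}
```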
Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17150 --- docs/cudf/source/conf.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index ecf619ddc44..5942cc16850 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -342,10 +342,7 @@ def clean_all_xml_files(path): "cudf.Series": ("cudf.core.series.Series", "cudf.Series"), "cudf.Index": ("cudf.core.index.Index", "cudf.Index"), "cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"), - # TODO: Replace the first entry in a follow-up with rmm.pylibrmm.device_buffer.DeviceBuffer - # when the RMM objects inventory is generated from branch-24.12. The RMM objects inventory - # can be accessed here : https://docs.rapids.ai/api/rmm/nightly/objects.inv - "DeviceBuffer": ("rmm.DeviceBuffer", "rmm.DeviceBuffer"), + "DeviceBuffer": ("rmm.pylibrmm.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"), } From d7cdf44da2ba921c6fa63feff8749d141643f76e Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 23 Oct 2024 20:19:57 -0400 Subject: [PATCH 108/117] Migrate NVText Stemming APIs to pylibcudf (#17085) Apart of #15162 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17085 --- cpp/include/nvtext/stemmer.hpp | 8 +- .../api_docs/pylibcudf/nvtext/index.rst | 1 + .../api_docs/pylibcudf/nvtext/stemmer.rst | 6 ++ python/cudf/cudf/_lib/nvtext/stemmer.pyx | 56 +++++--------- .../pylibcudf/libcudf/nvtext/stemmer.pxd | 7 +- .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt | 2 +- .../pylibcudf/pylibcudf/nvtext/__init__.pxd | 2 + python/pylibcudf/pylibcudf/nvtext/__init__.py | 2 + python/pylibcudf/pylibcudf/nvtext/stemmer.pxd | 14 ++++ python/pylibcudf/pylibcudf/nvtext/stemmer.pyx | 76 +++++++++++++++++++ .../pylibcudf/tests/test_nvtext_stemmer.py | 47 ++++++++++++ 11 files changed, 178 insertions(+), 43 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst create mode 100644 python/pylibcudf/pylibcudf/nvtext/stemmer.pxd create mode 100644 python/pylibcudf/pylibcudf/nvtext/stemmer.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py diff --git a/cpp/include/nvtext/stemmer.hpp b/cpp/include/nvtext/stemmer.hpp index 55a4124bfd0..e5b2a4cc21b 100644 --- a/cpp/include/nvtext/stemmer.hpp +++ b/cpp/include/nvtext/stemmer.hpp @@ -51,7 +51,7 @@ enum class letter_type { * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * b1 = is_letter(st, VOWEL, 1) * b1 is now [false, true, true] * @endcode @@ -62,7 +62,7 @@ enum class letter_type { * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * b2 = is_letter(st, CONSONANT, -1) // last letter checked in each string * b2 is now [false, true, false] * @endcode @@ -99,7 +99,7 @@ std::unique_ptr is_letter( * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * ix = [3, 1, 4] * b1 = is_letter(st, VOWEL, ix) * b1 is now [true, true, false] @@ -111,7 +111,7 @@ std::unique_ptr is_letter( * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * ix = [3, -2, 4] // 2nd to last character in st[1] is checked * b2 = is_letter(st, 
CONSONANT, ix) * b2 is now [false, false, true] diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst index c5b9533597a..e0735a197fd 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -11,3 +11,4 @@ nvtext ngrams_tokenize normalize replace + stemmer diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst new file mode 100644 index 00000000000..b407ff8451a --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst @@ -0,0 +1,6 @@ +======= +stemmer +======= + +.. automodule:: pylibcudf.nvtext.stemmer + :members: diff --git a/python/cudf/cudf/_lib/nvtext/stemmer.pyx b/python/cudf/cudf/_lib/nvtext/stemmer.pyx index 5bf25562fed..63a389b64d5 100644 --- a/python/cudf/cudf/_lib/nvtext/stemmer.pyx +++ b/python/cudf/cudf/_lib/nvtext/stemmer.pyx @@ -1,24 +1,19 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cudf.core.buffer import acquire_spill_lock - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from enum import IntEnum -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view +from cudf.core.buffer import acquire_spill_lock + from pylibcudf.libcudf.nvtext.stemmer cimport ( - is_letter as cpp_is_letter, letter_type, - porter_stemmer_measure as cpp_porter_stemmer_measure, underlying_type_t_letter_type, ) from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column +from pylibcudf import nvtext + class LetterType(IntEnum): CONSONANT = letter_type.CONSONANT @@ -27,43 +22,34 @@ class LetterType(IntEnum): @acquire_spill_lock() def porter_stemmer_measure(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_porter_stemmer_measure(c_strings)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + nvtext.stemmer.porter_stemmer_measure( + strings.to_pylibcudf(mode="read"), + ) + ) @acquire_spill_lock() def is_letter(Column strings, object ltype, size_type index): - cdef column_view c_strings = strings.view() - cdef letter_type c_ltype = ( - ltype + return Column.from_pylibcudf( + nvtext.stemmer.is_letter( + strings.to_pylibcudf(mode="read"), + ltype==LetterType.VOWEL, + index, + ) ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_is_letter(c_strings, c_ltype, index)) - - return Column.from_unique_ptr(move(c_result)) @acquire_spill_lock() def is_letter_multi(Column strings, object ltype, Column indices): - cdef column_view c_strings = strings.view() - cdef column_view c_indices = indices.view() - cdef letter_type c_ltype = ( - ltype + return Column.from_pylibcudf( + nvtext.stemmer.is_letter( + strings.to_pylibcudf(mode="read"), + ltype==LetterType.VOWEL, + indices.to_pylibcudf(mode="read"), + ) ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_is_letter(c_strings, c_ltype, c_indices)) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd index 673bffa28ae..be3a2d75718 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd @@ -1,6 +1,7 @@ # 
Copyright (c) 2020-2024, NVIDIA CORPORATION. from libc.stdint cimport int32_t +from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -8,9 +9,9 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "nvtext/stemmer.hpp" namespace "nvtext" nogil: - ctypedef enum letter_type: - CONSONANT 'nvtext::letter_type::CONSONANT' - VOWEL 'nvtext::letter_type::VOWEL' + cpdef enum class letter_type: + CONSONANT + VOWEL cdef unique_ptr[column] porter_stemmer_measure( const column_view & strings diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index 7a94490998a..d97c0a73267 100644 --- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx - ngrams_tokenize.pyx normalize.pyx replace.pyx + ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd index 5a5e665d309..a658e57018e 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -8,6 +8,7 @@ from . cimport ( ngrams_tokenize, normalize, replace, + stemmer, ) __all__ = [ @@ -18,4 +19,5 @@ __all__ = [ "ngrams_tokenize", "normalize", "replace", + "stemmer", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index 77187f0845d..2c1feb089a2 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -8,6 +8,7 @@ ngrams_tokenize, normalize, replace, + stemmer, ) __all__ = [ @@ -18,4 +19,5 @@ "ngrams_tokenize", "normalize", "replace", + "stemmer", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd b/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd new file mode 100644 index 00000000000..48762efc01f --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from pylibcudf.column cimport Column +from pylibcudf.libcudf.nvtext.stemmer cimport letter_type +from pylibcudf.libcudf.types cimport size_type + +ctypedef fused ColumnOrSize: + Column + size_type + +cpdef Column is_letter(Column input, bool check_vowels, ColumnOrSize indices) + +cpdef Column porter_stemmer_measure(Column input) diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx new file mode 100644 index 00000000000..854d1053624 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx @@ -0,0 +1,76 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.stemmer cimport ( + is_letter as cpp_is_letter, + letter_type, + porter_stemmer_measure as cpp_porter_stemmer_measure, +) +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column is_letter( + Column input, + bool check_vowels, + ColumnOrSize indices +): + """ + Returns boolean column indicating if the character + or characters at the provided character index or + indices (respectively) are consonants or vowels + + For details, see :cpp:func:`is_letter` + + Parameters + ---------- + input : Column + Input strings + check_vowels : bool + If true, the check is for vowels. Otherwise the check is + for consonants. + indices : Union[Column, size_type] + The character position(s) to check in each string + + Returns + ------- + Column + New boolean column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_is_letter( + input.view(), + letter_type.VOWEL if check_vowels else letter_type.CONSONANT, + indices if ColumnOrSize is size_type else indices.view() + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column porter_stemmer_measure(Column input): + """ + Returns the Porter Stemmer measurements of a strings column. + + For details, see :cpp:func:`porter_stemmer_measure` + + Parameters + ---------- + input : Column + Strings column of words to measure + + Returns + ------- + Column + New column of measure values + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_porter_stemmer_measure(input.view()) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py new file mode 100644 index 00000000000..75d56f587a4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py @@ -0,0 +1,47 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["trouble", "toy", "syzygy"] + return pa.array(arr) + + +@pytest.mark.parametrize("check_vowels", [True, False]) +@pytest.mark.parametrize("indices", [[3, 1, 4], 1]) +def test_is_letter(input_col, check_vowels, indices): + def is_letter(s, i, check): + vowels = "aeiouy" + return (s[i] in vowels) == check + + result = plc.nvtext.stemmer.is_letter( + plc.interop.from_arrow(input_col), + check_vowels, + plc.interop.from_arrow(pa.array(indices)) + if isinstance(indices, list) + else indices, + ) + expected = pa.array( + [ + is_letter( + s, + indices[i] if isinstance(indices, list) else indices, + check_vowels, + ) + for i, s in enumerate(input_col.to_pylist()) + ] + ) + assert_column_eq(result, expected) + + +def test_porter_stemmer_measure(input_col): + result = plc.nvtext.stemmer.porter_stemmer_measure( + plc.interop.from_arrow(input_col), + ) + expected = pa.array([1, 1, 2], type=pa.int32()) + assert_column_eq(result, expected) From 3a623149827ec347e721dd1a18072f18b0b4bcc1 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 24 Oct 2024 14:10:33 +0100 Subject: [PATCH 109/117] Upgrade to polars 1.11 in cudf-polars (#17154) Polars 1.11 is out, with slight updates to the IR, so we can correctly raise for dynamic groupbys and see inequality joins. 
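As a sketch (the frame below is made up), a dynamic group-by is now detected during translation and rejected explicitly:

```python
from datetime import datetime

import polars as pl

df = pl.LazyFrame(
    {"ts": [datetime(2024, 1, 1, h) for h in range(4)], "x": [1, 2, 3, 4]}
).sort("ts")
q = df.group_by_dynamic("ts", every="2h").agg(pl.col("x").sum())
# Translation raises NotImplementedError("dynamic group by"); without
# raise_on_fail=True the query would fall back to the CPU engine instead.
q.collect(engine=pl.GPUEngine(raise_on_fail=True))
```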
These changes adapt to that and do a first pass at supporting inequality joins (by translating to cross + filter). A followup (#17000) will use libcudf's conditional joins. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Bradley Dice (https://github.com/bdice) - Mike Sarahan (https://github.com/msarahan) URL: https://github.com/rapidsai/cudf/pull/17154 --- .../all_cuda-118_arch-x86_64.yaml | 2 +- .../all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf-polars/meta.yaml | 2 +- dependencies.yaml | 2 +- python/cudf_polars/cudf_polars/dsl/ir.py | 17 ++--- .../cudf_polars/cudf_polars/dsl/translate.py | 76 ++++++++++++++++++- .../cudf_polars/cudf_polars/testing/plugin.py | 38 ++++++++-- python/cudf_polars/pyproject.toml | 2 +- python/cudf_polars/tests/test_join.py | 60 ++++++++++++++- 9 files changed, 172 insertions(+), 29 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index bd5e6c3d569..c3716c4759a 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -65,7 +65,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.8,<1.9 +- polars>=1.11,<1.12 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<18.0.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 565a3ebfa3c..38e131e79cb 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -63,7 +63,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.8,<1.9 +- polars>=1.11,<1.12 - pre-commit - pyarrow>=14.0.0,<18.0.0a0 - pydata-sphinx-theme!=0.14.2 diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml index e8fef715c60..edf92b930d9 100644 --- a/conda/recipes/cudf-polars/meta.yaml +++ b/conda/recipes/cudf-polars/meta.yaml @@ -43,7 +43,7 @@ requirements: run: - python - pylibcudf ={{ version }} - - polars >=1.8,<1.9 + - polars >=1.11,<1.12 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/dependencies.yaml b/dependencies.yaml index ff97b67f0ce..4804f7b00b0 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -727,7 +727,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.8,<1.9 + - polars>=1.11,<1.12 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index eb93929cf61..f79e229d3f3 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -666,11 +666,11 @@ def __init__( raise NotImplementedError( "rolling window/groupby" ) # pragma: no cover; rollingwindow constructor has already raised + if self.options.dynamic: + raise NotImplementedError("dynamic group by") if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests): raise NotImplementedError("Nested aggregations in groupby") self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] - if len(self.keys) == 0: - raise NotImplementedError("dynamic groupby") @staticmethod def check_agg(agg: expr.Expr) -> int: @@ -802,10 +802,10 @@ class Join(IR): right_on: tuple[expr.NamedExpr, ...] 
"""List of expressions used as keys in the right frame.""" options: tuple[ - Literal["inner", "left", "right", "full", "leftsemi", "leftanti", "cross"], + Literal["inner", "left", "right", "full", "semi", "anti", "cross"], bool, tuple[int, int] | None, - str | None, + str, bool, ] """ @@ -840,7 +840,7 @@ def __init__( @staticmethod @cache def _joiners( - how: Literal["inner", "left", "right", "full", "leftsemi", "leftanti"], + how: Literal["inner", "left", "right", "full", "semi", "anti"], ) -> tuple[ Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None ]: @@ -862,13 +862,13 @@ def _joiners( plc.copying.OutOfBoundsPolicy.NULLIFY, plc.copying.OutOfBoundsPolicy.NULLIFY, ) - elif how == "leftsemi": + elif how == "semi": return ( plc.join.left_semi_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, None, ) - elif how == "leftanti": + elif how == "anti": return ( plc.join.left_anti_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, @@ -933,7 +933,6 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" left, right = (c.evaluate(cache=cache) for c in self.children) how, join_nulls, zlice, suffix, coalesce = self.options - suffix = "_right" if suffix is None else suffix if how == "cross": # Separate implementation, since cross_join returns the # result, not the gather maps @@ -955,7 +954,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: columns[left.num_columns :], right.column_names, strict=True ) ] - return DataFrame([*left_cols, *right_cols]) + return DataFrame([*left_cols, *right_cols]).slice(zlice) # TODO: Waiting on clarity based on https://github.com/pola-rs/polars/issues/17184 left_on = DataFrame(broadcast(*(e.evaluate(left) for e in self.left_on))) right_on = DataFrame(broadcast(*(e.evaluate(right) for e in self.right_on))) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 522c4a6729c..c28f2c2651a 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -5,10 +5,11 @@ from __future__ import annotations +import functools import json from contextlib import AbstractContextManager, nullcontext from functools import singledispatch -from typing import Any +from typing import TYPE_CHECKING, Any import pyarrow as pa import pylibcudf as plc @@ -19,9 +20,13 @@ from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir from cudf_polars.dsl import expr, ir +from cudf_polars.dsl.traversal import make_recursive, reuse_if_unchanged from cudf_polars.typing import NodeTraverser from cudf_polars.utils import dtypes, sorting +if TYPE_CHECKING: + from cudf_polars.typing import ExprTransformer + __all__ = ["translate_ir", "translate_named_expr"] @@ -182,7 +187,71 @@ def _( with set_node(visitor, node.input_right): inp_right = translate_ir(visitor, n=None) right_on = [translate_named_expr(visitor, n=e) for e in node.right_on] - return ir.Join(schema, left_on, right_on, node.options, inp_left, inp_right) + if (how := node.options[0]) in { + "inner", + "left", + "right", + "full", + "cross", + "semi", + "anti", + }: + return ir.Join(schema, left_on, right_on, node.options, inp_left, inp_right) + else: + how, op1, op2 = how + if how != "ie_join": + raise NotImplementedError( + f"Unsupported join type {how}" + ) # pragma: no cover; asof joins not yet exposed + # No exposure of mixed/conditional joins in pylibcudf yet, so in + # the first instance, implement by doing a 
cross join followed by + # a filter. + _, join_nulls, zlice, suffix, coalesce = node.options + cross = ir.Join( + schema, + [], + [], + ("cross", join_nulls, None, suffix, coalesce), + inp_left, + inp_right, + ) + dtype = plc.DataType(plc.TypeId.BOOL8) + if op2 is None: + ops = [op1] + else: + ops = [op1, op2] + suffix = cross.options[3] + + # Column references in the right table refer to the post-join + # names, so with suffixes. + def _rename(e: expr.Expr, rec: ExprTransformer) -> expr.Expr: + if isinstance(e, expr.Col) and e.name in inp_left.schema: + return type(e)(e.dtype, f"{e.name}{suffix}") + return reuse_if_unchanged(e, rec) + + mapper = make_recursive(_rename) + right_on = [ + expr.NamedExpr( + f"{old.name}{suffix}" if old.name in inp_left.schema else old.name, new + ) + for new, old in zip( + (mapper(e.value) for e in right_on), right_on, strict=True + ) + ] + mask = functools.reduce( + functools.partial( + expr.BinOp, dtype, plc.binaryop.BinaryOperator.LOGICAL_AND + ), + ( + expr.BinOp(dtype, expr.BinOp._MAPPING[op], left.value, right.value) + for op, left, right in zip(ops, left_on, right_on, strict=True) + ), + ) + filtered = ir.Filter(schema, expr.NamedExpr("mask", mask), cross) + if zlice is not None: + offset, length = zlice + return ir.Slice(schema, offset, length, filtered) + return filtered @_translate_ir.register @@ -319,8 +388,7 @@ def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR: # IR is versioned with major.minor, minor is bumped for backwards # compatible changes (e.g. adding new nodes), major is bumped for # incompatible changes (e.g. renaming nodes). - # Polars 1.7 changes definition of the CSV reader options schema name. - if (version := visitor.version()) >= (3, 0): + if (version := visitor.version()) >= (4, 0): raise NotImplementedError( f"No support for polars IR {version=}" ) # pragma: no cover; no such version for now. 
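The net effect of the translation above, illustrated at the polars level with toy frames (comparing results the same way the tests below do): an inequality join becomes a cross join whose filter references the suffixed right-hand columns:

```python
import polars as pl
from polars.testing import assert_frame_equal

left = pl.LazyFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
right = pl.LazyFrame({"a": [2, 3], "c": [7, 8]})

ie = left.join_where(right, pl.col("a") < pl.col("a_right"))
# Equivalent plan: cross join, then filter. The colliding right-hand
# column "a" picks up the default "_right" suffix, which is why the
# translation rewrites right-table column references before filtering.
cross = left.join(right, how="cross").filter(pl.col("a") < pl.col("a_right"))

assert_frame_equal(ie.collect(), cross.collect(), check_row_order=False)
```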
diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 05b76d76808..a3607159e01 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -53,12 +53,34 @@ def pytest_configure(config: pytest.Config): "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match", "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match", "tests/unit/io/test_lazy_parquet.py::test_parquet_different_schema[False]": "Needs cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-columns]": "Correctly raises but different error", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-row_groups]": "Correctly raises but different error", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-prefiltered]": "Correctly raises but different error", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-none]": "Correctly raises but different error", "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_mismatch_panic_17067[False]": "Needs cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[False-False]": "Needs some variant of cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[True-False]": "Needs some variant of cudf#16394", "tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?", "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394", "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_dtype_mismatch[False]": "Different exception raised, but correctly raises an exception", "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_missing_cols_from_first[False]": "Different exception raised, but correctly raises an exception", "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-columns]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-columns]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-none]": "Mismatching column read cudf#16394", + 
"tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-columns]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-columns]": "Mismatching column read cudf#16394", "tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match", "tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match", "tests/unit/io/test_scan.py::test_scan_with_filter[single-csv-async]": "Debug output on stderr doesn't match", @@ -107,6 +129,14 @@ def pytest_configure(config: pytest.Config): "tests/unit/operations/aggregation/test_aggregations.py::test_sum_empty_and_null_set": "libcudf sums column of all nulls to null, not zero", "tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list", "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func0-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func1-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func2-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func3-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func0-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func1-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func2-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func3-none]": "cudf-polars doesn't nullify division by zero", "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype", @@ -124,13 +154,6 @@ def pytest_configure(config: pytest.Config): 
"tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg", "tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg", "tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_by_monday_and_offset_5444": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[left-expected0]": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[right-expected1]": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[datapoint-expected2]": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_rolling_dynamic_sortedness_check": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_validation": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_15225": "IR needs to expose groupby-dynamic information", "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins", "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU", "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU", @@ -140,6 +163,7 @@ def pytest_configure(config: pytest.Config): "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics": "Debug output on stderr doesn't match", "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match", "tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852", + "tests/unit/test_errors.py::test_error_on_empty_group_by": "Incorrect exception raised", # Maybe flaky, order-dependent? "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index a8bb634732f..2afdab1be4b 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.8,<1.9", + "polars>=1.11,<1.12", "pylibcudf==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 7d9ec98db97..501560d15b8 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -2,9 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +from contextlib import nullcontext + import pytest import polars as pl +from polars.testing import assert_frame_equal from cudf_polars.testing.asserts import ( assert_gpu_result_equal, @@ -22,6 +25,11 @@ def how(request): return request.param +@pytest.fixture(params=[None, (1, 5), (1, None), (0, 2), (0, None)]) +def zlice(request): + return request.param + + @pytest.fixture def left(): return pl.LazyFrame( @@ -37,8 +45,9 @@ def left(): def right(): return pl.LazyFrame( { - "a": [1, 4, 3, 7, None, None], - "c": [2, 3, 4, 5, 6, 7], + "a": [1, 4, 3, 7, None, None, 1], + "c": [2, 3, 4, 5, 6, 7, 8], + "d": [6, None, 7, 8, -1, 2, 4], } ) @@ -70,11 +79,31 @@ def test_coalesce_join(left, right, how, join_nulls, join_expr): query = left.join( right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=True ) - assert_gpu_result_equal(query, check_row_order=False) + assert_gpu_result_equal(query, check_row_order=how == "left") -def test_cross_join(left, right): +def test_left_join_with_slice(left, right, join_nulls, zlice): + q = left.join(right, on="a", how="left", join_nulls=join_nulls, coalesce=True) + ctx = nullcontext() + if zlice is not None: + q_expect = q.collect().slice(*zlice) + q = q.slice(*zlice) + if zlice == (1, 5) or zlice == (0, 2): + # https://github.com/pola-rs/polars/issues/19403 + # https://github.com/pola-rs/polars/issues/19405 + ctx = pytest.raises(AssertionError) + assert_frame_equal( + q_expect, q.collect(engine=pl.GPUEngine(raise_on_fail=True)) + ) + + with ctx: + assert_gpu_result_equal(q) + + +def test_cross_join(left, right, zlice): q = left.join(right, how="cross") + if zlice is not None: + q = q.slice(*zlice) assert_gpu_result_equal(q) @@ -86,3 +115,26 @@ def test_join_literal_key_unsupported(left, right, left_on, right_on): q = left.join(right, left_on=left_on, right_on=right_on, how="inner") assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize( + "conditions", + [ + [pl.col("a") < pl.col("a_right")], + [pl.col("a_right") <= pl.col("a") * 2], + [pl.col("b") * 2 > pl.col("a_right"), pl.col("a") == pl.col("c_right")], + [pl.col("b") * 2 <= pl.col("a_right"), pl.col("a") < pl.col("c_right")], + [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2], + ], +) +def test_join_where(left, right, conditions, zlice): + q = left.join_where(right, *conditions) + + assert_gpu_result_equal(q, check_row_order=False) + + if zlice is not None: + q_len = q.slice(*zlice).select(pl.len()) + # Can't compare result, since row order is not guaranteed and + # therefore we only check the length + + assert_gpu_result_equal(q_len) From b75036b12a8d5713e34162571cec24ac91941b85 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 24 Oct 2024 15:34:15 -0400 Subject: [PATCH 110/117] Remove unused variable in internal merge_tdigests utility (#17151) Removes unused variable that contains host copy of the group_offsets data. 
This host variable appears to have been made obsolete by a combination of #16897 and #16780 Found while working on #17149 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/17151 --- .../quantiles/tdigest/tdigest_aggregation.cu | 40 +++++-------------- 1 file changed, 9 insertions(+), 31 deletions(-) diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index b0a84a6d50c..d27420658d6 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -1126,12 +1126,8 @@ std::pair, rmm::device_uvector> generate_mer * `max` of 0. * * @param tdv input tdigests. The tdigests within this column are grouped by key. - * @param h_group_offsets a host iterator of the offsets to the start of each group. A group is - * counted as one even when the cluster is empty in it. The offsets should have the same values as - * the ones in `group_offsets`. * @param group_offsets a device iterator of the offsets to the start of each group. A group is - * counted as one even when the cluster is empty in it. The offsets should have the same values as - * the ones in `h_group_offsets`. + * counted as one even when the cluster is empty in it. * @param group_labels a device iterator of the the group label for each tdigest cluster including * empty clusters. * @param num_group_labels the number of unique group labels. @@ -1142,9 +1138,8 @@ std::pair, rmm::device_uvector> generate_mer * * @return A column containing the merged tdigests. */ -template +template std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, - HGroupOffsetIter h_group_offsets, GroupOffsetIter group_offsets, GroupLabelIter group_labels, size_t num_group_labels, @@ -1313,21 +1308,13 @@ std::unique_ptr reduce_merge_tdigest(column_view const& input, if (input.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_scalar(stream, mr); } - auto group_offsets_ = group_offsets_fn{input.size()}; - auto h_group_offsets = cudf::detail::make_counting_transform_iterator(0, group_offsets_); - auto group_offsets = cudf::detail::make_counting_transform_iterator(0, group_offsets_); - auto group_labels = thrust::make_constant_iterator(0); - return to_tdigest_scalar(merge_tdigests(tdv, - h_group_offsets, - group_offsets, - group_labels, - input.size(), - 1, - max_centroids, - stream, - mr), - stream, - mr); + auto group_offsets_ = group_offsets_fn{input.size()}; + auto group_offsets = cudf::detail::make_counting_transform_iterator(0, group_offsets_); + auto group_labels = thrust::make_constant_iterator(0); + return to_tdigest_scalar( + merge_tdigests(tdv, group_offsets, group_labels, input.size(), 1, max_centroids, stream, mr), + stream, + mr); } std::unique_ptr group_tdigest(column_view const& col, @@ -1376,16 +1363,7 @@ std::unique_ptr group_merge_tdigest(column_view const& input, return cudf::tdigest::detail::make_empty_tdigests_column(num_groups, stream, mr); } - // bring group offsets back to the host - std::vector h_group_offsets(group_offsets.size()); - cudaMemcpyAsync(h_group_offsets.data(), - group_offsets.begin(), - sizeof(size_type) * group_offsets.size(), - cudaMemcpyDefault, - stream); - return merge_tdigests(tdv, - h_group_offsets.begin(), group_offsets.data(), group_labels.data(), group_labels.size(), From 7115f20e91a314f07333cbd5c01adc62bf2fbb0c Mon Sep 17 00:00:00 2001 From: 
Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 24 Oct 2024 15:34:44 -0400 Subject: [PATCH 111/117] Move `segmented_gather` function from the copying module to the lists module (#17148) This PR moves `segmented_gather` out of the copying module and into the lists module. And it uses the pylibcudf `segmented_gather` implementation in cudf python. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17148 --- python/cudf/cudf/_lib/copying.pyx | 26 +----------------- python/cudf/cudf/_lib/lists.pyx | 38 +++++++++++++++++---------- python/cudf/cudf/core/column/lists.py | 2 +- 3 files changed, 26 insertions(+), 40 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 30353c4be6c..4221e745e65 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -4,7 +4,7 @@ import pickle from libc.stdint cimport uint8_t, uintptr_t from libcpp cimport bool -from libcpp.memory cimport make_shared, shared_ptr, unique_ptr +from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector @@ -30,10 +30,6 @@ from libcpp.memory cimport make_unique cimport pylibcudf.libcudf.contiguous_split as cpp_contiguous_split from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.lists.gather cimport ( - segmented_gather as cpp_segmented_gather, -) -from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type @@ -339,26 +335,6 @@ def get_element(Column input_column, size_type index): ) -@acquire_spill_lock() -def segmented_gather(Column source_column, Column gather_map): - cdef shared_ptr[lists_column_view] source_LCV = ( - make_shared[lists_column_view](source_column.view()) - ) - cdef shared_ptr[lists_column_view] gather_map_LCV = ( - make_shared[lists_column_view](gather_map.view()) - ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_segmented_gather( - source_LCV.get()[0], gather_map_LCV.get()[0]) - ) - - result = Column.from_unique_ptr(move(c_result)) - return result - - cdef class _CPackedColumns: @staticmethod diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 7e8710bedb6..12432ac6d5d 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -9,7 +9,7 @@ from pylibcudf.libcudf.types cimport null_order, size_type from cudf._lib.column cimport Column from cudf._lib.utils cimport columns_from_pylibcudf_table -import pylibcudf +import pylibcudf as plc from pylibcudf cimport Scalar @@ -17,7 +17,7 @@ from pylibcudf cimport Scalar @acquire_spill_lock() def count_elements(Column col): return Column.from_pylibcudf( - pylibcudf.lists.count_elements( + plc.lists.count_elements( col.to_pylibcudf(mode="read")) ) @@ -25,8 +25,8 @@ def count_elements(Column col): @acquire_spill_lock() def explode_outer(list source_columns, int explode_column_idx): return columns_from_pylibcudf_table( - pylibcudf.lists.explode_outer( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]), + plc.lists.explode_outer( + plc.Table([c.to_pylibcudf(mode="read") for c in source_columns]), explode_column_idx, ) ) @@ -35,7 +35,7 @@ def explode_outer(list source_columns, int explode_column_idx): @acquire_spill_lock() def 
distinct(Column col, bool nulls_equal, bool nans_all_equal): return Column.from_pylibcudf( - pylibcudf.lists.distinct( + plc.lists.distinct( col.to_pylibcudf(mode="read"), nulls_equal, nans_all_equal, @@ -46,7 +46,7 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal): @acquire_spill_lock() def sort_lists(Column col, bool ascending, str na_position): return Column.from_pylibcudf( - pylibcudf.lists.sort_lists( + plc.lists.sort_lists( col.to_pylibcudf(mode="read"), ascending, null_order.BEFORE if na_position == "first" else null_order.AFTER, @@ -58,7 +58,7 @@ def sort_lists(Column col, bool ascending, str na_position): @acquire_spill_lock() def extract_element_scalar(Column col, size_type index): return Column.from_pylibcudf( - pylibcudf.lists.extract_list_element( + plc.lists.extract_list_element( col.to_pylibcudf(mode="read"), index, ) @@ -68,7 +68,7 @@ def extract_element_scalar(Column col, size_type index): @acquire_spill_lock() def extract_element_column(Column col, Column index): return Column.from_pylibcudf( - pylibcudf.lists.extract_list_element( + plc.lists.extract_list_element( col.to_pylibcudf(mode="read"), index.to_pylibcudf(mode="read"), ) @@ -78,7 +78,7 @@ def extract_element_column(Column col, Column index): @acquire_spill_lock() def contains_scalar(Column col, py_search_key): return Column.from_pylibcudf( - pylibcudf.lists.contains( + plc.lists.contains( col.to_pylibcudf(mode="read"), py_search_key.device_value.c_value, ) @@ -88,7 +88,7 @@ def contains_scalar(Column col, py_search_key): @acquire_spill_lock() def index_of_scalar(Column col, object py_search_key): return Column.from_pylibcudf( - pylibcudf.lists.index_of( + plc.lists.index_of( col.to_pylibcudf(mode="read"), py_search_key.device_value.c_value, True, @@ -99,7 +99,7 @@ def index_of_scalar(Column col, object py_search_key): @acquire_spill_lock() def index_of_column(Column col, Column search_keys): return Column.from_pylibcudf( - pylibcudf.lists.index_of( + plc.lists.index_of( col.to_pylibcudf(mode="read"), search_keys.to_pylibcudf(mode="read"), True, @@ -110,8 +110,8 @@ def index_of_column(Column col, Column search_keys): @acquire_spill_lock() def concatenate_rows(list source_columns): return Column.from_pylibcudf( - pylibcudf.lists.concatenate_rows( - pylibcudf.Table([ + plc.lists.concatenate_rows( + plc.Table([ c.to_pylibcudf(mode="read") for c in source_columns ]) ) @@ -121,8 +121,18 @@ def concatenate_rows(list source_columns): @acquire_spill_lock() def concatenate_list_elements(Column input_column, dropna=False): return Column.from_pylibcudf( - pylibcudf.lists.concatenate_list_elements( + plc.lists.concatenate_list_elements( input_column.to_pylibcudf(mode="read"), dropna, ) ) + + +@acquire_spill_lock() +def segmented_gather(Column source_column, Column gather_map): + return Column.from_pylibcudf( + plc.lists.segmented_gather( + source_column.to_pylibcudf(mode="read"), + gather_map.to_pylibcudf(mode="read"), + ) + ) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index c6a39199e3b..e9d24d4f450 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -11,7 +11,6 @@ from typing_extensions import Self import cudf -from cudf._lib.copying import segmented_gather from cudf._lib.lists import ( concatenate_list_elements, concatenate_rows, @@ -22,6 +21,7 @@ extract_element_scalar, index_of_column, index_of_scalar, + segmented_gather, sort_lists, ) from cudf._lib.strings.convert.convert_lists import format_list_column From 
03777f6b5d44d54316e55bff4e31d3e8e6583c25 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 25 Oct 2024 09:06:25 -0400 Subject: [PATCH 112/117] Fix host-to-device copy missing sync in strings/duration convert (#17149) Fixes a missing stream sync when copying a temporary host vector to device. The host vector could be destroyed before the copy is completed. Updates the code to use the vector factory function `make_device_uvector_sync()` instead of `cudaMemcpyAsync`. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/17149 --- cpp/src/strings/convert/convert_durations.cu | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 0db1adf1223..f5d052c6657 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -16,6 +16,7 @@ #include #include #include +#include <cudf/detail/utilities/vector_factories.hpp> #include #include #include @@ -152,12 +153,8 @@ struct format_compiler { } // create program in device memory - d_items.resize(items.size(), stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_items.data(), - items.data(), - items.size() * sizeof(items[0]), - cudaMemcpyDefault, - stream.value())); + d_items = cudf::detail::make_device_uvector_sync( + items, stream, cudf::get_current_device_resource_ref()); } format_item const* compiled_format_items() { return d_items.data(); } From e98e6b9209ff8557d85cb9b828b895884b0c7b7a Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 25 Oct 2024 09:07:06 -0400 Subject: [PATCH 113/117] Deprecate current libcudf nvtext minhash functions (#17152) Deprecates the current nvtext minhash functions, some of which will be replaced in #16756 with a different signature. The others will no longer be used and will be removed in a future release. The existing gtests and benchmarks will be retained for rework in a future release as well.
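For downstream C++ code that cannot migrate until the replacement APIs land, the `[[deprecated]]` attribute only adds a compile-time warning, which can be silenced locally. A minimal sketch, assuming a GCC- or Clang-compatible compiler; `call_legacy_minhash` is an illustrative wrapper name, not part of libcudf:

    #include <cudf/column/column.hpp>
    #include <cudf/strings/strings_column_view.hpp>
    #include <nvtext/minhash.hpp>

    #include <memory>

    // Keeps calling the deprecated overload during the migration window.
    std::unique_ptr<cudf::column> call_legacy_minhash(cudf::strings_column_view const& input)
    {
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
      // The default seed (0) and width (4) still apply; behavior is unchanged in 24.12.
      return nvtext::minhash(input);
    #pragma GCC diagnostic pop
    }

Apart from the attribute, the declarations are untouched, so existing call sites keep compiling and running until the removal release.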
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17152 --- cpp/benchmarks/CMakeLists.txt | 4 ++-- cpp/include/nvtext/minhash.hpp | 24 ++++++++++++++++++------ cpp/tests/CMakeLists.txt | 1 - 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index f013b31b3de..7f82b603912 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -348,8 +348,8 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp) ConfigureNVBench( - TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp - text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp + TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/normalize.cpp + text/replace.cpp text/tokenize.cpp text/vocab.cpp ) # ################################################################################################## diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 7c909f1a948..42124461cdf 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -41,6 +41,8 @@ namespace CUDF_EXPORT nvtext { * * This function uses MurmurHash3_x86_32 for the hash algorithm. * + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if the width < 2 * * @param input Strings column to compute minhash @@ -51,7 +53,7 @@ namespace CUDF_EXPORT nvtext { * @param mr Device memory resource used to allocate the returned column's device memory * @return Minhash values for each string in input */ -std::unique_ptr minhash( +[[deprecated]] std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::numeric_scalar seed = 0, cudf::size_type width = 4, @@ -71,6 +73,8 @@ std::unique_ptr minhash( * * Any null row entries result in corresponding null output rows. * + * @deprecated Deprecated in 24.12 - to be replaced in a future release + * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit @@ -83,7 +87,7 @@ std::unique_ptr minhash( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr minhash( +[[deprecated]] std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::device_span seeds, cudf::size_type width = 4, @@ -102,6 +106,8 @@ std::unique_ptr minhash( * The hash function returns 2 uint64 values but only the first value * is used with the minhash calculation. * + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if the width < 2 * * @param input Strings column to compute minhash @@ -112,7 +118,7 @@ std::unique_ptr minhash( * @param mr Device memory resource used to allocate the returned column's device memory * @return Minhash values as UINT64 for each string in input */ -std::unique_ptr minhash64( +[[deprecated]] std::unique_ptr minhash64( cudf::strings_column_view const& input, cudf::numeric_scalar seed = 0, cudf::size_type width = 4, @@ -132,6 +138,8 @@ std::unique_ptr minhash64( * * Any null row entries result in corresponding null output rows. 
* + * @deprecated Deprecated in 24.12 - to be replaced in a future release + * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit @@ -144,7 +152,7 @@ std::unique_ptr minhash64( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr minhash64( +[[deprecated]] std::unique_ptr minhash64( cudf::strings_column_view const& input, cudf::device_span seeds, cudf::size_type width = 4, @@ -164,6 +172,8 @@ std::unique_ptr minhash64( * * Any null row entries result in corresponding null output rows. * + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * @@ -173,7 +183,7 @@ std::unique_ptr minhash64( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr word_minhash( +[[deprecated]] std::unique_ptr word_minhash( cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -193,6 +203,8 @@ std::unique_ptr word_minhash( * * Any null row entries result in corresponding null output rows. * + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * @@ -202,7 +214,7 @@ std::unique_ptr word_minhash( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr word_minhash64( +[[deprecated]] std::unique_ptr word_minhash64( cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream = cudf::get_default_stream(), diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a4213dcbe94..b78a64d0e55 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -611,7 +611,6 @@ ConfigureTest( text/bpe_tests.cpp text/edit_distance_tests.cpp text/jaccard_tests.cpp - text/minhash_tests.cpp text/ngrams_tests.cpp text/ngrams_tokenize_tests.cpp text/normalize_tests.cpp From 0bb699e7616bbfb8564fb3d9db986756713aec8c Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 25 Oct 2024 13:10:10 -0400 Subject: [PATCH 114/117] Move nvtext ngrams benchmarks to nvbench (#17173) Moves the `nvtext::generate_ngrams` and `nvtext::generate_character_ngrams` benchmarks from google-bench to nvbench. Target parameters are exposed to help with profiling. 
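Because the row count, row width, and ngram type are now nvbench axes, a single configuration can be pinned from the command line when profiling instead of editing the benchmark source. A hypothetical invocation, assuming nvbench's standard options (`-b` selects a benchmark, `-a` pins an axis, and `--profile` runs one iteration for use under an external profiler):

    ./TEXT_NVBENCH -b ngrams -a num_rows=1048578 -a row_width=100 -a type=chars --profile

The axis names and values here come from the registration in the diff below; the binary name and path depend on the local build tree.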
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/17173 --- cpp/benchmarks/CMakeLists.txt | 6 ++-- cpp/benchmarks/text/ngrams.cpp | 65 ++++++++++++++-------------------- 2 files changed, 29 insertions(+), 42 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 7f82b603912..2a4ac789046 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -345,11 +345,11 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- -ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp) +ConfigureBench(TEXT_BENCH text/subword.cpp) ConfigureNVBench( - TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/normalize.cpp - text/replace.cpp text/tokenize.cpp text/vocab.cpp + TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/ngrams.cpp + text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/text/ngrams.cpp b/cpp/benchmarks/text/ngrams.cpp index 8e48f8e9a05..43d57201b20 100644 --- a/cpp/benchmarks/text/ngrams.cpp +++ b/cpp/benchmarks/text/ngrams.cpp @@ -15,58 +15,45 @@ */ #include -#include -#include -#include #include #include #include -class TextNGrams : public cudf::benchmark {}; +#include -enum class ngrams_type { tokens, characters }; - -static void BM_ngrams(benchmark::State& state, ngrams_type nt) +static void bench_ngrams(nvbench::state& state) { - auto const n_rows = static_cast(state.range(0)); - auto const max_str_length = static_cast(state.range(1)); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const ngram_type = state.get_string("type"); + data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); auto const separator = cudf::string_scalar("_"); - for (auto _ : state) { - cuda_event_timer raii(state, true); - switch (nt) { - case ngrams_type::tokens: nvtext::generate_ngrams(input, 2, separator); break; - case ngrams_type::characters: nvtext::generate_character_ngrams(input); break; - } - } + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); -} + auto chars_size = input.chars_size(cudf::get_default_stream()); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(chars_size * 2); -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 5; - int const max_rowlen = 40; - int const len_mult = 2; - generate_string_bench_args(b, 
min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); + if (ngram_type == "chars") { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::generate_character_ngrams(input); + }); + } else { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::generate_ngrams(input, 2, separator); + }); + } } -#define NVTEXT_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(TextNGrams, name) \ - (::benchmark::State & st) { BM_ngrams(st, ngrams_type::name); } \ - BENCHMARK_REGISTER_F(TextNGrams, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -NVTEXT_BENCHMARK_DEFINE(tokens) -NVTEXT_BENCHMARK_DEFINE(characters) +NVBENCH_BENCH(bench_ngrams) + .set_name("ngrams") + .add_int64_axis("num_rows", {131072, 262144, 524288, 1048578}) + .add_int64_axis("row_width", {10, 20, 40, 100}) + .add_string_axis("type", {"chars", "tokens"}); From 2113bd6bbce62028eff0fa523a85ea859bf2bc08 Mon Sep 17 00:00:00 2001 From: Jordan Jacobelli Date: Fri, 25 Oct 2024 19:18:53 +0200 Subject: [PATCH 115/117] devcontainer: replace `VAULT_HOST` with `AWS_ROLE_ARN` (#17134) This PR is replacing the `VAULT_HOST` variable with `AWS_ROLE_ARN`. This is required to use the new token service to get AWS credentials. Authors: - Jordan Jacobelli (https://github.com/jjacobelli) Approvers: - Bradley Dice (https://github.com/bdice) - Paul Taylor (https://github.com/trxcllnt) URL: https://github.com/rapidsai/cudf/pull/17134 --- .devcontainer/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 8190b5d0297..315a389339a 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -31,6 +31,6 @@ ENV PYTHONDONTWRITEBYTECODE="1" ENV SCCACHE_REGION="us-east-2" ENV SCCACHE_BUCKET="rapids-sccache-devs" -ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai" +ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" ENV HISTFILE="/home/coder/.cache/._bash_history" ENV LIBCUDF_KERNEL_CACHE_PATH="/home/coder/cudf/cpp/build/${PYTHON_PACKAGE_MANAGER}/cuda-${CUDA_VERSION}/latest/jitify_cache" From 5cba4fb18883dd511e3f892bfbe3ac46caa2db6c Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Fri, 25 Oct 2024 22:38:38 +0200 Subject: [PATCH 116/117] lint: replace `isort` with Ruff's rule I (#16685) since #15312 moved formatting from Black to Rufft, it would make sense also unify import formatting under the same ruff so use build-in `I` rule instead of additional `isort` Authors: - Jirka Borovec (https://github.com/Borda) - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) - https://github.com/jakirkham URL: https://github.com/rapidsai/cudf/pull/16685 --- .pre-commit-config.yaml | 12 +--- CONTRIBUTING.md | 4 +- .../developer_guide/contributing_guide.md | 3 +- docs/cudf/source/user_guide/10min.ipynb | 2 +- .../source/user_guide/guide-to-udfs.ipynb | 6 ++ pyproject.toml | 2 + python/cudf/benchmarks/conftest.py | 18 +++--- python/cudf/cudf/_typing.py | 10 ++-- python/cudf/cudf/core/buffer/buffer.py | 5 +- .../core/buffer/exposure_tracked_buffer.py | 5 +- python/cudf/cudf/core/column/__init__.py | 1 - python/cudf/cudf/core/column/categorical.py | 3 +- python/cudf/cudf/core/column/column.py | 3 +- python/cudf/cudf/core/column/datetime.py | 4 +- 
python/cudf/cudf/core/column/decimal.py | 3 +- python/cudf/cudf/core/column/lists.py | 4 +- python/cudf/cudf/core/column/methods.py | 4 +- python/cudf/cudf/core/column/numerical.py | 4 +- python/cudf/cudf/core/column/string.py | 4 +- python/cudf/cudf/core/column/timedelta.py | 4 +- python/cudf/cudf/core/column_accessor.py | 3 +- python/cudf/cudf/core/dataframe.py | 4 +- python/cudf/cudf/core/df_protocol.py | 7 ++- python/cudf/cudf/core/frame.py | 3 +- python/cudf/cudf/core/groupby/groupby.py | 4 +- python/cudf/cudf/core/index.py | 4 +- python/cudf/cudf/core/indexed_frame.py | 4 +- python/cudf/cudf/core/indexing_utils.py | 12 ++-- python/cudf/cudf/core/multiindex.py | 4 +- python/cudf/cudf/core/series.py | 4 +- python/cudf/cudf/core/tools/datetimes.py | 5 +- python/cudf/cudf/pandas/fast_slow_proxy.py | 4 +- python/cudf/cudf/pandas/module_accelerator.py | 2 +- .../pandas/scripts/analyze-test-failures.py | 1 + .../cudf/pandas/scripts/conftest-patch.py | 2 +- .../pandas/scripts/summarize-test-results.py | 3 +- .../cudf_pandas_tests/test_fast_slow_proxy.py | 3 +- python/cudf/pyproject.toml | 59 +++++-------------- python/cudf_kafka/pyproject.toml | 59 +++++-------------- python/custreamz/custreamz/tests/conftest.py | 1 + python/custreamz/pyproject.toml | 58 +++++------------- python/dask_cudf/dask_cudf/__init__.py | 20 +++---- python/dask_cudf/dask_cudf/expr/__init__.py | 4 +- python/dask_cudf/dask_cudf/io/__init__.py | 12 ++-- python/dask_cudf/pyproject.toml | 51 +++------------- .../pylibcudf/pylibcudf/tests/common/utils.py | 3 +- python/pylibcudf/pylibcudf/tests/conftest.py | 3 +- .../pylibcudf/pylibcudf/tests/io/test_avro.py | 3 +- .../pylibcudf/pylibcudf/tests/io/test_csv.py | 5 +- .../pylibcudf/pylibcudf/tests/io/test_json.py | 5 +- .../pylibcudf/pylibcudf/tests/io/test_orc.py | 3 +- .../pylibcudf/tests/io/test_parquet.py | 5 +- .../tests/io/test_source_sink_info.py | 3 +- .../pylibcudf/tests/io/test_timezone.py | 3 +- .../pylibcudf/tests/test_binaryops.py | 3 +- .../pylibcudf/tests/test_column_factories.py | 3 +- .../tests/test_column_from_device.py | 3 +- .../pylibcudf/tests/test_contiguous_split.py | 3 +- .../pylibcudf/pylibcudf/tests/test_copying.py | 3 +- .../pylibcudf/tests/test_datetime.py | 3 +- .../pylibcudf/tests/test_expressions.py | 3 +- .../pylibcudf/pylibcudf/tests/test_interop.py | 3 +- python/pylibcudf/pylibcudf/tests/test_join.py | 3 +- python/pylibcudf/pylibcudf/tests/test_json.py | 3 +- .../pylibcudf/tests/test_labeling.py | 3 +- .../pylibcudf/pylibcudf/tests/test_lists.py | 3 +- .../pylibcudf/tests/test_null_mask.py | 5 +- .../tests/test_nvtext_edit_distance.py | 3 +- .../tests/test_nvtext_generate_ngrams.py | 3 +- .../pylibcudf/tests/test_nvtext_jaccard.py | 3 +- .../pylibcudf/tests/test_nvtext_minhash.py | 3 +- .../tests/test_nvtext_ngrams_tokenize.py | 3 +- .../pylibcudf/tests/test_nvtext_normalize.py | 3 +- .../pylibcudf/tests/test_nvtext_replace.py | 3 +- .../pylibcudf/tests/test_nvtext_stemmer.py | 3 +- .../pylibcudf/tests/test_partitioning.py | 3 +- .../pylibcudf/tests/test_quantiles.py | 3 +- .../pylibcudf/tests/test_regex_program.py | 3 +- .../pylibcudf/pylibcudf/tests/test_reshape.py | 3 +- .../pylibcudf/pylibcudf/tests/test_round.py | 3 +- .../pylibcudf/tests/test_string_attributes.py | 3 +- .../pylibcudf/tests/test_string_capitalize.py | 3 +- .../pylibcudf/tests/test_string_case.py | 3 +- .../pylibcudf/tests/test_string_char_types.py | 3 +- .../pylibcudf/tests/test_string_combine.py | 3 +- .../pylibcudf/tests/test_string_contains.py | 3 +- 
.../pylibcudf/tests/test_string_convert.py | 3 +- .../tests/test_string_convert_booleans.py | 3 +- .../tests/test_string_convert_datetime.py | 3 +- .../tests/test_string_convert_durations.py | 3 +- .../tests/test_string_convert_fixed_point.py | 3 +- .../tests/test_string_convert_floats.py | 3 +- .../tests/test_string_convert_integers.py | 3 +- .../tests/test_string_convert_ipv4.py | 3 +- .../tests/test_string_convert_lists.py | 3 +- .../tests/test_string_convert_urls.py | 3 +- .../pylibcudf/tests/test_string_extract.py | 1 + .../pylibcudf/tests/test_string_find.py | 3 +- .../tests/test_string_find_multiple.py | 3 +- .../pylibcudf/tests/test_string_findall.py | 3 +- .../pylibcudf/tests/test_string_padding.py | 1 + .../pylibcudf/tests/test_string_repeat.py | 3 +- .../pylibcudf/tests/test_string_replace.py | 3 +- .../pylibcudf/tests/test_string_replace_re.py | 3 +- .../pylibcudf/tests/test_string_slice.py | 3 +- .../tests/test_string_split_partition.py | 3 +- .../tests/test_string_split_split.py | 3 +- .../pylibcudf/tests/test_string_strip.py | 3 +- .../pylibcudf/tests/test_string_translate.py | 3 +- .../pylibcudf/tests/test_string_wrap.py | 3 +- .../pylibcudf/pylibcudf/tests/test_table.py | 3 +- .../pylibcudf/tests/test_transform.py | 3 +- .../pylibcudf/tests/test_transpose.py | 3 +- python/pylibcudf/pyproject.toml | 56 +++++------------- 114 files changed, 318 insertions(+), 380 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 174dc72bf02..0e86407de11 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,17 +16,6 @@ repos: ^cpp/cmake/thirdparty/patches/.*| ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* ) - - repo: https://github.com/PyCQA/isort - rev: 5.13.2 - hooks: - - id: isort - # Use the config file specific to each subproject so that each - # project can specify its own first/third-party packages. - args: ["--config-root=python/", "--resolve-all-configs"] - files: python/.* - exclude: | - (?x)^(^python/cudf_polars/.*) - types_or: [python, cython, pyi] - repo: https://github.com/MarcoGorelli/cython-lint rev: v0.16.2 hooks: @@ -150,6 +139,7 @@ repos: rev: v0.4.8 hooks: - id: ruff + args: ["--fix"] files: python/.*$ - id: ruff-format files: python/.*$ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f9cdde7c2b7..b55af21a300 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -293,8 +293,8 @@ In order to run doxygen as a linter on C++/CUDA code, run ./ci/checks/doxygen.sh ``` -Python code runs several linters including [Black](https://black.readthedocs.io/en/stable/), -[isort](https://pycqa.github.io/isort/), and [flake8](https://flake8.pycqa.org/en/latest/). +Python code runs several linters including [Ruff](https://docs.astral.sh/ruff/) +with its various rules for Black-like formatting or Isort. cuDF also uses [codespell](https://github.com/codespell-project/codespell) to find spelling mistakes, and this check is run as a pre-commit hook. To apply the suggested spelling fixes, diff --git a/docs/cudf/source/developer_guide/contributing_guide.md b/docs/cudf/source/developer_guide/contributing_guide.md index 6fce268f309..f4d2c7319b3 100644 --- a/docs/cudf/source/developer_guide/contributing_guide.md +++ b/docs/cudf/source/developer_guide/contributing_guide.md @@ -15,8 +15,7 @@ Developers are strongly recommended to set up `pre-commit` prior to any developm The `.pre-commit-config.yaml` file at the root of the repo is the primary source of truth linting. 
Specifically, cuDF uses the following tools: -- [`ruff`](https://beta.ruff.rs/) checks for general code formatting compliance. -- [`isort`](https://pycqa.github.io/isort/) ensures imports are sorted consistently. +- [`ruff`](https://docs.astral.sh/ruff/) checks for general code formatting compliance. - [`mypy`](http://mypy-lang.org/) performs static type checking. In conjunction with [type hints](https://docs.python.org/3/library/typing.html), `mypy` can help catch various bugs that are otherwise difficult to find. diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index 95f5f9734dd..46221b6015b 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -38,10 +38,10 @@ "import os\n", "\n", "import cupy as cp\n", + "import dask_cudf\n", "import pandas as pd\n", "\n", "import cudf\n", - "import dask_cudf\n", "\n", "cp.random.seed(12)\n", "\n", diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb index 75eafcc5387..abfe5a1b178 100644 --- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb +++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb @@ -101,6 +101,8 @@ "outputs": [], "source": [ "# define a scalar function\n", + "\n", + "\n", "def f(x):\n", " return x + 1" ] @@ -247,6 +249,8 @@ "outputs": [], "source": [ "# redefine the same function from above\n", + "\n", + "\n", "def f(x):\n", " return x + 1" ] @@ -1622,6 +1626,8 @@ "outputs": [], "source": [ "# a user defined aggregation function.\n", + "\n", + "\n", "def udaf(df):\n", " return df[\"b\"].max() - df[\"b\"].min() / 2" ] diff --git a/pyproject.toml b/pyproject.toml index 661c68ee62e..6933484f4e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,8 @@ select = [ "F", # pycodestyle Warning "W", + # isort + "I", # no-blank-line-before-function "D201", # one-blank-line-after-class diff --git a/python/cudf/benchmarks/conftest.py b/python/cudf/benchmarks/conftest.py index 7b2b71cf216..0e4afadccf5 100644 --- a/python/cudf/benchmarks/conftest.py +++ b/python/cudf/benchmarks/conftest.py @@ -56,27 +56,23 @@ # into the main repo. sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) -from config import cudf # noqa: W0611, E402, F401 -from utils import ( # noqa: E402 - OrderedSet, - collapse_fixtures, - column_generators, - make_fixture, -) - # Turn off isort until we upgrade to 5.8.0 # https://github.com/pycqa/isort/issues/1594 -# isort: off from config import ( # noqa: W0611, E402, F401 NUM_COLS, NUM_ROWS, collect_ignore, + cudf, # noqa: W0611, E402, F401 pytest_collection_modifyitems, pytest_sessionfinish, pytest_sessionstart, ) - -# isort: on +from utils import ( # noqa: E402 + OrderedSet, + collapse_fixtures, + column_generators, + make_fixture, +) @pytest_cases.fixture(params=[0, 1], ids=["AxisIndex", "AxisColumn"]) diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py index 6e8ad556b08..3b13cc258ab 100644 --- a/python/cudf/cudf/_typing.py +++ b/python/cudf/cudf/_typing.py @@ -1,8 +1,8 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
import sys -from collections.abc import Callable -from typing import TYPE_CHECKING, Any, Dict, Iterable, TypeVar, Union +from collections.abc import Callable, Iterable +from typing import TYPE_CHECKING, Any, TypeVar, Union import numpy as np from pandas import Period, Timedelta, Timestamp @@ -42,7 +42,7 @@ SeriesOrSingleColumnIndex = Union["cudf.Series", "cudf.core.index.Index"] # Groupby aggregation -AggType = Union[str, Callable] -MultiColumnAggType = Union[ - AggType, Iterable[AggType], Dict[Any, Iterable[AggType]] +AggType = Union[str, Callable] # noqa: UP007 +MultiColumnAggType = Union[ # noqa: UP007 + AggType, Iterable[AggType], dict[Any, Iterable[AggType]] ] diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index caff019f575..ffa306bf93f 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -6,7 +6,7 @@ import pickle import weakref from types import SimpleNamespace -from typing import Any, Literal, Mapping +from typing import TYPE_CHECKING, Any, Literal import numpy from typing_extensions import Self @@ -18,6 +18,9 @@ from cudf.core.abc import Serializable from cudf.utils.string import format_bytes +if TYPE_CHECKING: + from collections.abc import Mapping + def host_memory_allocation(nbytes: int) -> memoryview: """Allocate host memory using NumPy diff --git a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py index 0bd8d6054b3..ecf9807cfc2 100644 --- a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py +++ b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py @@ -2,13 +2,16 @@ from __future__ import annotations -from typing import Literal, Mapping +from typing import TYPE_CHECKING, Literal from typing_extensions import Self import cudf from cudf.core.buffer.buffer import Buffer, BufferOwner +if TYPE_CHECKING: + from collections.abc import Mapping + class ExposureTrackedBuffer(Buffer): """An exposure tracked buffer. 
diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 06791df7dc0..a1e87d04bc9 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -29,4 +29,3 @@ Decimal128Column, DecimalBaseColumn, ) -from cudf.core.column.interval import IntervalColumn # noqa: F401 diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 864e87b5377..087d0ed65f5 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -4,7 +4,7 @@ import warnings from functools import cached_property -from typing import TYPE_CHECKING, Any, Mapping, Sequence, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np import pandas as pd @@ -26,6 +26,7 @@ if TYPE_CHECKING: from collections import abc + from collections.abc import Mapping, Sequence import numba.cuda diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 7674565e2c3..d2cd6e8ac8f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -4,10 +4,11 @@ import pickle from collections import abc +from collections.abc import MutableSequence, Sequence from functools import cached_property from itertools import chain from types import SimpleNamespace -from typing import TYPE_CHECKING, Any, Literal, MutableSequence, Sequence, cast +from typing import TYPE_CHECKING, Any, Literal, cast import cupy import numpy as np diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 2c9b0baa9b6..b6dc250e64d 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -8,7 +8,7 @@ import locale import re from locale import nl_langinfo -from typing import TYPE_CHECKING, Literal, Sequence, cast +from typing import TYPE_CHECKING, Literal, cast import numpy as np import pandas as pd @@ -31,6 +31,8 @@ from cudf.utils.utils import _all_bools_with_nulls if TYPE_CHECKING: + from collections.abc import Sequence + from cudf._typing import ( ColumnBinaryOperand, DatetimeLikeScalar, diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 8803ebd6791..8ae06f72d1e 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -3,8 +3,9 @@ from __future__ import annotations import warnings +from collections.abc import Sequence from decimal import Decimal -from typing import TYPE_CHECKING, Sequence, cast +from typing import TYPE_CHECKING, cast import cupy as cp import numpy as np diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index e9d24d4f450..6b25e568f00 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -3,7 +3,7 @@ from __future__ import annotations from functools import cached_property -from typing import TYPE_CHECKING, Sequence, cast +from typing import TYPE_CHECKING, cast import numpy as np import pandas as pd @@ -34,6 +34,8 @@ from cudf.core.missing import NA if TYPE_CHECKING: + from collections.abc import Sequence + from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 05a0ab2e09a..a91c080fe21 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py 
@@ -2,9 +2,7 @@ from __future__ import annotations -from typing import Union, overload - -from typing_extensions import Literal +from typing import Literal, Union, overload import cudf import cudf.core.column diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 78d2814ed26..620cae65374 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -3,7 +3,7 @@ from __future__ import annotations import functools -from typing import TYPE_CHECKING, Any, Sequence, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np import pandas as pd @@ -28,7 +28,7 @@ from .numerical_base import NumericalBaseColumn if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import Callable, Sequence from cudf._typing import ( ColumnBinaryOperand, diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index b25e486d855..856ce0f75de 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5,7 +5,7 @@ import re import warnings from functools import cached_property -from typing import TYPE_CHECKING, Sequence, cast, overload +from typing import TYPE_CHECKING, cast, overload import numpy as np import pandas as pd @@ -35,6 +35,8 @@ def str_to_boolean(column: StringColumn): if TYPE_CHECKING: + from collections.abc import Sequence + import cupy import numba.cuda diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 6b6f3e517a8..087d6474e7f 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -4,7 +4,7 @@ import datetime import functools -from typing import TYPE_CHECKING, Sequence, cast +from typing import TYPE_CHECKING, cast import numpy as np import pandas as pd @@ -19,6 +19,8 @@ from cudf.utils.utils import _all_bools_with_nulls if TYPE_CHECKING: + from collections.abc import Sequence + from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype _unit_to_nanoseconds_conversion = { diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index bc093fdaa9a..496e86ed709 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -5,8 +5,9 @@ import itertools import sys from collections import abc +from collections.abc import Mapping from functools import cached_property, reduce -from typing import TYPE_CHECKING, Any, Mapping, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np import pandas as pd diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7d4d34f5b04..bf1c39b23da 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -13,8 +13,8 @@ import textwrap import warnings from collections import abc, defaultdict -from collections.abc import Callable, Iterator -from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast +from collections.abc import Callable, Iterator, MutableMapping +from typing import TYPE_CHECKING, Any, Literal, cast import cupy import numba diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 5250a741d3d..aa601a2b322 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -3,7 +3,7 @@ import enum from collections import abc -from typing import Any, Iterable, Mapping, Sequence, Tuple, cast +from typing import 
TYPE_CHECKING, Any, cast import cupy as cp import numpy as np @@ -20,6 +20,9 @@ build_column, ) +if TYPE_CHECKING: + from collections.abc import Iterable, Mapping, Sequence + # Implementation of interchange protocol classes # ---------------------------------------------- @@ -61,7 +64,7 @@ class _MaskKind(enum.IntEnum): _DtypeKind.BOOL, _DtypeKind.STRING, } -ProtoDtype = Tuple[_DtypeKind, int, str, str] +ProtoDtype = tuple[_DtypeKind, int, str, str] class _CuDFBuffer: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 37ad6b8fabb..205edd91d9d 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,7 +6,7 @@ import pickle import warnings from collections import abc -from typing import TYPE_CHECKING, Any, Literal, MutableMapping +from typing import TYPE_CHECKING, Any, Literal # TODO: The `numpy` import is needed for typing purposes during doc builds # only, need to figure out why the `np` alias is insufficient then remove. @@ -36,6 +36,7 @@ from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf if TYPE_CHECKING: + from collections.abc import MutableMapping from types import ModuleType from cudf._typing import Dtype, ScalarLike diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 81b20488d8d..6630bd96c01 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -8,7 +8,7 @@ import warnings from collections import abc from functools import cached_property -from typing import TYPE_CHECKING, Any, Iterable, Literal +from typing import TYPE_CHECKING, Any, Literal import cupy as cp import numpy as np @@ -36,6 +36,8 @@ from cudf.utils.utils import GetAttrGetItemMixin if TYPE_CHECKING: + from collections.abc import Iterable + from cudf._typing import ( AggType, DataFrameOrSeries, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index cd07c58c5d9..1b90e9f9df0 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -5,10 +5,10 @@ import operator import pickle import warnings -from collections.abc import Hashable +from collections.abc import Hashable, MutableMapping from functools import cache, cached_property from numbers import Number -from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast +from typing import TYPE_CHECKING, Any, Literal, cast import cupy import numpy as np diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 5952815deef..e031f2a4e8e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -10,9 +10,7 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, - MutableMapping, TypeVar, cast, ) @@ -63,6 +61,8 @@ from cudf.utils.utils import _warn_no_dask_cudf if TYPE_CHECKING: + from collections.abc import Callable, MutableMapping + from cudf._typing import ( ColumnLike, DataFrameOrSeries, diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 8182e5cede2..ce6a5c960dd 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -3,9 +3,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Any, List, Union - -from typing_extensions import TypeAlias +from typing import Any, TypeAlias import cudf from cudf.api.types import _is_scalar_or_zero_d_array, is_integer @@ -46,11 +44,11 @@ class ScalarIndexer: key: GatherMap -IndexingSpec: TypeAlias 
= Union[ - EmptyIndexer, MapIndexer, MaskIndexer, ScalarIndexer, SliceIndexer -] +IndexingSpec: TypeAlias = ( + EmptyIndexer | MapIndexer | MaskIndexer | ScalarIndexer | SliceIndexer +) -ColumnLabels: TypeAlias = List[str] +ColumnLabels: TypeAlias = list[str] def destructure_iloc_key( diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 92d094d9de5..bfff62f0a89 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -8,7 +8,7 @@ import pickle import warnings from functools import cached_property -from typing import TYPE_CHECKING, Any, MutableMapping +from typing import TYPE_CHECKING, Any import cupy as cp import numpy as np @@ -36,7 +36,7 @@ from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name if TYPE_CHECKING: - from collections.abc import Generator, Hashable + from collections.abc import Generator, Hashable, MutableMapping from typing_extensions import Self diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 29ed18ac0ce..9b60424c924 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -9,7 +9,7 @@ import warnings from collections import abc from shutil import get_terminal_size -from typing import TYPE_CHECKING, Any, Literal, MutableMapping +from typing import TYPE_CHECKING, Any, Literal import cupy import numpy as np @@ -71,6 +71,8 @@ from cudf.utils.performance_tracking import _performance_tracking if TYPE_CHECKING: + from collections.abc import MutableMapping + import pyarrow as pa from cudf._typing import ( diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 68f34fa28ff..885e7b16644 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -4,7 +4,7 @@ import math import re import warnings -from typing import Literal, Sequence +from typing import TYPE_CHECKING, Literal import numpy as np import pandas as pd @@ -20,6 +20,9 @@ from cudf.core import column from cudf.core.index import ensure_index +if TYPE_CHECKING: + from collections.abc import Sequence + # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112 _unit_map = { "year": "year", diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index c364d55e677..73afde407db 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -10,9 +10,9 @@ import pickle import types import warnings -from collections.abc import Callable, Iterator +from collections.abc import Callable, Iterator, Mapping from enum import IntEnum -from typing import Any, Literal, Mapping +from typing import Any, Literal import numpy as np diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index f82e300e83d..38103a71908 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -17,7 +17,7 @@ from abc import abstractmethod from importlib._bootstrap import _ImportLockContext as ImportLock from types import ModuleType -from typing import Any, ContextManager, NamedTuple +from typing import Any, ContextManager, NamedTuple # noqa: UP035 from typing_extensions import Self diff --git a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py index 8870fbc5c28..bb2fc00d9fc 100644 --- 
a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py +++ b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py @@ -9,6 +9,7 @@ python analyze-test-failures.py Example: +------- python analyze-test-failures.py log.json frame/* """ diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index d12d2697729..59966a5ff0c 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -35,7 +35,7 @@ def null_assert_warnings(*args, **kwargs): @pytest.fixture(scope="session", autouse=True) # type: ignore def patch_testing_functions(): - tm.assert_produces_warning = null_assert_warnings + tm.assert_produces_warning = null_assert_warnings # noqa: F821 pytest.raises = replace_kwargs({"match": None})(pytest.raises) diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py index 4ea0b3b4413..a0ad872e4c7 100644 --- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py +++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py @@ -5,7 +5,8 @@ """ Summarizes the test results per module. -Examples: +Examples +-------- python summarize-test-results.py log.json python summarize-test-results.py log.json --output json python summarize-test-results.py log.json --output table diff --git a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py index a75a20a4681..63fd9601fc1 100644 --- a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py +++ b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py @@ -387,7 +387,8 @@ def test_dir_bound_method( ): """This test will fail because dir for bound methods is currently incorrect, but we have no way to fix it without materializing the slow - type, which is unnecessarily expensive.""" + type, which is unnecessarily expensive. 
+ """ Fast, FastIntermediate = fast_and_intermediate_with_doc Slow, SlowIntermediate = slow_and_intermediate_with_doc diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index feab04ffadc..80201dd84db 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -81,50 +81,6 @@ cudf-pandas-tests = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", -] -known_rapids = [ - "rmm", - "pylibcudf" -] -known_first_party = [ - "cudf", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", -] - [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" empty_parameter_set_mark = "fail_at_collect" @@ -174,3 +130,18 @@ wheel.packages = ["cudf"] provider = "scikit_build_core.metadata.regex" input = "cudf/VERSION" regex = "(?P.*)" + +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["cudf"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm", "pylibcudf"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 87e19a2bccf..667cd7b1db8 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -32,51 +32,20 @@ test = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", - "streamz", -] -known_rapids = [ - "rmm", - "cudf", - "dask_cudf", -] -known_first_party = [ - "cudf_kafka", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", -] +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["cudf_kafka"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda", "streamz"] +rapids = ["rmm", "cudf", "dask_cudf"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" diff --git a/python/custreamz/custreamz/tests/conftest.py b/python/custreamz/custreamz/tests/conftest.py index 1cda9b71387..c5135bc6414 100644 --- a/python/custreamz/custreamz/tests/conftest.py +++ b/python/custreamz/custreamz/tests/conftest.py @@ -2,6 +2,7 @@ import socket import pytest + from custreamz import kafka diff --git 
a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index af45f49d9b4..a8ab05a3922 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -65,50 +65,20 @@ include = [ ] exclude = ["*tests*"] -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", -] -known_rapids = [ - "rmm", - "cudf", - "dask_cudf", -] -known_first_party = [ - "streamz", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", -] +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["streamz"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm", "cudf", "dask_cudf"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index 04c2ad65b99..f9df22cc436 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -7,15 +7,15 @@ # do anything for dask==2024.2.0) config.set({"dataframe.query-planning-warning": False}) -import dask.dataframe as dd -from dask.dataframe import from_delayed +import dask.dataframe as dd # noqa: E402 +from dask.dataframe import from_delayed # noqa: E402 -import cudf +import cudf # noqa: E402 -from . import backends -from ._version import __git_commit__, __version__ -from .core import concat, from_cudf, from_dask_dataframe -from .expr import QUERY_PLANNING_ON +from . import backends # noqa: E402, F401 +from ._version import __git_commit__, __version__ # noqa: E402, F401 +from .core import concat, from_cudf, from_dask_dataframe # noqa: E402 +from .expr import QUERY_PLANNING_ON # noqa: E402 def read_csv(*args, **kwargs): @@ -55,9 +55,9 @@ def inner_func(*args, **kwargs): to_orc = raise_not_implemented_error("to_orc") else: - from .core import DataFrame, Index, Series - from .groupby import groupby_agg - from .io import read_text, to_orc + from .core import DataFrame, Index, Series # noqa: F401 + from .groupby import groupby_agg # noqa: F401 + from .io import read_text, to_orc # noqa: F401 __all__ = [ diff --git a/python/dask_cudf/dask_cudf/expr/__init__.py b/python/dask_cudf/dask_cudf/expr/__init__.py index a76b655ef42..6dadadd5263 100644 --- a/python/dask_cudf/dask_cudf/expr/__init__.py +++ b/python/dask_cudf/dask_cudf/expr/__init__.py @@ -12,8 +12,8 @@ config.set({"dataframe.shuffle.method": "tasks"}) try: - import dask_cudf.expr._collection - import dask_cudf.expr._expr + import dask_cudf.expr._collection # noqa: F401 + import dask_cudf.expr._expr # noqa: F401 except ImportError as err: # Dask *should* raise an error before this. 
diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py
index 76bb2ea99b4..0421bd755f4 100644
--- a/python/dask_cudf/dask_cudf/io/__init__.py
+++ b/python/dask_cudf/dask_cudf/io/__init__.py
@@ -1,11 +1,11 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.

-from .csv import read_csv
-from .json import read_json
-from .orc import read_orc, to_orc
-from .text import read_text
+from .csv import read_csv  # noqa: F401
+from .json import read_json  # noqa: F401
+from .orc import read_orc, to_orc  # noqa: F401
+from .text import read_text  # noqa: F401

 try:
-    from .parquet import read_parquet, to_parquet
+    from .parquet import read_parquet, to_parquet  # noqa: F401
 except ImportError:
     pass
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index 705865d083b..862e8f36eaa 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -69,50 +69,17 @@ version = {file = "dask_cudf/VERSION"}
 [tool.setuptools.packages.find]
 exclude = ["*tests*"]

-[tool.isort]
-line_length = 79
-multi_line_output = 3
-include_trailing_comma = true
-force_grid_wrap = 0
-combine_as_imports = true
-order_by_type = true
+[tool.ruff]
+extend = "../../pyproject.toml"

-known_dask = [
-    "dask",
-    "distributed",
-    "dask_cuda",
-]
-known_rapids = [
-    "rmm",
-    "cudf",
-]
-known_first_party = [
-    "dask_cudf",
-]
+[tool.ruff.lint.isort]
+combine-as-imports = true
+known-first-party = ["dask_cudf"]
+section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"]

-default_section = "THIRDPARTY"
-sections = [
-    "FUTURE",
-    "STDLIB",
-    "THIRDPARTY",
-    "DASK",
-    "RAPIDS",
-    "FIRSTPARTY",
-    "LOCALFOLDER",
-]
-skip = [
-    "thirdparty",
-    ".eggs",
-    ".git",
-    ".hg",
-    ".mypy_cache",
-    ".tox",
-    ".venv",
-    "_build",
-    "buck-out",
-    "build",
-    "dist",
-]
+[tool.ruff.lint.isort.sections]
+dask = ["dask", "distributed", "dask_cuda"]
+rapids = ["rmm", "cudf"]

 [tool.pytest.ini_options]
 addopts = "--tb=native --strict-config --strict-markers"
diff --git a/python/pylibcudf/pylibcudf/tests/common/utils.py b/python/pylibcudf/pylibcudf/tests/common/utils.py
index 9f389fa42c4..d95849ef371 100644
--- a/python/pylibcudf/pylibcudf/tests/common/utils.py
+++ b/python/pylibcudf/pylibcudf/tests/common/utils.py
@@ -7,10 +7,11 @@
 import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from pyarrow.orc import write_table as orc_write_table
 from pyarrow.parquet import write_table as pq_write_table
+
+import pylibcudf as plc
 from pylibcudf.io.types import CompressionType
diff --git a/python/pylibcudf/pylibcudf/tests/conftest.py b/python/pylibcudf/pylibcudf/tests/conftest.py
index fdce6f353ca..a19a8835498 100644
--- a/python/pylibcudf/pylibcudf/tests/conftest.py
+++ b/python/pylibcudf/pylibcudf/tests/conftest.py
@@ -8,8 +8,9 @@
 import numpy as np
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
+
+import pylibcudf as plc
 from pylibcudf.io.types import CompressionType

 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common"))
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_avro.py b/python/pylibcudf/pylibcudf/tests/io/test_avro.py
index 0cd5064a697..3d9d99ffa61 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_avro.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_avro.py
@@ -5,10 +5,11 @@

 import fastavro
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_table_and_meta_eq

+import pylibcudf as plc
+
 avro_dtype_pairs = [
     ("boolean", pa.bool_()),
     ("int", pa.int32()),
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py
index ab26f23418d..22c83acc47c 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_csv.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py
@@ -5,9 +5,7 @@

 import pandas as pd
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
-from pylibcudf.io.types import CompressionType
 from utils import (
     _convert_types,
     assert_table_and_meta_eq,
@@ -15,6 +13,9 @@
     write_source_str,
 )

+import pylibcudf as plc
+from pylibcudf.io.types import CompressionType
+
 # Shared kwargs to pass to make_source
 _COMMON_CSV_SOURCE_KWARGS = {
     "format": "csv",
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_json.py b/python/pylibcudf/pylibcudf/tests/io/test_json.py
index 9d976fedf00..453e5ce32a8 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_json.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_json.py
@@ -3,9 +3,7 @@

 import pandas as pd
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
-from pylibcudf.io.types import CompressionType
 from utils import (
     assert_table_and_meta_eq,
     make_source,
@@ -13,6 +11,9 @@
     write_source_str,
 )

+import pylibcudf as plc
+from pylibcudf.io.types import CompressionType
+
 # Shared kwargs to pass to make_source
 _COMMON_JSON_SOURCE_KWARGS = {"format": "json", "orient": "records"}
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py
index 42b14b1feff..5ed660ba6cf 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_orc.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import _convert_types, assert_table_and_meta_eq, make_source

+import pylibcudf as plc
+
 # Shared kwargs to pass to make_source
 _COMMON_ORC_SOURCE_KWARGS = {"format": "orc"}
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py
index f6e843ccf66..41298601539 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py
@@ -1,9 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from pyarrow.parquet import read_table
+from utils import assert_table_and_meta_eq, make_source
+
+import pylibcudf as plc
 from pylibcudf.expressions import (
     ASTOperator,
     ColumnNameReference,
@@ -11,7 +13,6 @@
     Literal,
     Operation,
 )
-from utils import assert_table_and_meta_eq, make_source

 # Shared kwargs to pass to make_source
 _COMMON_PARQUET_SOURCE_KWARGS = {"format": "parquet"}
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py
index 747f58ec8cf..0c43c363e55 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py
@@ -2,9 +2,10 @@

 import io

-import pylibcudf as plc
 import pytest

+import pylibcudf as plc
+

 @pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo])
 def io_class(request):
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_timezone.py b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py
index 76b0424b2af..b3555013927 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_timezone.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import zoneinfo

-import pylibcudf as plc
 import pytest

+import pylibcudf as plc
+

 def test_make_timezone_transition_table():
     if len(zoneinfo.TZPATH) == 0:
diff --git a/python/pylibcudf/pylibcudf/tests/test_binaryops.py b/python/pylibcudf/pylibcudf/tests/test_binaryops.py
index f784cb3c191..bbb08e8b95a 100644
--- a/python/pylibcudf/pylibcudf/tests/test_binaryops.py
+++ b/python/pylibcudf/pylibcudf/tests/test_binaryops.py
@@ -4,10 +4,11 @@

 import numpy as np
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 def idfn(param):
     ltype, rtype, outtype, plc_op, _ = param
diff --git a/python/pylibcudf/pylibcudf/tests/test_column_factories.py b/python/pylibcudf/pylibcudf/tests/test_column_factories.py
index 8cedbc6d42f..e317362a76b 100644
--- a/python/pylibcudf/pylibcudf/tests/test_column_factories.py
+++ b/python/pylibcudf/pylibcudf/tests/test_column_factories.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import DEFAULT_STRUCT_TESTING_TYPE, assert_column_eq

+import pylibcudf as plc
+
 EMPTY_COL_SIZE = 3

 NUMERIC_TYPES = [
diff --git a/python/pylibcudf/pylibcudf/tests/test_column_from_device.py b/python/pylibcudf/pylibcudf/tests/test_column_from_device.py
index 0e129fdf0ef..24cd6b9e35f 100644
--- a/python/pylibcudf/pylibcudf/tests/test_column_from_device.py
+++ b/python/pylibcudf/pylibcudf/tests/test_column_from_device.py
@@ -1,12 +1,13 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

 import rmm

+import pylibcudf as plc
+
 VALID_TYPES = [
     pa.int8(),
     pa.int16(),
diff --git a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py
index 7a5c1664eed..6d8b5993964 100644
--- a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py
+++ b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_table_eq

+import pylibcudf as plc
+
 param_pyarrow_tables = [
     pa.table([]),
     pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}),
diff --git a/python/pylibcudf/pylibcudf/tests/test_copying.py b/python/pylibcudf/pylibcudf/tests/test_copying.py
index 628682d0a66..c0a41b96b1a 100644
--- a/python/pylibcudf/pylibcudf/tests/test_copying.py
+++ b/python/pylibcudf/pylibcudf/tests/test_copying.py
@@ -2,7 +2,6 @@

 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import (
     DEFAULT_STRUCT_TESTING_TYPE,
@@ -16,6 +15,8 @@
     metadata_from_arrow_type,
 )

+import pylibcudf as plc
+

 # TODO: consider moving this to conftest and "pairing"
 # it with pa_type, so that they don't get out of sync
diff --git a/python/pylibcudf/pylibcudf/tests/test_datetime.py b/python/pylibcudf/pylibcudf/tests/test_datetime.py
index 75930d59058..a80ab8d9f65 100644
--- a/python/pylibcudf/pylibcudf/tests/test_datetime.py
+++ b/python/pylibcudf/pylibcudf/tests/test_datetime.py
@@ -4,10 +4,11 @@

 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.fixture(scope="module", params=["s", "ms", "us", "ns"])
 def datetime_column(has_nulls, request):
diff --git a/python/pylibcudf/pylibcudf/tests/test_expressions.py b/python/pylibcudf/pylibcudf/tests/test_expressions.py
index 5894ef4624c..6eabd6db617 100644
--- a/python/pylibcudf/pylibcudf/tests/test_expressions.py
+++ b/python/pylibcudf/pylibcudf/tests/test_expressions.py
@@ -1,8 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest

+import pylibcudf as plc
+

 # We can't really evaluate these expressions, so just make sure
 # construction works properly
diff --git a/python/pylibcudf/pylibcudf/tests/test_interop.py b/python/pylibcudf/pylibcudf/tests/test_interop.py
index 01c998f16d4..e4f5174ad9b 100644
--- a/python/pylibcudf/pylibcudf/tests/test_interop.py
+++ b/python/pylibcudf/pylibcudf/tests/test_interop.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest

+import pylibcudf as plc
+

 def test_list_dtype_roundtrip():
     list_type = pa.list_(pa.int32())
diff --git a/python/pylibcudf/pylibcudf/tests/test_join.py b/python/pylibcudf/pylibcudf/tests/test_join.py
index 61e02f4d28d..f43a56046a4 100644
--- a/python/pylibcudf/pylibcudf/tests/test_join.py
+++ b/python/pylibcudf/pylibcudf/tests/test_join.py
@@ -2,9 +2,10 @@

 import numpy as np
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_table_eq

+import pylibcudf as plc
+

 def test_cross_join():
     left = pa.Table.from_arrays([[0, 1, 2], [3, 4, 5]], names=["a", "b"])
diff --git a/python/pylibcudf/pylibcudf/tests/test_json.py b/python/pylibcudf/pylibcudf/tests/test_json.py
index 3d2955211f8..486a9524e92 100644
--- a/python/pylibcudf/pylibcudf/tests/test_json.py
+++ b/python/pylibcudf/pylibcudf/tests/test_json.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.fixture(scope="module")
 def plc_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_labeling.py b/python/pylibcudf/pylibcudf/tests/test_labeling.py
index f7fb7463b50..beacfc63ce5 100644
--- a/python/pylibcudf/pylibcudf/tests/test_labeling.py
+++ b/python/pylibcudf/pylibcudf/tests/test_labeling.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest

+import pylibcudf as plc
+

 @pytest.mark.parametrize("left_inclusive", [True, False])
 @pytest.mark.parametrize("right_inclusive", [True, False])
diff --git a/python/pylibcudf/pylibcudf/tests/test_lists.py b/python/pylibcudf/pylibcudf/tests/test_lists.py
index 2353a6ff8f9..f3ef555f11d 100644
--- a/python/pylibcudf/pylibcudf/tests/test_lists.py
+++ b/python/pylibcudf/pylibcudf/tests/test_lists.py
@@ -3,10 +3,11 @@
 import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.fixture
 def test_data():
diff --git a/python/pylibcudf/pylibcudf/tests/test_null_mask.py b/python/pylibcudf/pylibcudf/tests/test_null_mask.py
index 3edcae59edc..cd3da856de2 100644
--- a/python/pylibcudf/pylibcudf/tests/test_null_mask.py
+++ b/python/pylibcudf/pylibcudf/tests/test_null_mask.py
@@ -1,12 +1,13 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest
-from pylibcudf.null_mask import MaskState

 import rmm

+import pylibcudf as plc
+from pylibcudf.null_mask import MaskState
+

 @pytest.fixture(params=[False, True])
 def nullable(request):
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py
index 7d93c471cc4..8b14e0db576 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.fixture(scope="module")
 def edit_distance_data():
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py
index 5cf9874d595..fae4685f81b 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.fixture(scope="module")
 def input_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py
index d5a168426b1..05fe7b53c16 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.fixture(scope="module")
 def input_data():
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
index 4e389a63f90..ead9ee094af 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest

+import pylibcudf as plc
+

 @pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()])
 def minhash_input_data(request):
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py
index 283a009288d..84748b5597e 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.fixture(scope="module")
 def input_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py
index fe28b83c09a..25b6d1389ec 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.fixture(scope="module")
 def norm_spaces_input_data():
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py
index 0fb54bb4ee1..65687f31c85 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.fixture(scope="module")
 def input_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py
index 75d56f587a4..e7f4a971f08 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.fixture(scope="module")
 def input_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_partitioning.py b/python/pylibcudf/pylibcudf/tests/test_partitioning.py
index 444d0089d2c..c55e54cebc6 100644
--- a/python/pylibcudf/pylibcudf/tests/test_partitioning.py
+++ b/python/pylibcudf/pylibcudf/tests/test_partitioning.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_table_eq

+import pylibcudf as plc
+

 @pytest.fixture(scope="module")
 def partitioning_data():
diff --git a/python/pylibcudf/pylibcudf/tests/test_quantiles.py b/python/pylibcudf/pylibcudf/tests/test_quantiles.py
index bac56691306..e4a24fb1c98 100644
--- a/python/pylibcudf/pylibcudf/tests/test_quantiles.py
+++ b/python/pylibcudf/pylibcudf/tests/test_quantiles.py
@@ -3,10 +3,11 @@
 import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq, assert_table_eq

+import pylibcudf as plc
+
 # Map pylibcudf interpolation options to pyarrow options
 interp_mapping = {
     plc.types.Interpolation.LINEAR: "linear",
diff --git a/python/pylibcudf/pylibcudf/tests/test_regex_program.py b/python/pylibcudf/pylibcudf/tests/test_regex_program.py
index 777315df538..52598f2c462 100644
--- a/python/pylibcudf/pylibcudf/tests/test_regex_program.py
+++ b/python/pylibcudf/pylibcudf/tests/test_regex_program.py
@@ -1,8 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

-import pylibcudf as plc
 import pytest

+import pylibcudf as plc
+

 @pytest.mark.parametrize("pat", ["(", "*", "\\"])
 def test_regex_program_invalid(pat):
diff --git a/python/pylibcudf/pylibcudf/tests/test_reshape.py b/python/pylibcudf/pylibcudf/tests/test_reshape.py
index 01115bc363a..ef23e23766a 100644
--- a/python/pylibcudf/pylibcudf/tests/test_reshape.py
+++ b/python/pylibcudf/pylibcudf/tests/test_reshape.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq, assert_table_eq

+import pylibcudf as plc
+

 @pytest.fixture(scope="module")
 def reshape_data():
diff --git a/python/pylibcudf/pylibcudf/tests/test_round.py b/python/pylibcudf/pylibcudf/tests/test_round.py
index 0b30316b9a0..2526580bc13 100644
--- a/python/pylibcudf/pylibcudf/tests/test_round.py
+++ b/python/pylibcudf/pylibcudf/tests/test_round.py
@@ -2,10 +2,11 @@

 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.fixture(params=["float32", "float64"])
 def column(request, has_nulls):
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py
index a1820def0b1..f461657281a 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py
@@ -2,10 +2,11 @@

 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.fixture()
 def str_data():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py b/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py
index 176ccc55b96..3e31c75c38a 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py
@@ -2,10 +2,11 @@

 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.fixture(scope="module")
 def str_data():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_case.py b/python/pylibcudf/pylibcudf/tests/test_string_case.py
index 233cc253b14..08ac371fd96 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_case.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_case.py
@@ -2,10 +2,11 @@

 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.fixture(scope="module")
 def string_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_char_types.py b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py
index bcd030c019e..06b44210d74 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_char_types.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py
@@ -2,9 +2,10 @@

 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 from utils import assert_column_eq

+import pylibcudf as plc
+

 def test_all_characters_of_type():
     pa_array = pa.array(["1", "A"])
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_combine.py b/python/pylibcudf/pylibcudf/tests/test_string_combine.py
index 4a7007a0d6b..eea3ac68e84 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_combine.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_combine.py
@@ -2,10 +2,11 @@

 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 def test_concatenate_scalar_seperator():
     plc_table = plc.interop.from_arrow(
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_contains.py b/python/pylibcudf/pylibcudf/tests/test_string_contains.py
index 4e4dd7cbb00..ba9a4a7d3b8 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_contains.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_contains.py
@@ -2,10 +2,11 @@

 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.fixture(scope="module")
 def target_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert.py b/python/pylibcudf/pylibcudf/tests/test_string_convert.py
index 69f7a0fdd33..3f3f452c4f6 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.fixture(
     scope="module",
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py
index 117c59ff1b8..b391d2b290e 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq

+import pylibcudf as plc
+

 def test_to_booleans():
     pa_array = pa.array(["true", None, "True"])
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py
index f3e84286a36..c9368d858a4 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py
@@ -3,10 +3,11 @@

 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.fixture
 def fmt():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py
index 6d704309bfd..2d3578e4e71 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py
@@ -3,10 +3,11 @@
 from datetime import datetime, timedelta

 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.fixture(
     params=[
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py
index b1c4d729604..012e722038e 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py
@@ -2,9 +2,10 @@
 import decimal

 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq

+import pylibcudf as plc
+

 def test_to_fixed_point():
     typ = pa.decimal128(38, 2)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py
index e9918fab559..8ee2b5075af 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq

+import pylibcudf as plc
+

 def test_to_floats():
     typ = pa.float32()
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py
index 6d1d565af30..01192c2d1f8 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py
@@ -1,8 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq

+import pylibcudf as plc
+

 def test_to_integers():
     typ = pa.int8()
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py
index 4dc3e512624..b533809f106 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py
@@ -1,8 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq

+import pylibcudf as plc
+

 def test_ipv4_to_integers():
     arr = pa.array(["123.45.67.890", None])
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py
index 8591732b39e..737036a4f0f 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.mark.parametrize("na_rep", [None, pa.scalar("")])
 @pytest.mark.parametrize("separators", [None, pa.array([",", "[", "]"])])
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py
index fee8c3fb8f6..528736798c7 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py
@@ -2,9 +2,10 @@
 import urllib

 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq

+import pylibcudf as plc
+

 def test_url_encode():
     data = ["/home/nfs", None]
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_extract.py b/python/pylibcudf/pylibcudf/tests/test_string_extract.py
index 788b86423c4..e70edf4fb33 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_extract.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_extract.py
@@ -2,6 +2,7 @@

 import pyarrow as pa
 import pyarrow.compute as pc
+
 import pylibcudf as plc
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find.py b/python/pylibcudf/pylibcudf/tests/test_string_find.py
index db3b13a5aae..82ec18832a9 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_find.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_find.py
@@ -2,10 +2,11 @@

 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.fixture(scope="module")
 def data_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py
index d6b37a388f0..fa9eee3594b 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq

+import pylibcudf as plc
+

 def test_find_multiple():
     arr = pa.array(["abc", "def"])
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_findall.py b/python/pylibcudf/pylibcudf/tests/test_string_findall.py
index debfad92d00..b73d812c898 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_findall.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_findall.py
@@ -2,9 +2,10 @@
 import re

 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq

+import pylibcudf as plc
+

 def test_findall():
     arr = pa.array(["bunny", "rabbit", "hare", "dog"])
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_padding.py b/python/pylibcudf/pylibcudf/tests/test_string_padding.py
index 2ba775d17ae..79498132097 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_padding.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_padding.py
@@ -2,6 +2,7 @@

 import pyarrow as pa
 import pyarrow.compute as pc
+
 import pylibcudf as plc
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py
index 18b5d8bf4d0..c06c06be7c6 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py
@@ -2,9 +2,10 @@

 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest

+import pylibcudf as plc
+

 @pytest.mark.parametrize("repeats", [pa.array([2, 2]), 2])
 def test_repeat_strings(repeats):
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_replace.py b/python/pylibcudf/pylibcudf/tests/test_string_replace.py
index 5a9c2007b73..2c7d25133de 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_replace.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_replace.py
@@ -2,10 +2,11 @@

 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.fixture(scope="module")
 def data_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py b/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py
index ff2ce348d3b..511f826441a 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py
@@ -2,10 +2,11 @@

 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.mark.parametrize("max_replace_count", [-1, 1])
 def test_replace_re_regex_program_scalar(max_replace_count):
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_slice.py b/python/pylibcudf/pylibcudf/tests/test_string_slice.py
index d9ce5591b98..1759f739e31 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_slice.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_slice.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.fixture(scope="module")
 def pa_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py
index 80cae8d1c6b..4e80f19b814 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_table_eq

+import pylibcudf as plc
+

 @pytest.fixture
 def data_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_split.py b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py
index 2aeffac8209..450b336ce65 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_split_split.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py
@@ -2,10 +2,11 @@

 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq, assert_table_eq

+import pylibcudf as plc
+

 @pytest.fixture
 def data_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_strip.py b/python/pylibcudf/pylibcudf/tests/test_string_strip.py
index 005e5e4a405..5869e5f4920 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_strip.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_strip.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+
 data_strings = [
     "AbC",
     "123abc",
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_translate.py b/python/pylibcudf/pylibcudf/tests/test_string_translate.py
index 2ae893e69fb..84fd3354ac6 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_translate.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_translate.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq

+import pylibcudf as plc
+

 @pytest.fixture
 def data_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py
index a1c820cd586..00442d866e9 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py
@@ -2,9 +2,10 @@
 import textwrap

 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq

+import pylibcudf as plc
+

 def test_wrap():
     width = 12
diff --git a/python/pylibcudf/pylibcudf/tests/test_table.py b/python/pylibcudf/pylibcudf/tests/test_table.py
index e822d6a97a8..ac39ef4c5c9 100644
--- a/python/pylibcudf/pylibcudf/tests/test_table.py
+++ b/python/pylibcudf/pylibcudf/tests/test_table.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest

+import pylibcudf as plc
+

 @pytest.mark.parametrize(
     "arrow_tbl",
diff --git a/python/pylibcudf/pylibcudf/tests/test_transform.py b/python/pylibcudf/pylibcudf/tests/test_transform.py
index d5c618f07e4..49802fe64ac 100644
--- a/python/pylibcudf/pylibcudf/tests/test_transform.py
+++ b/python/pylibcudf/pylibcudf/tests/test_transform.py
@@ -3,9 +3,10 @@
 import math

 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq

+import pylibcudf as plc
+

 def test_nans_to_nulls(has_nans):
     if has_nans:
diff --git a/python/pylibcudf/pylibcudf/tests/test_transpose.py b/python/pylibcudf/pylibcudf/tests/test_transpose.py
index ac11123f680..b0c0bc72ead 100644
--- a/python/pylibcudf/pylibcudf/tests/test_transpose.py
+++ b/python/pylibcudf/pylibcudf/tests/test_transpose.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from packaging.version import parse

+import pylibcudf as plc
+

 @pytest.mark.skipif(
     parse(pa.__version__) < parse("16.0.0"),
diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml
index ea5b3065896..a80c85a1fa8 100644
--- a/python/pylibcudf/pyproject.toml
+++ b/python/pylibcudf/pyproject.toml
@@ -53,48 +53,20 @@ test = [
 Homepage = "https://github.com/rapidsai/cudf"
 Documentation = "https://docs.rapids.ai/api/cudf/stable/"

-[tool.isort]
-line_length = 79
-multi_line_output = 3
-include_trailing_comma = true
-force_grid_wrap = 0
-combine_as_imports = true
-order_by_type = true
-known_dask = [
-    "dask",
-    "distributed",
-    "dask_cuda",
-]
-known_rapids = [
-    "rmm",
-]
-known_first_party = [
-    "cudf",
-]
-default_section = "THIRDPARTY"
-sections = [
-    "FUTURE",
-    "STDLIB",
-    "THIRDPARTY",
-    "DASK",
-    "RAPIDS",
-    "FIRSTPARTY",
-    "LOCALFOLDER",
-]
-skip = [
-    "thirdparty",
-    ".eggs",
-    ".git",
-    ".hg",
-    ".mypy_cache",
-    ".tox",
-    ".venv",
-    "_build",
-    "buck-out",
-    "build",
-    "dist",
-    "__init__.py",
-]
+[tool.ruff]
+extend = "../../pyproject.toml"
+
+[tool.ruff.lint.isort]
+combine-as-imports = true
+known-first-party = ["cudf"]
+section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"]
+
+[tool.ruff.lint.isort.sections]
+dask = ["dask", "distributed", "dask_cuda"]
+rapids = ["rmm"]
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["E402", "F401"]

 [tool.pytest.ini_options]
 # --import-mode=importlib because two test_json.py exists and tests directory is not a structured module

From 8bc9f19ebbb57bbc9bfa98efd94c8d7f8c65d316 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 25 Oct 2024 10:50:11 -1000
Subject: [PATCH 117/117] Add to_dlpack/from_dlpack APIs to pylibcudf (#17055)

Contributes to https://github.com/rapidsai/cudf/issues/15162

Could use some advice on how to type the input of `from_dlpack` and the
output of `to_dlpack`, which are PyCapsule objects.

EDIT: I notice Cython just types them as object
https://github.com/cython/cython/blob/master/Cython/Includes/cpython/pycapsule.pxd.
Stylistically, do we want to add `object var_name` or just leave them untyped?

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Murray (https://github.com/Matt711)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17055
---
 python/cudf/cudf/_lib/interop.pyx             | 69 ++-------------
 python/pylibcudf/pylibcudf/__init__.pxd       |  2 +
 python/pylibcudf/pylibcudf/interop.pxd        |  8 ++
 python/pylibcudf/pylibcudf/interop.pyx        | 94 ++++++++++++++++++-
 .../pylibcudf/pylibcudf/libcudf/interop.pxd   | 10 +-
 .../pylibcudf/pylibcudf/tests/test_interop.py | 31 ++++++
 6 files changed, 148 insertions(+), 66 deletions(-)
 create mode 100644 python/pylibcudf/pylibcudf/interop.pxd

diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx
index 1dc586bb257..1c9d3a01b80 100644
--- a/python/cudf/cudf/_lib/interop.pyx
+++ b/python/cudf/cudf/_lib/interop.pyx
@@ -1,49 +1,22 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.

-from cpython cimport pycapsule
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
 import pylibcudf

-from pylibcudf.libcudf.interop cimport (
-    DLManagedTensor,
-    from_dlpack as cpp_from_dlpack,
-    to_dlpack as cpp_to_dlpack,
-)
-from pylibcudf.libcudf.table.table cimport table
-from pylibcudf.libcudf.table.table_view cimport table_view
-
-from cudf._lib.utils cimport (
-    columns_from_pylibcudf_table,
-    columns_from_unique_ptr,
-    table_view_from_columns,
-)
+from cudf._lib.utils cimport columns_from_pylibcudf_table

 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.dtypes import ListDtype, StructDtype


-def from_dlpack(dlpack_capsule):
+def from_dlpack(object dlpack_capsule):
     """
     Converts a DLPack Tensor PyCapsule into a list of columns.

     DLPack Tensor PyCapsule is expected to have the name "dltensor".
     """
-    cdef DLManagedTensor* dlpack_tensor = <DLManagedTensor*>pycapsule.\
-        PyCapsule_GetPointer(dlpack_capsule, 'dltensor')
-    pycapsule.PyCapsule_SetName(dlpack_capsule, 'used_dltensor')
-
-    cdef unique_ptr[table] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_from_dlpack(dlpack_tensor)
-        )
-
-    res = columns_from_unique_ptr(move(c_result))
-    dlpack_tensor.deleter(dlpack_tensor)
-    return res
+    return columns_from_pylibcudf_table(
+        pylibcudf.interop.from_dlpack(dlpack_capsule)
+    )


 def to_dlpack(list source_columns):
@@ -52,39 +25,13 @@ def to_dlpack(list source_columns):

     DLPack Tensor PyCapsule will have the name "dltensor".
     """
-    if any(column.null_count for column in source_columns):
-        raise ValueError(
-            "Cannot create a DLPack tensor with null values. \
-                Input is required to have null count as zero."
-        )
-
-    cdef DLManagedTensor *dlpack_tensor
-    cdef table_view source_table_view = table_view_from_columns(source_columns)
-
-    with nogil:
-        dlpack_tensor = cpp_to_dlpack(
-            source_table_view
+    return pylibcudf.interop.to_dlpack(
+        pylibcudf.Table(
+            [col.to_pylibcudf(mode="read") for col in source_columns]
         )
-
-    return pycapsule.PyCapsule_New(
-        dlpack_tensor,
-        'dltensor',
-        dlmanaged_tensor_pycapsule_deleter
     )


-cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept:
-    cdef DLManagedTensor* dlpack_tensor = <DLManagedTensor*>0
-    try:
-        dlpack_tensor = <DLManagedTensor*>pycapsule.PyCapsule_GetPointer(
-            pycap_obj, 'used_dltensor')
-        return  # we do not call a used capsule's deleter
-    except Exception:
-        dlpack_tensor = <DLManagedTensor*>pycapsule.PyCapsule_GetPointer(
-            pycap_obj, 'dltensor')
-    dlpack_tensor.deleter(dlpack_tensor)
-
-
 def gather_metadata(object cols_dtypes):
     """
     Generates a ColumnMetadata vector for each column.
diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd
index aa67b4b1149..9bdfdab97c2 100644
--- a/python/pylibcudf/pylibcudf/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/__init__.pxd
@@ -13,6 +13,7 @@ from . cimport (
     expressions,
     filling,
     groupby,
+    interop,
     join,
     json,
     labeling,
@@ -62,6 +63,7 @@ __all__ = [
    "filling",
    "gpumemoryview",
    "groupby",
+    "interop",
    "join",
    "json",
    "lists",
diff --git a/python/pylibcudf/pylibcudf/interop.pxd b/python/pylibcudf/pylibcudf/interop.pxd
new file mode 100644
index 00000000000..2a0a8c15fdd
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/interop.pxd
@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.table cimport Table
+
+
+cpdef Table from_dlpack(object managed_tensor)
+
+cpdef object to_dlpack(Table input)
diff --git a/python/pylibcudf/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx
index 642516a1b90..61e812353b7 100644
--- a/python/pylibcudf/pylibcudf/interop.pyx
+++ b/python/pylibcudf/pylibcudf/interop.pyx
@@ -1,6 +1,11 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.

-from cpython.pycapsule cimport PyCapsule_GetPointer, PyCapsule_New
+from cpython.pycapsule cimport (
+    PyCapsule_GetPointer,
+    PyCapsule_IsValid,
+    PyCapsule_New,
+    PyCapsule_SetName,
+)
 from libc.stdlib cimport free
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
@@ -16,11 +21,14 @@ from pylibcudf.libcudf.interop cimport (
     ArrowArray,
     ArrowArrayStream,
     ArrowSchema,
+    DLManagedTensor,
     column_metadata,
     from_arrow_column as cpp_from_arrow_column,
     from_arrow_stream as cpp_from_arrow_stream,
+    from_dlpack as cpp_from_dlpack,
     to_arrow_host_raw,
     to_arrow_schema_raw,
+    to_dlpack as cpp_to_dlpack,
 )
 from pylibcudf.libcudf.table.table cimport table

@@ -315,3 +323,87 @@ def _to_arrow_scalar(cudf_object, metadata=None):
     # Note that metadata for scalars is primarily important for preserving
     # information on nested types since names are otherwise irrelevant.
     return to_arrow(Column.from_scalar(cudf_object, 1), metadata=metadata)[0]
+
+
+cpdef Table from_dlpack(object managed_tensor):
+    """
+    Convert a DLPack DLTensor into a cudf table.
+
+    For details, see :cpp:func:`cudf::from_dlpack`
+
+    Parameters
+    ----------
+    managed_tensor : PyCapsule
+        A 1D or 2D column-major (Fortran order) tensor.
+
+    Returns
+    -------
+    Table
+        Table with a copy of the tensor data.
+    """
+    if not PyCapsule_IsValid(managed_tensor, "dltensor"):
+        raise ValueError("Invalid PyCapsule object")
+    cdef unique_ptr[table] c_result
+    cdef DLManagedTensor* dlpack_tensor = <DLManagedTensor*>PyCapsule_GetPointer(
+        managed_tensor, "dltensor"
+    )
+    if dlpack_tensor is NULL:
+        raise ValueError("PyCapsule object contained a NULL pointer")
+    PyCapsule_SetName(managed_tensor, "used_dltensor")
+
+    # Note: A copy is always performed when converting the dlpack
+    # data to a libcudf table. We also delete the dlpack_tensor pointer
+    # as the pointer is not deleted by libcudf's from_dlpack function.
+    # TODO: https://github.com/rapidsai/cudf/issues/10874
+    # TODO: https://github.com/rapidsai/cudf/issues/10849
+    with nogil:
+        c_result = cpp_from_dlpack(dlpack_tensor)
+
+    cdef Table result = Table.from_libcudf(move(c_result))
+    dlpack_tensor.deleter(dlpack_tensor)
+    return result
+
+
+cpdef object to_dlpack(Table input):
+    """
+    Convert a cudf table into a DLPack DLTensor.
+
+    For details, see :cpp:func:`cudf::to_dlpack`
+
+    Parameters
+    ----------
+    input : Table
+        A 1D or 2D column-major (Fortran order) tensor.
+
+    Returns
+    -------
+    PyCapsule
+        1D or 2D DLPack tensor with a copy of the table data, or nullptr.
+    """
+    for col in input._columns:
+        if col.null_count():
+            raise ValueError(
+                "Cannot create a DLPack tensor with null values. "
+                "Input is required to have null count as zero."
+            )
+    cdef DLManagedTensor *dlpack_tensor
+
+    with nogil:
+        dlpack_tensor = cpp_to_dlpack(input.view())
+
+    return PyCapsule_New(
+        dlpack_tensor,
+        "dltensor",
+        dlmanaged_tensor_pycapsule_deleter
+    )
+
+
+cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept:
+    if PyCapsule_IsValid(pycap_obj, "used_dltensor"):
+        # we do not call a used capsule's deleter
+        return
+    cdef DLManagedTensor* dlpack_tensor = <DLManagedTensor*>PyCapsule_GetPointer(
+        pycap_obj, "dltensor"
+    )
+    if dlpack_tensor is not NULL:
+        dlpack_tensor.deleter(dlpack_tensor)
diff --git a/python/pylibcudf/pylibcudf/libcudf/interop.pxd b/python/pylibcudf/pylibcudf/libcudf/interop.pxd
index 30b97fdec34..b75e9ca7001 100644
--- a/python/pylibcudf/pylibcudf/libcudf/interop.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/interop.pxd
@@ -32,11 +32,13 @@ cdef extern from "cudf/interop.hpp" nogil:

 cdef extern from "cudf/interop.hpp" namespace "cudf" \
         nogil:
-    cdef unique_ptr[table] from_dlpack(const DLManagedTensor* tensor
-                                       ) except +
+    cdef unique_ptr[table] from_dlpack(
+        const DLManagedTensor* managed_tensor
+    ) except +

-    DLManagedTensor* to_dlpack(table_view input_table
-                               ) except +
+    DLManagedTensor* to_dlpack(
+        const table_view& input
+    ) except +

     cdef cppclass column_metadata:
         column_metadata() except +
diff --git a/python/pylibcudf/pylibcudf/tests/test_interop.py b/python/pylibcudf/pylibcudf/tests/test_interop.py
index e4f5174ad9b..af80b6e5978 100644
--- a/python/pylibcudf/pylibcudf/tests/test_interop.py
+++ b/python/pylibcudf/pylibcudf/tests/test_interop.py
@@ -1,7 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

+import cupy as cp
+import numpy as np
 import pyarrow as pa
 import pytest
+from utils import assert_table_eq

 import pylibcudf as plc

@@ -67,3 +70,31 @@ def test_decimal_other(data_type):

     arrow_type = plc.interop.to_arrow(data_type, precision=precision)
     assert arrow_type == pa.decimal128(precision, 0)
+
+
+def test_round_trip_dlpack_plc_table():
+    expected = pa.table({"a": [1, 2, 3], "b": [5, 6, 7]})
+    plc_table = plc.interop.from_arrow(expected)
+    result = plc.interop.from_dlpack(plc.interop.to_dlpack(plc_table))
+    assert_table_eq(expected, result)
+
+
+@pytest.mark.parametrize("array", [np.array, cp.array])
+def test_round_trip_dlpack_array(array):
+    arr = array([1, 2, 3])
+    result = plc.interop.from_dlpack(arr.__dlpack__())
+    expected = pa.table({"a": [1, 2, 3]})
+    assert_table_eq(expected, result)
+
+
+def test_to_dlpack_error():
+    plc_table = plc.interop.from_arrow(
+        pa.table({"a": [1, None, 3], "b": [5, 6, 7]})
+    )
+    with pytest.raises(ValueError, match="Cannot create a DLPack tensor"):
+        plc.interop.from_dlpack(plc.interop.to_dlpack(plc_table))
+
+
+def test_from_dlpack_error():
+    with pytest.raises(ValueError, match="Invalid PyCapsule object"):
+        plc.interop.from_dlpack(1)
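Taken together, the new pylibcudf entry points give a complete DLPack round
trip. A minimal usage sketch along the lines of the tests above (it assumes
pylibcudf is installed with a working GPU; the array values are purely
illustrative):

    import numpy as np
    import pylibcudf as plc

    # A 1D or 2D column-major (Fortran order) tensor maps onto table
    # columns; from_dlpack always copies the data into device memory.
    host = np.asfortranarray(np.array([[1, 2], [3, 4]], dtype="int64"))
    tbl = plc.interop.from_dlpack(host.__dlpack__())

    # to_dlpack returns a fresh "dltensor" capsule holding a copy of the
    # table data; from_dlpack renames a consumed capsule to "used_dltensor",
    # so a given capsule can be consumed only once.
    capsule = plc.interop.to_dlpack(tbl)
    roundtrip = plc.interop.from_dlpack(capsule)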