diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index 8190b5d0297..315a389339a 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -31,6 +31,6 @@ ENV PYTHONDONTWRITEBYTECODE="1"
 ENV SCCACHE_REGION="us-east-2"
 ENV SCCACHE_BUCKET="rapids-sccache-devs"
-ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai"
+ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs"
 ENV HISTFILE="/home/coder/.cache/._bash_history"
 ENV LIBCUDF_KERNEL_CACHE_PATH="/home/coder/cudf/cpp/build/${PYTHON_PACKAGE_MANAGER}/cuda-${CUDA_VERSION}/latest/jitify_cache"
diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json
index 2a195c6c81d..a0e193ff0bf 100644
--- a/.devcontainer/cuda12.5-conda/devcontainer.json
+++ b/.devcontainer/cuda12.5-conda/devcontainer.json
@@ -15,9 +15,31 @@
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
+    "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": {
+      "version": "12.5",
+      "installCompilers": false,
+      "installProfilers": true,
+      "installDevPackages": false,
+      "installcuDNN": false,
+      "installcuTensor": false,
+      "installNCCL": false,
+      "installCUDARuntime": false,
+      "installNVRTC": false,
+      "installOpenCL": false,
+      "installcuBLAS": false,
+      "installcuSPARSE": false,
+      "installcuFFT": false,
+      "installcuFile": false,
+      "installcuRAND": false,
+      "installcuSOLVER": false,
+      "installNPP": false,
+      "installnvJPEG": false,
+      "pruneStaticLibs": true
+    },
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
   },
   "overrideFeatureInstallOrder": [
+    "ghcr.io/rapidsai/devcontainers/features/cuda",
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
   ],
   "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}"],
diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml
index af8d1289ea1..6f0e88fb245 100644
--- a/.github/workflows/pr_issue_status_automation.yml
+++ b/.github/workflows/pr_issue_status_automation.yml
@@ -62,3 +62,33 @@ jobs:
       UPDATE_ITEM: true
       UPDATE_LINKED_ISSUES: true
     secrets: inherit
+
+  process-branch-name:
+    if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
+    needs: get-project-id
+    runs-on: ubuntu-latest
+    outputs:
+      branch-name: ${{ steps.process-branch-name.outputs.branch-name }}
+    steps:
+      - name: Extract branch name
+        id: process-branch-name
+        run: |
+          branch=${{ github.event.pull_request.base.ref }}
+          release=${branch#branch-}
+          echo "branch-name=$release" >> "$GITHUB_OUTPUT"
+
+  update-release:
+    # This job sets the PR and its linked issues to the release they are targeting
+    uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.12
+    if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
+    needs: [get-project-id, process-branch-name]
+    with:
+      PROJECT_ID: "PVT_kwDOAp2shc4AiNzl"
+      SINGLE_SELECT_FIELD_ID: "PVTSSF_lADOAp2shc4AiNzlzgg52UQ"
+      SINGLE_SELECT_FIELD_NAME: "Release"
+      SINGLE_SELECT_OPTION_VALUE: "${{ needs.process-branch-name.outputs.branch-name }}"
+      ITEM_PROJECT_ID: "${{ needs.get-project-id.outputs.ITEM_PROJECT_ID }}"
+      ITEM_NODE_ID: "${{ github.event.pull_request.node_id }}"
+      UPDATE_ITEM: true
+      UPDATE_LINKED_ISSUES: true
+    secrets: inherit
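For reference, `${branch#branch-}` in the new `process-branch-name` job is plain POSIX parameter expansion; a standalone sketch of what it produces:

    branch="branch-24.12"          # example value of the PR's base ref
    release=${branch#branch-}      # "#" strips the shortest match of "branch-" from the front
    echo "$release"                # prints: 24.12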
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index a22d3c5b9cc..1275aad757c 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -47,11 +47,23 @@ jobs:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
     with:
-      build_type: pull-request
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
       # Use the wheel container so we can skip conda solves and since our
       # primary static consumers (Spark) are not in conda anyway.
       container_image: "rapidsai/ci-wheel:latest"
       run_script: "ci/configure_cpp_static.sh"
+  clang-tidy:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
+    with:
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      run_script: "ci/clang_tidy.sh"
   conda-python-cudf-tests:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f861fb57916..0e86407de11 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -16,17 +16,6 @@ repos:
           ^cpp/cmake/thirdparty/patches/.*|
           ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*
         )
-  - repo: https://github.com/PyCQA/isort
-    rev: 5.13.2
-    hooks:
-      - id: isort
-        # Use the config file specific to each subproject so that each
-        # project can specify its own first/third-party packages.
-        args: ["--config-root=python/", "--resolve-all-configs"]
-        files: python/.*
-        exclude: |
-          (?x)^(^python/cudf_polars/.*)
-        types_or: [python, cython, pyi]
   - repo: https://github.com/MarcoGorelli/cython-lint
     rev: v0.16.2
     hooks:
@@ -95,6 +84,16 @@ repos:
         entry: 'pytest\.xfail'
         language: pygrep
        types: [python]
+      - id: no-unseeded-default-rng
+        name: no-unseeded-default-rng
+        description: 'Enforce that no non-seeded default_rng is used and default_rng is used instead of np.random.seed'
+        entry: |
+          # Check for usage of default_rng without seeding
+          default_rng\(\)|
+          # Check for usage of np.random.seed
+          np.random.seed\(
+        language: pygrep
+        types: [python]
       - id: cmake-format
         name: cmake-format
         entry: ./cpp/scripts/run-cmake-format.sh cmake-format
@@ -140,6 +139,7 @@ repos:
     rev: v0.4.8
     hooks:
      - id: ruff
+        args: ["--fix"]
        files: python/.*$
      - id: ruff-format
        files: python/.*$
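A rough command-line approximation of what the new `no-unseeded-default-rng` pygrep hook flags (`some_file.py` is a placeholder; the real hook applies the pattern to all Python files):

    grep -nE 'default_rng\(\)|np\.random\.seed\(' some_file.py
    # matches e.g. `rng = np.random.default_rng()` and `np.random.seed(0)`,
    # but not a seeded `np.random.default_rng(42)`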
diff --git a/CHANGELOG.md b/CHANGELOG.md
index f2a7c337675..7a75b2a95a4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,299 @@
+# cudf 24.10.00 (9 Oct 2024)
+
+## 🚨 Breaking Changes
+
+- Whitespace normalization of nested column coerced as string column in JSONL inputs ([#16759](https://github.com/rapidsai/cudf/pull/16759)) [@shrshi](https://github.com/shrshi)
+- Add libcudf wrappers around current_device_resource functions. ([#16679](https://github.com/rapidsai/cudf/pull/16679)) [@harrism](https://github.com/harrism)
+- Fix empty cluster handling in tdigest merge ([#16675](https://github.com/rapidsai/cudf/pull/16675)) [@jihoonson](https://github.com/jihoonson)
+- Remove java ColumnView.copyWithBooleanColumnAsValidity ([#16660](https://github.com/rapidsai/cudf/pull/16660)) [@revans2](https://github.com/revans2)
+- Support reading multiple PQ sources with mismatching nullability for columns ([#16639](https://github.com/rapidsai/cudf/pull/16639)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Remove arrow_io_source ([#16607](https://github.com/rapidsai/cudf/pull/16607)) [@vyasr](https://github.com/vyasr)
+- Remove legacy Arrow interop APIs ([#16590](https://github.com/rapidsai/cudf/pull/16590)) [@vyasr](https://github.com/vyasr)
+- Remove NativeFile support from cudf Python ([#16589](https://github.com/rapidsai/cudf/pull/16589)) [@vyasr](https://github.com/vyasr)
+- Revert "Make proxy NumPy arrays pass isinstance check in `cudf.pandas`" ([#16586](https://github.com/rapidsai/cudf/pull/16586)) [@Matt711](https://github.com/Matt711)
+- Align public utility function signatures with pandas 2.x ([#16565](https://github.com/rapidsai/cudf/pull/16565)) [@mroeschke](https://github.com/mroeschke)
+- Disallow cudf.Index accepting column in favor of ._from_column ([#16549](https://github.com/rapidsai/cudf/pull/16549)) [@mroeschke](https://github.com/mroeschke)
+- Refactor dictionary encoding in PQ writer to migrate to the new `cuco::static_map` ([#16541](https://github.com/rapidsai/cudf/pull/16541)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Change IPv4 convert APIs to support UINT32 instead of INT64 ([#16489](https://github.com/rapidsai/cudf/pull/16489)) [@davidwendt](https://github.com/davidwendt)
+- enable list to be forced as string in JSON reader. ([#16472](https://github.com/rapidsai/cudf/pull/16472)) [@karthikeyann](https://github.com/karthikeyann)
+- Disallow cudf.Series to accept column in favor of `._from_column` ([#16454](https://github.com/rapidsai/cudf/pull/16454)) [@mroeschke](https://github.com/mroeschke)
+- Align groupby APIs with pandas 2.x ([#16403](https://github.com/rapidsai/cudf/pull/16403)) [@mroeschke](https://github.com/mroeschke)
+- Align misc DataFrame and MultiIndex methods with pandas 2.x ([#16402](https://github.com/rapidsai/cudf/pull/16402)) [@mroeschke](https://github.com/mroeschke)
+- Align Index APIs with pandas 2.x ([#16361](https://github.com/rapidsai/cudf/pull/16361)) [@mroeschke](https://github.com/mroeschke)
+- Add `stream` param to stream compaction APIs ([#16295](https://github.com/rapidsai/cudf/pull/16295)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+
+## 🐛 Bug Fixes
+
+- Add license to the pylibcudf wheel ([#16976](https://github.com/rapidsai/cudf/pull/16976)) [@raydouglass](https://github.com/raydouglass)
+- Parse newline as whitespace character while tokenizing JSONL inputs with non-newline delimiter ([#16950](https://github.com/rapidsai/cudf/pull/16950)) [@shrshi](https://github.com/shrshi)
+- Add dask-cudf workaround for missing `rename_axis` support in cudf ([#16899](https://github.com/rapidsai/cudf/pull/16899)) [@rjzamora](https://github.com/rjzamora)
+- Update oldest deps for `pyarrow` & `numpy` ([#16883](https://github.com/rapidsai/cudf/pull/16883)) [@galipremsagar](https://github.com/galipremsagar)
+- Update labeler for pylibcudf ([#16868](https://github.com/rapidsai/cudf/pull/16868)) [@vyasr](https://github.com/vyasr)
+- Revert "Refactor mixed_semi_join using cuco::static_set" ([#16855](https://github.com/rapidsai/cudf/pull/16855)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Fix metadata after implicit array conversion from Dask cuDF ([#16842](https://github.com/rapidsai/cudf/pull/16842)) [@rjzamora](https://github.com/rjzamora)
+- Add cudf.pandas dependencies.yaml to update-version.sh ([#16840](https://github.com/rapidsai/cudf/pull/16840)) [@raydouglass](https://github.com/raydouglass)
+- Use cupy 12.2.0 as oldest dependency pinning on CUDA 12 ARM ([#16808](https://github.com/rapidsai/cudf/pull/16808)) [@bdice](https://github.com/bdice)
+- Revert "Fix empty cluster handling in tdigest merge (#16675)" ([#16800](https://github.com/rapidsai/cudf/pull/16800)) [@jihoonson](https://github.com/jihoonson)
+- Intentionally leak thread_local CUDA resources to avoid crash (part 1) ([#16787](https://github.com/rapidsai/cudf/pull/16787)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu)
+- Fix `cov`/`corr` bug in dask-cudf ([#16786](https://github.com/rapidsai/cudf/pull/16786)) [@rjzamora](https://github.com/rjzamora)
+- Fix slice_strings wide strings logic with multi-byte characters ([#16777](https://github.com/rapidsai/cudf/pull/16777)) [@davidwendt](https://github.com/davidwendt)
+- Fix nvbench output for sha512 ([#16773](https://github.com/rapidsai/cudf/pull/16773)) [@davidwendt](https://github.com/davidwendt)
+- Allow read_csv(header=None) to return int column labels in `mode.pandas_compatible` ([#16769](https://github.com/rapidsai/cudf/pull/16769)) [@mroeschke](https://github.com/mroeschke)
+- Whitespace normalization of nested column coerced as string column in JSONL inputs ([#16759](https://github.com/rapidsai/cudf/pull/16759)) [@shrshi](https://github.com/shrshi)
+- Fix DataFrame.drop(columns=cudf.Series/Index, axis=1) ([#16712](https://github.com/rapidsai/cudf/pull/16712)) [@mroeschke](https://github.com/mroeschke)
+- Use merge base when calculating changed files ([#16709](https://github.com/rapidsai/cudf/pull/16709)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Ensure we pass the has_nulls tparam to mixed_join kernels ([#16708](https://github.com/rapidsai/cudf/pull/16708)) [@abellina](https://github.com/abellina)
+- Add boost-devel to Java CI Docker image ([#16707](https://github.com/rapidsai/cudf/pull/16707)) [@jlowe](https://github.com/jlowe)
+- [BUG] Add gpu node type to cudf-pandas 3rd-party integration nightly CI job ([#16704](https://github.com/rapidsai/cudf/pull/16704)) [@Matt711](https://github.com/Matt711)
+- Fix typo in column_factories.hpp comment from 'depth 1' to 'depth 2' ([#16700](https://github.com/rapidsai/cudf/pull/16700)) [@a-hirota](https://github.com/a-hirota)
+- Fix Series.to_frame(name=None) setting a None name ([#16698](https://github.com/rapidsai/cudf/pull/16698)) [@mroeschke](https://github.com/mroeschke)
+- Disable gtests/ERROR_TEST during compute-sanitizer memcheck test ([#16691](https://github.com/rapidsai/cudf/pull/16691)) [@davidwendt](https://github.com/davidwendt)
+- Enable batched multi-source reading of JSONL files with large records ([#16687](https://github.com/rapidsai/cudf/pull/16687)) [@shrshi](https://github.com/shrshi)
+- Handle `ordered` parameter in `CategoricalIndex.__repr__` ([#16683](https://github.com/rapidsai/cudf/pull/16683)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix loc/iloc.__setitem__[:, loc] with non cupy types ([#16677](https://github.com/rapidsai/cudf/pull/16677)) [@mroeschke](https://github.com/mroeschke)
+- Fix empty cluster handling in tdigest merge ([#16675](https://github.com/rapidsai/cudf/pull/16675)) [@jihoonson](https://github.com/jihoonson)
+- Fix `cudf::rank` not getting enough params ([#16666](https://github.com/rapidsai/cudf/pull/16666)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- Fix slowdown in `CategoricalIndex.__repr__` ([#16665](https://github.com/rapidsai/cudf/pull/16665)) [@galipremsagar](https://github.com/galipremsagar)
+- Remove java ColumnView.copyWithBooleanColumnAsValidity ([#16660](https://github.com/rapidsai/cudf/pull/16660)) [@revans2](https://github.com/revans2)
+- Fix slowdown in DataFrame repr in jupyter notebook ([#16656](https://github.com/rapidsai/cudf/pull/16656)) [@galipremsagar](https://github.com/galipremsagar)
+- Preserve Series name in duplicated method. ([#16655](https://github.com/rapidsai/cudf/pull/16655)) [@bdice](https://github.com/bdice)
+- Fix interval_range right child non-zero offset ([#16651](https://github.com/rapidsai/cudf/pull/16651)) [@mroeschke](https://github.com/mroeschke)
+- fix libcudf wheel publishing, make package-type explicit in wheel publishing ([#16650](https://github.com/rapidsai/cudf/pull/16650)) [@jameslamb](https://github.com/jameslamb)
+- Revert "Hide all gtest symbols in cudftestutil (#16546)" ([#16644](https://github.com/rapidsai/cudf/pull/16644)) [@robertmaynard](https://github.com/robertmaynard)
+- Fix integer overflow in indexalator pointer logic ([#16643](https://github.com/rapidsai/cudf/pull/16643)) [@davidwendt](https://github.com/davidwendt)
+- Allow for binops between two differently sized DecimalDtypes ([#16638](https://github.com/rapidsai/cudf/pull/16638)) [@mroeschke](https://github.com/mroeschke)
+- Move pragma once in rolling/jit/operation.hpp. ([#16636](https://github.com/rapidsai/cudf/pull/16636)) [@bdice](https://github.com/bdice)
+- Fix overflow bug in low-memory JSON reader ([#16632](https://github.com/rapidsai/cudf/pull/16632)) [@shrshi](https://github.com/shrshi)
+- Add the missing `num_aggregations` axis for `groupby_max_cardinality` ([#16630](https://github.com/rapidsai/cudf/pull/16630)) [@PointKernel](https://github.com/PointKernel)
+- Fix strings::detail::copy_range when target contains nulls ([#16626](https://github.com/rapidsai/cudf/pull/16626)) [@davidwendt](https://github.com/davidwendt)
+- Fix function parameters with common dependency modified during their evaluation ([#16620](https://github.com/rapidsai/cudf/pull/16620)) [@ttnghia](https://github.com/ttnghia)
+- bug-fix: Don't enable the CUDA language if testing was requested when finding cudf ([#16615](https://github.com/rapidsai/cudf/pull/16615)) [@cryos](https://github.com/cryos)
+- bug-fix: cudf/io/json.hpp use after move ([#16609](https://github.com/rapidsai/cudf/pull/16609)) [@NicolasDenoyelle](https://github.com/NicolasDenoyelle)
+- Remove CUDA whole compilation ODR violations ([#16603](https://github.com/rapidsai/cudf/pull/16603)) [@robertmaynard](https://github.com/robertmaynard)
+- MAINT: Adapt to numpy hiding flagsobject away ([#16593](https://github.com/rapidsai/cudf/pull/16593)) [@seberg](https://github.com/seberg)
+- Revert "Make proxy NumPy arrays pass isinstance check in `cudf.pandas`" ([#16586](https://github.com/rapidsai/cudf/pull/16586)) [@Matt711](https://github.com/Matt711)
+- Switch python version to `3.10` in `cudf.pandas` pandas test scripts ([#16559](https://github.com/rapidsai/cudf/pull/16559)) [@galipremsagar](https://github.com/galipremsagar)
+- Hide all gtest symbols in cudftestutil ([#16546](https://github.com/rapidsai/cudf/pull/16546)) [@robertmaynard](https://github.com/robertmaynard)
+- Update the java code to properly deal with lists being returned as strings ([#16536](https://github.com/rapidsai/cudf/pull/16536)) [@revans2](https://github.com/revans2)
+- Register `read_parquet` and `read_csv` with dask-expr ([#16535](https://github.com/rapidsai/cudf/pull/16535)) [@rjzamora](https://github.com/rjzamora)
+- Change cudf::empty_like to not include offsets for empty strings columns ([#16529](https://github.com/rapidsai/cudf/pull/16529)) [@davidwendt](https://github.com/davidwendt)
+- Fix DataFrame reductions with median returning scalar instead of Series ([#16527](https://github.com/rapidsai/cudf/pull/16527)) [@mroeschke](https://github.com/mroeschke)
+- Allow DataFrame.sort_values(by=) to select an index level ([#16519](https://github.com/rapidsai/cudf/pull/16519)) [@mroeschke](https://github.com/mroeschke)
+- Fix `date_range(start, end, freq)` when end-start is divisible by freq ([#16516](https://github.com/rapidsai/cudf/pull/16516)) [@mroeschke](https://github.com/mroeschke)
+- Preserve array name in MultiIndex.from_arrays ([#16515](https://github.com/rapidsai/cudf/pull/16515)) [@mroeschke](https://github.com/mroeschke)
+- Disallow indexing by selecting duplicate labels ([#16514](https://github.com/rapidsai/cudf/pull/16514)) [@mroeschke](https://github.com/mroeschke)
+- Fix `.replace(Index, Index)` raising a TypeError ([#16513](https://github.com/rapidsai/cudf/pull/16513)) [@mroeschke](https://github.com/mroeschke)
+- Check index bounds in compact protocol reader. ([#16493](https://github.com/rapidsai/cudf/pull/16493)) [@bdice](https://github.com/bdice)
+- Fix build failures with GCC 13 ([#16488](https://github.com/rapidsai/cudf/pull/16488)) [@PointKernel](https://github.com/PointKernel)
+- Fix all-empty input column for strings split APIs ([#16466](https://github.com/rapidsai/cudf/pull/16466)) [@davidwendt](https://github.com/davidwendt)
+- Fix segmented-sort overlapped input/output indices ([#16463](https://github.com/rapidsai/cudf/pull/16463)) [@davidwendt](https://github.com/davidwendt)
+- Fix merge conflict for auto merge 16447 ([#16449](https://github.com/rapidsai/cudf/pull/16449)) [@davidwendt](https://github.com/davidwendt)
+
+## 📖 Documentation
+
+- Fix links in Dask cuDF documentation ([#16929](https://github.com/rapidsai/cudf/pull/16929)) [@rjzamora](https://github.com/rjzamora)
+- Improve aggregation documentation ([#16822](https://github.com/rapidsai/cudf/pull/16822)) [@PointKernel](https://github.com/PointKernel)
+- Add best practices page to Dask cuDF docs ([#16821](https://github.com/rapidsai/cudf/pull/16821)) [@rjzamora](https://github.com/rjzamora)
+- [DOC] Update Pylibcudf doc strings ([#16810](https://github.com/rapidsai/cudf/pull/16810)) [@Matt711](https://github.com/Matt711)
+- Recommending `miniforge` for conda install ([#16782](https://github.com/rapidsai/cudf/pull/16782)) [@mmccarty](https://github.com/mmccarty)
+- Add labeling pylibcudf doc pages ([#16779](https://github.com/rapidsai/cudf/pull/16779)) [@mroeschke](https://github.com/mroeschke)
+- Migrate dask-cudf README improvements to dask-cudf sphinx docs ([#16765](https://github.com/rapidsai/cudf/pull/16765)) [@rjzamora](https://github.com/rjzamora)
+- [DOC] Remove out of date section from cudf.pandas docs ([#16697](https://github.com/rapidsai/cudf/pull/16697)) [@Matt711](https://github.com/Matt711)
+- Add performance tips to cudf.pandas FAQ. ([#16693](https://github.com/rapidsai/cudf/pull/16693)) [@bdice](https://github.com/bdice)
+- Update documentation for Dask cuDF ([#16671](https://github.com/rapidsai/cudf/pull/16671)) [@rjzamora](https://github.com/rjzamora)
+- Add missing pylibcudf strings docs ([#16471](https://github.com/rapidsai/cudf/pull/16471)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- DOC: Refresh pylibcudf guide ([#15856](https://github.com/rapidsai/cudf/pull/15856)) [@lithomas1](https://github.com/lithomas1)
+
+## 🚀 New Features
+
+- Build `cudf-polars` with `build.sh` ([#16898](https://github.com/rapidsai/cudf/pull/16898)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Add polars to "all" dependency list. ([#16875](https://github.com/rapidsai/cudf/pull/16875)) [@bdice](https://github.com/bdice)
+- nvCOMP GZIP integration ([#16770](https://github.com/rapidsai/cudf/pull/16770)) [@vuule](https://github.com/vuule)
+- [FEA] Add support for `cudf.NamedAgg` ([#16744](https://github.com/rapidsai/cudf/pull/16744)) [@Matt711](https://github.com/Matt711)
+- Add experimental `filesystem="arrow"` support in `dask_cudf.read_parquet` ([#16684](https://github.com/rapidsai/cudf/pull/16684)) [@rjzamora](https://github.com/rjzamora)
+- Relax Arrow pin ([#16681](https://github.com/rapidsai/cudf/pull/16681)) [@vyasr](https://github.com/vyasr)
+- Add libcudf wrappers around current_device_resource functions. ([#16679](https://github.com/rapidsai/cudf/pull/16679)) [@harrism](https://github.com/harrism)
+- Move NDS-H examples into benchmarks ([#16663](https://github.com/rapidsai/cudf/pull/16663)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- [FEA] Add third-party library integration testing of cudf.pandas to cudf ([#16645](https://github.com/rapidsai/cudf/pull/16645)) [@Matt711](https://github.com/Matt711)
+- Make isinstance check pass for proxy ndarrays ([#16601](https://github.com/rapidsai/cudf/pull/16601)) [@Matt711](https://github.com/Matt711)
+- [FEA] Add an environment variable to fail on fallback in `cudf.pandas` ([#16562](https://github.com/rapidsai/cudf/pull/16562)) [@Matt711](https://github.com/Matt711)
+- [FEA] Add support for `cudf.unique` ([#16554](https://github.com/rapidsai/cudf/pull/16554)) [@Matt711](https://github.com/Matt711)
+- [FEA] Support named aggregations in `df.groupby().agg()` ([#16528](https://github.com/rapidsai/cudf/pull/16528)) [@Matt711](https://github.com/Matt711)
+- Change IPv4 convert APIs to support UINT32 instead of INT64 ([#16489](https://github.com/rapidsai/cudf/pull/16489)) [@davidwendt](https://github.com/davidwendt)
+- enable list to be forced as string in JSON reader. ([#16472](https://github.com/rapidsai/cudf/pull/16472)) [@karthikeyann](https://github.com/karthikeyann)
+- Remove cuDF dependency from pylibcudf column from_device tests ([#16441](https://github.com/rapidsai/cudf/pull/16441)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Enable cudf.pandas REPL and -c command support ([#16428](https://github.com/rapidsai/cudf/pull/16428)) [@bdice](https://github.com/bdice)
+- Setup pylibcudf package ([#16299](https://github.com/rapidsai/cudf/pull/16299)) [@lithomas1](https://github.com/lithomas1)
+- Add a libcudf/thrust-based TPC-H derived datagen ([#16294](https://github.com/rapidsai/cudf/pull/16294)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- Make proxy NumPy arrays pass isinstance check in `cudf.pandas` ([#16286](https://github.com/rapidsai/cudf/pull/16286)) [@Matt711](https://github.com/Matt711)
+- Add skiprows and nrows to parquet reader ([#16214](https://github.com/rapidsai/cudf/pull/16214)) [@lithomas1](https://github.com/lithomas1)
+- Upgrade to nvcomp 4.0.1 ([#16076](https://github.com/rapidsai/cudf/pull/16076)) [@vuule](https://github.com/vuule)
+- Migrate ORC reader to pylibcudf ([#16042](https://github.com/rapidsai/cudf/pull/16042)) [@lithomas1](https://github.com/lithomas1)
+- JSON reader validation of values ([#15968](https://github.com/rapidsai/cudf/pull/15968)) [@karthikeyann](https://github.com/karthikeyann)
+- Implement exposed null mask APIs in pylibcudf ([#15908](https://github.com/rapidsai/cudf/pull/15908)) [@charlesbluca](https://github.com/charlesbluca)
+- Word-based nvtext::minhash function ([#15368](https://github.com/rapidsai/cudf/pull/15368)) [@davidwendt](https://github.com/davidwendt)
+
+## 🛠️ Improvements
+
+- Make tests deterministic ([#16910](https://github.com/rapidsai/cudf/pull/16910)) [@galipremsagar](https://github.com/galipremsagar)
+- Update update-version.sh to use packaging lib ([#16891](https://github.com/rapidsai/cudf/pull/16891)) [@AyodeAwe](https://github.com/AyodeAwe)
+- Pin polars for 24.10 and update polars test suite xfail list ([#16886](https://github.com/rapidsai/cudf/pull/16886)) [@wence-](https://github.com/wence-)
+- Add in support for setting delim when parsing JSON through java (#16867) ([#16880](https://github.com/rapidsai/cudf/pull/16880)) [@revans2](https://github.com/revans2)
+- Remove unnecessary flag from build.sh ([#16879](https://github.com/rapidsai/cudf/pull/16879)) [@vyasr](https://github.com/vyasr)
+- Ignore numba warning specific to ARM runners ([#16872](https://github.com/rapidsai/cudf/pull/16872)) [@galipremsagar](https://github.com/galipremsagar)
+- Display deltas for `cudf.pandas` test summary ([#16864](https://github.com/rapidsai/cudf/pull/16864)) [@galipremsagar](https://github.com/galipremsagar)
+- Switch to using native `traceback` ([#16851](https://github.com/rapidsai/cudf/pull/16851)) [@galipremsagar](https://github.com/galipremsagar)
+- JSON tree algorithm code reorg ([#16836](https://github.com/rapidsai/cudf/pull/16836)) [@karthikeyann](https://github.com/karthikeyann)
+- Add string.repeats API to pylibcudf ([#16834](https://github.com/rapidsai/cudf/pull/16834)) [@mroeschke](https://github.com/mroeschke)
+- Use CI workflow branch 'branch-24.10' again ([#16832](https://github.com/rapidsai/cudf/pull/16832)) [@jameslamb](https://github.com/jameslamb)
+- Rename the NDS-H benchmark binaries ([#16831](https://github.com/rapidsai/cudf/pull/16831)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- Add string.findall APIs to pylibcudf ([#16825](https://github.com/rapidsai/cudf/pull/16825)) [@mroeschke](https://github.com/mroeschke)
+- Add string.extract APIs to pylibcudf ([#16823](https://github.com/rapidsai/cudf/pull/16823)) [@mroeschke](https://github.com/mroeschke)
+- use get-pr-info from nv-gha-runners ([#16819](https://github.com/rapidsai/cudf/pull/16819)) [@AyodeAwe](https://github.com/AyodeAwe)
+- Add string.contains APIs to pylibcudf ([#16814](https://github.com/rapidsai/cudf/pull/16814)) [@mroeschke](https://github.com/mroeschke)
+- Forward-merge branch-24.08 to branch-24.10 ([#16813](https://github.com/rapidsai/cudf/pull/16813)) [@bdice](https://github.com/bdice)
+- Add io_type axis with default `PINNED_BUFFER` to nvbench PQ multithreaded reader ([#16809](https://github.com/rapidsai/cudf/pull/16809)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Update fmt (to 11.0.2) and spdlog (to 1.14.1). ([#16806](https://github.com/rapidsai/cudf/pull/16806)) [@jameslamb](https://github.com/jameslamb)
+- Add ability to set parquet row group max #rows and #bytes in java ([#16805](https://github.com/rapidsai/cudf/pull/16805)) [@pmattione-nvidia](https://github.com/pmattione-nvidia)
+- Add in option for Java JSON APIs to do column pruning in CUDF ([#16796](https://github.com/rapidsai/cudf/pull/16796)) [@revans2](https://github.com/revans2)
+- Support drop_first in get_dummies ([#16795](https://github.com/rapidsai/cudf/pull/16795)) [@mroeschke](https://github.com/mroeschke)
+- Exposed stream-ordering to join API ([#16793](https://github.com/rapidsai/cudf/pull/16793)) [@lamarrr](https://github.com/lamarrr)
+- Add string.attributes APIs to pylibcudf ([#16785](https://github.com/rapidsai/cudf/pull/16785)) [@mroeschke](https://github.com/mroeschke)
+- Java: Make ColumnVector.fromViewWithContiguousAllocation public ([#16784](https://github.com/rapidsai/cudf/pull/16784)) [@jlowe](https://github.com/jlowe)
+- Add partitioning APIs to pylibcudf ([#16781](https://github.com/rapidsai/cudf/pull/16781)) [@mroeschke](https://github.com/mroeschke)
+- Optimization of tdigest merge aggregation. ([#16780](https://github.com/rapidsai/cudf/pull/16780)) [@nvdbaranec](https://github.com/nvdbaranec)
+- use libkvikio wheels in wheel builds ([#16778](https://github.com/rapidsai/cudf/pull/16778)) [@jameslamb](https://github.com/jameslamb)
+- Exposed stream-ordering to datetime API ([#16774](https://github.com/rapidsai/cudf/pull/16774)) [@lamarrr](https://github.com/lamarrr)
+- Add io/timezone APIs to pylibcudf ([#16771](https://github.com/rapidsai/cudf/pull/16771)) [@mroeschke](https://github.com/mroeschke)
+- Remove `MultiIndex._poplevel` inplace implementation. ([#16767](https://github.com/rapidsai/cudf/pull/16767)) [@mroeschke](https://github.com/mroeschke)
+- allow pandas patch version to float in cudf-pandas unit tests ([#16763](https://github.com/rapidsai/cudf/pull/16763)) [@jameslamb](https://github.com/jameslamb)
+- Simplify the nvCOMP adapter ([#16762](https://github.com/rapidsai/cudf/pull/16762)) [@vuule](https://github.com/vuule)
+- Add labeling APIs to pylibcudf ([#16761](https://github.com/rapidsai/cudf/pull/16761)) [@mroeschke](https://github.com/mroeschke)
+- Add transform APIs to pylibcudf ([#16760](https://github.com/rapidsai/cudf/pull/16760)) [@mroeschke](https://github.com/mroeschke)
+- Add a benchmark to study Parquet reader's performance for wide tables ([#16751](https://github.com/rapidsai/cudf/pull/16751)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Change the Parquet writer's `default_row_group_size_bytes` from 128MB to inf ([#16750](https://github.com/rapidsai/cudf/pull/16750)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Add transpose API to pylibcudf ([#16749](https://github.com/rapidsai/cudf/pull/16749)) [@mroeschke](https://github.com/mroeschke)
+- Add support for Python 3.12, update Kafka dependencies to 2.5.x ([#16745](https://github.com/rapidsai/cudf/pull/16745)) [@jameslamb](https://github.com/jameslamb)
+- Generate GPU vs CPU usage metrics per pytest file in pandas testsuite for `cudf.pandas` ([#16739](https://github.com/rapidsai/cudf/pull/16739)) [@galipremsagar](https://github.com/galipremsagar)
+- Refactor cudf pandas integration tests CI ([#16728](https://github.com/rapidsai/cudf/pull/16728)) [@Matt711](https://github.com/Matt711)
+- Remove ERROR_TEST gtest from libcudf ([#16722](https://github.com/rapidsai/cudf/pull/16722)) [@davidwendt](https://github.com/davidwendt)
+- Use Series._from_column more consistently to avoid validation ([#16716](https://github.com/rapidsai/cudf/pull/16716)) [@mroeschke](https://github.com/mroeschke)
+- remove some unnecessary libcudf nightly builds ([#16714](https://github.com/rapidsai/cudf/pull/16714)) [@jameslamb](https://github.com/jameslamb)
+- Remove xfail from torch-cudf.pandas integration test ([#16705](https://github.com/rapidsai/cudf/pull/16705)) [@Matt711](https://github.com/Matt711)
+- Add return type annotations to MultiIndex ([#16696](https://github.com/rapidsai/cudf/pull/16696)) [@mroeschke](https://github.com/mroeschke)
+- Add type annotations to Index classes, utilize _from_column more ([#16695](https://github.com/rapidsai/cudf/pull/16695)) [@mroeschke](https://github.com/mroeschke)
+- Have interval_range use IntervalIndex.from_breaks, remove column_empty_same_mask ([#16694](https://github.com/rapidsai/cudf/pull/16694)) [@mroeschke](https://github.com/mroeschke)
+- Increase timeouts for couple of tests ([#16692](https://github.com/rapidsai/cudf/pull/16692)) [@galipremsagar](https://github.com/galipremsagar)
+- Replace raw device_memory_resource pointer in pylibcudf Cython ([#16674](https://github.com/rapidsai/cudf/pull/16674)) [@harrism](https://github.com/harrism)
+- switch from typing.Callable to collections.abc.Callable ([#16670](https://github.com/rapidsai/cudf/pull/16670)) [@jameslamb](https://github.com/jameslamb)
+- Update rapidsai/pre-commit-hooks ([#16669](https://github.com/rapidsai/cudf/pull/16669)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Multi-file and Parquet-aware prefetching from remote storage ([#16657](https://github.com/rapidsai/cudf/pull/16657)) [@rjzamora](https://github.com/rjzamora)
+- Access Frame attributes instead of ColumnAccessor attributes when available ([#16652](https://github.com/rapidsai/cudf/pull/16652)) [@mroeschke](https://github.com/mroeschke)
+- Use non-mangled type names in nvbench output ([#16649](https://github.com/rapidsai/cudf/pull/16649)) [@davidwendt](https://github.com/davidwendt)
+- Add pylibcudf build dir in build.sh for `clean` ([#16648](https://github.com/rapidsai/cudf/pull/16648)) [@galipremsagar](https://github.com/galipremsagar)
+- Prune workflows based on changed files ([#16642](https://github.com/rapidsai/cudf/pull/16642)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Remove arrow dependency ([#16640](https://github.com/rapidsai/cudf/pull/16640)) [@vyasr](https://github.com/vyasr)
+- Support reading multiple PQ sources with mismatching nullability for columns ([#16639](https://github.com/rapidsai/cudf/pull/16639)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Drop Python 3.9 support ([#16637](https://github.com/rapidsai/cudf/pull/16637)) [@jameslamb](https://github.com/jameslamb)
+- Support DecimalDtype meta in dask_cudf ([#16634](https://github.com/rapidsai/cudf/pull/16634)) [@mroeschke](https://github.com/mroeschke)
+- Add `num_multiprocessors` utility ([#16628](https://github.com/rapidsai/cudf/pull/16628)) [@PointKernel](https://github.com/PointKernel)
+- Annotate `ColumnAccessor._data` labels as `Hashable` ([#16623](https://github.com/rapidsai/cudf/pull/16623)) [@mroeschke](https://github.com/mroeschke)
+- Remove build_categorical_column in favor of CategoricalColumn constructor ([#16617](https://github.com/rapidsai/cudf/pull/16617)) [@mroeschke](https://github.com/mroeschke)
+- Move apply_boolean_mask benchmark to nvbench ([#16616](https://github.com/rapidsai/cudf/pull/16616)) [@davidwendt](https://github.com/davidwendt)
+- Revise `get_reader_filepath_or_buffer` to handle a list of data sources ([#16613](https://github.com/rapidsai/cudf/pull/16613)) [@rjzamora](https://github.com/rjzamora)
+- do not install cudf in cudf_polars wheel tests ([#16612](https://github.com/rapidsai/cudf/pull/16612)) [@jameslamb](https://github.com/jameslamb)
+- remove streamz git dependency, standardize build dependency names, consolidate some dependency lists ([#16611](https://github.com/rapidsai/cudf/pull/16611)) [@jameslamb](https://github.com/jameslamb)
+- Fix C++ and Cython io types ([#16610](https://github.com/rapidsai/cudf/pull/16610)) [@vyasr](https://github.com/vyasr)
+- Remove arrow_io_source ([#16607](https://github.com/rapidsai/cudf/pull/16607)) [@vyasr](https://github.com/vyasr)
+- Remove thrust::optional from expression evaluator ([#16604](https://github.com/rapidsai/cudf/pull/16604)) [@bdice](https://github.com/bdice)
+- Add stricter typing and validation to ColumnAccessor ([#16602](https://github.com/rapidsai/cudf/pull/16602)) [@mroeschke](https://github.com/mroeschke)
+- make more use of YAML anchors in dependencies.yaml ([#16597](https://github.com/rapidsai/cudf/pull/16597)) [@jameslamb](https://github.com/jameslamb)
+- Enable testing `cudf.pandas` unit tests for all minor versions of pandas ([#16595](https://github.com/rapidsai/cudf/pull/16595)) [@galipremsagar](https://github.com/galipremsagar)
+- Extend the Parquet writer's dictionary encoding benchmark. ([#16591](https://github.com/rapidsai/cudf/pull/16591)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Remove legacy Arrow interop APIs ([#16590](https://github.com/rapidsai/cudf/pull/16590)) [@vyasr](https://github.com/vyasr)
+- Remove NativeFile support from cudf Python ([#16589](https://github.com/rapidsai/cudf/pull/16589)) [@vyasr](https://github.com/vyasr)
+- Add build job for pylibcudf ([#16587](https://github.com/rapidsai/cudf/pull/16587)) [@vyasr](https://github.com/vyasr)
+- Add `public` qualifier for some member functions in Java class `Schema` ([#16583](https://github.com/rapidsai/cudf/pull/16583)) [@ttnghia](https://github.com/ttnghia)
+- Enable gtests previously disabled for compute-sanitizer bug ([#16581](https://github.com/rapidsai/cudf/pull/16581)) [@davidwendt](https://github.com/davidwendt)
+- [FEA] Add filesystem argument to `cudf.read_parquet` ([#16577](https://github.com/rapidsai/cudf/pull/16577)) [@rjzamora](https://github.com/rjzamora)
+- Ensure size is always passed to NumericalColumn ([#16576](https://github.com/rapidsai/cudf/pull/16576)) [@mroeschke](https://github.com/mroeschke)
+- standardize and consolidate wheel installations in testing scripts ([#16575](https://github.com/rapidsai/cudf/pull/16575)) [@jameslamb](https://github.com/jameslamb)
+- Performance improvement for strings::slice for wide strings ([#16574](https://github.com/rapidsai/cudf/pull/16574)) [@davidwendt](https://github.com/davidwendt)
+- Add `ToCudfBackend` expression to dask-cudf ([#16573](https://github.com/rapidsai/cudf/pull/16573)) [@rjzamora](https://github.com/rjzamora)
+- CI: Test against old versions of key dependencies ([#16570](https://github.com/rapidsai/cudf/pull/16570)) [@seberg](https://github.com/seberg)
+- Replace `NativeFile` dependency in dask-cudf Parquet reader ([#16569](https://github.com/rapidsai/cudf/pull/16569)) [@rjzamora](https://github.com/rjzamora)
+- Align public utility function signatures with pandas 2.x ([#16565](https://github.com/rapidsai/cudf/pull/16565)) [@mroeschke](https://github.com/mroeschke)
+- Move libcudf reduction google-benchmarks to nvbench ([#16564](https://github.com/rapidsai/cudf/pull/16564)) [@davidwendt](https://github.com/davidwendt)
+- Rework strings::slice benchmark to use nvbench ([#16563](https://github.com/rapidsai/cudf/pull/16563)) [@davidwendt](https://github.com/davidwendt)
+- Reenable arrow tests ([#16556](https://github.com/rapidsai/cudf/pull/16556)) [@vyasr](https://github.com/vyasr)
+- Clean up reshaping ops ([#16553](https://github.com/rapidsai/cudf/pull/16553)) [@mroeschke](https://github.com/mroeschke)
+- Disallow cudf.Index accepting column in favor of ._from_column ([#16549](https://github.com/rapidsai/cudf/pull/16549)) [@mroeschke](https://github.com/mroeschke)
+- Rewrite remaining Python Arrow interop conversions using the C Data Interface ([#16548](https://github.com/rapidsai/cudf/pull/16548)) [@vyasr](https://github.com/vyasr)
+- [REVIEW] JSON host tree algorithms ([#16545](https://github.com/rapidsai/cudf/pull/16545)) [@shrshi](https://github.com/shrshi)
+- Refactor dictionary encoding in PQ writer to migrate to the new `cuco::static_map` ([#16541](https://github.com/rapidsai/cudf/pull/16541)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Remove hardcoded versions from workflows. ([#16540](https://github.com/rapidsai/cudf/pull/16540)) [@bdice](https://github.com/bdice)
+- Ensure comparisons with pyints and integer series always succeed ([#16532](https://github.com/rapidsai/cudf/pull/16532)) [@seberg](https://github.com/seberg)
+- Remove unneeded output size parameter from internal count_matches utility ([#16531](https://github.com/rapidsai/cudf/pull/16531)) [@davidwendt](https://github.com/davidwendt)
+- Remove invalid column_view usage in string-scalar-to-column function ([#16530](https://github.com/rapidsai/cudf/pull/16530)) [@davidwendt](https://github.com/davidwendt)
+- Raise NotImplementedError for Series.rename that's not a scalar ([#16525](https://github.com/rapidsai/cudf/pull/16525)) [@mroeschke](https://github.com/mroeschke)
+- Remove deprecated public APIs from libcudf ([#16524](https://github.com/rapidsai/cudf/pull/16524)) [@davidwendt](https://github.com/davidwendt)
+- Return Interval object in pandas compat mode for IntervalIndex reductions ([#16523](https://github.com/rapidsai/cudf/pull/16523)) [@mroeschke](https://github.com/mroeschke)
+- Update json normalization to take device_buffer ([#16520](https://github.com/rapidsai/cudf/pull/16520)) [@karthikeyann](https://github.com/karthikeyann)
+- Rework cudf::io::text::byte_range_info class member functions ([#16518](https://github.com/rapidsai/cudf/pull/16518)) [@davidwendt](https://github.com/davidwendt)
+- Remove unneeded pair-iterator benchmark ([#16511](https://github.com/rapidsai/cudf/pull/16511)) [@davidwendt](https://github.com/davidwendt)
+- Update pre-commit hooks ([#16510](https://github.com/rapidsai/cudf/pull/16510)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Improve update-version.sh ([#16506](https://github.com/rapidsai/cudf/pull/16506)) [@bdice](https://github.com/bdice)
+- Use tool.scikit-build.cmake.version, set scikit-build-core minimum-version ([#16503](https://github.com/rapidsai/cudf/pull/16503)) [@jameslamb](https://github.com/jameslamb)
+- Pass batch size to JSON reader using environment variable ([#16502](https://github.com/rapidsai/cudf/pull/16502)) [@shrshi](https://github.com/shrshi)
+- Remove a deprecated multibyte_split API ([#16501](https://github.com/rapidsai/cudf/pull/16501)) [@davidwendt](https://github.com/davidwendt)
+- Add interop example for `arrow::StringViewArray` to `cudf::column` ([#16498](https://github.com/rapidsai/cudf/pull/16498)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- Add keep option to distinct nvbench ([#16497](https://github.com/rapidsai/cudf/pull/16497)) [@bdice](https://github.com/bdice)
+- Use more idomatic cudf APIs in dask_cudf meta generation ([#16487](https://github.com/rapidsai/cudf/pull/16487)) [@mroeschke](https://github.com/mroeschke)
+- Fix typo in dispatch_row_equal. ([#16473](https://github.com/rapidsai/cudf/pull/16473)) [@bdice](https://github.com/bdice)
+- Use explicit construction of column subclass instead of `build_column` when type is known ([#16470](https://github.com/rapidsai/cudf/pull/16470)) [@mroeschke](https://github.com/mroeschke)
+- Move exception handler into pylibcudf from cudf ([#16468](https://github.com/rapidsai/cudf/pull/16468)) [@lithomas1](https://github.com/lithomas1)
+- Make StructColumn.__init__ strict ([#16467](https://github.com/rapidsai/cudf/pull/16467)) [@mroeschke](https://github.com/mroeschke)
+- Make ListColumn.__init__ strict ([#16465](https://github.com/rapidsai/cudf/pull/16465)) [@mroeschke](https://github.com/mroeschke)
+- Make Timedelta/DatetimeColumn.__init__ strict ([#16464](https://github.com/rapidsai/cudf/pull/16464)) [@mroeschke](https://github.com/mroeschke)
+- Make NumericalColumn.__init__ strict ([#16457](https://github.com/rapidsai/cudf/pull/16457)) [@mroeschke](https://github.com/mroeschke)
+- Make CategoricalColumn.__init__ strict ([#16456](https://github.com/rapidsai/cudf/pull/16456)) [@mroeschke](https://github.com/mroeschke)
+- Disallow cudf.Series to accept column in favor of `._from_column` ([#16454](https://github.com/rapidsai/cudf/pull/16454)) [@mroeschke](https://github.com/mroeschke)
+- Expose `stream` param in transform APIs ([#16452](https://github.com/rapidsai/cudf/pull/16452)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- Add upper bound pin for polars ([#16442](https://github.com/rapidsai/cudf/pull/16442)) [@wence-](https://github.com/wence-)
+- Make (Indexed)Frame.__init__ require data (and index) ([#16430](https://github.com/rapidsai/cudf/pull/16430)) [@mroeschke](https://github.com/mroeschke)
+- Add Java APIs to copy column data to host asynchronously ([#16429](https://github.com/rapidsai/cudf/pull/16429)) [@jlowe](https://github.com/jlowe)
+- Update docs of the TPC-H derived examples ([#16423](https://github.com/rapidsai/cudf/pull/16423)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- Use RMM adaptor constructors instead of factories. ([#16414](https://github.com/rapidsai/cudf/pull/16414)) [@bdice](https://github.com/bdice)
+- Align ewm APIs with pandas 2.x ([#16413](https://github.com/rapidsai/cudf/pull/16413)) [@mroeschke](https://github.com/mroeschke)
+- Remove checking for specific tests in memcheck script ([#16412](https://github.com/rapidsai/cudf/pull/16412)) [@davidwendt](https://github.com/davidwendt)
+- Add stream parameter to reshape APIs ([#16410](https://github.com/rapidsai/cudf/pull/16410)) [@davidwendt](https://github.com/davidwendt)
+- Align groupby APIs with pandas 2.x ([#16403](https://github.com/rapidsai/cudf/pull/16403)) [@mroeschke](https://github.com/mroeschke)
+- Align misc DataFrame and MultiIndex methods with pandas 2.x ([#16402](https://github.com/rapidsai/cudf/pull/16402)) [@mroeschke](https://github.com/mroeschke)
+- update some branch references in GitHub Actions configs ([#16397](https://github.com/rapidsai/cudf/pull/16397)) [@jameslamb](https://github.com/jameslamb)
+- Support reading matching projected and filter cols from Parquet files with otherwise mismatched schemas ([#16394](https://github.com/rapidsai/cudf/pull/16394)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Merge branch-24.08 into branch-24.10 ([#16393](https://github.com/rapidsai/cudf/pull/16393)) [@jameslamb](https://github.com/jameslamb)
+- Add query 10 to the TPC-H suite ([#16392](https://github.com/rapidsai/cudf/pull/16392)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- Use `make_host_vector` instead of `make_std_vector` to facilitate pinned memory optimizations ([#16386](https://github.com/rapidsai/cudf/pull/16386)) [@vuule](https://github.com/vuule)
+- Fix some issues with deprecated / removed cccl facilities ([#16377](https://github.com/rapidsai/cudf/pull/16377)) [@miscco](https://github.com/miscco)
+- Align IntervalIndex APIs with pandas 2.x ([#16371](https://github.com/rapidsai/cudf/pull/16371)) [@mroeschke](https://github.com/mroeschke)
+- Align CategoricalIndex APIs with pandas 2.x ([#16369](https://github.com/rapidsai/cudf/pull/16369)) [@mroeschke](https://github.com/mroeschke)
+- Align TimedeltaIndex APIs with pandas 2.x ([#16368](https://github.com/rapidsai/cudf/pull/16368)) [@mroeschke](https://github.com/mroeschke)
+- Align DatetimeIndex APIs with pandas 2.x ([#16367](https://github.com/rapidsai/cudf/pull/16367)) [@mroeschke](https://github.com/mroeschke)
+- fix [tool.setuptools] reference in custreamz config ([#16365](https://github.com/rapidsai/cudf/pull/16365)) [@jameslamb](https://github.com/jameslamb)
+- Align Index APIs with pandas 2.x ([#16361](https://github.com/rapidsai/cudf/pull/16361)) [@mroeschke](https://github.com/mroeschke)
+- Rebuild for & Support NumPy 2 ([#16300](https://github.com/rapidsai/cudf/pull/16300)) [@jakirkham](https://github.com/jakirkham)
+- Add `stream` param to stream compaction APIs ([#16295](https://github.com/rapidsai/cudf/pull/16295)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- Added batch memset to memset data and validity buffers in parquet reader ([#16281](https://github.com/rapidsai/cudf/pull/16281)) [@sdrp713](https://github.com/sdrp713)
+- Deduplicate decimal32/decimal64 to decimal128 conversion function ([#16236](https://github.com/rapidsai/cudf/pull/16236)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Refactor mixed_semi_join using cuco::static_set ([#16230](https://github.com/rapidsai/cudf/pull/16230)) [@srinivasyadav18](https://github.com/srinivasyadav18)
+- Improve performance of hash_character_ngrams using warp-per-string kernel ([#16212](https://github.com/rapidsai/cudf/pull/16212)) [@davidwendt](https://github.com/davidwendt)
+- Add environment variable to log cudf.pandas fallback calls ([#16161](https://github.com/rapidsai/cudf/pull/16161)) [@mroeschke](https://github.com/mroeschke)
+- Add libcudf example with large strings ([#15983](https://github.com/rapidsai/cudf/pull/15983)) [@davidwendt](https://github.com/davidwendt)
+- JSON tree algorithms refactor I: CSR data structure for column tree ([#15979](https://github.com/rapidsai/cudf/pull/15979)) [@shrshi](https://github.com/shrshi)
+- Support multiple new-line characters in regex APIs ([#15961](https://github.com/rapidsai/cudf/pull/15961)) [@davidwendt](https://github.com/davidwendt)
+- adding wheel build for libcudf ([#15483](https://github.com/rapidsai/cudf/pull/15483)) [@msarahan](https://github.com/msarahan)
+- Replace usages of `thrust::optional` with `std::optional` ([#15091](https://github.com/rapidsai/cudf/pull/15091)) [@miscco](https://github.com/miscco)
+
 # cudf 24.08.00 (7 Aug 2024)
 
 ## 🚨 Breaking Changes
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index f9cdde7c2b7..b55af21a300 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -293,8 +293,8 @@ In order to run doxygen as a linter on C++/CUDA code, run
 ./ci/checks/doxygen.sh
 ```
 
-Python code runs several linters including [Black](https://black.readthedocs.io/en/stable/),
-[isort](https://pycqa.github.io/isort/), and [flake8](https://flake8.pycqa.org/en/latest/).
+Python code runs several linters including [Ruff](https://docs.astral.sh/ruff/)
+with its various rules for Black-like formatting or Isort.
 
 cuDF also uses [codespell](https://github.com/codespell-project/codespell) to find spelling
 mistakes, and this check is run as a pre-commit hook. To apply the suggested spelling fixes,
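With isort dropped from pre-commit and the docs now pointing at Ruff, the equivalent local commands are roughly the following (assuming a standard Ruff install; the `ruff` hook above runs `ruff check` with the new `--fix` argument, and `ruff-format` handles the Black-style formatting):

    ruff check --fix python/    # lint + import sorting, with autofixes
    ruff format python/         # Black-like formatting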
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index c67d127e635..4290d013fe4 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -5,7 +5,6 @@ set -euo pipefail
 
 export RAPIDS_VERSION="$(rapids-version)"
 export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
-export RAPIDS_VERSION_NUMBER="$RAPIDS_VERSION_MAJOR_MINOR"
 
 rapids-logger "Create test conda environment"
 . /opt/conda/etc/profile.d/conda.sh
@@ -29,13 +28,16 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
   --channel "${PYTHON_CHANNEL}" \
-  libcudf pylibcudf cudf dask-cudf
+  "libcudf=${RAPIDS_VERSION}" \
+  "pylibcudf=${RAPIDS_VERSION}" \
+  "cudf=${RAPIDS_VERSION}" \
+  "dask-cudf=${RAPIDS_VERSION}"
 
 export RAPIDS_DOCS_DIR="$(mktemp -d)"
 
 rapids-logger "Build CPP docs"
 pushd cpp/doxygen
-aws s3 cp s3://rapidsai-docs/librmm/html/${RAPIDS_VERSION_NUMBER}/rmm.tag . || echo "Failed to download rmm Doxygen tag"
+aws s3 cp s3://rapidsai-docs/librmm/html/${RAPIDS_VERSION_MAJOR_MINOR}/rmm.tag . || echo "Failed to download rmm Doxygen tag"
 doxygen Doxyfile
 mkdir -p "${RAPIDS_DOCS_DIR}/libcudf/html"
 mv html/* "${RAPIDS_DOCS_DIR}/libcudf/html"
@@ -55,4 +57,4 @@ mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/html"
 mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/dask-cudf/html"
 popd
 
-rapids-upload-docs
+RAPIDS_VERSION_NUMBER="${RAPIDS_VERSION_MAJOR_MINOR}" rapids-upload-docs
diff --git a/ci/build_python.sh b/ci/build_python.sh
index 2e3f70ba767..823d7f62290 100755
--- a/ci/build_python.sh
+++ b/ci/build_python.sh
@@ -52,5 +52,10 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
   conda/recipes/custreamz
 
+RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
+  --no-test \
+  --channel "${CPP_CHANNEL}" \
+  --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
+  conda/recipes/cudf-polars
+
 rapids-upload-conda-to-s3 python
diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh
index 8975381ceba..91bc071583e 100755
--- a/ci/build_wheel_libcudf.sh
+++ b/ci/build_wheel_libcudf.sh
@@ -5,11 +5,15 @@ set -euo pipefail
 
 package_dir="python/libcudf"
 
+export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON"
 ./ci/build_wheel.sh ${package_dir}
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 
 mkdir -p ${package_dir}/final_dist
-python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/*
+python -m auditwheel repair \
+    --exclude libnvcomp.so.4 \
+    -w ${package_dir}/final_dist \
+    ${package_dir}/dist/*
 
 RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp ${package_dir}/final_dist
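`auditwheel repair --exclude` leaves the named shared library out of the repaired wheel; given the new `USE_NVCOMP_RUNTIME_WHEEL=ON` flag, the intent is presumably that `libnvcomp.so.4` is supplied at runtime by a separate nvCOMP wheel instead of being vendored. A quick local sanity check (paths assumed from the script above):

    unzip -l python/libcudf/final_dist/libcudf_*.whl | grep libnvcomp \
      && echo "unexpected: libnvcomp bundled in wheel" \
      || echo "ok: libnvcomp excluded"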
diff --git a/ci/clang_tidy.sh b/ci/clang_tidy.sh
new file mode 100755
index 00000000000..4d5d3fc3136
--- /dev/null
+++ b/ci/clang_tidy.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+rapids-logger "Create clang-tidy conda environment"
+. /opt/conda/etc/profile.d/conda.sh
+
+ENV_YAML_DIR="$(mktemp -d)"
+
+rapids-dependency-file-generator \
+  --output conda \
+  --file-key clang_tidy \
+  --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"
+
+rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n clang_tidy
+
+# Temporarily allow unbound variables for conda activation.
+set +u
+conda activate clang_tidy
+set -u
+
+RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
+
+source rapids-configure-sccache
+
+# Run the build via CMake, which will run clang-tidy when CUDF_CLANG_TIDY is enabled.
+cmake -S cpp -B cpp/build -DCMAKE_BUILD_TYPE=Release -DCUDF_CLANG_TIDY=ON -GNinja
+cmake --build cpp/build
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 870901d223b..95f36653c2c 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -93,6 +93,7 @@ sed_runner "s/cudf-.*-SNAPSHOT/cudf-${NEXT_FULL_JAVA_TAG}/g" java/ci/README.md
 # .devcontainer files
 find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do
     sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${filename}"
+    sed_runner "s@rapidsai/devcontainers/features/cuda:[0-9.]*@rapidsai/devcontainers/features/cuda:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
     sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
     sed_runner "s@rapids-\${localWorkspaceFolderBasename}-[0-9.]*@rapids-\${localWorkspaceFolderBasename}-${NEXT_SHORT_TAG}@g" "${filename}"
 done
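The added `sed_runner` expression only rewrites the CUDA feature tag; its substitution can be checked in isolation (using 24.12 in place of `${NEXT_SHORT_TAG_PEP440}`):

    echo '"ghcr.io/rapidsai/devcontainers/features/cuda:24.10": {' \
      | sed 's@rapidsai/devcontainers/features/cuda:[0-9.]*@rapidsai/devcontainers/features/cuda:24.12@'
    # prints: "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": {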
diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh
index 0819eacf636..2439af5b644 100755
--- a/ci/run_cudf_examples.sh
+++ b/ci/run_cudf_examples.sh
@@ -23,7 +23,10 @@ compute-sanitizer --tool memcheck custom_optimized names.csv
 compute-sanitizer --tool memcheck custom_prealloc names.csv
 compute-sanitizer --tool memcheck custom_with_malloc names.csv
 
-compute-sanitizer --tool memcheck parquet_io
+compute-sanitizer --tool memcheck parquet_io example.parquet
 compute-sanitizer --tool memcheck parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD TRUE
 
+compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet
+compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet 4 DEVICE_BUFFER 2 2
+
 exit ${EXITCODE}
diff --git a/ci/test_cpp_common.sh b/ci/test_cpp_common.sh
index f5a8de543f6..8cd78eb11c2 100755
--- a/ci/test_cpp_common.sh
+++ b/ci/test_cpp_common.sh
@@ -5,6 +5,8 @@ set -euo pipefail
 
 . /opt/conda/etc/profile.d/conda.sh
 
+RAPIDS_VERSION="$(rapids-version)"
+
 rapids-logger "Generate C++ testing dependencies"
 
 ENV_YAML_DIR="$(mktemp -d)"
@@ -31,7 +33,10 @@ rapids-print-env
 
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
-  libcudf libcudf_kafka libcudf-tests libcudf-example
+  "libcudf=${RAPIDS_VERSION}" \
+  "libcudf_kafka=${RAPIDS_VERSION}" \
+  "libcudf-tests=${RAPIDS_VERSION}" \
+  "libcudf-example=${RAPIDS_VERSION}"
 
 rapids-logger "Check GPU usage"
 nvidia-smi
diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh
index 55399d0371a..f5bcdc62604 100755
--- a/ci/test_cudf_polars_polars_tests.sh
+++ b/ci/test_cudf_polars_polars_tests.sh
@@ -24,14 +24,17 @@ rapids-logger "Download wheels"
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist
 
-# Download the pylibcudf built in the previous step
-RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep
+# Download libcudf and pylibcudf built in the previous step
+RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./local-libcudf-dep
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./local-pylibcudf-dep
 
-rapids-logger "Install pylibcudf"
-python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl
+rapids-logger "Install libcudf, pylibcudf and cudf_polars"
+python -m pip install \
+    -v \
+    "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
+    "$(echo ./local-libcudf-dep/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
+    "$(echo ./local-pylibcudf-dep/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"
 
-rapids-logger "Install cudf_polars"
-python -m pip install $(echo ./dist/cudf_polars*.whl)
 
 TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")')
 rapids-logger "Clone polars to ${TAG}"
diff --git a/ci/test_java.sh b/ci/test_java.sh
index 629ad11014a..7f1aa633afc 100755
--- a/ci/test_java.sh
+++ b/ci/test_java.sh
@@ -5,6 +5,8 @@ set -euo pipefail
 
 . /opt/conda/etc/profile.d/conda.sh
 
+RAPIDS_VERSION="$(rapids-version)"
+
 rapids-logger "Generate Java testing dependencies"
 
 ENV_YAML_DIR="$(mktemp -d)"
@@ -30,7 +32,7 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
 
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
-  libcudf
+  "libcudf=${RAPIDS_VERSION}"
 
 rapids-logger "Check GPU usage"
 nvidia-smi
diff --git a/ci/test_notebooks.sh b/ci/test_notebooks.sh
index da9478ce25d..4197dc5617f 100755
--- a/ci/test_notebooks.sh
+++ b/ci/test_notebooks.sh
@@ -5,6 +5,8 @@ set -euo pipefail
 
 . /opt/conda/etc/profile.d/conda.sh
 
+RAPIDS_VERSION="$(rapids-version)"
+
 rapids-logger "Generate notebook testing dependencies"
 
 ENV_YAML_DIR="$(mktemp -d)"
@@ -30,7 +32,8 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
   --channel "${PYTHON_CHANNEL}" \
-  cudf libcudf
+  "cudf=${RAPIDS_VERSION}" \
+  "libcudf=${RAPIDS_VERSION}"
 
 NBTEST="$(realpath "$(dirname "$0")/utils/nbtest.sh")"
 pushd notebooks
/opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION="$(rapids-version)" + rapids-logger "Generate Python testing dependencies" ENV_YAML_DIR="$(mktemp -d)" @@ -38,4 +40,5 @@ rapids-print-env rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - cudf libcudf + "cudf=${RAPIDS_VERSION}" \ + "libcudf=${RAPIDS_VERSION}" diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh index 2386414b32e..9528549a562 100755 --- a/ci/test_python_cudf.sh +++ b/ci/test_python_cudf.sh @@ -9,7 +9,7 @@ source ./ci/test_python_common.sh test_python_cudf rapids-logger "Check GPU usage" nvidia-smi - +rapids-print-env EXITCODE=0 trap "EXITCODE=1" ERR set +e diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index 67c97ad29a5..db86721755d 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -7,10 +7,15 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ # Common setup steps shared by Python test jobs source ./ci/test_python_common.sh test_python_other +RAPIDS_VERSION="$(rapids-version)" + rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - dask-cudf cudf_kafka custreamz + "dask-cudf=${RAPIDS_VERSION}" \ + "cudf_kafka=${RAPIDS_VERSION}" \ + "custreamz=${RAPIDS_VERSION}" \ + "cudf-polars=${RAPIDS_VERSION}" rapids-logger "Check GPU usage" nvidia-smi @@ -33,7 +38,7 @@ rapids-logger "pytest dask_cudf (legacy)" DASK_DATAFRAME__QUERY_PLANNING=False ./ci/run_dask_cudf_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ --numprocesses=8 \ - --dist=loadscope \ + --dist=worksteal \ . rapids-logger "pytest cudf_kafka" @@ -50,5 +55,19 @@ rapids-logger "pytest custreamz" --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/custreamz-coverage.xml" \ --cov-report=term +# Note that cudf-polars uses rmm.mr.CudaAsyncMemoryResource() which allocates +# half the available memory. This doesn't play well with multiple workers, so +# we keep --numprocesses=1 for now. This should be resolved by +# https://github.com/rapidsai/cudf/issues/16723. 
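To make the memory note above concrete: the CUDA async pool defaults to reserving half of the currently free device memory, so two unsuspecting workers cannot share one GPU. A minimal C++ sketch of sizing the pool explicitly instead (illustrative only, not part of this change; the 25% figure is an arbitrary assumption):

    // Illustrative sketch: size rmm's async pool explicitly instead of
    // accepting the half-of-free-memory default, so that several test
    // workers could share a single GPU.
    #include <rmm/cuda_device.hpp>
    #include <rmm/mr/device/cuda_async_memory_resource.hpp>
    #include <rmm/mr/device/per_device_resource.hpp>

    int main()
    {
      // Assumption for illustration: give each worker a quarter of free memory.
      auto const pool_size = rmm::percent_of_free_device_memory(25);
      rmm::mr::cuda_async_memory_resource mr{pool_size};
      rmm::mr::set_current_device_resource(&mr);  // subsequent device allocations use mr
      return 0;
    }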
+rapids-logger "pytest cudf-polars" +./ci/run_cudf_polars_pytests.sh \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-polars.xml" \ + --numprocesses=1 \ + --dist=worksteal \ + --cov-config=./pyproject.toml \ + --cov=cudf_polars \ + --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-polars-coverage.xml" \ + --cov-report=term + rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 8b45d26c367..c3716c4759a 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -63,9 +63,9 @@ dependencies: - openpyxl - packaging - pandas -- pandas>=2.0,<2.2.3dev0 +- pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.8,<1.9 +- polars>=1.11,<1.12 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<18.0.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 354c1360e5a..38e131e79cb 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -61,9 +61,9 @@ dependencies: - openpyxl - packaging - pandas -- pandas>=2.0,<2.2.3dev0 +- pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.8,<1.9 +- polars>=1.11,<1.12 - pre-commit - pyarrow>=14.0.0,<18.0.0a0 - pydata-sphinx-theme!=0.14.2 diff --git a/conda/recipes/cudf-polars/build.sh b/conda/recipes/cudf-polars/build.sh new file mode 100644 index 00000000000..06e2f1bcb99 --- /dev/null +++ b/conda/recipes/cudf-polars/build.sh @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +# This assumes the script is executed from the root of the repo directory +./build.sh cudf_polars diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml new file mode 100644 index 00000000000..edf92b930d9 --- /dev/null +++ b/conda/recipes/cudf-polars/meta.yaml @@ -0,0 +1,61 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} +{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} +{% set py_version = environ['CONDA_PY'] %} +{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} +{% set cuda_major = cuda_version.split('.')[0] %} +{% set date_string = environ['RAPIDS_DATE_STRING'] %} + +package: + name: cudf-polars + version: {{ version }} + +source: + path: ../../.. 
+ +build: + number: {{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + script_env: + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_SESSION_TOKEN + - CMAKE_C_COMPILER_LAUNCHER + - CMAKE_CUDA_COMPILER_LAUNCHER + - CMAKE_CXX_COMPILER_LAUNCHER + - CMAKE_GENERATOR + - PARALLEL_LEVEL + - SCCACHE_BUCKET + - SCCACHE_IDLE_TIMEOUT + - SCCACHE_REGION + - SCCACHE_S3_KEY_PREFIX=cudf-polars-aarch64 # [aarch64] + - SCCACHE_S3_KEY_PREFIX=cudf-polars-linux64 # [linux64] + - SCCACHE_S3_USE_SSL + - SCCACHE_S3_NO_CREDENTIALS + +requirements: + host: + - python + - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - setuptools + - cuda-version ={{ cuda_version }} + run: + - python + - pylibcudf ={{ version }} + - polars >=1.11,<1.12 + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} + +test: + requires: + - cuda-version ={{ cuda_version }} + imports: + - cudf_polars + + +about: + home: https://rapids.ai/ + license: Apache-2.0 + license_family: APACHE + license_file: LICENSE + summary: cudf-polars library diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 25e69b89789..2c254415318 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -78,7 +78,7 @@ requirements: run: - python - typing_extensions >=4.0.0 - - pandas >=2.0,<2.2.3dev0 + - pandas >=2.0,<2.2.4dev0 - cupy >=12.0.0 - numba-cuda >=0.0.13 - numpy >=1.23,<3.0a0 diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index 7c1efa0176c..3d965f30986 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -77,7 +77,7 @@ requirements: run: - python - typing_extensions >=4.0.0 - - pandas >=2.0,<2.2.3dev0 + - pandas >=2.0,<2.2.4dev0 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<18.0.0a0 - {{ pin_compatible('rmm', max_pin='x.x') }} diff --git a/cpp/.clang-tidy b/cpp/.clang-tidy index b791d846d1d..12120a5c6d1 100644 --- a/cpp/.clang-tidy +++ b/cpp/.clang-tidy @@ -1,18 +1,47 @@ --- +# Notes on disabled checks +# ------------------------ +# modernize-use-equals-default: +# auto-fix is broken (doesn't insert =default correctly) +# modernize-concat-nested-namespaces: +# auto-fix is broken (can delete code) +# modernize-use-trailing-return-type: +# Purely stylistic, no benefit to rewriting everything +# modernize-return-braced-init-list: +# Stylistically we prefer to see the return type at the return site. +# See https://github.com/rapidsai/cudf/pull/16956#pullrequestreview-2341891672 +# for more information. +# modernize-use-bool-literals: +# Our tests use int flags for validity masks extensively, and we prefer that. +# clang-analyzer-cplusplus.NewDeleteLeaks: +# This check has numerous bugs; see +# https://github.com/llvm/llvm-project/issues?q=is%3Aissue+is%3Aopen+newdeleteleaks +# We encounter at least +# https://github.com/llvm/llvm-project/issues/60896 +# https://github.com/llvm/llvm-project/issues/69602 +# clang-analyzer-optin.core.EnumCastOutOfRange: +# We use enums as flags in multiple cases, and this check makes ORing flags invalid. +# clang-analyzer-optin.cplusplus.UninitializedObject: +# There is an error in nanoarrow that none of the clang-tidy filters (i.e. +# header-filter and exclude-header-filter) are able to properly avoid. This +# merits further investigation.
+# +# We need to periodically verify that the checks disabled as broken above are still broken. Checks: 'modernize-*, -modernize-use-equals-default, -modernize-concat-nested-namespaces, -modernize-use-trailing-return-type, - -modernize-use-bool-literals' + -modernize-return-braced-init-list, + -modernize-use-bool-literals, + clang-analyzer-*, + -clang-analyzer-cplusplus.NewDeleteLeaks, + -clang-analyzer-optin.core.EnumCastOutOfRange, + -clang-analyzer-optin.cplusplus.UninitializedObject' - # -modernize-use-equals-default # auto-fix is broken (doesn't insert =default correctly) - # -modernize-concat-nested-namespaces # auto-fix is broken (can delete code) - # -modernize-use-trailing-return-type # just a preference - -WarningsAsErrors: '' -HeaderFilterRegex: '' -AnalyzeTemporaryDtors: false +WarningsAsErrors: '*' +HeaderFilterRegex: '.*cudf/cpp/(src|include|tests).*' +ExcludeHeaderFilterRegex: '.*(Message_generated.h|Schema_generated.h|brotli_dict.hpp|unbz2.hpp|cxxopts.hpp).*' FormatStyle: none CheckOptions: - key: modernize-loop-convert.MaxCopySize diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 136f43ee706..e4b9cbf8921 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -52,6 +52,7 @@ option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON) option(CUDF_BUILD_TESTUTIL "Whether to build the test utilities contained in libcudf" ON) mark_as_advanced(CUDF_BUILD_TESTUTIL) option(CUDF_USE_PROPRIETARY_NVCOMP "Download and use NVCOMP with proprietary extensions" ON) +option(CUDF_EXPORT_NVCOMP "Export NVCOMP as a dependency" ON) option(CUDF_LARGE_STRINGS_DISABLED "Build with large string support disabled" OFF) mark_as_advanced(CUDF_LARGE_STRINGS_DISABLED) option( @@ -87,6 +88,7 @@ option( ${DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL} ) mark_as_advanced(CUDF_BUILD_STREAMS_TEST_UTIL) +option(CUDF_CLANG_TIDY "Enable clang-tidy checking" OFF) message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}") message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}") @@ -143,6 +145,58 @@ if(NOT CUDF_GENERATED_INCLUDE_DIR) set(CUDF_GENERATED_INCLUDE_DIR ${CUDF_BINARY_DIR}) endif() +# ################################################################################################## +# * clang-tidy configuration ---------------------------------------------------------------------- +if(CUDF_CLANG_TIDY) + find_program( + CLANG_TIDY_EXE + NAMES "clang-tidy" + DOC "Path to clang-tidy executable" REQUIRED + ) + + execute_process( + COMMAND ${CLANG_TIDY_EXE} --version + OUTPUT_VARIABLE CLANG_TIDY_OUTPUT + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + string(REGEX MATCH "LLVM version ([0-9]+\\.[0-9]+)\\.[0-9]+" LLVM_VERSION_MATCH + "${CLANG_TIDY_OUTPUT}" + ) + # Discard the patch version and allow it to float. Empirically, results are mostly stable + # across patch versions, and some package managers skip patch versions entirely, so we don't + # want to pin to a patch version that the user cannot install. + set(LLVM_VERSION "${CMAKE_MATCH_1}") + set(expected_clang_tidy_version 19.1) + if(NOT expected_clang_tidy_version VERSION_EQUAL LLVM_VERSION) + message( + FATAL_ERROR + "clang-tidy version ${expected_clang_tidy_version} is required, but found ${LLVM_VERSION}" + ) + endif() +endif()
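To illustrate the EnumCastOutOfRange exclusion above: ORing two flag enumerators produces a value that names no enumerator, and casting that value back to the enum type is exactly what the check reports. A minimal sketch with hypothetical types (not cudf's):

    // Hypothetical flag enum: read | write yields 3, which names no
    // enumerator, so the cast back to io_flags trips
    // clang-analyzer-optin.core.EnumCastOutOfRange even though the
    // pattern is intentional.
    #include <cstdint>

    enum class io_flags : std::uint8_t { none = 0, read = 1, write = 2 };

    constexpr io_flags operator|(io_flags a, io_flags b)
    {
      return static_cast<io_flags>(static_cast<std::uint8_t>(a) |
                                   static_cast<std::uint8_t>(b));
    }

    constexpr auto rw = io_flags::read | io_flags::write;
    static_assert(static_cast<std::uint8_t>(rw) == 3, "flags combine");

+ +# Turn on the clang-tidy property for a target, excluding the files specified in SKIPPED_FILES.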
+function(enable_clang_tidy target) + set(_tidy_options) + set(_tidy_one_value) + set(_tidy_multi_value SKIPPED_FILES) + cmake_parse_arguments( + _TIDY "${_tidy_options}" "${_tidy_one_value}" "${_tidy_multi_value}" ${ARGN} + ) + + if(CUDF_CLANG_TIDY) + # clang will complain about unused link libraries on the compile line unless we specify + # -Qunused-arguments. + set_target_properties( + ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments" + ) + foreach(file IN LISTS _TIDY_SKIPPED_FILES) + set_source_files_properties(${file} PROPERTIES SKIP_LINTING ON) + endforeach() + endif() +endfunction() + # ################################################################################################## # * conda environment ----------------------------------------------------------------------------- rapids_cmake_support_conda_env(conda_env MODIFY_PREFIX_PATH) @@ -314,7 +368,13 @@ add_library( src/filling/repeat.cu src/filling/sequence.cu src/groupby/groupby.cu + src/groupby/hash/compute_groupby.cu + src/groupby/hash/compute_single_pass_aggs.cu + src/groupby/hash/create_sparse_results_table.cu + src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu + src/groupby/hash/hash_compound_agg_finalizer.cu + src/groupby/hash/sparse_to_dense_results.cu src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu src/groupby/sort/group_argmin.cu @@ -712,6 +772,7 @@ target_compile_options( cudf PRIVATE "$<$:${CUDF_CXX_FLAGS}>" "$<$:${CUDF_CUDA_FLAGS}>" ) +enable_clang_tidy(cudf SKIPPED_FILES src/io/comp/cpu_unbz2.cpp src/io/comp/brotli_dict.cpp) if(CUDF_BUILD_STACKTRACE_DEBUG) # Remove any optimization level to avoid nvcc warning "incompatible redefinition for option @@ -861,15 +922,7 @@ if(CUDF_BUILD_TESTUTIL) add_library(cudf::cudftest_default_stream ALIAS cudftest_default_stream) - add_library( - cudftestutil SHARED - tests/io/metadata_utilities.cpp - tests/utilities/column_utilities.cu - tests/utilities/debug_utilities.cu - tests/utilities/random_seed.cpp - tests/utilities/table_utilities.cu - tests/utilities/tdigest_utilities.cu - ) + add_library(cudftestutil INTERFACE) set_target_properties( cudftestutil @@ -878,32 +931,56 @@ if(CUDF_BUILD_TESTUTIL) # set target compile options CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON - CXX_VISIBILITY_PRESET hidden CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON - CUDA_VISIBILITY_PRESET hidden - POSITION_INDEPENDENT_CODE ON - INTERFACE_POSITION_INDEPENDENT_CODE ON ) target_compile_options( - cudftestutil PUBLIC "$:${CUDF_CXX_FLAGS}>>" - "$:${CUDF_CUDA_FLAGS}>>" + cudftestutil INTERFACE "$:${CUDF_CXX_FLAGS}>>" + "$:${CUDF_CUDA_FLAGS}>>" ) target_link_libraries( - cudftestutil - PUBLIC Threads::Threads cudf cudftest_default_stream - PRIVATE GTest::gmock GTest::gtest $ + cudftestutil INTERFACE Threads::Threads cudf cudftest_default_stream + $ ) target_include_directories( - cudftestutil PUBLIC "$" - "$" + cudftestutil INTERFACE "$" + "$" ) rapids_cuda_set_runtime(cudftestutil USE_STATIC ${CUDA_STATIC_RUNTIME}) add_library(cudf::cudftestutil ALIAS cudftestutil) + add_library(cudftestutil_impl INTERFACE) + add_library(cudf::cudftestutil_impl ALIAS cudftestutil_impl) + target_sources( + cudftestutil_impl + INTERFACE $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + ) + target_link_libraries(cudftestutil_impl INTERFACE cudf::cudftestutil) + + install(FILES tests/io/metadata_utilities.cpp DESTINATION src/cudftestutil/io) + install( + FILES tests/utilities/column_utilities.cu + tests/utilities/debug_utilities.cu + 
tests/utilities/random_seed.cpp + tests/utilities/table_utilities.cu + tests/utilities/tdigest_utilities.cu + DESTINATION src/cudftestutil/utilities + ) + endif() # * build cudf_identify_stream_usage -------------------------------------------------------------- @@ -1004,7 +1081,7 @@ install( set(_components_export_string) if(TARGET cudftestutil) install( - TARGETS cudftest_default_stream cudftestutil + TARGETS cudftest_default_stream cudftestutil cudftestutil_impl DESTINATION ${lib_dir} EXPORT cudf-testing-exports ) @@ -1044,14 +1121,15 @@ targets: This module offers an optional testing component which defines the following IMPORTED GLOBAL targets: - cudf::cudftestutil - The main cudf testing library + cudf::cudftestutil - The main cudf testing library + cudf::cudftestutil_impl - C++ and CUDA sources to compile for definitions in cudf::cudftestutil ]=] ) rapids_export( INSTALL cudf EXPORT_SET cudf-exports ${_components_export_string} - GLOBAL_TARGETS cudf cudftestutil + GLOBAL_TARGETS cudf cudftestutil cudftestutil_impl NAMESPACE cudf:: DOCUMENTATION doc_string ) @@ -1072,7 +1150,7 @@ endif() rapids_export( BUILD cudf EXPORT_SET cudf-exports ${_components_export_string} - GLOBAL_TARGETS cudf cudftestutil + GLOBAL_TARGETS cudf cudftestutil cudftestutil_impl NAMESPACE cudf:: DOCUMENTATION doc_string FINAL_CODE_BLOCK build_code_string diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 4113e38dcf4..2a4ac789046 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -25,7 +25,7 @@ target_compile_options( target_link_libraries( cudf_datagen PUBLIC GTest::gmock GTest::gtest benchmark::benchmark nvbench::nvbench Threads::Threads cudf - cudftestutil nvtx3::nvtx3-cpp + cudf::cudftestutil nvtx3::nvtx3-cpp PRIVATE $ ) @@ -49,7 +49,7 @@ target_compile_options( target_link_libraries( ndsh_data_generator - PUBLIC cudf cudftestutil nvtx3::nvtx3-cpp + PUBLIC cudf cudf::cudftestutil nvtx3::nvtx3-cpp PRIVATE $ ) @@ -65,14 +65,14 @@ target_include_directories( # Use an OBJECT library so we only compile these helper source files only once add_library( cudf_benchmark_common OBJECT - "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp" - synchronization/synchronization.cpp - io/cuio_common.cpp - common/table_utilities.cpp - common/benchmark_utilities.cpp - common/nvbench_utilities.cpp + synchronization/synchronization.cpp io/cuio_common.cpp common/table_utilities.cpp + common/benchmark_utilities.cpp common/nvbench_utilities.cpp ) -target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $) +target_link_libraries( + cudf_benchmark_common PRIVATE cudf_datagen $ GTest::gmock + GTest::gtest +) + add_custom_command( OUTPUT CUDF_BENCHMARKS COMMAND echo Running benchmarks @@ -99,7 +99,7 @@ function(ConfigureBench CMAKE_BENCH_NAME) ) target_link_libraries( ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main - $ + cudf::cudftestutil_impl $ ) add_custom_command( OUTPUT CUDF_BENCHMARKS @@ -127,8 +127,9 @@ function(ConfigureNVBench CMAKE_BENCH_NAME) INSTALL_RPATH "\$ORIGIN/../../../lib" ) target_link_libraries( - ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common ndsh_data_generator cudf_datagen - nvbench::nvbench $ + ${CMAKE_BENCH_NAME} + PRIVATE cudf_benchmark_common ndsh_data_generator cudf_datagen nvbench::nvbench + $ cudf::cudftestutil_impl ) install( TARGETS ${CMAKE_BENCH_NAME} @@ -245,6 +246,7 @@ ConfigureNVBench( REDUCTION_NVBENCH reduction/anyall.cpp reduction/dictionary.cpp + reduction/histogram.cpp 
reduction/minmax.cpp reduction/rank.cpp reduction/reduce.cpp @@ -270,8 +272,13 @@ ConfigureBench( ) ConfigureNVBench( - GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_max_multithreaded.cpp - groupby/group_nunique.cpp groupby/group_rank.cpp groupby/group_struct_keys.cpp + GROUPBY_NVBENCH + groupby/group_histogram.cpp + groupby/group_max.cpp + groupby/group_max_multithreaded.cpp + groupby/group_nunique.cpp + groupby/group_rank.cpp + groupby/group_struct_keys.cpp ) # ################################################################################################## @@ -330,19 +337,19 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp) # ################################################################################################## # * ast benchmark --------------------------------------------------------------------------------- -ConfigureBench(AST_BENCH ast/transform.cpp) +ConfigureNVBench(AST_NVBENCH ast/transform.cpp) # ################################################################################################## # * binaryop benchmark ---------------------------------------------------------------------------- -ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) +ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- -ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp) +ConfigureBench(TEXT_BENCH text/subword.cpp) ConfigureNVBench( - TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp - text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp + TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/ngrams.cpp + text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp ) # ################################################################################################## @@ -377,6 +384,7 @@ ConfigureNVBench( string/join_strings.cpp string/lengths.cpp string/like.cpp + string/make_strings_column.cu string/replace_re.cpp string/reverse.cpp string/slice.cpp @@ -392,11 +400,6 @@ ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader ConfigureNVBench(JSON_READER_OPTION_NVBENCH io/json/json_reader_option.cpp) ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp) -# ################################################################################################## -# * multi buffer memset benchmark -# ---------------------------------------------------------------------- -ConfigureNVBench(BATCHED_MEMSET_BENCH io/utilities/batched_memset_bench.cpp) - # ################################################################################################## # * io benchmark --------------------------------------------------------------------- ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp) diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp index 65a44532cf1..f44f26e4d2c 100644 --- a/cpp/benchmarks/ast/transform.cpp +++ b/cpp/benchmarks/ast/transform.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,14 +15,16 @@ */ #include -#include -#include #include #include +#include + #include +#include + #include #include #include @@ -35,13 +37,10 @@ enum class TreeType { }; template -class AST : public cudf::benchmark {}; - -template -static void BM_ast_transform(benchmark::State& state) +static void BM_ast_transform(nvbench::state& state) { - auto const table_size{static_cast(state.range(0))}; - auto const tree_levels{static_cast(state.range(1))}; + auto const table_size = static_cast(state.get_int64("table_size")); + auto const tree_levels = static_cast(state.get_int64("tree_levels")); // Create table data auto const n_cols = reuse_columns ? 1 : tree_levels + 1; @@ -86,38 +85,22 @@ static void BM_ast_transform(benchmark::State& state) auto const& expression_tree_root = expressions.back(); - // Execute benchmark - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf::compute_column(table, expression_tree_root); - } - // Use the number of bytes read from global memory - state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0) * - (tree_levels + 1) * sizeof(key_type)); -} + state.add_global_memory_reads(table_size * (tree_levels + 1)); -static void CustomRanges(benchmark::internal::Benchmark* b) -{ - auto row_counts = std::vector{100'000, 1'000'000, 10'000'000, 100'000'000}; - auto operation_counts = std::vector{1, 5, 10}; - for (auto const& row_count : row_counts) { - for (auto const& operation_count : operation_counts) { - b->Args({row_count, operation_count}); - } - } + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); }); } #define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable) \ - (::benchmark::State & st) \ + static void name(::nvbench::state& st) \ { \ - BM_ast_transform(st); \ + ::BM_ast_transform(st); \ } \ - BENCHMARK_REGISTER_F(AST, name) \ - ->Apply(CustomRanges) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_int64_axis("tree_levels", {1, 5, 10}) \ + .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000}) AST_TRANSFORM_BENCHMARK_DEFINE( ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false); diff --git a/cpp/benchmarks/binaryop/binaryop.cpp b/cpp/benchmarks/binaryop/binaryop.cpp index fa98d9e601a..7d267a88764 100644 --- a/cpp/benchmarks/binaryop/binaryop.cpp +++ b/cpp/benchmarks/binaryop/binaryop.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
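The ast transform conversion above is representative of the Google Benchmark to nvbench migrations in this change: axis values replace state.range(), add_global_memory_reads replaces SetBytesProcessed, and the timed region moves into state.exec. A condensed sketch of that pattern (the benchmarked body is a placeholder, not libcudf code):

    // Condensed nvbench skeleton mirroring the conversions in this diff.
    #include <nvbench/nvbench.cuh>

    #include <cstdint>

    static void bench_pattern(nvbench::state& state)
    {
      auto const table_size = static_cast<std::size_t>(state.get_int64("table_size"));

      // Throughput is derived from the declared global-memory traffic.
      state.add_global_memory_reads<std::int32_t>(table_size);

      state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
        // placeholder for the libcudf call under test
      });
    }
    NVBENCH_BENCH(bench_pattern).add_int64_axis("table_size", {100'000, 1'000'000});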
@@ -15,15 +15,14 @@ */ #include -#include -#include #include #include #include +#include + #include -#include // This set of benchmarks is designed to be a comparison for the AST benchmarks @@ -33,13 +32,10 @@ enum class TreeType { }; template -class BINARYOP : public cudf::benchmark {}; - -template -static void BM_binaryop_transform(benchmark::State& state) +static void BM_binaryop_transform(nvbench::state& state) { - auto const table_size{static_cast(state.range(0))}; - auto const tree_levels{static_cast(state.range(1))}; + auto const table_size{static_cast(state.get_int64("table_size"))}; + auto const tree_levels{static_cast(state.get_int64("tree_levels"))}; // Create table data auto const n_cols = reuse_columns ? 1 : tree_levels + 1; @@ -47,9 +43,10 @@ static void BM_binaryop_transform(benchmark::State& state) cycle_dtypes({cudf::type_to_id()}, n_cols), row_count{table_size}); cudf::table_view table{*source_table}; - // Execute benchmark - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 + // Use the number of bytes read from global memory + state.add_global_memory_reads(table_size * (tree_levels + 1)); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { // Execute tree that chains additions like (((a + b) + c) + d) auto const op = cudf::binary_operator::ADD; auto const result_data_type = cudf::data_type(cudf::type_to_id()); @@ -64,16 +61,18 @@ static void BM_binaryop_transform(benchmark::State& state) result = cudf::binary_operation(result->view(), col, op, result_data_type); }); } - } - - // Use the number of bytes read from global memory - state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0) * - (tree_levels + 1) * sizeof(key_type)); + }); } #define BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns) \ - BENCHMARK_TEMPLATE_DEFINE_F(BINARYOP, name, key_type, tree_type, reuse_columns) \ - (::benchmark::State & st) { BM_binaryop_transform(st); } + \ + static void name(::nvbench::state& st) \ + { \ + BM_binaryop_transform(st); \ + } \ + NVBENCH_BENCH(name) \ + .add_int64_axis("tree_levels", {1, 2, 5, 10}) \ + .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000}) BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_int32_imbalanced_unique, int32_t, @@ -87,29 +86,3 @@ BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_double_imbalanced_unique, double, TreeType::IMBALANCED_LEFT, false); - -static void CustomRanges(benchmark::internal::Benchmark* b) -{ - auto row_counts = std::vector{100'000, 1'000'000, 10'000'000, 100'000'000}; - auto operation_counts = std::vector{1, 2, 5, 10}; - for (auto const& row_count : row_counts) { - for (auto const& operation_count : operation_counts) { - b->Args({row_count, operation_count}); - } - } -} - -BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_unique) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_reuse) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(BINARYOP, binaryop_double_imbalanced_unique) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp index 7086a61c7c5..bc0ff69bce9 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp @@ -15,20 +15,18 @@ */ #include -#include -#include #include -class 
COMPILED_BINARYOP : public cudf::benchmark {}; +#include template -void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) +void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop) { - auto const column_size{static_cast(state.range(0))}; + auto const table_size = static_cast(state.get_int64("table_size")); auto const source_table = create_random_table( - {cudf::type_to_id(), cudf::type_to_id()}, row_count{column_size}); + {cudf::type_to_id(), cudf::type_to_id()}, row_count{table_size}); auto lhs = cudf::column_view(source_table->get_column(0)); auto rhs = cudf::column_view(source_table->get_column(1)); @@ -38,31 +36,26 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) // Call once for hot cache. cudf::binary_operation(lhs, rhs, binop, output_dtype); - for (auto _ : state) { - cuda_event_timer timer(state, true); - cudf::binary_operation(lhs, rhs, binop, output_dtype); - } - // use number of bytes read and written to global memory - state.SetBytesProcessed(static_cast(state.iterations()) * column_size * - (sizeof(TypeLhs) + sizeof(TypeRhs) + sizeof(TypeOut))); + state.add_global_memory_reads(table_size); + state.add_global_memory_reads(table_size); + state.add_global_memory_reads(table_size); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { cudf::binary_operation(lhs, rhs, binop, output_dtype); }); } +#define BM_STRINGIFY(a) #a + // TODO tparam boolean for null. -#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout) \ - BENCHMARK_DEFINE_F(COMPILED_BINARYOP, name) \ - (::benchmark::State & st) \ - { \ - BM_compiled_binaryop(st, cudf::binary_operator::bop); \ - } \ - BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name) \ - ->Unit(benchmark::kMicrosecond) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ \ - ->Arg(100000000); /* 100M */ +#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout) \ + static void name(::nvbench::state& st) \ + { \ + ::BM_compiled_binaryop(st, ::cudf::binary_operator::bop); \ + } \ + NVBENCH_BENCH(name) \ + .set_name("compiled_binary_op_" BM_STRINGIFY(name)) \ + .add_int64_axis("table_size", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}) #define build_name(a, b, c, d) a##_##b##_##c##_##d diff --git a/cpp/benchmarks/groupby/group_histogram.cpp b/cpp/benchmarks/groupby/group_histogram.cpp new file mode 100644 index 00000000000..cd7f9f298af --- /dev/null +++ b/cpp/benchmarks/groupby/group_histogram.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include + +#include + +template +void groupby_histogram_helper(nvbench::state& state, + cudf::size_type num_rows, + cudf::size_type cardinality, + double null_probability) +{ + auto const keys = [&] { + data_profile const profile = + data_profile_builder() + .cardinality(cardinality) + .no_validity() + .distribution(cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + return create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + }(); + + auto const values = [&] { + auto builder = data_profile_builder().cardinality(0).distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + if (null_probability > 0) { + builder.null_probability(null_probability); + } else { + builder.no_validity(); + } + return create_random_column( + cudf::type_to_id(), row_count{num_rows}, data_profile{builder}); + }(); + + // Vector of 1 request + std::vector requests(1); + requests.back().values = values->view(); + requests.back().aggregations.push_back( + cudf::make_histogram_aggregation()); + + auto const mem_stats_logger = cudf::memory_stats_logger(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys->view()})); + auto const result = gb_obj.aggregate(requests); + }); + + auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(num_rows) / elapsed_time, "rows/s"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); +} + +template +void bench_groupby_histogram(nvbench::state& state, nvbench::type_list) +{ + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const null_probability = state.get_float64("null_probability"); + + if (cardinality > num_rows) { + state.skip("cardinality > num_rows"); + return; + } + + groupby_histogram_helper(state, num_rows, cardinality, null_probability); +} + +NVBENCH_BENCH_TYPES(bench_groupby_histogram, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("groupby_histogram") + .add_float64_axis("null_probability", {0, 0.1, 0.9}) + .add_int64_axis("cardinality", {100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000}) + .add_int64_axis("num_rows", {100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000}); diff --git a/cpp/benchmarks/io/utilities/batched_memset_bench.cpp b/cpp/benchmarks/io/utilities/batched_memset_bench.cpp deleted file mode 100644 index 2905895a63b..00000000000 --- a/cpp/benchmarks/io/utilities/batched_memset_bench.cpp +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include - -#include -#include - -#include - -// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to -// run on most GPUs, but large enough to allow highest throughput -constexpr size_t data_size = 512 << 20; - -void parquet_read_common(cudf::size_type num_rows_to_read, - cudf::size_type num_cols_to_read, - cuio_source_sink_pair& source_sink, - nvbench::state& state) -{ - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); - - auto mem_stats_logger = cudf::memory_stats_logger(); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.exec( - nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { - try_drop_l3_cache(); - - timer.start(); - auto const result = cudf::io::read_parquet(read_opts); - timer.stop(); - - CUDF_EXPECTS(result.tbl->num_columns() == num_cols_to_read, "Unexpected number of columns"); - CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows"); - }); - - auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); - state.add_buffer_size( - mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); - state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); -} - -template -void bench_batched_memset(nvbench::state& state, nvbench::type_list>) -{ - auto const d_type = get_type_or_group(static_cast(DataType)); - auto const num_cols = static_cast(state.get_int64("num_cols")); - auto const cardinality = static_cast(state.get_int64("cardinality")); - auto const run_length = static_cast(state.get_int64("run_length")); - auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); - auto const compression = cudf::io::compression_type::NONE; - cuio_source_sink_pair source_sink(source_type); - auto const tbl = - create_random_table(cycle_dtypes(d_type, num_cols), - table_size_bytes{data_size}, - data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); - auto const view = tbl->view(); - - cudf::io::parquet_writer_options write_opts = - cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) - .compression(compression); - cudf::io::write_parquet(write_opts); - auto const num_rows = view.num_rows(); - - parquet_read_common(num_rows, num_cols, source_sink, state); -} - -using d_type_list = nvbench::enum_type_list; - -NVBENCH_BENCH_TYPES(bench_batched_memset, NVBENCH_TYPE_AXES(d_type_list)) - .set_name("batched_memset") - .set_type_axes_names({"data_type"}) - .add_int64_axis("num_cols", {1000}) - .add_string_axis("io_type", {"DEVICE_BUFFER"}) - .set_min_samples(4) - .add_int64_axis("cardinality", {0, 1000}) - .add_int64_axis("run_length", {1, 32}); diff --git a/cpp/benchmarks/ndsh/q01.cpp b/cpp/benchmarks/ndsh/q01.cpp index ef709926ae9..485e8e5497c 100644 --- a/cpp/benchmarks/ndsh/q01.cpp +++ b/cpp/benchmarks/ndsh/q01.cpp @@ -104,7 +104,7 @@ } void run_ndsh_q1(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Define the column projections and filter predicate for `lineitem` table std::vector const lineitem_cols = {"l_returnflag", @@ -124,8 +124,8 @@ void run_ndsh_q1(nvbench::state& state, cudf::ast::ast_operator::LESS_EQUAL, shipdate_ref, shipdate_upper_literal); // Read out the 
`lineitem` table from parquet file - auto lineitem = - read_parquet(sources["lineitem"].make_source_info(), lineitem_cols, std::move(lineitem_pred)); + auto lineitem = read_parquet( + sources.at("lineitem").make_source_info(), lineitem_cols, std::move(lineitem_pred)); // Calculate the discount price and charge columns and append to lineitem table auto disc_price = @@ -170,7 +170,7 @@ void ndsh_q1(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources(scale_factor, {"lineitem"}, sources); auto stream = cudf::get_default_stream(); diff --git a/cpp/benchmarks/ndsh/q05.cpp b/cpp/benchmarks/ndsh/q05.cpp index 522bc4789c2..1c2d657913e 100644 --- a/cpp/benchmarks/ndsh/q05.cpp +++ b/cpp/benchmarks/ndsh/q05.cpp @@ -89,7 +89,7 @@ } void run_ndsh_q5(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Define the column projection and filter predicate for the `orders` table std::vector const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"}; @@ -120,17 +120,17 @@ void run_ndsh_q5(nvbench::state& state, // Read out the tables from parquet files // while pushing down the column projections and filter predicates auto const customer = - read_parquet(sources["customer"].make_source_info(), {"c_custkey", "c_nationkey"}); + read_parquet(sources.at("customer").make_source_info(), {"c_custkey", "c_nationkey"}); auto const orders = - read_parquet(sources["orders"].make_source_info(), orders_cols, std::move(orders_pred)); - auto const lineitem = read_parquet(sources["lineitem"].make_source_info(), + read_parquet(sources.at("orders").make_source_info(), orders_cols, std::move(orders_pred)); + auto const lineitem = read_parquet(sources.at("lineitem").make_source_info(), {"l_orderkey", "l_suppkey", "l_extendedprice", "l_discount"}); auto const supplier = - read_parquet(sources["supplier"].make_source_info(), {"s_suppkey", "s_nationkey"}); + read_parquet(sources.at("supplier").make_source_info(), {"s_suppkey", "s_nationkey"}); auto const nation = - read_parquet(sources["nation"].make_source_info(), {"n_nationkey", "n_regionkey", "n_name"}); + read_parquet(sources.at("nation").make_source_info(), {"n_nationkey", "n_regionkey", "n_name"}); auto const region = - read_parquet(sources["region"].make_source_info(), region_cols, std::move(region_pred)); + read_parquet(sources.at("region").make_source_info(), region_cols, std::move(region_pred)); // Perform the joins auto const join_a = apply_inner_join(region, nation, {"r_regionkey"}, {"n_regionkey"}); @@ -165,7 +165,7 @@ void ndsh_q5(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources( scale_factor, {"customer", "orders", "lineitem", "supplier", "nation", "region"}, sources); diff --git a/cpp/benchmarks/ndsh/q06.cpp b/cpp/benchmarks/ndsh/q06.cpp index 04078547973..e1e56c3622e 100644 --- a/cpp/benchmarks/ndsh/q06.cpp +++ b/cpp/benchmarks/ndsh/q06.cpp @@ -64,7 +64,7 @@ } void run_ndsh_q6(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Read out the `lineitem` table from parquet file std::vector const lineitem_cols = { @@ -83,8 +83,8 @@ void run_ndsh_q6(nvbench::state& state, cudf::ast::operation(cudf::ast::ast_operator::LESS, 
shipdate_ref, shipdate_upper_literal); auto const lineitem_pred = std::make_unique( cudf::ast::ast_operator::LOGICAL_AND, shipdate_pred_a, shipdate_pred_b); - auto lineitem = - read_parquet(sources["lineitem"].make_source_info(), lineitem_cols, std::move(lineitem_pred)); + auto lineitem = read_parquet( + sources.at("lineitem").make_source_info(), lineitem_cols, std::move(lineitem_pred)); // Cast the discount and quantity columns to float32 and append to lineitem table auto discout_float = @@ -134,7 +134,7 @@ void ndsh_q6(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources(scale_factor, {"lineitem"}, sources); auto stream = cudf::get_default_stream(); diff --git a/cpp/benchmarks/ndsh/q09.cpp b/cpp/benchmarks/ndsh/q09.cpp index 59218ab8912..2e9a69d9ee2 100644 --- a/cpp/benchmarks/ndsh/q09.cpp +++ b/cpp/benchmarks/ndsh/q09.cpp @@ -112,20 +112,21 @@ } void run_ndsh_q9(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Read out the table from parquet files auto const lineitem = read_parquet( - sources["lineitem"].make_source_info(), + sources.at("lineitem").make_source_info(), {"l_suppkey", "l_partkey", "l_orderkey", "l_extendedprice", "l_discount", "l_quantity"}); - auto const nation = read_parquet(sources["nation"].make_source_info(), {"n_nationkey", "n_name"}); + auto const nation = + read_parquet(sources.at("nation").make_source_info(), {"n_nationkey", "n_name"}); auto const orders = - read_parquet(sources["orders"].make_source_info(), {"o_orderkey", "o_orderdate"}); - auto const part = read_parquet(sources["part"].make_source_info(), {"p_partkey", "p_name"}); - auto const partsupp = read_parquet(sources["partsupp"].make_source_info(), + read_parquet(sources.at("orders").make_source_info(), {"o_orderkey", "o_orderdate"}); + auto const part = read_parquet(sources.at("part").make_source_info(), {"p_partkey", "p_name"}); + auto const partsupp = read_parquet(sources.at("partsupp").make_source_info(), {"ps_suppkey", "ps_partkey", "ps_supplycost"}); auto const supplier = - read_parquet(sources["supplier"].make_source_info(), {"s_suppkey", "s_nationkey"}); + read_parquet(sources.at("supplier").make_source_info(), {"s_suppkey", "s_nationkey"}); // Generating the `profit` table // Filter the part table using `p_name like '%green%'` @@ -178,7 +179,7 @@ void ndsh_q9(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources( scale_factor, {"part", "supplier", "lineitem", "partsupp", "orders", "nation"}, sources); diff --git a/cpp/benchmarks/ndsh/q10.cpp b/cpp/benchmarks/ndsh/q10.cpp index a520480020a..72edd15083d 100644 --- a/cpp/benchmarks/ndsh/q10.cpp +++ b/cpp/benchmarks/ndsh/q10.cpp @@ -94,7 +94,7 @@ } void run_ndsh_q10(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Define the column projection and filter predicate for the `orders` table std::vector const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"}; @@ -122,15 +122,16 @@ void run_ndsh_q10(nvbench::state& state, // Read out the tables from parquet files // while pushing down the column projections and filter predicates auto const customer = read_parquet( - sources["customer"].make_source_info(), + 
sources.at("customer").make_source_info(), {"c_custkey", "c_name", "c_nationkey", "c_acctbal", "c_address", "c_phone", "c_comment"}); auto const orders = - read_parquet(sources["orders"].make_source_info(), orders_cols, std::move(orders_pred)); + read_parquet(sources.at("orders").make_source_info(), orders_cols, std::move(orders_pred)); auto const lineitem = - read_parquet(sources["lineitem"].make_source_info(), + read_parquet(sources.at("lineitem").make_source_info(), {"l_extendedprice", "l_discount", "l_orderkey", "l_returnflag"}, std::move(lineitem_pred)); - auto const nation = read_parquet(sources["nation"].make_source_info(), {"n_name", "n_nationkey"}); + auto const nation = + read_parquet(sources.at("nation").make_source_info(), {"n_name", "n_nationkey"}); // Perform the joins auto const join_a = apply_inner_join(customer, nation, {"c_nationkey"}, {"n_nationkey"}); @@ -163,7 +164,7 @@ void ndsh_q10(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources( scale_factor, {"customer", "orders", "lineitem", "nation"}, sources); diff --git a/cpp/benchmarks/ndsh/utilities.cpp b/cpp/benchmarks/ndsh/utilities.cpp index 62116ddf661..9f9849860c9 100644 --- a/cpp/benchmarks/ndsh/utilities.cpp +++ b/cpp/benchmarks/ndsh/utilities.cpp @@ -17,6 +17,8 @@ #include "utilities.hpp" #include "common/ndsh_data_generator/ndsh_data_generator.hpp" +#include "common/table_utilities.hpp" +#include "cudf/detail/utilities/integer_utils.hpp" #include #include @@ -30,8 +32,15 @@ #include #include +#include +#include +#include + +#include #include #include +#include +#include namespace { @@ -85,6 +94,15 @@ std::vector const NATION_SCHEMA = { "n_nationkey", "n_name", "n_regionkey", "n_comment"}; std::vector const REGION_SCHEMA = {"r_regionkey", "r_name", "r_comment"}; +std::unordered_map const> const SCHEMAS = { + {"orders", ORDERS_SCHEMA}, + {"lineitem", LINEITEM_SCHEMA}, + {"part", PART_SCHEMA}, + {"partsupp", PARTSUPP_SCHEMA}, + {"supplier", SUPPLIER_SCHEMA}, + {"customer", CUSTOMER_SCHEMA}, + {"nation", NATION_SCHEMA}, + {"region", REGION_SCHEMA}}; } // namespace cudf::table_view table_with_names::table() const { return tbl->view(); } @@ -337,7 +355,7 @@ int32_t days_since_epoch(int year, int month, int day) void write_to_parquet_device_buffer(std::unique_ptr const& table, std::vector const& col_names, - parquet_device_buffer& source) + cuio_source_sink_pair& source) { CUDF_FUNC_RANGE(); auto const stream = cudf::get_default_stream(); @@ -351,55 +369,124 @@ void write_to_parquet_device_buffer(std::unique_ptr const& table, metadata.schema_info = col_name_infos; auto const table_input_metadata = cudf::io::table_input_metadata{metadata}; - // Declare a host and device buffer - std::vector h_buffer; - + auto est_size = static_cast(estimate_size(table->view())); + constexpr auto PQ_MAX_TABLE_BYTES = 8ul << 30; // 8GB + // TODO: best to get this limit from percent_of_free_device_memory(50) of device memory resource. 
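The query changes above consistently replace sources[name] with sources.at(name). A small sketch of the motivation, using a stand-in mapped type (not the actual source/sink pair type):

    // Stand-in for a mapped type without a default constructor, such as a
    // source/sink pair tied to an open file.
    #include <stdexcept>
    #include <string>
    #include <unordered_map>

    struct sink {
      explicit sink(int fd) : fd{fd} {}
      int fd;
    };

    int main()
    {
      std::unordered_map<std::string, sink> sources;
      sources.emplace("lineitem", sink{3});

      // sources["lineitem"] would not compile here (operator[] must be able
      // to default-construct the mapped type), and for default-constructible
      // types it silently inserts an entry on a misspelled key.
      try {
        auto const& s = sources.at("lineitm");  // typo: throws std::out_of_range
        (void)s;
      } catch (std::out_of_range const&) {
        // the mistake surfaces immediately instead of corrupting the map
      }
      return 0;
    }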
+ if (est_size > PQ_MAX_TABLE_BYTES) { + auto builder = cudf::io::chunked_parquet_writer_options::builder(source.make_sink_info()); + builder.metadata(table_input_metadata); + auto const options = builder.build(); + auto num_splits = static_cast( + std::ceil(static_cast(est_size) / (PQ_MAX_TABLE_BYTES))); + std::vector splits(num_splits - 1); + auto num_rows = table->num_rows(); + auto num_row_per_chunk = cudf::util::div_rounding_up_safe(num_rows, num_splits); + std::generate_n(splits.begin(), splits.size(), [num_row_per_chunk, i = 0]() mutable { + return (i += num_row_per_chunk); + }); + std::vector split_tables = cudf::split(table->view(), splits, stream); + auto writer = cudf::io::parquet_chunked_writer(options, stream); + for (auto const& chunk_table : split_tables) { + writer.write(chunk_table); + } + writer.close(); + return; + } // Write parquet data to host buffer - auto builder = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&h_buffer), table->view()); + auto builder = cudf::io::parquet_writer_options::builder(source.make_sink_info(), table->view()); builder.metadata(table_input_metadata); auto const options = builder.build(); - cudf::io::write_parquet(options); + cudf::io::write_parquet(options, stream); +} - // Copy host buffer to device buffer - source.d_buffer.resize(h_buffer.size(), stream); - CUDF_CUDA_TRY(cudaMemcpyAsync( - source.d_buffer.data(), h_buffer.data(), h_buffer.size(), cudaMemcpyDefault, stream.value())); +inline auto make_managed_pool() +{ + return rmm::mr::make_owning_wrapper( + std::make_shared(), rmm::percent_of_free_device_memory(50)); } void generate_parquet_data_sources(double scale_factor, std::vector const& table_names, - std::unordered_map& sources) + std::unordered_map& sources) { CUDF_FUNC_RANGE(); - std::for_each(table_names.begin(), table_names.end(), [&](auto const& table_name) { - sources[table_name] = parquet_device_buffer(); - }); - auto [orders, lineitem, part] = cudf::datagen::generate_orders_lineitem_part( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + // Set the memory resource to the managed pool + auto old_mr = cudf::get_current_device_resource(); + // if already managed pool or managed, don't create new one. + using managed_pool_mr_t = decltype(make_managed_pool()); + managed_pool_mr_t managed_pool_mr; + bool const is_managed = + dynamic_cast*>(old_mr) or + dynamic_cast(old_mr); + if (!is_managed) { + std::cout << "Creating managed pool just for data generation\n"; + managed_pool_mr = make_managed_pool(); + cudf::set_current_device_resource(managed_pool_mr.get()); + // drawback: if already pool takes 50% of free memory, we are left with 50% of 50% of free + // memory. 
+ } - auto partsupp = cudf::datagen::generate_partsupp( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + std::unordered_set const requested_table_names = [&table_names]() { + if (table_names.empty()) { + return std::unordered_set{ + "orders", "lineitem", "part", "partsupp", "supplier", "customer", "nation", "region"}; + } + return std::unordered_set(table_names.begin(), table_names.end()); + }(); + std::for_each( + requested_table_names.begin(), requested_table_names.end(), [&](auto const& table_name) { + sources.emplace(table_name, cuio_source_sink_pair(io_type::HOST_BUFFER)); + }); + std::unordered_map> tables; + + if (sources.count("orders") or sources.count("lineitem") or sources.count("part")) { + auto [orders, lineitem, part] = cudf::datagen::generate_orders_lineitem_part( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + if (sources.count("orders")) { + write_to_parquet_device_buffer(orders, SCHEMAS.at("orders"), sources.at("orders")); + orders = {}; + } + if (sources.count("part")) { + write_to_parquet_device_buffer(part, SCHEMAS.at("part"), sources.at("part")); + part = {}; + } + if (sources.count("lineitem")) { + write_to_parquet_device_buffer(lineitem, SCHEMAS.at("lineitem"), sources.at("lineitem")); + lineitem = {}; + } + } + + if (sources.count("partsupp")) { + auto partsupp = cudf::datagen::generate_partsupp( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(partsupp, SCHEMAS.at("partsupp"), sources.at("partsupp")); + } - auto supplier = cudf::datagen::generate_supplier( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + if (sources.count("supplier")) { + auto supplier = cudf::datagen::generate_supplier( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(supplier, SCHEMAS.at("supplier"), sources.at("supplier")); + } - auto customer = cudf::datagen::generate_customer( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + if (sources.count("customer")) { + auto customer = cudf::datagen::generate_customer( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(customer, SCHEMAS.at("customer"), sources.at("customer")); + } - auto nation = cudf::datagen::generate_nation(cudf::get_default_stream(), - cudf::get_current_device_resource_ref()); + if (sources.count("nation")) { + auto nation = cudf::datagen::generate_nation(cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(nation, SCHEMAS.at("nation"), sources.at("nation")); + } - auto region = cudf::datagen::generate_region(cudf::get_default_stream(), - cudf::get_current_device_resource_ref()); + if (sources.count("region")) { + auto region = cudf::datagen::generate_region(cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(region, SCHEMAS.at("region"), sources.at("region")); + } - write_to_parquet_device_buffer(std::move(orders), ORDERS_SCHEMA, sources["orders"]); - write_to_parquet_device_buffer(std::move(lineitem), LINEITEM_SCHEMA, sources["lineitem"]); - write_to_parquet_device_buffer(std::move(part), PART_SCHEMA, sources["part"]); - write_to_parquet_device_buffer(std::move(partsupp), PARTSUPP_SCHEMA, sources["partsupp"]); - write_to_parquet_device_buffer(std::move(customer), 
CUSTOMER_SCHEMA, sources["customer"]); - write_to_parquet_device_buffer(std::move(supplier), SUPPLIER_SCHEMA, sources["supplier"]); - write_to_parquet_device_buffer(std::move(nation), NATION_SCHEMA, sources["nation"]); - write_to_parquet_device_buffer(std::move(region), REGION_SCHEMA, sources["region"]); + // Restore the original memory resource + if (!is_managed) { cudf::set_current_device_resource(old_mr); } } diff --git a/cpp/benchmarks/ndsh/utilities.hpp b/cpp/benchmarks/ndsh/utilities.hpp index 762e43deccf..cae07f86a98 100644 --- a/cpp/benchmarks/ndsh/utilities.hpp +++ b/cpp/benchmarks/ndsh/utilities.hpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "io/cuio_common.hpp" + #include #include #include @@ -196,24 +198,15 @@ std::tm make_tm(int year, int month, int day); int32_t days_since_epoch(int year, int month, int day); /** - * @brief Struct representing a parquet device buffer - */ -struct parquet_device_buffer { - parquet_device_buffer() : d_buffer{0, cudf::get_default_stream()} {}; - cudf::io::source_info make_source_info() { return cudf::io::source_info(d_buffer); } - rmm::device_uvector d_buffer; -}; - -/** - * @brief Write a `cudf::table` to a parquet device buffer + * @brief Write a `cudf::table` to a parquet cuio sink * * @param table The `cudf::table` to write * @param col_names The column names of the table - * @param parquet_device_buffer The parquet device buffer to write the table to + * @param source The source sink pair to write the table to */ void write_to_parquet_device_buffer(std::unique_ptr const& table, std::vector const& col_names, - parquet_device_buffer& source); + cuio_source_sink_pair& source); /** * @brief Generate NDS-H tables and write to parquet device buffers @@ -224,4 +217,4 @@ void write_to_parquet_device_buffer(std::unique_ptr const& table, */ void generate_parquet_data_sources(double scale_factor, std::vector const& table_names, - std::unordered_map& sources); + std::unordered_map& sources); diff --git a/cpp/benchmarks/reduction/histogram.cpp b/cpp/benchmarks/reduction/histogram.cpp new file mode 100644 index 00000000000..d0925de5c87 --- /dev/null +++ b/cpp/benchmarks/reduction/histogram.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
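As context for the data-generation changes above: the managed pool is a pool suballocator layered over CUDA managed memory, so NDS-H table generation can oversubscribe the GPU. A self-contained sketch of that helper, with the template arguments spelled out per our reading of the rmm owning-wrapper API (an assumption, not taken verbatim from this change):

    // Sketch: pool resource over managed memory, sized to 50% of free
    // device memory, mirroring the make_managed_pool helper above.
    #include <rmm/cuda_device.hpp>
    #include <rmm/mr/device/managed_memory_resource.hpp>
    #include <rmm/mr/device/owning_wrapper.hpp>
    #include <rmm/mr/device/pool_memory_resource.hpp>

    #include <memory>

    inline auto make_managed_pool()
    {
      return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
        std::make_shared<rmm::mr::managed_memory_resource>(),
        rmm::percent_of_free_device_memory(50));
    }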
diff --git a/cpp/benchmarks/reduction/histogram.cpp b/cpp/benchmarks/reduction/histogram.cpp new file mode 100644 index 00000000000..d0925de5c87 --- /dev/null +++ b/cpp/benchmarks/reduction/histogram.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cudf/aggregation.hpp" +#include "cudf/detail/aggregation/aggregation.hpp" + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +template <typename type> +static void nvbench_reduction_histogram(nvbench::state& state, nvbench::type_list<type>) +{ + auto const dtype = cudf::type_to_id<type>(); + + auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality")); + auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows")); + auto const null_probability = state.get_float64("null_probability"); + + if (cardinality > num_rows) { + state.skip("cardinality > num_rows"); + return; + } + + data_profile const profile = data_profile_builder() + .null_probability(null_probability) + .cardinality(cardinality) + .distribution(dtype, distribution_id::UNIFORM, 0, num_rows); + + auto const input = create_random_column(dtype, row_count{num_rows}, profile); + auto agg = cudf::make_histogram_aggregation<cudf::reduce_aggregation>(); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + auto result = cudf::reduce(*input, *agg, input->type(), stream_view); + }); + + state.add_element_count(input->size()); +} + +using data_type = nvbench::type_list<int32_t, int64_t>; + +NVBENCH_BENCH_TYPES(nvbench_reduction_histogram, NVBENCH_TYPE_AXES(data_type)) + .set_name("histogram") + .add_float64_axis("null_probability", {0.1}) + .add_int64_axis("cardinality", + {0, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000, 50'000'000}) + .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}); diff --git a/cpp/benchmarks/string/make_strings_column.cu b/cpp/benchmarks/string/make_strings_column.cu new file mode 100644 index 00000000000..e86824b9f40 --- /dev/null +++ b/cpp/benchmarks/string/make_strings_column.cu @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include +#include + +#include +#include + +#include + +#include + +namespace { + +constexpr int min_row_width = 0; +constexpr int max_row_width = 50; + +using string_index_pair = thrust::pair<char const*, cudf::size_type>; + +template <bool batch_construction> +std::vector<std::unique_ptr<cudf::column>> make_strings_columns( + std::vector<cudf::device_span<string_index_pair const>> const& input, + rmm::cuda_stream_view stream) +{ + if constexpr (batch_construction) { + return cudf::make_strings_column_batch(input, stream); + } else { + std::vector<std::unique_ptr<cudf::column>> output; + output.reserve(input.size()); + for (auto const& column_input : input) { + output.emplace_back(cudf::make_strings_column(column_input, stream)); + } + return output; + } +} + +} // namespace + +static void BM_make_strings_column_batch(nvbench::state& state) +{ + auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows")); + auto const batch_size = static_cast<cudf::size_type>(state.get_int64("batch_size")); + auto const has_nulls = true; + + data_profile const table_profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_row_width, max_row_width) + .null_probability(has_nulls ?
std::optional{0.1} : std::nullopt); + auto const data_table = create_random_table( + cycle_dtypes({cudf::type_id::STRING}, batch_size), row_count{num_rows}, table_profile); + + auto const stream = cudf::get_default_stream(); + auto input_data = std::vector<rmm::device_uvector<string_index_pair>>{}; + auto input = std::vector<cudf::device_span<string_index_pair const>>{}; + input_data.reserve(batch_size); + input.reserve(batch_size); + for (auto const& cv : data_table->view()) { + auto const d_data_ptr = cudf::column_device_view::create(cv, stream); + auto batch_input = rmm::device_uvector<string_index_pair>(cv.size(), stream); + thrust::tabulate(rmm::exec_policy(stream), + batch_input.begin(), + batch_input.end(), + [data_col = *d_data_ptr] __device__(auto const idx) { + if (data_col.is_null(idx)) { return string_index_pair{nullptr, 0}; } + auto const row = data_col.element<cudf::string_view>(idx); + return string_index_pair{row.data(), row.size_bytes()}; + }); + input_data.emplace_back(std::move(batch_input)); + input.emplace_back(input_data.back()); + } + + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + [[maybe_unused]] auto const output = make_strings_columns<true>(input, stream); + }); +} + +NVBENCH_BENCH(BM_make_strings_column_batch) + .set_name("make_strings_column_batch") + .add_int64_axis("num_rows", {100'000, 500'000, 1'000'000, 2'000'000}) + .add_int64_axis("batch_size", {10, 20, 50, 100}); diff --git a/cpp/benchmarks/text/ngrams.cpp b/cpp/benchmarks/text/ngrams.cpp index 8e48f8e9a05..43d57201b20 100644 --- a/cpp/benchmarks/text/ngrams.cpp +++ b/cpp/benchmarks/text/ngrams.cpp @@ -15,58 +15,45 @@ */ #include -#include -#include -#include #include #include #include -class TextNGrams : public cudf::benchmark {}; +#include -enum class ngrams_type { tokens, characters }; - -static void BM_ngrams(benchmark::State& state, ngrams_type nt) +static void bench_ngrams(nvbench::state& state) { - auto const n_rows = static_cast<cudf::size_type>(state.range(0)); - auto const max_str_length = static_cast<cudf::size_type>(state.range(1)); + auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows")); + auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width")); + auto const ngram_type = state.get_string("type"); + data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); auto const separator = cudf::string_scalar("_"); - for (auto _ : state) { - cuda_event_timer raii(state, true); - switch (nt) { - case ngrams_type::tokens: nvtext::generate_ngrams(input, 2, separator); break; - case ngrams_type::characters: nvtext::generate_character_ngrams(input); break; - } - } + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); -} + auto chars_size = input.chars_size(cudf::get_default_stream()); + state.add_global_memory_reads<nvbench::int8_t>(chars_size); + state.add_global_memory_writes<nvbench::int8_t>(chars_size * 2); -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 5; - int const max_rowlen = 40; - int const len_mult = 2; -
generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); + if (ngram_type == "chars") { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::generate_character_ngrams(input); + }); + } else { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::generate_ngrams(input, 2, separator); + }); + } } -#define NVTEXT_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(TextNGrams, name) \ - (::benchmark::State & st) { BM_ngrams(st, ngrams_type::name); } \ - BENCHMARK_REGISTER_F(TextNGrams, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -NVTEXT_BENCHMARK_DEFINE(tokens) -NVTEXT_BENCHMARK_DEFINE(characters) +NVBENCH_BENCH(bench_ngrams) + .set_name("ngrams") + .add_int64_axis("num_rows", {131072, 262144, 524288, 1048578}) + .add_int64_axis("row_width", {10, 20, 40, 100}) + .add_string_axis("type", {"chars", "tokens"}); diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake index 8df1b431095..d7d7fcca044 100644 --- a/cpp/cmake/thirdparty/get_nanoarrow.cmake +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -14,15 +14,17 @@ # This function finds nanoarrow and sets any additional necessary environment variables. function(find_and_configure_nanoarrow) + include(${rapids-cmake-dir}/cpm/package_override.cmake) + + set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") + rapids_cpm_package_override("${cudf_patch_dir}/nanoarrow_override.json") + # Currently we need to always build nanoarrow so we don't pickup a previous installed version set(CPM_DOWNLOAD_nanoarrow ON) rapids_cpm_find( nanoarrow 0.6.0.dev GLOBAL_TARGETS nanoarrow CPM_ARGS - GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git - GIT_TAG 1e2664a70ec14907409cadcceb14d79b9670bcdb - GIT_SHALLOW FALSE OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ) set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/cpp/cmake/thirdparty/get_nvcomp.cmake b/cpp/cmake/thirdparty/get_nvcomp.cmake index 41bbf44abc8..33b1b45fb44 100644 --- a/cpp/cmake/thirdparty/get_nvcomp.cmake +++ b/cpp/cmake/thirdparty/get_nvcomp.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -16,11 +16,11 @@ function(find_and_configure_nvcomp) include(${rapids-cmake-dir}/cpm/nvcomp.cmake) - rapids_cpm_nvcomp( - BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports - USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP} - ) + set(export_args) + if(CUDF_EXPORT_NVCOMP) + set(export_args BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) + endif() + rapids_cpm_nvcomp(${export_args} USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP}) # Per-thread default stream if(TARGET nvcomp AND CUDF_USE_PER_THREAD_DEFAULT_STREAM) diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff b/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff new file mode 100644 index 00000000000..e9a36fcb567 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff @@ -0,0 +1,38 @@ +diff --git a/src/nanoarrow/common/inline_buffer.h b/src/nanoarrow/common/inline_buffer.h +index caa6be4..70ec8a2 100644 +--- a/src/nanoarrow/common/inline_buffer.h ++++ b/src/nanoarrow/common/inline_buffer.h +@@ -347,7 +347,7 @@ static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) { + } + + static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { +- *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | ++ *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | // NOLINT + ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | + ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | + ((values[7] + 0x7f) & 0x80)); +@@ -471,13 +471,13 @@ static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t l + // set bits within a single byte + const uint8_t only_byte_mask = + i_end % 8 == 0 ? 
first_byte_mask : (uint8_t)(first_byte_mask | last_byte_mask); +- bits[bytes_begin] &= only_byte_mask; ++ bits[bytes_begin] &= only_byte_mask; // NOLINT + bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask); + return; + } + + // set/clear trailing bits of first byte +- bits[bytes_begin] &= first_byte_mask; ++ bits[bytes_begin] &= first_byte_mask; // NOLINT + bits[bytes_begin] |= (uint8_t)(fill_byte & ~first_byte_mask); + + if (bytes_end - bytes_begin > 2) { +@@ -637,7 +637,7 @@ static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, + n_remaining -= n_full_bytes * 8; + if (n_remaining > 0) { + // Zero out the last byte +- *out_cursor = 0x00; ++ *out_cursor = 0x00; // NOLINT + for (int i = 0; i < n_remaining; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]); + } diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_override.json b/cpp/cmake/thirdparty/patches/nanoarrow_override.json new file mode 100644 index 00000000000..d529787e7c8 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/nanoarrow_override.json @@ -0,0 +1,18 @@ + +{ + "packages" : { + "nanoarrow" : { + "version" : "0.6.0.dev", + "git_url" : "https://github.com/apache/arrow-nanoarrow.git", + "git_tag" : "1e2664a70ec14907409cadcceb14d79b9670bcdb", + "git_shallow" : false, + "patches" : [ + { + "file" : "${current_json_dir}/nanoarrow_clang_tidy_compliance.diff", + "issue" : "https://github.com/apache/arrow-nanoarrow/issues/537", + "fixed_in" : "" + } + ] + } + } +} diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index fce8adb4c06..311539efbfc 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -370,7 +370,7 @@ any type that cudf supports. 
For example, a `list_scalar` representing a list of |Value type|Scalar class|Notes| |-|-|-| |fixed-width|`fixed_width_scalar`| `T` can be any fixed-width type| -|numeric|`numeric_scalar` | `T` can be `int8_t`, `int16_t`, `int32_t`, `int_64_t`, `float` or `double`| +|numeric|`numeric_scalar` | `T` can be `int8_t`, `int16_t`, `int32_t`, `int64_t`, `float` or `double`| |fixed-point|`fixed_point_scalar` | `T` can be `numeric::decimal32` or `numeric::decimal64`| |timestamp|`timestamp_scalar` | `T` can be `timestamp_D`, `timestamp_s`, etc.| |duration|`duration_scalar` | `T` can be `duration_D`, `duration_s`, etc.| diff --git a/cpp/doxygen/regex.md b/cpp/doxygen/regex.md index 6d1c91a5752..6902b1948bd 100644 --- a/cpp/doxygen/regex.md +++ b/cpp/doxygen/regex.md @@ -8,6 +8,7 @@ This page specifies which regular expression (regex) features are currently supp - cudf::strings::extract() - cudf::strings::extract_all_record() - cudf::strings::findall() +- cudf::strings::find_re() - cudf::strings::replace_re() - cudf::strings::replace_with_backrefs() - cudf::strings::split_re() diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt index d8e9205ffd4..a7d0146b170 100644 --- a/cpp/examples/parquet_io/CMakeLists.txt +++ b/cpp/examples/parquet_io/CMakeLists.txt @@ -16,10 +16,23 @@ project( include(../fetch_dependencies.cmake) -# Configure your project here +add_library(parquet_io_utils OBJECT common_utils.cpp io_source.cpp) +target_compile_features(parquet_io_utils PRIVATE cxx_std_17) +target_link_libraries(parquet_io_utils PRIVATE cudf::cudf) + +# Build and install parquet_io add_executable(parquet_io parquet_io.cpp) -target_link_libraries(parquet_io PRIVATE cudf::cudf) +target_link_libraries(parquet_io PRIVATE cudf::cudf nvToolsExt $<TARGET_OBJECTS:parquet_io_utils>) target_compile_features(parquet_io PRIVATE cxx_std_17) - install(TARGETS parquet_io DESTINATION bin/examples/libcudf) + +# Build and install parquet_io_multithreaded +add_executable(parquet_io_multithreaded parquet_io_multithreaded.cpp) +target_link_libraries( + parquet_io_multithreaded PRIVATE cudf::cudf nvToolsExt $<TARGET_OBJECTS:parquet_io_utils> +) +target_compile_features(parquet_io_multithreaded PRIVATE cxx_std_17) +install(TARGETS parquet_io_multithreaded DESTINATION bin/examples/libcudf) + +# Install the example.parquet file install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.parquet DESTINATION bin/examples/libcudf) diff --git a/cpp/examples/parquet_io/parquet_io.hpp b/cpp/examples/parquet_io/common_utils.cpp similarity index 50% rename from cpp/examples/parquet_io/parquet_io.hpp rename to cpp/examples/parquet_io/common_utils.cpp index e27cbec4fce..a79ca48af86 100644 --- a/cpp/examples/parquet_io/parquet_io.hpp +++ b/cpp/examples/parquet_io/common_utils.cpp @@ -14,30 +14,27 @@ * limitations under the License. */ -#pragma once +#include "common_utils.hpp" -#include +#include #include #include #include -#include -#include #include #include #include #include -#include -#include +#include #include /** - * @brief Create memory resource for libcudf functions + * @file common_utils.cpp + * @brief Definitions for common utilities for `parquet_io` examples * - * @param pool Whether to use a pool memory resource.
- * @return Memory resource instance */ + std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(bool is_pool_used) { auto cuda_mr = std::make_shared<rmm::mr::cuda_memory_resource>(); @@ -48,17 +45,11 @@ std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(bool is_ return cuda_mr; } -/** - * @brief Get encoding type from the keyword - * - * @param name encoding keyword name - * @return corresponding column encoding type - */ -[[nodiscard]] cudf::io::column_encoding get_encoding_type(std::string name) +cudf::io::column_encoding get_encoding_type(std::string name) { using encoding_type = cudf::io::column_encoding; - static const std::unordered_map<std::string, encoding_type> map = { + static std::unordered_map<std::string, encoding_type> const map = { {"DEFAULT", encoding_type::USE_DEFAULT}, {"DICTIONARY", encoding_type::DICTIONARY}, {"PLAIN", encoding_type::PLAIN}, @@ -69,26 +60,18 @@ std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(bool is_ std::transform(name.begin(), name.end(), name.begin(), ::toupper); if (map.find(name) != map.end()) { return map.at(name); } - throw std::invalid_argument("FATAL: " + std::string(name) + + throw std::invalid_argument(name + " is not a valid encoding type.\n\n" "Available encoding types: DEFAULT, DICTIONARY, PLAIN,\n" "DELTA_BINARY_PACKED, DELTA_LENGTH_BYTE_ARRAY,\n" - "DELTA_BYTE_ARRAY\n" - "\n" - "Exiting...\n"); + "DELTA_BYTE_ARRAY\n\n"); } -/** - * @brief Get compression type from the keyword - * - * @param name compression keyword name - * @return corresponding compression type - */ -[[nodiscard]] cudf::io::compression_type get_compression_type(std::string name) +cudf::io::compression_type get_compression_type(std::string name) { using compression_type = cudf::io::compression_type; - static const std::unordered_map<std::string, compression_type> map = { + static std::unordered_map<std::string, compression_type> const map = { {"NONE", compression_type::NONE}, {"AUTO", compression_type::AUTO}, {"SNAPPY", compression_type::SNAPPY}, @@ -97,30 +80,58 @@ std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(bool is_ std::transform(name.begin(), name.end(), name.begin(), ::toupper); if (map.find(name) != map.end()) { return map.at(name); } - throw std::invalid_argument("FATAL: " + std::string(name) + + throw std::invalid_argument(name + " is not a valid compression type.\n\n" - "Available compression_type types: NONE, AUTO, SNAPPY,\n" - "LZ4, ZSTD\n" - "\n" - "Exiting...\n"); + "Available compression types: NONE, AUTO, SNAPPY,\n" + "LZ4, ZSTD\n\n"); } -/** - * @brief Get the optional page size stat frequency from they keyword - * - * @param use_stats keyword affirmation string such as: Y, T, YES, TRUE, ON - * @return optional page statistics frequency set to full (STATISTICS_COLUMN) - */ -[[nodiscard]] std::optional<cudf::io::statistics_freq> get_page_size_stats(std::string use_stats) +bool get_boolean(std::string input) { - std::transform(use_stats.begin(), use_stats.end(), use_stats.begin(), ::toupper); + std::transform(input.begin(), input.end(), input.begin(), ::toupper); // Check if the input string matches to any of the following - if (not use_stats.compare("ON") or not use_stats.compare("TRUE") or - not use_stats.compare("YES") or not use_stats.compare("Y") or not use_stats.compare("T")) { - // Full column and offset indices - STATISTICS_COLUMN - return std::make_optional(cudf::io::statistics_freq::STATISTICS_COLUMN); + return input == "ON" or input == "TRUE" or input == "YES" or input == "Y" or input == "T"; +} + +void check_tables_equal(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table) +{ + try { + // Left anti-join the original and transcoded tables + // identical tables should not throw an exception and + // return an empty indices vector + auto const
indices = cudf::left_anti_join(lhs_table, rhs_table, cudf::null_equality::EQUAL); + + // No exception thrown, check indices + auto const valid = indices->size() == 0; + std::cout << "Tables identical: " << valid << "\n\n"; + } catch (std::exception& e) { + std::cerr << e.what() << std::endl << std::endl; + throw std::runtime_error("Tables identical: false\n\n"); } } - return std::nullopt; +std::unique_ptr<cudf::table> concatenate_tables(std::vector<std::unique_ptr<cudf::table>> tables, + rmm::cuda_stream_view stream) +{ + if (tables.size() == 1) { return std::move(tables[0]); } + + std::vector<cudf::table_view> table_views; + table_views.reserve(tables.size()); + std::transform( + tables.begin(), tables.end(), std::back_inserter(table_views), [&](auto const& tbl) { + return tbl->view(); + }); + // Construct the final table + return cudf::concatenate(table_views, stream); +} + +std::string current_date_and_time() +{ + auto const time = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + auto const local_time = *std::localtime(&time); + // Stringstream to format the date and time + std::stringstream ss; + ss << std::put_time(&local_time, "%Y-%m-%d-%H-%M-%S"); + return ss.str(); }
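A quick sketch of how these keyword helpers compose (values invented for illustration; parsing is case-insensitive, and invalid keywords throw std::invalid_argument):

auto const encoding    = get_encoding_type("delta_byte_array");
auto const compression = get_compression_type("zstd");
bool const page_stats  = get_boolean("yes");  // also accepts ON/TRUE/Y/T
auto mr                = create_memory_resource(true /*pooled*/);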
diff --git a/cpp/examples/parquet_io/common_utils.hpp b/cpp/examples/parquet_io/common_utils.hpp new file mode 100644 index 00000000000..12896e61a0d --- /dev/null +++ b/cpp/examples/parquet_io/common_utils.hpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +#include +#include + +/** + * @file common_utils.hpp + * @brief Common utilities for `parquet_io` examples + * + */ + +/** + * @brief Create memory resource for libcudf functions + * + * @param is_pool_used Whether to use a pool memory resource. + * @return Memory resource instance + */ +std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(bool is_pool_used); + +/** + * @brief Get encoding type from the keyword + * + * @param name encoding keyword name + * @return corresponding column encoding type + */ +[[nodiscard]] cudf::io::column_encoding get_encoding_type(std::string name); + +/** + * @brief Get compression type from the keyword + * + * @param name compression keyword name + * @return corresponding compression type + */ +[[nodiscard]] cudf::io::compression_type get_compression_type(std::string name); + +/** + * @brief Get boolean from the keyword + * + * @param input keyword affirmation string such as: Y, T, YES, TRUE, ON + * @return true or false + */ +[[nodiscard]] bool get_boolean(std::string input); + +/** + * @brief Check if two tables are identical, throw an error otherwise + * + * @param lhs_table View to lhs table + * @param rhs_table View to rhs table + */ +void check_tables_equal(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table); + +/** + * @brief Concatenate a vector of tables and return the resultant table + * + * @param tables Vector of tables to concatenate + * @param stream CUDA stream to use + * + * @return Unique pointer to the resultant concatenated table. + */ +std::unique_ptr<cudf::table> concatenate_tables(std::vector<std::unique_ptr<cudf::table>> tables, + rmm::cuda_stream_view stream); + +/** + * @brief Returns a string containing current date and time + * + */ +std::string current_date_and_time(); diff --git a/cpp/examples/parquet_io/io_source.cpp b/cpp/examples/parquet_io/io_source.cpp new file mode 100644 index 00000000000..019b3f96474 --- /dev/null +++ b/cpp/examples/parquet_io/io_source.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "io_source.hpp" + +#include +#include + +#include +#include + +#include + +#include +#include +#include + +rmm::host_async_resource_ref pinned_memory_resource() +{ + static auto mr = rmm::mr::pinned_host_memory_resource{}; + return mr; +} + +io_source_type get_io_source_type(std::string name) +{ + static std::unordered_map<std::string, io_source_type> const map = { + {"FILEPATH", io_source_type::FILEPATH}, + {"HOST_BUFFER", io_source_type::HOST_BUFFER}, + {"PINNED_BUFFER", io_source_type::PINNED_BUFFER}, + {"DEVICE_BUFFER", io_source_type::DEVICE_BUFFER}}; + + std::transform(name.begin(), name.end(), name.begin(), ::toupper); + if (map.find(name) != map.end()) { + return map.at(name); + } else { + throw std::invalid_argument(name + + " is not a valid io source type.
Available: FILEPATH,\n" + "HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER.\n\n"); + } +} + +io_source::io_source(std::string_view file_path, io_source_type type, rmm::cuda_stream_view stream) + : pinned_buffer({pinned_memory_resource(), stream}), d_buffer{0, stream} +{ + std::string const file_name{file_path}; + auto const file_size = std::filesystem::file_size(file_name); + + // For filepath make a quick source_info and return early + if (type == io_source_type::FILEPATH) { + source_info = cudf::io::source_info(file_name); + return; + } + + std::ifstream file{file_name, std::ifstream::binary}; + + // Copy file contents to the specified io source buffer + switch (type) { + case io_source_type::HOST_BUFFER: { + h_buffer.resize(file_size); + file.read(h_buffer.data(), file_size); + source_info = cudf::io::source_info(h_buffer.data(), file_size); + break; + } + case io_source_type::PINNED_BUFFER: { + pinned_buffer.resize(file_size); + file.read(pinned_buffer.data(), file_size); + source_info = cudf::io::source_info(pinned_buffer.data(), file_size); + break; + } + case io_source_type::DEVICE_BUFFER: { + h_buffer.resize(file_size); + file.read(h_buffer.data(), file_size); + d_buffer.resize(file_size, stream); + CUDF_CUDA_TRY(cudaMemcpyAsync( + d_buffer.data(), h_buffer.data(), file_size, cudaMemcpyDefault, stream.value())); + + source_info = cudf::io::source_info(d_buffer); + break; + } + default: { + throw std::runtime_error("Encountered unexpected source type\n\n"); + } + } +} diff --git a/cpp/examples/parquet_io/io_source.hpp b/cpp/examples/parquet_io/io_source.hpp new file mode 100644 index 00000000000..a614d348fae --- /dev/null +++ b/cpp/examples/parquet_io/io_source.hpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +#include + +#include + +/** + * @file io_source.hpp + * @brief Utilities for constructing the specified IO sources from the input parquet files. + * + */ + +/** + * @brief Available IO source types + */ +enum class io_source_type { FILEPATH, HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER }; + +/** + * @brief Get io source type from the string keyword argument + * + * @param name io source type keyword name + * @return io source type + */ +[[nodiscard]] io_source_type get_io_source_type(std::string name); + +/** + * @brief Create and return a reference to a static pinned memory pool + * + * @return Reference to a static pinned memory pool + */ +rmm::host_async_resource_ref pinned_memory_resource(); + +/** + * @brief Custom allocator for pinned_buffer via RMM. 
+ */ +template <typename T> +struct pinned_allocator : public std::allocator<T> { + pinned_allocator(rmm::host_async_resource_ref _mr, rmm::cuda_stream_view _stream) + : mr{_mr}, stream{_stream} + { + } + + T* allocate(std::size_t n) + { + auto ptr = mr.allocate_async(n * sizeof(T), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); + stream.synchronize(); + return static_cast<T*>(ptr); + } + + void deallocate(T* ptr, std::size_t n) + { + mr.deallocate_async(ptr, n * sizeof(T), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); + } + + private: + rmm::host_async_resource_ref mr; + rmm::cuda_stream_view stream; +}; + +/** + * @brief Class to create a cudf::io::source_info of given type from the input parquet file + * + */ +class io_source { + public: + io_source(std::string_view file_path, io_source_type io_type, rmm::cuda_stream_view stream); + + // Get the internal source info + [[nodiscard]] cudf::io::source_info get_source_info() const { return source_info; } + + private: + // alias for pinned vector + template <typename T> + using pinned_vector = thrust::host_vector<T, pinned_allocator<T>>; + cudf::io::source_info source_info; + std::vector<char> h_buffer; + pinned_vector<char> pinned_buffer; + rmm::device_uvector<std::byte> d_buffer; +};
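The allocator above is what lets an ordinary Thrust host vector draw from RMM's pinned pool; a minimal sketch, assuming a Thrust version whose host_vector accepts an allocator instance (buffer size invented):

// Hypothetical staging buffer in pinned memory.
auto const stream = cudf::get_default_stream();
pinned_allocator<char> alloc{pinned_memory_resource(), stream};
thrust::host_vector<char, pinned_allocator<char>> staging(alloc);
staging.resize(1 << 20);  // allocate_async on the pinned pool, then a stream sync, inside the allocator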
diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 9cda22d0695..c11b8de82b5 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -14,11 +14,15 @@ * limitations under the License. */ -#include "parquet_io.hpp" - #include "../utilities/timer.hpp" +#include "common_utils.hpp" +#include "io_source.hpp" + +#include +#include +#include -#include +#include /** * @file parquet_io.cpp @@ -81,6 +85,18 @@ void write_parquet(cudf::table_view input, cudf::io::write_parquet(options); } +/** + * @brief Function to print example usage and argument information. + */ +void print_usage() +{ + std::cout << "\nUsage: parquet_io <input parquet file> <output parquet file> <encoding type>\n" + "                  <compression type> <write page stats: Y/N>\n\n" + "Available encoding types: DEFAULT, DICTIONARY, PLAIN, DELTA_BINARY_PACKED,\n" + "                          DELTA_LENGTH_BYTE_ARRAY, DELTA_BYTE_ARRAY\n\n" + "Available compression types: NONE, AUTO, SNAPPY, LZ4, ZSTD\n\n"; +} + /** * @brief Main for nested_types examples * @@ -97,29 +113,28 @@ void write_parquet(cudf::table_view input, */ int main(int argc, char const** argv) { - std::string input_filepath; - std::string output_filepath; - cudf::io::column_encoding encoding; - cudf::io::compression_type compression; - std::optional page_stats; + std::string input_filepath = "example.parquet"; + std::string output_filepath = "output.parquet"; + cudf::io::column_encoding encoding = get_encoding_type("DELTA_BINARY_PACKED"); + cudf::io::compression_type compression = get_compression_type("ZSTD"); + std::optional page_stats = std::nullopt; switch (argc) { - case 1: - input_filepath = "example.parquet"; - output_filepath = "output.parquet"; - encoding = get_encoding_type("DELTA_BINARY_PACKED"); - compression = get_compression_type("ZSTD"); - break; - case 6: page_stats = get_page_size_stats(argv[5]); [[fallthrough]]; - case 5: - input_filepath = argv[1]; - output_filepath = argv[2]; - encoding = get_encoding_type(argv[3]); - compression = get_compression_type(argv[4]); - break; - default: - throw std::runtime_error( - "Either provide all command-line arguments, or none to use defaults\n"); + case 6: + page_stats = get_boolean(argv[5]) + ? std::make_optional(cudf::io::statistics_freq::STATISTICS_COLUMN) + : std::nullopt; + [[fallthrough]]; + case 5: compression = get_compression_type(argv[4]); [[fallthrough]]; + case 4: encoding = get_encoding_type(argv[3]); [[fallthrough]]; + case 3: output_filepath = argv[2]; [[fallthrough]]; + case 2: // Check if instead of input_paths, the first argument is `-h` or `--help` + if (auto arg = std::string{argv[1]}; arg != "-h" and arg != "--help") { + input_filepath = std::move(arg); + break; + } + [[fallthrough]]; + default: print_usage(); throw std::runtime_error(""); } // Create and use a memory pool @@ -130,18 +145,16 @@ int main(int argc, char const** argv) // Read input parquet file // We do not want to time the initial read time as it may include // time for nvcomp, cufile loading and RMM growth - std::cout << std::endl << "Reading " << input_filepath << "..." << std::endl; + std::cout << "\nReading " << input_filepath << "...\n"; std::cout << "Note: Not timing the initial parquet read as it may include\n" - "times for nvcomp, cufile loading and RMM growth." - << std::endl - << std::endl; + "times for nvcomp, cufile loading and RMM growth.\n\n"; auto [input, metadata] = read_parquet(input_filepath); // Status string to indicate if page stats are set to be written or not auto page_stat_string = (page_stats.has_value()) ? "page stats" : "no page stats"; // Write parquet file with the specified encoding and compression std::cout << "Writing " << output_filepath << " with encoding, compression and " - << page_stat_string << ".." << std::endl; + << page_stat_string << "..\n"; // `timer` is automatically started here cudf::examples::timer timer; @@ -149,7 +162,7 @@ int main(int argc, char const** argv) timer.print_elapsed_millis(); // Read the parquet file written with encoding and compression - std::cout << "Reading " << output_filepath << "..." << std::endl; + std::cout << "Reading " << output_filepath << "...\n"; // Reset the timer timer.reset(); @@ -157,23 +170,7 @@ int main(int argc, char const** argv) timer.print_elapsed_millis(); // Check for validity - try { - // Left anti-join the original and transcoded tables - // identical tables should not throw an exception and - // return an empty indices vector - auto const indices = cudf::left_anti_join(input->view(), - transcoded_input->view(), - cudf::null_equality::EQUAL, - cudf::get_default_stream(), - resource.get()); - - // No exception thrown, check indices - auto const valid = indices->size() == 0; - std::cout << "Transcoding valid: " << std::boolalpha << valid << std::endl; - } catch (std::exception& e) { - std::cerr << e.what() << std::endl << std::endl; - std::cout << "Transcoding valid: false" << std::endl; - } + check_tables_equal(input->view(), transcoded_input->view()); return 0; } diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp new file mode 100644 index 00000000000..6ad4b862240 --- /dev/null +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../utilities/timer.hpp" +#include "common_utils.hpp" +#include "io_source.hpp" + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +/** + * @file parquet_io_multithreaded.cpp + * @brief Demonstrates reading parquet data from the specified io source using multiple threads. + * + * The input parquet data is provided via files which are converted to the specified io source type + * to be read using multiple threads. Optionally, the parquet data read by each thread can be + * written to corresponding files and checked for validity against the input data. + * + * Run: ``parquet_io_multithreaded -h`` to see help with input args and more information. + * + * The following io source types are supported: + * IO source types: FILEPATH, HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER + * + */ + +// Type alias for unique ptr to cudf table +using table_t = std::unique_ptr<cudf::table>; + +/** + * @brief Behavior when handling the read tables by multiple threads + */ +enum class read_mode { + NO_CONCATENATE, ///< Only read and discard tables + CONCATENATE_THREAD, ///< Read and concatenate tables from each thread + CONCATENATE_ALL, ///< Read and concatenate everything to a single table +}; + +/** + * @brief Functor for multithreaded parquet reading based on the provided read_mode + */ +template <read_mode READ_MODE> +struct read_fn { + std::vector<io_source> const& input_sources; + std::vector<table_t>& tables; + int const thread_id; + int const thread_count; + rmm::cuda_stream_view stream; + + void operator()() + { + // Tables read by this thread + std::vector<table_t> tables_this_thread; + + // Sweep the available input files + for (auto curr_file_idx = thread_id; curr_file_idx < input_sources.size(); + curr_file_idx += thread_count) { + auto builder = + cudf::io::parquet_reader_options::builder(input_sources[curr_file_idx].get_source_info()); + auto const options = builder.build(); + if constexpr (READ_MODE != read_mode::NO_CONCATENATE) { + tables_this_thread.push_back(cudf::io::read_parquet(options, stream).tbl); + } else { + cudf::io::read_parquet(options, stream); + } + } + + // Concatenate the tables read by this thread if not NO_CONCATENATE read_mode. + if constexpr (READ_MODE != read_mode::NO_CONCATENATE) { + auto table = concatenate_tables(std::move(tables_this_thread), stream); + stream.synchronize_no_throw(); + tables[thread_id] = std::move(table); + } else { + // Just synchronize this stream and exit + stream.synchronize_no_throw(); + } + } +}; + +/** + * @brief Function to setup and launch multithreaded parquet reading. + * + * @tparam READ_MODE Specifies whether to concatenate and return the actual + * tables, or discard them and return an empty vector + * + * @param input_sources List of input sources to read + * @param thread_count Number of threads + * @param stream_pool CUDA stream pool to use for threads + * + * @return Vector of read tables.
+ */ +template <read_mode READ_MODE> +std::vector<table_t> read_parquet_multithreaded(std::vector<io_source> const& input_sources, + int32_t thread_count, + rmm::cuda_stream_pool& stream_pool) +{ + // Tables read by each thread + std::vector<table_t> tables(thread_count); + + // Table reading tasks + std::vector<read_fn<READ_MODE>> read_tasks; + read_tasks.reserve(thread_count); + + // Create the read tasks + std::for_each( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(thread_count), [&](auto tid) { + read_tasks.emplace_back( + read_fn<READ_MODE>{input_sources, tables, tid, thread_count, stream_pool.get_stream()}); + }); + + // Create threads with tasks + std::vector<std::thread> threads; + threads.reserve(thread_count); + for (auto& c : read_tasks) { + threads.emplace_back(c); + } + for (auto& t : threads) { + t.join(); + } + + // If CONCATENATE_ALL mode, then concatenate to a vector of one final table. + if constexpr (READ_MODE == read_mode::CONCATENATE_ALL) { + auto stream = stream_pool.get_stream(); + auto final_tbl = concatenate_tables(std::move(tables), stream); + stream.synchronize(); + tables.clear(); + tables.emplace_back(std::move(final_tbl)); + } + + return tables; +} + +/** + * @brief Functor for multithreaded parquet writing + */ +struct write_fn { + std::string const& output_path; + std::vector<cudf::table_view> const& table_views; + int const thread_id; + rmm::cuda_stream_view stream; + + void operator()() + { + // Create a sink + cudf::io::sink_info const sink_info{output_path + "/table_" + std::to_string(thread_id) + + ".parquet"}; + // Writer options builder + auto builder = cudf::io::parquet_writer_options::builder(sink_info, table_views[thread_id]); + // Create a new metadata for the table + auto table_metadata = cudf::io::table_input_metadata{table_views[thread_id]}; + + builder.metadata(table_metadata); + auto options = builder.build(); + + // Write parquet data + cudf::io::write_parquet(options, stream); + + // Done with this stream + stream.synchronize_no_throw(); + } +}; + +/** + * @brief Function to setup and launch multithreaded writing parquet files. + * + * @param output_path Path to output directory + * @param tables List of at least `thread_count` table views to be written + * @param thread_count Number of threads to use for writing tables. + * @param stream_pool CUDA stream pool to use for threads + * + */ +void write_parquet_multithreaded(std::string const& output_path, + std::vector<cudf::table_view> const& tables, + int32_t thread_count, + rmm::cuda_stream_pool& stream_pool) +{ + // Table writing tasks + std::vector<write_fn> write_tasks; + write_tasks.reserve(thread_count); + std::for_each( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(thread_count), [&](auto tid) { + write_tasks.emplace_back(write_fn{output_path, tables, tid, stream_pool.get_stream()}); + }); + + // Writer threads + std::vector<std::thread> threads; + threads.reserve(thread_count); + for (auto& c : write_tasks) { + threads.emplace_back(c); + } + for (auto& t : threads) { + t.join(); + } +} + +/** + * @brief Function to print example usage and argument information. + */ +void print_usage() +{ + std::cout + << "\nUsage: parquet_io_multithreaded <comma delimited list of dirs and/or files>\n" + "       <input files multiplier> <io source type> <number of reads>\n" + "       <thread count> <write to temp files and validate: Y/N>\n\n" + "Available IO source types: FILEPATH, HOST_BUFFER, PINNED_BUFFER (Default), " + "DEVICE_BUFFER\n\n" + "Note: Provide as many arguments as you like in the above order. Default values\n" + " for the unprovided arguments will be used.
All input parquet files will\n" " be converted to the specified IO source type before reading\n\n"; } + +/** + * @brief Function to process comma delimited input paths string to parquet files and/or dirs + * and convert them to specified io sources. + * + * Process the input path string containing directories (of parquet files) and/or individual + * parquet files into a list of input parquet files, multiply the list by `input_multiplier`, + * make sure to have at least `thread_count` files to satisfy at least one file per parallel thread, + * and convert the final list of files to a list of `io_source` and return. + * + * @param paths Comma delimited input paths string + * @param input_multiplier Multiplier for the input files list + * @param thread_count Number of threads being used in the example + * @param io_source_type Specified IO source type to convert input files to + * @param stream CUDA stream to use + * + * @return Vector of input sources for the given paths + */ +std::vector<io_source> extract_input_sources(std::string const& paths, + int32_t input_multiplier, + int32_t thread_count, + io_source_type io_source_type, + rmm::cuda_stream_view stream) +{ + // Get the delimited paths to directory and/or files. + std::vector<std::string> const delimited_paths = [&]() { + std::vector<std::string> paths_list; + std::stringstream strstream{paths}; + std::string path; + // Extract the delimited paths. + while (std::getline(strstream, path, char{','})) { + paths_list.push_back(path); + } + return paths_list; + }(); + + // List of parquet files + std::vector<std::string> parquet_files; + std::for_each(delimited_paths.cbegin(), delimited_paths.cend(), [&](auto const& path_string) { + std::filesystem::path path{path_string}; + // If this is a parquet file, add it. + if (std::filesystem::is_regular_file(path)) { + parquet_files.push_back(path_string); + } + // If this is a directory, add all files in the directory. + else if (std::filesystem::is_directory(path)) { + for (auto const& file : std::filesystem::directory_iterator(path)) { + if (std::filesystem::is_regular_file(file.path())) { + parquet_files.push_back(file.path().string()); + } else { + std::cout << "Skipping sub-directory: " << file.path().string() << "\n"; + } + } + } else { + print_usage(); + throw std::runtime_error("Encountered an invalid input path\n"); + } + }); + + // Current size of list of parquet files + auto const initial_size = parquet_files.size(); + if (initial_size == 0) { return {}; } + + // Reserve space + parquet_files.reserve(std::max<size_t>(thread_count, input_multiplier * parquet_files.size())); + + // Append the input files by input_multiplier times + std::for_each(thrust::make_counting_iterator(1), + thrust::make_counting_iterator(input_multiplier), + [&](auto i) { + parquet_files.insert(parquet_files.end(), + parquet_files.begin(), + parquet_files.begin() + initial_size); + }); + + // Cycle append parquet files from the existing ones if less than the thread_count + if (thread_count > static_cast<int32_t>(parquet_files.size())) { + std::cout << "Warning: Number of input sources < thread count. Cycling from\n"
Cycling from\n" + "and appending to current input sources such that the number of\n" + "input source == thread count\n"; + for (size_t idx = 0; thread_count > static_cast(parquet_files.size()); idx++) { + parquet_files.emplace_back(parquet_files[idx % initial_size]); + } + + // Vector of io sources + std::vector input_sources; + input_sources.reserve(parquet_files.size()); + // Transform input files to the specified io sources + std::transform(parquet_files.begin(), + parquet_files.end(), + std::back_inserter(input_sources), + [&](auto const& file_name) { + return io_source{file_name, io_source_type, stream}; + }); + stream.synchronize(); + return input_sources; +} + +/** + * @brief The main function + */ +int32_t main(int argc, char const** argv) +{ + // Set arguments to defaults + std::string input_paths = "example.parquet"; + int32_t input_multiplier = 1; + int32_t num_reads = 1; + int32_t thread_count = 1; + io_source_type io_source_type = io_source_type::PINNED_BUFFER; + bool write_and_validate = false; + + // Set to the provided args + switch (argc) { + case 7: write_and_validate = get_boolean(argv[6]); [[fallthrough]]; + case 6: thread_count = std::max(thread_count, std::stoi(std::string{argv[5]})); [[fallthrough]]; + case 5: num_reads = std::max(1, std::stoi(argv[4])); [[fallthrough]]; + case 4: io_source_type = get_io_source_type(argv[3]); [[fallthrough]]; + case 3: + input_multiplier = std::max(input_multiplier, std::stoi(std::string{argv[2]})); + [[fallthrough]]; + case 2: + // Check if instead of input_paths, the first argument is `-h` or `--help` + if (auto arg = std::string{argv[1]}; arg != "-h" and arg != "--help") { + input_paths = std::move(arg); + break; + } + [[fallthrough]]; + default: print_usage(); throw std::runtime_error(""); + } + + // Initialize mr, default stream and stream pool + auto const is_pool_used = true; + auto resource = create_memory_resource(is_pool_used); + auto default_stream = cudf::get_default_stream(); + auto stream_pool = rmm::cuda_stream_pool(thread_count); + auto stats_mr = + rmm::mr::statistics_resource_adaptor(resource.get()); + rmm::mr::set_current_device_resource(&stats_mr); + + // List of input sources from the input_paths string. + auto const input_sources = extract_input_sources( + input_paths, input_multiplier, thread_count, io_source_type, default_stream); + + // Check if there is nothing to do + if (input_sources.empty()) { + print_usage(); + throw std::runtime_error("No input files to read. 
+ +/** + * @brief The main function + */ +int32_t main(int argc, char const** argv) +{ + // Set arguments to defaults + std::string input_paths = "example.parquet"; + int32_t input_multiplier = 1; + int32_t num_reads = 1; + int32_t thread_count = 1; + io_source_type io_source_type = io_source_type::PINNED_BUFFER; + bool write_and_validate = false; + + // Set to the provided args + switch (argc) { + case 7: write_and_validate = get_boolean(argv[6]); [[fallthrough]]; + case 6: thread_count = std::max(thread_count, std::stoi(std::string{argv[5]})); [[fallthrough]]; + case 5: num_reads = std::max(1, std::stoi(argv[4])); [[fallthrough]]; + case 4: io_source_type = get_io_source_type(argv[3]); [[fallthrough]]; + case 3: + input_multiplier = std::max(input_multiplier, std::stoi(std::string{argv[2]})); + [[fallthrough]]; + case 2: + // Check if instead of input_paths, the first argument is `-h` or `--help` + if (auto arg = std::string{argv[1]}; arg != "-h" and arg != "--help") { + input_paths = std::move(arg); + break; + } + [[fallthrough]]; + default: print_usage(); throw std::runtime_error(""); + } + + // Initialize mr, default stream and stream pool + auto const is_pool_used = true; + auto resource = create_memory_resource(is_pool_used); + auto default_stream = cudf::get_default_stream(); + auto stream_pool = rmm::cuda_stream_pool(thread_count); + auto stats_mr = + rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource>(resource.get()); + rmm::mr::set_current_device_resource(&stats_mr); + + // List of input sources from the input_paths string. + auto const input_sources = extract_input_sources( + input_paths, input_multiplier, thread_count, io_source_type, default_stream); + + // Check if there is nothing to do + if (input_sources.empty()) { + print_usage(); + throw std::runtime_error("No input files to read. Exiting early.\n"); + } + + // Read the same parquet files specified times with multiple threads and discard the read tables + { + // Print status + std::cout << "\nReading " << input_sources.size() << " input sources " << num_reads + << " time(s) using " << thread_count + << " threads and discarding output " + "tables..\n"; + + if (io_source_type == io_source_type::FILEPATH) { + std::cout << "Note that the first read may include times for nvcomp, cufile loading and RMM " + "growth.\n\n"; + } + + cudf::examples::timer timer; + std::for_each(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_reads), + [&](auto i) { // Read parquet files and discard the tables + std::ignore = read_parquet_multithreaded<read_mode::NO_CONCATENATE>( + input_sources, thread_count, stream_pool); + }); + default_stream.synchronize(); + timer.print_elapsed_millis(); + } + + // Write parquet files and validate if needed + if (write_and_validate) { + // read_mode::CONCATENATE_THREAD returns a vector of `thread_count` tables + auto const tables = read_parquet_multithreaded<read_mode::CONCATENATE_THREAD>( + input_sources, thread_count, stream_pool); + default_stream.synchronize(); + + // Construct a vector of table views for write_parquet_multithreaded + auto const table_views = [&tables]() { + std::vector<cudf::table_view> table_views; + table_views.reserve(tables.size()); + std::transform( + tables.cbegin(), tables.cend(), std::back_inserter(table_views), [](auto const& tbl) { + return tbl->view(); + }); + return table_views; + }(); + + // Write tables to parquet + std::cout << "Writing parquet output files..\n"; + + // Create a directory at the tmpdir path. + std::string output_path = + std::filesystem::temp_directory_path().string() + "/output_" + current_date_and_time(); + std::filesystem::create_directory({output_path}); + cudf::examples::timer timer; + write_parquet_multithreaded(output_path, table_views, thread_count, stream_pool); + default_stream.synchronize(); + timer.print_elapsed_millis(); + + // Verify the output + std::cout << "Verifying output..\n"; + + // Simply concatenate the previously read tables from input sources + auto const input_table = cudf::concatenate(table_views, default_stream); + + // Sources from written parquet files + auto const written_pq_sources = extract_input_sources( + output_path, input_multiplier, thread_count, io_source_type, default_stream); + + // read_mode::CONCATENATE_ALL returns a concatenated vector of 1 table only + auto const transcoded_table = std::move(read_parquet_multithreaded<read_mode::CONCATENATE_ALL>( + written_pq_sources, thread_count, stream_pool) + .back()); + default_stream.synchronize(); + + // Check if the tables are identical + check_tables_equal(input_table->view(), transcoded_table->view()); + + // Remove the created temp directory and parquet data + std::filesystem::remove_all(output_path); + } + + // Print peak memory + std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n\n"; + + return 0; +}
diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index c3b68b52c36..6bbe32de134 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -378,6 +378,26 @@ std::unique_ptr<column> make_strings_column( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Construct a batch of STRING type columns given an array of device spans of pointer/size + * pairs. + * + * This function has input/output expectation similar to the `make_strings_column()` API that + * accepts only one device span of pointer/size pairs. The difference is that this is designed to + * create many strings columns at once with minimal overhead of multiple kernel launches and + * stream synchronizations. + * + * @param input Array of device spans of pointer/size pairs, where each pointer is a device memory + * address or `nullptr` (indicating a null string), and size is string length (in bytes) + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used for memory allocation of the output columns + * @return Array of constructed strings columns + */ +std::vector<std::unique_ptr<column>> make_strings_column_batch( + std::vector<cudf::device_span<thrust::pair<char const*, size_type> const>> const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** * @brief Construct a STRING type column given a device span of string_view. *
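The batch API is easiest to see next to the single-span overload it generalizes; a sketch, assuming the spans were prepared as in the make_strings_column.cu benchmark earlier in this diff (build_spans() is a hypothetical helper):

using string_index_pair = thrust::pair<char const*, cudf::size_type>;
// One device span of {pointer, length} pairs per output column.
std::vector<cudf::device_span<string_index_pair const>> spans = build_spans();
// One batched call amortizes kernel launches and stream synchronizations...
auto columns = cudf::make_strings_column_batch(spans);
// ...compared to constructing each column independently:
// for (auto const& s : spans) { columns.push_back(cudf::make_strings_column(s)); }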
diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index 7359a0d5fde..1eaea5b6374 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -38,6 +38,22 @@ namespace datetime { * @file */ +/** + * @brief Types of datetime components that may be extracted. + */ +enum class datetime_component : uint8_t { + YEAR, + MONTH, + DAY, + WEEKDAY, + HOUR, + MINUTE, + SECOND, + MILLISECOND, + MICROSECOND, + NANOSECOND +}; + /** * @brief Extracts year from any datetime type and returns an int16_t * cudf::column. @@ -207,6 +223,24 @@ std::unique_ptr<cudf::column> extract_nanosecond_fraction( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Extracts the specified datetime component from any datetime type and + * returns an int16_t cudf::column. + * + * @param column cudf::column_view of the input datetime values + * @param component The datetime component to extract + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate device memory of the returned column + * + * @returns cudf::column of the extracted int16_t datetime component + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr<cudf::column> extract_datetime_component( + cudf::column_view const& column, + datetime_component component, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** @} */ // end of group /** * @addtogroup datetime_compute
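A sketch of the unified extraction path added above (`timestamps` is assumed to be some TIMESTAMP column):

// Equivalent to cudf::datetime::extract_hour(timestamps->view()), but dispatched
// through the single component-parameterized entry point; the result is INT16.
auto hours = cudf::datetime::extract_datetime_component(
  timestamps->view(), cudf::datetime::datetime_component::HOUR);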
update_target_element; auto value = static_cast(source.element(source_index)); cudf::detail::atomic_add(&target.element(target_index), value * value); @@ -267,8 +269,6 @@ struct update_target_element; cudf::detail::atomic_mul(&target.element(target_index), static_cast(source.element(source_index))); @@ -286,8 +286,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; cudf::detail::atomic_add(&target.element(target_index), Target{1}); @@ -323,8 +321,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; auto old = cudf::detail::atomic_cas( &target.element(target_index), ARGMAX_SENTINEL, source_index); @@ -349,8 +345,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; auto old = cudf::detail::atomic_cas( &target.element(target_index), ARGMIN_SENTINEL, source_index); @@ -376,6 +370,9 @@ struct elementwise_aggregator { column_device_view source, size_type source_index) const noexcept { + if constexpr (k != cudf::aggregation::COUNT_ALL) { + if (source.is_null(source_index)) { return; } + } update_target_element{}(target, target_index, source, source_index); } }; diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index dfb646c66c4..4159e324472 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -36,7 +37,6 @@ #include #include -#include #include #include @@ -256,7 +256,7 @@ struct scatter_gather_functor { cudf::detail::grid_1d grid{input.size(), block_size, per_thread}; - rmm::device_scalar null_count{0, stream}; + cudf::detail::device_scalar null_count{0, stream}; if (output.nullable()) { // Have to initialize the output mask to all zeros because we may update // it with atomicOr(). 
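
The deletions above remove the per-specialization `if (source.is_null(source_index)) { return; }` guards; a single guard now lives in `elementwise_aggregator::operator()`, with `COUNT_ALL` exempt because it must count null rows too. The pattern, reduced to a self-contained sketch with stand-in types:

@code{.cpp}
enum class Kind { SUM, MIN, MAX, COUNT_ALL };

// Kind-specific update; body elided. Stand-in for update_target_element.
template <Kind k>
struct update_element {
  void operator()() const { /* atomic update for kind k */ }
};

// One guard in the dispatcher instead of one copy per specialization.
template <Kind k>
void elementwise_update(bool source_is_null)
{
  if constexpr (k != Kind::COUNT_ALL) {
    if (source_is_null) { return; }
  }
  update_element<k>{}();
}
@endcode
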
diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index a70cd5a0661..5dc75b1a3fb 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -19,12 +19,11 @@ #include #include #include +#include #include #include #include -#include - #include #include @@ -171,7 +170,7 @@ std::unique_ptr copy_if_else(bool nullable, // if we have validity in the output if (nullable) { - rmm::device_scalar valid_count{0, stream}; + cudf::detail::device_scalar valid_count{0, stream}; // call the kernel copy_if_else_kernel diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index 3aa136d630b..fcb80fe45f7 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -27,7 +28,6 @@ #include #include -#include #include #include @@ -154,7 +154,7 @@ void copy_range(SourceValueIterator source_value_begin, auto grid = cudf::detail::grid_1d{num_items, block_size, 1}; if (target.nullable()) { - rmm::device_scalar null_count(target.null_count(), stream); + cudf::detail::device_scalar null_count(target.null_count(), stream); auto kernel = copy_range_kernel; diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index 9db7e48498f..df3050d6494 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -115,6 +115,16 @@ std::unique_ptr extract_nanosecond_fraction(cudf::column_view cons rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); +/** + * @copydoc cudf::extract_datetime_component(cudf::column_view const&, datetime_component, + * rmm::cuda_stream_view, rmm::device_async_resource_ref) + * + */ +std::unique_ptr extract_datetime_component(cudf::column_view const& column, + datetime_component component, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + /** * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::cuda_stream_view, * rmm::device_async_resource_ref) diff --git a/cpp/include/cudf/detail/device_scalar.hpp b/cpp/include/cudf/detail/device_scalar.hpp new file mode 100644 index 00000000000..16ca06c6561 --- /dev/null +++ b/cpp/include/cudf/detail/device_scalar.hpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +#include +#include +#include + +namespace CUDF_EXPORT cudf { +namespace detail { + +template +class device_scalar : public rmm::device_scalar { + public: +#ifdef __CUDACC__ +#pragma nv_exec_check_disable +#endif + ~device_scalar() = default; + +// Implementation is the same as what compiler should generate +// Could not use default move constructor as 11.8 compiler fails to generate it +#ifdef __CUDACC__ +#pragma nv_exec_check_disable +#endif + device_scalar(device_scalar&& other) noexcept + : rmm::device_scalar{std::move(other)}, bounce_buffer{std::move(other.bounce_buffer)} + { + } + device_scalar& operator=(device_scalar&&) noexcept = default; + + device_scalar(device_scalar const&) = delete; + device_scalar& operator=(device_scalar const&) = delete; + + device_scalar() = delete; + + explicit device_scalar( + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) + : rmm::device_scalar(stream, mr), bounce_buffer{make_host_vector(1, stream)} + { + } + + explicit device_scalar( + T const& initial_value, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) + : rmm::device_scalar(stream, mr), bounce_buffer{make_host_vector(1, stream)} + { + bounce_buffer[0] = initial_value; + cuda_memcpy_async(device_span{this->data(), 1}, bounce_buffer, stream); + } + + device_scalar(device_scalar const& other, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) + : rmm::device_scalar(other, stream, mr), bounce_buffer{make_host_vector(1, stream)} + { + } + + [[nodiscard]] T value(rmm::cuda_stream_view stream) const + { + cuda_memcpy(bounce_buffer, device_span{this->data(), 1}, stream); + return bounce_buffer[0]; + } + + void set_value_async(T const& value, rmm::cuda_stream_view stream) + { + bounce_buffer[0] = value; + cuda_memcpy_async(device_span{this->data(), 1}, bounce_buffer, stream); + } + + void set_value_async(T&& value, rmm::cuda_stream_view stream) + { + bounce_buffer[0] = std::move(value); + cuda_memcpy_async(device_span{this->data(), 1}, bounce_buffer, stream); + } + + void set_value_to_zero_async(rmm::cuda_stream_view stream) { set_value_async(T{}, stream); } + + private: + mutable cudf::detail::host_vector bounce_buffer; +}; + +} // namespace detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh deleted file mode 100644 index 7de79b31bc7..00000000000 --- a/cpp/include/cudf/detail/hash_reduce_by_row.cuh +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
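
The `device_scalar` above keeps the `rmm::device_scalar` interface but stages every host-side transfer through a pinned `host_vector` bounce buffer, so `value()` and `set_value_async()` never touch pageable memory. A minimal caller sketch, with the kernel launches elided:

@code{.cpp}
#include <cudf/detail/device_scalar.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>

// Sketch: a counter that kernels update on the device and the host reads
// back once; both directions of the copy go through the pinned buffer.
cudf::size_type run_count(rmm::cuda_stream_view stream)
{
  cudf::detail::device_scalar<cudf::size_type> count{0, stream};
  // ... launch kernels that atomically increment *count.data() on `stream` ...
  return count.value(stream);  // async copy into the bounce buffer, then sync
}
@endcode
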
- */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -namespace cudf::detail { - -using hash_map_type = cuco::legacy:: - static_map>; - -/** - * @brief The base struct for customized reduction functor to perform reduce-by-key with keys are - * rows that compared equal. - * - * TODO: We need to switch to use `static_reduction_map` when it is ready - * (https://github.com/NVIDIA/cuCollections/pull/98). - */ -template -struct reduce_by_row_fn_base { - protected: - MapView const d_map; - KeyHasher const d_hasher; - KeyEqual const d_equal; - OutputType* const d_output; - - reduce_by_row_fn_base(MapView const& d_map, - KeyHasher const& d_hasher, - KeyEqual const& d_equal, - OutputType* const d_output) - : d_map{d_map}, d_hasher{d_hasher}, d_equal{d_equal}, d_output{d_output} - { - } - - /** - * @brief Return a pointer to the output array at the given index. - * - * @param idx The access index - * @return A pointer to the given index in the output array - */ - __device__ OutputType* get_output_ptr(size_type const idx) const - { - auto const iter = d_map.find(idx, d_hasher, d_equal); - - if (iter != d_map.end()) { - // Only one (undetermined) index value of the duplicate rows could be inserted into the map. - // As such, looking up for all indices of duplicate rows always returns the same value. - auto const inserted_idx = iter->second.load(cuda::std::memory_order_relaxed); - - // All duplicate rows will have concurrent access to this same output slot. - return &d_output[inserted_idx]; - } else { - // All input `idx` values have been inserted into the map before. - // Thus, searching for an `idx` key resulting in the `end()` iterator only happens if - // `d_equal(idx, idx) == false`. - // Such situations are due to comparing nulls or NaNs which are considered as always unequal. - // In those cases, all rows containing nulls or NaNs are distinct. Just return their direct - // output slot. - return &d_output[idx]; - } - } -}; - -/** - * @brief Perform a reduction on groups of rows that are compared equal. - * - * This is essentially a reduce-by-key operation with keys are non-contiguous rows and are compared - * equal. A hash table is used to find groups of equal rows. - * - * At the beginning of the operation, the entire output array is filled with a value given by - * the `init` parameter. Then, the reduction result for each row group is written into the output - * array at the index of an unspecified row in the group. - * - * @tparam ReduceFuncBuilder The builder class that must have a `build()` method returning a - * reduction functor derived from `reduce_by_row_fn_base` - * @tparam OutputType Type of the reduction results - * @param map The auxiliary map to perform reduction - * @param preprocessed_input The preprocessed of the input rows for computing row hashing and row - * comparisons - * @param num_rows The number of all input rows - * @param has_nulls Indicate whether the input rows has any nulls at any nested levels - * @param has_nested_columns Indicates whether the input table has any nested columns - * @param nulls_equal Flag to specify whether null elements should be considered as equal - * @param nans_equal Flag to specify whether NaN values in floating point column should be - * considered equal. 
- * @param init The initial value for reduction of each row group - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned vector - * @return A device_uvector containing the reduction results - */ -template -rmm::device_uvector hash_reduce_by_row( - hash_map_type const& map, - std::shared_ptr const preprocessed_input, - size_type num_rows, - cudf::nullate::DYNAMIC has_nulls, - bool has_nested_columns, - null_equality nulls_equal, - nan_equality nans_equal, - ReduceFuncBuilder func_builder, - OutputType init, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto const map_dview = map.get_device_view(); - auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input); - auto const key_hasher = row_hasher.device_hasher(has_nulls); - auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); - - auto reduction_results = rmm::device_uvector(num_rows, stream, mr); - thrust::uninitialized_fill( - rmm::exec_policy(stream), reduction_results.begin(), reduction_results.end(), init); - - auto const reduce_by_row = [&](auto const value_comp) { - if (has_nested_columns) { - auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin())); - } else { - auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin())); - } - }; - - if (nans_equal == nan_equality::ALL_EQUAL) { - using nan_equal_comparator = - cudf::experimental::row::equality::nan_equal_physical_equality_comparator; - reduce_by_row(nan_equal_comparator{}); - } else { - using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator; - reduce_by_row(nan_unequal_comparator{}); - } - - return reduction_results; -} - -} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh index 4349e1b70fd..30f36d6a5da 100644 --- a/cpp/include/cudf/detail/iterator.cuh +++ b/cpp/include/cudf/detail/iterator.cuh @@ -38,18 +38,19 @@ #include #include +#include +#include #include #include #include #include -#include - namespace cudf { namespace detail { /** * @brief Convenience wrapper for creating a `thrust::transform_iterator` over a - * `thrust::counting_iterator`. + * `thrust::counting_iterator` within the range [0, INT_MAX]. + * * * Example: * @code{.cpp} @@ -62,14 +63,21 @@ namespace detail { * iter[n] == n * n * @endcode * - * @param start The starting value of the counting iterator + * @param start The starting value of the counting iterator (must be size_type or smaller type). * @param f The unary function to apply to the counting iterator. 
* @return A transform iterator that applies `f` to a counting iterator */ -template -CUDF_HOST_DEVICE inline auto make_counting_transform_iterator(cudf::size_type start, +template +CUDF_HOST_DEVICE inline auto make_counting_transform_iterator(CountingIterType start, UnaryFunction f) { + // Check if the `start` for counting_iterator is of size_type or a smaller integral type + static_assert( + cuda::std::is_integral_v and + cuda::std::numeric_limits::digits <= + cuda::std::numeric_limits::digits, + "The `start` for the counting_transform_iterator must be size_type or smaller type"); + return thrust::make_transform_iterator(thrust::make_counting_iterator(start), f); } diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 327c732716c..482265d633e 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -25,7 +26,6 @@ #include #include -#include #include #include @@ -165,7 +165,7 @@ size_type inplace_bitmask_binop(Binop op, "Mask pointer cannot be null"); rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref(); - rmm::device_scalar d_counter{0, stream, mr}; + cudf::detail::device_scalar d_counter{0, stream, mr}; rmm::device_uvector d_masks(masks.size(), stream, mr); rmm::device_uvector d_begin_bits(masks_begin_bits.size(), stream, mr); diff --git a/cpp/include/cudf/detail/unary.hpp b/cpp/include/cudf/detail/unary.hpp index 18b1e9b2d2e..0f852db0c54 100644 --- a/cpp/include/cudf/detail/unary.hpp +++ b/cpp/include/cudf/detail/unary.hpp @@ -59,7 +59,7 @@ std::unique_ptr true_if(InputIterator begin, auto output_mutable_view = output->mutable_view(); auto output_data = output_mutable_view.data(); - thrust::transform(rmm::exec_policy(stream), begin, end, output_data, p); + thrust::transform(rmm::exec_policy_nosync(stream), begin, end, output_data, p); return output; } diff --git a/cpp/include/cudf/detail/utilities/batched_memcpy.hpp b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp new file mode 100644 index 00000000000..ed0ab9e6e5b --- /dev/null +++ b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include +#include +#include + +namespace CUDF_EXPORT cudf { +namespace detail { + +/** + * @brief A helper function that copies a vector of vectors from source to destination addresses in + * a batched manner. 
+ * + * @tparam SrcIterator **[inferred]** The type of device-accessible source addresses iterator + * @tparam DstIterator **[inferred]** The type of device-accessible destination address iterator + * @tparam SizeIterator **[inferred]** The type of device-accessible buffer size iterator + * + * @param src_iter Device-accessible iterator to source addresses + * @param dst_iter Device-accessible iterator to destination addresses + * @param size_iter Device-accessible iterator to the buffer sizes (in bytes) + * @param num_buffs Number of buffers to be copied + * @param stream CUDA stream to use + */ +template +void batched_memcpy_async(SrcIterator src_iter, + DstIterator dst_iter, + SizeIterator size_iter, + size_t num_buffs, + rmm::cuda_stream_view stream) +{ + size_t temp_storage_bytes = 0; + cub::DeviceMemcpy::Batched( + nullptr, temp_storage_bytes, src_iter, dst_iter, size_iter, num_buffs, stream.value()); + + rmm::device_buffer d_temp_storage{temp_storage_bytes, stream.value()}; + + cub::DeviceMemcpy::Batched(d_temp_storage.data(), + temp_storage_bytes, + src_iter, + dst_iter, + size_iter, + num_buffs, + stream.value()); +} + +} // namespace detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/detail/batched_memset.hpp b/cpp/include/cudf/detail/utilities/batched_memset.hpp similarity index 95% rename from cpp/include/cudf/io/detail/batched_memset.hpp rename to cpp/include/cudf/detail/utilities/batched_memset.hpp index 1c74be4a9fe..78be5b91248 100644 --- a/cpp/include/cudf/io/detail/batched_memset.hpp +++ b/cpp/include/cudf/detail/utilities/batched_memset.hpp @@ -28,7 +28,7 @@ #include namespace CUDF_EXPORT cudf { -namespace io::detail { +namespace detail { /** * @brief A helper function that takes in a vector of device spans and memsets them to the @@ -53,8 +53,8 @@ void batched_memset(std::vector> const& bufs, cudf::detail::make_device_uvector_async(bufs, stream, cudf::get_current_device_resource_ref()); // get a vector with the sizes of all buffers - auto sizes = cudf::detail::make_counting_transform_iterator( - static_cast(0), + auto sizes = thrust::make_transform_iterator( + thrust::counting_iterator(0), cuda::proclaim_return_type( [gpu_bufs = gpu_bufs.data()] __device__(std::size_t i) { return gpu_bufs[i].size(); })); @@ -78,5 +78,5 @@ void batched_memset(std::vector> const& bufs, d_temp_storage.data(), temp_storage_bytes, iter_in, iter_out, sizes, num_bufs, stream); } -} // namespace io::detail +} // namespace detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp index 632d5a732ec..4f0c52c5954 100644 --- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp +++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include @@ -25,33 +26,82 @@ namespace detail { enum class host_memory_kind : uint8_t { PINNED, PAGEABLE }; +void cuda_memcpy_async_impl( + void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); + /** - * @brief Asynchronously copies data between the host and device. + * @brief Asynchronously copies data from host to device memory. * * Implementation may use different strategies depending on the size and type of host data. 
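
The span-based `cuda_memcpy_async` overloads introduced in this hunk replace the raw pointer/size/kind signature; the memory kind is now inferred from `is_device_accessible()` on the host span. A host-to-device sketch under these overloads, with illustrative buffer names:

@code{.cpp}
#include <cudf/detail/utilities/cuda_memcpy.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

void upload(rmm::cuda_stream_view stream)
{
  auto src = cudf::detail::make_host_vector<int>(1024, stream);
  // ... fill src ...
  rmm::device_uvector<int> dst(src.size(), stream);
  // Sizes must match; PINNED vs PAGEABLE is detected from the host span's
  // is_device_accessible() rather than passed by the caller.
  cudf::detail::cuda_memcpy_async<int>(dst, src, stream);
}
@endcode
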
* - * @param dst Destination memory address - * @param src Source memory address - * @param size Number of bytes to copy - * @param kind Type of host memory + * @param dst Destination device memory + * @param src Source host memory * @param stream CUDA stream used for the copy */ -void cuda_memcpy_async( - void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); +template +void cuda_memcpy_async(device_span dst, host_span src, rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async"); + auto const is_pinned = src.is_device_accessible(); + cuda_memcpy_async_impl(dst.data(), + src.data(), + src.size_bytes(), + is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); +} /** - * @brief Synchronously copies data between the host and device. + * @brief Asynchronously copies data from device to host memory. * * Implementation may use different strategies depending on the size and type of host data. * - * @param dst Destination memory address - * @param src Source memory address - * @param size Number of bytes to copy - * @param kind Type of host memory + * @param dst Destination host memory + * @param src Source device memory * @param stream CUDA stream used for the copy */ -void cuda_memcpy( - void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); +template +void cuda_memcpy_async(host_span dst, device_span src, rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async"); + auto const is_pinned = dst.is_device_accessible(); + cuda_memcpy_async_impl(dst.data(), + src.data(), + src.size_bytes(), + is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); +} + +/** + * @brief Synchronously copies data from host to device memory. + * + * Implementation may use different strategies depending on the size and type of host data. + * + * @param dst Destination device memory + * @param src Source host memory + * @param stream CUDA stream used for the copy + */ +template +void cuda_memcpy(device_span dst, host_span src, rmm::cuda_stream_view stream) +{ + cuda_memcpy_async(dst, src, stream); + stream.synchronize(); +} + +/** + * @brief Synchronously copies data from device to host memory. + * + * Implementation may use different strategies depending on the size and type of host data. + * + * @param dst Destination host memory + * @param src Source device memory + * @param stream CUDA stream used for the copy + */ +template +void cuda_memcpy(host_span dst, device_span src, rmm::cuda_stream_view stream) +{ + cuda_memcpy_async(dst, src, stream); + stream.synchronize(); +} } // namespace detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/utilities/logger.hpp b/cpp/include/cudf/detail/utilities/logger.hpp index 8c1c3c28df8..e7643eb44bd 100644 --- a/cpp/include/cudf/detail/utilities/logger.hpp +++ b/cpp/include/cudf/detail/utilities/logger.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,9 +19,9 @@ #include // Log messages that require computation should only be used at level TRACE and DEBUG -#define CUDF_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_DEBUG(...) 
SPDLOG_LOGGER_DEBUG(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_INFO(...) SPDLOG_LOGGER_INFO(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_WARN(...) SPDLOG_LOGGER_WARN(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&cudf::logger(), __VA_ARGS__) +#define CUDF_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_INFO(...) SPDLOG_LOGGER_INFO(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_WARN(...) SPDLOG_LOGGER_WARN(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&cudf::detail::logger(), __VA_ARGS__) diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 953ae5b9308..1f1e7a2db77 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -101,12 +101,7 @@ rmm::device_uvector make_device_uvector_async(host_span source_data, rmm::device_async_resource_ref mr) { rmm::device_uvector ret(source_data.size(), stream, mr); - auto const is_pinned = source_data.is_device_accessible(); - cuda_memcpy_async(ret.data(), - source_data.data(), - source_data.size() * sizeof(T), - is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, - stream); + cuda_memcpy_async(ret, source_data, stream); return ret; } @@ -405,13 +400,8 @@ host_vector make_empty_host_vector(size_t capacity, rmm::cuda_stream_view str template host_vector make_host_vector_async(device_span v, rmm::cuda_stream_view stream) { - auto result = make_host_vector(v.size(), stream); - auto const is_pinned = result.get_allocator().is_device_accessible(); - cuda_memcpy_async(result.data(), - v.data(), - v.size() * sizeof(T), - is_pinned ? 
host_memory_kind::PINNED : host_memory_kind::PAGEABLE, - stream); + auto result = make_host_vector(v.size(), stream); + cuda_memcpy_async(result, v, stream); return result; } diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index cfb2e70bfed..af182b69c3a 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -25,7 +26,6 @@ #include #include -#include #include @@ -101,7 +101,7 @@ std::pair valid_if(InputIterator begin, size_type null_count{0}; if (size > 0) { - rmm::device_scalar valid_count{0, stream}; + cudf::detail::device_scalar valid_count{0, stream}; constexpr size_type block_size{256}; grid_1d grid{size, block_size}; diff --git a/cpp/include/cudf/hashing/detail/helper_functions.cuh b/cpp/include/cudf/hashing/detail/helper_functions.cuh index 3489fdeccee..ea1accc62a4 100644 --- a/cpp/include/cudf/hashing/detail/helper_functions.cuh +++ b/cpp/include/cudf/hashing/detail/helper_functions.cuh @@ -47,197 +47,3 @@ inline size_t compute_hash_table_size(cudf::size_type num_keys_to_insert, return hash_table_size; } - -template -__forceinline__ __device__ pair_type load_pair_vectorized(pair_type const* __restrict__ const ptr) -{ - if (sizeof(uint4) == sizeof(pair_type)) { - union pair_type2vec_type { - uint4 vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0, 0, 0, 0}; - converter.vec_val = *reinterpret_cast(ptr); - return converter.pair_val; - } else if (sizeof(uint2) == sizeof(pair_type)) { - union pair_type2vec_type { - uint2 vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0, 0}; - converter.vec_val = *reinterpret_cast(ptr); - return converter.pair_val; - } else if (sizeof(int) == sizeof(pair_type)) { - union pair_type2vec_type { - int vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0}; - converter.vec_val = *reinterpret_cast(ptr); - return converter.pair_val; - } else if (sizeof(short) == sizeof(pair_type)) { - union pair_type2vec_type { - short vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0}; - converter.vec_val = *reinterpret_cast(ptr); - return converter.pair_val; - } else { - return *ptr; - } -} - -template -__forceinline__ __device__ void store_pair_vectorized(pair_type* __restrict__ const ptr, - pair_type const val) -{ - if (sizeof(uint4) == sizeof(pair_type)) { - union pair_type2vec_type { - uint4 vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0, 0, 0, 0}; - converter.pair_val = val; - *reinterpret_cast(ptr) = converter.vec_val; - } else if (sizeof(uint2) == sizeof(pair_type)) { - union pair_type2vec_type { - uint2 vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0, 0}; - converter.pair_val = val; - *reinterpret_cast(ptr) = converter.vec_val; - } else if (sizeof(int) == sizeof(pair_type)) { - union pair_type2vec_type { - int vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0}; - converter.pair_val = val; - *reinterpret_cast(ptr) = converter.vec_val; - } else if (sizeof(short) == sizeof(pair_type)) { - union pair_type2vec_type { - short vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0}; - converter.pair_val = val; - *reinterpret_cast(ptr) = converter.vec_val; - } else { - *ptr = val; - } -} - -template -CUDF_KERNEL void init_hashtbl(value_type* __restrict__ const hashtbl_values, - size_type const n, - key_type const key_val, - elem_type const elem_val) -{ - 
size_type const idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < n) { - store_pair_vectorized(hashtbl_values + idx, thrust::make_pair(key_val, elem_val)); - } -} - -template -struct equal_to { - using result_type = bool; - using first_argument_type = T; - using second_argument_type = T; - __forceinline__ __host__ __device__ constexpr bool operator()( - first_argument_type const& lhs, second_argument_type const& rhs) const - { - return lhs == rhs; - } -}; - -template -class cycle_iterator_adapter { - public: - using value_type = typename std::iterator_traits::value_type; - using difference_type = typename std::iterator_traits::difference_type; - using pointer = typename std::iterator_traits::pointer; - using reference = typename std::iterator_traits::reference; - using iterator_type = Iterator; - - cycle_iterator_adapter() = delete; - - __host__ __device__ explicit cycle_iterator_adapter(iterator_type const& begin, - iterator_type const& end, - iterator_type const& current) - : m_begin(begin), m_end(end), m_current(current) - { - } - - __host__ __device__ cycle_iterator_adapter& operator++() - { - if (m_end == (m_current + 1)) - m_current = m_begin; - else - ++m_current; - return *this; - } - - __host__ __device__ cycle_iterator_adapter const& operator++() const - { - if (m_end == (m_current + 1)) - m_current = m_begin; - else - ++m_current; - return *this; - } - - __host__ __device__ cycle_iterator_adapter& operator++(int) - { - cycle_iterator_adapter old(m_begin, m_end, m_current); - if (m_end == (m_current + 1)) - m_current = m_begin; - else - ++m_current; - return old; - } - - __host__ __device__ cycle_iterator_adapter const& operator++(int) const - { - cycle_iterator_adapter old(m_begin, m_end, m_current); - if (m_end == (m_current + 1)) - m_current = m_begin; - else - ++m_current; - return old; - } - - __host__ __device__ bool equal(cycle_iterator_adapter const& other) const - { - return m_current == other.m_current && m_begin == other.m_begin && m_end == other.m_end; - } - - __host__ __device__ reference& operator*() { return *m_current; } - - __host__ __device__ reference const& operator*() const { return *m_current; } - - __host__ __device__ const pointer operator->() const { return m_current.operator->(); } - - __host__ __device__ pointer operator->() { return m_current; } - - private: - iterator_type m_current; - iterator_type m_begin; - iterator_type m_end; -}; - -template -__host__ __device__ bool operator==(cycle_iterator_adapter const& lhs, - cycle_iterator_adapter const& rhs) -{ - return lhs.equal(rhs); -} - -template -__host__ __device__ bool operator!=(cycle_iterator_adapter const& lhs, - cycle_iterator_adapter const& rhs) -{ - return !lhs.equal(rhs); -} diff --git a/cpp/include/cudf/io/config_utils.hpp b/cpp/include/cudf/io/config_utils.hpp index 1827ba0e3e6..13a76d50346 100644 --- a/cpp/include/cudf/io/config_utils.hpp +++ b/cpp/include/cudf/io/config_utils.hpp @@ -18,7 +18,8 @@ #include namespace CUDF_EXPORT cudf { -namespace io::cufile_integration { +namespace io { +namespace cufile_integration { /** * @brief Returns true if cuFile and its compatibility mode are enabled. @@ -35,9 +36,15 @@ bool is_gds_enabled(); */ bool is_kvikio_enabled(); -} // namespace io::cufile_integration +/** + * @brief Set kvikIO thread pool size according to the environment variable KVIKIO_NTHREADS. If + * KVIKIO_NTHREADS is not set, use 8 threads by default. 
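
A sketch of how a host application might use the function declared just below; `setenv` is POSIX, and the thread count here is only an example:

@code{.cpp}
#include <cudf/io/config_utils.hpp>

#include <cstdlib>

int main()
{
  // Illustrative: choose the pool size before the first cudf IO call.
  setenv("KVIKIO_NTHREADS", "16", 1 /*overwrite*/);
  cudf::io::cufile_integration::set_thread_pool_nthreads_from_env();
  // ... subsequent reads use a 16-thread kvikIO pool; unset means 8 ...
  return 0;
}
@endcode
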
+ */ +void set_thread_pool_nthreads_from_env(); + +} // namespace cufile_integration -namespace io::nvcomp_integration { +namespace nvcomp_integration { /** * @brief Returns true if all nvCOMP uses are enabled. @@ -49,5 +56,6 @@ bool is_all_enabled(); */ bool is_stable_enabled(); -} // namespace io::nvcomp_integration +} // namespace nvcomp_integration +} // namespace io } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index b12fbe39a57..7d2cc4ad493 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -86,14 +86,21 @@ class datasource { /** * @brief Creates a source from a file path. * + * Parameters `offset` and `max_size_estimate` are hints to the `datasource` implementation about + * the expected range of the data that will be read. The implementation may use these hints to + * optimize the read operation. These parameters are usually based on the byte range option. In + * this case, `max_size_estimate` can include padding after the byte range, to include additional + * data that may be needed for processing. + * * @param[in] filepath Path to the file to use - * @param[in] offset Bytes from the start of the file (the default is zero) - * @param[in] size Bytes from the offset; use zero for entire file (the default is zero) + * @param[in] offset Starting byte offset from which data will be read (the default is zero) + * @param[in] max_size_estimate Upper estimate of the data range that will be read (the default is + * zero, which means the whole file after `offset`) * @return Constructed datasource object */ static std::unique_ptr create(std::string const& filepath, - size_t offset = 0, - size_t size = 0); + size_t offset = 0, + size_t max_size_estimate = 0); /** * @brief Creates a source from a host memory buffer. diff --git a/cpp/include/cudf/reduction/detail/reduction_operators.cuh b/cpp/include/cudf/reduction/detail/reduction_operators.cuh index 4cf8564ab3a..5694362af8f 100644 --- a/cpp/include/cudf/reduction/detail/reduction_operators.cuh +++ b/cpp/include/cudf/reduction/detail/reduction_operators.cuh @@ -31,17 +31,41 @@ namespace detail { // intermediate data structure to compute `var`, `std` template struct var_std { - ResultType value; /// the value - ResultType value_squared; /// the value of squared - - CUDF_HOST_DEVICE inline var_std(ResultType _value = 0, ResultType _value_squared = 0) - : value(_value), value_squared(_value_squared){}; + // Uses the pairwise approach of Chan, Golub, and LeVeque, + // _Algorithms for computing the sample variance: analysis and + // recommendations_ (1983) + // https://doi.org/10.1080/00031305.1983.10483115 + // Also http://www.cs.yale.edu/publications/techreports/tr222.pdf + // This is a modification of Youngs and Cramer's online approach. + ResultType running_sum; + ResultType running_square_deviations; + size_type count; + + CUDF_HOST_DEVICE inline var_std(ResultType t = 0, ResultType s = 0, size_type n = 0) + : running_sum(t), running_square_deviations(s), count(n){}; using this_t = var_std; CUDF_HOST_DEVICE inline this_t operator+(this_t const& rhs) const { - return this_t((this->value + rhs.value), (this->value_squared + rhs.value_squared)); + // Updates as per equations 1.5a and 1.5b in the paper + // T_{1,m+n} = T_{1,m} + T_{m+1,n+1} + // S_{1,m+n} = S_{1,m} + S_{m+1,n+1} + m/(n(m+n)) * (n/m T_{1,m} - T_{m+1,n+1})**2 + // Here the first m samples are in this, the remaining n samples are in rhs. 
+ auto const m = this->count; + auto const n = rhs.count; + // Avoid division by zero. + if (m == 0) { return rhs; } + if (n == 0) { return *this; } + auto const tm = this->running_sum; + auto const tn = rhs.running_sum; + auto const sm = this->running_square_deviations; + auto const sn = rhs.running_square_deviations; + auto const tmn = tm + tn; + auto const diff = ((static_cast(n) / m) * tm) - tn; + // Computing m/n(m+n) as m/n/(m+n) to avoid integer overflow + auto const smn = sm + sn + ((static_cast(m) / n) / (m + n)) * diff * diff; + return {tmn, smn, m + n}; }; }; @@ -50,10 +74,7 @@ template struct transformer_var_std { using OutputType = var_std; - CUDF_HOST_DEVICE inline OutputType operator()(ResultType const& value) - { - return OutputType(value, value * value); - }; + CUDF_HOST_DEVICE inline OutputType operator()(ResultType const& value) { return {value, 0, 1}; }; }; // ------------------------------------------------------------------------ @@ -257,12 +278,7 @@ struct variance : public compound_op { cudf::size_type const& count, cudf::size_type const& ddof) { - ResultType mean = input.value / count; - ResultType asum = input.value_squared; - cudf::size_type div = count - ddof; - ResultType var = asum / div - ((mean * mean) * count) / div; - - return var; + return input.running_square_deviations / (count - ddof); }; }; }; diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index 66be2a12fbe..360dde11fc0 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -94,8 +95,8 @@ class scalar { [[nodiscard]] bool const* validity_data() const; protected: - data_type _type{type_id::EMPTY}; ///< Logical type of value in the scalar - rmm::device_scalar _is_valid; ///< Device bool signifying validity + data_type _type{type_id::EMPTY}; ///< Logical type of value in the scalar + cudf::detail::device_scalar _is_valid; ///< Device bool signifying validity /** * @brief Move constructor for scalar. diff --git a/cpp/include/cudf/strings/convert/convert_urls.hpp b/cpp/include/cudf/strings/convert/convert_urls.hpp index d6e87f9d543..febc63d8779 100644 --- a/cpp/include/cudf/strings/convert/convert_urls.hpp +++ b/cpp/include/cudf/strings/convert/convert_urls.hpp @@ -28,7 +28,7 @@ namespace strings { */ /** - * @brief Decodes each string using URL encoding. + * @brief Encodes each string using URL encoding. * * Converts mostly non-ascii characters and control characters into UTF-8 hex code-points * prefixed with '%'. For example, the space character must be converted to characters '%20' where @@ -49,7 +49,7 @@ std::unique_ptr url_encode( rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** - * @brief Encodes each string using URL encoding. + * @brief Decodes each string using URL encoding. * * Converts all character sequences starting with '%' into character code-points * interpreting the 2 following characters as hex values to create the code-point. 
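
The `var_std` rewrite earlier in this diff replaces the sum and sum-of-squares accumulator, whose deleted finalization `asum / div - ((mean * mean) * count) / div` cancels catastrophically when the mean is large relative to the variance, with the pairwise merge of Chan, Golub, and LeVeque. A host-side sketch of the same merge using plain doubles:

@code{.cpp}
#include <cstddef>

// T and S follow the paper's notation; a single sample x seeds as {x, 0, 1}.
struct acc {
  double t{};       // running sum (T)
  double s{};       // running sum of squared deviations (S)
  std::size_t n{};  // sample count
};

acc merge(acc a, acc b)
{
  if (a.n == 0) { return b; }
  if (b.n == 0) { return a; }
  double const m    = static_cast<double>(a.n);
  double const n    = static_cast<double>(b.n);
  double const diff = (n / m) * a.t - b.t;
  // S_{1,m+n} = S_{1,m} + S_{m+1,m+n} + m/(n*(m+n)) * diff^2  (eq. 1.5b)
  return {a.t + b.t, a.s + b.s + (m / n / (m + n)) * diff * diff, a.n + b.n};
}
// Variance with ddof is then s / (n - ddof). Quick check: merging {1,0,1}
// and {2,0,1} yields s == 0.5, matching (1-1.5)^2 + (2-1.5)^2.
@endcode
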
diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index 1283226879b..de2f1770e28 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -29,6 +30,8 @@ #include #include +#include +#include #include #include @@ -38,6 +41,62 @@ namespace cudf { namespace strings { namespace detail { +/** + * @brief Gather characters to create a strings column using the given string-index pair iterator + * + * @tparam IndexPairIterator iterator over type `pair` values + * + * @param offsets The offsets for the output strings column + * @param chars_size The size (in bytes) of the chars data + * @param begin Iterator to the first string-index pair + * @param strings_count The number of strings + * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource used to allocate the returned column's device memory + * @return An array of chars gathered from the input string-index pair iterator + */ +template +rmm::device_uvector make_chars_buffer(column_view const& offsets, + int64_t chars_size, + IndexPairIterator begin, + size_type strings_count, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto chars_data = rmm::device_uvector(chars_size, stream, mr); + auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets); + + auto const src_ptrs = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type([begin] __device__(uint32_t idx) { + // Due to a bug in cub (https://github.com/NVIDIA/cccl/issues/586), + // we have to use `const_cast` to remove `const` qualifier from the source pointer. + // This should be fine as long as we only read but not write anything to the source. + return reinterpret_cast(const_cast(begin[idx].first)); + })); + auto const src_sizes = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [begin] __device__(uint32_t idx) { return begin[idx].second; })); + auto const dst_ptrs = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type([offsets = d_offsets, output = chars_data.data()] __device__( + uint32_t idx) { return output + offsets[idx]; })); + + size_t temp_storage_bytes = 0; + CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched( + nullptr, temp_storage_bytes, src_ptrs, dst_ptrs, src_sizes, strings_count, stream.value())); + rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); + CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched(d_temp_storage.data(), + temp_storage_bytes, + src_ptrs, + dst_ptrs, + src_sizes, + strings_count, + stream.value())); + + return chars_data; +} + /** * @brief Create an offsets column to be a child of a compound column * diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index 6b1b453a752..03240f418fe 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -49,16 +49,6 @@ namespace detail { */ using string_index_pair = thrust::pair; -/** - * @brief Average string byte-length threshold for deciding character-level - * vs. row-level parallel algorithm. 
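
Both `batched_memcpy_async` and `make_chars_buffer` in this diff lean on the same two-phase CUB call: a null first pass that only sizes the temporary storage, then the real batched copy. The shared skeleton, free-standing and not cudf-specific (iterator types are whatever CUB's `DeviceMemcpy::Batched` accepts):

@code{.cpp}
#include <cub/device/device_memcpy.cuh>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

#include <cstddef>
#include <cstdint>

template <typename SrcIt, typename DstIt, typename SizeIt>
void batched_copy(SrcIt srcs, DstIt dsts, SizeIt sizes,
                  std::uint32_t num_buffers, rmm::cuda_stream_view stream)
{
  // First call: null temp storage, so CUB only reports the scratch size.
  std::size_t temp_bytes = 0;
  cub::DeviceMemcpy::Batched(
    nullptr, temp_bytes, srcs, dsts, sizes, num_buffers, stream.value());
  rmm::device_buffer temp{temp_bytes, stream};
  // Second call performs all num_buffers copies in a single launch.
  cub::DeviceMemcpy::Batched(
    temp.data(), temp_bytes, srcs, dsts, sizes, num_buffers, stream.value());
}
@endcode
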
- * - * This value was determined by running the factory_benchmark against different - * string lengths and observing the point where the performance is faster for - * long strings. - */ -constexpr size_type FACTORY_BYTES_PER_ROW_THRESHOLD = 64; - /** * @brief Create a strings-type column from iterators of pointer/size pairs * @@ -88,8 +78,6 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer); auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column( offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto const d_offsets = - cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); // create null mask auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; @@ -99,38 +87,8 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, (null_count > 0) ? std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr}; // build chars column - auto chars_data = [d_offsets, bytes = bytes, begin, strings_count, null_count, stream, mr] { - auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1); - // use a character-parallel kernel for long string lengths - if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) { - auto const str_begin = thrust::make_transform_iterator( - begin, cuda::proclaim_return_type([] __device__(auto ip) { - return string_view{ip.first, ip.second}; - })); - - return gather_chars(str_begin, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_offsets, - bytes, - stream, - mr); - } else { - // this approach is 2-3x faster for a large number of smaller string lengths - auto chars_data = rmm::device_uvector(bytes, stream, mr); - auto d_chars = chars_data.data(); - auto copy_chars = [d_chars] __device__(auto item) { - string_index_pair const str = thrust::get<0>(item); - int64_t const offset = thrust::get<1>(item); - if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second); - }; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_zip_iterator(thrust::make_tuple(begin, d_offsets)), - strings_count, - copy_chars); - return chars_data; - } - }(); + auto chars_data = + make_chars_buffer(offsets_column->view(), bytes, begin, strings_count, stream, mr); return make_strings_column(strings_count, std::move(offsets_column), diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp index c6b9bc7e58a..867764b6d9a 100644 --- a/cpp/include/cudf/strings/findall.hpp +++ b/cpp/include/cudf/strings/findall.hpp @@ -66,6 +66,35 @@ std::unique_ptr findall( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the starting character index of the first match for the given pattern + * in each row of the input column + * + * @code{.pseudo} + * Example: + * s = ["bunny", "rabbit", "hare", "dog"] + * p = regex_program::create("[be]") + * r = find_re(s, p) + * r is now [0, 2, 3, -1] + * @endcode + * + * A null output row occurs if the corresponding input row is null. + * A -1 is returned for rows that do not contain a match. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
+ * + * @param input Strings instance for this operation + * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of integers + */ +std::unique_ptr find_re( + strings_column_view const& input, + regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** @} */ // end of doxygen group } // namespace strings } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/table/table.hpp b/cpp/include/cudf/table/table.hpp index 762131a174f..15fdad21d9f 100644 --- a/cpp/include/cudf/table/table.hpp +++ b/cpp/include/cudf/table/table.hpp @@ -148,7 +148,7 @@ class table { std::vector columns(std::distance(begin, end)); std::transform( begin, end, columns.begin(), [this](auto index) { return _columns.at(index)->view(); }); - return table_view(columns); + return table_view{columns}; } /** diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index 4a990f67ce4..d41176590ea 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -241,7 +241,7 @@ class table_view : public detail::table_view_base { { std::vector columns(std::distance(begin, end)); std::transform(begin, end, columns.begin(), [this](auto index) { return this->column(index); }); - return table_view(columns); + return table_view{columns}; } /** diff --git a/cpp/include/cudf/utilities/logger.hpp b/cpp/include/cudf/utilities/logger.hpp index 45d5d1b12e1..982554a23f5 100644 --- a/cpp/include/cudf/utilities/logger.hpp +++ b/cpp/include/cudf/utilities/logger.hpp @@ -22,6 +22,10 @@ namespace CUDF_EXPORT cudf { +namespace detail { +spdlog::logger& logger(); +} + /** * @brief Returns the global logger. * @@ -43,6 +47,8 @@ namespace CUDF_EXPORT cudf { * * @return spdlog::logger& The logger. 
*/ -spdlog::logger& logger(); +[[deprecated( + "Support for direct access to spdlog loggers in cudf is planned for removal")]] spdlog::logger& +logger(); } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 914731ea417..21ee4fa9e9b 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -180,18 +180,6 @@ class span_base { return Derived(_data + _size - count, count); } - /** - * @brief Obtains a span that is a view over the `count` elements of this span starting at offset - * - * @param offset The offset of the first element in the subspan - * @param count The number of elements in the subspan - * @return A subspan of the sequence, of requested count and offset - */ - [[nodiscard]] constexpr Derived subspan(size_type offset, size_type count) const noexcept - { - return Derived(_data + offset, count); - } - private: pointer _data{nullptr}; size_type _size{0}; @@ -234,6 +222,15 @@ struct host_span : public cudf::detail::span_base, // NOLINT void>* = nullptr> constexpr host_span(host_span const& other) noexcept - : base(other.data(), other.size()) + : base(other.data(), other.size()), _is_device_accessible{other.is_device_accessible()} { } @@ -299,6 +296,19 @@ struct host_span : public cudf::detail::span_basedata() + offset, count, _is_device_accessible}; + } + private: bool _is_device_accessible{false}; }; @@ -368,6 +378,19 @@ struct device_span : public cudf::detail::span_basedata() + offset, count}; + } }; /** @} */ // end of group @@ -386,42 +409,38 @@ class base_2dspan { constexpr base_2dspan() noexcept = default; /** - * @brief Constructor a 2D span + * @brief Constructor from a span and number of elements in each row. * - * @param data Pointer to the data - * @param rows Number of rows + * @param flat_view The flattened 2D span * @param columns Number of columns */ - constexpr base_2dspan(T* data, size_t rows, size_t columns) noexcept - : _data{data}, _size{rows, columns} + constexpr base_2dspan(RowType flat_view, size_t columns) + : _flat{flat_view}, _size{columns == 0 ? 0 : flat_view.size() / columns, columns} { + CUDF_EXPECTS(_size.first * _size.second == flat_view.size(), "Invalid 2D span size"); } - /** - * @brief Constructor a 2D span - * - * @param data Pointer to the data - * @param size Size of the 2D span as pair - */ - base_2dspan(T* data, size_type size) noexcept : _data{data}, _size{std::move(size)} {} /** * @brief Returns a pointer to the beginning of the sequence. * * @return A pointer to the first element of the span */ - constexpr auto data() const noexcept { return _data; } + [[nodiscard]] constexpr auto data() const noexcept { return _flat.data(); } + /** * @brief Returns the size in the span as pair. * * @return pair representing rows and columns size of the span */ - constexpr auto size() const noexcept { return _size; } + [[nodiscard]] constexpr auto size() const noexcept { return _size; } + /** * @brief Returns the number of elements in the span. * * @return Number of elements in the span */ - constexpr auto count() const noexcept { return size().first * size().second; } + [[nodiscard]] constexpr auto count() const noexcept { return _flat.size(); } + /** * @brief Checks if the span is empty. * @@ -429,19 +448,6 @@ class base_2dspan { */ [[nodiscard]] constexpr bool is_empty() const noexcept { return count() == 0; } - /** - * @brief Returns flattened index of the element at the specified 2D position. 
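
With `subspan` now defined directly on `host_span`/`device_span` and `base_2dspan` holding a flattened row-type span plus a column count, 2D views compose from flat buffers. A sketch, assuming the existing `cudf::detail::device_2dspan` alias for `base_2dspan<T, device_span>`:

@code{.cpp}
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

void demo(rmm::cuda_stream_view stream)
{
  rmm::device_uvector<int> data(12, stream);  // 3 rows x 4 columns, flattened
  cudf::detail::device_2dspan<int> grid{cudf::device_span<int>{data}, 4};
  auto row  = grid[1];            // device_span<int> over elements [4, 8)
  auto tail = row.subspan(2, 2);  // subspan now lives on device_span itself
  (void)tail;
}
@endcode
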
- * - * @param row The row index - * @param column The column index - * @param size The size of the 2D span as pair - * @return The flattened index of the element at the specified 2D position - */ - static constexpr size_t flatten_index(size_t row, size_t column, size_type size) noexcept - { - return row * size.second + column; - } - /** * @brief Returns a reference to the row-th element of the sequence. * @@ -453,41 +459,7 @@ class base_2dspan { */ constexpr RowType operator[](size_t row) const { - return {this->data() + flatten_index(row, 0, this->size()), this->size().second}; - } - - /** - * @brief Returns a reference to the first element in the span. - * - * Calling front() on an empty span results in undefined behavior. - * - * @return Reference to the first element in the span - */ - [[nodiscard]] constexpr RowType front() const { return (*this)[0]; } - /** - * @brief Returns a reference to the last element in the span. - * - * Calling back() on an empty span results in undefined behavior. - * - * @return Reference to the last element in the span - */ - [[nodiscard]] constexpr RowType back() const - { - return (*this)[size().first - 1]; - } - - /** - * @brief Obtains a 2D span that is a view over the `num_rows` rows of this span starting at - * `first_row` - * - * @param first_row The first row in the subspan - * @param num_rows The number of rows in the subspan - * @return A subspan of the sequence, of requested starting `first_row` and `num_rows` - */ - constexpr base_2dspan subspan(size_t first_row, size_t num_rows) const noexcept - { - return base_2dspan( - _data + flatten_index(first_row, 0, this->size()), num_rows, this->size().second); + return _flat.subspan(row * _size.second, _size.second); } /** @@ -495,10 +467,7 @@ class base_2dspan { * * @return A flattened span of the 2D span */ - constexpr RowType flat_view() - { - return {this->data(), this->size().first * this->size().second}; - } + [[nodiscard]] constexpr RowType flat_view() const { return _flat; } /** * @brief Construct a 2D span from another 2D span of convertible type @@ -514,13 +483,13 @@ class base_2dspan { RowType>, void>* = nullptr> constexpr base_2dspan(base_2dspan const& other) noexcept - : _data{other.data()}, _size{other.size()} + : _flat{other.flat_view()}, _size{other.size()} { } protected: - T* _data = nullptr; ///< pointer to the first element - size_type _size{0, 0}; ///< rows, columns + RowType _flat; ///< flattened 2D span + size_type _size{0, 0}; ///< num rows, num columns }; /** diff --git a/cpp/include/cudf_test/testing_main.hpp b/cpp/include/cudf_test/testing_main.hpp index 272c91133f8..2bd08f410e0 100644 --- a/cpp/include/cudf_test/testing_main.hpp +++ b/cpp/include/cudf_test/testing_main.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -36,6 +37,12 @@ namespace CUDF_EXPORT cudf { namespace test { +struct config { + std::string rmm_mode; + std::string stream_mode; + std::string stream_error_mode; +}; + /// MR factory functions inline auto make_cuda() { return std::make_shared(); } @@ -157,10 +164,9 @@ inline auto parse_cudf_test_opts(int argc, char** argv) * @param cmd_opts Command line options returned by parse_cudf_test_opts * @return Memory resource adaptor */ -inline auto make_memory_resource_adaptor(cxxopts::ParseResult const& cmd_opts) +inline auto make_memory_resource_adaptor(cudf::test::config const& config) { - auto const rmm_mode = cmd_opts["rmm_mode"].as(); - auto resource = cudf::test::create_memory_resource(rmm_mode); + auto resource = 
cudf::test::create_memory_resource(config.rmm_mode); cudf::set_current_device_resource(resource.get()); return resource; } @@ -176,37 +182,54 @@ inline auto make_memory_resource_adaptor(cxxopts::ParseResult const& cmd_opts) * @param cmd_opts Command line options returned by parse_cudf_test_opts * @return Memory resource adaptor */ -inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts) +inline auto make_stream_mode_adaptor(cudf::test::config const& config) { auto resource = cudf::get_current_device_resource_ref(); - auto const stream_mode = cmd_opts["stream_mode"].as<std::string>(); - auto const stream_error_mode = cmd_opts["stream_error_mode"].as<std::string>(); - auto const error_on_invalid_stream = (stream_error_mode == "error"); - auto const check_default_stream = (stream_mode == "new_cudf_default"); + auto const error_on_invalid_stream = (config.stream_error_mode == "error"); + auto const check_default_stream = (config.stream_mode == "new_cudf_default"); auto adaptor = cudf::test::stream_checking_resource_adaptor( resource, error_on_invalid_stream, check_default_stream); - if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) { + if ((config.stream_mode == "new_cudf_default") || (config.stream_mode == "new_testing_default")) { cudf::set_current_device_resource(&adaptor); } return adaptor; } +/** + * @brief Should be called in every test program that uses rmm allocators since it maintains the + * lifespan of the rmm default memory resource. This function parses the command line to customize + * test behavior, like the allocation mode used for creating the default memory resource. + * + */ +inline void init_cudf_test(int argc, char** argv, cudf::test::config const& config_override = {}) +{ + // static lifetime to keep rmm resource alive till tests end + auto const cmd_opts = parse_cudf_test_opts(argc, argv); + cudf::test::config config = config_override; + if (config.rmm_mode.empty()) { config.rmm_mode = cmd_opts["rmm_mode"].as<std::string>(); } + + if (config.stream_mode.empty()) { + config.stream_mode = cmd_opts["stream_mode"].as<std::string>(); + } + + if (config.stream_error_mode.empty()) { + config.stream_error_mode = cmd_opts["stream_error_mode"].as<std::string>(); + } + + [[maybe_unused]] static auto mr = make_memory_resource_adaptor(config); + [[maybe_unused]] static auto adaptor = make_stream_mode_adaptor(config); +} + /** * @brief Macro that defines main function for gtest programs that use rmm * - * Should be included in every test program that uses rmm allocators since - * it maintains the lifespan of the rmm default memory resource. * This `main` function is a wrapper around the google test generated `main`, - * maintaining the original functionality. In addition, this custom `main` - * function parses the command line to customize test behavior, like the - * allocation mode used for creating the default memory resource. + * maintaining the original functionality.
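
With `init_cudf_test` factored out of `CUDF_TEST_PROGRAM_MAIN`, a test binary can pin part of the configuration while leaving the rest to the command line; empty override fields fall through to `parse_cudf_test_opts`. A sketch of such a custom `main` (forcing a pool resource is just an example):

@code{.cpp}
#include <cudf_test/testing_main.hpp>

int main(int argc, char** argv)
{
  ::testing::InitGoogleTest(&argc, argv);
  cudf::test::config cfg{};
  cfg.rmm_mode = "pool";  // fixed here; stream options still come from argv
  init_cudf_test(argc, argv, cfg);
  return RUN_ALL_TESTS();
}
@endcode
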
*/ -#define CUDF_TEST_PROGRAM_MAIN() \ - int main(int argc, char** argv) \ - { \ - ::testing::InitGoogleTest(&argc, argv); \ - auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ - [[maybe_unused]] auto mr = make_memory_resource_adaptor(cmd_opts); \ - [[maybe_unused]] auto adaptor = make_stream_mode_adaptor(cmd_opts); \ - return RUN_ALL_TESTS(); \ +#define CUDF_TEST_PROGRAM_MAIN() \ + int main(int argc, char** argv) \ + { \ + ::testing::InitGoogleTest(&argc, argv); \ + init_cudf_test(argc, argv); \ + return RUN_ALL_TESTS(); \ } diff --git a/cpp/include/nvtext/edit_distance.hpp b/cpp/include/nvtext/edit_distance.hpp index 723ba310a1e..dca590baebf 100644 --- a/cpp/include/nvtext/edit_distance.hpp +++ b/cpp/include/nvtext/edit_distance.hpp @@ -57,7 +57,7 @@ namespace CUDF_EXPORT nvtext { * @param targets Strings to compute edit distance against `input` * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings columns of with replaced strings + * @return New lists column of edit distance values */ std::unique_ptr edit_distance( cudf::strings_column_view const& input, diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 7c909f1a948..42124461cdf 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -41,6 +41,8 @@ namespace CUDF_EXPORT nvtext { * * This function uses MurmurHash3_x86_32 for the hash algorithm. * + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if the width < 2 * * @param input Strings column to compute minhash @@ -51,7 +53,7 @@ namespace CUDF_EXPORT nvtext { * @param mr Device memory resource used to allocate the returned column's device memory * @return Minhash values for each string in input */ -std::unique_ptr minhash( +[[deprecated]] std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::numeric_scalar seed = 0, cudf::size_type width = 4, @@ -71,6 +73,8 @@ std::unique_ptr minhash( * * Any null row entries result in corresponding null output rows. * + * @deprecated Deprecated in 24.12 - to be replaced in a future release + * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit @@ -83,7 +87,7 @@ std::unique_ptr minhash( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr minhash( +[[deprecated]] std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::device_span seeds, cudf::size_type width = 4, @@ -102,6 +106,8 @@ std::unique_ptr minhash( * The hash function returns 2 uint64 values but only the first value * is used with the minhash calculation. * + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if the width < 2 * * @param input Strings column to compute minhash @@ -112,7 +118,7 @@ std::unique_ptr minhash( * @param mr Device memory resource used to allocate the returned column's device memory * @return Minhash values as UINT64 for each string in input */ -std::unique_ptr minhash64( +[[deprecated]] std::unique_ptr minhash64( cudf::strings_column_view const& input, cudf::numeric_scalar seed = 0, cudf::size_type width = 4, @@ -132,6 +138,8 @@ std::unique_ptr minhash64( * * Any null row entries result in corresponding null output rows. 
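Note on the testing_main.hpp change above: the new `init_cudf_test` entry point lets a test binary override individual settings programmatically while any field left empty still falls back to the command line. A minimal sketch of a gtest main that forces a specific allocator, assuming `"pool"` is one of the mode strings accepted by `create_memory_resource` (the valid mode strings, and the exact namespace placement of `init_cudf_test`, are not visible in this diff):

    #include <cudf_test/testing_main.hpp>

    int main(int argc, char** argv)
    {
      ::testing::InitGoogleTest(&argc, argv);
      cudf::test::config cfg;  // namespace assumed from the hunk above
      cfg.rmm_mode = "pool";   // assumed mode string, not shown in this diff
      // stream_mode / stream_error_mode stay empty, so they are still taken
      // from --stream_mode and --stream_error_mode on the command line.
      init_cudf_test(argc, argv, cfg);
      return RUN_ALL_TESTS();
    }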
* + * @deprecated Deprecated in 24.12 - to be replaced in a future release + * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit @@ -144,7 +152,7 @@ std::unique_ptr minhash64( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr minhash64( +[[deprecated]] std::unique_ptr minhash64( cudf::strings_column_view const& input, cudf::device_span seeds, cudf::size_type width = 4, @@ -164,6 +172,8 @@ std::unique_ptr minhash64( * * Any null row entries result in corresponding null output rows. * + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * @@ -173,7 +183,7 @@ std::unique_ptr minhash64( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr word_minhash( +[[deprecated]] std::unique_ptr word_minhash( cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -193,6 +203,8 @@ std::unique_ptr word_minhash( * * Any null row entries result in corresponding null output rows. * + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * @@ -202,7 +214,7 @@ std::unique_ptr word_minhash( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr word_minhash64( +[[deprecated]] std::unique_ptr word_minhash64( cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream = cudf::get_default_stream(), diff --git a/cpp/include/nvtext/replace.hpp b/cpp/include/nvtext/replace.hpp index bbd0503379b..822edcbdb43 100644 --- a/cpp/include/nvtext/replace.hpp +++ b/cpp/include/nvtext/replace.hpp @@ -82,7 +82,7 @@ namespace CUDF_EXPORT nvtext { * The default of empty string will identify tokens using whitespace. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings columns of with replaced strings + * @return New strings column with replaced strings */ std::unique_ptr replace_tokens( cudf::strings_column_view const& input, @@ -131,7 +131,7 @@ std::unique_ptr replace_tokens( * The default of empty string will identify tokens using whitespace. 
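For reference, the `[[deprecated]]` attribute applied to the minhash overloads above changes nothing at runtime; it only makes the compiler warn at every call site, which is how consumers get advance notice before the 24.12 removals. A self-contained illustration with hypothetical function names:

    // Hypothetical names; only the attribute semantics mirror the diff.
    [[deprecated("use new_api() instead; removal planned in a future release")]]
    int old_api(int x) { return x + 1; }

    int new_api(int x) { return x + 1; }

    int main()
    {
      return old_api(1);  // compiles and runs, but the compiler warns here
    }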
* @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings columns of with replaced strings + * @return New strings column of filtered strings */ std::unique_ptr filter_tokens( cudf::strings_column_view const& input, diff --git a/cpp/include/nvtext/stemmer.hpp b/cpp/include/nvtext/stemmer.hpp index 55a4124bfd0..e5b2a4cc21b 100644 --- a/cpp/include/nvtext/stemmer.hpp +++ b/cpp/include/nvtext/stemmer.hpp @@ -51,7 +51,7 @@ enum class letter_type { * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * b1 = is_letter(st, VOWEL, 1) * b1 is now [false, true, true] * @endcode @@ -62,7 +62,7 @@ enum class letter_type { * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * b2 = is_letter(st, CONSONANT, -1) // last letter checked in each string * b2 is now [false, true, false] * @endcode @@ -99,7 +99,7 @@ std::unique_ptr is_letter( * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * ix = [3, 1, 4] * b1 = is_letter(st, VOWEL, ix) * b1 is now [true, true, false] @@ -111,7 +111,7 @@ std::unique_ptr is_letter( * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * ix = [3, -2, 4] // 2nd to last character in st[1] is checked * b2 = is_letter(st, CONSONANT, ix) * b2 is now [false, false, true] diff --git a/cpp/scripts/run-clang-tidy.py b/cpp/scripts/run-clang-tidy.py deleted file mode 100644 index e5e57dbf562..00000000000 --- a/cpp/scripts/run-clang-tidy.py +++ /dev/null @@ -1,253 +0,0 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import re -import os -import subprocess -import argparse -import json -import multiprocessing as mp -import shutil - - -EXPECTED_VERSION = "16.0.6" -VERSION_REGEX = re.compile(r" LLVM version ([0-9.]+)") -GPU_ARCH_REGEX = re.compile(r"sm_(\d+)") -SPACES = re.compile(r"\s+") -SEPARATOR = "-" * 16 - - -def parse_args(): - argparser = argparse.ArgumentParser("Runs clang-tidy on a project") - argparser.add_argument("-cdb", type=str, - # TODO This is a hack, needs to be fixed - default="cpp/build/cuda-11.5.0/clang-tidy/release/compile_commands.clangd.json", - help="Path to cmake-generated compilation database" - " file. It is always found inside the root of the " - "cmake build folder. 
So make sure that `cmake` has " - "been run once before running this script!") - argparser.add_argument("-exe", type=str, default="clang-tidy", - help="Path to clang-tidy exe") - argparser.add_argument("-ignore", type=str, default="[.]cu$|examples/kmeans/", - help="Regex used to ignore files from checking") - argparser.add_argument("-select", type=str, default=None, - help="Regex used to select files for checking") - argparser.add_argument("-j", type=int, default=-1, - help="Number of parallel jobs to launch.") - args = argparser.parse_args() - if args.j <= 0: - args.j = mp.cpu_count() - args.ignore_compiled = re.compile(args.ignore) if args.ignore else None - args.select_compiled = re.compile(args.select) if args.select else None - ret = subprocess.check_output("%s --version" % args.exe, shell=True) - ret = ret.decode("utf-8") - version = VERSION_REGEX.search(ret) - if version is None: - raise Exception("Failed to figure out clang-tidy version!") - version = version.group(1) - if version != EXPECTED_VERSION: - raise Exception("clang-tidy exe must be v%s found '%s'" % \ - (EXPECTED_VERSION, version)) - if not os.path.exists(args.cdb): - raise Exception("Compilation database '%s' missing" % args.cdb) - return args - - -def get_all_commands(cdb): - with open(cdb) as fp: - return json.load(fp) - - -def get_gpu_archs(command): - archs = [] - for loc in range(len(command)): - if command[loc] != "-gencode": - continue - arch_flag = command[loc + 1] - match = GPU_ARCH_REGEX.search(arch_flag) - if match is not None: - archs.append("--cuda-gpu-arch=sm_%s" % match.group(1)) - return archs - - -def get_index(arr, item): - try: - return arr.index(item) - except: - return -1 - - -def remove_item(arr, item): - loc = get_index(arr, item) - if loc >= 0: - del arr[loc] - return loc - - -def remove_item_plus_one(arr, item): - loc = get_index(arr, item) - if loc >= 0: - del arr[loc + 1] - del arr[loc] - return loc - - -def get_clang_includes(exe): - dir = os.getenv("CONDA_PREFIX") - if dir is None: - ret = subprocess.check_output("which %s 2>&1" % exe, shell=True) - ret = ret.decode("utf-8") - dir = os.path.dirname(os.path.dirname(ret)) - header = os.path.join(dir, "include", "ClangHeaders") - return ["-I", header] - - -def get_tidy_args(cmd, exe): - command, file = cmd["command"], cmd["file"] - is_cuda = file.endswith(".cu") - command = re.split(SPACES, command) - # compiler is always clang++! - command[0] = "clang++" - # remove compilation and output targets from the original command - remove_item_plus_one(command, "-c") - remove_item_plus_one(command, "-o") - if is_cuda: - # replace nvcc's "-gencode ..." with clang's "--cuda-gpu-arch ..." 
- archs = get_gpu_archs(command) - command.extend(archs) - while True: - loc = remove_item_plus_one(command, "-gencode") - if loc < 0: - break - # "-x cuda" is the right usage in clang - loc = get_index(command, "-x") - if loc >= 0: - command[loc + 1] = "cuda" - remove_item_plus_one(command, "-ccbin") - remove_item(command, "--expt-extended-lambda") - remove_item(command, "--diag_suppress=unrecognized_gcc_pragma") - command.extend(get_clang_includes(exe)) - return command, is_cuda - - -def run_clang_tidy_command(tidy_cmd): - cmd = " ".join(tidy_cmd) - result = subprocess.run(cmd, check=False, shell=True, - stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - status = result.returncode == 0 - if status: - out = "" - else: - out = "CMD: " + cmd - out += result.stdout.decode("utf-8").rstrip() - return status, out - - -def run_clang_tidy(cmd, args): - command, is_cuda = get_tidy_args(cmd, args.exe) - tidy_cmd = [args.exe, - "-header-filter='.*cudf/cpp/(src|include|bench|comms).*'", - cmd["file"], "--", ] - tidy_cmd.extend(command) - status = True - out = "" - if is_cuda: - tidy_cmd.append("--cuda-device-only") - tidy_cmd.append(cmd["file"]) - ret, out1 = run_clang_tidy_command(tidy_cmd) - out += out1 - out += "%s" % SEPARATOR - if not ret: - status = ret - tidy_cmd[-2] = "--cuda-host-only" - ret, out1 = run_clang_tidy_command(tidy_cmd) - if not ret: - status = ret - out += out1 - else: - tidy_cmd.append(cmd["file"]) - ret, out1 = run_clang_tidy_command(tidy_cmd) - if not ret: - status = ret - out += out1 - return status, out, cmd["file"] - - -# yikes! global var :( -results = [] -def collect_result(result): - global results - results.append(result) - - -def print_result(passed, stdout, file): - status_str = "PASSED" if passed else "FAILED" - print(f"{SEPARATOR} File:{file} {status_str} {SEPARATOR}") - if stdout: - print(stdout) - print(f"{SEPARATOR} File:{file} ENDS {SEPARATOR}") - - -def print_results(): - global results - status = True - for passed, stdout, file in results: - print_result(passed, stdout, file) - if not passed: - status = False - return status - - -def run_tidy_for_all_files(args, all_files): - pool = None if args.j == 1 else mp.Pool(args.j) - # actual tidy checker - for cmd in all_files: - # skip files that we don't want to look at - if args.ignore_compiled is not None and \ - re.search(args.ignore_compiled, cmd["file"]) is not None: - continue - if args.select_compiled is not None and \ - re.search(args.select_compiled, cmd["file"]) is None: - continue - if pool is not None: - pool.apply_async(run_clang_tidy, args=(cmd, args), - callback=collect_result) - else: - passed, stdout, file = run_clang_tidy(cmd, args) - collect_result((passed, stdout, file)) - if pool is not None: - pool.close() - pool.join() - return print_results() - - -def main(): - args = parse_args() - # Attempt to making sure that we run this script from root of repo always - if not os.path.exists(".git"): - raise Exception("This needs to always be run from the root of repo") - # Check whether clang-tidy exists - # print(args) - if "exe" not in args and shutil.which("clang-tidy") is not None: - print("clang-tidy not found. Exiting...") - return - all_files = get_all_commands(args.cdb) - status = run_tidy_for_all_files(args, all_files) - if not status: - raise Exception("clang-tidy failed! 
Refer to the errors above.") - - -if __name__ == "__main__": - main() diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 4ca05f9c335..e6659f76c7c 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -32,7 +33,6 @@ #include #include -#include #include #include @@ -329,7 +329,7 @@ cudf::size_type count_set_bits(bitmask_type const* bitmask, cudf::detail::grid_1d grid(num_words, block_size); - rmm::device_scalar non_zero_count(0, stream); + cudf::detail::device_scalar non_zero_count(0, stream); count_set_bits_kernel <<>>( diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index b8e140f1fa5..d8419760120 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -162,7 +163,7 @@ size_type concatenate_masks(device_span d_views, size_type output_size, rmm::cuda_stream_view stream) { - rmm::device_scalar d_valid_count(0, stream); + cudf::detail::device_scalar d_valid_count(0, stream); constexpr size_type block_size{256}; cudf::detail::grid_1d config(output_size, block_size); concatenate_masks_kernel @@ -265,7 +266,7 @@ std::unique_ptr fused_concatenate(host_span views, auto out_view = out_col->mutable_view(); auto d_out_view = mutable_column_device_view::create(out_view, stream); - rmm::device_scalar d_valid_count(0, stream); + cudf::detail::device_scalar d_valid_count(0, stream); // Launch kernel constexpr size_type block_size{256}; diff --git a/cpp/src/copying/get_element.cu b/cpp/src/copying/get_element.cu index 29a28f81d1a..80b0bd5242f 100644 --- a/cpp/src/copying/get_element.cu +++ b/cpp/src/copying/get_element.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -71,7 +72,7 @@ struct get_element_functor { auto device_col = column_device_view::create(input, stream); rmm::device_scalar temp_data(stream, mr); - rmm::device_scalar temp_valid(stream, mr); + cudf::detail::device_scalar temp_valid(stream, mr); device_single_thread( [buffer = temp_data.data(), @@ -155,8 +156,8 @@ struct get_element_functor { auto device_col = column_device_view::create(input, stream); - rmm::device_scalar temp_data(stream, mr); - rmm::device_scalar temp_valid(stream, mr); + cudf::detail::device_scalar temp_data(stream, mr); + cudf::detail::device_scalar temp_valid(stream, mr); device_single_thread( [buffer = temp_data.data(), diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index ddb0dbcd96d..a497cedb3bc 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -44,19 +44,6 @@ namespace cudf { namespace datetime { namespace detail { -enum class datetime_component { - INVALID = 0, - YEAR, - MONTH, - DAY, - WEEKDAY, - HOUR, - MINUTE, - SECOND, - MILLISECOND, - MICROSECOND, - NANOSECOND -}; enum class rounding_function { CEIL, ///< Rounds up to the next integer multiple of the provided frequency @@ -453,90 +440,70 @@ std::unique_ptr extract_year(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::YEAR, stream, mr); } std::unique_ptr extract_month(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - 
return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MONTH, stream, mr); } std::unique_ptr extract_day(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::DAY, stream, mr); } std::unique_ptr extract_weekday(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::WEEKDAY, stream, mr); } std::unique_ptr extract_hour(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::HOUR, stream, mr); } std::unique_ptr extract_minute(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MINUTE, stream, mr); } std::unique_ptr extract_second(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::SECOND, stream, mr); } std::unique_ptr extract_millisecond_fraction(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MILLISECOND, stream, mr); } std::unique_ptr extract_microsecond_fraction(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MICROSECOND, stream, mr); } std::unique_ptr extract_nanosecond_fraction(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::NANOSECOND, stream, mr); } std::unique_ptr last_day_of_month(column_view const& column, @@ -576,6 +543,32 @@ std::unique_ptr extract_quarter(column_view const& column, return apply_datetime_op(column, stream, mr); } +std::unique_ptr extract_datetime_component(cudf::column_view const& column, + datetime_component component, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ +#define extract(field) \ + case field: \ + return apply_datetime_op, cudf::type_id::INT16>( \ + column, stream, mr) + + switch (component) { + extract(datetime_component::YEAR); + 
extract(datetime_component::MONTH); + extract(datetime_component::DAY); + extract(datetime_component::WEEKDAY); + extract(datetime_component::HOUR); + extract(datetime_component::MINUTE); + extract(datetime_component::SECOND); + extract(datetime_component::MILLISECOND); + extract(datetime_component::MICROSECOND); + extract(datetime_component::NANOSECOND); + default: CUDF_FAIL("Unsupported datetime component."); + } +#undef extract +} + } // namespace detail std::unique_ptr ceil_datetimes(column_view const& column, @@ -661,6 +654,15 @@ std::unique_ptr extract_second(column_view const& column, return detail::extract_second(column, stream, mr); } +std::unique_ptr extract_datetime_component(cudf::column_view const& column, + datetime_component component, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_datetime_component(column, component, stream, mr); +} + std::unique_ptr extract_millisecond_fraction(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index a6b6cbbf0b5..2196ee97fee 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -138,7 +138,7 @@ struct timezone_file { std::filesystem::path{tzif_dir.value_or(tzif_system_directory)} / timezone_name; std::ifstream fin; fin.open(tz_filename, ios_base::in | ios_base::binary | ios_base::ate); - CUDF_EXPECTS(fin, "Failed to open the timezone file."); + CUDF_EXPECTS(fin, "Failed to open the timezone file '" + tz_filename.string() + "'"); auto const file_size = fin.tellg(); fin.seekg(0); diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu new file mode 100644 index 00000000000..59457bea694 --- /dev/null +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
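The new public `cudf::datetime::extract_datetime_component` above folds the ten per-field extractors into a single enum-dispatched entry point (the local `extract` case macro is #undef'd immediately after the switch, so it cannot leak out of the translation unit). A hedged usage sketch, assuming the `datetime_component` enum now lives in the public `cudf::datetime` namespace (its detail-namespace copy is removed above) and that the public declaration defaults its stream and memory-resource parameters like the neighboring extractors do:

    #include <cudf/column/column.hpp>
    #include <cudf/column/column_view.hpp>
    #include <cudf/datetime.hpp>

    // `ts` is assumed to be a timestamp-typed column. Equivalent to the older
    // extract_hour(ts); the per-field extractors document an INT16 result.
    std::unique_ptr<cudf::column> hour_of(cudf::column_view const& ts)
    {
      return cudf::datetime::extract_datetime_component(
        ts, cudf::datetime::datetime_component::HOUR);
    }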
+ */ + +#include "compute_groupby.hpp" +#include "compute_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "sparse_to_dense_results.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +rmm::device_uvector extract_populated_keys(SetType const& key_set, + size_type num_keys, + rmm::cuda_stream_view stream) +{ + rmm::device_uvector populated_keys(num_keys, stream); + auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); + + populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); + return populated_keys; +} + +template +std::unique_ptr compute_groupby(table_view const& keys, + host_span requests, + bool skip_rows_with_nulls, + Equal const& d_row_equal, + Hash const& d_row_hash, + cudf::detail::result_cache* cache, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + // convert to int64_t to avoid potential overflow with large `keys` + auto const num_keys = static_cast(keys.num_rows()); + + // Cache of sparse results where the location of aggregate value in each + // column is indexed by the hash set + cudf::detail::result_cache sparse_results(requests.size()); + + auto const set = cuco::static_set{ + num_keys, + cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% load factor + cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + d_row_equal, + probing_scheme_t{d_row_hash}, + cuco::thread_scope_device, + cuco::storage{}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; + + auto row_bitmask = + skip_rows_with_nulls + ? cudf::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first + : rmm::device_buffer{}; + + // Compute all single pass aggs first + compute_single_pass_aggs(num_keys, + skip_rows_with_nulls, + static_cast(row_bitmask.data()), + set.ref(cuco::insert_and_find), + requests, + &sparse_results, + stream); + + // Extract the populated indices from the hash set and create a gather map. + // Gathering using this map from sparse results will give dense results. + auto gather_map = extract_populated_keys(set, keys.num_rows(), stream); + + // Compact all results from sparse_results and insert into cache + sparse_to_dense_results(requests, + &sparse_results, + cache, + gather_map, + set.ref(cuco::find), + static_cast(row_bitmask.data()), + stream, + mr); + + return cudf::detail::gather(keys, + gather_map, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); +} + +template rmm::device_uvector extract_populated_keys( + global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream); + +template rmm::device_uvector extract_populated_keys( + nullable_global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream); + +template std::unique_ptr
compute_groupby( + table_view const& keys, + host_span requests, + bool skip_rows_with_nulls, + row_comparator_t const& d_row_equal, + row_hash_t const& d_row_hash, + cudf::detail::result_cache* cache, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +template std::unique_ptr
compute_groupby( + table_view const& keys, + host_span requests, + bool skip_rows_with_nulls, + nullable_row_comparator_t const& d_row_equal, + row_hash_t const& d_row_hash, + cudf::detail::result_cache* cache, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_groupby.hpp b/cpp/src/groupby/hash/compute_groupby.hpp new file mode 100644 index 00000000000..7bb3a60ff07 --- /dev/null +++ b/cpp/src/groupby/hash/compute_groupby.hpp @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +/** + * @brief Computes and returns a device vector containing all populated keys in + * `key_set`. + * + * @tparam SetType Type of key hash set + * + * @param key_set Key hash set + * @param num_keys Number of input keys + * @param stream CUDA stream used for device memory operations and kernel launches + * @return An array of unique keys contained in `key_set` + */ +template +rmm::device_uvector extract_populated_keys(SetType const& key_set, + size_type num_keys, + rmm::cuda_stream_view stream); + +/** + * @brief Computes groupby using a hash table. + * + * First, we create a hash table that stores the indices of unique rows in + * `keys`. The upper limit on the number of values in this map is the number + * of rows in `keys`. + * + * To store the results of aggregations, we create temporary sparse columns + * which have the same size as input value columns. Using the hash map, we + * determine the location within the sparse column to write the result of the + * aggregation into. + * + * The sparse column results of all aggregations are stored into the cache + * `sparse_results`. This enables the use of previously calculated results in + * other aggregations. + * + * All the aggregations which can be computed in a single pass are computed + * first, in a combined kernel. Then, using these results, aggregations that + * require multiple passes are computed. + * + * Finally, using the hash map, we generate a vector of indices of populated + * values in sparse result columns. Then, for each aggregation originally + * requested in `requests`, we gather sparse results into a column of dense + * results using the aforementioned index vector. Dense results are stored into + * the in/out parameter `cache`.
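To make the sparse-to-dense step described above concrete: the gather map is simply the list of hash-set slots that actually received a key, so gathering any sparse aggregate column through it compacts the results. A host-side toy illustration (illustrative only; the real code performs this on device via cudf::detail::gather):

    #include <cstdio>
    #include <vector>

    int main()
    {
      // Sparse results: one slot per input row; only slots 1 and 4 were used.
      std::vector<int> sparse_sums = {0, 7, 0, 0, 42};
      std::vector<int> gather_map  = {1, 4};  // populated slots from the hash set

      std::vector<int> dense_sums;
      for (int idx : gather_map) dense_sums.push_back(sparse_sums[idx]);

      for (int v : dense_sums) std::printf("%d\n", v);  // prints 7 then 42
    }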
+ * + * @tparam Equal Device row comparator type + * @tparam Hash Device row hasher type + * + * @param keys Table whose rows act as the groupby keys + * @param requests The set of columns to aggregate and the aggregations to perform + * @param skip_rows_with_nulls Flag indicating whether to ignore nulls or not + * @param d_row_equal Device row comparator + * @param d_row_hash Device row hasher + * @param cache Dense aggregation results + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table + * @return Table of unique keys + */ +template +std::unique_ptr compute_groupby(table_view const& keys, + host_span requests, + bool skip_rows_with_nulls, + Equal const& d_row_equal, + Hash const& d_row_hash, + cudf::detail::result_cache* cache, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu new file mode 100644 index 00000000000..e292543e6e9 --- /dev/null +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cu @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
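Because `compute_groupby` is only declared in this header and defined in the .cu file, the two `template std::unique_ptr ... compute_groupby(...)` lines earlier in the diff are explicit instantiation definitions: they force the compiler to emit code for exactly those comparator/hasher combinations so that other translation units can link against them without ever seeing the template body. The same pattern in miniature, with hypothetical names:

    // widget.hpp -- declaration only; consumers never see the body.
    template <typename T>
    T twice(T value);

    // widget.cpp -- the definition plus the instantiations we promise to provide.
    template <typename T>
    T twice(T value) { return value + value; }

    template int twice<int>(int);           // explicit instantiation definition
    template double twice<double>(double);  // linking with other types would fail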
+ */ + +#include "compute_single_pass_aggs.hpp" +#include "create_sparse_results_table.hpp" +#include "flatten_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" +#include "var_hash_functor.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace cudf::groupby::detail::hash { +/** + * @brief Computes all aggregations from `requests` that require a single pass + * over the data and stores the results in `sparse_results` + */ +template +void compute_single_pass_aggs(int64_t num_keys, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + SetType set, + host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream) +{ + // flatten the aggs to a table that can be operated on by aggregate_row + auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); + + // make table that will hold sparse results + table sparse_table = create_sparse_results_table(flattened_values, agg_kinds, stream); + // prepare to launch kernel to do the actual aggregation + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto d_values = table_device_view::create(flattened_values, stream); + auto const d_aggs = cudf::detail::make_device_uvector_async( + agg_kinds, stream, cudf::get_current_device_resource_ref()); + + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + num_keys, + hash::compute_single_pass_aggs_fn{ + set, *d_values, *d_sparse_table, d_aggs.data(), row_bitmask, skip_rows_with_nulls}); + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggs.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + sparse_results->add_result( + flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); + } +} + +template void compute_single_pass_aggs>( + int64_t num_keys, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + hash_set_ref_t set, + host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); + +template void compute_single_pass_aggs>( + int64_t num_keys, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + nullable_hash_set_ref_t set, + host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp new file mode 100644 index 00000000000..a7434bdf61a --- /dev/null +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
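The single-pass launch in compute_single_pass_aggs.cu above is a `thrust::for_each_n` over a counting iterator, i.e. one functor invocation per input row index instead of a hand-written kernel. A minimal host-compilable sketch of the same idiom (plain Thrust with the host policy; the diff uses `rmm::exec_policy_nosync` to run the cudf functor on the GPU instead):

    #include <thrust/execution_policy.h>
    #include <thrust/for_each.h>
    #include <thrust/iterator/counting_iterator.h>

    #include <vector>

    int main()
    {
      std::vector<int> out(8);
      // One functor call per index 0..7, mirroring the per-row aggregation loop.
      thrust::for_each_n(thrust::host,
                         thrust::counting_iterator<int>(0),
                         8,
                         [&out](int i) { out[i] = i * i; });
    }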
+ */ +#pragma once + +#include +#include +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +/** + * @brief Computes all aggregations from `requests` that require a single pass + * over the data and stores the results in `sparse_results` + */ +template +void compute_single_pass_aggs(int64_t num_keys, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + SetType set, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu new file mode 100644 index 00000000000..22fa4fc584c --- /dev/null +++ b/cpp/src/groupby/hash/create_sparse_results_table.cu @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "create_sparse_results_table.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace cudf::groupby::detail::hash { +// make table that will hold sparse results +cudf::table create_sparse_results_table(table_view const& flattened_values, + std::vector aggs, + rmm::cuda_stream_view stream) +{ + // TODO single allocation - room for performance improvement + std::vector> sparse_columns; + sparse_columns.reserve(flattened_values.num_columns()); + std::transform( + flattened_values.begin(), + flattened_values.end(), + aggs.begin(), + std::back_inserter(sparse_columns), + [stream](auto const& col, auto const& agg) { + bool nullable = + (agg == aggregation::COUNT_VALID or agg == aggregation::COUNT_ALL) + ? false + : (col.has_nulls() or agg == aggregation::VARIANCE or agg == aggregation::STD); + auto mask_flag = (nullable) ? mask_state::ALL_NULL : mask_state::UNALLOCATED; + + auto col_type = cudf::is_dictionary(col.type()) + ? cudf::dictionary_column_view(col).keys().type() + : col.type(); + + return make_fixed_width_column( + cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); + }); + + table sparse_table(std::move(sparse_columns)); + mutable_table_view table_view = sparse_table.mutable_view(); + cudf::detail::initialize_with_identity(table_view, aggs, stream); + return sparse_table; +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp new file mode 100644 index 00000000000..c1d4e0d3f20 --- /dev/null +++ b/cpp/src/groupby/hash/create_sparse_results_table.hpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
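The `nullable` expression in `create_sparse_results_table` above encodes a small rule worth stating explicitly: COUNT outputs can never be null, VARIANCE/STD outputs start ALL_NULL because they are filled in by a later pass, and everything else needs a mask only when the input column has nulls. Restated as a standalone predicate (a sketch; only the rule itself is taken from the diff):

    #include <cudf/aggregation.hpp>

    // Mirrors the mask selection in create_sparse_results_table above.
    bool starts_all_null(cudf::aggregation::Kind k, bool input_has_nulls)
    {
      using cudf::aggregation;
      // COUNT outputs are always valid, so they get no null mask at all.
      if (k == aggregation::COUNT_VALID || k == aggregation::COUNT_ALL) return false;
      // VARIANCE/STD are computed in a second pass, so they begin ALL_NULL.
      return input_has_nulls || k == aggregation::VARIANCE || k == aggregation::STD;
    }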
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include + +#include + +namespace cudf::groupby::detail::hash { +// make table that will hold sparse results +cudf::table create_sparse_results_table(table_view const& flattened_values, + std::vector aggs_kinds, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp new file mode 100644 index 00000000000..b2048a9fbb8 --- /dev/null +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flatten_single_pass_aggs.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +class groupby_simple_aggregations_collector final + : public cudf::detail::simple_aggregations_collector { + public: + using cudf::detail::simple_aggregations_collector::visit; + + std::vector> visit(data_type col_type, + cudf::detail::min_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(col_type.id() == type_id::STRING ? make_argmin_aggregation() + : make_min_aggregation()); + return aggs; + } + + std::vector> visit(data_type col_type, + cudf::detail::max_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(col_type.id() == type_id::STRING ? 
make_argmax_aggregation() + : make_max_aggregation()); + return aggs; + } + + std::vector> visit(data_type col_type, + cudf::detail::mean_aggregation const&) override + { + (void)col_type; + CUDF_EXPECTS(is_fixed_width(col_type), "MEAN aggregation expects fixed width type"); + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit(data_type, + cudf::detail::var_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit(data_type, + cudf::detail::std_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit( + data_type, cudf::detail::correlation_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } +}; + +// flatten aggs to filter in single pass aggs +std::tuple, std::vector>> +flatten_single_pass_aggs(host_span requests) +{ + std::vector columns; + std::vector> aggs; + std::vector agg_kinds; + + for (auto const& request : requests) { + auto const& agg_v = request.aggregations; + + std::unordered_set agg_kinds_set; + auto insert_agg = [&](column_view const& request_values, std::unique_ptr&& agg) { + if (agg_kinds_set.insert(agg->kind).second) { + agg_kinds.push_back(agg->kind); + aggs.push_back(std::move(agg)); + columns.push_back(request_values); + } + }; + + auto values_type = cudf::is_dictionary(request.values.type()) + ? cudf::dictionary_column_view(request.values).keys().type() + : request.values.type(); + for (auto&& agg : agg_v) { + groupby_simple_aggregations_collector collector; + + for (auto& agg_s : agg->get_simple_aggregations(values_type, collector)) { + insert_agg(request.values, std::move(agg_s)); + } + } + } + + return std::make_tuple(table_view(columns), std::move(agg_kinds), std::move(aggs)); +} + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp new file mode 100644 index 00000000000..2bf983e5e90 --- /dev/null +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
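`flatten_single_pass_aggs` works because compound aggregations decompose into single-pass ones, as the collector above shows: MEAN, VARIANCE, and STD all expand to SUM plus COUNT_VALID, with the finalizer combining them afterwards. A tiny numeric check of the MEAN case (illustrative only):

    #include <cstdio>

    int main()
    {
      // Group values {3, 5, 10}: the single-pass kernel produces SUM and
      // COUNT_VALID; the finalizer then computes MEAN = SUM / COUNT_VALID.
      double sum   = 3 + 5 + 10;  // 18
      int    count = 3;
      std::printf("mean = %f\n", sum / count);  // 6.0
    }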
+ */ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +// flatten aggs to filter in single pass aggs +std::tuple, std::vector>> +flatten_single_pass_aggs(host_span requests); + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh new file mode 100644 index 00000000000..50e89c727ff --- /dev/null +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -0,0 +1,277 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +struct update_target_element_gmem { + __device__ void operator()(cudf::mutable_column_device_view, + cudf::size_type, + cudf::column_device_view, + cuda::std::byte*, + cudf::size_type) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MIN, + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using DeviceType = cudf::detail::underlying_target_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_min(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MAX, + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using DeviceType = cudf::detail::underlying_target_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_max(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::SUM, + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_timestamp()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using DeviceType = cudf::detail::underlying_target_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + if 
(target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +// The shared memory will already have it squared +template +struct update_target_element_gmem< + Source, + cudf::aggregation::SUM_OF_SQUARES, + cuda::std::enable_if_t()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + Target value = static_cast(source_casted[source_index]); + + cudf::detail::atomic_add(&target.element(target_index), value); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::PRODUCT, + cuda::std::enable_if_t()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_mul(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +// Assuming that the target column of COUNT_VALID, COUNT_ALL would be using fixed_width column and +// non-fixed point column +template +struct update_target_element_gmem< + Source, + cudf::aggregation::COUNT_VALID, + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + // It is assumed the output for COUNT_VALID is initialized to be all valid + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::COUNT_ALL, + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + // It is assumed the output for COUNT_ALL is initialized to be all valid + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::ARGMAX, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* source_casted = reinterpret_cast(source); + auto source_argmax_index = source_casted[source_index]; + auto old = cudf::detail::atomic_cas( + &target.element(target_index), cudf::detail::ARGMAX_SENTINEL, source_argmax_index); + if (old != 
cudf::detail::ARGMAX_SENTINEL) { + while (source_column.element(source_argmax_index) > + source_column.element(old)) { + old = + cudf::detail::atomic_cas(&target.element(target_index), old, source_argmax_index); + } + } + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; +template +struct update_target_element_gmem< + Source, + cudf::aggregation::ARGMIN, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* source_casted = reinterpret_cast(source); + auto source_argmin_index = source_casted[source_index]; + auto old = cudf::detail::atomic_cas( + &target.element(target_index), cudf::detail::ARGMIN_SENTINEL, source_argmin_index); + if (old != cudf::detail::ARGMIN_SENTINEL) { + while (source_column.element(source_argmin_index) < + source_column.element(old)) { + old = + cudf::detail::atomic_cas(&target.element(target_index), old, source_argmin_index); + } + } + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +/** + * @brief A functor that updates a single element in the target column stored in global memory by + * applying an aggregation operation to a corresponding element from a source column in shared + * memory. + * + * This functor can NOT be used for dictionary columns. + * + * This is a redundant copy replicating the behavior of `elementwise_aggregator` from + * `cudf/detail/aggregation/device_aggregators.cuh`. The key difference is that this functor accepts + * a pointer to raw bytes as the source, as `column_device_view` cannot yet be constructed from + * shared memory. + */ +struct gmem_element_aggregator { + template + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + bool* source_mask, + cudf::size_type source_index) const noexcept + { + // Early exit for all aggregation kinds since shared memory aggregation of + // `COUNT_ALL` is always valid + if (!source_mask[source_index]) { return; } + + update_target_element_gmem{}( + target, target_index, source_column, source, source_index); + } +}; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index f9a80a048b5..30e1d52fdbf 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -14,58 +14,32 @@ * limitations under the License. 
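The ARGMAX/ARGMIN updates above avoid any per-slot lock by looping on compare-and-swap: a thread publishes its candidate row index, and if another index is already installed it keeps retrying only while its candidate's value is strictly better. The same loop expressed with `std::atomic` on the host (the diff uses `cudf::detail::atomic_cas` in device code; the sentinel value here is a stand-in):

    #include <atomic>
    #include <vector>

    constexpr int SENTINEL = -1;  // stand-in for cudf::detail::ARGMAX_SENTINEL

    void argmax_update(std::atomic<int>& target, int candidate,
                       std::vector<double> const& values)
    {
      int old = SENTINEL;
      // The first writer claims the empty slot outright.
      if (target.compare_exchange_strong(old, candidate)) return;
      // Otherwise keep swapping while our value beats the current holder's;
      // on failure, compare_exchange_strong reloads `old` with the new holder.
      while (values[candidate] > values[old]) {
        if (target.compare_exchange_strong(old, candidate)) return;
      }
    }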
*/ +#include "compute_groupby.hpp" #include "groupby/common/utils.hpp" -#include "groupby/hash/groupby_kernels.cuh" +#include "helpers.cuh" #include -#include -#include -#include -#include -#include #include -#include -#include -#include #include -#include -#include -#include +#include #include #include -#include #include #include -#include #include #include -#include #include #include #include -#include -#include -#include - +#include #include -#include #include +#include -namespace cudf { -namespace groupby { -namespace detail { -namespace hash { +namespace cudf::groupby::detail::hash { namespace { - -// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested -// types and `cg_size = 1`for flat data to improve performance -using probing_scheme_type = cuco::linear_probing< - 1, ///< Number of threads used to handle each input key - cudf::experimental::row::hash::device_row_hasher>; - /** * @brief List of aggregation operations that can be computed with a hash-based * implementation. @@ -110,517 +84,33 @@ bool constexpr is_hash_aggregation(aggregation::Kind t) return array_contains(hash_aggregations, t); } -class groupby_simple_aggregations_collector final - : public cudf::detail::simple_aggregations_collector { - public: - using cudf::detail::simple_aggregations_collector::visit; - - std::vector> visit(data_type col_type, - cudf::detail::min_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(col_type.id() == type_id::STRING ? make_argmin_aggregation() - : make_min_aggregation()); - return aggs; - } - - std::vector> visit(data_type col_type, - cudf::detail::max_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(col_type.id() == type_id::STRING ? make_argmax_aggregation() - : make_max_aggregation()); - return aggs; - } - - std::vector> visit(data_type col_type, - cudf::detail::mean_aggregation const&) override - { - (void)col_type; - CUDF_EXPECTS(is_fixed_width(col_type), "MEAN aggregation expects fixed width type"); - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit(data_type, - cudf::detail::var_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit(data_type, - cudf::detail::std_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit( - data_type, cudf::detail::correlation_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } -}; - -template -class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { - column_view col; - data_type result_type; - cudf::detail::result_cache* sparse_results; - cudf::detail::result_cache* dense_results; - device_span gather_map; - SetType set; - bitmask_type const* __restrict__ row_bitmask; - rmm::cuda_stream_view stream; - rmm::device_async_resource_ref mr; - - public: - using cudf::detail::aggregation_finalizer::visit; - - hash_compound_agg_finalizer(column_view col, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - SetType set, - bitmask_type const* row_bitmask, - 
rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - : col(col), - sparse_results(sparse_results), - dense_results(dense_results), - gather_map(gather_map), - set(set), - row_bitmask(row_bitmask), - stream(stream), - mr(mr) - { - result_type = cudf::is_dictionary(col.type()) ? cudf::dictionary_column_view(col).keys().type() - : col.type(); - } - - auto to_dense_agg_result(cudf::aggregation const& agg) - { - auto s = sparse_results->get_result(col, agg); - auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), - gather_map, - out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(dense_result_table->release()[0]); - } - - // Enables conversion of ARGMIN/ARGMAX into MIN/MAX - auto gather_argminmax(aggregation const& agg) - { - auto arg_result = to_dense_agg_result(agg); - // We make a view of ARG(MIN/MAX) result without a null mask and gather - // using this map. The values in data buffer of ARG(MIN/MAX) result - // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL - // which is an out of bounds index value (-1) and causes the gathered - // value to be null. - column_view null_removed_map( - data_type(type_to_id()), - arg_result->size(), - static_cast(arg_result->view().template data()), - nullptr, - 0); - auto gather_argminmax = - cudf::detail::gather(table_view({col}), - null_removed_map, - arg_result->nullable() ? cudf::out_of_bounds_policy::NULLIFY - : cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(gather_argminmax->release()[0]); - } - - // Declare overloads for each kind of aggregation to dispatch - void visit(cudf::aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - - void visit(cudf::detail::min_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - if (result_type.id() == type_id::STRING) { - auto transformed_agg = make_argmin_aggregation(); - dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); - } else { - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - } - - void visit(cudf::detail::max_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - if (result_type.id() == type_id::STRING) { - auto transformed_agg = make_argmax_aggregation(); - dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); - } else { - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - } - - void visit(cudf::detail::mean_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); - this->visit(*sum_agg); - this->visit(*count_agg); - column_view sum_result = dense_results->get_result(col, *sum_agg); - column_view count_result = dense_results->get_result(col, *count_agg); - - auto result = - cudf::detail::binary_operation(sum_result, - count_result, - binary_operator::DIV, - cudf::detail::target_type(result_type, aggregation::MEAN), - stream, - mr); - dense_results->add_result(col, agg, std::move(result)); - } - - void visit(cudf::detail::var_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); - this->visit(*sum_agg); - 
this->visit(*count_agg); - column_view sum_result = sparse_results->get_result(col, *sum_agg); - column_view count_result = sparse_results->get_result(col, *count_agg); - - auto values_view = column_device_view::create(col, stream); - auto sum_view = column_device_view::create(sum_result, stream); - auto count_view = column_device_view::create(count_result, stream); - - auto var_result = make_fixed_width_column( - cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); - auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); - mutable_table_view var_table_view{{var_result->mutable_view()}}; - cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col.size(), - ::cudf::detail::var_hash_functor{ - set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); - sparse_results->add_result(col, agg, std::move(var_result)); - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - - void visit(cudf::detail::std_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - auto var_agg = make_variance_aggregation(agg._ddof); - this->visit(*dynamic_cast(var_agg.get())); - column_view variance = dense_results->get_result(col, *var_agg); - - auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, stream, mr); - dense_results->add_result(col, agg, std::move(result)); - } -}; -// flatten aggs to filter in single pass aggs -std::tuple, std::vector>> -flatten_single_pass_aggs(host_span requests) +std::unique_ptr
dispatch_groupby(table_view const& keys, + host_span requests, + cudf::detail::result_cache* cache, + bool const keys_have_nulls, + null_policy const include_null_keys, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { - std::vector columns; - std::vector> aggs; - std::vector agg_kinds; - - for (auto const& request : requests) { - auto const& agg_v = request.aggregations; - - std::unordered_set agg_kinds_set; - auto insert_agg = [&](column_view const& request_values, std::unique_ptr&& agg) { - if (agg_kinds_set.insert(agg->kind).second) { - agg_kinds.push_back(agg->kind); - aggs.push_back(std::move(agg)); - columns.push_back(request_values); - } - }; - - auto values_type = cudf::is_dictionary(request.values.type()) - ? cudf::dictionary_column_view(request.values).keys().type() - : request.values.type(); - for (auto&& agg : agg_v) { - groupby_simple_aggregations_collector collector; - - for (auto& agg_s : agg->get_simple_aggregations(values_type, collector)) { - insert_agg(request.values, std::move(agg_s)); - } - } - } - - return std::make_tuple(table_view(columns), std::move(agg_kinds), std::move(aggs)); -} - -/** - * @brief Gather sparse results into dense using `gather_map` and add to - * `dense_cache` - * - * @see groupby_null_templated() - */ -template -void sparse_to_dense_results(table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - SetType set, - bool keys_have_nulls, - null_policy include_null_keys, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto row_bitmask = - cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first; - bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; - bitmask_type const* row_bitmask_ptr = - skip_key_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; - - for (auto const& request : requests) { - auto const& agg_v = request.aggregations; - auto const& col = request.values; - - // Given an aggregation, this will get the result from sparse_results and - // convert and return dense, compacted result - auto finalizer = hash_compound_agg_finalizer( - col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr); - for (auto&& agg : agg_v) { - agg->finalize(finalizer); - } - } -} - -// make table that will hold sparse results -auto create_sparse_results_table(table_view const& flattened_values, - std::vector aggs, - rmm::cuda_stream_view stream) -{ - // TODO single allocation - room for performance improvement - std::vector> sparse_columns; - std::transform( - flattened_values.begin(), - flattened_values.end(), - aggs.begin(), - std::back_inserter(sparse_columns), - [stream](auto const& col, auto const& agg) { - bool nullable = - (agg == aggregation::COUNT_VALID or agg == aggregation::COUNT_ALL) - ? false - : (col.has_nulls() or agg == aggregation::VARIANCE or agg == aggregation::STD); - auto mask_flag = (nullable) ? mask_state::ALL_NULL : mask_state::UNALLOCATED; - - auto col_type = cudf::is_dictionary(col.type()) - ? 
cudf::dictionary_column_view(col).keys().type() - : col.type(); - - return make_fixed_width_column( - cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); - }); - - table sparse_table(std::move(sparse_columns)); - mutable_table_view table_view = sparse_table.mutable_view(); - cudf::detail::initialize_with_identity(table_view, aggs, stream); - return sparse_table; -} - -/** - * @brief Computes all aggregations from `requests` that require a single pass - * over the data and stores the results in `sparse_results` - */ -template -void compute_single_pass_aggs(table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - SetType set, - bool keys_have_nulls, - null_policy include_null_keys, - rmm::cuda_stream_view stream) -{ - // flatten the aggs to a table that can be operated on by aggregate_row - auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); - - // make table that will hold sparse results - table sparse_table = create_sparse_results_table(flattened_values, agg_kinds, stream); - // prepare to launch kernel to do the actual aggregation - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto d_values = table_device_view::create(flattened_values, stream); - auto const d_aggs = cudf::detail::make_device_uvector_async( - agg_kinds, stream, cudf::get_current_device_resource_ref()); - auto const skip_key_rows_with_nulls = - keys_have_nulls and include_null_keys == null_policy::EXCLUDE; - - auto row_bitmask = - skip_key_rows_with_nulls - ? cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first - : rmm::device_buffer{}; - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - hash::compute_single_pass_aggs_fn{set, - *d_values, - *d_sparse_table, - d_aggs.data(), - static_cast(row_bitmask.data()), - skip_key_rows_with_nulls}); - // Add results back to sparse_results cache - auto sparse_result_cols = sparse_table.release(); - for (size_t i = 0; i < aggs.size(); i++) { - // Note that the cache will make a copy of this temporary aggregation - sparse_results->add_result( - flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); - } -} - -/** - * @brief Computes and returns a device vector containing all populated keys in - * `map`. - */ -template -rmm::device_uvector extract_populated_keys(SetType const& key_set, - size_type num_keys, - rmm::cuda_stream_view stream) -{ - rmm::device_uvector populated_keys(num_keys, stream); - auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); - - populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); - return populated_keys; -} - -/** - * @brief Computes groupby using hash table. - * - * First, we create a hash table that stores the indices of unique rows in - * `keys`. The upper limit on the number of values in this map is the number - * of rows in `keys`. - * - * To store the results of aggregations, we create temporary sparse columns - * which have the same size as input value columns. Using the hash map, we - * determine the location within the sparse column to write the result of the - * aggregation into. - * - * The sparse column results of all aggregations are stored into the cache - * `sparse_results`. This enables the use of previously calculated results in - * other aggregations. 
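The gather map described here is what turns the sparse layout into the final dense one: retrieving all populated slots from the hash set yields one index per distinct key, and gathering every sparse aggregation column through those indices compacts it. A host-side toy version of that compaction, with std::vector standing in for device columns:

#include <cstddef>
#include <vector>

int main()
{
  // Sparse SUM results: slot i holds the aggregate for the key first seen at
  // row i; the remaining slots keep their identity value and are never read.
  std::vector<int> sparse_sums = {10, 0, 7, 0, 0, 42};

  // What retrieve_all() on the hash set would produce: the populated slots.
  std::vector<std::size_t> gather_map = {0, 2, 5};

  // Dense results: one entry per distinct key, in gather-map order.
  std::vector<int> dense_sums;
  for (auto idx : gather_map) { dense_sums.push_back(sparse_sums[idx]); }
  // dense_sums == {10, 7, 42}
  return 0;
}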
- * - * All the aggregations which can be computed in a single pass are computed - * first, in a combined kernel. Then, using these results, aggregations that - * require multiple passes are computed. - * - * Finally, using the hash map, we generate a vector of indices of populated - * values in sparse result columns. Then, for each aggregation originally - * requested in `requests`, we gather sparse results into a column of dense - * results using the aforementioned index vector. Dense results are stored into - * the in/out parameter `cache`. - */ -std::unique_ptr<table>
groupby(table_view const& keys, - host_span requests, - cudf::detail::result_cache* cache, - bool const keys_have_nulls, - null_policy const include_null_keys, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - // convert to int64_t to avoid potential overflow with large `keys` - auto const num_keys = static_cast(keys.num_rows()); - auto const null_keys_are_equal = null_equality::EQUAL; - auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys)}; + auto const null_keys_are_equal = null_equality::EQUAL; + auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys)}; + auto const skip_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; auto preprocessed_keys = cudf::experimental::row::hash::preprocessed_table::create(keys, stream); auto const comparator = cudf::experimental::row::equality::self_comparator{preprocessed_keys}; auto const row_hash = cudf::experimental::row::hash::row_hasher{std::move(preprocessed_keys)}; auto const d_row_hash = row_hash.device_hasher(has_null); - // Cache of sparse results where the location of aggregate value in each - // column is indexed by the hash set - cudf::detail::result_cache sparse_results(requests.size()); - - auto const comparator_helper = [&](auto const d_key_equal) { - auto const set = cuco::static_set{ - num_keys, - 0.5, // desired load factor - cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - d_key_equal, - probing_scheme_type{d_row_hash}, - cuco::thread_scope_device, - cuco::storage<1>{}, - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - stream.value()}; - - // Compute all single pass aggs first - compute_single_pass_aggs(keys, - requests, - &sparse_results, - set.ref(cuco::insert_and_find), - keys_have_nulls, - include_null_keys, - stream); - - // Extract the populated indices from the hash set and create a gather map. - // Gathering using this map from sparse results will give dense results. - auto gather_map = extract_populated_keys(set, keys.num_rows(), stream); - - // Compact all results from sparse_results and insert into cache - sparse_to_dense_results(keys, - requests, - &sparse_results, - cache, - gather_map, - set.ref(cuco::find), - keys_have_nulls, - include_null_keys, - stream, - mr); - - return cudf::detail::gather(keys, - gather_map, - out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - }; - if (cudf::detail::has_nested_columns(keys)) { - auto const d_key_equal = comparator.equal_to(has_null, null_keys_are_equal); - return comparator_helper(d_key_equal); + auto const d_row_equal = comparator.equal_to(has_null, null_keys_are_equal); + return compute_groupby( + keys, requests, skip_rows_with_nulls, d_row_equal, d_row_hash, cache, stream, mr); } else { - auto const d_key_equal = comparator.equal_to(has_null, null_keys_are_equal); - return comparator_helper(d_key_equal); + auto const d_row_equal = comparator.equal_to(has_null, null_keys_are_equal); + return compute_groupby( + keys, requests, skip_rows_with_nulls, d_row_equal, d_row_hash, cache, stream, mr); } } - } // namespace /** @@ -661,11 +151,8 @@ std::pair, std::vector> groupby( cudf::detail::result_cache cache(requests.size()); std::unique_ptr
unique_keys = - groupby(keys, requests, &cache, cudf::has_nulls(keys), include_null_keys, stream, mr); + dispatch_groupby(keys, requests, &cache, cudf::has_nulls(keys), include_null_keys, stream, mr); return std::pair(std::move(unique_keys), extract_results(requests, cache, stream, mr)); } -} // namespace hash -} // namespace detail -} // namespace groupby -} // namespace cudf +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu new file mode 100644 index 00000000000..37a61c1a22c --- /dev/null +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "hash_compound_agg_finalizer.hpp" +#include "helpers.cuh" +#include "var_hash_functor.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +template +hash_compound_agg_finalizer::hash_compound_agg_finalizer( + column_view col, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetType set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + : col(col), + sparse_results(sparse_results), + dense_results(dense_results), + gather_map(gather_map), + set(set), + row_bitmask(row_bitmask), + stream(stream), + mr(mr) +{ + result_type = + cudf::is_dictionary(col.type()) ? cudf::dictionary_column_view(col).keys().type() : col.type(); +} + +template +auto hash_compound_agg_finalizer::to_dense_agg_result(cudf::aggregation const& agg) +{ + auto s = sparse_results->get_result(col, agg); + auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), + gather_map, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(dense_result_table->release()[0]); +} + +template +auto hash_compound_agg_finalizer::gather_argminmax(aggregation const& agg) +{ + auto arg_result = to_dense_agg_result(agg); + // We make a view of ARG(MIN/MAX) result without a null mask and gather + // using this map. The values in data buffer of ARG(MIN/MAX) result + // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL + // which is an out of bounds index value (-1) and causes the gathered + // value to be null. + column_view null_removed_map( + data_type(type_to_id()), + arg_result->size(), + static_cast(arg_result->view().template data()), + nullptr, + 0); + auto gather_argminmax = + cudf::detail::gather(table_view({col}), + null_removed_map, + arg_result->nullable() ? 
cudf::out_of_bounds_policy::NULLIFY + : cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(gather_argminmax->release()[0]); +} + +template +void hash_compound_agg_finalizer::visit(cudf::aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + dense_results->add_result(col, agg, to_dense_agg_result(agg)); +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::min_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmin_aggregation(); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); + } else { + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::max_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmax_aggregation(); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); + } else { + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::mean_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + + auto sum_agg = make_sum_aggregation(); + auto count_agg = make_count_aggregation(); + this->visit(*sum_agg); + this->visit(*count_agg); + column_view sum_result = dense_results->get_result(col, *sum_agg); + column_view count_result = dense_results->get_result(col, *count_agg); + + auto result = + cudf::detail::binary_operation(sum_result, + count_result, + binary_operator::DIV, + cudf::detail::target_type(result_type, aggregation::MEAN), + stream, + mr); + dense_results->add_result(col, agg, std::move(result)); +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::var_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + + auto sum_agg = make_sum_aggregation(); + auto count_agg = make_count_aggregation(); + this->visit(*sum_agg); + this->visit(*count_agg); + column_view sum_result = sparse_results->get_result(col, *sum_agg); + column_view count_result = sparse_results->get_result(col, *count_agg); + + auto values_view = column_device_view::create(col, stream); + auto sum_view = column_device_view::create(sum_result, stream); + auto count_view = column_device_view::create(count_result, stream); + + auto var_result = make_fixed_width_column( + cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); + auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); + mutable_table_view var_table_view{{var_result->mutable_view()}}; + cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); + + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + col.size(), + var_hash_functor{ + set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); + sparse_results->add_result(col, agg, std::move(var_result)); + dense_results->add_result(col, agg, to_dense_agg_result(agg)); +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::std_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + auto var_agg = make_variance_aggregation(agg._ddof); + this->visit(*dynamic_cast(var_agg.get())); + column_view variance = 
dense_results->get_result(col, *var_agg); + + auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, stream, mr); + dense_results->add_result(col, agg, std::move(result)); +} + +template class hash_compound_agg_finalizer>; +template class hash_compound_agg_finalizer>; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp new file mode 100644 index 00000000000..8bee1a92c40 --- /dev/null +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { + column_view col; + data_type result_type; + cudf::detail::result_cache* sparse_results; + cudf::detail::result_cache* dense_results; + device_span gather_map; + SetType set; + bitmask_type const* __restrict__ row_bitmask; + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + public: + using cudf::detail::aggregation_finalizer::visit; + + hash_compound_agg_finalizer(column_view col, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetType set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + + auto to_dense_agg_result(cudf::aggregation const& agg); + + // Enables conversion of ARGMIN/ARGMAX into MIN/MAX + auto gather_argminmax(cudf::aggregation const& agg); + + // Declare overloads for each kind of aggregation to dispatch + void visit(cudf::aggregation const& agg) override; + + void visit(cudf::detail::min_aggregation const& agg) override; + + void visit(cudf::detail::max_aggregation const& agg) override; + + void visit(cudf::detail::mean_aggregation const& agg) override; + + void visit(cudf::detail::var_aggregation const& agg) override; + + void visit(cudf::detail::std_aggregation const& agg) override; +}; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh new file mode 100644 index 00000000000..0d117ca35b3 --- /dev/null +++ b/cpp/src/groupby/hash/helpers.cuh @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
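The finalizer above never computes MEAN, VAR, or STD directly: MEAN is derived from the single-pass SUM and COUNT_VALID results, VAR adds one more pass over the values, and STD is a square root applied to VAR at finalize time. A scalar illustration of that decomposition for a single group (plain host code, not the cudf kernels):

#include <cassert>
#include <cmath>

int main()
{
  double const values[] = {1.0, 2.0, 3.0, 6.0};

  // Single-pass aggregations.
  double sum = 0.0;
  int count = 0;
  for (double v : values) { sum += v; ++count; }

  double const mean = sum / count;  // MEAN = SUM / COUNT_VALID
  int const ddof = 1;
  double var = 0.0;
  for (double v : values) { var += (v - mean) * (v - mean); }
  var /= (count - ddof);                  // VAR with delta degrees of freedom
  double const std_dev = std::sqrt(var);  // STD = sqrt(VAR), applied in finalize
  assert(mean == 3.0);
  assert(std_dev > 0.0);
  return 0;
}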
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested +// types and `cg_size = 1` for flat data to improve performance +/// Number of threads to handle each input element +CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1; + +/// Number of slots per thread +CUDF_HOST_DEVICE auto constexpr GROUPBY_WINDOW_SIZE = 1; + +/// Thread block size +CUDF_HOST_DEVICE auto constexpr GROUPBY_BLOCK_SIZE = 128; + +/// Threshold cardinality to switch between shared memory aggregations and global memory +/// aggregations +CUDF_HOST_DEVICE auto constexpr GROUPBY_CARDINALITY_THRESHOLD = 128; + +// We add an additional `block_size` because, after the number of elements in the local hash set +// exceeds the threshold, all threads in the thread block can still insert one more element. +/// The maximum number of elements handled per block +CUDF_HOST_DEVICE auto constexpr GROUPBY_SHM_MAX_ELEMENTS = + GROUPBY_CARDINALITY_THRESHOLD + GROUPBY_BLOCK_SIZE; + +// GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy +/// Shared memory hash set extent type +using shmem_extent_t = + cuco::extent(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43)>; + +/// Number of windows needed by each shared memory hash set +CUDF_HOST_DEVICE auto constexpr window_extent = + cuco::make_window_extent(shmem_extent_t{}); + +/** + * @brief Returns the smallest multiple of 8 that is greater than or equal to the given integer. + */ +CUDF_HOST_DEVICE constexpr std::size_t round_to_multiple_of_8(std::size_t num) +{ + std::size_t constexpr base = 8; + return cudf::util::div_rounding_up_safe(num, base) * base; +} + +using row_hash_t = + cudf::experimental::row::hash::device_row_hasher; + +/// Probing scheme type used by groupby hash table +using probing_scheme_t = cuco::linear_probing; + +using row_comparator_t = cudf::experimental::row::equality::device_row_comparator< + false, + cudf::nullate::DYNAMIC, + cudf::experimental::row::equality::nan_equal_physical_equality_comparator>; + +using nullable_row_comparator_t = cudf::experimental::row::equality::device_row_comparator< + true, + cudf::nullate::DYNAMIC, + cudf::experimental::row::equality::nan_equal_physical_equality_comparator>; + +using global_set_t = cuco::static_set, + cuda::thread_scope_device, + row_comparator_t, + probing_scheme_t, + cudf::detail::cuco_allocator, + cuco::storage>; + +using nullable_global_set_t = cuco::static_set, + cuda::thread_scope_device, + nullable_row_comparator_t, + probing_scheme_t, + cudf::detail::cuco_allocator, + cuco::storage>; + +template +using hash_set_ref_t = cuco::static_set_ref< + cudf::size_type, + cuda::thread_scope_device, + row_comparator_t, + probing_scheme_t, + cuco::aow_storage_ref>, + Op>; + +template +using nullable_hash_set_ref_t = cuco::static_set_ref< + cudf::size_type, + cuda::thread_scope_device, + nullable_row_comparator_t, + probing_scheme_t, + cuco::aow_storage_ref>, + Op>; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh new file mode 100644 index 00000000000..9cbeeb34b86 --- /dev/null +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
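Worked through with the constants just defined: a block aggregates in shared memory until it has seen GROUPBY_CARDINALITY_THRESHOLD (128) distinct keys, and because all GROUPBY_BLOCK_SIZE (128) threads may each insert one more element after the threshold trips, the per-block set must hold 256 entries; sizing that for a 0.7 load factor multiplies by roughly 1.43. A host-side check of the arithmetic (the real code additionally lets cuco::make_window_extent round the extent up to a size valid for the probing scheme):

#include <cstddef>

constexpr std::size_t cardinality_threshold = 128;  // GROUPBY_CARDINALITY_THRESHOLD
constexpr std::size_t block_size            = 128;  // GROUPBY_BLOCK_SIZE

// Every thread of the block may insert one more element after the threshold trips.
constexpr std::size_t shm_max_elements = cardinality_threshold + block_size;

// Target load factor of 0.7 => capacity ~= n / 0.7 ~= n * 1.43.
constexpr std::size_t shmem_extent =
  static_cast<std::size_t>(static_cast<double>(shm_max_elements) * 1.43);

static_assert(shm_max_elements == 256);
static_assert(shmem_extent == 366);  // before cuco rounds it to a valid extent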
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +struct update_target_element_shmem { + __device__ void operator()( + cuda::std::byte*, bool*, cudf::size_type, cudf::column_device_view, cudf::size_type) const + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MIN, + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_min(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MAX, + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_max(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::SUM, + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_timestamp()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::SUM_OF_SQUARES, + cuda::std::enable_if_t()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + auto value = 
static_cast(source.element(source_index)); + cudf::detail::atomic_add(&target_casted[target_index], value * value); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::PRODUCT, + cuda::std::enable_if_t()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_mul(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::COUNT_VALID, + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + // The nullability was checked prior to this call in the `shmem_element_aggregator` functor + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], Target{1}); + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::COUNT_ALL, + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], Target{1}); + + // Assumes target is already set to be valid + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::ARGMAX, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + auto old = cudf::detail::atomic_cas( + &target_casted[target_index], cudf::detail::ARGMAX_SENTINEL, source_index); + if (old != cudf::detail::ARGMAX_SENTINEL) { + while (source.element(source_index) > source.element(old)) { + old = cudf::detail::atomic_cas(&target_casted[target_index], old, source_index); + } + } + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::ARGMIN, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + auto old = cudf::detail::atomic_cas( + &target_casted[target_index], cudf::detail::ARGMIN_SENTINEL, source_index); + if (old != cudf::detail::ARGMIN_SENTINEL) { + while (source.element(source_index) < source.element(old)) { + old = 
cudf::detail::atomic_cas(&target_casted[target_index], old, source_index); + } + } + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +/** + * @brief A functor that updates a single element in the target column stored in shared memory by + * applying an aggregation operation to a corresponding element from a source column in global + * memory. + * + * This functor can NOT be used for dictionary columns. + * + * This is a redundant copy replicating the behavior of `elementwise_aggregator` from + * `cudf/detail/aggregation/device_aggregators.cuh`. The key difference is that this functor accepts + * a pointer to raw bytes as the source, as `column_device_view` cannot yet be constructed from + * shared memory. + */ +struct shmem_element_aggregator { + template + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + // Check nullability for all aggregation kinds but `COUNT_ALL` + if constexpr (k != cudf::aggregation::COUNT_ALL) { + if (source.is_null(source_index)) { return; } + } + update_target_element_shmem{}( + target, target_mask, target_index, source, source_index); + } +}; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh similarity index 95% rename from cpp/src/groupby/hash/groupby_kernels.cuh rename to cpp/src/groupby/hash/single_pass_functors.cuh index 188d0cff3f1..73791b3aa71 100644 --- a/cpp/src/groupby/hash/groupby_kernels.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -13,22 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once -#include "multi_pass_kernels.cuh" - #include #include #include #include -#include +#include -namespace cudf { -namespace groupby { -namespace detail { -namespace hash { +namespace cudf::groupby::detail::hash { /** * @brief Computes single-pass aggregations and store results into a sparse `output_values` table, * and populate `set` with indices of unique keys @@ -104,8 +98,4 @@ struct compute_single_pass_aggs_fn { } } }; - -} // namespace hash -} // namespace detail -} // namespace groupby -} // namespace cudf +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu new file mode 100644 index 00000000000..e1c2cd22309 --- /dev/null +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
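Unlike the global-memory path, the shared-memory updates above write into raw per-block buffers: each specialization reinterprets a cuda::std::byte region as its target type and tracks validity in a separate bool array, and `shmem_element_aggregator` skips null inputs up front for every kind except COUNT_ALL. A stripped-down sketch of one such update (stand-alone device code, not the cudf functor):

#include <cuda/std/cstddef>

__device__ void shmem_min_update(cuda::std::byte* target,
                                 bool* target_mask,
                                 int target_index,
                                 int source_value)
{
  // The per-block result "column" is just raw shared-memory bytes; cast it to
  // the target type before the atomic update.
  auto* target_typed = reinterpret_cast<int*>(target);
  atomicMin(&target_typed[target_index], source_value);

  // Validity lives in a separate boolean array; set it on first contribution.
  if (!target_mask[target_index]) { target_mask[target_index] = true; }
}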
+ */ + +#include "hash_compound_agg_finalizer.hpp" +#include "helpers.cuh" + +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +void sparse_to_dense_results(host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetRef set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + for (auto const& request : requests) { + auto const& agg_v = request.aggregations; + auto const& col = request.values; + + // Given an aggregation, this will get the result from sparse_results and + // convert and return dense, compacted result + auto finalizer = hash_compound_agg_finalizer( + col, sparse_results, dense_results, gather_map, set, row_bitmask, stream, mr); + for (auto&& agg : agg_v) { + agg->finalize(finalizer); + } + } +} + +template void sparse_to_dense_results>( + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + hash_set_ref_t set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +template void sparse_to_dense_results>( + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + nullable_hash_set_ref_t set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.hpp b/cpp/src/groupby/hash/sparse_to_dense_results.hpp new file mode 100644 index 00000000000..3a2b3090b99 --- /dev/null +++ b/cpp/src/groupby/hash/sparse_to_dense_results.hpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
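`sparse_to_dense_results` is declared in a header but defined in a .cu file, with explicit instantiations for exactly the two set-ref types the groupby uses; that keeps the heavy template out of every includer and bounds compile times. The idiom, reduced to a minimal header/source pair (names are illustrative):

// results.hpp - declaration only; includers never see the definition.
template <typename SetRef>
void sparse_to_dense(SetRef set);

// results.cu - the definition plus the only instantiations that will ever exist.
template <typename SetRef>
void sparse_to_dense(SetRef set)
{
  // ... gather/finalize work parameterized on the hash-set ref type ...
}

// Stand-ins for hash_set_ref_t<cuco::find_tag> and its nullable counterpart:
template void sparse_to_dense<int>(int);
template void sparse_to_dense<long>(long);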
+ */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +/** + * @brief Gather sparse aggregation results into dense using `gather_map` and add to + * `dense_results` + * + * @tparam SetRef Device hash set ref type + * + * @param[in] requests The set of columns to aggregate and the aggregations to perform + * @param[in] sparse_results Sparse aggregation results + * @param[out] dense_results Dense aggregation results + * @param[in] gather_map Gather map indicating valid elements in `sparse_results` + * @param[in] set Device hash set ref + * @param[in] row_bitmask Bitmask indicating the validity of input keys + * @param[in] stream CUDA stream used for device memory operations and kernel launches + * @param[in] mr Device memory resource used to allocate the returned table + */ +template +void sparse_to_dense_results(host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetRef set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/multi_pass_kernels.cuh b/cpp/src/groupby/hash/var_hash_functor.cuh similarity index 69% rename from cpp/src/groupby/hash/multi_pass_kernels.cuh rename to cpp/src/groupby/hash/var_hash_functor.cuh index 7043eafdc10..bb55cc9188c 100644 --- a/cpp/src/groupby/hash/multi_pass_kernels.cuh +++ b/cpp/src/groupby/hash/var_hash_functor.cuh @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once #include @@ -21,17 +20,14 @@ #include #include #include -#include #include +#include #include +#include -#include - -namespace cudf { -namespace detail { - -template +namespace cudf::groupby::detail::hash { +template struct var_hash_functor { SetType set; bitmask_type const* __restrict__ row_bitmask; @@ -47,13 +43,13 @@ struct var_hash_functor { column_device_view sum, column_device_view count, size_type ddof) - : set(set), - row_bitmask(row_bitmask), - target(target), - source(source), - sum(sum), - count(count), - ddof(ddof) + : set{set}, + row_bitmask{row_bitmask}, + target{target}, + source{source}, + sum{sum}, + count{count}, + ddof{ddof} { } @@ -64,23 +60,21 @@ struct var_hash_functor { } template - __device__ std::enable_if_t()> operator()(column_device_view const& source, - size_type source_index, - size_type target_index) noexcept + __device__ cuda::std::enable_if_t()> operator()( + column_device_view const& source, size_type source_index, size_type target_index) noexcept { CUDF_UNREACHABLE("Invalid source type for std, var aggregation combination."); } template - __device__ std::enable_if_t()> operator()(column_device_view const& source, - size_type source_index, - size_type target_index) noexcept + __device__ cuda::std::enable_if_t()> operator()( + column_device_view const& source, size_type source_index, size_type target_index) noexcept { - using Target = target_type_t; - using SumType = target_type_t; - using CountType = target_type_t; + using Target = cudf::detail::target_type_t; + using SumType = cudf::detail::target_type_t; + using CountType = cudf::detail::target_type_t; - if (source_has_nulls and source.is_null(source_index)) return; + if (source.is_null(source_index)) return; CountType group_size = count.element(target_index); if (group_size == 0 or group_size - ddof <= 0) return; @@ -91,8 +85,9 @@ 
struct var_hash_functor { ref.fetch_add(result, cuda::std::memory_order_relaxed); // STD sqrt is applied in finalize() - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } + __device__ inline void operator()(size_type source_index) { if (row_bitmask == nullptr or cudf::bit_is_set(row_bitmask, source_index)) { @@ -110,6 +105,4 @@ struct var_hash_functor { } } }; - -} // namespace detail -} // namespace cudf +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu index 82d557b9f7e..d6c900fb689 100644 --- a/cpp/src/groupby/sort/group_quantiles.cu +++ b/cpp/src/groupby/sort/group_quantiles.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -108,7 +109,7 @@ struct quantiles_functor { auto values_view = column_device_view::create(values, stream); auto group_size_view = column_device_view::create(group_sizes, stream); auto result_view = mutable_column_device_view::create(result->mutable_view(), stream); - auto null_count = rmm::device_scalar(0, stream, mr); + auto null_count = cudf::detail::device_scalar(0, stream, mr); // For each group, calculate quantile if (!cudf::is_dictionary(values.type())) { diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu index 86ee20dbbe2..c3dfac46502 100644 --- a/cpp/src/groupby/sort/group_std.cu +++ b/cpp/src/groupby/sort/group_std.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -27,7 +28,6 @@ #include #include -#include #include #include @@ -134,7 +134,7 @@ struct var_functor { // set nulls auto result_view = mutable_column_device_view::create(*result, stream); - auto null_count = rmm::device_scalar(0, stream, mr); + auto null_count = cudf::detail::device_scalar(0, stream, mr); auto d_null_count = null_count.data(); thrust::for_each_n( rmm::exec_policy(stream), diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu index a2874b46b06..fc1b0226a48 100644 --- a/cpp/src/interop/to_arrow_device.cu +++ b/cpp/src/interop/to_arrow_device.cu @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -35,7 +36,6 @@ #include #include -#include #include #include @@ -60,7 +60,7 @@ template struct is_device_scalar : public std::false_type {}; template -struct is_device_scalar> : public std::true_type {}; +struct is_device_scalar> : public std::true_type {}; template struct is_device_uvector : public std::false_type {}; @@ -232,10 +232,10 @@ int dispatch_to_arrow_device::operator()(cudf::column&& colum // in the offsets buffer. While some arrow implementations may accept a zero-sized // offsets buffer, best practices would be to allocate the buffer with the single value. 
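In the `var_hash_functor` above, each input row contributes (x - mean)^2 / (n - ddof) to its group's slot via an atomic fetch_add, so the per-group sum of contributions is exactly the usual variance estimator; only the final square root for STD is deferred to the finalizer. A scalar check of that decomposition:

#include <cassert>

int main()
{
  double const x[] = {2.0, 4.0, 6.0};
  int const n = 3, ddof = 1;
  double const mean = (2.0 + 4.0 + 6.0) / n;

  // One atomic fetch_add per row in the kernel; a plain sum here.
  double var = 0.0;
  for (double v : x) { var += (v - mean) * (v - mean) / (n - ddof); }

  assert(var == 4.0);  // sample variance of {2, 4, 6}
  return 0;
}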
if (nanoarrow_type == NANOARROW_TYPE_STRING) { - auto zero = std::make_unique>(0, stream, mr); + auto zero = std::make_unique>(0, stream, mr); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); } else { - auto zero = std::make_unique>(0, stream, mr); + auto zero = std::make_unique>(0, stream, mr); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); } @@ -466,10 +466,10 @@ int dispatch_to_arrow_device_view::operator()(ArrowArray* out if (column.size() == 0) { // https://github.com/rapidsai/cudf/pull/15047#discussion_r1546528552 if (nanoarrow_type == NANOARROW_TYPE_LARGE_STRING) { - auto zero = std::make_unique>(0, stream, mr); + auto zero = std::make_unique>(0, stream, mr); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); } else { - auto zero = std::make_unique>(0, stream, mr); + auto zero = std::make_unique>(0, stream, mr); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); } diff --git a/cpp/src/interop/to_arrow_host.cu b/cpp/src/interop/to_arrow_host.cu index 79fb7550044..8ec0904f1ba 100644 --- a/cpp/src/interop/to_arrow_host.cu +++ b/cpp/src/interop/to_arrow_host.cu @@ -44,6 +44,7 @@ #include #include #include +#include #include @@ -52,6 +53,30 @@ namespace detail { namespace { +/* + Enable Transparent Huge Pages (THP) for large (>4MB) allocations. + `buf` is returned untouched. + Enabling THP can improve performance of device-host memory transfers + significantly, see . +*/ +void enable_hugepage(ArrowBuffer* buffer) +{ + if (buffer->size_bytes < (1u << 22u)) { // Smaller than 4 MB + return; + } + +#ifdef MADV_HUGEPAGE + auto const pagesize = sysconf(_SC_PAGESIZE); + void* addr = const_cast(buffer->data); + auto length{static_cast(buffer->size_bytes)}; + if (std::align(pagesize, pagesize, addr, length)) { + // Intentionally not checking for errors that may be returned by older kernel versions; + // optimistically tries enabling huge pages. + madvise(addr, length, MADV_HUGEPAGE); + } +#endif +} + struct dispatch_to_arrow_host { cudf::column_view column; rmm::cuda_stream_view stream; @@ -62,6 +87,7 @@ struct dispatch_to_arrow_host { if (!column.has_nulls()) { return NANOARROW_OK; } NANOARROW_RETURN_NOT_OK(ArrowBitmapResize(bitmap, static_cast(column.size()), 0)); + enable_hugepage(&bitmap->buffer); CUDF_CUDA_TRY(cudaMemcpyAsync(bitmap->buffer.data, (column.offset() > 0) ? 
cudf::detail::copy_bitmask(column, stream, mr).data() @@ -76,6 +102,7 @@ struct dispatch_to_arrow_host { int populate_data_buffer(device_span input, ArrowBuffer* buffer) const { NANOARROW_RETURN_NOT_OK(ArrowBufferResize(buffer, input.size_bytes(), 1)); + enable_hugepage(buffer); CUDF_CUDA_TRY(cudaMemcpyAsync( buffer->data, input.data(), input.size_bytes(), cudaMemcpyDefault, stream.value())); return NANOARROW_OK; diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp index 03cf6d4a0e0..d5caa4720ac 100644 --- a/cpp/src/io/avro/avro.cpp +++ b/cpp/src/io/avro/avro.cpp @@ -16,6 +16,7 @@ #include "avro.hpp" +#include #include #include @@ -302,7 +303,7 @@ bool schema_parser::parse(std::vector& schema, std::string const& // Empty schema if (json_str == "[]") return true; - char depthbuf[MAX_SCHEMA_DEPTH]; + std::array depthbuf; int depth = 0, parent_idx = -1, entry_idx = -1; json_state_e state = state_attrname; std::string str; diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 1af45b41d8e..d4d6f46b99a 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -538,8 +538,10 @@ size_t decompress_zstd(host_span src, CUDF_EXPECTS(hd_stats[0].status == compression_status::SUCCESS, "ZSTD decompression failed"); // Copy temporary output to `dst` - CUDF_CUDA_TRY(cudaMemcpyAsync( - dst.data(), d_dst.data(), hd_stats[0].bytes_written, cudaMemcpyDefault, stream.value())); + cudf::detail::cuda_memcpy_async( + dst.subspan(0, hd_stats[0].bytes_written), + device_span{d_dst.data(), hd_stats[0].bytes_written}, + stream); return hd_stats[0].bytes_written; } diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 8c32fc85f78..72fca75c56b 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -21,6 +21,7 @@ #include "csv_common.hpp" #include "csv_gpu.hpp" +#include "cudf/detail/utilities/cuda_memcpy.hpp" #include "io/comp/io_uncomp.hpp" #include "io/utilities/column_buffer.hpp" #include "io/utilities/hostdevice_vector.hpp" @@ -275,11 +276,10 @@ std::pair, selected_rows_offsets> load_data_and_gather auto const read_offset = byte_range_offset + input_pos + previous_data_size; auto const read_size = target_pos - input_pos - previous_data_size; if (data.has_value()) { - CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.data() + previous_data_size, - data->data() + read_offset, - target_pos - input_pos - previous_data_size, - cudaMemcpyDefault, - stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{d_data.data() + previous_data_size, read_size}, + data->subspan(read_offset, read_size), + stream); } else { if (source->is_device_read_preferred(read_size)) { source->device_read(read_offset, @@ -288,12 +288,11 @@ std::pair, selected_rows_offsets> load_data_and_gather stream); } else { auto const buffer = source->host_read(read_offset, read_size); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.data() + previous_data_size, - buffer->data(), - buffer->size(), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); // To prevent buffer going out of scope before we copy the data. + // Use sync version to prevent buffer going out of scope before we copy the data. 
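The pattern throughout these hunks swaps raw cudaMemcpyAsync calls (pointer, byte count, direction, stream, plus a manual synchronize) for the typed-span helpers, which derive size and direction from the spans and make the sync/async choice explicit at the call site. A hypothetical usage sketch, assuming the signatures match the calls shown in this diff:

#include <cudf/detail/utilities/cuda_memcpy.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <vector>

void copy_example(rmm::cuda_stream_view stream)
{
  std::vector<int> host_src(1024, 7);
  rmm::device_uvector<int> device_dst(host_src.size(), stream);

  // The synchronous variant blocks until the copy completes, so a short-lived
  // host buffer such as host_src is safe to copy from; cuda_memcpy_async would
  // require host_src to outlive the in-flight stream work.
  cudf::detail::cuda_memcpy(cudf::device_span<int>{device_dst},
                            cudf::host_span<int const>{host_src},
                            stream);
}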
+ cudf::detail::cuda_memcpy( + device_span{d_data.data() + previous_data_size, read_size}, + host_span{reinterpret_cast(buffer->data()), buffer->size()}, + stream); } } @@ -311,12 +310,10 @@ std::pair, selected_rows_offsets> load_data_and_gather range_end, skip_rows, stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), - row_ctx.device_ptr(), - num_blocks * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + + cudf::detail::cuda_memcpy(host_span{row_ctx}.subspan(0, num_blocks), + device_span{row_ctx}.subspan(0, num_blocks), + stream); // Sum up the rows in each character block, selecting the row count that // corresponds to the current input context. Also stores the now known input @@ -331,11 +328,9 @@ std::pair, selected_rows_offsets> load_data_and_gather // At least one row in range in this batch all_row_offsets.resize(total_rows - skip_rows, stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.device_ptr(), - row_ctx.host_ptr(), - num_blocks * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); + cudf::detail::cuda_memcpy_async(device_span{row_ctx}.subspan(0, num_blocks), + host_span{row_ctx}.subspan(0, num_blocks), + stream); // Pass 2: Output row offsets cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(), @@ -352,12 +347,9 @@ std::pair, selected_rows_offsets> load_data_and_gather stream); // With byte range, we want to keep only one row out of the specified range if (range_end < data_size) { - CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), - row_ctx.device_ptr(), - num_blocks * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + cudf::detail::cuda_memcpy(host_span{row_ctx}.subspan(0, num_blocks), + device_span{row_ctx}.subspan(0, num_blocks), + stream); size_t rows_out_of_range = 0; for (uint32_t i = 0; i < num_blocks; i++) { @@ -401,12 +393,9 @@ std::pair, selected_rows_offsets> load_data_and_gather // Remove header rows and extract header auto const header_row_index = std::max(header_rows, 1) - 1; if (header_row_index + 1 < row_offsets.size()) { - CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), - row_offsets.data() + header_row_index, - 2 * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + cudf::detail::cuda_memcpy(host_span{row_ctx}.subspan(0, 2), + device_span{row_offsets.data() + header_row_index, 2}, + stream); auto const header_start = input_pos + row_ctx[0]; auto const header_end = input_pos + row_ctx[1]; diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index b84446b5f3e..2bbe05ced84 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -405,13 +406,8 @@ void write_chunked(data_sink* out_sink, out_sink->device_write(ptr_all_bytes, total_num_bytes, stream); } else { // copy the bytes to host to write them out - thrust::host_vector h_bytes(total_num_bytes); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_bytes.data(), - ptr_all_bytes, - total_num_bytes * sizeof(char), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + auto const h_bytes = cudf::detail::make_host_vector_sync( + device_span{ptr_all_bytes, total_num_bytes}, stream); out_sink->host_write(h_bytes.data(), total_num_bytes); } diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index de8eea9e99b..a8682e6a760 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -122,14 +122,14 @@ chunked_parquet_writer_options_builder 
chunked_parquet_writer_options::builder( namespace { std::vector> make_datasources(source_info const& info, - size_t range_offset = 0, - size_t range_size = 0) + size_t offset = 0, + size_t max_size_estimate = 0) { switch (info.type()) { case io_type::FILEPATH: { auto sources = std::vector>(); for (auto const& filepath : info.filepaths()) { - sources.emplace_back(cudf::io::datasource::create(filepath, range_offset, range_size)); + sources.emplace_back(cudf::io::datasource::create(filepath, offset, max_size_estimate)); } return sources; } diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index 5855f1b5a5f..d06338c6f69 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -134,12 +134,13 @@ std::vector copy_strings_to_host_sync( // build std::string vector from chars and offsets std::vector host_data; host_data.reserve(col.size()); - std::transform( - std::begin(h_offsets), - std::end(h_offsets) - 1, - std::begin(h_offsets) + 1, - std::back_inserter(host_data), - [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); }); + std::transform(std::begin(h_offsets), + std::end(h_offsets) - 1, + std::begin(h_offsets) + 1, + std::back_inserter(host_data), + [&h_chars](auto start, auto end) { + return std::string(h_chars.data() + start, end - start); + }); return host_data; }; return to_host(d_column_names->view()); @@ -170,643 +171,78 @@ rmm::device_uvector is_all_nulls_each_column(device_span rmm::device_uvector is_all_nulls(num_cols, stream); thrust::fill(rmm::exec_policy_nosync(stream), is_all_nulls.begin(), is_all_nulls.end(), true); - auto parse_opt = parsing_options(options, stream); - thrust::for_each_n( - rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - num_nodes, - [options = parse_opt.view(), - data = input.data(), - column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { - auto const node_category = column_categories[col_ids[i]]; - if (node_category == NC_STR or node_category == NC_VAL) { - auto const is_null_literal = serialized_trie_contains( - options.trie_na, - {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); - if (!is_null_literal) is_all_nulls[col_ids[i]] = false; - } - }); - return is_all_nulls; -} - -NodeIndexT get_row_array_parent_col_id(device_span col_ids, - bool is_enabled_lines, - rmm::cuda_stream_view stream) -{ - NodeIndexT value = parent_node_sentinel; - if (!col_ids.empty()) { - auto const list_node_index = is_enabled_lines ? 
0 : 1; - CUDF_CUDA_TRY(cudaMemcpyAsync(&value, - col_ids.data() + list_node_index, - sizeof(NodeIndexT), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - } - return value; -} -/** - * @brief Holds member data pointers of `d_json_column` - * - */ -struct json_column_data { - using row_offset_t = json_column::row_offset_t; - row_offset_t* string_offsets; - row_offset_t* string_lengths; - row_offset_t* child_offsets; - bitmask_type* validity; -}; - -using hashmap_of_device_columns = - std::unordered_map>; - -std::pair, hashmap_of_device_columns> build_tree( - device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); -void scatter_offsets(tree_meta_t const& tree, - device_span col_ids, - device_span row_offsets, - device_span node_ids, - device_span sorted_col_ids, // Reuse this for parent_col_ids - tree_meta_t const& d_column_tree, - host_span ignore_vals, - hashmap_of_device_columns const& columns, - rmm::cuda_stream_view stream); - -/** - * @brief Constructs `d_json_column` from node tree representation - * Newly constructed columns are inserted into `root`'s children. - * `root` must be a list type. - * - * @param input Input JSON string device data - * @param tree Node tree representation of the JSON string - * @param col_ids Column ids of the nodes in the tree - * @param row_offsets Row offsets of the nodes in the tree - * @param root Root node of the `d_json_column` tree - * @param is_array_of_arrays Whether the tree is an array of arrays - * @param options Parsing options specifying the parsing behaviour - * options affecting behaviour are - * is_enabled_lines: Whether the input is a line-delimited JSON - * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the device memory - * of child_offets and validity members of `d_json_column` - */ -void make_device_json_column(device_span input, - tree_meta_t const& tree, - device_span col_ids, - device_span row_offsets, - device_json_column& root, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - bool const is_enabled_lines = options.is_enabled_lines(); - bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); - // make a copy - auto sorted_col_ids = cudf::detail::make_device_uvector_async( - col_ids, stream, cudf::get_current_device_resource_ref()); - - // sort by {col_id} on {node_ids} stable - rmm::device_uvector node_ids(col_ids.size(), stream); - thrust::sequence(rmm::exec_policy_nosync(stream), node_ids.begin(), node_ids.end()); - thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), - sorted_col_ids.begin(), - sorted_col_ids.end(), - node_ids.begin()); - - NodeIndexT const row_array_parent_col_id = - get_row_array_parent_col_id(col_ids, is_enabled_lines, stream); - - // 1. gather column information. 
- auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = - reduce_to_column_tree(tree, - col_ids, - sorted_col_ids, - node_ids, - row_offsets, - is_array_of_arrays, - row_array_parent_col_id, - stream); - auto num_columns = d_unique_col_ids.size(); - std::vector column_names = copy_strings_to_host_sync( - input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); - // array of arrays column names - if (is_array_of_arrays) { - auto const unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); - auto const column_parent_ids = - cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); - TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2; - auto values_column_indices = - get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); - auto h_values_column_indices = - cudf::detail::make_host_vector_sync(values_column_indices, stream); - std::transform(unique_col_ids.begin(), - unique_col_ids.end(), - column_names.cbegin(), - column_names.begin(), - [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( - auto col_id, auto name) mutable { - return column_parent_ids[col_id] == row_array_parent_col_id - ? std::to_string(h_values_column_indices[col_id]) - : name; - }); - } - - auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() { - if (is_enabled_mixed_types_as_string) { - return cudf::detail::make_std_vector_sync( - is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream); - } - return std::vector(); - }(); - auto const [ignore_vals, columns] = build_tree(root, - is_str_column_all_nulls, - d_column_tree, - d_unique_col_ids, - d_max_row_offsets, - column_names, - row_array_parent_col_id, - is_array_of_arrays, - options, - stream, - mr); - - scatter_offsets(tree, - col_ids, - row_offsets, - node_ids, - sorted_col_ids, - d_column_tree, - ignore_vals, - columns, - stream); -} - -std::pair, hashmap_of_device_columns> build_tree( - device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); - auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); - auto column_categories = - cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream); - auto const column_parent_ids = - cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); - auto column_range_beg = - cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream); - auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream); - auto num_columns = d_unique_col_ids.size(); - stream.synchronize(); - - auto to_json_col_type = [](auto category) { - switch (category) { - case NC_STRUCT: return json_col_t::StructColumn; - case NC_LIST: return json_col_t::ListColumn; - case NC_STR: [[fallthrough]]; - case NC_VAL: return json_col_t::StringColumn; - default: return json_col_t::Unknown; - } - }; - auto init_to_zero = [stream](auto& v) { - thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); - }; - - auto initialize_json_columns = [&](auto i, 
auto& col, auto column_category) { - if (column_category == NC_ERR || column_category == NC_FN) { - return; - } else if (column_category == NC_VAL || column_category == NC_STR) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - } else if (column_category == NC_LIST) { - col.child_offsets.resize(max_row_offsets[i] + 2, stream); - init_to_zero(col.child_offsets); - } - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = to_json_col_type(column_category); - }; - - auto reinitialize_as_string = [&](auto i, auto& col) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = json_col_t::StringColumn; - // destroy references of all child columns after this step, by calling remove_child_columns - }; - - path_from_tree tree_path{column_categories, - column_parent_ids, - column_names, - is_array_of_arrays, - row_array_parent_col_id}; - - // 2. generate nested columns tree and its device_memory - // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. - auto h_range_col_id_it = - thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin()); - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<0>(a) < thrust::get<0>(b); - }); - - // use hash map because we may skip field name's col_ids - hashmap_of_device_columns columns; - // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking - std::map, NodeIndexT> mapped_columns; - // find column_ids which are values, but should be ignored in validity - auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); - std::fill(ignore_vals.begin(), ignore_vals.end(), false); - std::vector is_mixed_type_column(num_columns, 0); - std::vector is_pruned(num_columns, 0); - // for columns that are not mixed type but have been forced as string - std::vector forced_as_string_column(num_columns); - columns.try_emplace(parent_node_sentinel, std::ref(root)); - - std::function remove_child_columns = - [&](NodeIndexT this_col_id, device_json_column& col) { - for (auto const& col_name : col.column_order) { - auto child_id = mapped_columns[{this_col_id, col_name}]; - is_mixed_type_column[child_id] = 1; - remove_child_columns(child_id, col.child_columns.at(col_name)); - mapped_columns.erase({this_col_id, col_name}); - columns.erase(child_id); - } - col.child_columns.clear(); // their references are deleted above. 
- col.column_order.clear(); - }; - - auto name_and_parent_index = [&is_array_of_arrays, - &row_array_parent_col_id, - &column_parent_ids, - &column_categories, - &column_names](auto this_col_id) { - std::string name = ""; - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { - if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { - name = column_names[this_col_id]; - } else { - name = list_child_name; - } - } else if (column_categories[parent_col_id] == NC_FN) { - auto field_name_col_id = parent_col_id; - parent_col_id = column_parent_ids[parent_col_id]; - name = column_names[field_name_col_id]; - } else { - CUDF_FAIL("Unexpected parent column category"); - } - return std::pair{name, parent_col_id}; - }; - - // Prune columns that are not required to be parsed. - if (options.is_enabled_prune_columns()) { - for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - // get path of this column, and get its dtype if present in options - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) { - is_pruned[this_col_id] = 1; - continue; - } else { - // make sure all its parents are not pruned. - while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) { - is_pruned[parent_col_id] = 0; - parent_col_id = column_parent_ids[parent_col_id]; - } - } - } - } - - // Build the column tree, also, handles mixed types. - for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - - // if parent is mixed type column or this column is pruned or if parent - // has been forced as string, ignore this column. - if (parent_col_id != parent_node_sentinel && - (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) || - forced_as_string_column[parent_col_id]) { - ignore_vals[this_col_id] = true; - if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } - if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; } - continue; - } - - // If the child is already found, - // replace if this column is a nested column and the existing was a value column - // ignore this column if this column is a value column and the existing was a nested column - auto it = columns.find(parent_col_id); - CUDF_EXPECTS(it != columns.end(), "Parent column not found"); - auto& parent_col = it->second.get(); - bool replaced = false; - if (mapped_columns.count({parent_col_id, name}) > 0) { - auto const old_col_id = mapped_columns[{parent_col_id, name}]; - // If mixed type as string is enabled, make both of them strings and merge them. - // All child columns will be ignored when parsing. - if (is_enabled_mixed_types_as_string) { - bool const is_mixed_type = [&]() { - // If new or old is STR and they are all not null, make it mixed type, else ignore. 
- if (column_categories[this_col_id] == NC_VAL || - column_categories[this_col_id] == NC_STR) { - if (is_str_column_all_nulls[this_col_id]) return false; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - if (is_str_column_all_nulls[old_col_id]) return false; - } - return true; - }(); - if (is_mixed_type) { - is_mixed_type_column[this_col_id] = 1; - is_mixed_type_column[old_col_id] = 1; - // if old col type (not cat) is list or struct, replace with string. - auto& col = columns.at(old_col_id).get(); - if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { - reinitialize_as_string(old_col_id, col); - remove_child_columns(old_col_id, col); - // all its children (which are already inserted) are ignored later. - } - col.forced_as_string_column = true; - columns.try_emplace(this_col_id, columns.at(old_col_id)); - continue; - } - } - - if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { - ignore_vals[this_col_id] = true; - continue; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - // remap - ignore_vals[old_col_id] = true; - mapped_columns.erase({parent_col_id, name}); - columns.erase(old_col_id); - parent_col.child_columns.erase(name); - replaced = true; // to skip duplicate name in column_order - } else { - // If this is a nested column but we're trying to insert either (a) a list node into a - // struct column or (b) a struct node into a list column, we fail - CUDF_EXPECTS(not((column_categories[old_col_id] == NC_LIST and - column_categories[this_col_id] == NC_STRUCT) or - (column_categories[old_col_id] == NC_STRUCT and - column_categories[this_col_id] == NC_LIST)), - "A mix of lists and structs within the same column is not supported"); - } - } - - auto this_column_category = column_categories[this_col_id]; - // get path of this column, check if it is a struct/list forced as string, and enforce it - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if ((column_categories[this_col_id] == NC_STRUCT or - column_categories[this_col_id] == NC_LIST) and - user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { - this_column_category = NC_STR; - } - - CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); - // move into parent - device_json_column col(stream, mr); - initialize_json_columns(this_col_id, col, this_column_category); - if ((column_categories[this_col_id] == NC_STRUCT or - column_categories[this_col_id] == NC_LIST) and - user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { - col.forced_as_string_column = true; - forced_as_string_column[this_col_id] = true; - } - - auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; - CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); - if (not replaced) parent_col.column_order.push_back(name); - columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); - mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); - } - - if (is_enabled_mixed_types_as_string) { - // ignore all children of mixed type columns - for (auto const this_col_id : unique_col_ids) { - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { - is_mixed_type_column[this_col_id] = 1; - 
ignore_vals[this_col_id] = true; - columns.erase(this_col_id); - } - // Convert only mixed type columns as string (so to copy), but not its children - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and - is_mixed_type_column[this_col_id] == 1) - column_categories[this_col_id] = NC_STR; - } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); - } - - // ignore all children of columns forced as string - for (auto const this_col_id : unique_col_ids) { - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) { - forced_as_string_column[this_col_id] = true; - ignore_vals[this_col_id] = true; - } - // Convert only mixed type columns as string (so to copy), but not its children - if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and - forced_as_string_column[this_col_id]) - column_categories[this_col_id] = NC_STR; - } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); - - // restore unique_col_ids order - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<1>(a) < thrust::get<1>(b); - }); - return {ignore_vals, columns}; -} - -void scatter_offsets(tree_meta_t const& tree, - device_span col_ids, - device_span row_offsets, - device_span node_ids, - device_span sorted_col_ids, // Reuse this for parent_col_ids - tree_meta_t const& d_column_tree, - host_span ignore_vals, - hashmap_of_device_columns const& columns, - rmm::cuda_stream_view stream) -{ - auto const num_nodes = col_ids.size(); - auto const num_columns = d_column_tree.node_categories.size(); - // move columns data to device. - auto columns_data = cudf::detail::make_host_vector(num_columns, stream); - for (auto& [col_id, col_ref] : columns) { - if (col_id == parent_node_sentinel) continue; - auto& col = col_ref.get(); - columns_data[col_id] = json_column_data{col.string_offsets.data(), - col.string_lengths.data(), - col.child_offsets.data(), - static_cast(col.validity.data())}; - } - - auto d_ignore_vals = cudf::detail::make_device_uvector_async( - ignore_vals, stream, cudf::get_current_device_resource_ref()); - auto d_columns_data = cudf::detail::make_device_uvector_async( - columns_data, stream, cudf::get_current_device_resource_ref()); - - // 3. 
scatter string offsets to respective columns, set validity bits - thrust::for_each_n( - rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - num_nodes, - [column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin(), - row_offsets = row_offsets.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - d_ignore_vals = d_ignore_vals.begin(), - d_columns_data = d_columns_data.begin()] __device__(size_type i) { - if (d_ignore_vals[col_ids[i]]) return; - auto const node_category = column_categories[col_ids[i]]; - switch (node_category) { - case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; - case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; - case NC_STR: [[fallthrough]]; - case NC_VAL: - if (d_ignore_vals[col_ids[i]]) break; - set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); - d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i]; - d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i]; - break; - default: break; - } - }); - - // 4. scatter List offset - // copy_if only node's whose parent is list, (node_id, parent_col_id) - // stable_sort by parent_col_id of {node_id}. - // For all unique parent_node_id of (i==0, i-1!=i), write start offset. - // (i==last, i+1!=i), write end offset. - // unique_copy_by_key {parent_node_id} {row_offset} to - // col[parent_col_id].child_offsets[row_offset[parent_node_id]] - - auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids - auto parent_col_id = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - cuda::proclaim_return_type( - [col_ids = col_ids.begin(), - parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { - return parent_node_ids[node_id] == parent_node_sentinel ? 
parent_node_sentinel - : col_ids[parent_node_ids[node_id]]; - })); - auto const list_children_end = thrust::copy_if( - rmm::exec_policy_nosync(stream), - thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), - thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + - num_nodes, - thrust::make_counting_iterator(0), - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), - [d_ignore_vals = d_ignore_vals.begin(), - parent_node_ids = tree.parent_node_ids.begin(), - column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin()] __device__(size_type node_id) { - auto parent_node_id = parent_node_ids[node_id]; - return parent_node_id != parent_node_sentinel and - column_categories[col_ids[parent_node_id]] == NC_LIST and - (!d_ignore_vals[col_ids[parent_node_id]]); - }); - - auto const num_list_children = - list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); - thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), - parent_col_ids.begin(), - parent_col_ids.begin() + num_list_children, - node_ids.begin()); + auto parse_opt = parsing_options(options, stream); thrust::for_each_n( rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - num_list_children, - [node_ids = node_ids.begin(), - parent_node_ids = tree.parent_node_ids.begin(), - parent_col_ids = parent_col_ids.begin(), - row_offsets = row_offsets.begin(), - d_columns_data = d_columns_data.begin(), - num_list_children] __device__(size_type i) { - auto const node_id = node_ids[i]; - auto const parent_node_id = parent_node_ids[node_id]; - // scatter to list_offset - if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { - d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = - row_offsets[node_id]; - } - // last value of list child_offset is its size. - if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { - d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = - row_offsets[node_id] + 1; + thrust::counting_iterator(0), + num_nodes, + [options = parse_opt.view(), + data = input.data(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { + auto const node_category = column_categories[col_ids[i]]; + if (node_category == NC_STR or node_category == NC_VAL) { + auto const is_null_literal = serialized_trie_contains( + options.trie_na, + {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); + if (!is_null_literal) is_all_nulls[col_ids[i]] = false; } }); + return is_all_nulls; +} - // 5. scan on offsets. 
- for (auto& [id, col_ref] : columns) { - auto& col = col_ref.get(); - if (col.type == json_col_t::StringColumn) { - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - col.string_offsets.begin(), - col.string_offsets.end(), - col.string_offsets.begin(), - thrust::maximum{}); - } else if (col.type == json_col_t::ListColumn) { - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - col.child_offsets.begin(), - col.child_offsets.end(), - col.child_offsets.begin(), - thrust::maximum{}); - } - } - stream.synchronize(); +NodeIndexT get_row_array_parent_col_id(device_span col_ids, + bool is_enabled_lines, + rmm::cuda_stream_view stream) +{ + if (col_ids.empty()) { return parent_node_sentinel; } + + auto const list_node_index = is_enabled_lines ? 0 : 1; + auto const value = cudf::detail::make_host_vector_sync( + device_span{col_ids.data() + list_node_index, 1}, stream); + + return value[0]; } +/** + * @brief Holds member data pointers of `d_json_column` + * + */ +struct json_column_data { + using row_offset_t = json_column::row_offset_t; + row_offset_t* string_offsets; + row_offset_t* string_lengths; + row_offset_t* child_offsets; + bitmask_type* validity; +}; + +using hashmap_of_device_columns = + std::unordered_map>; + +std::pair, hashmap_of_device_columns> build_tree( + device_json_column& root, + host_span is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); -namespace experimental { +void scatter_offsets(tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t const& d_column_tree, + host_span ignore_vals, + hashmap_of_device_columns const& columns, + rmm::cuda_stream_view stream); std::map unified_schema(cudf::io::json_reader_options const& options) { @@ -836,19 +272,6 @@ std::map unified_schema(cudf::io::json_reader_optio options.get_dtypes()); } -std::pair, hashmap_of_device_columns> build_tree( - device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - /** * @brief Constructs `d_json_column` from node tree representation * Newly constructed columns are inserted into `root`'s children. @@ -1040,7 +463,7 @@ std::pair, hashmap_of_device_columns> build_tree std::fill_n(is_pruned.begin(), num_columns, options.is_enabled_prune_columns()); // prune all children of a column, but not self. - auto ignore_all_children = [&](auto parent_col_id) { + auto ignore_all_children = [&adj, &is_pruned](auto parent_col_id) { std::deque offspring; if (adj.count(parent_col_id)) { for (auto const& child : adj[parent_col_id]) { @@ -1391,14 +814,149 @@ std::pair, hashmap_of_device_columns> build_tree column_categories.cbegin(), expected_types.begin(), [](auto exp, auto cat) { return exp == NUM_NODE_CLASSES ? 
cat : exp; }); - cudaMemcpyAsync(d_column_tree.node_categories.begin(), - expected_types.data(), - expected_types.size() * sizeof(column_categories[0]), - cudaMemcpyDefault, - stream.value()); + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories, expected_types, stream); return {is_pruned, columns}; } -} // namespace experimental + +void scatter_offsets(tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t const& d_column_tree, + host_span ignore_vals, + hashmap_of_device_columns const& columns, + rmm::cuda_stream_view stream) +{ + auto const num_nodes = col_ids.size(); + auto const num_columns = d_column_tree.node_categories.size(); + // move columns data to device. + auto columns_data = cudf::detail::make_host_vector(num_columns, stream); + for (auto& [col_id, col_ref] : columns) { + if (col_id == parent_node_sentinel) continue; + auto& col = col_ref.get(); + columns_data[col_id] = json_column_data{col.string_offsets.data(), + col.string_lengths.data(), + col.child_offsets.data(), + static_cast(col.validity.data())}; + } + + auto d_ignore_vals = cudf::detail::make_device_uvector_async( + ignore_vals, stream, cudf::get_current_device_resource_ref()); + auto d_columns_data = cudf::detail::make_device_uvector_async( + columns_data, stream, cudf::get_current_device_resource_ref()); + + // 3. scatter string offsets to respective columns, set validity bits + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + num_nodes, + [column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + row_offsets = row_offsets.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + d_ignore_vals = d_ignore_vals.begin(), + d_columns_data = d_columns_data.begin()] __device__(size_type i) { + if (d_ignore_vals[col_ids[i]]) return; + auto const node_category = column_categories[col_ids[i]]; + switch (node_category) { + case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; + case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; + case NC_STR: [[fallthrough]]; + case NC_VAL: + if (d_ignore_vals[col_ids[i]]) break; + set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); + d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i]; + d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i]; + break; + default: break; + } + }); + + // 4. scatter List offset + // copy_if only node's whose parent is list, (node_id, parent_col_id) + // stable_sort by parent_col_id of {node_id}. + // For all unique parent_node_id of (i==0, i-1!=i), write start offset. + // (i==last, i+1!=i), write end offset. + // unique_copy_by_key {parent_node_id} {row_offset} to + // col[parent_col_id].child_offsets[row_offset[parent_node_id]] + + auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids + auto parent_col_id = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [col_ids = col_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { + return parent_node_ids[node_id] == parent_node_sentinel ? 
parent_node_sentinel + : col_ids[parent_node_ids[node_id]]; + })); + auto const list_children_end = thrust::copy_if( + rmm::exec_policy_nosync(stream), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + + num_nodes, + thrust::make_counting_iterator(0), + thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), + [d_ignore_vals = d_ignore_vals.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin()] __device__(size_type node_id) { + auto parent_node_id = parent_node_ids[node_id]; + return parent_node_id != parent_node_sentinel and + column_categories[col_ids[parent_node_id]] == NC_LIST and + (!d_ignore_vals[col_ids[parent_node_id]]); + }); + + auto const num_list_children = + list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); + thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), + parent_col_ids.begin(), + parent_col_ids.begin() + num_list_children, + node_ids.begin()); + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + num_list_children, + [node_ids = node_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + parent_col_ids = parent_col_ids.begin(), + row_offsets = row_offsets.begin(), + d_columns_data = d_columns_data.begin(), + num_list_children] __device__(size_type i) { + auto const node_id = node_ids[i]; + auto const parent_node_id = parent_node_ids[node_id]; + // scatter to list_offset + if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = + row_offsets[node_id]; + } + // last value of list child_offset is its size. + if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = + row_offsets[node_id] + 1; + } + }); + + // 5. scan on offsets. + for (auto& [id, col_ref] : columns) { + auto& col = col_ref.get(); + if (col.type == json_col_t::StringColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.string_offsets.begin(), + col.string_offsets.end(), + col.string_offsets.begin(), + thrust::maximum{}); + } else if (col.type == json_col_t::ListColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.child_offsets.begin(), + col.child_offsets.end(), + col.child_offsets.begin(), + thrust::maximum{}); + } + } + stream.synchronize(); +} } // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 912e93d52ae..7e4d975e431 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -485,16 +485,6 @@ std::pair, std::vector> device_json_co } } -template -auto make_device_json_column_dispatch(bool experimental, Args&&... 
args) -{ - if (experimental) { - return experimental::make_device_json_column(std::forward(args)...); - } else { - return make_device_json_column(std::forward(args)...); - } -} - table_with_metadata device_parse_nested_json(device_span d_input, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, @@ -523,16 +513,14 @@ table_with_metadata device_parse_nested_json(device_span d_input, #endif bool const is_array_of_arrays = [&]() { - std::array h_node_categories = {NC_ERR, NC_ERR}; - auto const size_to_copy = std::min(size_t{2}, gpu_tree.node_categories.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_node_categories.data(), - gpu_tree.node_categories.data(), - sizeof(node_t) * size_to_copy, - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + auto const size_to_copy = std::min(size_t{2}, gpu_tree.node_categories.size()); + if (size_to_copy == 0) return false; + auto const h_node_categories = cudf::detail::make_host_vector_sync( + device_span{gpu_tree.node_categories.data(), size_to_copy}, stream); + if (options.is_enabled_lines()) return h_node_categories[0] == NC_LIST; - return h_node_categories[0] == NC_LIST and h_node_categories[1] == NC_LIST; + return h_node_categories.size() >= 2 and h_node_categories[0] == NC_LIST and + h_node_categories[1] == NC_LIST; }(); auto [gpu_col_id, gpu_row_offsets] = @@ -553,16 +541,15 @@ table_with_metadata device_parse_nested_json(device_span d_input, 0); // Get internal JSON column - make_device_json_column_dispatch(options.is_enabled_experimental(), - d_input, - gpu_tree, - gpu_col_id, - gpu_row_offsets, - root_column, - is_array_of_arrays, - options, - stream, - mr); + make_device_json_column(d_input, + gpu_tree, + gpu_col_id, + gpu_row_offsets, + root_column, + is_array_of_arrays, + options, + stream, + mr); // data_root refers to the root column of the data represented by the given JSON string auto& data_root = diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 2d435dc8e1a..34a87918e57 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -16,6 +16,7 @@ #include "io/fst/lookup_tables.cuh" +#include #include #include #include @@ -24,7 +25,6 @@ #include #include -#include #include #include @@ -316,7 +316,7 @@ void normalize_single_quotes(datasource::owning_buffer& inda stream); rmm::device_buffer outbuf(indata.size() * 2, stream, mr); - rmm::device_scalar outbuf_size(stream, mr); + cudf::detail::device_scalar outbuf_size(stream, mr); parser.Transduce(reinterpret_cast(indata.data()), static_cast(indata.size()), static_cast(outbuf.data()), @@ -401,7 +401,7 @@ std:: stream); rmm::device_uvector outbuf_indices(inbuf.size(), stream, mr); - rmm::device_scalar outbuf_indices_size(stream, mr); + cudf::detail::device_scalar outbuf_indices_size(stream, mr); parser.Transduce(inbuf.data(), static_cast(inbuf.size()), thrust::make_discard_iterator(), diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index d949635c1cc..e2fe926ea19 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -264,16 +264,13 @@ tree_meta_t get_tree_representation(device_span tokens, error_count > 0) { auto const error_location = thrust::find(rmm::exec_policy(stream), tokens.begin(), tokens.end(), token_t::ErrorBegin); - SymbolOffsetT error_index; - CUDF_CUDA_TRY( - cudaMemcpyAsync(&error_index, - token_indices.data() + thrust::distance(tokens.begin(), error_location), - sizeof(SymbolOffsetT), - cudaMemcpyDefault, - 
stream.value())); - stream.synchronize(); + auto error_index = cudf::detail::make_host_vector_sync( + device_span{ + token_indices.data() + thrust::distance(tokens.begin(), error_location), 1}, + stream); + CUDF_FAIL("JSON Parser encountered an invalid format at location " + - std::to_string(error_index)); + std::to_string(error_index[0])); } auto const num_tokens = tokens.size(); diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 3d9a51833e0..f6be4539d7f 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -405,21 +405,6 @@ void make_device_json_column(device_span input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -namespace experimental { -/** - * @copydoc cudf::io::json::detail::make_device_json_column - */ -void make_device_json_column(device_span input, - tree_meta_t const& tree, - device_span col_ids, - device_span row_offsets, - device_json_column& root, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); -} // namespace experimental - /** * @brief Retrieves the parse_options to be used for type inference and type casting * diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 76816071d8c..60e78f4763d 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -21,6 +21,7 @@ #include "nested_json.hpp" #include +#include #include #include #include @@ -34,7 +35,6 @@ #include #include -#include #include #include @@ -83,8 +83,7 @@ struct tree_node { void check_input_size(std::size_t input_size) { // Transduce() writes symbol offsets that may be as large input_size-1 - CUDF_EXPECTS(input_size == 0 || - (input_size - 1) <= std::numeric_limits::max(), + CUDF_EXPECTS(input_size == 0 || (input_size - 1) <= std::numeric_limits::max(), "Given JSON input is too large"); } } // namespace @@ -1447,11 +1446,7 @@ void get_stack_context(device_span json_in, constexpr StackSymbolT read_symbol = 'x'; // Number of stack operations in the input (i.e., number of '{', '}', '[', ']' outside of quotes) - rmm::device_scalar d_num_stack_ops(stream); - - // Sequence of stack symbols and their position in the original input (sparse representation) - rmm::device_uvector stack_ops{json_in.size(), stream}; - rmm::device_uvector stack_op_indices{json_in.size(), stream}; + cudf::detail::device_scalar d_num_stack_ops(stream); // Prepare finite-state transducer that only selects '{', '}', '[', ']' outside of quotes constexpr auto max_translation_table_size = @@ -1469,11 +1464,26 @@ void get_stack_context(device_span json_in, // "Search" for relevant occurrence of brackets and braces that indicate the beginning/end // of structs/lists + // Run FST to estimate the sizes of translated buffers + json_to_stack_ops_fst.Transduce(json_in.begin(), + static_cast(json_in.size()), + thrust::make_discard_iterator(), + thrust::make_discard_iterator(), + d_num_stack_ops.data(), + to_stack_op::start_state, + stream); + + auto stack_ops_bufsize = d_num_stack_ops.value(stream); + // Sequence of stack symbols and their position in the original input (sparse representation) + rmm::device_uvector stack_ops{stack_ops_bufsize, stream}; + rmm::device_uvector stack_op_indices{stack_ops_bufsize, stream}; + + // Run bracket-brace FST to retrieve starting positions of structs and lists json_to_stack_ops_fst.Transduce(json_in.begin(), static_cast(json_in.size()), stack_ops.data(), 
stack_op_indices.data(), - d_num_stack_ops.data(), + thrust::make_discard_iterator(), to_stack_op::start_state, stream); @@ -1509,6 +1519,7 @@ std::pair, rmm::device_uvector> pr device_span token_indices, rmm::cuda_stream_view stream) { + CUDF_FUNC_RANGE(); // Instantiate FST for post-processing the token stream to remove all tokens that belong to an // invalid JSON line token_filter::UnwrapTokenFromSymbolOp sgid_op{}; @@ -1520,7 +1531,7 @@ std::pair, rmm::device_uvector> pr stream); auto const mr = cudf::get_current_device_resource_ref(); - rmm::device_scalar d_num_selected_tokens(stream, mr); + cudf::detail::device_scalar d_num_selected_tokens(stream, mr); rmm::device_uvector filtered_tokens_out{tokens.size(), stream, mr}; rmm::device_uvector filtered_token_indices_out{tokens.size(), stream, mr}; @@ -1639,26 +1650,33 @@ std::pair, rmm::device_uvector> ge std::size_t constexpr max_tokens_per_struct = 6; auto const max_token_out_count = cudf::util::div_rounding_up_safe(json_in.size(), min_chars_per_struct) * max_tokens_per_struct; - rmm::device_scalar num_written_tokens{stream}; + cudf::detail::device_scalar num_written_tokens{stream}; // In case we're recovering on invalid JSON lines, post-processing the token stream requires to // see a JSON-line delimiter as the very first item SymbolOffsetT const delimiter_offset = (format == tokenizer_pda::json_format_cfg_t::JSON_LINES_RECOVER ? 1 : 0); - rmm::device_uvector tokens{max_token_out_count + delimiter_offset, stream, mr}; - rmm::device_uvector tokens_indices{ - max_token_out_count + delimiter_offset, stream, mr}; + // Run FST to estimate the size of output buffers json_to_tokens_fst.Transduce(zip_in, static_cast(json_in.size()), - tokens.data() + delimiter_offset, - tokens_indices.data() + delimiter_offset, + thrust::make_discard_iterator(), + thrust::make_discard_iterator(), num_written_tokens.data(), tokenizer_pda::start_state, stream); auto const num_total_tokens = num_written_tokens.value(stream) + delimiter_offset; - tokens.resize(num_total_tokens, stream); - tokens_indices.resize(num_total_tokens, stream); + rmm::device_uvector tokens{num_total_tokens, stream, mr}; + rmm::device_uvector tokens_indices{num_total_tokens, stream, mr}; + + // Run FST to translate the input JSON string into tokens and indices at which they occur + json_to_tokens_fst.Transduce(zip_in, + static_cast(json_in.size()), + tokens.data() + delimiter_offset, + tokens_indices.data() + delimiter_offset, + thrust::make_discard_iterator(), + tokenizer_pda::start_state, + stream); if (delimiter_offset == 1) { tokens.set_element(0, token_t::LineEnd, stream); diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 83c7b663980..d41d137a2c9 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -87,13 +88,25 @@ void validate_token_stream(device_span d_input, { CUDF_FUNC_RANGE(); if (!options.is_strict_validation()) { return; } + + rmm::device_uvector d_invalid = cudf::detail::make_zeroed_device_uvector_async( + tokens.size(), stream, cudf::get_current_device_resource_ref()); + using token_t = cudf::io::json::token_t; - cudf::detail::optional_trie trie_na = - cudf::detail::create_serialized_trie(options.get_na_values(), stream); - auto trie_na_view = cudf::detail::make_trie_view(trie_na); + auto literals = options.get_na_values(); + literals.emplace_back("null"); // added these too to single trie + literals.emplace_back("true"); + 
literals.emplace_back("false"); + + cudf::detail::optional_trie trie_literals = + cudf::detail::create_serialized_trie(literals, stream); + cudf::detail::optional_trie trie_nonnumeric = cudf::detail::create_serialized_trie( + {"NaN", "Infinity", "+INF", "+Infinity", "-INF", "-Infinity"}, stream); + auto validate_values = cuda::proclaim_return_type( [data = d_input.data(), - trie_na = trie_na_view, + trie_literals = cudf::detail::make_trie_view(trie_literals), + trie_nonnumeric = cudf::detail::make_trie_view(trie_nonnumeric), allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(), allow_nonnumeric = options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start, @@ -101,24 +114,15 @@ void validate_token_stream(device_span d_input, // This validates an unquoted value. A value must match https://www.json.org/json-en.html // but the leading and training whitespace should already have been removed, and is not // a string - auto c = data[start]; - auto is_null_literal = serialized_trie_contains(trie_na, {data + start, end - start}); - if (is_null_literal) { - return true; - } else if ('n' == c) { - return substr_eq(data, start, end, 4, "null"); - } else if ('t' == c) { - return substr_eq(data, start, end, 4, "true"); - } else if ('f' == c) { - return substr_eq(data, start, end, 5, "false"); - } else if (allow_nonnumeric && c == 'N') { - return substr_eq(data, start, end, 3, "NaN"); - } else if (allow_nonnumeric && c == 'I') { - return substr_eq(data, start, end, 8, "Infinity"); - } else if (allow_nonnumeric && c == '+') { - return substr_eq(data, start, end, 4, "+INF") || - substr_eq(data, start, end, 9, "+Infinity"); - } else if ('-' == c || c <= '9' && 'c' >= '0') { + auto const is_literal = serialized_trie_contains(trie_literals, {data + start, end - start}); + if (is_literal) { return true; } + if (allow_nonnumeric) { + auto const is_nonnumeric = + serialized_trie_contains(trie_nonnumeric, {data + start, end - start}); + if (is_nonnumeric) { return true; } + } + auto c = data[start]; + if ('-' == c || c <= '9' && 'c' >= '0') { // number auto num_state = number_state::START; for (auto at = start; at < end; at++) { @@ -140,9 +144,6 @@ void validate_token_stream(device_span d_input, num_state = number_state::LEADING_ZERO; } else if (c >= '1' && c <= '9') { num_state = number_state::WHOLE; - } else if (allow_nonnumeric && 'I' == c) { - return substr_eq(data, start, end, 4, "-INF") || - substr_eq(data, start, end, 9, "-Infinity"); } else { return false; } @@ -273,33 +274,44 @@ void validate_token_stream(device_span d_input, auto num_tokens = tokens.size(); auto count_it = thrust::make_counting_iterator(0); - auto predicate = [tokens = tokens.begin(), - token_indices = token_indices.begin(), - validate_values, - validate_strings] __device__(auto i) -> bool { + auto predicate = cuda::proclaim_return_type([tokens = tokens.begin(), + token_indices = token_indices.begin(), + validate_values, + validate_strings] __device__(auto i) -> bool { if (tokens[i] == token_t::ValueEnd) { return !validate_values(token_indices[i - 1], token_indices[i]); } else if (tokens[i] == token_t::FieldNameEnd || tokens[i] == token_t::StringEnd) { return !validate_strings(token_indices[i - 1], token_indices[i]); } return false; - }; + }); + + auto conditional_invalidout_it = + cudf::detail::make_tabulate_output_iterator(cuda::proclaim_return_type( + [d_invalid = d_invalid.begin()] __device__(size_type i, bool x) -> void { + if (x) { d_invalid[i] = true; } + })); + 
thrust::transform(rmm::exec_policy_nosync(stream), + count_it, + count_it + num_tokens, + conditional_invalidout_it, + predicate); using scan_type = write_if::scan_type; auto conditional_write = write_if{tokens.begin(), num_tokens}; auto conditional_output_it = cudf::detail::make_tabulate_output_iterator(conditional_write); - auto transform_op = cuda::proclaim_return_type( - [predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type { - if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd}; - return {static_cast(tokens[i]), tokens[i] == token_t::LineEnd}; - }); - auto binary_op = cuda::proclaim_return_type( + auto binary_op = cuda::proclaim_return_type( [] __device__(scan_type prev, scan_type curr) -> scan_type { auto op_result = (prev.first == token_t::ErrorBegin ? prev.first : curr.first); - return scan_type((curr.second ? curr.first : op_result), prev.second | curr.second); + return {(curr.second ? curr.first : op_result), prev.second | curr.second}; + }); + auto transform_op = cuda::proclaim_return_type( + [d_invalid = d_invalid.begin(), tokens = tokens.begin()] __device__(auto i) -> scan_type { + if (d_invalid[i]) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd}; + return {static_cast(tokens[i]), tokens[i] == token_t::LineEnd}; }); - thrust::transform_inclusive_scan(rmm::exec_policy(stream), + thrust::transform_inclusive_scan(rmm::exec_policy_nosync(stream), count_it, count_it + num_tokens, conditional_output_it, diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 99a5b17bce8..8a740ae17ef 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -315,13 +315,12 @@ device_span ingest_raw_input(device_span buffer, // Reading to host because decompression of a single block is much faster on the CPU sources[0]->host_read(range_offset, remaining_bytes_to_read, hbuffer.data()); auto uncomp_data = decompress(compression, hbuffer); - CUDF_CUDA_TRY(cudaMemcpyAsync(buffer.data(), - reinterpret_cast(uncomp_data.data()), - uncomp_data.size() * sizeof(char), - cudaMemcpyHostToDevice, - stream.value())); - stream.synchronize(); - return buffer.first(uncomp_data.size()); + auto ret_buffer = buffer.first(uncomp_data.size()); + cudf::detail::cuda_memcpy( + ret_buffer, + host_span{reinterpret_cast(uncomp_data.data()), uncomp_data.size()}, + stream); + return ret_buffer; } table_with_metadata read_json(host_span> sources, @@ -351,10 +350,16 @@ table_with_metadata read_json(host_span> sources, * JSON inputs. */ std::size_t const total_source_size = sources_size(sources, 0, 0); - std::size_t chunk_offset = reader_opts.get_byte_range_offset(); - std::size_t chunk_size = reader_opts.get_byte_range_size(); - chunk_size = !chunk_size ? total_source_size - chunk_offset - : std::min(chunk_size, total_source_size - chunk_offset); + + // Batching is enabled only for JSONL inputs, not regular JSON files + CUDF_EXPECTS( + reader_opts.is_enabled_lines() || total_source_size < std::numeric_limits::max(), + "Parsing Regular JSON inputs of size greater than INT_MAX bytes is not supported"); + + std::size_t chunk_offset = reader_opts.get_byte_range_offset(); + std::size_t chunk_size = reader_opts.get_byte_range_size(); + chunk_size = !chunk_size ? 
total_source_size - chunk_offset + : std::min(chunk_size, total_source_size - chunk_offset); std::size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); std::size_t const batch_size_upper_bound = get_batch_size_upper_bound(); diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index dc7199d7ab1..e1241f8f90c 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -170,6 +170,9 @@ struct escape_strings_fn { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + if (column_v.is_empty()) { // empty begets empty + return make_empty_column(type_id::STRING); + } auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(*this, column_v.size(), stream, mr); diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 5be75350951..0cb5c382631 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -77,20 +77,6 @@ void rowgroup_char_counts(device_2dspan counts, counts, orc_columns, rowgroup_bounds, str_col_indexes); } -template -CUDF_KERNEL void __launch_bounds__(block_size) - initialize_dictionary_hash_maps_kernel(device_span dictionaries) -{ - auto const dict_map = dictionaries[blockIdx.x].map_slots; - auto const t = threadIdx.x; - for (size_type i = 0; i < dict_map.size(); i += block_size) { - if (t + i < dict_map.size()) { - new (&dict_map[t + i].first) map_type::atomic_key_type{KEY_SENTINEL}; - new (&dict_map[t + i].second) map_type::atomic_mapped_type{VALUE_SENTINEL}; - } - } -} - struct equality_functor { column_device_view const& col; __device__ bool operator()(size_type lhs_idx, size_type rhs_idx) const @@ -109,6 +95,9 @@ struct hash_functor { } }; +// Probing scheme to use for the hash map +using probing_scheme_type = cuco::linear_probing; + template CUDF_KERNEL void __launch_bounds__(block_size) populate_dictionary_hash_maps_kernel(device_2dspan dictionaries, @@ -121,26 +110,34 @@ CUDF_KERNEL void __launch_bounds__(block_size) auto const& col = columns[dict.column_idx]; // Make a view of the hash map - auto hash_map_mutable = map_type::device_mutable_view(dict.map_slots.data(), - dict.map_slots.size(), - cuco::empty_key{KEY_SENTINEL}, - cuco::empty_value{VALUE_SENTINEL}); auto const hash_fn = hash_functor{col}; auto const equality_fn = equality_functor{col}; + storage_ref_type const storage_ref{dict.map_slots.size(), dict.map_slots.data()}; + // Make a view of the hash map. 
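// Illustrative annotation (not from this PR): two details of the rewritten
// kernel below are easy to miss. First, the ref built here is a non-owning
// view over externally allocated `window_type` slots, scoped to
// `cuco::thread_scope_block` because a single thread block owns each
// (column, stripe) map; concurrently inserting blocks would need
// `thread_scope_device`. Second, the row loop uses a uniform trip count so
// every thread survives to the block-wide reduction that follows. A minimal
// sketch of that loop shape, with hypothetical names:
//
//   for (auto i = thread_index_type{threadIdx.x}; i - threadIdx.x < n;
//        i += blockDim.x) {
//     bool const active = i < n;       // tail threads simply do no work
//     if (active) { /* insert element i */ }
//   }  // all threads reach any __syncthreads()/block reduce after the loop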
+ auto hash_map_ref = cuco::static_map_ref{cuco::empty_key{KEY_SENTINEL}, + cuco::empty_value{VALUE_SENTINEL}, + equality_fn, + probing_scheme_type{hash_fn}, + cuco::thread_scope_block, + storage_ref}; + + // Create a map ref with `cuco::insert` operator + auto has_map_insert_ref = hash_map_ref.rebind_operators(cuco::insert); + auto const start_row = dict.start_row; auto const end_row = dict.start_row + dict.num_rows; size_type entry_count{0}; size_type char_count{0}; + // all threads should loop the same number of times for (thread_index_type cur_row = start_row + t; cur_row - t < end_row; cur_row += block_size) { auto const is_valid = cur_row < end_row and col.is_valid(cur_row); if (is_valid) { // insert element at cur_row to hash map and count successful insertions - auto const is_unique = - hash_map_mutable.insert(std::pair(cur_row, cur_row), hash_fn, equality_fn); + auto const is_unique = has_map_insert_ref.insert(cuco::pair{cur_row, cur_row}); if (is_unique) { ++entry_count; @@ -175,24 +172,23 @@ CUDF_KERNEL void __launch_bounds__(block_size) if (not dict.is_enabled) { return; } auto const t = threadIdx.x; - auto map = map_type::device_view(dict.map_slots.data(), - dict.map_slots.size(), - cuco::empty_key{KEY_SENTINEL}, - cuco::empty_value{VALUE_SENTINEL}); - __shared__ cuda::atomic counter; using cuda::std::memory_order_relaxed; if (t == 0) { new (&counter) cuda::atomic{0}; } __syncthreads(); + for (size_type i = 0; i < dict.map_slots.size(); i += block_size) { if (t + i < dict.map_slots.size()) { - auto* slot = reinterpret_cast(map.begin_slot() + t + i); - auto key = slot->first; - if (key != KEY_SENTINEL) { - auto loc = counter.fetch_add(1, memory_order_relaxed); - dict.data[loc] = key; - slot->second = loc; + auto window = dict.map_slots.begin() + t + i; + // Collect all slots from each window. + for (auto& slot : *window) { + auto const key = slot.first; + if (key != KEY_SENTINEL) { + auto loc = counter.fetch_add(1, memory_order_relaxed); + dict.data[loc] = key; + slot.second = loc; + } } } } @@ -205,47 +201,42 @@ CUDF_KERNEL void __launch_bounds__(block_size) { auto const col_idx = blockIdx.x; auto const stripe_idx = blockIdx.y; + auto const t = threadIdx.x; auto const& dict = dictionaries[col_idx][stripe_idx]; auto const& col = columns[dict.column_idx]; if (not dict.is_enabled) { return; } - auto const t = threadIdx.x; + // Make a view of the hash map + auto const hash_fn = hash_functor{col}; + auto const equality_fn = equality_functor{col}; + + storage_ref_type const storage_ref{dict.map_slots.size(), dict.map_slots.data()}; + // Make a view of the hash map. 
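// Illustrative annotation (not from this PR): same slot storage, different
// operator set. Where the populate kernel rebound its ref with
// `cuco::insert`, this lookup kernel rebinds with `cuco::find`. The lookup
// shape, with hypothetical names:
//
//   auto find_ref = map_ref.rebind_operators(cuco::find);
//   auto const it = find_ref.find(key);
//   if (it != find_ref.end()) { out = it->second; }
//
// Unlike the removed `map_type::device_view` code, no atomic reinterpret is
// needed when reading `it->second`: the map is fully populated before this
// kernel launches and is never mutated during lookups.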
+ auto hash_map_ref = cuco::static_map_ref{cuco::empty_key{KEY_SENTINEL}, + cuco::empty_value{VALUE_SENTINEL}, + equality_fn, + probing_scheme_type{hash_fn}, + cuco::thread_scope_block, + storage_ref}; + + // Create a map ref with `cuco::find` operator + auto hash_map_find_ref = hash_map_ref.rebind_operators(cuco::find); + auto const start_row = dict.start_row; auto const end_row = dict.start_row + dict.num_rows; - auto const map = map_type::device_view(dict.map_slots.data(), - dict.map_slots.size(), - cuco::empty_key{KEY_SENTINEL}, - cuco::empty_value{VALUE_SENTINEL}); - - thread_index_type cur_row = start_row + t; - while (cur_row < end_row) { + for (thread_index_type cur_row = start_row + t; cur_row < end_row; cur_row += block_size) { if (col.is_valid(cur_row)) { - auto const hash_fn = hash_functor{col}; - auto const equality_fn = equality_functor{col}; - auto const found_slot = map.find(cur_row, hash_fn, equality_fn); - cudf_assert(found_slot != map.end() && + auto const found_slot = hash_map_find_ref.find(cur_row); + // Fail if we didn't find the previously inserted key. + cudf_assert(found_slot != hash_map_find_ref.end() && "Unable to find value in map in dictionary index construction"); - if (found_slot != map.end()) { - // No need for atomic as this is not going to be modified by any other thread - auto const val_ptr = reinterpret_cast<size_type*>(&found_slot->second); - dict.index[cur_row] = *val_ptr; - } + dict.index[cur_row] = found_slot->second; } - cur_row += block_size; } } -void initialize_dictionary_hash_maps(device_2dspan<stripe_dictionary> dictionaries, - rmm::cuda_stream_view stream) -{ - if (dictionaries.count() == 0) { return; } - constexpr int block_size = 1024; - initialize_dictionary_hash_maps_kernel<block_size> - <<<dictionaries.count(), block_size, 0, stream.value()>>>(dictionaries.flat_view()); -} - void populate_dictionary_hash_maps(device_2dspan<stripe_dictionary> dictionaries, device_span<orc_column_device_view const> columns, rmm::cuda_stream_view stream) diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 790532c9d54..5ab36fdae8e 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -258,7 +258,7 @@ class ProtobufReader { private: template <int index> - friend class FunctionSwitchImpl; + friend struct FunctionSwitchImpl; void skip_bytes(size_t bytecnt) { diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index 8c7ccf0527f..0949fafe9a4 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -21,6 +21,7 @@ #include "io/utilities/column_buffer.hpp" #include "orc.hpp" +#include #include #include #include @@ -40,19 +41,27 @@ namespace gpu { using cudf::detail::device_2dspan; using cudf::detail::host_2dspan; +using key_type = size_type; +using mapped_type = size_type; +using slot_type = cuco::pair<key_type, mapped_type>; +auto constexpr map_cg_size = + 1; ///< A CUDA Cooperative Group of 1 thread (set for best performance) to handle each subset. + ///< Note: Adjust insert and find loops to use `cg::tile` if increasing this. +auto constexpr window_size = + 1; ///< Number of concurrent slots (set for best performance) handled by each thread. +auto constexpr occupancy_factor = 1.43f; ///< cuCollections suggests using a hash map of size + ///< N * (1/0.7) = 1.43 to target a 70% occupancy factor.
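+// For example: a stripe with 1'000'000 string rows gets a map of
+// 1'000'000 * 1.43 = 1'430'000 slots, so it stays at most ~70% full even when
+// every row is distinct.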
+using storage_type = cuco::aow_storage<slot_type, window_size, cuco::extent<std::size_t>, + cudf::detail::cuco_allocator<char>>; +using storage_ref_type = typename storage_type::ref_type; +using window_type = typename storage_type::window_type; +using slot_type = cuco::pair<key_type, mapped_type>; + auto constexpr KEY_SENTINEL = size_type{-1}; auto constexpr VALUE_SENTINEL = size_type{-1}; -using map_type = cuco::legacy::static_map<size_type, size_type>; - -/** - * @brief The alias of `map_type::pair_atomic_type` class. - * - * Declare this struct by trivial subclassing instead of type aliasing so we can have forward - * declaration of this struct somewhere else. - */ -struct slot_type : public map_type::slot_type {}; - struct CompressedStreamInfo { CompressedStreamInfo() = default; explicit constexpr CompressedStreamInfo(uint8_t const* compressed_data_, size_t compressed_size_) @@ -184,11 +193,11 @@ struct StripeStream { */ struct stripe_dictionary { // input - device_span<slot_type> map_slots; // hash map storage - uint32_t column_idx = 0; // column index - size_type start_row = 0; // first row in the stripe - size_type start_rowgroup = 0; // first rowgroup in the stripe - size_type num_rows = 0; // number of rows in the stripe + device_span<window_type> map_slots; // hash map (windows) storage + uint32_t column_idx = 0; // column index + size_type start_row = 0; // first row in the stripe + size_type start_rowgroup = 0; // first rowgroup in the stripe + size_type num_rows = 0; // number of rows in the stripe // output device_span<uint32_t> data; // index of elements in the column to include in the dictionary diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 01ee5ad177d..fcaee9c548e 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -500,6 +500,8 @@ void reader_impl::load_next_stripe_data(read_mode mode) auto const [read_begin, read_end] = merge_selected_ranges(_file_itm_data.stripe_data_read_ranges, load_stripe_range); + bool stream_synchronized{false}; + for (auto read_idx = read_begin; read_idx < read_end; ++read_idx) { auto const& read_info = _file_itm_data.data_read_info[read_idx]; auto const source_ptr = _metadata.per_file_metadata[read_info.source_idx].source; @@ -507,10 +509,17 @@ void reader_impl::load_next_stripe_data(read_mode mode) lvl_stripe_data[read_info.level][read_info.stripe_idx - stripe_start].data()); if (source_ptr->is_device_read_preferred(read_info.length)) { - device_read_tasks.push_back( - std::pair(source_ptr->device_read_async( - read_info.offset, read_info.length, dst_base + read_info.dst_pos, _stream), - read_info.length)); + // `device_read_async` may not use _stream at all. + // Instead, it may use some other stream(s) to sync the H->D memcpy. + // As such, we need to make sure the device buffers in `lvl_stripe_data` are ready first.
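+ // A single synchronization is enough: the buffers in `lvl_stripe_data` were
+ // prepared on _stream before this loop, so draining _stream once makes all of
+ // them safe targets for reads issued on other streams.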
+ if (!stream_synchronized) { + _stream.synchronize(); + stream_synchronized = true; + } + device_read_tasks.emplace_back( + source_ptr->device_read_async( + read_info.offset, read_info.length, dst_base + read_info.dst_pos, _stream), + read_info.length); } else { auto buffer = source_ptr->host_read(read_info.offset, read_info.length); @@ -659,8 +668,8 @@ void reader_impl::load_next_stripe_data(read_mode mode) if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; - auto compinfo = cudf::detail::hostdevice_span( - hd_compinfo.begin(), hd_compinfo.d_begin(), stream_range.size()); + auto compinfo = cudf::detail::hostdevice_span{hd_compinfo}.subspan( + 0, stream_range.size()); for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { auto const& info = stream_info[stream_idx]; auto const dst_base = diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index d628e936cb1..c42348a165f 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -22,6 +22,7 @@ #include "io/utilities/hostdevice_span.hpp" #include +#include #include #include #include @@ -32,7 +33,6 @@ #include #include -#include #include #include @@ -451,7 +451,7 @@ void decode_stream_data(int64_t num_dicts, update_null_mask(chunks, out_buffers, stream, mr); } - rmm::device_scalar error_count(0, stream); + cudf::detail::device_scalar error_count(0, stream); gpu::DecodeOrcColumnData(chunks.base_device_ptr(), global_dict.data(), row_groups, @@ -508,21 +508,20 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector const& auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async( prefix_sums_to_update, stream, cudf::get_current_device_resource_ref()); - thrust::for_each( - rmm::exec_policy_nosync(stream), - d_prefix_sums_to_update.begin(), - d_prefix_sums_to_update.end(), - [num_stripes, chunks = cudf::detail::device_2dspan{chunks}] __device__( - auto const& idx_psums) { - auto const col_idx = idx_psums.first; - auto const psums = idx_psums.second; - thrust::transform(thrust::seq, - thrust::make_counting_iterator(0ul), - thrust::make_counting_iterator(num_stripes), - psums, - [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; }); - thrust::inclusive_scan(thrust::seq, psums, psums + num_stripes, psums); - }); + thrust::for_each(rmm::exec_policy_nosync(stream), + d_prefix_sums_to_update.begin(), + d_prefix_sums_to_update.end(), + [num_stripes, chunks = chunks.device_view()] __device__(auto const& idx_psums) { + auto const col_idx = idx_psums.first; + auto const psums = idx_psums.second; + thrust::transform( + thrust::seq, + thrust::make_counting_iterator(0ul), + thrust::make_counting_iterator(num_stripes), + psums, + [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; }); + thrust::inclusive_scan(thrust::seq, psums, psums + num_stripes, psums); + }); // `prefix_sums_to_update` goes out of scope, copy has to be done before we return stream.synchronize(); } @@ -554,12 +553,12 @@ void aggregate_child_meta(std::size_t level, col_meta.num_child_rows_per_stripe.resize(number_of_child_chunks); col_meta.rwgrp_meta.resize(num_of_rowgroups * num_child_cols); - auto child_start_row = cudf::detail::host_2dspan( - col_meta.child_start_row.data(), num_of_stripes, num_child_cols); - auto num_child_rows_per_stripe = cudf::detail::host_2dspan( - col_meta.num_child_rows_per_stripe.data(), num_of_stripes, 
num_child_cols); + auto child_start_row = + cudf::detail::host_2dspan(col_meta.child_start_row, num_child_cols); + auto num_child_rows_per_stripe = + cudf::detail::host_2dspan(col_meta.num_child_rows_per_stripe, num_child_cols); auto rwgrp_meta = cudf::detail::host_2dspan( - col_meta.rwgrp_meta.data(), num_of_rowgroups, num_child_cols); + col_meta.rwgrp_meta, num_child_cols); int index = 0; // number of child column processed @@ -951,8 +950,9 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) // Setup row group descriptors if using indexes. if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { - auto compinfo = cudf::detail::hostdevice_span( - hd_compinfo.begin(), hd_compinfo.d_begin(), stream_range.size()); + auto const compinfo = + cudf::detail::hostdevice_span{hd_compinfo}.subspan( + 0, stream_range.size()); auto decomp_data = decompress_stripe_data(load_stripe_range, stream_range, stripe_count, diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 5c70e35fd2e..ed0b6969154 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -20,6 +20,8 @@ #include "orc_gpu.hpp" #include +#include +#include #include #include #include @@ -1087,37 +1089,42 @@ CUDF_KERNEL void __launch_bounds__(block_size) /** * @brief Merge chunked column data into a single contiguous stream * - * @param[in,out] strm_desc StripeStream device array [stripe][stream] - * @param[in,out] streams List of encoder chunk streams [column][rowgroup] + * @param[in] strm_desc StripeStream device array [stripe][stream] + * @param[in] streams List of encoder chunk streams [column][rowgroup] + * @param[out] srcs List of source encoder chunk stream data addresses + * @param[out] dsts List of destination StripeStream data addresses + * @param[out] sizes List of stream sizes in bytes */ // blockDim {compact_streams_block_size,1,1} CUDF_KERNEL void __launch_bounds__(compact_streams_block_size) - gpuCompactOrcDataStreams(device_2dspan strm_desc, - device_2dspan streams) + gpuInitBatchedMemcpy(device_2dspan strm_desc, + device_2dspan streams, + device_span srcs, + device_span dsts, + device_span sizes) { - __shared__ __align__(16) StripeStream ss; - - auto const stripe_id = blockIdx.x; + auto const stripe_id = cudf::detail::grid_1d::global_thread_id(); auto const stream_id = blockIdx.y; - auto const t = threadIdx.x; + if (stripe_id >= strm_desc.size().first) { return; } - if (t == 0) { ss = strm_desc[stripe_id][stream_id]; } - __syncthreads(); + auto const out_id = stream_id * strm_desc.size().first + stripe_id; + StripeStream ss = strm_desc[stripe_id][stream_id]; if (ss.data_ptr == nullptr) { return; } auto const cid = ss.stream_type; auto dst_ptr = ss.data_ptr; for (auto group = ss.first_chunk_id; group < ss.first_chunk_id + ss.num_chunks; ++group) { + auto const out_id = stream_id * streams.size().second + group; + srcs[out_id] = streams[ss.column_id][group].data_ptrs[cid]; + dsts[out_id] = dst_ptr; + + // Also update the stream here, data will be copied in a separate kernel + streams[ss.column_id][group].data_ptrs[cid] = dst_ptr; + auto const len = streams[ss.column_id][group].lengths[cid]; - if (len > 0) { - auto const src_ptr = streams[ss.column_id][group].data_ptrs[cid]; - for (uint32_t i = t; i < len; i += blockDim.x) { - dst_ptr[i] = src_ptr[i]; - } - __syncthreads(); - } - if (t == 0) { streams[ss.column_id][group].data_ptrs[cid] = dst_ptr; } + // len is the size (in bytes) of the current stream. 
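+ // Zero-length streams need no special handling: they become zero-byte entries
+ // in the batch, and a zero-byte copy is a no-op (hence the old `if (len > 0)`
+ // guard is gone).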
+ sizes[out_id] = len; dst_ptr += len; } } @@ -1325,9 +1332,26 @@ void CompactOrcDataStreams(device_2dspan strm_desc, device_2dspan enc_streams, rmm::cuda_stream_view stream) { + auto const num_rowgroups = enc_streams.size().second; + auto const num_streams = strm_desc.size().second; + auto const num_stripes = strm_desc.size().first; + auto const num_chunks = num_rowgroups * num_streams; + auto srcs = cudf::detail::make_zeroed_device_uvector_async( + num_chunks, stream, rmm::mr::get_current_device_resource()); + auto dsts = cudf::detail::make_zeroed_device_uvector_async( + num_chunks, stream, rmm::mr::get_current_device_resource()); + auto lengths = cudf::detail::make_zeroed_device_uvector_async( + num_chunks, stream, rmm::mr::get_current_device_resource()); + dim3 dim_block(compact_streams_block_size, 1); - dim3 dim_grid(strm_desc.size().first, strm_desc.size().second); - gpuCompactOrcDataStreams<<>>(strm_desc, enc_streams); + dim3 dim_grid(cudf::util::div_rounding_up_unsafe(num_stripes, compact_streams_block_size), + strm_desc.size().second); + gpuInitBatchedMemcpy<<>>( + strm_desc, enc_streams, srcs, dsts, lengths); + + // Copy streams in a batched manner. + cudf::detail::batched_memcpy_async( + srcs.begin(), dsts.begin(), lengths.begin(), lengths.size(), stream); } std::optional CompressOrcDataStreams( diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 60a64fb0ee6..d432deb8e79 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -19,7 +19,9 @@ * @brief cuDF-IO ORC writer class implementation */ +#include "cudf/detail/utilities/cuda_memcpy.hpp" #include "io/comp/nvcomp_adapter.hpp" +#include "io/orc/orc_gpu.hpp" #include "io/statistics/column_statistics.cuh" #include "io/utilities/column_utils.cuh" #include "writer_impl.hpp" @@ -717,8 +719,8 @@ std::vector> calculate_aligned_rowgroup_bounds( auto d_pd_set_counts_data = rmm::device_uvector( orc_table.num_columns() * segmentation.num_rowgroups(), stream); - auto const d_pd_set_counts = device_2dspan{ - d_pd_set_counts_data.data(), segmentation.num_rowgroups(), orc_table.num_columns()}; + auto const d_pd_set_counts = + device_2dspan{d_pd_set_counts_data, orc_table.num_columns()}; gpu::reduce_pushdown_masks(orc_table.d_columns, segmentation.rowgroups, d_pd_set_counts, stream); auto aligned_rgs = hostdevice_2dvector( @@ -739,7 +741,7 @@ std::vector> calculate_aligned_rowgroup_bounds( [columns = device_span{orc_table.d_columns}, stripes = device_span{d_stripes}, d_pd_set_counts, - out_rowgroups = device_2dspan{aligned_rgs}] __device__(auto& idx) { + out_rowgroups = aligned_rgs.device_view()] __device__(auto& idx) { uint32_t const col_idx = idx / stripes.size(); // No alignment needed for root columns if (not columns[col_idx].parent_index.has_value()) return; @@ -911,7 +913,7 @@ encoded_data encode_columns(orc_table_view const& orc_table, rmm::exec_policy(stream), thrust::make_counting_iterator(0ul), chunks.count(), - [chunks = device_2dspan{chunks}, + [chunks = chunks.device_view(), cols = device_span{orc_table.d_columns}] __device__(auto& idx) { auto const col_idx = idx / chunks.size().second; auto const rg_idx = idx % chunks.size().second; @@ -1407,7 +1409,8 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, num_entries_seen += stripes_per_col; } - std::vector file_stats_merge(num_file_blobs); + auto file_stats_merge = + cudf::detail::make_host_vector(num_file_blobs, stream); for (auto i = 0u; i < num_file_blobs; ++i) { auto col_stats = &file_stats_merge[i]; 
col_stats->col_dtype = per_chunk_stats.col_types[i]; @@ -1417,11 +1420,10 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, } auto d_file_stats_merge = stats_merge.device_ptr(num_stripe_blobs); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_file_stats_merge, - file_stats_merge.data(), - num_file_blobs * sizeof(statistics_merge_group), - cudaMemcpyDefault, - stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{stats_merge.device_ptr(num_stripe_blobs), num_file_blobs}, + file_stats_merge, + stream); auto file_stat_chunks = stat_chunks.data() + num_stripe_blobs; detail::merge_group_statistics( @@ -1572,7 +1574,7 @@ void write_index_stream(int32_t stripe_id, * @param[in] strm_desc Stream's descriptor * @param[in] enc_stream Chunk's streams * @param[in] compressed_data Compressed stream data - * @param[in,out] stream_out Temporary host output buffer + * @param[in,out] bounce_buffer Pinned memory bounce buffer for D2H data transfer * @param[in,out] stripe Stream's parent stripe * @param[in,out] streams List of all streams * @param[in] compression_kind The compression kind @@ -1583,7 +1585,7 @@ void write_index_stream(int32_t stripe_id, std::future write_data_stream(gpu::StripeStream const& strm_desc, gpu::encoder_chunk_streams const& enc_stream, uint8_t const* compressed_data, - uint8_t* stream_out, + host_span bounce_buffer, StripeInformation* stripe, orc_streams* streams, CompressionKind compression_kind, @@ -1603,11 +1605,10 @@ std::future write_data_stream(gpu::StripeStream const& strm_desc, if (out_sink->is_device_write_preferred(length)) { return out_sink->device_write_async(stream_in, length, stream); } else { - CUDF_CUDA_TRY( - cudaMemcpyAsync(stream_out, stream_in, length, cudaMemcpyDefault, stream.value())); - stream.synchronize(); + cudf::detail::cuda_memcpy( + bounce_buffer.subspan(0, length), device_span{stream_in, length}, stream); - out_sink->host_write(stream_out, length); + out_sink->host_write(bounce_buffer.data(), length); return std::async(std::launch::deferred, [] {}); } }(); @@ -1897,7 +1898,7 @@ hostdevice_2dvector calculate_rowgroup_bounds(orc_table_view cons thrust::make_counting_iterator(0ul), num_rowgroups, [cols = device_span{orc_table.d_columns}, - rg_bounds = device_2dspan{rowgroup_bounds}, + rg_bounds = rowgroup_bounds.device_view(), rowgroup_size] __device__(auto rg_idx) mutable { thrust::transform( thrust::seq, cols.begin(), cols.end(), rg_bounds[rg_idx].begin(), [&](auto const& col) { @@ -1987,8 +1988,7 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table, d_tmp_rowgroup_sizes.end(), [src = esizes.data(), col_idx = col_idx, - rg_bounds = device_2dspan{ - segmentation.rowgroups}] __device__(auto idx) { + rg_bounds = segmentation.rowgroups.device_view()] __device__(auto idx) { return src[rg_bounds[idx][col_idx].end - 1]; }); @@ -2050,7 +2050,7 @@ auto set_rowgroup_char_counts(orc_table_view& orc_table, auto const num_str_cols = orc_table.num_string_columns(); auto counts = rmm::device_uvector(num_str_cols * num_rowgroups, stream); - auto counts_2d_view = device_2dspan(counts.data(), num_str_cols, num_rowgroups); + auto counts_2d_view = device_2dspan(counts, num_rowgroups); gpu::rowgroup_char_counts(counts_2d_view, orc_table.d_columns, rowgroup_bounds, @@ -2110,7 +2110,9 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, bool sort_dictionaries, rmm::cuda_stream_view stream) { - std::vector>> hash_maps_storage( + // Variable to keep track of the current total map storage size + size_t total_map_storage_size 
= 0; + std::vector> hash_maps_storage_offsets( orc_table.string_column_indices.size()); for (auto col_idx : orc_table.string_column_indices) { auto& str_column = orc_table.column(col_idx); @@ -2119,14 +2121,21 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, stripe.size == 0 ? 0 : segmentation.rowgroups[stripe.first + stripe.size - 1][col_idx].end - segmentation.rowgroups[stripe.first][col_idx].begin; - hash_maps_storage[str_column.str_index()].emplace_back(stripe_num_rows * 1.43, stream); + hash_maps_storage_offsets[str_column.str_index()].emplace_back(total_map_storage_size); + total_map_storage_size += stripe_num_rows * gpu::occupancy_factor; } + hash_maps_storage_offsets[str_column.str_index()].emplace_back(total_map_storage_size); } hostdevice_2dvector stripe_dicts( orc_table.num_string_columns(), segmentation.num_stripes(), stream); if (stripe_dicts.count() == 0) return {std::move(stripe_dicts), {}, {}}; + // Create a single bulk storage to use for all sub-dictionaries + auto map_storage = std::make_unique( + total_map_storage_size, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}); + // Initialize stripe dictionaries for (auto col_idx : orc_table.string_column_indices) { auto& str_column = orc_table.column(col_idx); @@ -2137,7 +2146,9 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, auto const stripe_idx = stripe.id; auto& sd = stripe_dicts[str_col_idx][stripe_idx]; - sd.map_slots = hash_maps_storage[str_col_idx][stripe_idx]; + sd.map_slots = {map_storage->data() + hash_maps_storage_offsets[str_col_idx][stripe_idx], + hash_maps_storage_offsets[str_col_idx][stripe_idx + 1] - + hash_maps_storage_offsets[str_col_idx][stripe_idx]}; sd.column_idx = col_idx; sd.start_row = segmentation.rowgroups[stripe.first][col_idx].begin; sd.start_rowgroup = stripe.first; @@ -2150,7 +2161,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, } stripe_dicts.host_to_device_async(stream); - gpu::initialize_dictionary_hash_maps(stripe_dicts, stream); + map_storage->initialize_async({gpu::KEY_SENTINEL, gpu::VALUE_SENTINEL}, {stream.value()}); gpu::populate_dictionary_hash_maps(stripe_dicts, orc_table.d_columns, stream); // Copy the entry counts and char counts from the device to the host stripe_dicts.device_to_host_sync(stream); @@ -2184,8 +2195,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, col_use_dictionary = true; } else { // Clear hash map storage as dictionary encoding is not used for this stripe - hash_maps_storage[str_col_idx][stripe_idx] = rmm::device_uvector(0, stream); - sd.map_slots = {}; + sd.map_slots = {}; } } // If any stripe uses dictionary encoding, allocate index storage for the whole column @@ -2203,7 +2213,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, gpu::get_dictionary_indices(stripe_dicts, orc_table.d_columns, stream); // deallocate hash map storage, unused after this point - hash_maps_storage.clear(); + map_storage.reset(); // Clear map slots and attach order buffers auto dictionaries_flat = stripe_dicts.host_view().flat_view(); @@ -2606,7 +2616,7 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data, strm_desc, enc_data.streams[strm_desc.column_id][segmentation.stripes[stripe_id].first], compressed_data.data(), - bounce_buffer.data(), + bounce_buffer, &stripe, &streams, _compression_kind, diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 17ccb73c0a8..b85ebf2fa1a 100644 --- 
a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -84,7 +84,7 @@ struct map_insert_fn { storage_ref}; // Create a map ref with `cuco::insert` operator - auto map_insert_ref = hash_map_ref.with_operators(cuco::insert); + auto map_insert_ref = hash_map_ref.rebind_operators(cuco::insert); auto const t = threadIdx.x; // Create atomic refs to the current chunk's num_dict_entries and uniq_data_size @@ -186,7 +186,7 @@ struct map_find_fn { storage_ref}; // Create a map ref with `cuco::find` operator - auto const map_find_ref = hash_map_ref.with_operators(cuco::find); + auto const map_find_ref = hash_map_ref.rebind_operators(cuco::find); auto const t = threadIdx.x; // Note: Adjust the following loop to use `cg::tiles` if needed in the future. @@ -194,17 +194,12 @@ struct map_find_fn { val_idx += block_size) { // Find the key using a single thread for best performance for now. if (data_col.is_valid(val_idx)) { + auto const found_slot = map_find_ref.find(val_idx); + // Fail if we didn't find the previously inserted key. + cudf_assert(found_slot != map_find_ref.end() && + "Unable to find value in map in dictionary index construction"); // No need for atomic as this is not going to be modified by any other thread. - chunk->dict_index[val_idx - s_ck_start_val_idx] = [&]() { - auto const found_slot = map_find_ref.find(val_idx); - - // Fail if we didn't find the previously inserted key. - cudf_assert(found_slot != map_find_ref.end() && - "Unable to find value in map in dictionary index construction"); - - // Return the found value. - return found_slot->second; - }(); + chunk->dict_index[val_idx - s_ck_start_val_idx] = found_slot->second; } } } else { diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index b978799b8bc..d276e946a51 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -228,7 +228,8 @@ class parquet_field_string : public parquet_field { * @return True if field types mismatch or if the process of reading a * string fails */ -struct parquet_field_string_list : public parquet_field_list<std::string, FieldType::BINARY> { +class parquet_field_string_list : public parquet_field_list<std::string, FieldType::BINARY> { + public: parquet_field_string_list(int f, std::vector<std::string>& v) : parquet_field_list(f, v) { auto const read_value = [&val = v](uint32_t i, CompactProtocolReader* cpr) { @@ -308,10 +309,10 @@ class parquet_field_struct : public parquet_field { template <typename E, typename T> class parquet_field_union_struct : public parquet_field { E& enum_val; - cuda::std::optional<T>& val; // union structs are always wrapped in std::optional + std::optional<T>& val; // union structs are always wrapped in std::optional public: - parquet_field_union_struct(int f, E& ev, cuda::std::optional<T>& v) + parquet_field_union_struct(int f, E& ev, std::optional<T>& v) : parquet_field(f), enum_val(ev), val(v) { } @@ -396,8 +397,9 @@ class parquet_field_binary : public parquet_field { * @return True if field types mismatch or if the process of reading a * binary fails */ -struct parquet_field_binary_list + class parquet_field_binary_list : public parquet_field_list<std::vector<uint8_t>, FieldType::BINARY> { + public: parquet_field_binary_list(int f, std::vector<std::vector<uint8_t>>& v) : parquet_field_list(f, v) { auto const read_value = [&val = v](uint32_t i, CompactProtocolReader* cpr) { @@ -437,10 +439,10 @@ class parquet_field_struct_blob : public parquet_field { */ template class parquet_field_optional : public parquet_field { - cuda::std::optional<T>& val; + std::optional<T>& val; public: -
parquet_field_optional(int f, cuda::std::optional& v) : parquet_field(f), val(v) {} + parquet_field_optional(int f, std::optional& v) : parquet_field(f), val(v) {} inline void operator()(CompactProtocolReader* cpr, int field_type) { diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 8a866141c4b..4522ea7fe56 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -24,6 +24,59 @@ namespace cudf::io::parquet::detail { namespace { +// Unlike cub's algorithm, this provides warp-wide and block-wide results simultaneously. +// Also, this provides the ability to compute warp_bits & lane_mask manually, which we need for +// lists. +struct block_scan_results { + uint32_t warp_bits; + int thread_count_within_warp; + int warp_count; + + int thread_count_within_block; + int block_count; +}; + +template +static __device__ void scan_block_exclusive_sum(int thread_bit, block_scan_results& results) +{ + int const t = threadIdx.x; + int const warp_index = t / cudf::detail::warp_size; + int const warp_lane = t % cudf::detail::warp_size; + uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1; + + uint32_t warp_bits = ballot(thread_bit); + scan_block_exclusive_sum(warp_bits, warp_lane, warp_index, lane_mask, results); +} + +template +__device__ static void scan_block_exclusive_sum(uint32_t warp_bits, + int warp_lane, + int warp_index, + uint32_t lane_mask, + block_scan_results& results) +{ + // Compute # warps + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + + // Compute the warp-wide results + results.warp_bits = warp_bits; + results.warp_count = __popc(results.warp_bits); + results.thread_count_within_warp = __popc(results.warp_bits & lane_mask); + + // Share the warp counts amongst the block threads + __shared__ int warp_counts[num_warps]; + if (warp_lane == 0) { warp_counts[warp_index] = results.warp_count; } + __syncthreads(); + + // Compute block-wide results + results.block_count = 0; + results.thread_count_within_block = results.thread_count_within_warp; + for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { + results.block_count += warp_counts[warp_idx]; + if (warp_idx < warp_index) { results.thread_count_within_block += warp_counts[warp_idx]; } + } +} + template __device__ inline void gpuDecodeFixedWidthValues( page_state_s* s, state_buf* const sb, int start, int end, int t) @@ -194,7 +247,7 @@ struct decode_fixed_width_split_values_func { } }; -template +template static __device__ int gpuUpdateValidityAndRowIndicesNested( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) { @@ -211,29 +264,28 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const row_index_lower_bound = s->row_index_lower_bound; - int const max_depth = s->col.max_nesting_depth - 1; + int const max_depth = s->col.max_nesting_depth - 1; + auto& max_depth_ni = s->nesting_info[max_depth]; + int max_depth_valid_count = max_depth_ni.valid_count; + __syncthreads(); while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - // definition level. only need to process for nullable columns - int d = 0; - if constexpr (nullable) { - if (def) { - d = t < batch_size - ? static_cast(def[rolling_index(value_count + t)]) - : -1; - } else { - d = t < batch_size ? 
1 : -1; - } + // definition level + int d = 1; + if (t >= batch_size) { + d = -1; + } else if (def) { + d = static_cast(def[rolling_index(value_count + t)]); } - int const thread_value_count = t + 1; + int const thread_value_count = t; int const block_value_count = batch_size; // compute our row index, whether we're in row bounds, and validity - int const row_index = (thread_value_count + value_count) - 1; + int const row_index = thread_value_count + value_count; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store @@ -242,90 +294,75 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( for (int d_idx = 0; d_idx <= max_depth; d_idx++) { auto& ni = s->nesting_info[d_idx]; - int is_valid; - if constexpr (nullable) { - is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 1 : 0; - } else { - is_valid = in_row_bounds; - } + int const is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 1 : 0; // thread and block validity count + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; int thread_valid_count, block_valid_count; - if constexpr (nullable) { - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; - block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); - __syncthreads(); - - // validity is processed per-warp - // - // nested schemas always read and write to the same bounds (that is, read and write - // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading - // at the first value, even if that is before first_row, because we cannot trivially jump to - // the correct position to start reading. since we are about to write the validity vector - // here we need to adjust our computed mask to take into account the write row bounds. - int warp_null_count = 0; - if (write_start >= 0 && ni.valid_map != nullptr) { - int const valid_map_offset = ni.valid_map_offset; - uint32_t const warp_validity_mask = ballot(is_valid); - // lane 0 from each warp writes out validity - if ((t % cudf::detail::warp_size) == 0) { - int const vindex = - (value_count + thread_value_count) - 1; // absolute input value index - int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map - int const write_end = cudf::detail::warp_size - - __clz(in_write_row_bounds); // last bit in the warp to store - int const bit_count = write_end - write_start; - warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); - - store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); - } - } + block_scan(scan_storage).ExclusiveSum(is_valid, thread_valid_count, block_valid_count); - // sum null counts. we have to do it this way instead of just incrementing by (value_count - - // valid_count) because valid_count also includes rows that potentially start before our row - // bounds. if we could come up with a way to clean that up, we could remove this and just - // compute it directly at the end of the kernel. 
- size_type const block_null_count = - cudf::detail::single_lane_block_sum_reduce(warp_null_count); - if (t == 0) { ni.null_count += block_null_count; } - } - // trivial for non-nullable columns - else { - thread_valid_count = thread_value_count; - block_valid_count = block_value_count; + // validity is processed per-warp + // + // nested schemas always read and write to the same bounds (that is, read and write + // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading + // at the first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector + // here we need to adjust our computed mask to take into account the write row bounds. + int warp_null_count = 0; + if (ni.valid_map != nullptr) { + uint32_t const warp_validity_mask = ballot(is_valid); + // lane 0 from each warp writes out validity + if ((write_start >= 0) && ((t % cudf::detail::warp_size) == 0)) { + int const valid_map_offset = ni.valid_map_offset; + int const vindex = value_count + thread_value_count; // absolute input value index + int const bit_offset = (valid_map_offset + vindex + write_start) - + first_row; // absolute bit offset into the output validity map + int const write_end = + cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store + int const bit_count = write_end - write_start; + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); + + store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); + } } + // sum null counts. we have to do it this way instead of just incrementing by (value_count - + // valid_count) because valid_count also includes rows that potentially start before our row + // bounds. if we could come up with a way to clean that up, we could remove this and just + // compute it directly at the end of the kernel. 
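+ // Lane 0 of each warp holds that warp's null count; the block-wide reduction
+ // below sums those single-lane values.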
+ size_type const block_null_count = + cudf::detail::single_lane_block_sum_reduce(warp_null_count); + if (t == 0) { ni.null_count += block_null_count; } + // if this is valid and we're at the leaf, output dst_pos - __syncthreads(); // handle modification of ni.value_count from below - if (is_valid && d_idx == max_depth) { - // for non-list types, the value count is always the same across - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (ni.valid_count + thread_valid_count) - 1; - sb->nz_idx[rolling_index(src_pos)] = dst_pos; + if (d_idx == max_depth) { + if (is_valid) { + int const dst_pos = value_count + thread_value_count; + int const src_pos = max_depth_valid_count + thread_valid_count; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + // update stuff + max_depth_valid_count += block_valid_count; } - __syncthreads(); // handle modification of ni.value_count from below - // update stuff - if (t == 0) { ni.valid_count += block_valid_count; } - } + } // end depth loop value_count += block_value_count; - } + } // end loop if (t == 0) { // update valid value count for decoding and total # of values we've processed - s->nz_count = s->nesting_info[max_depth].valid_count; - s->input_value_count = value_count; - s->input_row_count = value_count; + max_depth_ni.valid_count = max_depth_valid_count; + s->nz_count = max_depth_valid_count; + s->input_value_count = value_count; + s->input_row_count = value_count; } - __syncthreads(); - return s->nesting_info[max_depth].valid_count; + return max_depth_valid_count; } -template +template static __device__ int gpuUpdateValidityAndRowIndicesFlat( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) { @@ -351,83 +388,67 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - // definition level. only need to process for nullable columns - int d = 0; - if constexpr (nullable) { - if (def) { - d = t < batch_size - ? static_cast(def[rolling_index(value_count + t)]) - : -1; - } else { - d = t < batch_size ? 1 : -1; - } - } - - int const thread_value_count = t + 1; + int const thread_value_count = t; int const block_value_count = batch_size; // compute our row index, whether we're in row bounds, and validity - int const row_index = (thread_value_count + value_count) - 1; + int const row_index = thread_value_count + value_count; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + + // use definition level & row bounds to determine if is valid int is_valid; - if constexpr (nullable) { - is_valid = ((d > 0) && in_row_bounds) ? 1 : 0; + if (t >= batch_size) { + is_valid = 0; + } else if (def) { + int const def_level = + static_cast(def[rolling_index(value_count + t)]); + is_valid = ((def_level > 0) && in_row_bounds) ? 
1 : 0; } else { is_valid = in_row_bounds; } // thread and block validity count + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; int thread_valid_count, block_valid_count; - if constexpr (nullable) { - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; - block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); - __syncthreads(); - - // validity is processed per-warp - // - // nested schemas always read and write to the same bounds (that is, read and write - // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading - // at the first value, even if that is before first_row, because we cannot trivially jump to - // the correct position to start reading. since we are about to write the validity vector - // here we need to adjust our computed mask to take into account the write row bounds. - int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); - int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store - int warp_null_count = 0; - if (write_start >= 0) { - uint32_t const warp_validity_mask = ballot(is_valid); - // lane 0 from each warp writes out validity - if ((t % cudf::detail::warp_size) == 0) { - int const vindex = (value_count + thread_value_count) - 1; // absolute input value index - int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map - int const write_end = - cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store - int const bit_count = write_end - write_start; - warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); - - store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); - } - } - - // sum null counts. we have to do it this way instead of just incrementing by (value_count - - // valid_count) because valid_count also includes rows that potentially start before our row - // bounds. if we could come up with a way to clean that up, we could remove this and just - // compute it directly at the end of the kernel. - size_type const block_null_count = - cudf::detail::single_lane_block_sum_reduce(warp_null_count); - if (t == 0) { ni.null_count += block_null_count; } - } - // trivial for non-nullable columns - else { - thread_valid_count = thread_value_count; - block_valid_count = block_value_count; + block_scan(scan_storage).ExclusiveSum(is_valid, thread_valid_count, block_valid_count); + uint32_t const warp_validity_mask = ballot(is_valid); + + // validity is processed per-warp + // + // nested schemas always read and write to the same bounds (that is, read and write + // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading + // at the first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector + // here we need to adjust our computed mask to take into account the write row bounds. 
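+ // in_write_row_bounds below is a warp-wide ballot: bit i is set when lane i's row
+ // lies in [first_row, first_row + num_rows); __ffs and __clz then recover the
+ // first and last bits that need to be stored.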
+ int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); + int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store + int warp_null_count = 0; + // lane 0 from each warp writes out validity + if ((write_start >= 0) && ((t % cudf::detail::warp_size) == 0)) { + int const vindex = value_count + thread_value_count; // absolute input value index + int const bit_offset = (valid_map_offset + vindex + write_start) - + first_row; // absolute bit offset into the output validity map + int const write_end = + cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store + int const bit_count = write_end - write_start; + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); + + store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); } + // sum null counts. we have to do it this way instead of just incrementing by (value_count - + // valid_count) because valid_count also includes rows that potentially start before our row + // bounds. if we could come up with a way to clean that up, we could remove this and just + // compute it directly at the end of the kernel. + size_type const block_null_count = + cudf::detail::single_lane_block_sum_reduce(warp_null_count); + if (t == 0) { ni.null_count += block_null_count; } + // output offset if (is_valid) { - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (valid_count + thread_valid_count) - 1; + int const dst_pos = value_count + thread_value_count; + int const src_pos = valid_count + thread_valid_count; sb->nz_idx[rolling_index(src_pos)] = dst_pos; } @@ -448,6 +469,70 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( return valid_count; } +template +static __device__ int gpuUpdateValidityAndRowIndicesNonNullable(int32_t target_value_count, + page_state_s* s, + state_buf* sb, + int t) +{ + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + constexpr int max_batch_size = num_warps * cudf::detail::warp_size; + + // cap by last row so that we don't process any rows past what we want to output. 
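+ // With no definition levels, every input value in this non-list path corresponds
+ // to exactly one row, so capping the target value count by last_row is exact.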
+ int const first_row = s->first_row; + int const last_row = first_row + s->num_rows; + int const capped_target_value_count = min(target_value_count, last_row); + int const row_index_lower_bound = s->row_index_lower_bound; + + // how many (input) values we've processed in the page so far + int value_count = s->input_value_count; + + int const max_depth = s->col.max_nesting_depth - 1; + auto& ni = s->nesting_info[max_depth]; + int valid_count = ni.valid_count; + + __syncthreads(); + + while (value_count < capped_target_value_count) { + int const batch_size = min(max_batch_size, capped_target_value_count - value_count); + + int const thread_value_count = t; + int const block_value_count = batch_size; + + // compute our row index, whether we're in row bounds, and validity + int const row_index = thread_value_count + value_count; + int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + + int const is_valid = in_row_bounds; + int const thread_valid_count = thread_value_count; + int const block_valid_count = block_value_count; + + // if this is valid and we're at the leaf, output dst_pos + if (is_valid) { + // for non-list types, the value count is always the same across + int const dst_pos = value_count + thread_value_count; + int const src_pos = valid_count + thread_valid_count; + + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + + // update stuff + value_count += block_value_count; + valid_count += block_valid_count; + } // end loop + + if (t == 0) { + // update valid value count for decoding and total # of values we've processed + ni.valid_count = valid_count; + ni.value_count = value_count; + s->nz_count = valid_count; + s->input_value_count = value_count; + s->input_row_count = value_count; + } + + return valid_count; +} + // is the page marked nullable or not __device__ inline bool is_nullable(page_state_s* s) { @@ -605,7 +690,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) int valid_count = 0; // the core loop. decode batches of level stream data using rle_stream objects // and pass the results to gpuDecodeValues - while (s->error == 0 && processed_count < s->page.num_input_values) { + // For chunked reads we may not process all of the rows on the page; if not stop early + int last_row = s->first_row + s->num_rows; + while ((s->error == 0) && (processed_count < s->page.num_input_values) && + (s->input_row_count <= last_row)) { int next_valid_count; // only need to process definition levels if this is a nullable column @@ -614,10 +702,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) __syncthreads(); if constexpr (has_nesting_t) { - next_valid_count = gpuUpdateValidityAndRowIndicesNested( + next_valid_count = gpuUpdateValidityAndRowIndicesNested( processed_count, s, sb, def, t); } else { - next_valid_count = gpuUpdateValidityAndRowIndicesFlat( + next_valid_count = gpuUpdateValidityAndRowIndicesFlat( processed_count, s, sb, def, t); } } @@ -626,15 +714,8 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // nz_idx. gpuDecodeFixedWidthValues would be the only work that happens. 
else { processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); - - if constexpr (has_nesting_t) { - next_valid_count = - gpuUpdateValidityAndRowIndicesNested( - processed_count, s, sb, nullptr, t); - } else { - next_valid_count = gpuUpdateValidityAndRowIndicesFlat( - processed_count, s, sb, nullptr, t); - } + next_valid_count = + gpuUpdateValidityAndRowIndicesNonNullable(processed_count, s, sb, t); } __syncthreads(); diff --git a/cpp/src/io/parquet/error.hpp b/cpp/src/io/parquet/error.hpp index f0fc9fab3ab..8b3d1d7a6c3 100644 --- a/cpp/src/io/parquet/error.hpp +++ b/cpp/src/io/parquet/error.hpp @@ -26,7 +26,7 @@ namespace cudf::io::parquet { /** - * @brief Wrapper around a `rmm::device_scalar` for use in reporting errors that occur in + * @brief Specialized device scalar for use in reporting errors that occur in * kernel calls. * * The `kernel_error` object is created with a `rmm::cuda_stream_view` which is used throughout diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index e0d50d7ccf9..0d24fa4236f 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -17,8 +17,11 @@ #include "page_data.cuh" #include "page_decode.cuh" +#include + #include +#include #include namespace cudf::io::parquet::detail { @@ -466,4 +469,28 @@ void __host__ DecodeSplitPageData(cudf::detail::hostdevice_span pages, } } +void WriteFinalOffsets(host_span offsets, + host_span buff_addrs, + rmm::cuda_stream_view stream) +{ + // Copy offsets to device and create an iterator + auto d_src_data = cudf::detail::make_device_uvector_async( + offsets, stream, cudf::get_current_device_resource_ref()); + // Iterator for the source (scalar) data + auto src_iter = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [src = d_src_data.begin()] __device__(std::size_t i) { return src + i; })); + + // Copy buffer addresses to device and create an iterator + auto d_dst_addrs = cudf::detail::make_device_uvector_async( + buff_addrs, stream, cudf::get_current_device_resource_ref()); + // size_iter is simply a constant iterator of sizeof(size_type) bytes. + auto size_iter = thrust::make_constant_iterator(sizeof(size_type)); + + // Copy offsets to buffers in batched manner. 
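+ // One batched call replaces a separate cudaMemcpyAsync per output column; each
+ // copy is exactly sizeof(size_type) bytes, hence the constant size iterator above.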
+ cudf::detail::batched_memcpy_async( + src_iter, d_dst_addrs.begin(), size_iter, offsets.size(), stream); +} + } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 7c985643887..2851ef67a65 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -20,8 +20,6 @@ #include -#include - #include #include #include @@ -94,10 +92,10 @@ struct LogicalType { BSON }; Type type; - cuda::std::optional<DecimalType> decimal_type; - cuda::std::optional<TimeType> time_type; - cuda::std::optional<TimestampType> timestamp_type; - cuda::std::optional<IntType> int_type; + std::optional<DecimalType> decimal_type; + std::optional<TimeType> time_type; + std::optional<TimestampType> timestamp_type; + std::optional<IntType> int_type; LogicalType(Type tp = UNDEFINED) : type(tp) {} LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {} @@ -178,21 +176,21 @@ struct SchemaElement { // 5: nested fields int32_t num_children = 0; // 6: DEPRECATED: record the original type before conversion to parquet type - cuda::std::optional<ConvertedType> converted_type; + std::optional<ConvertedType> converted_type; // 7: DEPRECATED: record the scale for DECIMAL converted type int32_t decimal_scale = 0; // 8: DEPRECATED: record the precision for DECIMAL converted type int32_t decimal_precision = 0; // 9: save field_id from original schema - cuda::std::optional<int32_t> field_id; + std::optional<int32_t> field_id; // 10: replaces converted type - cuda::std::optional<LogicalType> logical_type; + std::optional<LogicalType> logical_type; // extra cudf specific fields bool output_as_byte_array = false; // cudf type determined from arrow:schema - cuda::std::optional<type_id> arrow_type; + std::optional<type_id> arrow_type; // The following fields are filled in later during schema initialization int max_definition_level = 0; @@ -258,21 +256,21 @@ struct SchemaElement { */ struct Statistics { // deprecated max value in signed comparison order - cuda::std::optional<std::vector<uint8_t>> max; + std::optional<std::vector<uint8_t>> max; // deprecated min value in signed comparison order - cuda::std::optional<std::vector<uint8_t>> min; + std::optional<std::vector<uint8_t>> min; // count of null values in the column - cuda::std::optional<int64_t> null_count; + std::optional<int64_t> null_count; // count of distinct values occurring - cuda::std::optional<int64_t> distinct_count; + std::optional<int64_t> distinct_count; // max value for column determined by ColumnOrder - cuda::std::optional<std::vector<uint8_t>> max_value; + std::optional<std::vector<uint8_t>> max_value; // min value for column determined by ColumnOrder - cuda::std::optional<std::vector<uint8_t>> min_value; + std::optional<std::vector<uint8_t>> min_value; // If true, max_value is the actual maximum value for a column - cuda::std::optional<bool> is_max_value_exact; + std::optional<bool> is_max_value_exact; // If true, min_value is the actual minimum value for a column - cuda::std::optional<bool> is_min_value_exact; + std::optional<bool> is_min_value_exact; }; /** @@ -281,7 +279,7 @@ struct Statistics { struct SizeStatistics { // Number of variable-width bytes stored for the page/chunk. Should not be set for anything // but the BYTE_ARRAY physical type. - cuda::std::optional<int64_t> unencoded_byte_array_data_bytes; + std::optional<int64_t> unencoded_byte_array_data_bytes; /** * When present, there is expected to be one element corresponding to each * repetition (i.e. size=max repetition_level+1) where each element * * This value should not be written if max_repetition_level is 0. */ - cuda::std::optional<std::vector<int64_t>> repetition_level_histogram; + std::optional<std::vector<int64_t>> repetition_level_histogram; /** * Same as repetition_level_histogram except for definition levels. * * This value should not be written if max_definition_level is 0 or 1.
*/ - cuda::std::optional<std::vector<int64_t>> definition_level_histogram; + std::optional<std::vector<int64_t>> definition_level_histogram; }; /** @@ -318,7 +316,7 @@ struct OffsetIndex { std::vector<PageLocation> page_locations; // per-page size info. see description of the same field in SizeStatistics. only present for // columns with a BYTE_ARRAY physical type. - cuda::std::optional<std::vector<int64_t>> unencoded_byte_array_data_bytes; + std::optional<std::vector<int64_t>> unencoded_byte_array_data_bytes; }; /** @@ -329,11 +327,11 @@ struct ColumnIndex { std::vector<std::vector<uint8_t>> min_values; // lower bound for values in each page std::vector<std::vector<uint8_t>> max_values; // upper bound for values in each page BoundaryOrder boundary_order = - BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered - cuda::std::optional<std::vector<int64_t>> null_counts; // Optional count of null values per page + BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered + std::optional<std::vector<int64_t>> null_counts; // Optional count of null values per page // Repetition/definition level histograms for the column chunk - cuda::std::optional<std::vector<int64_t>> repetition_level_histogram; - cuda::std::optional<std::vector<int64_t>> definition_level_histogram; + std::optional<std::vector<int64_t>> repetition_level_histogram; + std::optional<std::vector<int64_t>> definition_level_histogram; }; /** @@ -383,11 +381,11 @@ struct ColumnChunkMetaData { Statistics statistics; // Set of all encodings used for pages in this column chunk. This information can be used to // determine if all data pages are dictionary encoded for example. - cuda::std::optional<std::vector<PageEncodingStats>> encoding_stats; + std::optional<std::vector<PageEncodingStats>> encoding_stats; // Optional statistics to help estimate total memory when converted to in-memory representations. // The histograms contained in these statistics can also be useful in some cases for more // fine-grained nullability/list length filter pushdown. - cuda::std::optional<SizeStatistics> size_statistics; + std::optional<SizeStatistics> size_statistics; }; /** @@ -429,13 +427,13 @@ struct RowGroup { int64_t num_rows = 0; // If set, specifies a sort ordering of the rows in this RowGroup. // The sorting columns can be a subset of all the columns.
- cuda::std::optional<std::vector<SortingColumn>> sorting_columns; + std::optional<std::vector<SortingColumn>> sorting_columns; // Byte offset from beginning of file to first page (data or dictionary) in this row group - cuda::std::optional<int64_t> file_offset; + std::optional<int64_t> file_offset; // Total byte size of all compressed (and potentially encrypted) column data in this row group - cuda::std::optional<int64_t> total_compressed_size; + std::optional<int64_t> total_compressed_size; // Row group ordinal in the file - cuda::std::optional<int16_t> ordinal; + std::optional<int16_t> ordinal; }; /** @@ -460,7 +458,7 @@ struct FileMetaData { std::vector<RowGroup> row_groups; std::vector<KeyValue> key_value_metadata; std::string created_by = ""; - cuda::std::optional<std::vector<ColumnOrder>> column_orders; + std::optional<std::vector<ColumnOrder>> column_orders; }; /** diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index e631e12119d..be502b581af 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -22,14 +22,13 @@ #include "io/parquet/parquet_common.hpp" #include "io/statistics/statistics.cuh" #include "io/utilities/column_buffer.hpp" -#include "io/utilities/hostdevice_vector.hpp" +#include #include #include #include #include -#include #include #include @@ -395,7 +394,7 @@ struct ColumnChunkDesc { uint8_t def_level_bits_, uint8_t rep_level_bits_, Compression codec_, - cuda::std::optional<LogicalType> logical_type_, + std::optional<LogicalType> logical_type_, int32_t ts_clock_rate_, int32_t src_col_index_, int32_t src_col_schema_, @@ -441,12 +440,12 @@ struct ColumnChunkDesc { int32_t num_data_pages{}; // number of data pages int32_t num_dict_pages{}; // number of dictionary pages PageInfo const* dict_page{}; - string_index_pair* str_dict_index{}; // index for string dictionary - bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column - void** column_data_base{}; // base pointers of column data - void** column_string_base{}; // base pointers of column string data - Compression codec{}; // compressed codec enum - cuda::std::optional<LogicalType> logical_type{}; // logical type + string_index_pair* str_dict_index{}; // index for string dictionary + bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column + void** column_data_base{}; // base pointers of column data + void** column_string_base{}; // base pointers of column string data + Compression codec{}; // compressed codec enum + std::optional<LogicalType> logical_type{}; // logical type int32_t ts_clock_rate{}; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) int32_t src_col_index{}; // my input column index @@ -797,6 +796,18 @@ void DecodeSplitPageData(cudf::detail::hostdevice_span<PageInfo> pages, kernel_error::pointer error_code, rmm::cuda_stream_view stream); +/** + * @brief Writes the final offsets to the corresponding list and string buffer end addresses in a + * batched manner.
+ * + * @param offsets Host span of final offsets + * @param buff_addrs Host span of corresponding output col buffer end addresses + * @param stream CUDA stream to use + */ +void WriteFinalOffsets(host_span offsets, + host_span buff_addrs, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for reading the string column data stored in the pages * diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index b90ca36c8c7..32e922b04bb 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -152,7 +152,7 @@ struct stats_caster { } void set_index(size_type index, - cuda::std::optional> const& binary_value, + std::optional> const& binary_value, Type const type) { if (binary_value.has_value()) { @@ -234,8 +234,8 @@ struct stats_caster { max.set_index(stats_idx, max_value, colchunk.meta_data.type); } else { // Marking it null, if column present in row group - min.set_index(stats_idx, cuda::std::nullopt, {}); - max.set_index(stats_idx, cuda::std::nullopt, {}); + min.set_index(stats_idx, std::nullopt, {}); + max.set_index(stats_idx, std::nullopt, {}); } stats_idx++; } @@ -454,15 +454,18 @@ std::optional>> aggregate_reader_metadata::fi CUDF_EXPECTS(predicate.type().id() == cudf::type_id::BOOL8, "Filter expression must return a boolean column"); - auto num_bitmasks = num_bitmask_words(predicate.size()); - std::vector host_bitmask(num_bitmasks, ~bitmask_type{0}); - if (predicate.nullable()) { - CUDF_CUDA_TRY(cudaMemcpyAsync(host_bitmask.data(), - predicate.null_mask(), - num_bitmasks * sizeof(bitmask_type), - cudaMemcpyDefault, - stream.value())); - } + auto const host_bitmask = [&] { + auto const num_bitmasks = num_bitmask_words(predicate.size()); + if (predicate.nullable()) { + return cudf::detail::make_host_vector_sync( + device_span(predicate.null_mask(), num_bitmasks), stream); + } else { + auto bitmask = cudf::detail::make_host_vector(num_bitmasks, stream); + std::fill(bitmask.begin(), bitmask.end(), ~bitmask_type{0}); + return bitmask; + } + }(); + auto validity_it = cudf::detail::make_counting_transform_iterator( 0, [bitmask = host_bitmask.data()](auto bit_index) { return bit_is_set(bitmask, bit_index); }); diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 7d817bde7af..0705ff6f5cc 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -38,7 +38,7 @@ namespace { // be treated as a string. Currently the only logical type that has special handling is DECIMAL. // Other valid types in the future would be UUID (still treated as string) and FLOAT16 (which // for now would also be treated as a string). -inline bool is_treat_fixed_length_as_string(cuda::std::optional const& logical_type) +inline bool is_treat_fixed_length_as_string(std::optional const& logical_type) { if (!logical_type.has_value()) { return true; } return logical_type->type != LogicalType::DECIMAL; @@ -78,7 +78,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // TODO: This step is somewhat redundant if size info has already been calculated (nested schema, // chunked reader). 
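// WriteFinalOffsets, declared above, replaces one tiny cudaMemcpyAsync per terminated
// column with a single batched operation. A minimal sketch of one way to express such a
// batched scatter; the kernel name is illustrative, not the in-tree implementation, and
// the host spans would first have to be staged to device memory:
#include <cstddef>

#include <cudf/types.hpp>

__global__ void write_one_value_each(cudf::size_type const* values,
                                     cudf::size_type* const* addrs,
                                     std::size_t n)
{
  auto const i = std::size_t{blockIdx.x} * blockDim.x + threadIdx.x;
  // Each thread writes exactly one terminating offset to the end of its buffer.
  if (i < n) { *addrs[i] = values[i]; }
}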
auto const has_strings = (kernel_mask & STRINGS_MASK) != 0; - std::vector col_string_sizes(_input_columns.size(), 0L); + auto col_string_sizes = cudf::detail::make_host_vector(_input_columns.size(), _stream); if (has_strings) { // need to compute pages bounds/sizes if we lack page indexes or are using custom bounds // TODO: we could probably dummy up size stats for FLBA data since we know the width @@ -371,13 +371,15 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num CUDF_FAIL("Parquet data decode failed with code(s) " + kernel_error::to_string(error)); } - // for list columns, add the final offset to every offset buffer. - // TODO : make this happen in more efficiently. Maybe use thrust::for_each - // on each buffer. + // For list and string columns, add the final offset to every offset buffer. // Note : the reason we are doing this here instead of in the decode kernel is // that it is difficult/impossible for a given page to know that it is writing the very // last value that should then be followed by a terminator (because rows can span // page boundaries). + std::vector out_buffers; + std::vector final_offsets; + out_buffers.reserve(_input_columns.size()); + final_offsets.reserve(_input_columns.size()); for (size_t idx = 0; idx < _input_columns.size(); idx++) { input_column_info const& input_col = _input_columns[idx]; @@ -393,25 +395,21 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // the final offset for a list at level N is the size of it's child size_type const offset = child.type.id() == type_id::LIST ? child.size - 1 : child.size; - CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + (out_buf.size - 1), - &offset, - sizeof(size_type), - cudaMemcpyDefault, - _stream.value())); + out_buffers.emplace_back(static_cast(out_buf.data()) + (out_buf.size - 1)); + final_offsets.emplace_back(offset); out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } else if (out_buf.type.id() == type_id::STRING) { // need to cap off the string offsets column auto const sz = static_cast(col_string_sizes[idx]); if (sz <= strings::detail::get_offset64_threshold()) { - CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + out_buf.size, - &sz, - sizeof(size_type), - cudaMemcpyDefault, - _stream.value())); + out_buffers.emplace_back(static_cast(out_buf.data()) + out_buf.size); + final_offsets.emplace_back(sz); } } } } + // Write the final offsets for list and string columns in a batched manner + WriteFinalOffsets(final_offsets, out_buffers, _stream); // update null counts in the final column buffers for (size_t idx = 0; idx < subpass.pages.size(); idx++) { diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 62ffc4d3077..3aa9b94ed6b 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -284,7 +284,7 @@ class reader::impl { * * @return Vector of total string data sizes for each column */ - std::vector calculate_page_string_offsets(); + cudf::detail::host_vector calculate_page_string_offsets(); /** * @brief Converts the page data and outputs to columns. diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index c588fedb85c..27312a4da89 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -371,11 +371,11 @@ int64_t find_next_split(int64_t cur_pos, * * @return A tuple of Parquet clock rate and Parquet decimal type. 
*/ -[[nodiscard]] std::tuple> conversion_info( +[[nodiscard]] std::tuple> conversion_info( type_id column_type_id, type_id timestamp_type_id, Type physical, - cuda::std::optional logical_type) + std::optional logical_type) { int32_t const clock_rate = is_chrono(data_type{column_type_id}) ? to_clockrate(timestamp_type_id) : 0; @@ -386,11 +386,11 @@ int64_t find_next_split(int64_t cur_pos, // if decimal but not outputting as float or decimal, then convert to no logical type if (column_type_id != type_id::FLOAT64 and not cudf::is_fixed_point(data_type{column_type_id})) { - return std::make_tuple(clock_rate, cuda::std::nullopt); + return {clock_rate, std::nullopt}; } } - return std::make_tuple(clock_rate, std::move(logical_type)); + return {clock_rate, std::move(logical_type)}; } /** diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index 3a3cdd34a58..a0c2dbd3e44 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -107,7 +107,7 @@ struct subpass_intermediate_data { * rowgroups may represent less than all of the rowgroups to be read for the file. */ struct pass_intermediate_data { - std::vector> raw_page_data; + std::vector raw_page_data; // rowgroup, chunk and page information for the current pass. bool has_compressed_data{false}; diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 6d566b5815e..a6562d33de2 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -38,7 +38,7 @@ namespace flatbuf = cudf::io::parquet::flatbuf; namespace { -cuda::std::optional converted_to_logical_type(SchemaElement const& schema) +std::optional converted_to_logical_type(SchemaElement const& schema) { if (schema.converted_type.has_value()) { switch (schema.converted_type.value()) { @@ -66,7 +66,7 @@ cuda::std::optional converted_to_logical_type(SchemaElement const& default: return LogicalType{LogicalType::UNDEFINED}; } } - return cuda::std::nullopt; + return std::nullopt; } } // namespace @@ -246,7 +246,7 @@ void metadata::sanitize_schema() struct_elem.repetition_type = REQUIRED; struct_elem.num_children = schema_elem.num_children; struct_elem.type = UNDEFINED_TYPE; - struct_elem.converted_type = cuda::std::nullopt; + struct_elem.converted_type = std::nullopt; // swap children struct_elem.children_idx = std::move(schema_elem.children_idx); diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 3763c2e8e6d..f03f1214b9a 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -19,9 +19,9 @@ #include #include +#include #include #include -#include #include #include @@ -44,6 +44,7 @@ #include #include +#include #include namespace cudf::io::parquet::detail { @@ -217,7 +218,7 @@ void generate_depth_remappings( */ [[nodiscard]] std::future read_column_chunks_async( std::vector> const& sources, - std::vector>& page_data, + cudf::host_span page_data, cudf::detail::hostdevice_vector& chunks, size_t begin_chunk, size_t end_chunk, @@ -250,23 +251,24 @@ void generate_depth_remappings( if (source->is_device_read_preferred(io_size)) { // Buffer needs to be padded. // Required by `gpuDecodePageData`. 
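// The padding note above is load-bearing: decode kernels such as gpuDecodePageData read
// in fixed-size chunks past the logical end of the compressed data, so every allocation
// is rounded up. A standalone sketch of the pattern (the helper name here is
// illustrative; BUFFER_PADDING_MULTIPLE is the in-tree constant):
#include <cstddef>

#include <cudf/detail/utilities/integer_utils.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

rmm::device_buffer make_padded_device_buffer(std::size_t io_size,
                                             std::size_t padding_multiple,
                                             rmm::cuda_stream_view stream)
{
  // Round up so a kernel reading whole words never runs off the end of the allocation.
  return rmm::device_buffer{cudf::util::round_up_safe(io_size, padding_multiple), stream};
}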
- auto buffer = + page_data[chunk] = rmm::device_buffer(cudf::util::round_up_safe(io_size, BUFFER_PADDING_MULTIPLE), stream); auto fut_read_size = source->device_read_async( - io_offset, io_size, static_cast(buffer.data()), stream); + io_offset, io_size, static_cast(page_data[chunk].data()), stream); read_tasks.emplace_back(std::move(fut_read_size)); - page_data[chunk] = datasource::buffer::create(std::move(buffer)); } else { auto const read_buffer = source->host_read(io_offset, io_size); // Buffer needs to be padded. // Required by `gpuDecodePageData`. - auto tmp_buffer = rmm::device_buffer( + page_data[chunk] = rmm::device_buffer( cudf::util::round_up_safe(read_buffer->size(), BUFFER_PADDING_MULTIPLE), stream); - CUDF_CUDA_TRY(cudaMemcpyAsync( - tmp_buffer.data(), read_buffer->data(), read_buffer->size(), cudaMemcpyDefault, stream)); - page_data[chunk] = datasource::buffer::create(std::move(tmp_buffer)); + CUDF_CUDA_TRY(cudaMemcpyAsync(page_data[chunk].data(), + read_buffer->data(), + read_buffer->size(), + cudaMemcpyDefault, + stream)); } - auto d_compdata = page_data[chunk]->data(); + auto d_compdata = static_cast(page_data[chunk].data()); do { chunks[chunk].compressed_data = d_compdata; d_compdata += chunks[chunk].compressed_size; @@ -979,7 +981,7 @@ std::pair> reader::impl::read_column_chunks() std::vector chunk_source_map(num_chunks); // Tracker for eventually deallocating compressed and uncompressed data - raw_page_data = std::vector>(num_chunks); + raw_page_data = std::vector(num_chunks); // Keep track of column chunk file offsets std::vector column_chunk_offsets(num_chunks); @@ -1592,36 +1594,68 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num auto const d_cols_info = cudf::detail::make_device_uvector_async( h_cols_info, _stream, cudf::get_current_device_resource_ref()); - auto const num_keys = _input_columns.size() * max_depth * subpass.pages.size(); - // size iterator. 
indexes pages by sorted order
-  rmm::device_uvector<size_t> size_input{num_keys, _stream};
-  thrust::transform(
-    rmm::exec_policy(_stream),
-    thrust::make_counting_iterator(0),
-    thrust::make_counting_iterator(num_keys),
-    size_input.begin(),
-    get_page_nesting_size{
-      d_cols_info.data(), max_depth, subpass.pages.size(), subpass.pages.device_begin()});
-  auto const reduction_keys =
-    cudf::detail::make_counting_transform_iterator(0, get_reduction_key{subpass.pages.size()});
+  // Vector to store page sizes for each column at each depth
   cudf::detail::hostdevice_vector<size_t> sizes{_input_columns.size() * max_depth, _stream};

-  // find the size of each column
-  thrust::reduce_by_key(rmm::exec_policy(_stream),
-                        reduction_keys,
-                        reduction_keys + num_keys,
-                        size_input.cbegin(),
-                        thrust::make_discard_iterator(),
-                        sizes.d_begin());
-
-  // for nested hierarchies, compute per-page start offset
-  thrust::exclusive_scan_by_key(
-    rmm::exec_policy(_stream),
-    reduction_keys,
-    reduction_keys + num_keys,
-    size_input.cbegin(),
-    start_offset_output_iterator{
-      subpass.pages.device_begin(), 0, d_cols_info.data(), max_depth, subpass.pages.size()});
+  // Total number of keys to process
+  auto const num_keys = _input_columns.size() * max_depth * subpass.pages.size();
+
+  // At most ~1 billion keys are processed per iteration
+  auto constexpr max_keys_per_iter =
+    static_cast<size_t>(std::numeric_limits<size_type>::max() / 2);
+
+  // Number of keys per column
+  auto const num_keys_per_col = max_depth * subpass.pages.size();
+
+  // The largest multiple of `num_keys_per_col` that is <= `num_keys`
+  auto const num_keys_per_iter =
+    num_keys <= max_keys_per_iter
+      ? num_keys
+      : num_keys_per_col * std::max<size_t>(1, max_keys_per_iter / num_keys_per_col);
+
+  // Size iterator. Indexes pages by sorted order
+  rmm::device_uvector<size_t> size_input{num_keys_per_iter, _stream};
+
+  // To keep track of the starting key of an iteration
+  size_t key_start = 0;
+  // Loop until all keys are processed
+  while (key_start < num_keys) {
+    // Number of keys processed in this iteration
+    auto const num_keys_this_iter = std::min(num_keys_per_iter, num_keys - key_start);
+    thrust::transform(
+      rmm::exec_policy_nosync(_stream),
+      thrust::make_counting_iterator(key_start),
+      thrust::make_counting_iterator(key_start + num_keys_this_iter),
+      size_input.begin(),
+      get_page_nesting_size{
+        d_cols_info.data(), max_depth, subpass.pages.size(), subpass.pages.device_begin()});
+
+    // Manually create a size_t `key_start` compatible counting_transform_iterator.
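// cudf::detail::make_counting_transform_iterator counts in cudf::size_type from a fixed
// origin, so a size_t iteration starting at `key_start` is composed by hand instead. The
// same construction in isolation, with an illustrative stand-in for get_reduction_key:
#include <cstddef>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

struct reduction_key_fn {  // stand-in: maps a flat key index to its per-column slot
  std::size_t pages_per_key;
  __host__ __device__ std::size_t operator()(std::size_t i) const { return i / pages_per_key; }
};

inline auto make_reduction_key_iterator(std::size_t start, std::size_t pages_per_key)
{
  // The counting iterator deduces std::size_t from `start`, so keys cannot overflow int.
  return thrust::make_transform_iterator(thrust::make_counting_iterator(start),
                                         reduction_key_fn{pages_per_key});
}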
+ auto const reduction_keys = + thrust::make_transform_iterator(thrust::make_counting_iterator(key_start), + get_reduction_key{subpass.pages.size()}); + + // Find the size of each column + thrust::reduce_by_key(rmm::exec_policy_nosync(_stream), + reduction_keys, + reduction_keys + num_keys_this_iter, + size_input.cbegin(), + thrust::make_discard_iterator(), + sizes.d_begin() + (key_start / subpass.pages.size())); + + // For nested hierarchies, compute per-page start offset + thrust::exclusive_scan_by_key(rmm::exec_policy_nosync(_stream), + reduction_keys, + reduction_keys + num_keys_this_iter, + size_input.cbegin(), + start_offset_output_iterator{subpass.pages.device_begin(), + key_start, + d_cols_info.data(), + max_depth, + subpass.pages.size()}); + // Increment the key_start + key_start += num_keys_this_iter; + } sizes.device_to_host_sync(_stream); for (size_type idx = 0; idx < static_cast(_input_columns.size()); idx++) { @@ -1656,21 +1690,20 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num } } - cudf::io::detail::batched_memset(memset_bufs, static_cast(0), _stream); + cudf::detail::batched_memset(memset_bufs, static_cast(0), _stream); // Need to set null mask bufs to all high bits - cudf::io::detail::batched_memset( + cudf::detail::batched_memset( nullmask_bufs, std::numeric_limits::max(), _stream); } -std::vector reader::impl::calculate_page_string_offsets() +cudf::detail::host_vector reader::impl::calculate_page_string_offsets() { auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; auto page_keys = make_page_key_iterator(subpass.pages); - std::vector col_sizes(_input_columns.size(), 0L); - rmm::device_uvector d_col_sizes(col_sizes.size(), _stream); + rmm::device_uvector d_col_sizes(_input_columns.size(), _stream); // use page_index to fetch page string sizes in the proper order auto val_iter = thrust::make_transform_iterator(subpass.pages.device_begin(), @@ -1684,7 +1717,7 @@ std::vector reader::impl::calculate_page_string_offsets() page_offset_output_iter{subpass.pages.device_ptr()}); // now sum up page sizes - rmm::device_uvector reduce_keys(col_sizes.size(), _stream); + rmm::device_uvector reduce_keys(d_col_sizes.size(), _stream); thrust::reduce_by_key(rmm::exec_policy_nosync(_stream), page_keys, page_keys + subpass.pages.size(), @@ -1692,14 +1725,7 @@ std::vector reader::impl::calculate_page_string_offsets() reduce_keys.begin(), d_col_sizes.begin()); - cudaMemcpyAsync(col_sizes.data(), - d_col_sizes.data(), - sizeof(size_t) * col_sizes.size(), - cudaMemcpyDeviceToHost, - _stream); - _stream.synchronize(); - - return col_sizes; + return cudf::detail::make_host_vector_sync(d_col_sizes, _stream); } } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index ec05f35d405..f865c9a7643 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -183,10 +183,10 @@ struct aggregate_writer_metadata { std::vector row_groups; std::vector key_value_metadata; std::vector offset_indexes; - std::vector> column_indexes; + std::vector> column_indexes; }; std::vector files; - cuda::std::optional> column_orders = cuda::std::nullopt; + std::optional> column_orders = std::nullopt; }; namespace { @@ -472,7 +472,7 @@ struct leaf_schema_fn { std::enable_if_t, void> operator()() { col_schema.type = (timestamp_is_int96) ? 
Type::INT96 : Type::INT64; - col_schema.converted_type = cuda::std::nullopt; + col_schema.converted_type = std::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; if (timestamp_is_int96) { col_schema.ts_scale = -1000; // negative value indicates division by absolute value @@ -750,7 +750,7 @@ std::vector construct_parquet_schema_tree( col_schema.type = Type::BYTE_ARRAY; } - col_schema.converted_type = cuda::std::nullopt; + col_schema.converted_type = std::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_byte_array; col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED; col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name(); @@ -1543,12 +1543,7 @@ void encode_pages(hostdevice_2dvector& chunks, d_chunks.flat_view(), {column_stats, pages.size()}, column_index_truncate_length, stream); } - auto h_chunks = chunks.host_view(); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_chunks.data(), - d_chunks.data(), - d_chunks.flat_view().size_bytes(), - cudaMemcpyDefault, - stream.value())); + chunks.device_to_host_async(stream); if (comp_stats.has_value()) { comp_stats.value() += collect_compression_statistics(comp_in, comp_res, stream); @@ -2559,12 +2554,11 @@ void writer::impl::write_parquet_data_to_sink( } else { CUDF_EXPECTS(bounce_buffer.size() >= ck.compressed_size, "Bounce buffer was not properly initialized."); - CUDF_CUDA_TRY(cudaMemcpyAsync(bounce_buffer.data(), - dev_bfr + ck.ck_stat_size, - ck.compressed_size, - cudaMemcpyDefault, - _stream.value())); - _stream.synchronize(); + cudf::detail::cuda_memcpy( + host_span{bounce_buffer}.subspan(0, ck.compressed_size), + device_span{dev_bfr + ck.ck_stat_size, ck.compressed_size}, + _stream); + _out_sink[p]->host_write(bounce_buffer.data(), ck.compressed_size); } @@ -2600,13 +2594,8 @@ void writer::impl::write_parquet_data_to_sink( auto const& column_chunk_meta = row_group.columns[i].meta_data; // start transfer of the column index - std::vector column_idx; - column_idx.resize(ck.column_index_size); - CUDF_CUDA_TRY(cudaMemcpyAsync(column_idx.data(), - ck.column_index_blob, - ck.column_index_size, - cudaMemcpyDefault, - _stream.value())); + auto column_idx = cudf::detail::make_host_vector_async( + device_span{ck.column_index_blob, ck.column_index_size}, _stream); // calculate offsets while the column index is transferring int64_t curr_pg_offset = column_chunk_meta.data_page_offset; @@ -2795,7 +2784,7 @@ std::unique_ptr> writer::merge_row_group_metadata( // See https://github.com/rapidsai/cudf/pull/14264#issuecomment-1778311615 for (auto& se : md.schema) { if (se.logical_type.has_value() && se.logical_type.value().type == LogicalType::UNKNOWN) { - se.logical_type = cuda::std::nullopt; + se.logical_type = std::nullopt; } } diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index badcd3f58f9..06069630685 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -74,8 +74,8 @@ class bgzip_data_chunk_reader : public data_chunk_reader { // Buffer needs to be padded. // Required by `inflate_kernel`. 
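// The cudf::detail::cuda_memcpy_async overloads used below take typed spans: the element
// count, the direction, and (via the host span's pinned-ness) the copy path are derived
// from the arguments instead of being restated at every call site. The pattern in
// isolation:
#include <cudf/detail/utilities/cuda_memcpy.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>

template <typename T>
void copy_prefix_to_device(cudf::device_span<T> dst,
                           cudf::host_span<T const> src,
                           rmm::cuda_stream_view stream)
{
  // dst may be over-allocated for padding; copy only the first src.size() elements.
  cudf::detail::cuda_memcpy_async(dst.subspan(0, src.size()), src, stream);
}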
device.resize(cudf::util::round_up_safe(host.size(), BUFFER_PADDING_MULTIPLE), stream); - CUDF_CUDA_TRY(cudaMemcpyAsync( - device.data(), host.data(), host.size() * sizeof(T), cudaMemcpyDefault, stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{device}.subspan(0, host.size()), host, stream); } struct decompression_blocks { diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 58faa0ebfe4..4baea8655e0 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -87,8 +87,10 @@ class datasource_chunk_reader : public data_chunk_reader { _source->host_read(_offset, read_size, reinterpret_cast(h_ticket.buffer.data())); // copy the host-pinned data on to device - CUDF_CUDA_TRY(cudaMemcpyAsync( - chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyDefault, stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{chunk}.subspan(0, read_size), + host_span{h_ticket.buffer}.subspan(0, read_size), + stream); // record the host-to-device copy. CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); @@ -153,8 +155,10 @@ class istream_data_chunk_reader : public data_chunk_reader { auto chunk = rmm::device_uvector(read_size, stream); // copy the host-pinned data on to device - CUDF_CUDA_TRY(cudaMemcpyAsync( - chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyDefault, stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{chunk}.subspan(0, read_size), + host_span{h_ticket.buffer}.subspan(0, read_size), + stream); // record the host-to-device copy. CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); @@ -193,12 +197,10 @@ class host_span_data_chunk_reader : public data_chunk_reader { auto chunk = rmm::device_uvector(read_size, stream); // copy the host data to device - CUDF_CUDA_TRY(cudaMemcpyAsync( // - chunk.data(), - _data.data() + _position, - read_size, - cudaMemcpyDefault, - stream.value())); + cudf::detail::cuda_memcpy_async( + cudf::device_span{chunk}.subspan(0, read_size), + cudf::host_span{_data}.subspan(_position, read_size), + stream); _position += read_size; diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index a3afbd52896..813743fa7b4 100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -19,7 +19,10 @@ #include #include +#include + #include +#include #include #include @@ -53,6 +56,14 @@ bool is_gds_enabled() { return is_always_enabled() or get_env_policy() == usage_ bool is_kvikio_enabled() { return get_env_policy() == usage_policy::KVIKIO; } +void set_thread_pool_nthreads_from_env() +{ + static std::once_flag flag{}; + std::call_once(flag, [] { + auto nthreads = getenv_or("KVIKIO_NTHREADS", 8U); + kvikio::defaults::thread_pool_nthreads_reset(nthreads); + }); +} } // namespace cufile_integration namespace nvcomp_integration { @@ -81,5 +92,4 @@ bool is_all_enabled() { return get_env_policy() == usage_policy::ALWAYS; } bool is_stable_enabled() { return is_all_enabled() or get_env_policy() == usage_policy::STABLE; } } // namespace nvcomp_integration - } // namespace cudf::io diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index f70171eef68..0c49b2e5d78 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -800,7 +801,7 @@ template static std::unique_ptr 
parse_string(string_view_pair_it str_tuples, size_type col_size, rmm::device_buffer&& null_mask, - rmm::device_scalar& d_null_count, + cudf::detail::device_scalar& d_null_count, cudf::io::parse_options_view const& options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) @@ -930,7 +931,7 @@ std::unique_ptr parse_data( CUDF_FUNC_RANGE(); if (col_size == 0) { return make_empty_column(col_type); } - auto d_null_count = rmm::device_scalar(null_count, stream); + auto d_null_count = cudf::detail::device_scalar(null_count, stream); auto null_count_data = d_null_count.data(); if (null_mask.is_empty()) { null_mask = cudf::create_null_mask(col_size, mask_state::ALL_VALID, stream, mr); diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 1dbb9369115..a8a275919d8 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -42,6 +42,7 @@ class file_sink : public data_sink { if (!_output_stream.is_open()) { detail::throw_on_file_open_failure(filepath, true); } if (cufile_integration::is_kvikio_enabled()) { + cufile_integration::set_thread_pool_nthreads_from_env(); _kvikio_file = kvikio::FileHandle(filepath, "w"); CUDF_LOG_INFO("Writing a file using kvikIO, with compatibility mode {}.", _kvikio_file.is_compat_mode_on() ? "on" : "off"); @@ -50,7 +51,8 @@ class file_sink : public data_sink { } } - ~file_sink() override { flush(); } + // Marked as NOLINT because we are calling a virtual method in the destructor + ~file_sink() override { flush(); } // NOLINT void host_write(void const* data, size_t size) override { @@ -114,7 +116,8 @@ class host_buffer_sink : public data_sink { public: explicit host_buffer_sink(std::vector* buffer) : buffer_(buffer) {} - ~host_buffer_sink() override { flush(); } + // Marked as NOLINT because we are calling a virtual method in the destructor + ~host_buffer_sink() override { flush(); } // NOLINT void host_write(void const* data, size_t size) override { diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index e4313eba454..4e8908a8942 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -15,8 +15,10 @@ */ #include "file_io_utilities.hpp" +#include "getenv_or.hpp" #include +#include #include #include #include @@ -32,6 +34,7 @@ #include #include +#include namespace cudf { namespace io { @@ -46,6 +49,7 @@ class file_source : public datasource { { detail::force_init_cuda_context(); if (cufile_integration::is_kvikio_enabled()) { + cufile_integration::set_thread_pool_nthreads_from_env(); _kvikio_file = kvikio::FileHandle(filepath); CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode {}.", _kvikio_file.is_compat_mode_on() ? 
"on" : "off"); @@ -54,6 +58,30 @@ class file_source : public datasource { } } + std::unique_ptr host_read(size_t offset, size_t size) override + { + lseek(_file.desc(), offset, SEEK_SET); + + // Clamp length to available data + ssize_t const read_size = std::min(size, _file.size() - offset); + + std::vector v(read_size); + CUDF_EXPECTS(read(_file.desc(), v.data(), read_size) == read_size, "read failed"); + return buffer::create(std::move(v)); + } + + size_t host_read(size_t offset, size_t size, uint8_t* dst) override + { + lseek(_file.desc(), offset, SEEK_SET); + + // Clamp length to available data + auto const read_size = std::min(size, _file.size() - offset); + + CUDF_EXPECTS(read(_file.desc(), dst, read_size) == static_cast(read_size), + "read failed"); + return read_size; + } + ~file_source() override = default; [[nodiscard]] bool supports_device_read() const override @@ -109,27 +137,6 @@ class file_source : public datasource { static constexpr size_t _gds_read_preferred_threshold = 128 << 10; // 128KB }; -/** - * @brief Memoized pageableMemoryAccessUsesHostPageTables device property. - */ -[[nodiscard]] bool pageableMemoryAccessUsesHostPageTables() -{ - static std::unordered_map result_cache{}; - - int deviceId{}; - CUDF_CUDA_TRY(cudaGetDevice(&deviceId)); - - if (result_cache.find(deviceId) == result_cache.end()) { - cudaDeviceProp props{}; - CUDF_CUDA_TRY(cudaGetDeviceProperties(&props, deviceId)); - result_cache[deviceId] = (props.pageableMemoryAccessUsesHostPageTables == 1); - CUDF_LOG_INFO( - "Device {} pageableMemoryAccessUsesHostPageTables: {}", deviceId, result_cache[deviceId]); - } - - return result_cache[deviceId]; -} - /** * @brief Implementation class for reading from a file using memory mapped access. * @@ -138,40 +145,53 @@ class file_source : public datasource { */ class memory_mapped_source : public file_source { public: - explicit memory_mapped_source(char const* filepath, size_t offset, size_t size) + explicit memory_mapped_source(char const* filepath, size_t offset, size_t max_size_estimate) : file_source(filepath) { if (_file.size() != 0) { - map(_file.desc(), offset, size); - register_mmap_buffer(); + // Memory mapping is not exclusive, so we can include the whole region we expect to read + map(_file.desc(), offset, max_size_estimate); } } ~memory_mapped_source() override { - if (_map_addr != nullptr) { - munmap(_map_addr, _map_size); - unregister_mmap_buffer(); - } + if (_map_addr != nullptr) { unmap(); } } std::unique_ptr host_read(size_t offset, size_t size) override { - CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); + // Clamp length to available data + auto const read_size = std::min(size, +_file.size() - offset); + + // If the requested range is outside of the mapped region, read from the file + if (offset < _map_offset or offset + read_size > (_map_offset + _map_size)) { + return file_source::host_read(offset, read_size); + } + + // If the requested range is only partially within the registered region, copy to a new + // host buffer to make the data safe to copy to the device + if (_reg_addr != nullptr and + (offset < _reg_offset or offset + read_size > (_reg_offset + _reg_size))) { + auto const src = static_cast(_map_addr) + (offset - _map_offset); - // Clamp length to available data in the mapped region - auto const read_size = std::min(size, _map_size - (offset - _map_offset)); + return std::make_unique>>( + std::vector(src, src + read_size)); + } return std::make_unique( - static_cast(_map_addr) + (offset - _map_offset), 
read_size); + static_cast(_map_addr) + offset - _map_offset, read_size); } size_t host_read(size_t offset, size_t size, uint8_t* dst) override { - CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); + // Clamp length to available data + auto const read_size = std::min(size, +_file.size() - offset); - // Clamp length to available data in the mapped region - auto const read_size = std::min(size, _map_size - (offset - _map_offset)); + // If the requested range is outside of the mapped region, read from the file + if (offset < _map_offset or offset + read_size > (_map_offset + _map_size)) { + return file_source::host_read(offset, read_size, dst); + } auto const src = static_cast(_map_addr) + (offset - _map_offset); std::memcpy(dst, src, read_size); @@ -179,42 +199,6 @@ class memory_mapped_source : public file_source { } private: - /** - * @brief Page-locks (registers) the memory range of the mapped file. - * - * Fixes nvbugs/4215160 - */ - void register_mmap_buffer() - { - if (_map_addr == nullptr or _map_size == 0 or not pageableMemoryAccessUsesHostPageTables()) { - return; - } - - auto const result = cudaHostRegister(_map_addr, _map_size, cudaHostRegisterDefault); - if (result == cudaSuccess) { - _is_map_registered = true; - } else { - CUDF_LOG_WARN("cudaHostRegister failed with {} ({})", - static_cast(result), - cudaGetErrorString(result)); - } - } - - /** - * @brief Unregisters the memory range of the mapped file. - */ - void unregister_mmap_buffer() - { - if (not _is_map_registered) { return; } - - auto const result = cudaHostUnregister(_map_addr); - if (result != cudaSuccess) { - CUDF_LOG_WARN("cudaHostUnregister failed with {} ({})", - static_cast(result), - cudaGetErrorString(result)); - } - } - void map(int fd, size_t offset, size_t size) { CUDF_EXPECTS(offset < _file.size(), "Offset is past end of file", std::overflow_error); @@ -226,52 +210,30 @@ class memory_mapped_source : public file_source { // Size for `mmap()` needs to include the page padding _map_size = size + (offset - _map_offset); + if (_map_size == 0) { return; } // Check if accessing a region within already mapped area _map_addr = mmap(nullptr, _map_size, PROT_READ, MAP_PRIVATE, fd, _map_offset); CUDF_EXPECTS(_map_addr != MAP_FAILED, "Cannot create memory mapping"); } - private: - size_t _map_size = 0; - size_t _map_offset = 0; - void* _map_addr = nullptr; - bool _is_map_registered = false; -}; - -/** - * @brief Implementation class for reading from a file using `read` calls - * - * Potentially faster than `memory_mapped_source` when only a small portion of the file is read - * through the host. 
- */ -class direct_read_source : public file_source { - public: - explicit direct_read_source(char const* filepath) : file_source(filepath) {} - - std::unique_ptr host_read(size_t offset, size_t size) override + void unmap() { - lseek(_file.desc(), offset, SEEK_SET); - - // Clamp length to available data - ssize_t const read_size = std::min(size, _file.size() - offset); - - std::vector v(read_size); - CUDF_EXPECTS(read(_file.desc(), v.data(), read_size) == read_size, "read failed"); - return buffer::create(std::move(v)); + if (_map_addr != nullptr) { + auto const result = munmap(_map_addr, _map_size); + if (result != 0) { CUDF_LOG_WARN("munmap failed with {}", result); } + _map_addr = nullptr; + } } - size_t host_read(size_t offset, size_t size, uint8_t* dst) override - { - lseek(_file.desc(), offset, SEEK_SET); - - // Clamp length to available data - auto const read_size = std::min(size, _file.size() - offset); + private: + size_t _map_offset = 0; + size_t _map_size = 0; + void* _map_addr = nullptr; - CUDF_EXPECTS(read(_file.desc(), dst, read_size) == static_cast(read_size), - "read failed"); - return read_size; - } + size_t _reg_offset = 0; + size_t _reg_size = 0; + void* _reg_addr = nullptr; }; /** @@ -286,17 +248,18 @@ class device_buffer_source final : public datasource { size_t host_read(size_t offset, size_t size, uint8_t* dst) override { auto const count = std::min(size, this->size() - offset); - auto const stream = cudf::get_default_stream(); - CUDF_CUDA_TRY( - cudaMemcpyAsync(dst, _d_buffer.data() + offset, count, cudaMemcpyDefault, stream.value())); - stream.synchronize(); + auto const stream = cudf::detail::global_cuda_stream_pool().get_stream(); + cudf::detail::cuda_memcpy(host_span{dst, count}, + device_span{ + reinterpret_cast(_d_buffer.data() + offset), count}, + stream); return count; } std::unique_ptr host_read(size_t offset, size_t size) override { auto const count = std::min(size, this->size() - offset); - auto const stream = cudf::get_default_stream(); + auto const stream = cudf::detail::global_cuda_stream_pool().get_stream(); auto h_data = cudf::detail::make_host_vector_async( cudf::device_span{_d_buffer.data() + offset, count}, stream); stream.synchronize(); @@ -431,16 +394,23 @@ class user_datasource_wrapper : public datasource { std::unique_ptr datasource::create(std::string const& filepath, size_t offset, - size_t size) + size_t max_size_estimate) { -#ifdef CUFILE_FOUND - if (cufile_integration::is_always_enabled()) { - // avoid mmap as GDS is expected to be used for most reads - return std::make_unique(filepath.c_str()); + auto const use_memory_mapping = [] { + auto const policy = getenv_or("LIBCUDF_MMAP_ENABLED", std::string{"ON"}); + + if (policy == "ON") { return true; } + if (policy == "OFF") { return false; } + + CUDF_FAIL("Invalid LIBCUDF_MMAP_ENABLED value: " + policy); + }(); + + if (use_memory_mapping) { + return std::make_unique(filepath.c_str(), offset, max_size_estimate); + } else { + // `file_source` reads the file directly, without memory mapping + return std::make_unique(filepath.c_str()); } -#endif - // Use our own memory mapping implementation for direct file reads - return std::make_unique(filepath.c_str(), offset, size); } std::unique_ptr datasource::create(host_buffer const& buffer) diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index d7b54399f8d..98ed9b28f0a 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -239,7 +239,7 @@ 
std::vector> make_sliced_tasks( std::vector> slice_tasks; std::transform(slices.cbegin(), slices.cend(), std::back_inserter(slice_tasks), [&](auto& slice) { return pool.submit_task( - [&] { return function(ptr + slice.offset, slice.size, offset + slice.offset); }); + [=] { return function(ptr + slice.offset, slice.size, offset + slice.offset); }); }); return slice_tasks; } diff --git a/cpp/src/io/utilities/hostdevice_span.hpp b/cpp/src/io/utilities/hostdevice_span.hpp index d9eac423901..a3ddef52dd8 100644 --- a/cpp/src/io/utilities/hostdevice_span.hpp +++ b/cpp/src/io/utilities/hostdevice_span.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -33,31 +34,18 @@ class hostdevice_span { hostdevice_span(hostdevice_span const&) = default; ///< Copy constructor hostdevice_span(hostdevice_span&&) = default; ///< Move constructor - hostdevice_span(T* cpu_data, T* gpu_data, size_t size) - : _size(size), _device_data(gpu_data), _host_data(cpu_data) + hostdevice_span(host_span host_data, T* device_data) + : _host_data{host_data}, _device_data{device_data} { } - /// Constructor from container - /// @param in The container to construct the span from - template ().host_ptr())> (*)[], - T (*)[]>>* = nullptr> - constexpr hostdevice_span(C& in) : hostdevice_span(in.host_ptr(), in.device_ptr(), in.size()) - { - } - - /// Constructor from const container - /// @param in The container to construct the span from - template ().host_ptr())> (*)[], - T (*)[]>>* = nullptr> - constexpr hostdevice_span(C const& in) - : hostdevice_span(in.host_ptr(), in.device_ptr(), in.size()) + // Copy construction to support const conversion + /// @param other The span to copy + template , // NOLINT + void>* = nullptr> + constexpr hostdevice_span(hostdevice_span const& other) noexcept + : _host_data{host_span{other}}, _device_data{other.device_ptr()} { } @@ -74,15 +62,13 @@ class hostdevice_span { * @tparam T The device span type. * @return A typed device span of the hostdevice view's data. */ - [[nodiscard]] operator cudf::device_span() { return {_device_data, size()}; } - - /** - * @brief Converts a hostdevice view into a device span of const data. - * - * @tparam T The device span type. - * @return A const typed device span of the hostdevice view's data. - */ - [[nodiscard]] operator cudf::device_span() const { return {_device_data, size()}; } + template , // NOLINT + void>* = nullptr> + [[nodiscard]] operator cudf::device_span() const noexcept + { + return {_device_data, size()}; + } /** * @brief Returns the underlying device data. @@ -114,9 +100,12 @@ class hostdevice_span { * @tparam T The host span type. * @return A typed host span of the hostdevice_span's data. */ - [[nodiscard]] operator cudf::host_span() const noexcept + template , // NOLINT + void>* = nullptr> + [[nodiscard]] operator host_span() const noexcept { - return cudf::host_span(_host_data, size()); + return {_host_data}; } /** @@ -125,7 +114,7 @@ class hostdevice_span { * @tparam T The type to cast to * @return T* Typed pointer to underlying data */ - [[nodiscard]] T* host_ptr(size_t offset = 0) const noexcept { return _host_data + offset; } + [[nodiscard]] T* host_ptr(size_t offset = 0) const noexcept { return _host_data.data() + offset; } /** * @brief Return first element in host data. @@ -136,19 +125,19 @@ class hostdevice_span { [[nodiscard]] T* host_begin() const noexcept { return host_ptr(); } /** - * @brief Return one past the last elementin host data. + * @brief Return one past the last element in host data. 
* * @tparam T The desired type * @return T const* Pointer to one past the last element */ - [[nodiscard]] T* host_end() const noexcept { return host_begin() + size(); } + [[nodiscard]] T* host_end() const noexcept { return _host_data.end(); } /** * @brief Returns the number of elements in the view * * @return The number of elements in the view */ - [[nodiscard]] std::size_t size() const noexcept { return _size; } + [[nodiscard]] std::size_t size() const noexcept { return _host_data.size(); } /** * @brief Returns true if `size()` returns zero, or false otherwise @@ -159,12 +148,11 @@ class hostdevice_span { [[nodiscard]] size_t size_bytes() const noexcept { return sizeof(T) * size(); } - [[nodiscard]] T& operator[](size_t i) { return _host_data[i]; } - [[nodiscard]] T const& operator[](size_t i) const { return _host_data[i]; } + [[nodiscard]] T& operator[](size_t i) const { return _host_data[i]; } /** - * @brief Obtains a hostdevice_span that is a view over the `count` elements of this - * hostdevice_span starting at offset + * @brief Obtains a `hostdevice_span` that is a view over the `count` elements of this + * hostdevice_span starting at `offset` * * @param offset The offset of the first element in the subspan * @param count The number of elements in the subspan @@ -172,37 +160,37 @@ class hostdevice_span { */ [[nodiscard]] constexpr hostdevice_span subspan(size_t offset, size_t count) const noexcept { - return hostdevice_span(_host_data + offset, _device_data + offset, count); + return hostdevice_span(_host_data.subspan(offset, count), device_ptr(offset)); } - void host_to_device_async(rmm::cuda_stream_view stream) + void host_to_device_async(rmm::cuda_stream_view stream) const { - CUDF_CUDA_TRY( - cudaMemcpyAsync(device_ptr(), host_ptr(), size_bytes(), cudaMemcpyDefault, stream.value())); + static_assert(not std::is_const_v, "Cannot copy to const device memory"); + cudf::detail::cuda_memcpy_async(device_span{device_ptr(), size()}, _host_data, stream); } - void host_to_device_sync(rmm::cuda_stream_view stream) + void host_to_device_sync(rmm::cuda_stream_view stream) const { host_to_device_async(stream); stream.synchronize(); } - void device_to_host_async(rmm::cuda_stream_view stream) + void device_to_host_async(rmm::cuda_stream_view stream) const { - CUDF_CUDA_TRY( - cudaMemcpyAsync(host_ptr(), device_ptr(), size_bytes(), cudaMemcpyDefault, stream.value())); + static_assert(not std::is_const_v, "Cannot copy to const host memory"); + cudf::detail::cuda_memcpy_async( + _host_data, device_span{device_ptr(), size()}, stream); } - void device_to_host_sync(rmm::cuda_stream_view stream) + void device_to_host_sync(rmm::cuda_stream_view stream) const { device_to_host_async(stream); stream.synchronize(); } private: - size_t _size{}; ///< Number of elements - T* _device_data{}; ///< Pointer to device memory containing elements - T* _host_data{}; ///< Pointer to host memory containing elements + host_span _host_data; + T* _device_data{nullptr}; }; } // namespace cudf::detail diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index aed745c42dd..f969b45727b 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -117,55 +117,39 @@ class hostdevice_vector { return d_data.element(element_index, stream); } - operator cudf::host_span() { return {host_ptr(), size()}; } - operator cudf::host_span() const { return {host_ptr(), size()}; } + operator cudf::host_span() { return host_span{h_data}.subspan(0, 
size()); } + operator cudf::host_span() const + { + return host_span{h_data}.subspan(0, size()); + } operator cudf::device_span() { return {device_ptr(), size()}; } operator cudf::device_span() const { return {device_ptr(), size()}; } void host_to_device_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream); + cuda_memcpy_async(d_data, h_data, stream); } - void host_to_device_sync(rmm::cuda_stream_view stream) - { - cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream); - } + void host_to_device_sync(rmm::cuda_stream_view stream) { cuda_memcpy(d_data, h_data, stream); } void device_to_host_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream); + cuda_memcpy_async(h_data, d_data, stream); } - void device_to_host_sync(rmm::cuda_stream_view stream) - { - cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream); - } + void device_to_host_sync(rmm::cuda_stream_view stream) { cuda_memcpy(h_data, d_data, stream); } /** * @brief Converts a hostdevice_vector into a hostdevice_span. * * @return A typed hostdevice_span of the hostdevice_vector's data */ - [[nodiscard]] operator hostdevice_span() - { - return hostdevice_span{h_data.data(), d_data.data(), size()}; - } + [[nodiscard]] operator hostdevice_span() { return {host_span{h_data}, device_ptr()}; } - /** - * @brief Converts a part of a hostdevice_vector into a hostdevice_span. - * - * @param offset The offset of the first element in the subspan - * @param count The number of elements in the subspan - * @return A typed hostdevice_span of the hostdevice_vector's data - */ - [[nodiscard]] hostdevice_span subspan(size_t offset, size_t count) + [[nodiscard]] operator hostdevice_span() const { - CUDF_EXPECTS(offset < d_data.size(), "Offset is out of bounds."); - CUDF_EXPECTS(count <= d_data.size() - offset, - "The span with given offset and count is out of bounds."); - return hostdevice_span{h_data.data() + offset, d_data.data() + offset, count}; + return {host_span{h_data}, device_ptr()}; } private: @@ -188,38 +172,47 @@ class hostdevice_2dvector { { } - operator device_2dspan() { return {_data.device_ptr(), _size}; } - operator device_2dspan() const { return {_data.device_ptr(), _size}; } + operator device_2dspan() { return {device_span{_data}, _size.second}; } + operator device_2dspan() const { return {device_span{_data}, _size.second}; } device_2dspan device_view() { return static_cast>(*this); } - device_2dspan device_view() const { return static_cast>(*this); } + [[nodiscard]] device_2dspan device_view() const + { + return static_cast>(*this); + } - operator host_2dspan() { return {_data.host_ptr(), _size}; } - operator host_2dspan() const { return {_data.host_ptr(), _size}; } + operator host_2dspan() { return {host_span{_data}, _size.second}; } + operator host_2dspan() const { return {host_span{_data}, _size.second}; } host_2dspan host_view() { return static_cast>(*this); } - host_2dspan host_view() const { return static_cast>(*this); } + [[nodiscard]] host_2dspan host_view() const + { + return static_cast>(*this); + } host_span operator[](size_t row) { - return {_data.host_ptr() + host_2dspan::flatten_index(row, 0, _size), _size.second}; + return host_span{_data}.subspan(row * _size.second, _size.second); } host_span operator[](size_t row) const { - return {_data.host_ptr() + host_2dspan::flatten_index(row, 0, _size), 
_size.second}; + return host_span{_data}.subspan(row * _size.second, _size.second); } - auto size() const noexcept { return _size; } - auto count() const noexcept { return _size.first * _size.second; } - auto is_empty() const noexcept { return count() == 0; } + [[nodiscard]] auto size() const noexcept { return _size; } + [[nodiscard]] auto count() const noexcept { return _size.first * _size.second; } + [[nodiscard]] auto is_empty() const noexcept { return count() == 0; } T* base_host_ptr(size_t offset = 0) { return _data.host_ptr(offset); } T* base_device_ptr(size_t offset = 0) { return _data.device_ptr(offset); } - T const* base_host_ptr(size_t offset = 0) const { return _data.host_ptr(offset); } + [[nodiscard]] T const* base_host_ptr(size_t offset = 0) const { return _data.host_ptr(offset); } - T const* base_device_ptr(size_t offset = 0) const { return _data.device_ptr(offset); } + [[nodiscard]] T const* base_device_ptr(size_t offset = 0) const + { + return _data.device_ptr(offset); + } [[nodiscard]] size_t size_bytes() const noexcept { return _data.size_bytes(); } diff --git a/cpp/src/io/utilities/type_inference.cu b/cpp/src/io/utilities/type_inference.cu index 43dc38c4ac6..af32b207d20 100644 --- a/cpp/src/io/utilities/type_inference.cu +++ b/cpp/src/io/utilities/type_inference.cu @@ -18,11 +18,10 @@ #include "io/utilities/string_parsing.hpp" #include "io/utilities/trie.cuh" +#include #include #include -#include - #include #include @@ -242,7 +241,7 @@ cudf::io::column_type_histogram infer_column_type(OptionsView const& options, constexpr int block_size = 128; auto const grid_size = (size + block_size - 1) / block_size; - auto d_column_info = rmm::device_scalar(stream); + auto d_column_info = cudf::detail::device_scalar(stream); CUDF_CUDA_TRY(cudaMemsetAsync( d_column_info.data(), 0, sizeof(cudf::io::column_type_histogram), stream.value())); diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index 2ec23e0dc6d..40d1c925889 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -81,7 +82,7 @@ std::unique_ptr> conditional_join_anti_semi( join_size = *output_size; } else { // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); + cudf::detail::device_scalar size(0, stream, mr); if (has_nulls) { compute_conditional_join_output_size <<>>( @@ -94,7 +95,7 @@ std::unique_ptr> conditional_join_anti_semi( join_size = size.value(stream); } - rmm::device_scalar write_index(0, stream); + cudf::detail::device_scalar write_index(0, stream); auto left_indices = std::make_unique>(join_size, stream, mr); @@ -197,7 +198,7 @@ conditional_join(table_view const& left, join_size = *output_size; } else { // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); + cudf::detail::device_scalar size(0, stream, mr); if (has_nulls) { compute_conditional_join_output_size <<>>( @@ -231,7 +232,7 @@ conditional_join(table_view const& left, std::make_unique>(0, stream, mr)); } - rmm::device_scalar write_index(0, stream); + cudf::detail::device_scalar write_index(0, stream); auto left_indices = std::make_unique>(join_size, stream, mr); auto right_indices = std::make_unique>(join_size, stream, mr); @@ -342,7 +343,7 @@ std::size_t compute_conditional_join_output_size(table_view const& left, auto const shmem_size_per_block = parser.shmem_per_thread * 
config.num_threads_per_block; // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); + cudf::detail::device_scalar size(0, stream, mr); // Determine number of output rows without actually building the output to simply // find what the size of the output will be. diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu index c7294152982..515d28201e8 100644 --- a/cpp/src/join/distinct_hash_join.cu +++ b/cpp/src/join/distinct_hash_join.cu @@ -27,7 +27,6 @@ #include #include -#include #include #include diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index bd8c80652a0..a4ec97af235 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -67,7 +67,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) evaluator, thread_intermediate_storage, swap_tables, equality_probe}; // Create set ref with the new equality comparator - auto const set_ref_equality = set_ref.with_key_eq(equality); + auto const set_ref_equality = set_ref.rebind_key_eq(equality); // Total number of rows to query the set auto const outer_num_rows = left_table.num_rows(); diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index 83a55eca50f..62ba558b0bd 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -184,7 +184,8 @@ std::unique_ptr> mixed_join_semi( auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; auto const hash_probe = row_hash.device_hasher(has_nulls); - hash_set_ref_type const row_set_ref = row_set.ref(cuco::contains).with_hash_function(hash_probe); + hash_set_ref_type const row_set_ref = + row_set.ref(cuco::contains).rebind_hash_function(hash_probe); // Vector used to indicate indices from left/probe table which are present in output auto left_table_keep_mask = rmm::device_uvector(probe.num_rows(), stream); diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index 84e9be45030..4049ccf35e1 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -122,7 +123,7 @@ std::size_t launch_compute_mixed_join_output_size( rmm::device_async_resource_ref mr) { // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); + cudf::detail::device_scalar size(0, stream, mr); compute_mixed_join_output_size <<>>( diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu index 59fdbedf089..fb5cf66dd60 100644 --- a/cpp/src/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -1031,7 +1032,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c cudf::detail::create_null_mask(col.size(), mask_state::UNINITIALIZED, stream, mr); // compute results - rmm::device_scalar d_valid_count{0, stream}; + cudf::detail::device_scalar d_valid_count{0, stream}; get_json_object_kernel <<>>( diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index b0a84a6d50c..d27420658d6 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -1126,12 +1126,8 @@ std::pair, rmm::device_uvector> generate_mer * `max` of 0. * * @param tdv input tdigests. 
The tdigests within this column are grouped by key.
- * @param h_group_offsets a host iterator of the offsets to the start of each group. A group is
- * counted as one even when the cluster is empty in it. The offsets should have the same values as
- * the ones in `group_offsets`.
  * @param group_offsets a device iterator of the offsets to the start of each group. A group is
- * counted as one even when the cluster is empty in it. The offsets should have the same values as
- * the ones in `h_group_offsets`.
+ * counted as one even when the cluster is empty in it.
  * @param group_labels a device iterator of the group label for each tdigest cluster including
  * empty clusters.
  * @param num_group_labels the number of unique group labels.
@@ -1142,9 +1138,8 @@ std::pair<rmm::device_uvector<double>, rmm::device_uvector<double>> generate_mer
  *
  * @return A column containing the merged tdigests.
  */
-template <typename HGroupOffsetIter, typename GroupOffsetIter, typename GroupLabelIter>
+template <typename GroupOffsetIter, typename GroupLabelIter>
 std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
-                                       HGroupOffsetIter h_group_offsets,
                                        GroupOffsetIter group_offsets,
                                        GroupLabelIter group_labels,
                                        size_t num_group_labels,
@@ -1313,21 +1308,13 @@ std::unique_ptr<scalar> reduce_merge_tdigest(column_view const& input,
   if (input.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_scalar(stream, mr); }

-  auto group_offsets_  = group_offsets_fn{input.size()};
-  auto h_group_offsets = cudf::detail::make_counting_transform_iterator(0, group_offsets_);
-  auto group_offsets   = cudf::detail::make_counting_transform_iterator(0, group_offsets_);
-  auto group_labels    = thrust::make_constant_iterator(0);
-  return to_tdigest_scalar(merge_tdigests(tdv,
-                                          h_group_offsets,
-                                          group_offsets,
-                                          group_labels,
-                                          input.size(),
-                                          1,
-                                          max_centroids,
-                                          stream,
-                                          mr),
-                           stream,
-                           mr);
+  auto group_offsets_ = group_offsets_fn{input.size()};
+  auto group_offsets  = cudf::detail::make_counting_transform_iterator(0, group_offsets_);
+  auto group_labels   = thrust::make_constant_iterator(0);
+  return to_tdigest_scalar(
+    merge_tdigests(tdv, group_offsets, group_labels, input.size(), 1, max_centroids, stream, mr),
+    stream,
+    mr);
 }

 std::unique_ptr<column> group_tdigest(column_view const& col,
@@ -1376,16 +1363,7 @@ std::unique_ptr<column> group_merge_tdigest(column_view const& input,
     return cudf::tdigest::detail::make_empty_tdigests_column(num_groups, stream, mr);
   }

-  // bring group offsets back to the host
-  std::vector<size_type> h_group_offsets(group_offsets.size());
-  cudaMemcpyAsync(h_group_offsets.data(),
-                  group_offsets.begin(),
-                  sizeof(size_type) * group_offsets.size(),
-                  cudaMemcpyDefault,
-                  stream);
-
   return merge_tdigests(tdv,
-                        h_group_offsets.begin(),
                         group_offsets.data(),
                         group_labels.data(),
                         group_labels.size(),
diff --git a/cpp/src/reductions/all.cu b/cpp/src/reductions/all.cu
index 67ea29a2cb1..890625830a5 100644
--- a/cpp/src/reductions/all.cu
+++ b/cpp/src/reductions/all.cu
@@ -16,6 +16,7 @@
 #include "simple.cuh"
+#include <cudf/detail/device_scalar.hpp>
 #include
 #include
 #include
@@ -65,7 +66,8 @@ struct all_fn {
       cudf::dictionary::detail::make_dictionary_pair_iterator(*d_dict, input.has_nulls());
       return thrust::make_transform_iterator(pair_iter, null_iter);
     }();
-    auto d_result = rmm::device_scalar<bool>(1, stream, cudf::get_current_device_resource_ref());
+    auto d_result =
+      cudf::detail::device_scalar<bool>(1, stream, cudf::get_current_device_resource_ref());
     thrust::for_each_n(rmm::exec_policy(stream),
                        thrust::make_counting_iterator(0),
                        input.size(),
diff --git a/cpp/src/reductions/any.cu b/cpp/src/reductions/any.cu
index 057f038c622..d70da369d72 100644
--- a/cpp/src/reductions/any.cu
+++ b/cpp/src/reductions/any.cu
@@ -16,6 +16,7 @@
#include "simple.cuh" +#include #include #include #include @@ -65,7 +66,8 @@ struct any_fn { cudf::dictionary::detail::make_dictionary_pair_iterator(*d_dict, input.has_nulls()); return thrust::make_transform_iterator(pair_iter, null_iter); }(); - auto d_result = rmm::device_scalar(0, stream, cudf::get_current_device_resource_ref()); + auto d_result = + cudf::detail::device_scalar(0, stream, cudf::get_current_device_resource_ref()); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), input.size(), diff --git a/cpp/src/reductions/compound.cuh b/cpp/src/reductions/compound.cuh index 6bc8b48832f..cd9fade164a 100644 --- a/cpp/src/reductions/compound.cuh +++ b/cpp/src/reductions/compound.cuh @@ -18,13 +18,18 @@ #include #include +#include #include +#include #include #include #include #include +#include +#include + namespace cudf { namespace reduction { namespace compound { @@ -53,9 +58,17 @@ std::unique_ptr compound_reduction(column_view const& col, { auto const valid_count = col.size() - col.null_count(); + // All null input produces all null output + if (valid_count == 0 || + // Only care about ddof for standard deviation and variance right now + valid_count <= ddof && (std::is_same_v || + std::is_same_v)) { + auto result = cudf::make_fixed_width_scalar(output_dtype, stream, mr); + result->set_valid_async(false, stream); + return result; + } // reduction by iterator auto dcol = cudf::column_device_view::create(col, stream); - std::unique_ptr result; Op compound_op{}; if (!cudf::is_dictionary(col.type())) { @@ -63,25 +76,21 @@ std::unique_ptr compound_reduction(column_view const& col, auto it = thrust::make_transform_iterator( dcol->pair_begin(), compound_op.template get_null_replacing_element_transformer()); - result = cudf::reduction::detail::reduce( + return cudf::reduction::detail::reduce( it, col.size(), compound_op, valid_count, ddof, stream, mr); } else { auto it = thrust::make_transform_iterator( dcol->begin(), compound_op.template get_element_transformer()); - result = cudf::reduction::detail::reduce( + return cudf::reduction::detail::reduce( it, col.size(), compound_op, valid_count, ddof, stream, mr); } } else { auto it = thrust::make_transform_iterator( cudf::dictionary::detail::make_dictionary_pair_iterator(*dcol, col.has_nulls()), compound_op.template get_null_replacing_element_transformer()); - result = cudf::reduction::detail::reduce( + return cudf::reduction::detail::reduce( it, col.size(), compound_op, valid_count, ddof, stream, mr); } - - // set scalar is valid - result->set_valid_async(col.null_count() < col.size(), stream); - return result; }; // @brief result type dispatcher for compound reduction (a.k.a. 
mean, var, std) @@ -137,6 +146,7 @@ struct element_type_dispatcher { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_EXPECTS(ddof >= 0, "ddof must be non-negative", std::domain_error); return cudf::type_dispatcher( output_dtype, result_type_dispatcher(), col, output_dtype, ddof, stream, mr); } diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu index 362b5f74c46..b40b2b6dd2e 100644 --- a/cpp/src/reductions/histogram.cu +++ b/cpp/src/reductions/histogram.cu @@ -15,18 +15,24 @@ */ #include +#include #include -#include #include #include #include +#include #include +#include + +#include +#include #include #include #include #include #include +#include #include @@ -34,61 +40,12 @@ namespace cudf::reduction::detail { namespace { +// A CUDA Cooperative Group of 1 thread for the hash set for histogram +auto constexpr DEFAULT_HISTOGRAM_CG_SIZE = 1; + // Always use 64-bit signed integer for storing count. using histogram_count_type = int64_t; -/** - * @brief The functor to accumulate the frequency of each distinct rows in the input table. - */ -template -struct reduce_fn : cudf::detail::reduce_by_row_fn_base { - CountType const* d_partial_output; - - reduce_fn(MapView const& d_map, - KeyHasher const& d_hasher, - KeyEqual const& d_equal, - CountType* const d_output, - CountType const* const d_partial_output) - : cudf::detail::reduce_by_row_fn_base{d_map, - d_hasher, - d_equal, - d_output}, - d_partial_output{d_partial_output} - { - } - - // Count the number of rows in each group of rows that are compared equal. - __device__ void operator()(size_type const idx) const - { - auto const increment = d_partial_output ? d_partial_output[idx] : CountType{1}; - auto const count = - cuda::atomic_ref(*this->get_output_ptr(idx)); - count.fetch_add(increment, cuda::std::memory_order_relaxed); - } -}; - -/** - * @brief The builder to construct an instance of `reduce_fn` functor. - */ -template -struct reduce_func_builder { - CountType const* const d_partial_output; - - reduce_func_builder(CountType const* const d_partial_output) : d_partial_output{d_partial_output} - { - } - - template - auto build(MapView const& d_map, - KeyHasher const& d_hasher, - KeyEqual const& d_equal, - CountType* const d_output) - { - return reduce_fn{ - d_map, d_hasher, d_equal, d_output, d_partial_output}; - } -}; - /** * @brief Specialized functor to check for not-zero of the second component of the input. */ @@ -163,14 +120,6 @@ compute_row_frequencies(table_view const& input, "Nested types are not yet supported in histogram aggregation.", std::invalid_argument); - auto map = cudf::detail::hash_map_type{ - compute_hash_table_size(input.num_rows()), - cuco::empty_key{-1}, - cuco::empty_value{std::numeric_limits::min()}, - - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - stream.value()}; - auto const preprocessed_input = cudf::experimental::row::hash::preprocessed_table::create(input, stream); auto const has_nulls = nullate::DYNAMIC{cudf::has_nested_nulls(input)}; @@ -179,51 +128,68 @@ compute_row_frequencies(table_view const& input, auto const key_hasher = row_hasher.device_hasher(has_nulls); auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); - auto const pair_iter = cudf::detail::make_counting_transform_iterator( - size_type{0}, - cuda::proclaim_return_type>( - [] __device__(size_type const i) { return cuco::make_pair(i, i); })); - // Always compare NaNs as equal. 
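Aside on the compound_reduction early return added above: for variance and standard deviation the denominator is (valid_count - ddof), so any valid_count <= ddof cannot produce a finite result and the scalar is returned invalid instead. A minimal host-side sketch of that arithmetic (a hypothetical standalone helper, not the libcudf code):

#include <optional>
#include <vector>

// Illustration only: sample variance divides by (n - ddof), so this guard
// mirrors the null-scalar early return in compound_reduction.
std::optional<double> sample_variance(std::vector<double> const& v, int ddof)
{
  auto const n = static_cast<int>(v.size());
  if (n == 0 || n <= ddof) { return std::nullopt; }
  double mean = 0.0;
  for (auto x : v) { mean += x; }
  mean /= n;
  double ssd = 0.0;
  for (auto x : v) { ssd += (x - mean) * (x - mean); }
  return ssd / (n - ddof);  // ddof == 1 is the usual unbiased estimator
}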
using nan_equal_comparator = cudf::experimental::row::equality::nan_equal_physical_equality_comparator; auto const value_comp = nan_equal_comparator{}; + // Hard set the tparam `has_nested_columns` = false for now as we don't yet support nested columns + auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); + + using row_hash = + cudf::experimental::row::hash::device_row_hasher; + + size_t const num_rows = input.num_rows(); + + // Construct a vector to store reduced counts and init to zero + rmm::device_uvector reduction_results(num_rows, stream, mr); + thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), + reduction_results.begin(), + reduction_results.end(), + histogram_count_type{0}); + + // Construct a hash set + auto row_set = cuco::static_set{ + cuco::extent{num_rows}, + cudf::detail::CUCO_DESIRED_LOAD_FACTOR, + cuco::empty_key{-1}, + key_equal, + cuco::linear_probing{key_hasher}, + {}, // thread scope + {}, // storage + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; - if (has_nested_columns) { - auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); - map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value()); - } else { - auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); - map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value()); - } - - // Gather the indices of distinct rows. - auto distinct_indices = std::make_unique>( - static_cast(map.get_size()), stream, mr); - - // Store the number of occurrences of each distinct row. - auto distinct_counts = make_numeric_column(data_type{type_to_id()}, - static_cast(map.get_size()), - mask_state::UNALLOCATED, - stream, - mr); + // Device-accessible reference to the hash set with `insert_and_find` operator + auto row_set_ref = row_set.ref(cuco::op::insert_and_find); // Compute frequencies (aka distinct counts) for the input rows. // Note that we consider null and NaNs as always equal. - auto const reduction_results = cudf::detail::hash_reduce_by_row( - map, - preprocessed_input, - input.num_rows(), - has_nulls, - has_nested_columns, - null_equality::EQUAL, - nan_equality::ALL_EQUAL, - reduce_func_builder{ - partial_counts ? partial_counts.value().begin() : nullptr}, - histogram_count_type{0}, - stream, - cudf::get_current_device_resource_ref()); - + thrust::for_each( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + [set_ref = row_set_ref, + increments = + partial_counts.has_value() ? partial_counts.value().begin() : nullptr, + counts = reduction_results.begin()] __device__(auto const idx) mutable { + auto const [inserted_idx_ptr, _] = set_ref.insert_and_find(idx); + cuda::atomic_ref count_ref{ + counts[*inserted_idx_ptr]}; + auto const increment = increments ? 
increments[idx] : histogram_count_type{1}; + count_ref.fetch_add(increment, cuda::std::memory_order_relaxed); + }); + + // Set-size is the number of distinct (inserted) rows + auto const set_size = row_set.size(stream); + + // Vector of distinct indices + auto distinct_indices = std::make_unique>(set_size, stream, mr); + // Column of distinct counts + auto distinct_counts = make_numeric_column( + data_type{type_to_id()}, set_size, mask_state::UNALLOCATED, stream, mr); + + // Copy row indices and counts to the output if counts are non-zero auto const input_it = thrust::make_zip_iterator( thrust::make_tuple(thrust::make_counting_iterator(0), reduction_results.begin())); auto const output_it = thrust::make_zip_iterator(thrust::make_tuple( @@ -232,7 +198,7 @@ compute_row_frequencies(table_view const& input, // Reduction results above are either group sizes of equal rows, or `0`. // The final output is non-zero group sizes only. thrust::copy_if( - rmm::exec_policy(stream), input_it, input_it + input.num_rows(), output_it, is_not_zero{}); + rmm::exec_policy_nosync(stream), input_it, input_it + num_rows, output_it, is_not_zero{}); return {std::move(distinct_indices), std::move(distinct_counts)}; } diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu index 139de068050..4f6eb23ce5b 100644 --- a/cpp/src/reductions/minmax.cu +++ b/cpp/src/reductions/minmax.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -69,18 +70,18 @@ struct minmax_pair { * @param num_items number of items to reduce * @param binary_op binary operator used to reduce * @param stream CUDA stream to run kernels on. - * @return rmm::device_scalar + * @return cudf::detail::device_scalar */ template ::type> -rmm::device_scalar reduce_device(InputIterator d_in, - size_type num_items, - Op binary_op, - rmm::cuda_stream_view stream) +auto reduce_device(InputIterator d_in, + size_type num_items, + Op binary_op, + rmm::cuda_stream_view stream) { OutputType identity{}; - rmm::device_scalar result{identity, stream}; + cudf::detail::device_scalar result{identity, stream}; // Allocate temporary storage size_t storage_bytes = 0; diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 1df1549432f..d0e3358cc34 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -137,7 +138,7 @@ struct replace_nulls_column_kernel_forwarder { auto device_out = cudf::mutable_column_device_view::create(output_view, stream); auto device_replacement = cudf::column_device_view::create(replacement, stream); - rmm::device_scalar valid_counter(0, stream); + cudf::detail::device_scalar valid_counter(0, stream); cudf::size_type* valid_count = valid_counter.data(); replace<<>>( diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 86ec8cfc91e..0cc97ca05e0 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -53,7 +54,6 @@ #include #include -#include #include #include @@ -182,7 +182,7 @@ struct replace_kernel_forwarder { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - rmm::device_scalar valid_counter(0, stream); + cudf::detail::device_scalar valid_counter(0, stream); cudf::size_type* valid_count = valid_counter.data(); auto replace = [&] { diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh index 528700137bf..bc0ee2eb519 100644 
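The reduce_device helper above follows CUB's standard two-phase pattern: a first call with a null temp-storage pointer only computes the scratch size, and a second call does the actual reduction. A generic sketch under that assumption (names here are illustrative, not the libcudf signature):

#include <cub/device/device_reduce.cuh>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

template <typename InputIt, typename OutputIt, typename Op, typename T>
void reduce_with_cub(
  InputIt d_in, OutputIt d_out, int num_items, Op op, T init, rmm::cuda_stream_view stream)
{
  std::size_t storage_bytes = 0;
  // First call with nullptr: CUB only reports the required scratch size.
  cub::DeviceReduce::Reduce(
    nullptr, storage_bytes, d_in, d_out, num_items, op, init, stream.value());
  rmm::device_buffer d_temp(storage_bytes, stream);
  // Second call performs the reduction using the allocated scratch space.
  cub::DeviceReduce::Reduce(
    d_temp.data(), storage_bytes, d_in, d_out, num_items, op, init, stream.value());
}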
--- a/cpp/src/rolling/detail/rolling.cuh +++ b/cpp/src/rolling/detail/rolling.cuh @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -49,7 +50,6 @@ #include #include -#include #include #include @@ -1105,7 +1105,7 @@ struct rolling_window_launcher { auto const d_inp_ptr = column_device_view::create(input, stream); auto const d_default_out_ptr = column_device_view::create(default_outputs, stream); auto const d_out_ptr = mutable_column_device_view::create(output->mutable_view(), stream); - auto d_valid_count = rmm::device_scalar{0, stream}; + auto d_valid_count = cudf::detail::device_scalar{0, stream}; auto constexpr block_size = 256; auto const grid = cudf::detail::grid_1d(input.size(), block_size); @@ -1271,7 +1271,7 @@ std::unique_ptr rolling_window_udf(column_view const& input, udf_agg._output_type, input.size(), cudf::mask_state::UNINITIALIZED, stream, mr); auto output_view = output->mutable_view(); - rmm::device_scalar device_valid_count{0, stream}; + cudf::detail::device_scalar device_valid_count{0, stream}; std::string kernel_name = jitify2::reflection::Template("cudf::rolling::jit::gpu_rolling_new") // diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index 4c015f3cbed..6a7c8ea45e9 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -348,7 +349,7 @@ std::unique_ptr convert_case(strings_column_view const& input, // This check incurs ~20% performance hit for smaller strings and so we only use it // after the threshold check above. The check makes very little impact for long strings // but results in a large performance gain when the input contains only single-byte characters. - rmm::device_scalar mb_count(0, stream); + cudf::detail::device_scalar mb_count(0, stream); // cudf::detail::grid_1d is limited to size_type elements auto const num_blocks = util::div_rounding_up_safe(chars_size / bytes_per_thread, block_size); // we only need to check every other byte since either will contain high bit diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 0db1adf1223..f5d052c6657 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -152,12 +153,8 @@ struct format_compiler { } // create program in device memory - d_items.resize(items.size(), stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_items.data(), - items.data(), - items.size() * sizeof(items[0]), - cudaMemcpyDefault, - stream.value())); + d_items = cudf::detail::make_device_uvector_sync( + items, stream, cudf::get_current_device_resource_ref()); } format_item const* compiled_format_items() { return d_items.data(); } diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 1d9d12686eb..9e4ef47ff79 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -27,7 +28,6 @@ #include #include -#include #include #include @@ -242,7 +242,7 @@ std::unique_ptr concatenate(host_span columns, } { // Copy offsets columns with single kernel launch - rmm::device_scalar d_valid_count(0, stream); + cudf::detail::device_scalar d_valid_count(0, stream); constexpr size_type block_size{256}; cudf::detail::grid_1d config(offsets_count, block_size); diff --git 
a/cpp/src/strings/extract/extract.cu b/cpp/src/strings/extract/extract.cu index 7323918dcff..8683a9bdfbe 100644 --- a/cpp/src/strings/extract/extract.cu +++ b/cpp/src/strings/extract/extract.cu @@ -100,9 +100,8 @@ std::unique_ptr
extract(strings_column_view const& input, auto const groups = d_prog->group_counts(); CUDF_EXPECTS(groups > 0, "Group indicators not found in regex pattern"); - auto indices = rmm::device_uvector(input.size() * groups, stream); - auto d_indices = - cudf::detail::device_2dspan(indices.data(), input.size(), groups); + auto indices = rmm::device_uvector(input.size() * groups, stream); + auto d_indices = cudf::detail::device_2dspan(indices, groups); auto const d_strings = column_device_view::create(input.parent(), stream); diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 51c6e765edd..b923a301f84 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -710,19 +710,17 @@ class regex_parser { std::stack lbra_stack; auto repeat_start_index = -1; - for (std::size_t index = 0; index < in.size(); index++) { - auto const item = in[index]; - + for (auto const item : in) { if (item.type != COUNTED && item.type != COUNTED_LAZY) { out.push_back(item); if (item.type == LBRA || item.type == LBRA_NC) { - lbra_stack.push(index); + lbra_stack.push(out.size() - 1); repeat_start_index = -1; } else if (item.type == RBRA) { repeat_start_index = lbra_stack.top(); lbra_stack.pop(); } else if ((item.type & ITEM_MASK) != OPERATOR_MASK) { - repeat_start_index = index; + repeat_start_index = out.size() - 1; } } else { // item is of type COUNTED or COUNTED_LAZY @@ -731,26 +729,39 @@ class regex_parser { CUDF_EXPECTS(repeat_start_index >= 0, "regex: invalid counted quantifier location"); // range of affected item(s) to repeat - auto const begin = in.begin() + repeat_start_index; - auto const end = in.begin() + index; + auto const begin = out.begin() + repeat_start_index; + auto const end = out.end(); + // count range values auto const n = item.d.count.n; // minimum count auto const m = item.d.count.m; // maximum count - assert(n >= 0 && "invalid repeat count value n"); // zero-repeat edge-case: need to erase the previous items - if (n == 0) { out.erase(out.end() - (index - repeat_start_index), out.end()); } - - // minimum repeats (n) - for (int j = 1; j < n; j++) { - out.insert(out.end(), begin, end); + if (n == 0) { out.erase(begin, end); } + + std::vector repeat_copy(begin, end); + // special handling for quantified capture groups + if ((n > 1) && (*begin).type == LBRA) { + (*begin).type = LBRA_NC; // change first one to non-capture + // add intermediate groups as non-capture + std::vector ncg_copy(begin, end); + for (int j = 1; j < (n - 1); j++) { + out.insert(out.end(), ncg_copy.begin(), ncg_copy.end()); + } + // add the last entry as a regular capture-group + out.insert(out.end(), repeat_copy.begin(), repeat_copy.end()); + } else { + // minimum repeats (n) + for (int j = 1; j < n; j++) { + out.insert(out.end(), repeat_copy.begin(), repeat_copy.end()); + } } // optional maximum repeats (m) if (m >= 0) { for (int j = n; j < m; j++) { out.emplace_back(LBRA_NC, 0); - out.insert(out.end(), begin, end); + out.insert(out.end(), repeat_copy.begin(), repeat_copy.end()); } for (int j = n; j < m; j++) { out.emplace_back(RBRA, 0); @@ -760,8 +771,9 @@ class regex_parser { // infinite repeats if (n > 0) { // append '+' after last repetition out.emplace_back(item.type == COUNTED ? PLUS : PLUS_LAZY, 0); - } else { // copy it once then append '*' - out.insert(out.end(), begin, end); + } else { + // copy it once then append '*' + out.insert(out.end(), repeat_copy.begin(), repeat_copy.end()); out.emplace_back(item.type == COUNTED ? 
STAR : STAR_LAZY, 0); } } diff --git a/cpp/src/strings/replace/find_replace.cu b/cpp/src/strings/replace/find_replace.cu index 8a8001dd81a..957075017ba 100644 --- a/cpp/src/strings/replace/find_replace.cu +++ b/cpp/src/strings/replace/find_replace.cu @@ -14,6 +14,7 @@ * limitations under the License. */ #include +#include #include #include #include @@ -21,7 +22,6 @@ #include #include -#include #include #include diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 352d883bdc5..88f343926c9 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -334,7 +334,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in // Count the number of targets in the entire column. // Note this may over-count in the case where a target spans adjacent strings. - rmm::device_scalar d_count(0, stream); + cudf::detail::device_scalar d_count(0, stream); auto const num_blocks = util::div_rounding_up_safe( util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), block_size); count_targets<<>>(fn, chars_bytes, d_count.data()); diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index 16df0dbabdf..52ddef76c1a 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -285,7 +285,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in // Count the number of targets in the entire column. // Note this may over-count in the case where a target spans adjacent strings. - rmm::device_scalar d_target_count(0, stream); + cudf::detail::device_scalar d_target_count(0, stream); constexpr int64_t block_size = 512; constexpr size_type bytes_per_thread = 4; auto const num_blocks = util::div_rounding_up_safe( diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu index d8c1b50a94b..21708e48a25 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -126,6 +126,43 @@ std::unique_ptr findall(strings_column_view const& input, mr); } +namespace { +struct find_re_fn { + column_device_view d_strings; + + __device__ size_type operator()(size_type const idx, + reprog_device const prog, + int32_t const thread_idx) const + { + if (d_strings.is_null(idx)) { return 0; } + auto const d_str = d_strings.element(idx); + + auto const result = prog.find(thread_idx, d_str, d_str.begin()); + return result.has_value() ? 
result.value().first : -1; + } +}; +} // namespace + +std::unique_ptr find_re(strings_column_view const& input, + regex_program const& prog, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto results = make_numeric_column(data_type{type_to_id()}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); + if (input.is_empty()) { return results; } + + auto d_results = results->mutable_view().data(); + auto d_prog = regex_device_builder::create_prog_device(prog, stream); + auto const d_strings = column_device_view::create(input.parent(), stream); + launch_transform_kernel(find_re_fn{*d_strings}, *d_prog, d_results, input.size(), stream); + + return results; +} } // namespace detail // external API @@ -139,5 +176,14 @@ std::unique_ptr findall(strings_column_view const& input, return detail::findall(input, prog, stream, mr); } +std::unique_ptr find_re(strings_column_view const& input, + regex_program const& prog, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::find_re(input, prog, stream, mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh index 81aca001d53..4b777be9d5b 100644 --- a/cpp/src/strings/split/split.cuh +++ b/cpp/src/strings/split/split.cuh @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -361,7 +362,7 @@ std::pair, rmm::device_uvector> split cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); // count the number of delimiters in the entire column - rmm::device_scalar d_count(0, stream); + cudf::detail::device_scalar d_count(0, stream); if (chars_bytes > 0) { constexpr int64_t block_size = 512; constexpr size_type bytes_per_thread = 4; diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index 07516f91dcf..8e00a29f8e9 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -16,36 +16,171 @@ #include #include +#include #include +#include +#include #include -#include -#include #include -#include #include #include +#include #include +#include #include #include +#include +#include namespace cudf { +namespace strings::detail { + namespace { -struct string_view_to_pair { - string_view null_placeholder; - string_view_to_pair(string_view n) : null_placeholder(n) {} - __device__ thrust::pair operator()(string_view const& i) - { - return (i.data() == null_placeholder.data()) - ? thrust::pair{nullptr, 0} - : thrust::pair{i.data(), i.size_bytes()}; + +using column_string_pairs = cudf::device_span; + +template +std::pair>, rmm::device_uvector> +make_offsets_child_column_batch_async(std::vector const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const num_columns = input.size(); + std::vector> offsets_columns(num_columns); + rmm::device_uvector chars_sizes(num_columns, stream); + for (std::size_t idx = 0; idx < num_columns; ++idx) { + auto const string_pairs = input[idx]; + auto const string_count = static_cast(string_pairs.size()); + auto offsets = make_numeric_column( + data_type{type_to_id()}, string_count + 1, mask_state::UNALLOCATED, stream, mr); + + auto const offsets_transformer = cuda::proclaim_return_type( + [string_count, string_pairs = string_pairs.data()] __device__(size_type idx) -> size_type { + return idx < string_count ? 
string_pairs[idx].second : size_type{0}; + }); + auto const input_it = cudf::detail::make_counting_transform_iterator(0, offsets_transformer); + auto const d_offsets = offsets->mutable_view().template data(); + auto const output_it = cudf::detail::make_sizes_to_offsets_iterator( + d_offsets, d_offsets + string_count + 1, chars_sizes.data() + idx); + thrust::exclusive_scan(rmm::exec_policy_nosync(stream), + input_it, + input_it + string_count + 1, + output_it, + int64_t{0}); + offsets_columns[idx] = std::move(offsets); } -}; + + return {std::move(offsets_columns), std::move(chars_sizes)}; +} } // namespace +std::vector> make_strings_column_batch( + std::vector const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const num_columns = input.size(); + + auto [offsets_cols, d_chars_sizes] = + make_offsets_child_column_batch_async(input, stream, mr); + + std::vector null_masks; + null_masks.reserve(num_columns); + + rmm::device_uvector d_valid_counts(num_columns, stream, mr); + thrust::uninitialized_fill( + rmm::exec_policy_nosync(stream), d_valid_counts.begin(), d_valid_counts.end(), 0); + + for (std::size_t idx = 0; idx < num_columns; ++idx) { + auto const& string_pairs = input[idx]; + auto const string_count = static_cast(string_pairs.size()); + null_masks.emplace_back( + cudf::create_null_mask(string_count, mask_state::UNINITIALIZED, stream, mr)); + + if (string_count == 0) { continue; } + + constexpr size_type block_size{256}; + auto const grid = + cudf::detail::grid_1d{static_cast(string_count), block_size}; + cudf::detail::valid_if_kernel + <<>>( + reinterpret_cast(null_masks.back().data()), + string_pairs.data(), + string_count, + [] __device__(string_index_pair const pair) -> bool { return pair.first != nullptr; }, + d_valid_counts.data() + idx); + } + + auto const chars_sizes = cudf::detail::make_std_vector_async(d_chars_sizes, stream); + auto const valid_counts = cudf::detail::make_std_vector_async(d_valid_counts, stream); + + // Except for other stream syncs in `CUB` that we cannot control, + // this should be the only stream sync we need in the entire API. + stream.synchronize(); + + auto const threshold = cudf::strings::get_offset64_threshold(); + auto const overflow_count = + std::count_if(chars_sizes.begin(), chars_sizes.end(), [threshold](auto const chars_size) { + return chars_size >= threshold; + }); + CUDF_EXPECTS(cudf::strings::is_large_strings_enabled() || overflow_count == 0, + "Size of output exceeds the column size limit", + std::overflow_error); + + if (overflow_count > 0) { + std::vector long_string_input; + std::vector long_string_col_idx; + long_string_input.reserve(overflow_count); + long_string_col_idx.reserve(overflow_count); + for (std::size_t idx = 0; idx < num_columns; ++idx) { + if (chars_sizes[idx] >= threshold) { + long_string_input.push_back(input[idx]); + long_string_col_idx.push_back(idx); + } + } + + [[maybe_unused]] auto [new_offsets_cols, d_new_chars_sizes] = + make_offsets_child_column_batch_async(long_string_input, stream, mr); + + // Update the new offsets columns. + // The new chars sizes should be the same as before, thus we don't need to update them. 
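The offsets computation in make_offsets_child_column_batch_async above is an exclusive scan over the per-string sizes, with the final element doubling as the column's total chars size (the value later checked against the large-strings threshold). A host-side sketch of the same arithmetic, for illustration only:

#include <cstdint>
#include <vector>

// offsets[i] holds the sum of all sizes before i; offsets.back() is the
// total chars size used for the int32-offset overflow check.
std::vector<int64_t> sizes_to_offsets(std::vector<int> const& sizes)
{
  std::vector<int64_t> offsets(sizes.size() + 1, 0);
  for (std::size_t i = 0; i < sizes.size(); ++i) {
    offsets[i + 1] = offsets[i] + sizes[i];
  }
  return offsets;  // e.g. sizes {3, 0, 5} -> offsets {0, 3, 3, 8}
}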
+ for (std::size_t idx = 0; idx < long_string_col_idx.size(); ++idx) { + offsets_cols[long_string_col_idx[idx]] = std::move(new_offsets_cols[idx]); + } + } + + std::vector> output(num_columns); + for (std::size_t idx = 0; idx < num_columns; ++idx) { + auto const strings_count = static_cast(input[idx].size()); + if (strings_count == 0) { + output[idx] = make_empty_column(type_id::STRING); + continue; + } + + auto const chars_size = chars_sizes[idx]; + auto const valid_count = valid_counts[idx]; + + auto chars_data = make_chars_buffer( + offsets_cols[idx]->view(), chars_size, input[idx].data(), strings_count, stream, mr); + + auto const null_count = strings_count - valid_count; + output[idx] = make_strings_column( + strings_count, + std::move(offsets_cols[idx]), + chars_data.release(), + null_count, + null_count ? std::move(null_masks[idx]) : rmm::device_buffer{0, stream, mr}); + } + + return output; +} + +} // namespace strings::detail + // Create a strings-type column from vector of pointer/size pairs std::unique_ptr make_strings_column( device_span const> strings, @@ -53,10 +188,32 @@ std::unique_ptr make_strings_column( rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return cudf::strings::detail::make_strings_column(strings.begin(), strings.end(), stream, mr); } +std::vector> make_strings_column_batch( + std::vector const>> const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return cudf::strings::detail::make_strings_column_batch(input, stream, mr); +} + +namespace { +struct string_view_to_pair { + string_view null_placeholder; + string_view_to_pair(string_view n) : null_placeholder(n) {} + __device__ thrust::pair operator()(string_view const& i) + { + return (i.data() == null_placeholder.data()) + ? thrust::pair{nullptr, 0} + : thrust::pair{i.data(), i.size_bytes()}; + } +}; + +} // namespace + std::unique_ptr make_strings_column(device_span string_views, string_view null_placeholder, rmm::cuda_stream_view stream, diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index a87ecb81b9d..997b0278fe2 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -48,6 +49,9 @@ namespace nvtext { namespace detail { namespace { +// long strings threshold found with benchmarking +constexpr cudf::size_type AVG_CHAR_BYTES_THRESHOLD = 64; + /** * @brief Generate ngrams from strings column. * @@ -173,33 +177,39 @@ constexpr cudf::thread_index_type bytes_per_thread = 4; /** * @brief Counts the number of ngrams in each row of the given strings column * - * Each warp processes a single string. + * Each warp/thread processes a single string. * Formula is `count = max(0,str.length() - ngrams + 1)` * If a string has less than ngrams characters, its count is 0. 
*/ CUDF_KERNEL void count_char_ngrams_kernel(cudf::column_device_view const d_strings, cudf::size_type ngrams, + cudf::size_type tile_size, cudf::size_type* d_counts) { auto const idx = cudf::detail::grid_1d::global_thread_id(); - auto const str_idx = idx / cudf::detail::warp_size; + auto const str_idx = idx / tile_size; if (str_idx >= d_strings.size()) { return; } if (d_strings.is_null(str_idx)) { d_counts[str_idx] = 0; return; } + auto const d_str = d_strings.element(str_idx); + if (tile_size == 1) { + d_counts[str_idx] = cuda::std::max(0, (d_str.length() + 1 - ngrams)); + return; + } + namespace cg = cooperative_groups; auto const warp = cg::tiled_partition(cg::this_thread_block()); - auto const d_str = d_strings.element(str_idx); - auto const end = d_str.data() + d_str.size_bytes(); + auto const end = d_str.data() + d_str.size_bytes(); auto const lane_idx = warp.thread_rank(); cudf::size_type count = 0; for (auto itr = d_str.data() + (lane_idx * bytes_per_thread); itr < end; - itr += cudf::detail::warp_size * bytes_per_thread) { + itr += tile_size * bytes_per_thread) { for (auto s = itr; (s < (itr + bytes_per_thread)) && (s < end); ++s) { count += static_cast(cudf::strings::detail::is_begin_utf8_char(*s)); } @@ -256,19 +266,27 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie "Parameter ngrams should be an integer value of 2 or greater", std::invalid_argument); - auto const strings_count = input.size(); - if (strings_count == 0) { // if no strings, return an empty column - return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + if (input.is_empty()) { // if no strings, return an empty column + return cudf::lists::detail::make_empty_lists_column( + cudf::data_type{cudf::type_id::STRING}, stream, mr); + } + if (input.size() == input.null_count()) { + return cudf::lists::detail::make_all_nulls_lists_column( + input.size(), cudf::data_type{cudf::type_id::STRING}, stream, mr); } auto const d_strings = cudf::column_device_view::create(input.parent(), stream); auto [offsets, total_ngrams] = [&] { - auto counts = rmm::device_uvector(input.size(), stream); - auto const num_blocks = cudf::util::div_rounding_up_safe( - static_cast(input.size()) * cudf::detail::warp_size, block_size); - count_char_ngrams_kernel<<>>( - *d_strings, ngrams, counts.data()); + auto counts = rmm::device_uvector(input.size(), stream); + auto const avg_char_bytes = (input.chars_size(stream) / (input.size() - input.null_count())); + auto const tile_size = (avg_char_bytes < AVG_CHAR_BYTES_THRESHOLD) + ? 
1 // thread per row + : cudf::detail::warp_size; // warp per row + auto const grid = cudf::detail::grid_1d( + static_cast(input.size()) * tile_size, block_size); + count_char_ngrams_kernel<<>>( + *d_strings, ngrams, tile_size, counts.data()); return cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr); }(); auto d_offsets = offsets->view().data(); @@ -277,8 +295,8 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie "Insufficient number of characters in each string to generate ngrams"); character_ngram_generator_fn generator{*d_strings, ngrams, d_offsets}; - auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( - generator, strings_count, total_ngrams, stream, mr); + auto [offsets_column, chars] = + cudf::strings::detail::make_strings_children(generator, input.size(), total_ngrams, stream, mr); auto output = cudf::make_strings_column( total_ngrams, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); @@ -368,7 +386,7 @@ std::unique_ptr hash_character_ngrams(cudf::strings_column_view co auto [offsets, total_ngrams] = [&] { auto counts = rmm::device_uvector(input.size(), stream); count_char_ngrams_kernel<<>>( - *d_strings, ngrams, counts.data()); + *d_strings, ngrams, cudf::detail::warp_size, counts.data()); return cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr); }(); auto d_offsets = offsets->view().data(); diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index df25950e6d5..89ca8a089d6 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -221,7 +222,7 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const // To minimize memory, count the number of characters so we can // build the output offsets without an intermediate buffer. // In the worst case each byte is a character so the output is 4x the input. 
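Both counting kernels above work byte-by-byte: a byte starts a UTF-8 character unless its top two bits are 10 (a continuation byte), and the per-string ngram count is then max(0, length - ngrams + 1). A standalone sketch of that per-string arithmetic (hypothetical helper names):

#include <algorithm>
#include <cstddef>
#include <cstdint>

// A byte begins a UTF-8 character unless it is a continuation byte (10xxxxxx).
inline bool is_utf8_begin(uint8_t b) { return (b & 0xC0) != 0x80; }

int char_ngram_count(char const* data, std::size_t bytes, int ngrams)
{
  int length = 0;  // character (not byte) length of the string
  for (std::size_t i = 0; i < bytes; ++i) {
    length += is_utf8_begin(static_cast<uint8_t>(data[i]));
  }
  // e.g. a 5-character string with ngrams=2 yields 4 bigrams;
  // strings shorter than ngrams characters yield 0.
  return std::max(0, length - ngrams + 1);
}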
- rmm::device_scalar d_count(0, stream); + cudf::detail::device_scalar d_count(0, stream); auto const num_blocks = cudf::util::div_rounding_up_safe( cudf::util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), block_size); diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index 0efb881eb3e..c0af27a1748 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -30,7 +30,7 @@ namespace cudf::detail { namespace { // Simple kernel to copy between device buffers -CUDF_KERNEL void copy_kernel(char const* src, char* dst, size_t n) +CUDF_KERNEL void copy_kernel(char const* __restrict__ src, char* __restrict__ dst, size_t n) { auto const idx = cudf::detail::grid_1d::global_thread_id(); if (idx < n) { dst[idx] = src[idx]; } @@ -61,7 +61,7 @@ void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_strea }; // namespace -void cuda_memcpy_async( +void cuda_memcpy_async_impl( void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) { if (kind == host_memory_kind::PINNED) { @@ -73,11 +73,4 @@ void cuda_memcpy_async( } } -void cuda_memcpy( - void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) -{ - cuda_memcpy_async(dst, src, size, kind, stream); - stream.synchronize(); -} - } // namespace cudf::detail diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 125b98c4a67..9d8e3cf2fa6 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -115,12 +115,19 @@ class fixed_pinned_pool_memory_resource { return !operator==(other); } - friend void get_property(fixed_pinned_pool_memory_resource const&, + // clang-tidy will complain about this function because it is completely + // unused at runtime and only exists for tag introspection by CCCL, so we + // ignore linting. This masks a real issue if we ever want to compile with + // clang, though, which is that the function will actually be compiled out by + // clang. If cudf were ever to try to support clang as a compiler we would + // need to force the compiler to emit this symbol. The same goes for the + // other get_property definitions in this file.
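For context on that comment: CCCL discovers these properties purely through argument-dependent lookup at compile time, so the friend only needs to be declared, never called. A minimal sketch outside libcudf (a toy type, assuming only the cuda::mr interfaces already used in this file):

#include <cuda/memory_resource>

struct toy_pinned_resource {
  void* allocate(std::size_t size, std::size_t alignment) { return nullptr; }  // stub
  void deallocate(void* ptr, std::size_t size, std::size_t alignment) {}       // stub
  bool operator==(toy_pinned_resource const&) const { return true; }
  bool operator!=(toy_pinned_resource const& other) const { return !operator==(other); }
  // Found by ADL during concept checking; compiled out if never ODR-used.
  friend void get_property(toy_pinned_resource const&, cuda::mr::host_accessible) noexcept {}
};

static_assert(cuda::mr::resource_with<toy_pinned_resource, cuda::mr::host_accessible>);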
+ friend void get_property(fixed_pinned_pool_memory_resource const&, // NOLINT cuda::mr::device_accessible) noexcept { } - friend void get_property(fixed_pinned_pool_memory_resource const&, + friend void get_property(fixed_pinned_pool_memory_resource const&, // NOLINT cuda::mr::host_accessible) noexcept { } @@ -235,7 +242,9 @@ class new_delete_memory_resource { bool operator!=(new_delete_memory_resource const& other) const { return !operator==(other); } + // NOLINTBEGIN friend void get_property(new_delete_memory_resource const&, cuda::mr::host_accessible) noexcept {} + // NOLINTEND }; static_assert(cuda::mr::resource_with, diff --git a/cpp/src/utilities/logger.cpp b/cpp/src/utilities/logger.cpp index d54f5677c4c..e52fffbd8c6 100644 --- a/cpp/src/utilities/logger.cpp +++ b/cpp/src/utilities/logger.cpp @@ -74,8 +74,10 @@ struct logger_wrapper { } // namespace -spdlog::logger& cudf::logger() +spdlog::logger& cudf::detail::logger() { static logger_wrapper wrapped{}; return wrapped.logger_; } + +spdlog::logger& cudf::logger() { return cudf::detail::logger(); } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index b67d922d377..b78a64d0e55 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -56,8 +56,15 @@ function(ConfigureTest CMAKE_TEST_NAME) target_link_libraries( ${CMAKE_TEST_NAME} - PRIVATE cudftestutil GTest::gmock GTest::gmock_main GTest::gtest GTest::gtest_main - nvtx3::nvtx3-cpp $ "${_CUDF_TEST_EXTRA_LIBS}" + PRIVATE cudf::cudftestutil + cudf::cudftestutil_impl + GTest::gmock + GTest::gmock_main + GTest::gtest + GTest::gtest_main + nvtx3::nvtx3-cpp + $ + "${_CUDF_TEST_EXTRA_LIBS}" ) rapids_cuda_set_runtime(${CMAKE_TEST_NAME} USE_STATIC ${CUDA_STATIC_RUNTIME}) rapids_test_add( @@ -76,6 +83,7 @@ function(ConfigureTest CMAKE_TEST_NAME) "GTEST_CUDF_STREAM_MODE=new_${_CUDF_TEST_STREAM_MODE}_default;LD_PRELOAD=$" ) endif() + enable_clang_tidy(${CMAKE_TEST_NAME}) endfunction() # ################################################################################################## @@ -385,6 +393,8 @@ ConfigureTest( # * utilities tests ------------------------------------------------------------------------------- ConfigureTest( UTILITIES_TEST + utilities_tests/batched_memcpy_tests.cu + utilities_tests/batched_memset_tests.cu utilities_tests/column_debug_tests.cpp utilities_tests/column_utilities_tests.cpp utilities_tests/column_wrapper_tests.cpp @@ -395,7 +405,6 @@ ConfigureTest( utilities_tests/pinned_memory_tests.cpp utilities_tests/type_check_tests.cpp utilities_tests/type_list_tests.cpp - utilities_tests/batched_memset_tests.cu ) # ################################################################################################## @@ -602,7 +611,6 @@ ConfigureTest( text/bpe_tests.cpp text/edit_distance_tests.cpp text/jaccard_tests.cpp - text/minhash_tests.cpp text/ngrams_tests.cpp text/ngrams_tokenize_tests.cpp text/normalize_tests.cpp @@ -717,6 +725,7 @@ ConfigureTest( streams/strings/contains_test.cpp streams/strings/convert_test.cpp streams/strings/extract_test.cpp + streams/strings/factory_test.cpp streams/strings/filter_test.cpp streams/strings/find_test.cpp streams/strings/replace_test.cpp diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp index 06e0d193d80..aa5b49567e6 100644 --- a/cpp/tests/binaryop/binop-compiled-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-test.cpp @@ -557,7 +557,11 @@ auto NullOp_Result(cudf::column_view lhs, cudf::column_view rhs) 
std::transform(thrust::make_counting_iterator(0), thrust::make_counting_iterator(lhs.size()), result.begin(), - [&lhs_data, &lhs_mask, &rhs_data, &rhs_mask, &result_mask](auto i) -> TypeOut { + [&lhs_data = lhs_data, + &lhs_mask = lhs_mask, + &rhs_data = rhs_data, + &rhs_mask = rhs_mask, + &result_mask = result_mask](auto i) -> TypeOut { auto lhs_valid = lhs_mask.data() and cudf::bit_is_set(lhs_mask.data(), i); auto rhs_valid = rhs_mask.data() and cudf::bit_is_set(rhs_mask.data(), i); bool output_valid = lhs_valid or rhs_valid; diff --git a/cpp/tests/binaryop/util/operation.h b/cpp/tests/binaryop/util/operation.h index d36b48d666a..ef1ccfccab5 100644 --- a/cpp/tests/binaryop/util/operation.h +++ b/cpp/tests/binaryop/util/operation.h @@ -100,7 +100,7 @@ struct Mul { std::enable_if_t<(cudf::is_duration_t::value && std::is_integral_v) || (cudf::is_duration_t::value && std::is_integral_v), void>* = nullptr> - OutT DurationProduct(LhsT x, RhsT y) const + [[nodiscard]] OutT DurationProduct(LhsT x, RhsT y) const { return x * y; } @@ -128,7 +128,7 @@ struct Div { typename LhsT, typename RhsT, std::enable_if_t<(std::is_integral_v || cudf::is_duration()), void>* = nullptr> - OutT DurationDivide(LhsT x, RhsT y) const + [[nodiscard]] OutT DurationDivide(LhsT x, RhsT y) const { return x / y; } diff --git a/cpp/tests/column/column_test.cpp b/cpp/tests/column/column_test.cpp index 14b4197de71..631f5150829 100644 --- a/cpp/tests/column/column_test.cpp +++ b/cpp/tests/column/column_test.cpp @@ -340,7 +340,7 @@ TYPED_TEST(TypedColumnTest, MoveConstructorNoMask) cudf::column moved_to{std::move(original)}; - EXPECT_EQ(0, original.size()); + EXPECT_EQ(0, original.size()); // NOLINT EXPECT_EQ(cudf::data_type{cudf::type_id::EMPTY}, original.type()); verify_column_views(moved_to); @@ -359,7 +359,7 @@ TYPED_TEST(TypedColumnTest, MoveConstructorWithMask) cudf::column moved_to{std::move(original)}; verify_column_views(moved_to); - EXPECT_EQ(0, original.size()); + EXPECT_EQ(0, original.size()); // NOLINT EXPECT_EQ(cudf::data_type{cudf::type_id::EMPTY}, original.type()); // Verify move diff --git a/cpp/tests/copying/slice_tests.cpp b/cpp/tests/copying/slice_tests.cpp index bebd3d25610..aef0d4ad78a 100644 --- a/cpp/tests/copying/slice_tests.cpp +++ b/cpp/tests/copying/slice_tests.cpp @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -370,11 +371,12 @@ TEST_F(SliceStringTableTest, StringWithNulls) auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; - cudf::test::strings_column_wrapper sw[2] = {{strings[0].begin(), strings[0].end(), valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), valids}, + {strings[1].begin(), strings[1].end(), valids}}}; std::vector> scols; scols.push_back(sw[0].release()); diff --git a/cpp/tests/copying/slice_tests.cuh b/cpp/tests/copying/slice_tests.cuh index a180740f143..1e037294527 100644 --- a/cpp/tests/copying/slice_tests.cuh +++ b/cpp/tests/copying/slice_tests.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -148,7 +148,7 @@ std::vector create_expected_tables(cudf::size_type num_cols, } } - result.push_back(cudf::table(std::move(cols))); + result.emplace_back(std::move(cols)); } return result; @@ -163,13 +163,12 @@ inline std::vector create_expected_string_co for (unsigned long index = 0; index < indices.size(); index += 2) { if (not nullable) { - result.push_back(cudf::test::strings_column_wrapper(strings.begin() + indices[index], - strings.begin() + indices[index + 1])); + result.emplace_back(strings.begin() + indices[index], strings.begin() + indices[index + 1]); } else { auto valids = cudf::detail::make_counting_transform_iterator( indices[index], [](auto i) { return i % 2 == 0; }); - result.push_back(cudf::test::strings_column_wrapper( - strings.begin() + indices[index], strings.begin() + indices[index + 1], valids)); + result.emplace_back( + strings.begin() + indices[index], strings.begin() + indices[index + 1], valids); } } @@ -184,16 +183,16 @@ inline std::vector create_expected_string_co std::vector result = {}; for (unsigned long index = 0; index < indices.size(); index += 2) { - result.push_back(cudf::test::strings_column_wrapper(strings.begin() + indices[index], - strings.begin() + indices[index + 1], - validity.begin() + indices[index])); + result.emplace_back(strings.begin() + indices[index], + strings.begin() + indices[index + 1], + validity.begin() + indices[index]); } return result; } inline std::vector create_expected_string_tables( - std::vector const strings[2], + std::vector> const strings, std::vector const& indices, bool nullable) { @@ -216,7 +215,7 @@ inline std::vector create_expected_string_tables( } } - result.push_back(cudf::table(std::move(cols))); + result.emplace_back(std::move(cols)); } return result; diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp index ee3e7da5e0f..b56b0f2d3f8 100644 --- a/cpp/tests/copying/split_tests.cpp +++ b/cpp/tests/copying/split_tests.cpp @@ -35,6 +35,7 @@ #include #include +#include #include #include #include @@ -135,7 +136,7 @@ std::vector create_expected_tables_for_splits( } std::vector create_expected_string_tables_for_splits( - std::vector const strings[2], + std::vector> const strings, std::vector const& splits, bool nullable) { @@ -144,8 +145,8 @@ std::vector create_expected_string_tables_for_splits( } std::vector create_expected_string_tables_for_splits( - std::vector const strings[2], - std::vector const validity[2], + std::vector> const strings, + std::vector> const validity, std::vector const& splits) { std::vector indices = splits_to_indices(splits, strings[0].size()); @@ -627,11 +628,12 @@ void split_string_with_invalids(SplitFunc Split, auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; - cudf::test::strings_column_wrapper sw[2] = {{strings[0].begin(), strings[0].end(), valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), valids}, + {strings[1].begin(), strings[1].end(), 
valids}}}; std::vector> scols; scols.push_back(sw[0].release()); @@ -658,11 +660,12 @@ void split_empty_output_strings_column_value(SplitFunc Split, auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; - cudf::test::strings_column_wrapper sw[2] = {{strings[0].begin(), strings[0].end(), valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), valids}, + {strings[1].begin(), strings[1].end(), valids}}}; std::vector> scols; scols.push_back(sw[0].release()); @@ -684,9 +687,9 @@ void split_null_input_strings_column_value(SplitFunc Split, CompareFunc Compare) auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; std::vector splits{2, 5, 9}; @@ -699,16 +702,17 @@ void split_null_input_strings_column_value(SplitFunc Split, CompareFunc Compare) EXPECT_NO_THROW(Split(empty_table, splits)); } - cudf::test::strings_column_wrapper sw[2] = {{strings[0].begin(), strings[0].end(), no_valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), no_valids}, + {strings[1].begin(), strings[1].end(), valids}}}; std::vector> scols; scols.push_back(sw[0].release()); scols.push_back(sw[1].release()); cudf::table src_table(std::move(scols)); auto result = Split(src_table, splits); - std::vector validity_masks[2] = {std::vector(strings[0].size()), - std::vector(strings[0].size())}; + std::vector> validity_masks{std::vector(strings[0].size()), + std::vector(strings[0].size())}; std::generate( validity_masks[1].begin(), validity_masks[1].end(), [i = 0]() mutable { return i++ % 2 == 0; }); @@ -1913,9 +1917,9 @@ TEST_F(ContiguousSplitTableCornerCases, MixedColumnTypes) cudf::size_type start = 0; auto valids = cudf::detail::make_counting_transform_iterator(start, [](auto i) { return true; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; std::vector> cols; @@ -2377,7 +2381,7 @@ TEST_F(ContiguousSplitTableCornerCases, OutBufferToSmall) { // internally, contiguous split chunks GPU work in 1MB contiguous copies // so the output buffer must be 1MB or larger. 
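The corrected EXPECT_THROW below pins down the contract: chunked_pack::create rejects staging buffers smaller than 1 MB. A sketch of the intended calling pattern, to my understanding of the API (assumes a valid table_view and stream in scope):

#include <cstdint>
#include <cudf/contiguous_split.hpp>
#include <cudf/utilities/span.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

void pack_in_chunks(cudf::table_view const& input, rmm::cuda_stream_view stream)
{
  std::size_t const bounce_size = 1 * 1024 * 1024;  // must be at least 1MB
  rmm::device_buffer bounce(bounce_size, stream);
  auto packer = cudf::chunked_pack::create(input, bounce_size);
  while (packer->has_next()) {
    auto const bytes_copied = packer->next(
      cudf::device_span<uint8_t>(static_cast<uint8_t*>(bounce.data()), bounce_size));
    // ... consume `bytes_copied` bytes from `bounce` here ...
  }
}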
- EXPECT_THROW(cudf::chunked_pack::create({}, 1 * 1024), cudf::logic_error); + EXPECT_THROW(auto _ = cudf::chunked_pack::create({}, 1 * 1024), cudf::logic_error); } TEST_F(ContiguousSplitTableCornerCases, ChunkSpanTooSmall) diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 13577c4d0ea..603edb27c7c 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -196,6 +196,136 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) fixed_width_column_wrapper{0, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps_ns), fixed_width_column_wrapper{766, 424, 623}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1965, 2018, 2023}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1965, 2018, 2023}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1965, 2018, 2023}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1969, 1970, 1970}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MONTH), + fixed_width_column_wrapper{10, 7, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MONTH), + fixed_width_column_wrapper{10, 7, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MONTH), + fixed_width_column_wrapper{10, 7, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MONTH), + fixed_width_column_wrapper{12, 1, 1}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{26, 4, 25}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{26, 4, 25}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{26, 4, 25}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{31, 1, 1}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::HOUR), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + 
*extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::HOUR), + fixed_width_column_wrapper{14, 12, 7}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::HOUR), + fixed_width_column_wrapper{14, 12, 7}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::HOUR), + fixed_width_column_wrapper{23, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{1, 0, 32}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{1, 0, 32}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{59, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{12, 0, 12}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{12, 0, 12}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{59, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{762, 0, 929}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{976, 23, 987}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{675, 432, 234}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + 
*extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{766, 424, 623}); } template diff --git a/cpp/tests/hashing/murmurhash3_x64_128_test.cpp b/cpp/tests/hashing/murmurhash3_x64_128_test.cpp index 4fb8f78b558..0e68050f935 100644 --- a/cpp/tests/hashing/murmurhash3_x64_128_test.cpp +++ b/cpp/tests/hashing/murmurhash3_x64_128_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,8 +22,6 @@ #include -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; - using NumericTypesNoBools = cudf::test::Concat; diff --git a/cpp/tests/hashing/sha256_test.cpp b/cpp/tests/hashing/sha256_test.cpp index cc95c7a2f0f..8bc47c92c6b 100644 --- a/cpp/tests/hashing/sha256_test.cpp +++ b/cpp/tests/hashing/sha256_test.cpp @@ -23,8 +23,6 @@ #include #include -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; - class SHA256HashTest : public cudf::test::BaseFixture {}; TEST_F(SHA256HashTest, EmptyTable) diff --git a/cpp/tests/interop/from_arrow_device_test.cpp b/cpp/tests/interop/from_arrow_device_test.cpp index a4dc7531765..2151ec6e22f 100644 --- a/cpp/tests/interop/from_arrow_device_test.cpp +++ b/cpp/tests/interop/from_arrow_device_test.cpp @@ -270,9 +270,9 @@ TEST_F(FromArrowDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); @@ -414,9 +414,9 @@ TEST_F(FromArrowDeviceTest, DictionaryIndicesType) { std::vector> columns; auto col = cudf::test::fixed_width_column_wrapper({1, 2, 5, 2, 7}, {1, 0, 1, 1, 1}); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); + columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); cudf::table expected_table(std::move(columns)); cudf::table_view expected_table_view = expected_table.view(); diff --git a/cpp/tests/interop/from_arrow_host_test.cpp b/cpp/tests/interop/from_arrow_host_test.cpp index cbfa4911c3c..ef9936b214c 100644 --- a/cpp/tests/interop/from_arrow_host_test.cpp +++ b/cpp/tests/interop/from_arrow_host_test.cpp @@ -309,9 +309,9 @@ TEST_F(FromArrowHostDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + 
{{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 81c406c0faf..6e742b9e4cf 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -52,7 +52,7 @@ std::unique_ptr get_cudf_table() .release()); auto col4 = cudf::test::fixed_width_column_wrapper({1, 2, 5, 2, 7}, {true, false, true, true, true}); - columns.emplace_back(std::move(cudf::dictionary::encode(col4))); + columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper( {true, false, true, false, true}, {true, false, true, true, false}) .release()); @@ -339,9 +339,9 @@ TEST_F(FromArrowTest, DictionaryIndicesType) std::vector> columns; auto col = cudf::test::fixed_width_column_wrapper({1, 2, 5, 2, 7}, {true, false, true, true, true}); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); + columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); cudf::table expected_table(std::move(columns)); diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index a961f73d955..8be7e087b6d 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -256,7 +256,8 @@ std::enable_if_t, nanoarrow::UniqueArray> get_nanoarrow_ ArrowBitmap out; ArrowBitmapInit(&out); NANOARROW_THROW_NOT_OK(ArrowBitmapResize(&out, b.size(), 1)); - std::memset(out.buffer.data, 0, out.buffer.size_bytes); + // TODO: Investigate clang-tidy issue further after nanoarrow is made compliant + std::memset(out.buffer.data, 0, out.buffer.size_bytes); // NOLINT for (size_t i = 0; i < b.size(); ++i) { ArrowBitSetTo(out.buffer.data, i, static_cast(b[i])); diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 51216a8512c..7ba586461dc 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -55,7 +55,7 @@ get_nanoarrow_cudf_table(cudf::size_type length) auto col4 = cudf::test::fixed_width_column_wrapper( test_data.int64_data.begin(), test_data.int64_data.end(), test_data.validity.begin()); auto dict_col = cudf::dictionary::encode(col4); - columns.emplace_back(std::move(cudf::dictionary::encode(col4))); + columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper(test_data.bool_data.begin(), test_data.bool_data.end(), test_data.bool_validity.begin()) @@ -82,8 +82,8 @@ get_nanoarrow_cudf_table(cudf::size_type length) test_data.string_data.begin(), test_data.string_data.end(), test_data.validity.begin()) .release(); vector_of_columns cols; - cols.push_back(move(int_column)); - cols.push_back(move(str_column)); + cols.push_back(std::move(int_column)); + cols.push_back(std::move(str_column)); auto [null_mask, null_count] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( test_data.bool_data_validity.begin(), test_data.bool_data_validity.end())); columns.emplace_back( @@ -575,9 +575,9 @@ TEST_F(ToArrowDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 
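/*
 * The emplace_back changes in these interop tests drop std::move around
 * calls like cudf::dictionary::encode(col): the call already returns a
 * prvalue, so the extra move is redundant and can pessimize copy elision,
 * which is what clang-tidy flags. A minimal before/after sketch:
 *
 *   columns.emplace_back(std::move(cudf::dictionary::encode(col)));  // redundant move
 *   columns.emplace_back(cudf::dictionary::encode(col));             // rvalue binds directly
 *
 * std::move stays where the argument is a named lvalue, as in the
 * cols.push_back(std::move(int_column)) fixes nearby.
 */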
1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); diff --git a/cpp/tests/interop/to_arrow_host_test.cpp b/cpp/tests/interop/to_arrow_host_test.cpp index fc0ed6c9352..fcb4433b42e 100644 --- a/cpp/tests/interop/to_arrow_host_test.cpp +++ b/cpp/tests/interop/to_arrow_host_test.cpp @@ -436,9 +436,9 @@ TEST_F(ToArrowHostDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 90ae12cdd90..a6aa4b22eca 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -90,7 +90,7 @@ std::pair, std::shared_ptr> get_table auto col4 = cudf::test::fixed_width_column_wrapper( int64_data.begin(), int64_data.end(), validity.begin()); auto dict_col = cudf::dictionary::encode(col4); - columns.emplace_back(std::move(cudf::dictionary::encode(col4))); + columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper( bool_data.begin(), bool_data.end(), bool_validity.begin()) .release()); @@ -112,8 +112,8 @@ std::pair, std::shared_ptr> get_table cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin()) .release(); vector_of_columns cols; - cols.push_back(move(int_column)); - cols.push_back(move(str_column)); + cols.push_back(std::move(int_column)); + cols.push_back(std::move(str_column)); auto [null_mask, null_count] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( bool_data_validity.begin(), bool_data_validity.end())); columns.emplace_back( @@ -294,9 +294,9 @@ TEST_F(ToArrowTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); @@ -438,7 +438,7 @@ TEST_F(ToArrowTest, FixedPoint64TableLarge) auto const schema = std::make_shared(schema_vector); auto const expected_arrow_table = arrow::Table::Make(schema, {arr}); - std::vector const metadata = {{"a"}}; + std::vector const metadata = {{"a"}}; // NOLINT ASSERT_TRUE(is_equal(input, metadata, expected_arrow_table)); } } diff --git a/cpp/tests/io/comp/decomp_test.cpp b/cpp/tests/io/comp/decomp_test.cpp index 840cf263ed9..54262dc3b44 100644 --- a/cpp/tests/io/comp/decomp_test.cpp +++ 
b/cpp/tests/io/comp/decomp_test.cpp @@ -39,19 +39,19 @@ using cudf::device_span; */ template struct DecompressTest : public cudf::test::BaseFixture { - std::vector vector_from_string(char const* str) const + [[nodiscard]] std::vector vector_from_string(std::string const str) const { - return std::vector(reinterpret_cast(str), - reinterpret_cast(str) + strlen(str)); + return {reinterpret_cast(str.c_str()), + reinterpret_cast(str.c_str()) + strlen(str.c_str())}; } - void Decompress(std::vector* decompressed, + void Decompress(std::vector& decompressed, uint8_t const* compressed, size_t compressed_size) { auto stream = cudf::get_default_stream(); rmm::device_buffer src{compressed, compressed_size, stream}; - rmm::device_uvector dst{decompressed->size(), stream}; + rmm::device_uvector dst{decompressed.size(), stream}; cudf::detail::hostdevice_vector> inf_in(1, stream); inf_in[0] = {static_cast(src.data()), src.size()}; @@ -67,7 +67,7 @@ struct DecompressTest : public cudf::test::BaseFixture { static_cast(this)->dispatch(inf_in, inf_out, inf_stat); CUDF_CUDA_TRY(cudaMemcpyAsync( - decompressed->data(), dst.data(), dst.size(), cudaMemcpyDefault, stream.value())); + decompressed.data(), dst.data(), dst.size(), cudaMemcpyDefault, stream.value())); inf_stat.device_to_host_sync(stream); ASSERT_EQ(inf_stat[0].status, cudf::io::compression_status::SUCCESS); } @@ -125,49 +125,57 @@ struct NvcompConfigTest : public cudf::test::BaseFixture {}; TEST_F(GzipDecompressTest, HelloWorld) { - constexpr char uncompressed[] = "hello world"; + std::string const uncompressed{"hello world"}; + // NOLINTBEGIN constexpr uint8_t compressed[] = { 0x1f, 0x8b, 0x8, 0x0, 0x9, 0x63, 0x99, 0x5c, 0x2, 0xff, 0xcb, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, 0x2f, 0xca, 0x49, 0x1, 0x0, 0x85, 0x11, 0x4a, 0xd, 0xb, 0x0, 0x0, 0x0}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, compressed, sizeof(compressed)); EXPECT_EQ(output, input); } TEST_F(SnappyDecompressTest, HelloWorld) { - constexpr char uncompressed[] = "hello world"; + std::string const uncompressed{"hello world"}; + // NOLINTBEGIN constexpr uint8_t compressed[] = { 0xb, 0x28, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, compressed, sizeof(compressed)); EXPECT_EQ(output, input); } TEST_F(SnappyDecompressTest, ShortLiteralAfterLongCopyAtStartup) { - constexpr char uncompressed[] = "Aaaaaaaaaaaah!"; + std::string const uncompressed{"Aaaaaaaaaaaah!"}; + // NOLINTBEGIN constexpr uint8_t compressed[] = {14, 0x0, 'A', 0x0, 'a', (10 - 4) * 4 + 1, 1, 0x4, 'h', '!'}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, compressed, sizeof(compressed)); EXPECT_EQ(output, input); } TEST_F(BrotliDecompressTest, HelloWorld) { - constexpr char uncompressed[] = "hello world"; + std::string const uncompressed{"hello world"}; + // NOLINTBEGIN constexpr uint8_t compressed[] = { 0xb, 0x5, 0x80, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x3}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, 
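/*
 * The Decompress refactor above swaps the out-pointer for an out-reference:
 * a reference cannot be null, so the callee needs no nullptr handling and
 * the call sites lose the address-of noise. A sketch of the signature change:
 *
 *   void Decompress(std::vector<uint8_t>* decompressed, ...);  // before: Decompress(&output, ...)
 *   void Decompress(std::vector<uint8_t>& decompressed, ...);  // after:  Decompress(output, ...)
 *
 * The NOLINTBEGIN/NOLINTEND fences confine the clang-tidy suppression to the
 * raw compressed-byte tables instead of annotating every literal.
 */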
compressed, sizeof(compressed)); EXPECT_EQ(output, input); } diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index dc14824d834..b265dcf9273 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -63,9 +63,9 @@ auto dtype() template using column_wrapper = - typename std::conditional, - cudf::test::strings_column_wrapper, - cudf::test::fixed_width_column_wrapper>::type; + std::conditional_t, + cudf::test::strings_column_wrapper, + cudf::test::fixed_width_column_wrapper>; using column = cudf::column; using table = cudf::table; using table_view = cudf::table_view; @@ -954,7 +954,7 @@ TEST_F(CsvReaderTest, Strings) ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( - std::vector{"abc def ghi", "\"jkl mno pqr\"", "stu \"\"vwx\"\" yz"}, + std::vector{"abc def ghi", "\"jkl mno pqr\"", R"(stu ""vwx"" yz)"}, view.column(1)); } @@ -1014,7 +1014,7 @@ TEST_F(CsvReaderTest, StringsQuotesIgnored) ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( - std::vector{"\"abcdef ghi\"", "\"jkl \"\"mno\"\" pqr\"", "stu \"vwx\" yz"}, + std::vector{"\"abcdef ghi\"", R"("jkl ""mno"" pqr")", "stu \"vwx\" yz"}, view.column(1)); } @@ -1830,7 +1830,7 @@ TEST_F(CsvReaderTest, StringsWithWriter) auto int_column = column_wrapper{10, 20, 30}; auto string_column = - column_wrapper{"abc def ghi", "\"jkl mno pqr\"", "stu \"\"vwx\"\" yz"}; + column_wrapper{"abc def ghi", "\"jkl mno pqr\"", R"(stu ""vwx"" yz)"}; cudf::table_view input_table(std::vector{int_column, string_column}); // TODO add quoting style flag? @@ -2516,4 +2516,39 @@ TEST_F(CsvReaderTest, UTF8BOM) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result_view, expected); } +void expect_buffers_equal(cudf::io::datasource::buffer* lhs, cudf::io::datasource::buffer* rhs) +{ + ASSERT_EQ(lhs->size(), rhs->size()); + EXPECT_EQ(0, std::memcmp(lhs->data(), rhs->data(), lhs->size())); +} + +TEST_F(CsvReaderTest, OutOfMapBoundsReads) +{ + // write a lot of data into a file + auto filepath = temp_env->get_temp_dir() + "OutOfMapBoundsReads.csv"; + auto const num_rows = 1 << 20; + auto const row = std::string{"0,1,2,3,4,5,6,7,8,9\n"}; + auto const file_size = num_rows * row.size(); + { + std::ofstream outfile(filepath, std::ofstream::out); + for (size_t i = 0; i < num_rows; ++i) { + outfile << row; + } + } + + // Only memory map the middle of the file + auto source = cudf::io::datasource::create(filepath, file_size / 2, file_size / 4); + auto full_source = cudf::io::datasource::create(filepath); + auto const all_data = source->host_read(0, file_size); + auto ref_data = full_source->host_read(0, file_size); + expect_buffers_equal(ref_data.get(), all_data.get()); + + auto const start_data = source->host_read(file_size / 2, file_size / 2); + expect_buffers_equal(full_source->host_read(file_size / 2, file_size / 2).get(), + start_data.get()); + + auto const end_data = source->host_read(0, file_size / 2 + 512); + expect_buffers_equal(full_source->host_read(0, file_size / 2 + 512).get(), end_data.get()); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 49ad0c408dc..cb6716f4a18 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -68,9 +68,9 @@ auto dtype() template using column_wrapper = - typename std::conditional, - cudf::test::strings_column_wrapper, - cudf::test::fixed_width_column_wrapper>::type; + std::conditional_t, + cudf::test::strings_column_wrapper, + 
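/*
 * OutOfMapBoundsReads above pins down the contract of the bounded overload
 * invoked as cudf::io::datasource::create(filepath, offset, size): only a
 * window of the file is memory-mapped, yet host_read ranges that start
 * before, extend past, or span that window must return the same bytes as an
 * unbounded datasource, which the test checks against a full-file reference
 * source with byte-wise comparison. A minimal sketch, assuming a
 * hypothetical 1 MiB file:
 *
 *   auto src = cudf::io::datasource::create(path, 512 << 10, 256 << 10);  // map the middle
 *   auto buf = src->host_read(0, 4096);  // before the mapped window: still valid
 */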
cudf::test::fixed_width_column_wrapper>; cudf::test::TempDirTestEnvironment* const temp_env = static_cast( diff --git a/cpp/tests/io/json/json_writer.cpp b/cpp/tests/io/json/json_writer.cpp index 2c4e29a01b9..39d31c406a5 100644 --- a/cpp/tests/io/json/json_writer.cpp +++ b/cpp/tests/io/json/json_writer.cpp @@ -70,6 +70,43 @@ TEST_F(JsonWriterTest, EmptyInput) EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); } +TEST_F(JsonWriterTest, EmptyLeaf) +{ + cudf::test::strings_column_wrapper col1{""}; + cudf::test::fixed_width_column_wrapper offsets{0, 0}; + auto col2 = make_lists_column(1, + offsets.release(), + cudf::test::strings_column_wrapper{}.release(), + 0, + rmm::device_buffer{}, + cudf::test::get_default_stream()); + auto col3 = cudf::test::lists_column_wrapper::make_one_empty_row_column(); + cudf::table_view tbl_view{{col1, *col2, col3}}; + cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"col3"}}}; + + std::vector out_buffer; + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(true) + .metadata(mt) + .lines(false) + .na_rep("null") + .build(); + + // Empty columns in table + cudf::io::write_json(out_options, cudf::test::get_default_stream()); + std::string const expected = R"([{"col1":"","col2":[],"col3":[]}])"; + EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + + // Empty columns in table - JSON Lines + out_buffer.clear(); + out_options.enable_lines(true); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); + std::string const expected_lines = R"({"col1":"","col2":[],"col3":[]})" + "\n"; + EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); +} + TEST_F(JsonWriterTest, ErrorCases) { cudf::test::strings_column_wrapper col1{"a", "b", "c"}; diff --git a/cpp/tests/io/metadata_utilities.cpp b/cpp/tests/io/metadata_utilities.cpp index 84f04f67038..380d66c53f9 100644 --- a/cpp/tests/io/metadata_utilities.cpp +++ b/cpp/tests/io/metadata_utilities.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,10 +14,9 @@ * limitations under the License. 
*/ +#include #include -#include - namespace cudf::test { void expect_metadata_equal(cudf::io::table_input_metadata in_meta, diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 8ad1fea649d..5f1aea71f73 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1358,10 +1358,11 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) int64_t constexpr total_rows = num_rows * num_reps; static_assert(total_rows > std::numeric_limits::max()); - auto const it = cudf::detail::make_counting_transform_iterator(0l, [num_rows](int64_t i) { - return (i % num_rows) % static_cast(std::numeric_limits::max() / 2); - }); - auto const col = data_col(it, it + num_rows); + auto const it = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [num_rows](int64_t i) { + return (i % num_rows) % static_cast(std::numeric_limits::max() / 2); + }); + auto const col = data_col(it, it + num_rows); auto const chunk_table = cudf::table_view{{col}}; std::vector data_buffer; diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 89e704f3ed3..cce0adbf317 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -43,9 +43,9 @@ template using column_wrapper = - typename std::conditional, - cudf::test::strings_column_wrapper, - cudf::test::fixed_width_column_wrapper>::type; + std::conditional_t, + cudf::test::strings_column_wrapper, + cudf::test::fixed_width_column_wrapper>; using str_col = column_wrapper; using bool_col = column_wrapper; @@ -1358,21 +1358,22 @@ TEST_P(OrcWriterTestStripes, StripeSize) cols.push_back(col.release()); auto const expected = std::make_unique
(std::move(cols)); - auto validate = [&](std::vector const& orc_buffer) { - auto const expected_stripe_num = - std::max(num_rows / size_rows, (num_rows * sizeof(int64_t)) / size_bytes); - auto const stats = cudf::io::read_parsed_orc_statistics( - cudf::io::source_info(orc_buffer.data(), orc_buffer.size())); - EXPECT_EQ(stats.stripes_stats.size(), expected_stripe_num); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder( - cudf::io::source_info(orc_buffer.data(), orc_buffer.size())) - .use_index(false); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - }; + auto validate = + [&, &size_bytes = size_bytes, &size_rows = size_rows](std::vector const& orc_buffer) { + auto const expected_stripe_num = + std::max(num_rows / size_rows, (num_rows * sizeof(int64_t)) / size_bytes); + auto const stats = cudf::io::read_parsed_orc_statistics( + cudf::io::source_info(orc_buffer.data(), orc_buffer.size())); + EXPECT_EQ(stats.stripes_stats.size(), expected_stripe_num); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder( + cudf::io::source_info(orc_buffer.data(), orc_buffer.size())) + .use_index(false); + auto result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + }; { std::vector out_buffer_chunked; diff --git a/cpp/tests/io/parquet_common.cpp b/cpp/tests/io/parquet_common.cpp index 6141a40bc95..a1b8677eac8 100644 --- a/cpp/tests/io/parquet_common.cpp +++ b/cpp/tests/io/parquet_common.cpp @@ -744,7 +744,7 @@ int32_t compare(T& v1, T& v2) int32_t compare_binary(std::vector const& v1, std::vector const& v2, cudf::io::parquet::detail::Type ptype, - cuda::std::optional const& ctype) + std::optional const& ctype) { auto ctype_val = ctype.value_or(cudf::io::parquet::detail::UNKNOWN); switch (ptype) { diff --git a/cpp/tests/io/parquet_common.hpp b/cpp/tests/io/parquet_common.hpp index bd1579eaa1b..c90b81ed27a 100644 --- a/cpp/tests/io/parquet_common.hpp +++ b/cpp/tests/io/parquet_common.hpp @@ -172,7 +172,7 @@ std::pair create_parquet_typed_with_stats(std::string int32_t compare_binary(std::vector const& v1, std::vector const& v2, cudf::io::parquet::detail::Type ptype, - cuda::std::optional const& ctype); + std::optional const& ctype); void expect_compression_stats_empty(std::shared_ptr stats); diff --git a/cpp/tests/io/parquet_misc_test.cpp b/cpp/tests/io/parquet_misc_test.cpp index 8b03e94191e..f1286a00d22 100644 --- a/cpp/tests/io/parquet_misc_test.cpp +++ b/cpp/tests/io/parquet_misc_test.cpp @@ -98,7 +98,7 @@ TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaListSliced) // list constexpr int vals_per_row = 4; auto c1_offset_iter = cudf::detail::make_counting_transform_iterator( - 0, [vals_per_row](cudf::size_type idx) { return idx * vals_per_row; }); + 0, [](cudf::size_type idx) { return idx * vals_per_row; }); cudf::test::fixed_width_column_wrapper c1_offsets(c1_offset_iter, c1_offset_iter + num_rows + 1); cudf::test::fixed_width_column_wrapper c1_vals( diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index dc8e68b3a15..7986a3c6d70 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -1189,15 +1189,12 @@ TEST_F(ParquetReaderTest, NestingOptimizationTest) cudf::test::fixed_width_column_wrapper values(value_iter, value_iter + num_values, validity); // ~256k values with num_nesting_levels = 16 - int total_values_produced = num_values; - auto 
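/*
 * In the ORC StripeSize test above, the validate lambda trades a blanket
 * [&] for init-captures ([&, &size_bytes = size_bytes, &size_rows =
 * size_rows]). The workaround exists because size_bytes/size_rows evidently
 * come from a structured binding, which lambdas may not capture implicitly
 * before C++20; an init-capture introduces an ordinary same-named reference
 * that is fine in C++17. A minimal sketch with hypothetical names:
 *
 *   auto [bytes, rows] = params;                             // structured binding
 *   auto check = [&, &bytes = bytes] { return bytes > 0; };  // OK pre-C++20
 */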
prev_col = values.release(); + auto prev_col = values.release(); for (int idx = 0; idx < num_nesting_levels; idx++) { - auto const depth = num_nesting_levels - idx; auto const num_rows = (1 << (num_nesting_levels - idx)); auto offsets_iter = cudf::detail::make_counting_transform_iterator( - 0, [depth, rows_per_level](cudf::size_type i) { return i * rows_per_level; }); - total_values_produced += (num_rows + 1); + 0, [](cudf::size_type i) { return i * rows_per_level; }); cudf::test::fixed_width_column_wrapper offsets(offsets_iter, offsets_iter + num_rows + 1); @@ -2727,3 +2724,40 @@ TYPED_TEST(ParquetReaderPredicatePushdownTest, FilterTyped) EXPECT_EQ(result_table.num_columns(), expected->num_columns()); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result_table); } + +// The test below requires several minutes to complete with memcheck, thus it is disabled by +// default. +TEST_F(ParquetReaderTest, DISABLED_ListsWideTable) +{ + auto constexpr num_rows = 2; + auto constexpr num_cols = 26'755; // for slightly over 2B keys + auto constexpr seed = 0xceed; + + std::mt19937 engine{seed}; + + auto list_list = make_parquet_list_list_col(0, num_rows, 1, 1, false); + auto list_list_nulls = make_parquet_list_list_col(0, num_rows, 1, 1, true); + + // switch between nullable and non-nullable + std::vector cols(num_cols); + bool with_nulls = false; + std::generate_n(cols.begin(), num_cols, [&]() { + auto const view = with_nulls ? list_list_nulls->view() : list_list->view(); + with_nulls = not with_nulls; + return view; + }); + + cudf::table_view expected(cols); + + // Use a host buffer for faster I/O + std::vector buffer; + auto const out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&buffer}, expected).build(); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options default_in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info(buffer.data(), buffer.size())); + auto const [result, _] = cudf::io::read_parquet(default_in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result->view()); +} diff --git a/cpp/tests/io/parquet_v2_test.cpp b/cpp/tests/io/parquet_v2_test.cpp index 7c305235ea6..a0b48f54854 100644 --- a/cpp/tests/io/parquet_v2_test.cpp +++ b/cpp/tests/io/parquet_v2_test.cpp @@ -1302,24 +1302,24 @@ TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) table_view expected({col0, col1, col2, col3, col4, col5, col6, col7}); std::array expected_null_counts{4, 4, 4, 6, 4, 6, 4, 5, 11}; - std::vector const expected_def_hists[] = {{1, 1, 2, 3}, - {1, 3, 10}, - {1, 1, 2, 10}, - {1, 1, 2, 2, 8}, - {1, 1, 1, 1, 10}, - {1, 1, 1, 1, 2, 8}, - {1, 3, 9}, - {1, 3, 1, 8}, - {1, 0, 4, 1, 1, 4, 9}}; - std::vector const expected_rep_hists[] = {{4, 3}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 5}, - {4, 4, 5}, - {4, 6, 2, 8}}; + std::vector> const expected_def_hists = {{1, 1, 2, 3}, + {1, 3, 10}, + {1, 1, 2, 10}, + {1, 1, 2, 2, 8}, + {1, 1, 1, 1, 10}, + {1, 1, 1, 1, 2, 8}, + {1, 3, 9}, + {1, 3, 1, 8}, + {1, 0, 4, 1, 1, 4, 9}}; + std::vector> const expected_rep_hists = {{4, 3}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 5}, + {4, 4, 5}, + {4, 6, 2, 8}}; auto const filepath = temp_env->get_temp_filepath("ColumnIndexListWithNulls.parquet"); auto out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 8794f2ee304..6c5e9cdf07a 100644 --- 
a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -290,7 +290,8 @@ class custom_test_data_sink : public cudf::io::data_sink { CUDF_EXPECTS(outfile_.is_open(), "Cannot open output file"); } - ~custom_test_data_sink() override { flush(); } + // Marked as NOLINT because we are calling a virtual method in the destructor + ~custom_test_data_sink() override { flush(); } // NOLINT void host_write(void const* data, size_t size) override { @@ -981,13 +982,15 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation) { - std::vector truncated_min[] = {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe}, - {0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; + std::array, 3> truncated_min{ + {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe}, + {0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}}; - std::vector truncated_max[] = {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xff}, - {0xff}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; + std::array, 3> truncated_max{ + {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xff}, + {0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}}; cudf::test::lists_column_wrapper col0{ {0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe}}; diff --git a/cpp/tests/join/distinct_join_tests.cpp b/cpp/tests/join/distinct_join_tests.cpp index 93754091b3f..178edc52dd3 100644 --- a/cpp/tests/join/distinct_join_tests.cpp +++ b/cpp/tests/join/distinct_join_tests.cpp @@ -314,7 +314,7 @@ TEST_F(DistinctJoinTest, EmptyBuildTableLeftJoin) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; this->compare_to_reference( build.view(), probe.view(), gather_map, probe.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -362,7 +362,7 @@ TEST_F(DistinctJoinTest, EmptyProbeTableLeftJoin) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; this->compare_to_reference( build.view(), probe.view(), gather_map, probe.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -398,7 +398,7 @@ TEST_F(DistinctJoinTest, LeftJoinNoNulls) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; this->compare_to_reference( build.view(), probe.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -423,7 +423,7 @@ TEST_F(DistinctJoinTest, LeftJoinWithNulls) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; column_wrapper col_gold_0{{3, 1, 2, 0, 2}, {true, true, true, true, true}}; strcol_wrapper col_gold_1({"s1", "s1", "", "s4", "s0"}, {true, 
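/*
 * Two clang-tidy-driven patterns appear above. The data-sink destructor
 * keeps its flush() call under a NOLINT: a virtual call in a destructor
 * dispatches to the class currently being destroyed, which is intended here
 * but suspicious in general. And C arrays of std::vector become std::array,
 * making the element count part of the type. A minimal sketch with
 * hypothetical values:
 *
 *   std::array<std::vector<uint8_t>, 2> mins{{{0xfe}, {0xff}}};
 *   static_assert(mins.size() == 2);  // count checked at compile time
 */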
true, false, true, true}); @@ -468,7 +468,7 @@ TEST_F(DistinctJoinTest, LeftJoinWithStructsAndNulls) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; auto col0_gold_names_col = strcol_wrapper{ "Samuel Vimes", "Detritus", "Carrot Ironfoundersson", "Samuel Vimes", "Angua von Überwald"}; diff --git a/cpp/tests/large_strings/large_strings_fixture.cpp b/cpp/tests/large_strings/large_strings_fixture.cpp index 249319da7f7..7b61be113f9 100644 --- a/cpp/tests/large_strings/large_strings_fixture.cpp +++ b/cpp/tests/large_strings/large_strings_fixture.cpp @@ -123,12 +123,9 @@ LargeStringsData* StringsLargeTest::g_ls_data = nullptr; int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - auto const cmd_opts = parse_cudf_test_opts(argc, argv); - // hardcoding the CUDA memory resource to keep from exceeding the pool - auto mr = cudf::test::make_cuda(); - cudf::set_current_device_resource(mr.get()); - auto adaptor = make_stream_mode_adaptor(cmd_opts); - + cudf::test::config config; + config.rmm_mode = "cuda"; + init_cudf_test(argc, argv, config); // create object to automatically be destroyed at the end of main() auto lsd = cudf::test::StringsLargeTest::get_ls_data(); diff --git a/cpp/tests/merge/merge_string_test.cpp b/cpp/tests/merge/merge_string_test.cpp index 97979e79010..bea044496b3 100644 --- a/cpp/tests/merge/merge_string_test.cpp +++ b/cpp/tests/merge/merge_string_test.cpp @@ -97,7 +97,7 @@ TYPED_TEST(MergeStringTest, Merge1StringKeyColumns) "hi", "hj"}); - auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [outputRows](auto row) { + auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) return 0; else @@ -296,7 +296,7 @@ TYPED_TEST(MergeStringTest, Merge1StringKeyNullColumns) true, false, false}); - auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [outputRows](auto row) { + auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) return 0; else diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp index 2e09f25b51f..6208d395f0a 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -349,7 +349,7 @@ TYPED_TEST(MergeTest_, Merge1KeyColumns) cudf::test::fixed_width_column_wrapper expectedDataWrap1(seq_out1, seq_out1 + outputRows); - auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [outputRows](auto row) { + auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) return 0; else @@ -452,7 +452,7 @@ TYPED_TEST(MergeTest_, Merge1KeyNullColumns) cudf::size_type inputRows = 40; // data: 0 2 4 6 | valid: 1 1 1 0 - auto sequence1 = cudf::detail::make_counting_transform_iterator(0, [inputRows](auto row) { + auto sequence1 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) { return 0; // <- no shortcut to this can avoid compiler errors } else { @@ -465,7 +465,7 @@ TYPED_TEST(MergeTest_, Merge1KeyNullColumns) leftColWrap1(sequence1, sequence1 + inputRows, valid_sequence1); // data: 1 3 5 7 | valid: 1 1 1 0 - auto sequence2 = 
cudf::detail::make_counting_transform_iterator(0, [inputRows](auto row) { + auto sequence2 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) { return 1; } else diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index 1e9e13ded93..bdb98372836 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -33,8 +33,12 @@ #include #include +#include #include +#include +#include +#include #include using aggregation = cudf::aggregation; @@ -765,6 +769,25 @@ TYPED_TEST(MultiStepReductionTest, Mean) expected_value_nulls); } +template +double calc_var(std::vector const& v, int ddof, std::vector const& mask = {}) +{ + auto const values = [&]() { + if (mask.empty()) { return v; } + std::vector masked{}; + thrust::copy_if( + v.begin(), v.end(), mask.begin(), std::back_inserter(masked), [](auto m) { return m; }); + return masked; + }(); + auto const valid_count = values.size(); + double const mean = std::accumulate(values.cbegin(), values.cend(), double{0}) / valid_count; + double const sq_sum_of_differences = + std::accumulate(values.cbegin(), values.cend(), double{0}, [mean](double acc, auto const v) { + return acc + std::pow(v - mean, 2); + }); + return sq_sum_of_differences / (valid_count - ddof); +} + // This test is disabled for only a Debug build because a compiler error // documented in cpp/src/reductions/std.cu and cpp/src/reductions/var.cu #ifdef NDEBUG @@ -777,25 +800,12 @@ TYPED_TEST(MultiStepReductionTest, DISABLED_var_std) std::vector int_values({-3, 2, 1, 0, 5, -3, -2, 28}); std::vector host_bools({true, true, false, true, true, true, false, true}); - auto calc_var = [](std::vector& v, cudf::size_type valid_count, int ddof) { - double mean = std::accumulate(v.begin(), v.end(), double{0}); - mean /= valid_count; - - double sum_of_sq = std::accumulate( - v.begin(), v.end(), double{0}, [](double acc, TypeParam i) { return acc + i * i; }); - - cudf::size_type div = valid_count - ddof; - - double var = sum_of_sq / div - ((mean * mean) * valid_count) / div; - return var; - }; - // test without nulls std::vector v = convert_values(int_values); cudf::test::fixed_width_column_wrapper col(v.begin(), v.end()); auto const ddof = 1; - double var = calc_var(v, v.size(), ddof); + double var = calc_var(v, ddof); double std = std::sqrt(var); auto var_agg = cudf::make_variance_aggregation(ddof); auto std_agg = cudf::make_std_aggregation(ddof); @@ -811,23 +821,19 @@ TYPED_TEST(MultiStepReductionTest, DISABLED_var_std) // test with nulls cudf::test::fixed_width_column_wrapper col_nulls = construct_null_column(v, host_bools); - cudf::size_type valid_count = - cudf::column_view(col_nulls).size() - cudf::column_view(col_nulls).null_count(); - auto replaced_array = replace_nulls(v, host_bools, T{0}); - - double var_nulls = calc_var(replaced_array, valid_count, ddof); - double std_nulls = std::sqrt(var_nulls); + double var_nulls = calc_var(v, ddof, host_bools); + double std_nulls = std::sqrt(var_nulls); - EXPECT_EQ(this - ->template reduction_test( - col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - var_nulls); - EXPECT_EQ(this - ->template reduction_test( - col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - std_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + var_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + 
col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + std_nulls); } // ---------------------------------------------------------------------------- @@ -1139,23 +1145,10 @@ TEST_P(ReductionParamTest, DISABLED_std_var) std::vector int_values({-3, 2, 1, 0, 5, -3, -2, 28}); std::vector host_bools({true, true, false, true, true, true, false, true}); - auto calc_var = [ddof](std::vector& v, cudf::size_type valid_count) { - double mean = std::accumulate(v.begin(), v.end(), double{0}); - mean /= valid_count; - - double sum_of_sq = std::accumulate( - v.begin(), v.end(), double{0}, [](double acc, double i) { return acc + i * i; }); - - cudf::size_type div = valid_count - ddof; - - double var = sum_of_sq / div - ((mean * mean) * valid_count) / div; - return var; - }; - // test without nulls cudf::test::fixed_width_column_wrapper col(int_values.begin(), int_values.end()); - double var = calc_var(int_values, int_values.size()); + double var = calc_var(int_values, ddof); double std = std::sqrt(var); auto var_agg = cudf::make_variance_aggregation(ddof); auto std_agg = cudf::make_std_aggregation(ddof); @@ -1172,23 +1165,19 @@ TEST_P(ReductionParamTest, DISABLED_std_var) // test with nulls cudf::test::fixed_width_column_wrapper col_nulls = construct_null_column(int_values, host_bools); - cudf::size_type valid_count = - cudf::column_view(col_nulls).size() - cudf::column_view(col_nulls).null_count(); - auto replaced_array = replace_nulls(int_values, host_bools, int{0}); - - double var_nulls = calc_var(replaced_array, valid_count); + double var_nulls = calc_var(int_values, ddof, host_bools); double std_nulls = std::sqrt(var_nulls); - EXPECT_EQ(this - ->template reduction_test( - col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - var_nulls); - EXPECT_EQ(this - ->template reduction_test( - col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - std_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + var_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + std_nulls); } //------------------------------------------------------------------- @@ -2471,21 +2460,11 @@ TYPED_TEST(DictionaryReductionTest, DISABLED_VarStd) std::vector v = convert_values(int_values); cudf::data_type output_type{cudf::type_to_id()}; - auto calc_var = [](std::vector const& v, cudf::size_type valid_count, cudf::size_type ddof) { - double mean = std::accumulate(v.cbegin(), v.cend(), double{0}); - mean /= valid_count; - double sum_of_sq = std::accumulate( - v.cbegin(), v.cend(), double{0}, [](double acc, TypeParam i) { return acc + i * i; }); - auto const div = valid_count - ddof; - double var = sum_of_sq / div - ((mean * mean) * valid_count) / div; - return var; - }; - // test without nulls cudf::test::dictionary_column_wrapper col(v.begin(), v.end()); cudf::size_type const ddof = 1; - double var = calc_var(v, v.size(), ddof); + double var = calc_var(v, ddof); double std = std::sqrt(var); auto var_agg = cudf::make_variance_aggregation(ddof); auto std_agg = cudf::make_std_aggregation(ddof); @@ -2497,15 +2476,13 @@ TYPED_TEST(DictionaryReductionTest, DISABLED_VarStd) std::vector validity({true, true, false, true, true, true, false, true}); cudf::test::dictionary_column_wrapper col_nulls(v.begin(), v.end(), validity.begin()); - cudf::size_type const valid_count = std::count(validity.begin(), validity.end(), true); - - 
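/*
 * The consolidated calc_var above implements the textbook estimator
 * var = sum((v_i - mean)^2) / (n - ddof) over the mask-filtered values,
 * replacing three local lambdas that used the cancellation-prone
 * sum-of-squares shortcut. Worked numbers for the shared input
 * {-3, 2, 1, 0, 5, -3, -2, 28} with ddof = 1 and no mask:
 *
 *   mean = 28 / 8 = 3.5
 *   var  = 738 / 7 = 105.428...
 *
 * The move from EXPECT_EQ to EXPECT_DOUBLE_EQ matches: gtest then compares
 * within 4 ULPs rather than bit-for-bit, the right tolerance once the host
 * reference sums in a different order than the device reduction.
 */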
double var_nulls = calc_var(replace_nulls(v, validity, T{0}), valid_count, ddof); + double var_nulls = calc_var(v, ddof, validity); double std_nulls = std::sqrt(var_nulls); - EXPECT_EQ(this->template reduction_test(col_nulls, *var_agg, output_type).first, - var_nulls); - EXPECT_EQ(this->template reduction_test(col_nulls, *std_agg, output_type).first, - std_nulls); + EXPECT_DOUBLE_EQ(this->template reduction_test(col_nulls, *var_agg, output_type).first, + var_nulls); + EXPECT_DOUBLE_EQ(this->template reduction_test(col_nulls, *std_agg, output_type).first, + std_nulls); } TYPED_TEST(DictionaryReductionTest, NthElement) diff --git a/cpp/tests/reductions/segmented_reduction_tests.cpp b/cpp/tests/reductions/segmented_reduction_tests.cpp index 19996f827cf..bc0321bd40a 100644 --- a/cpp/tests/reductions/segmented_reduction_tests.cpp +++ b/cpp/tests/reductions/segmented_reduction_tests.cpp @@ -1092,11 +1092,10 @@ TEST_F(SegmentedReductionTestUntyped, EmptyInputWithOffsets) auto aggregates = std::vector>>(); - aggregates.push_back(std::move(cudf::make_max_aggregation())); - aggregates.push_back(std::move(cudf::make_min_aggregation())); - aggregates.push_back(std::move(cudf::make_sum_aggregation())); - aggregates.push_back( - std::move(cudf::make_product_aggregation())); + aggregates.push_back(cudf::make_max_aggregation()); + aggregates.push_back(cudf::make_min_aggregation()); + aggregates.push_back(cudf::make_sum_aggregation()); + aggregates.push_back(cudf::make_product_aggregation()); auto output_type = cudf::data_type{cudf::type_to_id()}; for (auto&& agg : aggregates) { diff --git a/cpp/tests/replace/replace_tests.cpp b/cpp/tests/replace/replace_tests.cpp index 1858cd7782e..b12bf08520f 100644 --- a/cpp/tests/replace/replace_tests.cpp +++ b/cpp/tests/replace/replace_tests.cpp @@ -356,7 +356,7 @@ void test_replace(cudf::host_span input_column, for (size_t i = 0; i < values_to_replace_column.size(); i++) { size_t k = 0; - auto pred = [=, &k, &reference_result, &expected_valid, &isReplaced](T element) { + auto pred = [=, &k, &expected_valid, &isReplaced](T element) { bool toBeReplaced = false; if (!isReplaced[k]) { if (!input_has_nulls || expected_valid[k]) { @@ -503,7 +503,7 @@ TYPED_TEST(ReplaceTest, LargeScaleReplaceTest) const size_t REPLACE_SIZE = 10000; thrust::host_vector input_column(DATA_SIZE); - std::generate(std::begin(input_column), std::end(input_column), [REPLACE_SIZE]() { + std::generate(std::begin(input_column), std::end(input_column), []() { return std::rand() % (REPLACE_SIZE); }); diff --git a/cpp/tests/rolling/collect_ops_test.cpp b/cpp/tests/rolling/collect_ops_test.cpp index f702dc78371..165e0347785 100644 --- a/cpp/tests/rolling/collect_ops_test.cpp +++ b/cpp/tests/rolling/collect_ops_test.cpp @@ -214,7 +214,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowHonoursMinPeriods) *cudf::make_collect_list_aggregation()); auto expected_result_2 = cudf::test::lists_column_wrapper{ {{}, {0, 1, 2, 3}, {1, 2, 3, 4}, {2, 3, 4, 5}, {}, {}}, - cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) { + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0 && i < 4; })}.release(); @@ -338,7 +338,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods) cudf::test::fixed_width_column_wrapper{0, 0, 4, 8, 12, 12, 12}.release(); auto expected_num_rows = expected_offsets->size() - 1; auto null_mask_iter = cudf::detail::make_counting_transform_iterator( - cudf::size_type{0}, [expected_num_rows](auto i) { return i > 0 && i < 4; }); + 
cudf::size_type{0}, [](auto i) { return i > 0 && i < 4; }); auto [null_mask, null_count] = cudf::test::detail::make_null_mask(null_mask_iter, null_mask_iter + expected_num_rows); @@ -373,7 +373,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods) cudf::test::fixed_width_column_wrapper{0, 0, 3, 5, 8, 8, 8}.release(); auto expected_num_rows = expected_offsets->size() - 1; auto null_mask_iter = cudf::detail::make_counting_transform_iterator( - cudf::size_type{0}, [expected_num_rows](auto i) { return i > 0 && i < 4; }); + cudf::size_type{0}, [](auto i) { return i > 0 && i < 4; }); auto [null_mask, null_count] = cudf::test::detail::make_null_mask(null_mask_iter, null_mask_iter + expected_num_rows); @@ -1499,7 +1499,7 @@ TYPED_TEST(TypedCollectSetTest, RollingWindowHonoursMinPeriods) *cudf::make_collect_set_aggregation()); auto expected_result_2 = cudf::test::lists_column_wrapper{ {{}, {0, 1, 2}, {1, 2, 4}, {2, 4, 5}, {}, {}}, - cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) { + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0 && i < 4; })}.release(); diff --git a/cpp/tests/rolling/offset_row_window_test.cpp b/cpp/tests/rolling/offset_row_window_test.cpp index ec726878b34..0eaab0c9f7a 100644 --- a/cpp/tests/rolling/offset_row_window_test.cpp +++ b/cpp/tests/rolling/offset_row_window_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,6 +41,11 @@ using cudf::test::iterators::nulls_at; auto constexpr null = int32_t{0}; // NULL representation for int32_t; +// clang-tidy doesn't think std::transform can handle a +// thrust::constant_iterator, so this is a workaround that uses nulls_at +// instead of no_nulls +auto no_nulls_list() { return nulls_at({}); } + struct OffsetRowWindowTest : public cudf::test::BaseFixture { static ints_column const _keys; // {0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; static ints_column const _values; // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; @@ -210,7 +215,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Grouped_0_to_2) CUDF_TEST_EXPECT_COLUMNS_EQUAL( *run_rolling(*AGG_COLLECT_LIST), - lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5}, {}, {7, 8}, {8, 9}, {9}, {}}, no_nulls}); + lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5}, {}, {7, 8}, {8, 9}, {9}, {}}, + no_nulls_list()}); } TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_0_to_2) @@ -250,7 +256,7 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_0_to_2) CUDF_TEST_EXPECT_COLUMNS_EQUAL( *run_rolling(*AGG_COLLECT_LIST), lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9}, {}}, - no_nulls}); + no_nulls_list()}); } // To test that preceding bounds are clamped correctly at group boundaries. 
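A note on the no_nulls_list() helper introduced above: cudf::test::iterators::nulls_at(...) builds a validity iterator that marks exactly the given row indices as null, so an empty index list yields an "every row valid" iterator, behaviorally equivalent to the no_nulls helper that trips the clang-tidy false positive. A minimal sketch of the substitution:

  using cudf::test::iterators::nulls_at;
  auto all_valid = nulls_at({});  // empty index list -> no row is null
  lists_column expected{{{1, 2}, {2, 3}}, all_valid};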
diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp index c2c22986975..6e0dc16dca9 100644 --- a/cpp/tests/rolling/rolling_test.cpp +++ b/cpp/tests/rolling/rolling_test.cpp @@ -541,7 +541,7 @@ class RollingTest : public cudf::test::BaseFixture { agg_op op; for (cudf::size_type i = 0; i < num_rows; i++) { - OutputType val = agg_op::template identity(); + auto val = agg_op::template identity(); // load sizes min_periods = std::max(min_periods, 1); // at least one observation is required diff --git a/cpp/tests/scalar/scalar_test.cpp b/cpp/tests/scalar/scalar_test.cpp index 2d37de920d5..2b79911a95a 100644 --- a/cpp/tests/scalar/scalar_test.cpp +++ b/cpp/tests/scalar/scalar_test.cpp @@ -190,7 +190,7 @@ TEST_F(ListScalarTest, MoveConstructorNonNested) EXPECT_EQ(mask_ptr, s2.validity_data()); EXPECT_EQ(data_ptr, s2.view().data()); - EXPECT_EQ(s.view().data(), nullptr); + EXPECT_EQ(s.view().data(), nullptr); // NOLINT } TEST_F(ListScalarTest, MoveConstructorNested) @@ -205,8 +205,8 @@ TEST_F(ListScalarTest, MoveConstructorNested) EXPECT_EQ(mask_ptr, s2.validity_data()); EXPECT_EQ(offset_ptr, s2.view().child(0).data()); EXPECT_EQ(data_ptr, s2.view().child(1).data()); - EXPECT_EQ(s.view().data(), nullptr); - EXPECT_EQ(s.view().num_children(), 0); + EXPECT_EQ(s.view().data(), nullptr); // NOLINT + EXPECT_EQ(s.view().num_children(), 0); // NOLINT } struct StructScalarTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/search/search_list_test.cpp b/cpp/tests/search/search_list_test.cpp index 48711c21715..7584003e800 100644 --- a/cpp/tests/search/search_list_test.cpp +++ b/cpp/tests/search/search_list_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -35,7 +35,6 @@ using strings_col = cudf::test::strings_column_wrapper; constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::FIRST_ERROR}; constexpr int32_t null{0}; // Mark for null child elements at the current level -constexpr int32_t XXX{0}; // Mark for null elements at all levels using TestTypes = cudf::test::Concat> grand_child; - grand_child.push_back(std::move(col4.release())); + grand_child.push_back(col4.release()); auto child_col_2 = cudf::make_structs_column(6, std::move(grand_child), 0, rmm::device_buffer{}); child_columns2.push_back(std::move(child_col_2)); auto struct_col3 = diff --git a/cpp/tests/stream_compaction/unique_tests.cpp b/cpp/tests/stream_compaction/unique_tests.cpp index 4d7d23dc881..d5b6915b520 100644 --- a/cpp/tests/stream_compaction/unique_tests.cpp +++ b/cpp/tests/stream_compaction/unique_tests.cpp @@ -43,7 +43,6 @@ auto constexpr KEEP_ANY = cudf::duplicate_keep_option::KEEP_ANY; auto constexpr KEEP_FIRST = cudf::duplicate_keep_option::KEEP_FIRST; auto constexpr KEEP_LAST = cudf::duplicate_keep_option::KEEP_LAST; auto constexpr KEEP_NONE = cudf::duplicate_keep_option::KEEP_NONE; -auto constexpr NULL_EQUAL = cudf::null_equality::EQUAL; auto constexpr NULL_UNEQUAL = cudf::null_equality::UNEQUAL; using int32s_col = cudf::test::fixed_width_column_wrapper; diff --git a/cpp/tests/streams/stream_compaction_test.cpp b/cpp/tests/streams/stream_compaction_test.cpp index 443f4548b2c..07b2d77cc04 100644 --- a/cpp/tests/streams/stream_compaction_test.cpp +++ b/cpp/tests/streams/stream_compaction_test.cpp @@ -29,8 +29,6 @@ #include -auto constexpr null{0}; // null at current level -auto constexpr XXX{0}; // null pushed down from parent level auto constexpr NaN = std::numeric_limits::quiet_NaN(); auto constexpr KEEP_ANY = cudf::duplicate_keep_option::KEEP_ANY; auto constexpr KEEP_FIRST = cudf::duplicate_keep_option::KEEP_FIRST; diff --git a/cpp/tests/streams/strings/factory_test.cpp b/cpp/tests/streams/strings/factory_test.cpp new file mode 100644 index 00000000000..36e595ab9fa --- /dev/null +++ b/cpp/tests/streams/strings/factory_test.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include + +#include + +#include +#include + +class StringsFactoryTest : public cudf::test::BaseFixture {}; + +using string_pair = thrust::pair; + +TEST_F(StringsFactoryTest, StringConstructionFromPairs) +{ + auto const stream = cudf::test::get_default_stream(); + + auto const h_data = std::vector{'a', 'b', 'c'}; + auto const d_data = cudf::detail::make_device_uvector_async( + h_data, stream, cudf::get_current_device_resource_ref()); + + auto const h_input = + std::vector{{d_data.data(), 1}, {d_data.data() + 1, 1}, {d_data.data() + 2, 1}}; + auto const d_input = cudf::detail::make_device_uvector_async( + h_input, stream, cudf::get_current_device_resource_ref()); + auto const input = cudf::device_span{d_input.data(), d_input.size()}; + cudf::make_strings_column(input, stream); +} + +TEST_F(StringsFactoryTest, StringBatchConstruction) +{ + auto const stream = cudf::test::get_default_stream(); + + auto const h_data = std::vector{'a', 'b', 'c'}; + auto const d_data = cudf::detail::make_device_uvector_async( + h_data, stream, cudf::get_current_device_resource_ref()); + + auto const h_input = + std::vector{{d_data.data(), 1}, {d_data.data() + 1, 1}, {d_data.data() + 2, 1}}; + auto const d_input = cudf::detail::make_device_uvector_async( + h_input, stream, cudf::get_current_device_resource_ref()); + + std::vector> input( + 10, cudf::device_span{d_input.data(), d_input.size()}); + cudf::make_strings_column_batch(input, stream); +} diff --git a/cpp/tests/streams/strings/find_test.cpp b/cpp/tests/streams/strings/find_test.cpp index 52839c6fc9f..e5a1ee0988c 100644 --- a/cpp/tests/streams/strings/find_test.cpp +++ b/cpp/tests/streams/strings/find_test.cpp @@ -46,4 +46,5 @@ TEST_F(StringsFindTest, Find) auto const pattern = std::string("[a-z]"); auto const prog = cudf::strings::regex_program::create(pattern); cudf::strings::findall(view, *prog, cudf::test::get_default_stream()); + cudf::strings::find_re(view, *prog, cudf::test::get_default_stream()); } diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index bdfd38267e6..cceec1d3537 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -474,6 +474,54 @@ TEST_F(StringsContainsTests, FixedQuantifier) } } +TEST_F(StringsContainsTests, ZeroRangeQuantifier) +{ + auto input = cudf::test::strings_column_wrapper({"a", "", "abc", "XYAZ", "ABC", "ZYXA"}); + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("A{0,}"); // should match everything + auto prog = cudf::strings::regex_program::create(pattern); + + { + auto expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1}); + auto results = cudf::strings::contains_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + { + auto expected = cudf::test::fixed_width_column_wrapper({2, 1, 4, 5, 4, 5}); + auto results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + + pattern = std::string("(?:ab){0,3}"); + prog = cudf::strings::regex_program::create(pattern); + + { + auto expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1}); + auto results = cudf::strings::contains_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + { + auto expected = cudf::test::fixed_width_column_wrapper({2, 1, 3, 5, 4, 5}); + auto results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } +} + +TEST_F(StringsContainsTests, 
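/*
 * ZeroRangeQuantifier above pins down {0,} semantics: "A{0,}" is equivalent
 * to "A*", so it matches at every scan position, zero-width wherever no 'A'
 * is present, and contains_re is true for every row. Because each match
 * here (empty or a lone 'A') advances the scan by one position, count_re
 * returns length + 1 per row: "a" -> 2, "" -> 1, "abc" -> 4, "XYAZ" -> 5.
 * With the multi-character "(?:ab){0,3}", a consumed "ab" collapses two
 * positions into one match, so "abc" drops from 4 to 3.
 */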
NestedQuantifier) +{ + auto input = cudf::test::strings_column_wrapper({"TEST12 1111 2222 3333 4444 5555", + "0000 AAAA 9999 BBBB 8888", + "7777 6666 4444 3333", + "12345 3333 4444 1111 ABCD"}); + auto sv = cudf::strings_column_view(input); + auto pattern = std::string(R"((\d{4}\s){4})"); + cudf::test::fixed_width_column_wrapper expected({true, false, false, true}); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::contains_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsContainsTests, QuantifierErrors) { EXPECT_THROW(cudf::strings::regex_program::create("^+"), cudf::logic_error); diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index 61246fb098d..7e0338f1bf4 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include @@ -240,6 +239,21 @@ TEST_F(StringsExtractTests, SpecialNewLines) CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); } +TEST_F(StringsExtractTests, NestedQuantifier) +{ + auto input = cudf::test::strings_column_wrapper({"TEST12 1111 2222 3333 4444 5555", + "0000 AAAA 9999 BBBB 8888", + "7777 6666 4444 3333", + "12345 3333 4444 1111 ABCD"}); + auto sv = cudf::strings_column_view(input); + auto pattern = std::string(R"((\d{4}\s){4})"); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::extract(sv, *prog); + // fixed quantifier on capture group only honors the last group + auto expected = cudf::test::strings_column_wrapper({"4444 ", "", "", "1111 "}, {1, 0, 0, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); +} + TEST_F(StringsExtractTests, EmptyExtractTest) { std::vector h_strings{nullptr, "AAA", "AAA_A", "AAA_AAA_", "A__", ""}; diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index 90054e41d36..7eb429da7d9 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -44,6 +45,8 @@ struct StringsFactoriesTest : public cudf::test::BaseFixture {}; +using string_pair = thrust::pair; + TEST_F(StringsFactoriesTest, CreateColumnFromPair) { std::vector h_test_strings{"the quick brown fox jumps over the lazy dog", @@ -61,7 +64,7 @@ TEST_F(StringsFactoriesTest, CreateColumnFromPair) cudf::size_type count = (cudf::size_type)h_test_strings.size(); thrust::host_vector h_buffer(memsize); rmm::device_uvector d_buffer(memsize, cudf::get_default_stream()); - thrust::host_vector> strings(count); + thrust::host_vector strings(count); thrust::host_vector h_offsets(count + 1); cudf::size_type offset = 0; cudf::size_type nulls = 0; @@ -69,12 +72,12 @@ TEST_F(StringsFactoriesTest, CreateColumnFromPair) for (cudf::size_type idx = 0; idx < count; ++idx) { char const* str = h_test_strings[idx]; if (!str) { - strings[idx] = thrust::pair{nullptr, 0}; + strings[idx] = string_pair{nullptr, 0}; nulls++; } else { auto length = (cudf::size_type)strlen(str); memcpy(h_buffer.data() + offset, str, length); - strings[idx] = thrust::pair{d_buffer.data() + offset, length}; + strings[idx] = string_pair{d_buffer.data() + offset, length}; offset += length; } h_offsets[idx + 1] = offset; @@ -201,14 +204,13 @@ TEST_F(StringsFactoriesTest, EmptyStringsColumn) cudf::make_strings_column(0, std::move(d_offsets), d_chars.release(), 0, d_nulls.release()); 
   cudf::test::expect_column_empty(results->view());
-  rmm::device_uvector<thrust::pair<char const*, cudf::size_type>> d_strings{
-    0, cudf::get_default_stream()};
+  rmm::device_uvector<string_pair> d_strings{0, cudf::get_default_stream()};
   results = cudf::make_strings_column(d_strings);
   cudf::test::expect_column_empty(results->view());
 }
 
 namespace {
-using string_pair = thrust::pair<char const*, cudf::size_type>;
+
 struct string_view_to_pair {
   __device__ string_pair operator()(thrust::pair<cudf::string_view, bool> const& p)
   {
@@ -234,3 +236,198 @@ TEST_F(StringsFactoriesTest, StringPairWithNullsAndEmpty)
   auto result = cudf::make_strings_column(pairs);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), data);
 }
+
+struct StringsBatchConstructionTest : public cudf::test::BaseFixture {};
+
+TEST_F(StringsBatchConstructionTest, EmptyColumns)
+{
+  auto constexpr num_columns = 10;
+  auto const stream          = cudf::get_default_stream();
+
+  auto const d_string_pairs = rmm::device_uvector<string_pair>{0, stream};
+  auto const input          = std::vector<cudf::device_span<string_pair const>>(
+    num_columns, {d_string_pairs.data(), d_string_pairs.size()});
+  auto const output = cudf::make_strings_column_batch(input, stream);
+
+  auto const expected_col = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
+  for (auto const& col : output) {
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col->view(), col->view());
+  }
+}
+
+TEST_F(StringsBatchConstructionTest, AllNullsColumns)
+{
+  auto constexpr num_columns = 10;
+  auto constexpr num_rows    = 100;
+  auto const stream          = cudf::get_default_stream();
+
+  auto d_string_pairs = rmm::device_uvector<string_pair>{num_rows, stream};
+  thrust::uninitialized_fill_n(rmm::exec_policy(stream),
+                               d_string_pairs.data(),
+                               d_string_pairs.size(),
+                               string_pair{nullptr, 0});
+  auto const input = std::vector<cudf::device_span<string_pair const>>(
+    num_columns, {d_string_pairs.data(), d_string_pairs.size()});
+  auto const output = cudf::make_strings_column_batch(input, stream);
+
+  auto const expected_col = cudf::make_strings_column(d_string_pairs);
+  for (auto const& col : output) {
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col->view(), col->view());
+  }
+}
+
+namespace {
+
+struct index_to_pair {
+  int const num_test_strings;
+  char const* d_chars;
+  std::size_t const* d_offsets;
+  int const* is_null;
+
+  __device__ string_pair operator()(cudf::size_type idx)
+  {
+    auto const data_idx = idx % num_test_strings;
+    return {is_null[data_idx] ? nullptr : d_chars + d_offsets[data_idx],
+            static_cast<cudf::size_type>(d_offsets[data_idx + 1] - d_offsets[data_idx])};
+  }
+};
+
+}  // namespace
+
+TEST_F(StringsBatchConstructionTest, CreateColumnsFromPairs)
+{
+  auto constexpr num_columns  = 10;
+  auto constexpr max_num_rows = 1000;
+  auto const stream           = cudf::get_default_stream();
+  auto const mr               = cudf::get_current_device_resource_ref();
+
+  std::vector<char const*> h_test_strings{"the quick brown fox jumps over the lazy dog",
+                                          "the fat cat lays next to the other accénted cat",
+                                          "a slow moving turtlé cannot catch the bird",
+                                          "which can be composéd together to form a more complete",
+                                          "thé result does not include the value in the sum in",
+                                          "",
+                                          nullptr,
+                                          "absent stop words"};
+  auto const num_test_strings = static_cast<int>(h_test_strings.size());
+
+  std::vector<std::size_t> h_offsets(num_test_strings + 1, 0);
+  for (int i = 0; i < num_test_strings; ++i) {
+    h_offsets[i + 1] = h_offsets[i] + (h_test_strings[i] ? strlen(h_test_strings[i]) : 0);
+  }
+
+  std::vector<char> h_chars(h_offsets.back());
+  std::vector<int> is_null(num_test_strings, 0);
+  for (int i = 0; i < num_test_strings; ++i) {
+    if (h_test_strings[i]) {
+      memcpy(h_chars.data() + h_offsets[i], h_test_strings[i], strlen(h_test_strings[i]));
+    } else {
+      is_null[i] = 1;
+    }
+  }
+
+  auto const d_offsets = cudf::detail::make_device_uvector_async(h_offsets, stream, mr);
+  auto const d_chars   = cudf::detail::make_device_uvector_async(h_chars, stream, mr);
+  auto const d_is_null = cudf::detail::make_device_uvector_async(is_null, stream, mr);
+
+  std::vector<rmm::device_uvector<string_pair>> d_input;
+  std::vector<cudf::device_span<string_pair const>> input;
+  d_input.reserve(num_columns);
+  input.reserve(num_columns);
+
+  for (int col_idx = 0; col_idx < num_columns; ++col_idx) {
+    // Column sizes increase from `max_num_rows / num_columns` to `max_num_rows`.
+    auto const num_rows =
+      static_cast<cudf::size_type>(static_cast<double>(col_idx + 1) / num_columns * max_num_rows);
+
+    auto string_pairs = rmm::device_uvector<string_pair>(num_rows, stream);
+    thrust::tabulate(
+      rmm::exec_policy_nosync(stream),
+      string_pairs.begin(),
+      string_pairs.end(),
+      index_to_pair{num_test_strings, d_chars.begin(), d_offsets.begin(), d_is_null.begin()});
+
+    d_input.emplace_back(std::move(string_pairs));
+    input.emplace_back(d_input.back());
+  }
+
+  auto const output = cudf::make_strings_column_batch(input, stream, mr);
+
+  for (std::size_t i = 0; i < num_columns; ++i) {
+    auto const string_pairs = input[i];
+    auto const expected     = cudf::make_strings_column(string_pairs, stream, mr);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), output[i]->view());
+  }
+}
+
+// The test below requires a huge amount of memory, thus it is disabled by default.
+TEST_F(StringsBatchConstructionTest, DISABLED_CreateLongStringsColumns)
+{
+  auto constexpr num_columns = 2;
+  auto const stream          = cudf::get_default_stream();
+  auto const mr              = cudf::get_current_device_resource_ref();
+
+  std::vector<char const*> h_test_strings{"the quick brown fox jumps over the lazy dog",
+                                          "the fat cat lays next to the other accénted cat",
+                                          "a slow moving turtlé cannot catch the bird",
+                                          "which can be composéd together to form a more complete",
+                                          "thé result does not include the value in the sum in",
+                                          "",
+                                          nullptr,
+                                          "absent stop words"};
+  auto const num_test_strings = static_cast<int>(h_test_strings.size());
+
+  std::vector<std::size_t> h_offsets(num_test_strings + 1, 0);
+  for (int i = 0; i < num_test_strings; ++i) {
+    h_offsets[i + 1] = h_offsets[i] + (h_test_strings[i] ? strlen(h_test_strings[i]) : 0);
+  }
+
+  std::vector<char> h_chars(h_offsets.back());
+  std::vector<int> is_null(num_test_strings, 0);
+  for (int i = 0; i < num_test_strings; ++i) {
+    if (h_test_strings[i]) {
+      memcpy(h_chars.data() + h_offsets[i], h_test_strings[i], strlen(h_test_strings[i]));
+    } else {
+      is_null[i] = 1;
+    }
+  }
+
+  auto const d_offsets = cudf::detail::make_device_uvector_async(h_offsets, stream, mr);
+  auto const d_chars   = cudf::detail::make_device_uvector_async(h_chars, stream, mr);
+  auto const d_is_null = cudf::detail::make_device_uvector_async(is_null, stream, mr);
+
+  // If we create a column by repeating h_test_strings `max_cycles` times,
+  // its size will be around (1.5*INT_MAX) bytes.
+  auto const max_cycles = static_cast<int>(static_cast<double>(std::numeric_limits<int>::max()) *
+                                           1.5 / h_offsets.back());
+
+  std::vector<rmm::device_uvector<string_pair>> d_input;
+  std::vector<cudf::device_span<string_pair const>> input;
+  d_input.reserve(num_columns);
+  input.reserve(num_columns);
+
+  for (int col_idx = 0; col_idx < num_columns; ++col_idx) {
+    // Column sizes increase from `max_cycles * num_test_strings / num_columns` to
+    // `max_cycles * num_test_strings`.
+    auto const num_rows = static_cast<cudf::size_type>(static_cast<double>(col_idx + 1) /
+                                                       num_columns * max_cycles * num_test_strings);
+
+    auto string_pairs = rmm::device_uvector<string_pair>(num_rows, stream);
+    thrust::tabulate(
+      rmm::exec_policy_nosync(stream),
+      string_pairs.begin(),
+      string_pairs.end(),
+      index_to_pair{num_test_strings, d_chars.begin(), d_offsets.begin(), d_is_null.begin()});
+
+    d_input.emplace_back(std::move(string_pairs));
+    input.emplace_back(d_input.back());
+  }
+
+  auto const output = cudf::make_strings_column_batch(input, stream, mr);
+
+  for (std::size_t i = 0; i < num_columns; ++i) {
+    auto const string_pairs = input[i];
+    auto const expected     = cudf::make_strings_column(string_pairs, stream, mr);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), output[i]->view());
+  }
+}
diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp
index 73da4d081e2..4821a7fa999 100644
--- a/cpp/tests/strings/findall_tests.cpp
+++ b/cpp/tests/strings/findall_tests.cpp
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -149,6 +150,22 @@ TEST_F(StringsFindallTests, LargeRegex)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
 }
+
+TEST_F(StringsFindallTests, FindTest)
+{
+  auto const valids = cudf::test::iterators::null_at(5);
+  cudf::test::strings_column_wrapper input(
+    {"3A", "May4", "Jan2021", "March", "A9BC", "", "", "abcdef ghijklm 12345"}, valids);
+  auto sv = cudf::strings_column_view(input);
+
+  auto pattern = std::string("\\d+");
+
+  auto prog    = cudf::strings::regex_program::create(pattern);
+  auto results = cudf::strings::find_re(sv, *prog);
+  auto expected =
+    cudf::test::fixed_width_column_wrapper<cudf::size_type>({0, 3, 3, -1, 1, 0, -1, 15}, valids);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
+}
+
 TEST_F(StringsFindallTests, NoMatches)
 {
   cudf::test::strings_column_wrapper input({"abc\nfff\nabc", "fff\nabc\nlll", "abc", "", "abc\n"});
@@ -169,10 +186,16 @@ TEST_F(StringsFindallTests, EmptyTest)
   auto prog = cudf::strings::regex_program::create(pattern);
 
   cudf::test::strings_column_wrapper input;
-  auto sv      = cudf::strings_column_view(input);
-  auto results = cudf::strings::findall(sv, *prog);
-
-  using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
-  LCW expected;
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
+  auto sv = cudf::strings_column_view(input);
+  {
+    auto results = cudf::strings::findall(sv, *prog);
+    using LCW    = cudf::test::lists_column_wrapper<cudf::string_view>;
+    LCW expected;
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
+  }
+  {
+    auto results  = cudf::strings::find_re(sv, *prog);
+    auto expected = cudf::test::fixed_width_column_wrapper<cudf::size_type>{};
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
+  }
 }
diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp
index ce5f68de3c9..26bcfe8028d 100644
--- a/cpp/tests/strings/integers_tests.cpp
+++ b/cpp/tests/strings/integers_tests.cpp
@@ -30,6 +30,7 @@
 #include
 #include
+#include
 #include
 #include
@@ -425,7 +426,7 @@ TYPED_TEST(StringsIntegerConvertTest, IntegerToHex)
   if (v == 0) { return std::string("00"); }  // special handling for single-byte types
   if constexpr (std::is_same_v<TypeParam, int8_t> || std::is_same_v<TypeParam, uint8_t>) {
-    char const hex_digits[16] = {
+    std::array const hex_digits = {
       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};
     std::string str;
     str += hex_digits[(v & 0xF0) >> 4];
diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp
index 9847d8d6bb5..abc12b00a81 100644
--- a/cpp/tests/strings/replace_regex_tests.cpp
+++ b/cpp/tests/strings/replace_regex_tests.cpp
@@ -200,6 +200,34 @@ TEST_F(StringsReplaceRegexTest, ZeroLengthMatch)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 }
+
+TEST_F(StringsReplaceRegexTest, ZeroRangeQuantifier)
+{
+  auto input = cudf::test::strings_column_wrapper({"a", "", "123", "XYAZ", "abc", "zéyab"});
+  auto sv    = cudf::strings_column_view(input);
+
+  auto pattern  = std::string("A{0,5}");
+  auto prog     = cudf::strings::regex_program::create(pattern);
+  auto repl     = cudf::string_scalar("_");
+  auto expected = cudf::test::strings_column_wrapper(
+    {"_a_", "_", "_1_2_3_", "_X_Y__Z_", "_a_b_c_", "_z_é_y_a_b_"});
+  auto results = cudf::strings::replace_re(sv, *prog, repl);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+
+  pattern = std::string("[a0-9]{0,2}");
+  prog    = cudf::strings::regex_program::create(pattern);
+  expected =
+    cudf::test::strings_column_wrapper({"__", "_", "___", "_X_Y_A_Z_", "__b_c_", "_z_é_y__b_"});
+  results = cudf::strings::replace_re(sv, *prog, repl);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+
+  pattern = std::string("(?:ab){0,3}");
+  prog    = cudf::strings::regex_program::create(pattern);
+  expected =
+    cudf::test::strings_column_wrapper({"_a_", "_", "_1_2_3_", "_X_Y_A_Z_", "__c_", "_z_é_y__"});
+  results = cudf::strings::replace_re(sv, *prog, repl);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+}
+
 TEST_F(StringsReplaceRegexTest, Multiline)
 {
   auto const multiline = cudf::strings::regex_flags::MULTILINE;
diff --git a/cpp/tests/structs/structs_column_tests.cpp b/cpp/tests/structs/structs_column_tests.cpp
index f0010fc1ed9..219bd6d8b01 100644
--- a/cpp/tests/structs/structs_column_tests.cpp
+++ b/cpp/tests/structs/structs_column_tests.cpp
@@ -635,9 +635,8 @@ TEST_F(StructColumnWrapperTest, TestStructsColumnWithEmptyChild)
   auto mask_vec = std::vector{true, false, false};
   auto [null_mask, null_count] =
     cudf::test::detail::make_null_mask(mask_vec.begin(), mask_vec.end());
-  auto structs_col =
-    cudf::make_structs_column(num_rows, std::move(cols), null_count, std::move(null_mask));
-  EXPECT_NO_THROW(structs_col->view());
+  EXPECT_NO_THROW(auto structs_col = cudf::make_structs_column(
+                    num_rows, std::move(cols), null_count, std::move(null_mask)));
 }
 
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/transform/bools_to_mask_test.cpp b/cpp/tests/transform/bools_to_mask_test.cpp
index 215ca158f37..2684123c08a 100644
--- a/cpp/tests/transform/bools_to_mask_test.cpp
+++ b/cpp/tests/transform/bools_to_mask_test.cpp
@@ -32,7 +32,7 @@ struct MaskToNullTest : public cudf::test::BaseFixture {
   {
     cudf::test::fixed_width_column_wrapper<bool> input_column(
       input.begin(), input.end(), val.begin());
-    std::transform(val.begin(), val.end(), input.begin(), input.begin(), std::logical_and<bool>());
+    std::transform(val.begin(), val.end(), input.begin(), input.begin(), std::logical_and<>());
 
     auto sample = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; });
diff --git a/cpp/tests/transform/integration/unary_transform_test.cpp
b/cpp/tests/transform/integration/unary_transform_test.cpp index 1785848ec77..0bdf5b321ac 100644 --- a/cpp/tests/transform/integration/unary_transform_test.cpp +++ b/cpp/tests/transform/integration/unary_transform_test.cpp @@ -47,7 +47,7 @@ void test_udf(char const* udf, Op op, Data data_init, cudf::size_type size, bool TEST_F(UnaryOperationIntegrationTest, Transform_FP32_FP32) { // c = a*a*a*a - char const* cuda = + std::string const cuda = R"***( __device__ inline void fdsf ( float* C, @@ -58,7 +58,7 @@ __device__ inline void fdsf ( } )***"; - char const* ptx = + std::string const ptx = R"***( // // Generated by NVIDIA NVVM Compiler @@ -101,17 +101,17 @@ __device__ inline void fdsf ( auto op = [](dtype a) { return a * a * a * a; }; auto data_init = [](cudf::size_type row) { return row % 3; }; - test_udf(cuda, op, data_init, 500, false); - test_udf(ptx, op, data_init, 500, true); + test_udf(cuda.c_str(), op, data_init, 500, false); + test_udf(ptx.c_str(), op, data_init, 500, true); } TEST_F(UnaryOperationIntegrationTest, Transform_INT32_INT32) { // c = a * a - a - char const cuda[] = + std::string const cuda = "__device__ inline void f(int* output,int input){*output = input*input - input;}"; - char const* ptx = + std::string const ptx = R"***( .func _Z1fPii( .param .b64 _Z1fPii_param_0, @@ -136,8 +136,8 @@ TEST_F(UnaryOperationIntegrationTest, Transform_INT32_INT32) auto op = [](dtype a) { return a * a - a; }; auto data_init = [](cudf::size_type row) { return row % 78; }; - test_udf(cuda, op, data_init, 500, false); - test_udf(ptx, op, data_init, 500, true); + test_udf(cuda.c_str(), op, data_init, 500, false); + test_udf(ptx.c_str(), op, data_init, 500, true); } TEST_F(UnaryOperationIntegrationTest, Transform_INT8_INT8) @@ -145,7 +145,7 @@ TEST_F(UnaryOperationIntegrationTest, Transform_INT8_INT8) // Capitalize all the lower case letters // Assuming ASCII, the PTX code is compiled from the following CUDA code - char const cuda[] = + std::string const cuda = R"***( __device__ inline void f( signed char* output, @@ -159,7 +159,7 @@ __device__ inline void f( } )***"; - char const ptx[] = + std::string const ptx = R"***( .func _Z1fPcc( .param .b64 _Z1fPcc_param_0, @@ -191,15 +191,15 @@ __device__ inline void f( auto op = [](dtype a) { return std::toupper(a); }; auto data_init = [](cudf::size_type row) { return 'a' + (row % 26); }; - test_udf(cuda, op, data_init, 500, false); - test_udf(ptx, op, data_init, 500, true); + test_udf(cuda.c_str(), op, data_init, 500, false); + test_udf(ptx.c_str(), op, data_init, 500, true); } TEST_F(UnaryOperationIntegrationTest, Transform_Datetime) { // Add one day to timestamp in microseconds - char const cuda[] = + std::string const cuda = R"***( __device__ inline void f(cudf::timestamp_us* output, cudf::timestamp_us input) { @@ -217,7 +217,7 @@ __device__ inline void f(cudf::timestamp_us* output, cudf::timestamp_us input) auto random_eng = cudf::test::UniformRandomGenerator(0, 100000000); auto data_init = [&random_eng](cudf::size_type row) { return random_eng.generate(); }; - test_udf(cuda, op, data_init, 500, false); + test_udf(cuda.c_str(), op, data_init, 500, false); } } // namespace transformation diff --git a/cpp/tests/utilities/table_utilities.cu b/cpp/tests/utilities/table_utilities.cu index 354c0b1b57e..8e4906408de 100644 --- a/cpp/tests/utilities/table_utilities.cu +++ b/cpp/tests/utilities/table_utilities.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,10 +15,9 @@ */ #include +#include #include -#include - namespace cudf::test::detail { void expect_table_properties_equal(cudf::table_view lhs, cudf::table_view rhs) { diff --git a/cpp/tests/utilities_tests/batched_memcpy_tests.cu b/cpp/tests/utilities_tests/batched_memcpy_tests.cu new file mode 100644 index 00000000000..98657f8e224 --- /dev/null +++ b/cpp/tests/utilities_tests/batched_memcpy_tests.cu @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +template +struct BatchedMemcpyTest : public cudf::test::BaseFixture {}; + +TEST(BatchedMemcpyTest, BasicTest) +{ + using T1 = int64_t; + + // Device init + auto stream = cudf::get_default_stream(); + auto mr = cudf::get_current_device_resource_ref(); + + // Buffer lengths (in number of elements) + std::vector const h_lens{ + 50000, 4, 1000, 0, 250000, 1, 100, 8000, 0, 1, 100, 1000, 10000, 100000, 0, 1, 100000}; + + // Total number of buffers + auto const num_buffs = h_lens.size(); + + // Exclusive sum of buffer lengths for pointers + std::vector h_lens_excl_sum(num_buffs); + std::exclusive_scan(h_lens.begin(), h_lens.end(), h_lens_excl_sum.begin(), 0); + + // Corresponding buffer sizes (in bytes) + std::vector h_sizes_bytes; + h_sizes_bytes.reserve(num_buffs); + std::transform( + h_lens.cbegin(), h_lens.cend(), std::back_inserter(h_sizes_bytes), [&](auto& size) { + return size * sizeof(T1); + }); + + // Initialize random engine + auto constexpr seed = 0xcead; + std::mt19937 engine{seed}; + using uniform_distribution = + typename std::conditional_t, + std::bernoulli_distribution, + std::conditional_t, + std::uniform_real_distribution, + std::uniform_int_distribution>>; + uniform_distribution dist{}; + + // Generate a src vector of random data vectors + std::vector> h_sources; + h_sources.reserve(num_buffs); + std::transform(h_lens.begin(), h_lens.end(), std::back_inserter(h_sources), [&](auto size) { + std::vector data(size); + std::generate_n(data.begin(), size, [&]() { return T1{dist(engine)}; }); + return data; + }); + // Copy the vectors to device + std::vector> h_device_vecs; + h_device_vecs.reserve(h_sources.size()); + std::transform( + h_sources.begin(), h_sources.end(), std::back_inserter(h_device_vecs), [stream, mr](auto& vec) { + return cudf::detail::make_device_uvector_async(vec, stream, mr); + }); + // Pointers to the source vectors + std::vector h_src_ptrs; + h_src_ptrs.reserve(h_sources.size()); + std::transform( + h_device_vecs.begin(), h_device_vecs.end(), std::back_inserter(h_src_ptrs), [](auto& vec) { + return static_cast(vec.data()); + }); + // Copy the source data pointers to device + auto d_src_ptrs = 
cudf::detail::make_device_uvector_async(h_src_ptrs, stream, mr); + + // Total number of elements in all buffers + auto const total_buff_len = std::accumulate(h_lens.cbegin(), h_lens.cend(), 0); + + // Create one giant buffer for destination + auto d_dst_data = cudf::detail::make_zeroed_device_uvector_async(total_buff_len, stream, mr); + // Pointers to destination buffers within the giant destination buffer + std::vector h_dst_ptrs(num_buffs); + std::for_each(thrust::make_counting_iterator(static_cast(0)), + thrust::make_counting_iterator(num_buffs), + [&](auto i) { return h_dst_ptrs[i] = d_dst_data.data() + h_lens_excl_sum[i]; }); + // Copy destination data pointers to device + auto d_dst_ptrs = cudf::detail::make_device_uvector_async(h_dst_ptrs, stream, mr); + + // Copy buffer size iterators (in bytes) to device + auto d_sizes_bytes = cudf::detail::make_device_uvector_async(h_sizes_bytes, stream, mr); + + // Run the batched memcpy + cudf::detail::batched_memcpy_async( + d_src_ptrs.begin(), d_dst_ptrs.begin(), d_sizes_bytes.begin(), num_buffs, stream); + + // Expected giant destination buffer after the memcpy + std::vector expected_buffer; + expected_buffer.reserve(total_buff_len); + std::for_each(h_sources.cbegin(), h_sources.cend(), [&expected_buffer](auto& source) { + expected_buffer.insert(expected_buffer.end(), source.begin(), source.end()); + }); + + // Copy over the result destination buffer to host and synchronize the stream + auto result_dst_buffer = + cudf::detail::make_std_vector_sync(cudf::device_span(d_dst_data), stream); + + // Check if both vectors are equal + EXPECT_TRUE( + std::equal(expected_buffer.begin(), expected_buffer.end(), result_dst_buffer.begin())); +} diff --git a/cpp/tests/utilities_tests/batched_memset_tests.cu b/cpp/tests/utilities_tests/batched_memset_tests.cu index bed0f40d70e..0eeb7b95318 100644 --- a/cpp/tests/utilities_tests/batched_memset_tests.cu +++ b/cpp/tests/utilities_tests/batched_memset_tests.cu @@ -18,8 +18,8 @@ #include #include +#include #include -#include #include #include #include @@ -78,7 +78,7 @@ TEST(MultiBufferTestIntegral, BasicTest1) }); // Function Call - cudf::io::detail::batched_memset(memset_bufs, uint64_t{0}, stream); + cudf::detail::batched_memset(memset_bufs, uint64_t{0}, stream); // Set all buffer regions to 0 for expected comparison std::for_each( diff --git a/cpp/tests/utilities_tests/logger_tests.cpp b/cpp/tests/utilities_tests/logger_tests.cpp index d052e20eedb..cfab570833b 100644 --- a/cpp/tests/utilities_tests/logger_tests.cpp +++ b/cpp/tests/utilities_tests/logger_tests.cpp @@ -28,16 +28,17 @@ class LoggerTest : public cudf::test::BaseFixture { std::vector prev_sinks; public: - LoggerTest() : prev_level{cudf::logger().level()}, prev_sinks{cudf::logger().sinks()} + LoggerTest() + : prev_level{cudf::detail::logger().level()}, prev_sinks{cudf::detail::logger().sinks()} { - cudf::logger().sinks() = {std::make_shared(oss)}; - cudf::logger().set_formatter( + cudf::detail::logger().sinks() = {std::make_shared(oss)}; + cudf::detail::logger().set_formatter( std::unique_ptr(new spdlog::pattern_formatter("%v"))); } ~LoggerTest() override { - cudf::logger().set_level(prev_level); - cudf::logger().sinks() = prev_sinks; + cudf::detail::logger().set_level(prev_level); + cudf::detail::logger().sinks() = prev_sinks; } void clear_sink() { oss.str(""); } @@ -46,32 +47,32 @@ class LoggerTest : public cudf::test::BaseFixture { TEST_F(LoggerTest, Basic) { - cudf::logger().critical("crit msg"); + cudf::detail::logger().critical("crit msg"); 
ASSERT_EQ(this->sink_content(), "crit msg\n"); } TEST_F(LoggerTest, DefaultLevel) { - cudf::logger().trace("trace"); - cudf::logger().debug("debug"); - cudf::logger().info("info"); - cudf::logger().warn("warn"); - cudf::logger().error("error"); - cudf::logger().critical("critical"); + cudf::detail::logger().trace("trace"); + cudf::detail::logger().debug("debug"); + cudf::detail::logger().info("info"); + cudf::detail::logger().warn("warn"); + cudf::detail::logger().error("error"); + cudf::detail::logger().critical("critical"); ASSERT_EQ(this->sink_content(), "warn\nerror\ncritical\n"); } TEST_F(LoggerTest, CustomLevel) { - cudf::logger().set_level(spdlog::level::warn); - cudf::logger().info("info"); - cudf::logger().warn("warn"); + cudf::detail::logger().set_level(spdlog::level::warn); + cudf::detail::logger().info("info"); + cudf::detail::logger().warn("warn"); ASSERT_EQ(this->sink_content(), "warn\n"); this->clear_sink(); - cudf::logger().set_level(spdlog::level::debug); - cudf::logger().trace("trace"); - cudf::logger().debug("debug"); + cudf::detail::logger().set_level(spdlog::level::debug); + cudf::detail::logger().trace("trace"); + cudf::detail::logger().debug("debug"); ASSERT_EQ(this->sink_content(), "debug\n"); } diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp index ae7c6fa8b8c..7b8ee840da4 100644 --- a/cpp/tests/utilities_tests/pinned_memory_tests.cpp +++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "io/utilities/hostdevice_vector.hpp" + #include #include #include @@ -22,10 +24,17 @@ #include #include #include +#include #include #include +using cudf::host_span; +using cudf::detail::host_2dspan; +using cudf::detail::hostdevice_2dvector; +using cudf::detail::hostdevice_span; +using cudf::detail::hostdevice_vector; + class PinnedMemoryTest : public cudf::test::BaseFixture { size_t prev_copy_threshold; size_t prev_alloc_threshold; @@ -125,3 +134,63 @@ TEST_F(PinnedMemoryTest, MakeHostVector) EXPECT_FALSE(vec.get_allocator().is_device_accessible()); } } + +TEST_F(PinnedMemoryTest, HostSpan) +{ + auto test_ctors = [](auto&& vec) { + auto const is_vec_device_accessible = vec.get_allocator().is_device_accessible(); + // Test conversion from a vector + auto const span = host_span{vec}; + EXPECT_EQ(span.is_device_accessible(), is_vec_device_accessible); + // Test conversion from host_span with different type + auto const span_converted = host_span{span}; + EXPECT_EQ(span_converted.is_device_accessible(), is_vec_device_accessible); + }; + + cudf::set_allocate_host_as_pinned_threshold(7); + for (int i = 1; i < 10; i++) { + // some iterations will use pinned memory, some will not + test_ctors(cudf::detail::make_host_vector(i, cudf::get_default_stream())); + } + + auto stream{cudf::get_default_stream()}; + + // hostdevice vectors use pinned memory for the host side; test that host_span can be constructed + // from a hostdevice_vector with correct device accessibility + + hostdevice_vector hd_vec(10, stream); + auto const span = host_span{hd_vec}; + EXPECT_TRUE(span.is_device_accessible()); + + // test host_view and operator[] + { + hostdevice_2dvector hd_2dvec(10, 10, stream); + auto const span2d = hd_2dvec.host_view().flat_view(); + EXPECT_TRUE(span2d.is_device_accessible()); + + auto const span2d_from_cast = host_2dspan{hd_2dvec}; + EXPECT_TRUE(span2d_from_cast.flat_view().is_device_accessible()); + + auto const row_span = hd_2dvec[0]; + 
EXPECT_TRUE(row_span.is_device_accessible()); + } + + // test const versions of host_view and operator[] + { + hostdevice_2dvector const const_hd_2dvec(10, 10, stream); + auto const const_span2d = const_hd_2dvec.host_view().flat_view(); + EXPECT_TRUE(const_span2d.is_device_accessible()); + + auto const const_span2d_from_cast = host_2dspan{const_hd_2dvec}; + EXPECT_TRUE(const_span2d_from_cast.flat_view().is_device_accessible()); + + auto const const_row_span = const_hd_2dvec[0]; + EXPECT_TRUE(const_row_span.is_device_accessible()); + } + + // test hostdevice_span + { + hostdevice_span hd_span(hd_vec); + EXPECT_TRUE(host_span{hd_span}.is_device_accessible()); + } +} diff --git a/cpp/tests/utilities_tests/span_tests.cu b/cpp/tests/utilities_tests/span_tests.cu index 019d6adc007..5389e1c069d 100644 --- a/cpp/tests/utilities_tests/span_tests.cu +++ b/cpp/tests/utilities_tests/span_tests.cu @@ -336,58 +336,50 @@ auto get_test_hostdevice_vector() TEST(HostDeviceSpanTest, CanCreateFullSubspan) { - auto message = get_test_hostdevice_vector(); - auto const message_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); + auto message = get_test_hostdevice_vector(); + auto const message_span = cudf::detail::hostdevice_span{message}; - expect_equivalent(message_span, message.subspan(0, message_span.size())); + expect_equivalent(message_span.subspan(0, message_span.size()), message_span); } TEST(HostDeviceSpanTest, CanCreateHostSpan) { auto message = get_test_hostdevice_vector(); auto const message_span = host_span(message.host_ptr(), message.size()); - auto const hd_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); + auto const hd_span = cudf::detail::hostdevice_span{message}; expect_equivalent(message_span, cudf::host_span(hd_span)); } TEST(HostDeviceSpanTest, CanTakeSubspanFull) { - auto message = get_test_hostdevice_vector(); - auto const message_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); + auto message = get_test_hostdevice_vector(); + auto const message_span = cudf::detail::hostdevice_span{message}; - expect_match("hello world", message.subspan(0, 11)); expect_match("hello world", message_span.subspan(0, 11)); } TEST(HostDeviceSpanTest, CanTakeSubspanPartial) { - auto message = get_test_hostdevice_vector(); - auto const message_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); + auto message = get_test_hostdevice_vector(); + auto const message_span = cudf::detail::hostdevice_span{message}; - expect_match("lo w", message.subspan(3, 4)); expect_match("lo w", message_span.subspan(3, 4)); } TEST(HostDeviceSpanTest, CanGetData) { - auto message = get_test_hostdevice_vector(); - auto const message_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); + auto message = get_test_hostdevice_vector(); + auto const message_span = cudf::detail::hostdevice_span{message}; EXPECT_EQ(message.host_ptr(), message_span.host_ptr()); } TEST(HostDeviceSpanTest, CanGetSize) { - auto message = get_test_hostdevice_vector(); - auto const message_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); - auto const empty_span = cudf::detail::hostdevice_span(); + auto message = get_test_hostdevice_vector(); + auto const message_span = cudf::detail::hostdevice_span{message}; + auto const empty_span = cudf::detail::hostdevice_span(); 
   EXPECT_EQ(static_cast<std::size_t>(11), message_span.size());
   EXPECT_EQ(static_cast<std::size_t>(0), empty_span.size());
@@ -413,8 +405,7 @@ TEST(HostDeviceSpanTest, CanCopySpan)
   cudf::detail::hostdevice_span<char> message_span_copy;
 
   {
-    auto const message_span =
-      cudf::detail::hostdevice_span<char>(message.host_ptr(), message.device_ptr(), message.size());
+    auto const message_span = cudf::detail::hostdevice_span{message};
     message_span_copy = message_span;
   }
diff --git a/dependencies.yaml b/dependencies.yaml
index ed36a23e5c3..4804f7b00b0 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -6,15 +6,24 @@ files:
       cuda: ["11.8", "12.5"]
       arch: [x86_64]
     includes:
+      # Note that clang-tidy is not included here because cudf's preferred
+      # version conflicts with the rest of RAPIDS as well as its own
+      # clang-format version. Until we update our clang-format version we will
+      # not be able to install both into the same environment. Moreover, using
+      # this version will break compatibility with other RAPIDS libraries that
+      # are still using 16.0.6, and as such would break any unified
+      # environment like that used in devcontainers.
       - build_base
       - build_all
       - build_cpp
       - build_python_common
+      - clang_format
       - cuda
       - cuda_version
      - depends_on_cupy
      - depends_on_libkvikio
      - depends_on_librmm
+      - depends_on_nvcomp
      - depends_on_rmm
      - develop
      - docs
@@ -85,6 +94,16 @@ files:
     includes:
       - develop
       - py_version
+  clang_tidy:
+    output: none
+    includes:
+      - build_all
+      - build_base
+      - clang_tidy
+      - cuda
+      - cuda_version
+      - develop
+      - py_version
   docs:
     output: none
     includes:
@@ -152,6 +171,13 @@ files:
       - build_cpp
       - depends_on_libkvikio
       - depends_on_librmm
+  py_run_libcudf:
+    output: pyproject
+    pyproject_dir: python/libcudf
+    extras:
+      table: project
+    includes:
+      - depends_on_nvcomp
   py_build_pylibcudf:
     output: pyproject
     pyproject_dir: python/pylibcudf
@@ -367,9 +393,27 @@ dependencies:
           - fmt>=11.0.2,<12
          - flatbuffers==24.3.25
          - librdkafka>=2.5.0,<2.6.0a0
+          - spdlog>=1.14.1,<1.15
+  depends_on_nvcomp:
+    common:
+      - output_types: conda
+        packages:
          # Align nvcomp version with rapids-cmake
          - nvcomp==4.0.1
-          - spdlog>=1.14.1,<1.15
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+            packages:
+              - nvidia-nvcomp-cu12==4.0.1
+          - matrix:
+              cuda: "11.*"
+            packages:
+              - nvidia-nvcomp-cu11==4.0.1
+          - matrix:
+            packages:
+              - nvidia-nvcomp==4.0.1
   rapids_build_skbuild:
     common:
       - output_types: [conda, requirements, pyproject]
@@ -395,9 +439,18 @@ dependencies:
       - cython>=3.0.3
   pyarrow_run:
     common:
-      - output_types: [conda, requirements, pyproject]
+      - output_types: [conda]
         packages:
           - pyarrow>=14.0.0,<18.0.0a0
+      - output_types: [requirements, pyproject]
+        packages:
+          # pyarrow 17.0.0 wheels have a subtle issue around threading that
+          # can cause segmentation faults around imports on arm. It appears to
+          # be highly dependent on the exact build configuration, so we'll just
+          # avoid 17.0.0 for now unless we observe similar issues in future
+          # releases as well.
+          - pyarrow>=14.0.0,<18.0.0a0; platform_machine=='x86_64'
+          - pyarrow>=14.0.0,<18.0.0a0,!=17.0.0; platform_machine=='aarch64'
   cuda_version:
     specific:
       - output_types: conda
@@ -518,11 +571,21 @@ dependencies:
          # pre-commit requires identify minimum version 1.0, but clang-format requires textproto support and that was
          # added in 2.5.20, so we need to call out the minimum version needed for our plugins
          - identify>=2.5.20
+      - output_types: conda
+        packages:
+          - &doxygen doxygen=1.9.1 # pre-commit hook needs a specific version.
+ clang_format: + common: - output_types: conda packages: - clang==16.0.6 - clang-tools=16.0.6 - - &doxygen doxygen=1.9.1 # pre-commit hook needs a specific version. + clang_tidy: + common: + - output_types: conda + packages: + - clang==19.1.0 + - clang-tools==19.1.0 docs: common: - output_types: [conda] @@ -576,7 +639,7 @@ dependencies: packages: - fsspec>=0.6.0 - &numpy numpy>=1.23,<3.0a0 - - pandas>=2.0,<2.2.3dev0 + - pandas>=2.0,<2.2.4dev0 run_pylibcudf: common: - output_types: [conda, requirements, pyproject] @@ -664,7 +727,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.8,<1.9 + - polars>=1.11,<1.12 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] @@ -722,6 +785,10 @@ dependencies: packages: - *numba-cuda-dep - pandas==2.0.* + - matrix: {dependencies: "latest"} + packages: + - numba-cuda==0.0.15 + - pandas==2.2.3 - matrix: packages: - output_types: conda diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 95813907bf4..5942cc16850 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -342,7 +342,7 @@ def clean_all_xml_files(path): "cudf.Series": ("cudf.core.series.Series", "cudf.Series"), "cudf.Index": ("cudf.core.index.Index", "cudf.Index"), "cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"), - "DeviceBuffer": ("rmm._lib.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"), + "DeviceBuffer": ("rmm.pylibrmm.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"), } diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md index 34b657488c1..5024747227e 100644 --- a/docs/cudf/source/cudf_pandas/faq.md +++ b/docs/cudf/source/cudf_pandas/faq.md @@ -181,6 +181,32 @@ There are a few known limitations that you should be aware of: ``` - `cudf.pandas` (and cuDF in general) is only compatible with pandas 2. Version 24.02 of cudf was the last to support pandas 1.5.x. +- In order for `cudf.pandas` to produce a proxy array that ducktypes as a NumPy + array, we create a proxy type that actually subclasses `numpy.ndarray`. We can + verify this with an isinstance check. + + ```python + %load_ext cudf.pandas + import pandas as pd + import numpy as np + + arr = pd.Series([1, 1, 2]).unique() # returns a proxy array + isinstance(arr, np.ndarray) # returns True, where arr is a proxy array + ``` + Because the proxy type ducktypes as a NumPy array, NumPy functions may attempt to + access internal members, such as the [data buffer](https://numpy.org/doc/stable/dev/internals.html#internal-organization-of-numpy-arrays), via the NumPy C API. + However, our proxy mechanism is designed to proxy function calls at the Python + level, which is incompatible with these types of accesses. To handle these + situations, we perform an eager device-to-host (DtoH) copy, which sets the data + buffer correctly but incurs the cost of extra time when creating the proxy array. + In the previous example, creating `arr` performed this kind of implicit DtoH transfer. + + With this approach, we also get compatibility with third party libraries like `torch`. + + ```python + import torch + x = torch.from_numpy(arr) + ``` ## Can I force running on the CPU? 
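The FAQ addition above explains that the `cudf.pandas` proxy array subclasses `numpy.ndarray` and performs an eager device-to-host copy so that consumers of the NumPy C API see a valid data buffer. A minimal sketch of the same flow as a plain script rather than an IPython session (assumes a CUDA-capable GPU with cudf installed; `cudf.pandas.install()` is the script-mode equivalent of the `%load_ext cudf.pandas` magic):

```python
# Sketch only: exercising the cudf.pandas proxy array outside IPython.
import cudf.pandas

cudf.pandas.install()  # must run before pandas is imported

import numpy as np
import pandas as pd

arr = pd.Series([1, 1, 2]).unique()  # proxy array; the implicit DtoH copy happens here
assert isinstance(arr, np.ndarray)   # the proxy type subclasses numpy.ndarray
print(np.add.reduce(arr))            # ufuncs read the data buffer via the NumPy C API
```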
diff --git a/docs/cudf/source/developer_guide/contributing_guide.md b/docs/cudf/source/developer_guide/contributing_guide.md
index 6fce268f309..f4d2c7319b3 100644
--- a/docs/cudf/source/developer_guide/contributing_guide.md
+++ b/docs/cudf/source/developer_guide/contributing_guide.md
@@ -15,8 +15,7 @@ Developers are strongly recommended to set up `pre-commit` prior to any developm
 The `.pre-commit-config.yaml` file at the root of the repo is the primary source of truth for linting.
 Specifically, cuDF uses the following tools:
-- [`ruff`](https://beta.ruff.rs/) checks for general code formatting compliance.
-- [`isort`](https://pycqa.github.io/isort/) ensures imports are sorted consistently.
+- [`ruff`](https://docs.astral.sh/ruff/) checks for general code formatting compliance.
 - [`mypy`](http://mypy-lang.org/) performs static type checking.
   In conjunction with [type hints](https://docs.python.org/3/library/typing.html), `mypy` can help catch various bugs that are otherwise difficult to find.
diff --git a/docs/cudf/source/developer_guide/testing.md b/docs/cudf/source/developer_guide/testing.md
index f12f809d5db..22cc1b5b8de 100644
--- a/docs/cudf/source/developer_guide/testing.md
+++ b/docs/cudf/source/developer_guide/testing.md
@@ -7,6 +7,23 @@ specifically the [`pytest-cov`](https://github.com/pytest-dev/pytest-cov) plugin
 Code coverage reports are uploaded to [Codecov](https://app.codecov.io/gh/rapidsai/cudf).
 Each PR also indicates whether it increases or decreases test coverage.
+
+### Configuring pytest
+
+Pytest will accept configuration in [multiple different
+files](https://docs.pytest.org/en/stable/reference/customize.html),
+with a specified discovery and precedence order. Note in particular
+that there is no automatic "include" mechanism: as soon as a matching
+configuration file is found, discovery stops.
+
+For preference, so that all tool configuration lives in the same
+place, we use `pyproject.toml`-based configuration. Test configuration
+for a given package should live in that package's `pyproject.toml`
+file.
+
+Where tests do not naturally belong to a project, for example the
+`cudf.pandas` integration tests and the cuDF benchmarks, use a
+`pytest.ini` file as close to the tests as possible.
+
 ## Test organization
 
 How tests are organized depends on which of the following two groups they fall into:
diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb
index 95f5f9734dd..46221b6015b 100644
--- a/docs/cudf/source/user_guide/10min.ipynb
+++ b/docs/cudf/source/user_guide/10min.ipynb
@@ -38,10 +38,10 @@
     "import os\n",
     "\n",
     "import cupy as cp\n",
+    "import dask_cudf\n",
     "import pandas as pd\n",
     "\n",
     "import cudf\n",
-    "import dask_cudf\n",
     "\n",
     "cp.random.seed(12)\n",
     "\n",
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index e21536e2e97..62e14a67ee5 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -21,6 +21,7 @@ This page provides API documentation for pylibcudf.
    groupby
    interop
    join
+   json
    labeling
    lists
    merge
@@ -49,3 +50,4 @@ This page provides API documentation for pylibcudf.
    io/index.rst
    strings/index.rst
io/index.rst strings/index.rst + nvtext/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/json.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/json.rst new file mode 100644 index 00000000000..bb38d179a57 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/json.rst @@ -0,0 +1,6 @@ +==== +json +==== + +.. automodule:: pylibcudf.json + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst new file mode 100644 index 00000000000..abb45e426a8 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst @@ -0,0 +1,6 @@ +============= +edit_distance +============= + +.. automodule:: pylibcudf.nvtext.edit_distance + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst new file mode 100644 index 00000000000..d68199271bd --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst @@ -0,0 +1,6 @@ +=============== +generate_ngrams +=============== + +.. automodule:: pylibcudf.nvtext.generate_ngrams + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst new file mode 100644 index 00000000000..e0735a197fd --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -0,0 +1,14 @@ +nvtext +====== + +.. toctree:: + :maxdepth: 1 + + edit_distance + generate_ngrams + jaccard + minhash + ngrams_tokenize + normalize + replace + stemmer diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst new file mode 100644 index 00000000000..ea59657c25e --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst @@ -0,0 +1,6 @@ +======= +jaccard +======= + +.. automodule:: pylibcudf.nvtext.jaccard + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst new file mode 100644 index 00000000000..b8ec02fca35 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst @@ -0,0 +1,6 @@ +======= +minhash +======= + +.. automodule:: pylibcudf.nvtext.minhash + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/ngrams_tokenize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/ngrams_tokenize.rst new file mode 100644 index 00000000000..ce6db76f889 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/ngrams_tokenize.rst @@ -0,0 +1,6 @@ +=============== +ngrams_tokenize +=============== + +.. automodule:: pylibcudf.nvtext.ngrams_tokenize + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst new file mode 100644 index 00000000000..e496f6a45da --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst @@ -0,0 +1,6 @@ +========= +normalize +========= + +.. 
automodule:: pylibcudf.nvtext.normalize + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst new file mode 100644 index 00000000000..04cee972dc1 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst @@ -0,0 +1,6 @@ +======= +replace +======= + +.. automodule:: pylibcudf.nvtext.replace + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst new file mode 100644 index 00000000000..b407ff8451a --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst @@ -0,0 +1,6 @@ +======= +stemmer +======= + +.. automodule:: pylibcudf.nvtext.stemmer + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst new file mode 100644 index 00000000000..38a46641200 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst @@ -0,0 +1,6 @@ +======= +combine +======= + +.. automodule:: pylibcudf.strings.combine + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_booleans.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_booleans.rst new file mode 100644 index 00000000000..de62221456f --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_booleans.rst @@ -0,0 +1,6 @@ +================ +convert_booleans +================ + +.. automodule:: pylibcudf.strings.convert.convert_booleans + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_datetime.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_datetime.rst new file mode 100644 index 00000000000..fc5d5204ab3 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_datetime.rst @@ -0,0 +1,6 @@ +================ +convert_datetime +================ + +.. automodule:: pylibcudf.strings.convert.convert_datetime + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_durations.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_durations.rst new file mode 100644 index 00000000000..e80b0c15a61 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_durations.rst @@ -0,0 +1,6 @@ +================= +convert_durations +================= + +.. automodule:: pylibcudf.strings.convert.convert_durations + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_fixed_point.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_fixed_point.rst new file mode 100644 index 00000000000..16d971a6849 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_fixed_point.rst @@ -0,0 +1,6 @@ +=================== +convert_fixed_point +=================== + +.. 
automodule:: pylibcudf.strings.convert.convert_fixed_point + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_floats.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_floats.rst new file mode 100644 index 00000000000..9ae4004cea9 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_floats.rst @@ -0,0 +1,6 @@ +============== +convert_floats +============== + +.. automodule:: pylibcudf.strings.convert.convert_floats + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_integers.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_integers.rst new file mode 100644 index 00000000000..71d146c0379 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_integers.rst @@ -0,0 +1,6 @@ +================ +convert_integers +================ + +.. automodule:: pylibcudf.strings.convert.convert_integers + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_ipv4.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_ipv4.rst new file mode 100644 index 00000000000..4ead8677a69 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_ipv4.rst @@ -0,0 +1,6 @@ +============ +convert_ipv4 +============ + +.. automodule:: pylibcudf.strings.convert.convert_ipv4 + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_lists.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_lists.rst new file mode 100644 index 00000000000..33a719a42e1 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_lists.rst @@ -0,0 +1,6 @@ +============= +convert_lists +============= + +.. automodule:: pylibcudf.strings.convert.convert_lists + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_urls.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_urls.rst new file mode 100644 index 00000000000..f20d95e0cdd --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_urls.rst @@ -0,0 +1,6 @@ +============ +convert_urls +============ + +.. automodule:: pylibcudf.strings.convert.convert_urls + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst new file mode 100644 index 00000000000..3d07c1271b4 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst @@ -0,0 +1,15 @@ +convert +======= + +.. toctree:: + :maxdepth: 1 + + convert_booleans + convert_datetime + convert_durations + convert_fixed_point + convert_floats + convert_integers + convert_ipv4 + convert_lists + convert_urls diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst new file mode 100644 index 00000000000..8e86b33b1a0 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst @@ -0,0 +1,6 @@ +============= +find_multiple +============= + +.. 
automodule:: pylibcudf.strings.find_multiple + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 9b1a6b72a88..ae670b5bd8a 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -6,13 +6,26 @@ strings capitalize char_types + combine contains extract find + find_multiple findall + padding regex_flags regex_program repeat + replace_re replace + side_type slice + split strip + wrap + +.. toctree:: + :maxdepth: 2 + :caption: Subpackages + + convert/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/padding.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/padding.rst new file mode 100644 index 00000000000..5b417024fd5 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/padding.rst @@ -0,0 +1,6 @@ +======= +padding +======= + +.. automodule:: pylibcudf.strings.padding + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace_re.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace_re.rst new file mode 100644 index 00000000000..5bf715ef657 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace_re.rst @@ -0,0 +1,6 @@ +========== +replace_re +========== + +.. automodule:: pylibcudf.strings.replace_re + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/side_type.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/side_type.rst new file mode 100644 index 00000000000..d5aef9c4f75 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/side_type.rst @@ -0,0 +1,6 @@ +========= +side_type +========= + +.. automodule:: pylibcudf.strings.side_type + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst new file mode 100644 index 00000000000..cba96e86f45 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst @@ -0,0 +1,6 @@ +===== +split +===== + +.. automodule:: pylibcudf.strings.split + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/wrap.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/wrap.rst new file mode 100644 index 00000000000..bd825f78568 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/wrap.rst @@ -0,0 +1,6 @@ +==== +wrap +==== + +.. automodule:: pylibcudf.strings.wrap + :members: diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb index 75eafcc5387..abfe5a1b178 100644 --- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb +++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb @@ -101,6 +101,8 @@ "outputs": [], "source": [ "# define a scalar function\n", + "\n", + "\n", "def f(x):\n", " return x + 1" ] @@ -247,6 +249,8 @@ "outputs": [], "source": [ "# redefine the same function from above\n", + "\n", + "\n", "def f(x):\n", " return x + 1" ] @@ -1622,6 +1626,8 @@ "outputs": [], "source": [ "# a user defined aggregation function.\n", + "\n", + "\n", "def udaf(df):\n", " return df[\"b\"].max() - df[\"b\"].min() / 2" ] diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh index 5a429bdc739..4b5379cf0f1 100755 --- a/java/ci/build-in-docker.sh +++ b/java/ci/build-in-docker.sh @@ -64,7 +64,8 @@ cmake .. 
-G"${CMAKE_GENERATOR}" \ -DBUILD_TESTS=$BUILD_CPP_TESTS \ -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS \ -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL \ - -DBUILD_SHARED_LIBS=OFF + -DBUILD_SHARED_LIBS=OFF \ + -DKvikIO_REMOTE_SUPPORT=OFF if [[ -z "${PARALLEL_LEVEL}" ]]; then cmake --build . diff --git a/java/src/main/java/ai/rapids/cudf/RegexFlag.java b/java/src/main/java/ai/rapids/cudf/RegexFlag.java index 7ed8e0354c9..68a3856f37d 100644 --- a/java/src/main/java/ai/rapids/cudf/RegexFlag.java +++ b/java/src/main/java/ai/rapids/cudf/RegexFlag.java @@ -28,7 +28,16 @@ public enum RegexFlag { DEFAULT(0), // default MULTILINE(8), // the '^' and '$' honor new-line characters DOTALL(16), // the '.' matching includes new-line characters - ASCII(256); // use only ASCII when matching built-in character classes + ASCII(256), // use only ASCII when matching built-in character classes + /** + * EXT_NEWLINE(512): Extends line delimiters to include the following Unicode characters + * - NEXT_LINE ('\u0085') + * - LINE_SEPARATOR ('\u2028') + * - PARAGRAPH_SEPARATOR ('\u2029') + * - CARRIAGE_RETURN ('\r') + * - NEW_LINE ('\n') + */ + EXT_NEWLINE(512); final int nativeId; // Native id, for use with libcudf. private RegexFlag(int nativeId) { // Only constant values should be used diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 708744569df..14c290b300a 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -31,6 +31,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.EnumSet; import java.util.List; import java.util.Optional; import java.util.concurrent.atomic.AtomicInteger; @@ -3877,6 +3878,43 @@ void testExtractRe() { } } + @Test +void testExtractReWithMultiLineDelimiters() { + String NEXT_LINE = "\u0085"; + String LINE_SEPARATOR = "\u2028"; + String PARAGRAPH_SEPARATOR = "\u2029"; + String CARRIAGE_RETURN = "\r"; + String NEW_LINE = "\n"; + + try (ColumnVector input = ColumnVector.fromStrings( + "boo:" + NEXT_LINE + "boo::" + LINE_SEPARATOR + "boo:::", + "boo:::" + LINE_SEPARATOR + "zzé" + CARRIAGE_RETURN + "lll", + "boo::", + "", + "boo::" + NEW_LINE, + "boo::" + CARRIAGE_RETURN, + "boo:" + NEXT_LINE + "boo::" + PARAGRAPH_SEPARATOR, + "boo:" + NEW_LINE + "boo::" + LINE_SEPARATOR, + "boo:" + NEXT_LINE + "boo::" + NEXT_LINE); + Table expected_ext_newline = new Table.TestBuilder() + .column("boo:::", null, "boo::", null, "boo::", "boo::", "boo::", "boo::", "boo::") + .build(); + Table expected_default = new Table.TestBuilder() + .column("boo:::", null, "boo::", null, "boo::", null, null, null, null) + .build()) { + + // Regex pattern to match 'boo:' followed by one or more colons at the end of the string + try (Table found = input.extractRe(new RegexProgram("(boo:+)$", EnumSet.of(RegexFlag.EXT_NEWLINE)))) { + assertColumnsAreEqual(expected_ext_newline.getColumns()[0], found.getColumns()[0]); + } + + try (Table found = input.extractRe(new RegexProgram("(boo:+)$", EnumSet.of(RegexFlag.DEFAULT)))) { + assertColumnsAreEqual(expected_default.getColumns()[0], found.getColumns()[0]); + } + } + } + + @Test void testExtractAllRecord() { String pattern = "([ab])(\\d)"; diff --git a/java/src/test/java/ai/rapids/cudf/ReductionTest.java b/java/src/test/java/ai/rapids/cudf/ReductionTest.java index 8cc7df1ce7f..6bd6603d71b 100644 --- a/java/src/test/java/ai/rapids/cudf/ReductionTest.java +++ 
b/java/src/test/java/ai/rapids/cudf/ReductionTest.java @@ -612,13 +612,13 @@ void testWithSetOutputType() { assertEquals(expected, result); } - try (Scalar expected = Scalar.fromFloat(1.666667f); + try (Scalar expected = Scalar.fromFloat(1.6666666f); ColumnVector cv = ColumnVector.fromBytes(new byte[]{1, 2, 3, 4}); Scalar result = cv.variance(DType.FLOAT32)) { assertEquals(expected, result); } - try (Scalar expected = Scalar.fromFloat(1.2909945f); + try (Scalar expected = Scalar.fromFloat(1.2909944f); ColumnVector cv = ColumnVector.fromBytes(new byte[]{1, 2, 3, 4}); Scalar result = cv.standardDeviation(DType.FLOAT32)) { assertEquals(expected, result); diff --git a/pyproject.toml b/pyproject.toml index 8f9aa165e5a..6933484f4e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,8 @@ select = [ "F", # pycodestyle Warning "W", + # isort + "I", # no-blank-line-before-function "D201", # one-blank-line-after-class @@ -90,6 +92,8 @@ select = [ "UP007", # Import from `collections.abc` instead: `Callable` "UP035", + # usage of legacy `np.random` function calls + "NPY002", ] ignore = [ # whitespace before : diff --git a/python/cudf/benchmarks/API/bench_functions.py b/python/cudf/benchmarks/API/bench_functions.py index 93109838900..f902111b0db 100644 --- a/python/cudf/benchmarks/API/bench_functions.py +++ b/python/cudf/benchmarks/API/bench_functions.py @@ -72,12 +72,13 @@ def bench_pivot_table_simple(benchmark, dataframe): @pytest_cases.parametrize("nr", NUM_ROWS) def bench_crosstab_simple(benchmark, nr): + rng = np.random.default_rng(seed=0) series_a = np.array(["foo", "bar"] * nr) series_b = np.array(["one", "two"] * nr) series_c = np.array(["dull", "shiny"] * nr) - np.random.shuffle(series_a) - np.random.shuffle(series_b) - np.random.shuffle(series_c) + rng.shuffle(series_a) + rng.shuffle(series_b) + rng.shuffle(series_c) series_a = cudf.Series(series_a) series_b = cudf.Series(series_b) series_c = cudf.Series(series_c) diff --git a/python/cudf/benchmarks/API/bench_multiindex.py b/python/cudf/benchmarks/API/bench_multiindex.py index 6268bcc4267..77004c3313e 100644 --- a/python/cudf/benchmarks/API/bench_multiindex.py +++ b/python/cudf/benchmarks/API/bench_multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. """Benchmarks of MultiIndex methods.""" @@ -11,16 +11,18 @@ @pytest.fixture def pidx(): num_elements = int(1e3) - a = np.random.randint(0, num_elements // 10, num_elements) - b = np.random.randint(0, num_elements // 10, num_elements) + rng = np.random.default_rng(seed=0) + a = rng.integers(0, num_elements // 10, num_elements) + b = rng.integers(0, num_elements // 10, num_elements) return pd.MultiIndex.from_arrays([a, b], names=("a", "b")) @pytest.fixture def midx(pidx): num_elements = int(1e3) - a = np.random.randint(0, num_elements // 10, num_elements) - b = np.random.randint(0, num_elements // 10, num_elements) + rng = np.random.default_rng(seed=0) + a = rng.integers(0, num_elements // 10, num_elements) + b = rng.integers(0, num_elements // 10, num_elements) df = cudf.DataFrame({"a": a, "b": b}) return cudf.MultiIndex.from_frame(df) diff --git a/python/cudf/benchmarks/conftest.py b/python/cudf/benchmarks/conftest.py index 7b2b71cf216..0e4afadccf5 100644 --- a/python/cudf/benchmarks/conftest.py +++ b/python/cudf/benchmarks/conftest.py @@ -56,27 +56,23 @@ # into the main repo. 
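The corrected ReductionTest constants above are the properly rounded float32 values: the sample variance (ddof=1) of {1, 2, 3, 4} is 5/3 = 1.6666..., and its square root is 1.29099445..., which print as 1.6666666 and 1.2909944 in float32 rather than the previously hard-coded 1.666667f and 1.2909945f. A quick NumPy check:

    import numpy as np

    x = np.array([1, 2, 3, 4], dtype=np.float64)
    print(np.float32(x.var(ddof=1)))  # 1.6666666 (5/3 as float32)
    print(np.float32(x.std(ddof=1)))  # 1.2909944 (sqrt(5/3) as float32)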
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) -from config import cudf # noqa: W0611, E402, F401 -from utils import ( # noqa: E402 - OrderedSet, - collapse_fixtures, - column_generators, - make_fixture, -) - # Turn off isort until we upgrade to 5.8.0 # https://github.com/pycqa/isort/issues/1594 -# isort: off from config import ( # noqa: W0611, E402, F401 NUM_COLS, NUM_ROWS, collect_ignore, + cudf, # noqa: W0611, E402, F401 pytest_collection_modifyitems, pytest_sessionfinish, pytest_sessionstart, ) - -# isort: on +from utils import ( # noqa: E402 + OrderedSet, + collapse_fixtures, + column_generators, + make_fixture, +) @pytest_cases.fixture(params=[0, 1], ids=["AxisIndex", "AxisColumn"]) diff --git a/python/cudf/cudf/_fuzz_testing/avro.py b/python/cudf/cudf/_fuzz_testing/avro.py index d9974037daa..172193aa672 100644 --- a/python/cudf/cudf/_fuzz_testing/avro.py +++ b/python/cudf/cudf/_fuzz_testing/avro.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import copy import io @@ -68,12 +68,12 @@ def generate_input(self): # https://github.com/rapidsai/cudf/issues/6604 - cudf.utils.dtypes.TIMEDELTA_TYPES ) - + seed = random.randint(0, 2**32 - 1) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta - seed = random.randint(0, 2**32 - 1) + self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols @@ -100,17 +100,18 @@ def write_data(self, file_name): def set_rand_params(self, params): params_dict = {} + rng = np.random.default_rng(seed=None) for param, values in params.items(): if values == ALL_POSSIBLE_VALUES: if param == "columns": col_size = self._rand(len(self._df.columns)) params_dict[param] = list( - np.unique(np.random.choice(self._df.columns, col_size)) + np.unique(rng.choice(self._df.columns, col_size)) ) elif param in ("skiprows", "num_rows"): - params_dict[param] = np.random.choice( + params_dict[param] = rng.choice( [None, self._rand(len(self._df))] ) else: - params_dict[param] = np.random.choice(values) + params_dict[param] = rng.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) diff --git a/python/cudf/cudf/_fuzz_testing/csv.py b/python/cudf/cudf/_fuzz_testing/csv.py index 67211a1c4bf..fa3ed40ce91 100644 --- a/python/cudf/cudf/_fuzz_testing/csv.py +++ b/python/cudf/cudf/_fuzz_testing/csv.py @@ -54,7 +54,7 @@ def generate_input(self): random.seed(seed) dtypes_list = list(cudf.utils.dtypes.ALL_TYPES) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed @@ -77,25 +77,22 @@ def write_data(self, file_name): def set_rand_params(self, params): params_dict = {} + rng = np.random.default_rng(seed=None) for param, values in params.items(): if values == ALL_POSSIBLE_VALUES: if param == "usecols": col_size = self._rand(len(self._df.columns)) - col_val = np.random.choice( + col_val = rng.choice( [ None, - np.unique( - np.random.choice(self._df.columns, col_size) - ), + np.unique(rng.choice(self._df.columns, col_size)), ] ) params_dict[param] = ( col_val if col_val is None else list(col_val) ) elif param == "dtype": - dtype_val = np.random.choice( - [None, self._df.dtypes.to_dict()] - ) + dtype_val = rng.choice([None, self._df.dtypes.to_dict()]) if dtype_val is not None: 
dtype_val = { col_name: "category" @@ -105,25 +102,25 @@ def set_rand_params(self, params): } params_dict[param] = dtype_val elif param == "header": - header_val = np.random.choice( - ["infer", np.random.randint(low=0, high=len(self._df))] + header_val = rng.choice( + ["infer", rng.integers(low=0, high=len(self._df))] ) params_dict[param] = header_val elif param == "skiprows": - params_dict[param] = np.random.randint( + params_dict[param] = rng.integers( low=0, high=len(self._df) ) elif param == "skipfooter": - params_dict[param] = np.random.randint( + params_dict[param] = rng.integers( low=0, high=len(self._df) ) elif param == "nrows": - nrows_val = np.random.choice( - [None, np.random.randint(low=0, high=len(self._df))] + nrows_val = rng.choice( + [None, rng.integers(low=0, high=len(self._df))] ) params_dict[param] = nrows_val else: - params_dict[param] = np.random.choice(values) + params_dict[param] = rng.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) @@ -159,7 +156,7 @@ def generate_input(self): random.seed(seed) dtypes_list = list(cudf.utils.dtypes.ALL_TYPES) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed @@ -182,26 +179,25 @@ def write_data(self, file_name): def set_rand_params(self, params): params_dict = {} + rng = np.random.default_rng(seed=None) for param, values in params.items(): if values == ALL_POSSIBLE_VALUES: if param == "columns": col_size = self._rand(len(self._current_buffer.columns)) params_dict[param] = list( np.unique( - np.random.choice( - self._current_buffer.columns, col_size - ) + rng.choice(self._current_buffer.columns, col_size) ) ) elif param == "chunksize": - params_dict[param] = np.random.choice( + params_dict[param] = rng.choice( [ None, - np.random.randint( + rng.integers( low=1, high=max(1, len(self._current_buffer)) ), ] ) else: - params_dict[param] = np.random.choice(values) + params_dict[param] = rng.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) diff --git a/python/cudf/cudf/_fuzz_testing/io.py b/python/cudf/cudf/_fuzz_testing/io.py index ffb7171a855..a4b8e18d8b4 100644 --- a/python/cudf/cudf/_fuzz_testing/io.py +++ b/python/cudf/cudf/_fuzz_testing/io.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
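These fuzz-testing and benchmark hunks all apply the same NPY002 recipe: build one np.random.Generator up front, then swap each legacy module-level call for its Generator method. The correspondences used throughout, as a sketch (note that legacy randint and Generator.integers both exclude the high endpoint, while the long-deprecated random_integers included it; endpoint=True restores that behavior where it matters):

    import numpy as np

    rng = np.random.default_rng(seed=0)

    rng.integers(0, 10)                 # was np.random.randint(0, 10)
    rng.integers(1, 10, endpoint=True)  # was np.random.random_integers(1, 10)
    rng.choice([None, "a", "b"])        # was np.random.choice(...)
    arr = np.arange(5)
    rng.shuffle(arr)                    # was np.random.shuffle(arr); in place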
 
 import copy
 import json
@@ -91,8 +91,9 @@ def get_next_regression_params(self):
         return dtypes_meta, num_rows, num_cols, seed
 
     def set_rand_params(self, params):
+        rng = np.random.default_rng(seed=None)
         params_dict = {
-            param: np.random.choice(values) for param, values in params.items()
+            param: rng.choice(values) for param, values in params.items()
         }
         self._current_params["test_kwargs"] = self.process_kwargs(
             params_dict=params_dict
diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py
index e987529c8ba..45d2c8d8cf0 100644
--- a/python/cudf/cudf/_fuzz_testing/json.py
+++ b/python/cudf/cudf/_fuzz_testing/json.py
@@ -80,7 +80,7 @@ def generate_input(self):
             # https://github.com/rapidsai/cudf/issues/7086
             # dtypes_list.extend(["list"])
             dtypes_meta, num_rows, num_cols = _generate_rand_meta(
-                self, dtypes_list
+                self, dtypes_list, seed
             )
             self._current_params["dtypes_meta"] = dtypes_meta
             self._current_params["seed"] = seed
@@ -105,14 +105,15 @@ def write_data(self, file_name):
 
     def set_rand_params(self, params):
         params_dict = {}
+        rng = np.random.default_rng(seed=None)
         for param, values in params.items():
             if param == "dtype" and values == ALL_POSSIBLE_VALUES:
-                dtype_val = np.random.choice(
+                dtype_val = rng.choice(
                     [True, self._current_buffer.dtypes.to_dict()]
                 )
                 params_dict[param] = _get_dtype_param_value(dtype_val)
             else:
-                params_dict[param] = np.random.choice(values)
+                params_dict[param] = rng.choice(values)
 
         self._current_params["test_kwargs"] = self.process_kwargs(params_dict)
 
@@ -155,7 +156,7 @@ def generate_input(self):
             # https://github.com/rapidsai/cudf/issues/7086
             # dtypes_list.extend(["list"])
             dtypes_meta, num_rows, num_cols = _generate_rand_meta(
-                self, dtypes_list
+                self, dtypes_list, seed
             )
             self._current_params["dtypes_meta"] = dtypes_meta
             self._current_params["seed"] = seed
@@ -180,12 +181,13 @@ def write_data(self, file_name):
 
     def set_rand_params(self, params):
         params_dict = {}
+        rng = np.random.default_rng(seed=None)
         for param, values in params.items():
             if param == "dtype" and values == ALL_POSSIBLE_VALUES:
-                dtype_val = np.random.choice(
+                dtype_val = rng.choice(
                     [True, self._current_buffer.dtypes.to_dict()]
                 )
                 params_dict[param] = _get_dtype_param_value(dtype_val)
             else:
-                params_dict[param] = np.random.choice(values)
+                params_dict[param] = rng.choice(values)
         self._current_params["test_kwargs"] = self.process_kwargs(params_dict)
diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py
index ecddc72fa85..4d9e4abb09e 100644
--- a/python/cudf/cudf/_fuzz_testing/orc.py
+++ b/python/cudf/cudf/_fuzz_testing/orc.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
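Note the deliberate asymmetry in these readers: generate_input seeds its RNG and records the seed, while set_rand_params calls default_rng(seed=None), which pulls fresh OS entropy on every invocation so the sampled reader/writer kwargs keep varying between runs:

    import numpy as np

    np.random.default_rng(seed=0)     # reproducible stream (replayable cases)
    np.random.default_rng(seed=None)  # fresh entropy each call (exploration)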
import copy import io @@ -62,13 +62,11 @@ def generate_input(self): - cudf.utils.dtypes.UNSIGNED_TYPES - {"datetime64[ns]"} ) - + seed = random.randint(0, 2**32 - 1) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) - self._current_params["dtypes_meta"] = dtypes_meta - seed = random.randint(0, 2**32 - 1) self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols @@ -94,42 +92,41 @@ def write_data(self, file_name): def set_rand_params(self, params): params_dict = {} + rng = np.random.default_rng(seed=None) for param, values in params.items(): if values == ALL_POSSIBLE_VALUES: if param == "columns": col_size = self._rand(len(self._df.columns)) params_dict[param] = list( - np.unique(np.random.choice(self._df.columns, col_size)) + np.unique(rng.choice(self._df.columns, col_size)) ) elif param == "stripes": f = io.BytesIO(self._current_buffer) orcFile = pa.orc.ORCFile(f) stripes = list(range(orcFile.nstripes)) - params_dict[param] = np.random.choice( + params_dict[param] = rng.choice( [ None, list( map( int, np.unique( - np.random.choice( - stripes, orcFile.nstripes - ) + rng.choice(stripes, orcFile.nstripes) ), ) ), ] ) elif param == "use_index": - params_dict[param] = np.random.choice([True, False]) + params_dict[param] = rng.choice([True, False]) elif param in ("skiprows", "num_rows"): - params_dict[param] = np.random.choice( + params_dict[param] = rng.choice( [None, self._rand(len(self._df))] ) else: if not isinstance(values, list): raise TypeError("values must be of type list") - params_dict[param] = np.random.choice(values) + params_dict[param] = rng.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) @@ -177,12 +174,11 @@ def generate_input(self): # https://github.com/rapidsai/cudf/issues/7355 - cudf.utils.dtypes.DATETIME_TYPES ) - + seed = random.randint(0, 2**32 - 1) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta - seed = random.randint(0, 2**32 - 1) self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols diff --git a/python/cudf/cudf/_fuzz_testing/parquet.py b/python/cudf/cudf/_fuzz_testing/parquet.py index 2d934e4816d..bd3df1b0847 100644 --- a/python/cudf/cudf/_fuzz_testing/parquet.py +++ b/python/cudf/cudf/_fuzz_testing/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
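Hoisting the seed draw above the _generate_rand_meta call (instead of drawing it afterwards, as before) means the value stored in _current_params["seed"] is the same one that actually drove the metadata RNG, so a failing fuzz case can be regenerated exactly. A self-contained toy of that replay property (generate_meta is a stand-in for _generate_rand_meta, not the real helper):

    import random

    import numpy as np

    def generate_meta(dtypes_list, seed):
        # every draw comes from this one seeded Generator
        rng = np.random.default_rng(seed=seed)
        return list(rng.choice(dtypes_list, size=3)), int(rng.integers(1, 100))

    seed = random.randint(0, 2**32 - 1)
    assert generate_meta(["int64", "float32", "str"], seed) == \
        generate_meta(["int64", "float32", "str"], seed)  # replay is exact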
import logging import random @@ -59,12 +59,11 @@ def generate_input(self): - {"uint32"} | {"list", "decimal64"} ) - + seed = random.randint(0, 2**32 - 1) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta - seed = random.randint(0, 2**32 - 1) self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols @@ -96,14 +95,15 @@ def write_data(self, file_name): def set_rand_params(self, params): params_dict = {} + rng = np.random.default_rng(seed=None) for param, values in params.items(): if param == "columns" and values == ALL_POSSIBLE_VALUES: col_size = self._rand(len(self._df.columns)) params_dict[param] = list( - np.unique(np.random.choice(self._df.columns, col_size)) + np.unique(rng.choice(self._df.columns, col_size)) ) else: - params_dict[param] = np.random.choice(values) + params_dict[param] = rng.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) @@ -146,7 +146,7 @@ def generate_input(self): | {"list", "decimal64"} ) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py index 3d070576a12..bbc19dce1a4 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import sys @@ -68,7 +68,9 @@ def parquet_writer_test(pdf): @pythonfuzz( data_handle=ParquetWriter, params={ - "row_group_size": np.random.random_integers(1, 10000, 100), + "row_group_size": np.random.default_rng(seed=0).integers( + 1, 10000, 100 + ), "compression": ["snappy", None], }, ) diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 8ce92e1c0f6..4cadb3a109c 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -40,8 +40,11 @@ } -def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): +def _generate_rand_meta( + obj, dtypes_list, null_frequency_override=None, seed=0 +): obj._current_params = {} + rng = np.random.default_rng(seed=seed) num_rows = obj._rand(obj._max_rows) num_cols = obj._rand(obj._max_columns) @@ -69,12 +72,12 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): meta["max_string_length"] = obj._max_string_length elif dtype == "list": if obj._max_lists_length is None: - meta["lists_max_length"] = np.random.randint(0, 2000000000) + meta["lists_max_length"] = rng.integers(0, 2000000000) else: meta["lists_max_length"] = obj._max_lists_length if obj._max_lists_nesting_depth is None: - meta["nesting_max_depth"] = np.random.randint( + meta["nesting_max_depth"] = rng.integers( 1, np.iinfo("int64").max ) else: @@ -85,7 +88,7 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): ) elif dtype == "struct": if obj._max_lists_nesting_depth is None: - meta["nesting_max_depth"] = np.random.randint(2, 10) + meta["nesting_max_depth"] = rng.integers(2, 10) else: meta["nesting_max_depth"] = obj._max_lists_nesting_depth @@ -95,9 +98,7 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): meta["max_null_frequency"] = 
obj._max_struct_null_frequency if obj._max_struct_types_at_each_level is None: - meta["max_types_at_each_level"] = np.random.randint( - low=1, high=10 - ) + meta["max_types_at_each_level"] = rng.integers(low=1, high=10) else: meta["max_types_at_each_level"] = ( obj._max_struct_types_at_each_level diff --git a/python/cudf/cudf/_lib/column.pxd b/python/cudf/cudf/_lib/column.pxd index 8ceea4920e2..8b1d16f0d85 100644 --- a/python/cudf/cudf/_lib/column.pxd +++ b/python/cudf/cudf/_lib/column.pxd @@ -11,7 +11,7 @@ from pylibcudf.libcudf.column.column_view cimport ( mutable_column_view, ) from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef class Column: diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 99e4c21df8a..065655505b8 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -28,7 +28,7 @@ from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector -from rmm._lib.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from cudf._lib.types cimport ( dtype_from_column_view, diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 49714091f46..4221e745e65 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -4,11 +4,11 @@ import pickle from libc.stdint cimport uint8_t, uintptr_t from libcpp cimport bool -from libcpp.memory cimport make_shared, shared_ptr, unique_ptr +from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector -from rmm._lib.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer import pylibcudf @@ -30,10 +30,6 @@ from libcpp.memory cimport make_unique cimport pylibcudf.libcudf.contiguous_split as cpp_contiguous_split from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.lists.gather cimport ( - segmented_gather as cpp_segmented_gather, -) -from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type @@ -339,26 +335,6 @@ def get_element(Column input_column, size_type index): ) -@acquire_spill_lock() -def segmented_gather(Column source_column, Column gather_map): - cdef shared_ptr[lists_column_view] source_LCV = ( - make_shared[lists_column_view](source_column.view()) - ) - cdef shared_ptr[lists_column_view] gather_map_LCV = ( - make_shared[lists_column_view](gather_map.view()) - ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_segmented_gather( - source_LCV.get()[0], gather_map_LCV.get()[0]) - ) - - result = Column.from_unique_ptr(move(c_result)) - return result - - cdef class _CPackedColumns: @staticmethod diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index bc5e085ec39..d844466120f 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -13,12 +13,11 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.filling cimport calendrical_month_sequence from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type +from pylibcudf.datetime import DatetimeComponent from cudf._lib.column cimport Column from 
cudf._lib.scalar cimport DeviceScalar -import pylibcudf as plc - @acquire_spill_lock() def add_months(Column col, Column months): @@ -40,9 +39,39 @@ def add_months(Column col, Column months): @acquire_spill_lock() def extract_datetime_component(Column col, object field): - result = Column.from_pylibcudf( - plc.datetime.extract_datetime_component(col.to_pylibcudf(mode="read"), field) - ) + + cdef unique_ptr[column] c_result + cdef column_view col_view = col.view() + cdef libcudf_datetime.datetime_component component + + component_names = { + "year": DatetimeComponent.YEAR, + "month": DatetimeComponent.MONTH, + "day": DatetimeComponent.DAY, + "weekday": DatetimeComponent.WEEKDAY, + "hour": DatetimeComponent.HOUR, + "minute": DatetimeComponent.MINUTE, + "second": DatetimeComponent.SECOND, + "millisecond": DatetimeComponent.MILLISECOND, + "microsecond": DatetimeComponent.MICROSECOND, + "nanosecond": DatetimeComponent.NANOSECOND, + } + if field == "day_of_year": + with nogil: + c_result = move(libcudf_datetime.day_of_year(col_view)) + elif field in component_names: + component = component_names[field] + with nogil: + c_result = move( + libcudf_datetime.extract_datetime_component( + col_view, + component + ) + ) + else: + raise ValueError(f"Invalid field: '{field}'") + + result = Column.from_unique_ptr(move(c_result)) if field == "weekday": # Pandas counts Monday-Sunday as 0-6 diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 1dc586bb257..1c9d3a01b80 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -1,49 +1,22 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cpython cimport pycapsule -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - import pylibcudf -from pylibcudf.libcudf.interop cimport ( - DLManagedTensor, - from_dlpack as cpp_from_dlpack, - to_dlpack as cpp_to_dlpack, -) -from pylibcudf.libcudf.table.table cimport table -from pylibcudf.libcudf.table.table_view cimport table_view - -from cudf._lib.utils cimport ( - columns_from_pylibcudf_table, - columns_from_unique_ptr, - table_view_from_columns, -) +from cudf._lib.utils cimport columns_from_pylibcudf_table from cudf.core.buffer import acquire_spill_lock from cudf.core.dtypes import ListDtype, StructDtype -def from_dlpack(dlpack_capsule): +def from_dlpack(object dlpack_capsule): """ Converts a DLPack Tensor PyCapsule into a list of columns. DLPack Tensor PyCapsule is expected to have the name "dltensor". """ - cdef DLManagedTensor* dlpack_tensor = pycapsule.\ - PyCapsule_GetPointer(dlpack_capsule, 'dltensor') - pycapsule.PyCapsule_SetName(dlpack_capsule, 'used_dltensor') - - cdef unique_ptr[table] c_result - - with nogil: - c_result = move( - cpp_from_dlpack(dlpack_tensor) - ) - - res = columns_from_unique_ptr(move(c_result)) - dlpack_tensor.deleter(dlpack_tensor) - return res + return columns_from_pylibcudf_table( + pylibcudf.interop.from_dlpack(dlpack_capsule) + ) def to_dlpack(list source_columns): @@ -52,39 +25,13 @@ def to_dlpack(list source_columns): DLPack Tensor PyCapsule will have the name "dltensor". """ - if any(column.null_count for column in source_columns): - raise ValueError( - "Cannot create a DLPack tensor with null values. \ - Input is required to have null count as zero." 
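from_dlpack/to_dlpack are the clearest instance of the migration pattern running through the rest of this diff: the capsule plumbing (PyCapsule_GetPointer, the custom deleter, the nogil calls) moves down into pylibcudf, and the cudf._lib layer shrinks to a to_pylibcudf/from_pylibcudf round trip. The recurring shape, sketched with a call that appears in these hunks:

    import pylibcudf as plc
    from cudf.core.buffer import acquire_spill_lock
    from cudf._lib.column import Column

    @acquire_spill_lock()
    def count_elements(col):
        # before: col.view() -> nogil libcudf call -> Column.from_unique_ptr
        # after: one pylibcudf call; ownership handling lives in pylibcudf
        return Column.from_pylibcudf(
            plc.lists.count_elements(col.to_pylibcudf(mode="read"))
        )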
- ) - - cdef DLManagedTensor *dlpack_tensor - cdef table_view source_table_view = table_view_from_columns(source_columns) - - with nogil: - dlpack_tensor = cpp_to_dlpack( - source_table_view + return pylibcudf.interop.to_dlpack( + pylibcudf.Table( + [col.to_pylibcudf(mode="read") for col in source_columns] ) - - return pycapsule.PyCapsule_New( - dlpack_tensor, - 'dltensor', - dlmanaged_tensor_pycapsule_deleter ) -cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept: - cdef DLManagedTensor* dlpack_tensor = 0 - try: - dlpack_tensor = pycapsule.PyCapsule_GetPointer( - pycap_obj, 'used_dltensor') - return # we do not call a used capsule's deleter - except Exception: - dlpack_tensor = pycapsule.PyCapsule_GetPointer( - pycap_obj, 'dltensor') - dlpack_tensor.deleter(dlpack_tensor) - - def gather_metadata(object cols_dtypes): """ Generates a ColumnMetadata vector for each column. diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 7e8710bedb6..12432ac6d5d 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -9,7 +9,7 @@ from pylibcudf.libcudf.types cimport null_order, size_type from cudf._lib.column cimport Column from cudf._lib.utils cimport columns_from_pylibcudf_table -import pylibcudf +import pylibcudf as plc from pylibcudf cimport Scalar @@ -17,7 +17,7 @@ from pylibcudf cimport Scalar @acquire_spill_lock() def count_elements(Column col): return Column.from_pylibcudf( - pylibcudf.lists.count_elements( + plc.lists.count_elements( col.to_pylibcudf(mode="read")) ) @@ -25,8 +25,8 @@ def count_elements(Column col): @acquire_spill_lock() def explode_outer(list source_columns, int explode_column_idx): return columns_from_pylibcudf_table( - pylibcudf.lists.explode_outer( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]), + plc.lists.explode_outer( + plc.Table([c.to_pylibcudf(mode="read") for c in source_columns]), explode_column_idx, ) ) @@ -35,7 +35,7 @@ def explode_outer(list source_columns, int explode_column_idx): @acquire_spill_lock() def distinct(Column col, bool nulls_equal, bool nans_all_equal): return Column.from_pylibcudf( - pylibcudf.lists.distinct( + plc.lists.distinct( col.to_pylibcudf(mode="read"), nulls_equal, nans_all_equal, @@ -46,7 +46,7 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal): @acquire_spill_lock() def sort_lists(Column col, bool ascending, str na_position): return Column.from_pylibcudf( - pylibcudf.lists.sort_lists( + plc.lists.sort_lists( col.to_pylibcudf(mode="read"), ascending, null_order.BEFORE if na_position == "first" else null_order.AFTER, @@ -58,7 +58,7 @@ def sort_lists(Column col, bool ascending, str na_position): @acquire_spill_lock() def extract_element_scalar(Column col, size_type index): return Column.from_pylibcudf( - pylibcudf.lists.extract_list_element( + plc.lists.extract_list_element( col.to_pylibcudf(mode="read"), index, ) @@ -68,7 +68,7 @@ def extract_element_scalar(Column col, size_type index): @acquire_spill_lock() def extract_element_column(Column col, Column index): return Column.from_pylibcudf( - pylibcudf.lists.extract_list_element( + plc.lists.extract_list_element( col.to_pylibcudf(mode="read"), index.to_pylibcudf(mode="read"), ) @@ -78,7 +78,7 @@ def extract_element_column(Column col, Column index): @acquire_spill_lock() def contains_scalar(Column col, py_search_key): return Column.from_pylibcudf( - pylibcudf.lists.contains( + plc.lists.contains( col.to_pylibcudf(mode="read"), py_search_key.device_value.c_value, ) @@ 
-88,7 +88,7 @@ def contains_scalar(Column col, py_search_key): @acquire_spill_lock() def index_of_scalar(Column col, object py_search_key): return Column.from_pylibcudf( - pylibcudf.lists.index_of( + plc.lists.index_of( col.to_pylibcudf(mode="read"), py_search_key.device_value.c_value, True, @@ -99,7 +99,7 @@ def index_of_scalar(Column col, object py_search_key): @acquire_spill_lock() def index_of_column(Column col, Column search_keys): return Column.from_pylibcudf( - pylibcudf.lists.index_of( + plc.lists.index_of( col.to_pylibcudf(mode="read"), search_keys.to_pylibcudf(mode="read"), True, @@ -110,8 +110,8 @@ def index_of_column(Column col, Column search_keys): @acquire_spill_lock() def concatenate_rows(list source_columns): return Column.from_pylibcudf( - pylibcudf.lists.concatenate_rows( - pylibcudf.Table([ + plc.lists.concatenate_rows( + plc.Table([ c.to_pylibcudf(mode="read") for c in source_columns ]) ) @@ -121,8 +121,18 @@ def concatenate_rows(list source_columns): @acquire_spill_lock() def concatenate_list_elements(Column input_column, dropna=False): return Column.from_pylibcudf( - pylibcudf.lists.concatenate_list_elements( + plc.lists.concatenate_list_elements( input_column.to_pylibcudf(mode="read"), dropna, ) ) + + +@acquire_spill_lock() +def segmented_gather(Column source_column, Column gather_map): + return Column.from_pylibcudf( + plc.lists.segmented_gather( + source_column.to_pylibcudf(mode="read"), + gather_map.to_pylibcudf(mode="read"), + ) + ) diff --git a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx index e3c2273345a..3dd99c42d76 100644 --- a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx +++ b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx @@ -2,37 +2,23 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.edit_distance cimport ( - edit_distance as cpp_edit_distance, - edit_distance_matrix as cpp_edit_distance_matrix, -) +from pylibcudf cimport nvtext from cudf._lib.column cimport Column @acquire_spill_lock() def edit_distance(Column strings, Column targets): - cdef column_view c_strings = strings.view() - cdef column_view c_targets = targets.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_edit_distance(c_strings, c_targets)) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.edit_distance.edit_distance( + strings.to_pylibcudf(mode="read"), + targets.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def edit_distance_matrix(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_edit_distance_matrix(c_strings)) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.edit_distance.edit_distance_matrix( + strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx index 6591b527eec..7fdf9258b7f 100644 --- a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx +++ b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx @@ -2,75 +2,34 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column 
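segmented_gather moves out of copying.pyx to live with the other list operations and now delegates to plc.lists.segmented_gather. Its semantics, per-row selection of list elements by an index-list column, are easiest to see through the public accessor; a small illustration (Series.list.take is the user-facing counterpart of this gather):

    import cudf

    s = cudf.Series([[10, 20, 30], [40, 50]])
    idx = cudf.Series([[2, 0], [1]])
    print(s.list.take(idx))  # [[30, 10], [50]]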
-from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.generate_ngrams cimport ( - generate_character_ngrams as cpp_generate_character_ngrams, - generate_ngrams as cpp_generate_ngrams, - hash_character_ngrams as cpp_hash_character_ngrams, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar + +from pylibcudf import nvtext @acquire_spill_lock() def generate_ngrams(Column strings, int ngrams, object py_separator): - - cdef DeviceScalar separator = py_separator.device_value - - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef const string_scalar* c_separator = separator\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_generate_ngrams( - c_strings, - c_ngrams, - c_separator[0] - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.generate_ngrams.generate_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams, + py_separator.device_value.c_value + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def generate_character_ngrams(Column strings, int ngrams): - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_generate_character_ngrams( - c_strings, - c_ngrams - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.generate_ngrams.generate_character_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def hash_character_ngrams(Column strings, int ngrams): - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_hash_character_ngrams( - c_strings, - c_ngrams - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.generate_ngrams.hash_character_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams + ) + return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/jaccard.pyx b/python/cudf/cudf/_lib/nvtext/jaccard.pyx index 0ebf7c281e3..c964d0206b7 100644 --- a/python/cudf/cudf/_lib/nvtext/jaccard.pyx +++ b/python/cudf/cudf/_lib/nvtext/jaccard.pyx @@ -2,33 +2,16 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.jaccard cimport ( - jaccard_index as cpp_jaccard_index, -) -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column +from pylibcudf import nvtext + @acquire_spill_lock() def jaccard_index(Column input1, Column input2, int width): - cdef column_view c_input1 = input1.view() - cdef column_view c_input2 = input2.view() - cdef size_type c_width = width - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_jaccard_index( - c_input1, - c_input2, - c_width - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.jaccard.jaccard_index( + input1.to_pylibcudf(mode="read"), + input2.to_pylibcudf(mode="read"), + width, + ) + return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 59cb8d51440..5e39cafa47b 100644 --- 
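The nvtext bindings collapse the same way: each Cython wrapper becomes a single call into pylibcudf.nvtext. The functions named in these hunks can also be exercised directly at the pylibcudf layer; a sketch via pyarrow interop:

    import pyarrow as pa

    import pylibcudf as plc

    strings = plc.interop.from_arrow(pa.array(["kitten", "flaw"]))
    targets = plc.interop.from_arrow(pa.array(["sitting", "lawn"]))
    dist = plc.nvtext.edit_distance.edit_distance(strings, targets)
    print(plc.interop.to_arrow(dist))  # Levenshtein distances: [3, 2]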
a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -2,93 +2,44 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.minhash cimport ( - minhash as cpp_minhash, - minhash64 as cpp_minhash64, - word_minhash as cpp_word_minhash, - word_minhash64 as cpp_word_minhash64, -) -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column - -@acquire_spill_lock() -def minhash(Column strings, Column seeds, int width): - - cdef column_view c_strings = strings.view() - cdef size_type c_width = width - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_minhash( - c_strings, - c_seeds, - c_width - ) - ) - - return Column.from_unique_ptr(move(c_result)) +from pylibcudf import nvtext @acquire_spill_lock() -def minhash64(Column strings, Column seeds, int width): - - cdef column_view c_strings = strings.view() - cdef size_type c_width = width - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result +def minhash(Column input, Column seeds, int width=4): + result = nvtext.minhash.minhash( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + width, + ) + return Column.from_pylibcudf(result) - with nogil: - c_result = move( - cpp_minhash64( - c_strings, - c_seeds, - c_width - ) - ) - return Column.from_unique_ptr(move(c_result)) +@acquire_spill_lock() +def minhash64(Column input, Column seeds, int width=4): + result = nvtext.minhash.minhash64( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + width, + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def word_minhash(Column input, Column seeds): - - cdef column_view c_input = input.view() - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_word_minhash( - c_input, - c_seeds - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.minhash.word_minhash( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def word_minhash64(Column input, Column seeds): - - cdef column_view c_input = input.view() - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_word_minhash64( - c_input, - c_seeds - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.minhash.word_minhash64( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx index dec4f037d98..c125d92a24e 100644 --- a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx +++ b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx @@ -2,48 +2,23 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.ngrams_tokenize cimport ( - ngrams_tokenize as cpp_ngrams_tokenize, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.types cimport size_type - from 
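minhash and minhash64 keep their (input, seeds, width) shape but gain a width default of 4 and route through pylibcudf. Roughly (the seed column dtype is an assumption here: uint32 for the 32-bit variant, uint64 for minhash64):

    import pyarrow as pa

    import pylibcudf as plc

    strs = plc.interop.from_arrow(pa.array(["the quick brown fox"]))
    seeds = plc.interop.from_arrow(pa.array([0, 1, 2], type=pa.uint32()))
    # one minimum hash per seed, taken over all width-4 character shingles
    hashes = plc.nvtext.minhash.minhash(strs, seeds, 4)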
cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar + +from pylibcudf import nvtext @acquire_spill_lock() def ngrams_tokenize( - Column strings, + Column input, int ngrams, object py_delimiter, object py_separator ): - - cdef DeviceScalar delimiter = py_delimiter.device_value - cdef DeviceScalar separator = py_separator.device_value - - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef const string_scalar* c_separator = separator\ - .get_raw_ptr() - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_ngrams_tokenize( - c_strings, - c_ngrams, - c_delimiter[0], - c_separator[0] - ) + return Column.from_pylibcudf( + nvtext.ngrams_tokenize.ngrams_tokenize( + input.to_pylibcudf(mode="read"), + ngrams, + py_delimiter.device_value.c_value, + py_separator.device_value.c_value ) - - return Column.from_unique_ptr(move(c_result)) + ) diff --git a/python/cudf/cudf/_lib/nvtext/normalize.pyx b/python/cudf/cudf/_lib/nvtext/normalize.pyx index 5e86a9ce959..cc45123dd0a 100644 --- a/python/cudf/cudf/_lib/nvtext/normalize.pyx +++ b/python/cudf/cudf/_lib/nvtext/normalize.pyx @@ -3,36 +3,26 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.normalize cimport ( - normalize_characters as cpp_normalize_characters, - normalize_spaces as cpp_normalize_spaces, -) from cudf._lib.column cimport Column - -@acquire_spill_lock() -def normalize_spaces(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_normalize_spaces(c_strings)) - - return Column.from_unique_ptr(move(c_result)) +from pylibcudf import nvtext @acquire_spill_lock() -def normalize_characters(Column strings, bool do_lower=True): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result +def normalize_spaces(Column input): + return Column.from_pylibcudf( + nvtext.normalize.normalize_spaces( + input.to_pylibcudf(mode="read") + ) + ) - with nogil: - c_result = move(cpp_normalize_characters(c_strings, do_lower)) - return Column.from_unique_ptr(move(c_result)) +@acquire_spill_lock() +def normalize_characters(Column input, bool do_lower=True): + return Column.from_pylibcudf( + nvtext.normalize.normalize_characters( + input.to_pylibcudf(mode="read"), + do_lower, + ) + ) diff --git a/python/cudf/cudf/_lib/nvtext/replace.pyx b/python/cudf/cudf/_lib/nvtext/replace.pyx index 61ae3da5782..bec56ade83c 100644 --- a/python/cudf/cudf/_lib/nvtext/replace.pyx +++ b/python/cudf/cudf/_lib/nvtext/replace.pyx @@ -2,20 +2,10 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.replace cimport ( - filter_tokens as cpp_filter_tokens, - replace_tokens as cpp_replace_tokens, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar +from pylibcudf import nvtext @acquire_spill_lock() @@ -30,27 +20,14 @@ def replace_tokens(Column strings, 
provided. """ - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef column_view c_strings = strings.view() - cdef column_view c_targets = targets.view() - cdef column_view c_replacements = replacements.view() - - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_replace_tokens( - c_strings, - c_targets, - c_replacements, - c_delimiter[0], - ) + return Column.from_pylibcudf( + nvtext.replace.replace_tokens( + strings.to_pylibcudf(mode="read"), + targets.to_pylibcudf(mode="read"), + replacements.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, ) - - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() @@ -65,24 +42,11 @@ def filter_tokens(Column strings, character provided. """ - cdef DeviceScalar replacement = py_replacement.device_value - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef column_view c_strings = strings.view() - cdef const string_scalar* c_repl = replacement\ - .get_raw_ptr() - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_filter_tokens( - c_strings, - min_token_length, - c_repl[0], - c_delimiter[0], - ) + return Column.from_pylibcudf( + nvtext.replace.filter_tokens( + strings.to_pylibcudf(mode="read"), + min_token_length, + py_replacement.device_value.c_value, + py_delimiter.device_value.c_value, ) - - return Column.from_unique_ptr(move(c_result)) + ) diff --git a/python/cudf/cudf/_lib/nvtext/stemmer.pyx b/python/cudf/cudf/_lib/nvtext/stemmer.pyx index 5bf25562fed..63a389b64d5 100644 --- a/python/cudf/cudf/_lib/nvtext/stemmer.pyx +++ b/python/cudf/cudf/_lib/nvtext/stemmer.pyx @@ -1,24 +1,19 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
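replace_tokens and filter_tokens act on whole delimiter-separated tokens, which is what distinguishes them from plain substring replacement. Through the public API (output shown as a comment):

    import cudf

    s = cudf.Series(["this is me", "theme park"])
    print(s.str.replace_tokens(["me"], ["you"]))
    # ["this is you", "theme park"]; "theme" is not the token "me"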
-from cudf.core.buffer import acquire_spill_lock - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from enum import IntEnum -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view +from cudf.core.buffer import acquire_spill_lock + from pylibcudf.libcudf.nvtext.stemmer cimport ( - is_letter as cpp_is_letter, letter_type, - porter_stemmer_measure as cpp_porter_stemmer_measure, underlying_type_t_letter_type, ) from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column +from pylibcudf import nvtext + class LetterType(IntEnum): CONSONANT = letter_type.CONSONANT @@ -27,43 +22,34 @@ class LetterType(IntEnum): @acquire_spill_lock() def porter_stemmer_measure(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_porter_stemmer_measure(c_strings)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + nvtext.stemmer.porter_stemmer_measure( + strings.to_pylibcudf(mode="read"), + ) + ) @acquire_spill_lock() def is_letter(Column strings, object ltype, size_type index): - cdef column_view c_strings = strings.view() - cdef letter_type c_ltype = ( - ltype + return Column.from_pylibcudf( + nvtext.stemmer.is_letter( + strings.to_pylibcudf(mode="read"), + ltype==LetterType.VOWEL, + index, + ) ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_is_letter(c_strings, c_ltype, index)) - - return Column.from_unique_ptr(move(c_result)) @acquire_spill_lock() def is_letter_multi(Column strings, object ltype, Column indices): - cdef column_view c_strings = strings.view() - cdef column_view c_indices = indices.view() - cdef letter_type c_ltype = ( - ltype + return Column.from_pylibcudf( + nvtext.stemmer.is_letter( + strings.to_pylibcudf(mode="read"), + ltype==LetterType.VOWEL, + indices.to_pylibcudf(mode="read"), + ) ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_is_letter(c_strings, c_ltype, c_indices)) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index 27095ca02d4..0f9820ed1db 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -4,7 +4,7 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.scalar.scalar cimport scalar -from rmm._lib.memory_resource cimport DeviceMemoryResource +from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource cdef class DeviceScalar: diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 0dde91316fb..56712402919 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -6,7 +6,6 @@ import numpy as np import pandas as pd import pyarrow as pa -from libc.stdint cimport int64_t from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -25,25 +24,7 @@ cimport pylibcudf.libcudf.types as libcudf_types # DeviceScalar is phased out entirely from cuDF Cython (at which point # cudf.Scalar will be directly backed by pylibcudf.Scalar). 
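In the stemmer binding the letter_type enum no longer crosses the Cython boundary; is_letter now receives a plain boolean, ltype == LetterType.VOWEL. For reference, porter_stemmer_measure computes the m of the classic Porter paper (TREE has m=0, TROUBLE m=1, TROUBLES m=2):

    import pyarrow as pa

    import pylibcudf as plc

    words = plc.interop.from_arrow(pa.array(["tree", "trouble", "troubles"]))
    print(plc.interop.to_arrow(
        plc.nvtext.stemmer.porter_stemmer_measure(words)
    ))  # [0, 1, 2]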
from pylibcudf cimport Scalar as plc_Scalar -from pylibcudf.libcudf.scalar.scalar cimport ( - duration_scalar, - list_scalar, - scalar, - struct_scalar, - timestamp_scalar, -) -from pylibcudf.libcudf.wrappers.durations cimport ( - duration_ms, - duration_ns, - duration_s, - duration_us, -) -from pylibcudf.libcudf.wrappers.timestamps cimport ( - timestamp_ms, - timestamp_ns, - timestamp_s, - timestamp_us, -) +from pylibcudf.libcudf.scalar.scalar cimport list_scalar, scalar, struct_scalar from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id @@ -284,62 +265,6 @@ cdef class DeviceScalar: ] -# TODO: Currently the only uses of this function and the one below are in -# _create_proxy_nat_scalar. See if that code path can be simplified to excise -# or at least simplify these implementations. -cdef _set_datetime64_from_np_scalar(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - - value = value if valid else 0 - - if dtype == "datetime64[s]": - s.reset( - new timestamp_scalar[timestamp_s](np.int64(value), valid) - ) - elif dtype == "datetime64[ms]": - s.reset( - new timestamp_scalar[timestamp_ms](np.int64(value), valid) - ) - elif dtype == "datetime64[us]": - s.reset( - new timestamp_scalar[timestamp_us](np.int64(value), valid) - ) - elif dtype == "datetime64[ns]": - s.reset( - new timestamp_scalar[timestamp_ns](np.int64(value), valid) - ) - else: - raise ValueError(f"dtype not supported: {dtype}") - -cdef _set_timedelta64_from_np_scalar(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - - value = value if valid else 0 - - if dtype == "timedelta64[s]": - s.reset( - new duration_scalar[duration_s](np.int64(value), valid) - ) - elif dtype == "timedelta64[ms]": - s.reset( - new duration_scalar[duration_ms](np.int64(value), valid) - ) - elif dtype == "timedelta64[us]": - s.reset( - new duration_scalar[duration_us](np.int64(value), valid) - ) - elif dtype == "timedelta64[ns]": - s.reset( - new duration_scalar[duration_ns](np.int64(value), valid) - ) - else: - raise ValueError(f"dtype not supported: {dtype}") - - def as_device_scalar(val, dtype=None): if isinstance(val, (cudf.Scalar, DeviceScalar)): if dtype == val.dtype or dtype is None: @@ -361,22 +286,3 @@ def _is_null_host_scalar(slr): return True else: return False - - -def _create_proxy_nat_scalar(dtype): - cdef DeviceScalar result = DeviceScalar.__new__(DeviceScalar) - - dtype = cudf.dtype(dtype) - if dtype.char in 'mM': - nat = dtype.type('NaT').astype(dtype) - if dtype.type == np.datetime64: - _set_datetime64_from_np_scalar( - ( result.c_value).c_obj, nat, dtype, True - ) - elif dtype.type == np.timedelta64: - _set_timedelta64_from_np_scalar( - ( result.c_value).c_obj, nat, dtype, True - ) - return result - else: - raise TypeError('NAT only valid for datetime and timedelta') diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 60a6795a402..06ee07d8e2b 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -2,80 +2,27 @@ from cudf._lib.column cimport Column -from cudf._lib.scalar import as_device_scalar - -from cudf._lib.scalar cimport DeviceScalar - -from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES - -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar 
cimport string_scalar -from pylibcudf.libcudf.strings.convert.convert_booleans cimport ( - from_booleans as cpp_from_booleans, - to_booleans as cpp_to_booleans, -) -from pylibcudf.libcudf.strings.convert.convert_datetime cimport ( - is_timestamp as cpp_is_timestamp, -) -from pylibcudf.libcudf.strings.convert.convert_floats cimport ( - from_floats as cpp_from_floats, - to_floats as cpp_to_floats, -) -from pylibcudf.libcudf.strings.convert.convert_integers cimport ( - from_integers as cpp_from_integers, - hex_to_integers as cpp_hex_to_integers, - integers_to_hex as cpp_integers_to_hex, - is_hex as cpp_is_hex, - to_integers as cpp_to_integers, -) -from pylibcudf.libcudf.strings.convert.convert_ipv4 cimport ( - integers_to_ipv4 as cpp_integers_to_ipv4, - ipv4_to_integers as cpp_ipv4_to_integers, - is_ipv4 as cpp_is_ipv4, -) -from pylibcudf.libcudf.types cimport data_type, type_id - -from cudf._lib.types cimport underlying_type_t_type_id - import pylibcudf as plc +from pylibcudf.types cimport DataType -import cudf +from cudf._lib.scalar import as_device_scalar from cudf._lib.types cimport dtype_to_pylibcudf_type def floating_to_string(Column input_col): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_floats( - input_column_view)) - - return Column.from_unique_ptr(move(c_result)) - - -def string_to_floating(Column input_col, object out_type): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type] - ) + plc_column = plc.strings.convert.convert_floats.from_floats( + input_col.to_pylibcudf(mode="read"), ) - cdef data_type c_out_type = data_type(tid) - with nogil: - c_result = move( - cpp_to_floats( - input_column_view, - c_out_type)) + return Column.from_pylibcudf(plc_column) + - return Column.from_unique_ptr(move(c_result)) +def string_to_floating(Column input_col, DataType out_type): + plc_column = plc.strings.convert.convert_floats.to_floats( + input_col.to_pylibcudf(mode="read"), + out_type + ) + return Column.from_pylibcudf(plc_column) def dtos(Column input_col): @@ -107,7 +54,7 @@ def stod(Column input_col): A Column with strings cast to double """ - return string_to_floating(input_col, cudf.dtype("float64")) + return string_to_floating(input_col, plc.DataType(plc.TypeId.FLOAT64)) def ftos(Column input_col): @@ -139,36 +86,22 @@ def stof(Column input_col): A Column with strings cast to float """ - return string_to_floating(input_col, cudf.dtype("float32")) + return string_to_floating(input_col, plc.DataType(plc.TypeId.FLOAT32)) def integer_to_string(Column input_col): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_integers( - input_column_view)) - - return Column.from_unique_ptr(move(c_result)) - - -def string_to_integer(Column input_col, object out_type): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type] - ) + plc_column = plc.strings.convert.convert_integers.from_integers( + input_col.to_pylibcudf(mode="read"), ) - cdef data_type c_out_type = data_type(tid) - with nogil: - c_result = move( - cpp_to_integers( - input_column_view, - c_out_type)) + return Column.from_pylibcudf(plc_column) - return Column.from_unique_ptr(move(c_result)) + +def string_to_integer(Column input_col, DataType out_type): + 
plc_column = plc.strings.convert.convert_integers.to_integers( + input_col.to_pylibcudf(mode="read"), + out_type + ) + return Column.from_pylibcudf(plc_column) def i8tos(Column input_col): @@ -200,7 +133,7 @@ def stoi8(Column input_col): A Column with strings cast to int8 """ - return string_to_integer(input_col, cudf.dtype("int8")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT8)) def i16tos(Column input_col): @@ -232,7 +165,7 @@ def stoi16(Column input_col): A Column with strings cast to int16 """ - return string_to_integer(input_col, cudf.dtype("int16")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT16)) def itos(Column input_col): @@ -264,7 +197,7 @@ def stoi(Column input_col): A Column with strings cast to int32 """ - return string_to_integer(input_col, cudf.dtype("int32")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT32)) def ltos(Column input_col): @@ -296,7 +229,7 @@ def stol(Column input_col): A Column with strings cast to int64 """ - return string_to_integer(input_col, cudf.dtype("int64")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT64)) def ui8tos(Column input_col): @@ -328,7 +261,7 @@ def stoui8(Column input_col): A Column with strings cast to uint8 """ - return string_to_integer(input_col, cudf.dtype("uint8")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT8)) def ui16tos(Column input_col): @@ -360,7 +293,7 @@ def stoui16(Column input_col): A Column with strings cast to uint16 """ - return string_to_integer(input_col, cudf.dtype("uint16")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT16)) def uitos(Column input_col): @@ -392,7 +325,7 @@ def stoui(Column input_col): A Column with strings cast to uint32 """ - return string_to_integer(input_col, cudf.dtype("uint32")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT32)) def ultos(Column input_col): @@ -424,80 +357,24 @@ def stoul(Column input_col): A Column with strings cast to uint64 """ - return string_to_integer(input_col, cudf.dtype("uint64")) - - -def _to_booleans(Column input_col, object string_true="True"): - """ - Converting/Casting input column of type string to boolean column - - Parameters - ---------- - input_col : input column of type string - string_true : string that represents True - - Returns - ------- - A Column with string values cast to boolean - """ - - cdef DeviceScalar str_true = as_device_scalar(string_true) - cdef column_view input_column_view = input_col.view() - cdef const string_scalar* string_scalar_true = ( - str_true.get_raw_ptr()) - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_to_booleans( - input_column_view, - string_scalar_true[0])) - - return Column.from_unique_ptr(move(c_result)) + return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT64)) def to_booleans(Column input_col): - - return _to_booleans(input_col) - - -def _from_booleans( - Column input_col, - object string_true="True", - object string_false="False"): - """ - Converting/Casting input column of type boolean to string column - - Parameters - ---------- - input_col : input column of type boolean - string_true : string that represents True - string_false : string that represents False - - Returns - ------- - A Column with boolean values cast to string - """ - - cdef DeviceScalar str_true = as_device_scalar(string_true) - cdef DeviceScalar str_false = as_device_scalar(string_false) - cdef column_view input_column_view = input_col.view() - cdef const string_scalar* 
string_scalar_true = ( - str_true.get_raw_ptr()) - cdef const string_scalar* string_scalar_false = ( - str_false.get_raw_ptr()) - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_booleans( - input_column_view, - string_scalar_true[0], - string_scalar_false[0])) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_booleans.to_booleans( + input_col.to_pylibcudf(mode="read"), + as_device_scalar("True").c_value, + ) + return Column.from_pylibcudf(plc_column) def from_booleans(Column input_col): - return _from_booleans(input_col) + plc_column = plc.strings.convert.convert_booleans.from_booleans( + input_col.to_pylibcudf(mode="read"), + as_device_scalar("True").c_value, + as_device_scalar("False").c_value, + ) + return Column.from_pylibcudf(plc_column) def int2timestamp( @@ -520,11 +397,10 @@ def int2timestamp( A Column with date-time represented in string format """ - cdef string c_timestamp_format = format.encode("UTF-8") return Column.from_pylibcudf( plc.strings.convert.convert_datetime.from_timestamps( input_col.to_pylibcudf(mode="read"), - c_timestamp_format, + format, names.to_pylibcudf(mode="read") ) ) @@ -545,12 +421,11 @@ def timestamp2int(Column input_col, dtype, format): """ dtype = dtype_to_pylibcudf_type(dtype) - cdef string c_timestamp_format = format.encode('UTF-8') return Column.from_pylibcudf( plc.strings.convert.convert_datetime.to_timestamps( input_col.to_pylibcudf(mode="read"), dtype, - c_timestamp_format + format ) ) @@ -570,18 +445,11 @@ def istimestamp(Column input_col, str format): A Column of boolean values identifying strings that matched the format. """ - if input_col.size == 0: - return cudf.core.column.column_empty(0, dtype=cudf.dtype("bool")) - cdef column_view input_column_view = input_col.view() - cdef string c_timestamp_format = str(format).encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_is_timestamp( - input_column_view, - c_timestamp_format)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_datetime.is_timestamp( + input_col.to_pylibcudf(mode="read"), + format + ) + return Column.from_pylibcudf(plc_column) def timedelta2int(Column input_col, dtype, format): @@ -599,12 +467,11 @@ def timedelta2int(Column input_col, dtype, format): """ dtype = dtype_to_pylibcudf_type(dtype) - cdef string c_timestamp_format = format.encode('UTF-8') return Column.from_pylibcudf( plc.strings.convert.convert_durations.to_durations( input_col.to_pylibcudf(mode="read"), dtype, - c_timestamp_format + format ) ) @@ -623,12 +490,10 @@ def int2timedelta(Column input_col, str format): A Column with Timedelta represented in string format """ - - cdef string c_duration_format = format.encode('UTF-8') return Column.from_pylibcudf( plc.strings.convert.convert_durations.from_durations( input_col.to_pylibcudf(mode="read"), - c_duration_format + format ) ) @@ -646,14 +511,10 @@ def int2ip(Column input_col): A Column with integer represented in string ipv4 format """ - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_integers_to_ipv4(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.integers_to_ipv4( + input_col.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) def ip2int(Column input_col): @@ -669,14 +530,10 @@ def ip2int(Column input_col): A Column with ipv4 represented as 
integer """ - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_ipv4_to_integers(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.ipv4_to_integers( + input_col.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) def is_ipv4(Column source_strings): @@ -685,18 +542,13 @@ def is_ipv4(Column source_strings): that have strings in IPv4 format. This format is nnn.nnn.nnn.nnn where nnn is integer digits in [0,255]. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_ipv4( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.is_ipv4( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) -def htoi(Column input_col, **kwargs): +def htoi(Column input_col): """ Converting input column of type string having hex values to integer of out_type @@ -709,22 +561,11 @@ def htoi(Column input_col, **kwargs): ------- A Column of integers parsed from hexadecimal string values. """ - - cdef column_view input_column_view = input_col.view() - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[cudf.dtype("int64")] - ) + plc_column = plc.strings.convert.convert_integers.hex_to_integers( + input_col.to_pylibcudf(mode="read"), + plc.DataType(plc.TypeId.INT64) ) - cdef data_type c_out_type = data_type(tid) - - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_hex_to_integers(input_column_view, - c_out_type)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) def is_hex(Column source_strings): @@ -732,15 +573,10 @@ def is_hex(Column source_strings): Returns a Column of boolean values with True for `source_strings` that have hex characters. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_hex( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_integers.is_hex( + source_strings.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) def itoh(Column input_col): @@ -756,11 +592,7 @@ def itoh(Column input_col): ------- A Column of strings with hexadecimal characters. 
""" - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_integers_to_hex(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_integers.integers_to_hex( + input_col.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 4bf8a9b1a8f..ffa5e603408 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -71,16 +71,9 @@ startswith_multiple, ) from cudf._lib.strings.find_multiple import find_multiple -from cudf._lib.strings.findall import findall -from cudf._lib.strings.json import GetJsonObjectOptions, get_json_object -from cudf._lib.strings.padding import ( - SideType, - center, - ljust, - pad, - rjust, - zfill, -) +from cudf._lib.strings.findall import find_re, findall +from cudf._lib.strings.json import get_json_object +from cudf._lib.strings.padding import center, ljust, pad, rjust, zfill from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence from cudf._lib.strings.replace import ( insert, diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 76cc13db0da..0f7b27d85d7 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -2,24 +2,11 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +from cudf._lib.column cimport Column -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.combine cimport ( - concatenate as cpp_concatenate, - join_list_elements as cpp_join_list_elements, - join_strings as cpp_join_strings, - output_if_empty_list, - separator_on_nulls, -) -from pylibcudf.libcudf.table.table_view cimport table_view +import pylibcudf as plc -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport table_view_from_columns +import cudf @acquire_spill_lock() @@ -31,26 +18,12 @@ def concatenate(list source_strings, with the specified `sep` between each column and `na`/`None` values are replaced by `na_rep` """ - cdef DeviceScalar separator = sep.device_value - cdef DeviceScalar narep = na_rep.device_value - - cdef unique_ptr[column] c_result - cdef table_view source_view = table_view_from_columns(source_strings) - - cdef const string_scalar* scalar_separator = \ - (separator.get_raw_ptr()) - cdef const string_scalar* scalar_narep = ( - narep.get_raw_ptr() + plc_column = plc.strings.combine.concatenate( + plc.Table([col.to_pylibcudf(mode="read") for col in source_strings]), + sep.device_value.c_value, + na_rep.device_value.c_value, ) - - with nogil: - c_result = move(cpp_concatenate( - source_view, - scalar_separator[0], - scalar_narep[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -62,27 +35,12 @@ def join(Column source_strings, with the specified `sep` between each column and `na`/`None` values are replaced by `na_rep` """ - - cdef DeviceScalar separator = sep.device_value - cdef DeviceScalar narep = na_rep.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const 
string_scalar* scalar_separator = \ - <const string_scalar*>(separator.get_raw_ptr()) - cdef const string_scalar* scalar_narep = <const string_scalar*>( - narep.get_raw_ptr() + plc_column = plc.strings.combine.join_strings( + source_strings.to_pylibcudf(mode="read"), + sep.device_value.c_value, + na_rep.device_value.c_value, ) - - with nogil: - c_result = move(cpp_join_strings( - source_view, - scalar_separator[0], - scalar_narep[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -96,29 +54,15 @@ def join_lists_with_scalar( between each string in lists and `<NA>`/`None` values are replaced by `py_narep` """ - - cdef DeviceScalar separator = py_separator.device_value - cdef DeviceScalar narep = py_narep.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_separator = \ - <const string_scalar*>(separator.get_raw_ptr()) - cdef const string_scalar* scalar_narep = <const string_scalar*>( - narep.get_raw_ptr() + plc_column = plc.strings.combine.join_list_elements( + source_strings.to_pylibcudf(mode="read"), + py_separator.device_value.c_value, + py_narep.device_value.c_value, + cudf._lib.scalar.DeviceScalar("", cudf.dtype("object")).c_value, + plc.strings.combine.SeparatorOnNulls.YES, + plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) - - with nogil: - c_result = move(cpp_join_list_elements( - source_view, - scalar_separator[0], - scalar_narep[0], - separator_on_nulls.YES, - output_if_empty_list.NULL_ELEMENT - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -135,28 +79,12 @@ def join_lists_with_column( `<NA>`/`None` values in `separator_strings` are replaced by `py_separator_narep` """ - - cdef DeviceScalar source_narep = py_source_narep.device_value - cdef DeviceScalar separator_narep = py_separator_narep.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view separator_view = separator_strings.view() - - cdef const string_scalar* scalar_source_narep = \ - <const string_scalar*>(source_narep.get_raw_ptr()) - cdef const string_scalar* scalar_separator_narep = <const string_scalar*>( - separator_narep.get_raw_ptr() + plc_column = plc.strings.combine.join_list_elements( + source_strings.to_pylibcudf(mode="read"), + separator_strings.to_pylibcudf(mode="read"), + py_separator_narep.device_value.c_value, + py_source_narep.device_value.c_value, + plc.strings.combine.SeparatorOnNulls.YES, + plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) - - with nogil: - c_result = move(cpp_join_list_elements( - source_view, - separator_view, - scalar_separator_narep[0], - scalar_source_narep[0], - separator_on_nulls.YES, - output_if_empty_list.NULL_ELEMENT - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx index a8df8c9a92c..96dcd021c3b 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx @@ -1,22 +1,11 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION.
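# The rewrite below follows the pylibcudf round-trip used throughout this
# PR: unwrap the cudf Column with to_pylibcudf(mode="read"), call the
# matching plc.* routine, and wrap the result back with Column.from_pylibcudf.
# A minimal sketch of the pattern, using the from_fixed_point call that
# appears below in this file:
#
#     plc_column = plc.strings.convert.convert_fixed_point.from_fixed_point(
#         input_col.to_pylibcudf(mode="read"),
#     )
#     return Column.from_pylibcudf(plc_column)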
-import cudf - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_fixed_point cimport ( - from_fixed_point as cpp_from_fixed_point, - is_fixed_point as cpp_is_fixed_point, - to_fixed_point as cpp_to_fixed_point, -) -from pylibcudf.libcudf.types cimport data_type, type_id - from cudf._lib.column cimport Column +from cudf._lib.types cimport dtype_to_pylibcudf_type + +import pylibcudf as plc @acquire_spill_lock() @@ -32,14 +21,10 @@ def from_decimal(Column input_col): ------- A column of strings representing the input decimal values. """ - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_fixed_point( - input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.from_fixed_point( + input_col.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -57,25 +42,11 @@ def to_decimal(Column input_col, object out_type): ------- A column of decimals parsed from the string values. """ - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef int scale = out_type.scale - cdef data_type c_out_type - if isinstance(out_type, cudf.Decimal32Dtype): - c_out_type = data_type(type_id.DECIMAL32, -scale) - elif isinstance(out_type, cudf.Decimal64Dtype): - c_out_type = data_type(type_id.DECIMAL64, -scale) - elif isinstance(out_type, cudf.Decimal128Dtype): - c_out_type = data_type(type_id.DECIMAL128, -scale) - else: - raise TypeError("should be a decimal dtype") - with nogil: - c_result = move( - cpp_to_fixed_point( - input_column_view, - c_out_type)) - - result = Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point( + input_col.to_pylibcudf(mode="read"), + dtype_to_pylibcudf_type(out_type), + ) + result = Column.from_pylibcudf(plc_column) result.dtype.precision = out_type.precision return result @@ -98,14 +69,8 @@ def is_fixed_point(Column input_col, object dtype): ------- A Column of booleans indicating valid decimal conversion. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = input_col.view() - cdef int scale = dtype.scale - cdef data_type c_dtype = data_type(type_id.DECIMAL64, -scale) - with nogil: - c_result = move(cpp_is_fixed_point( - source_view, - c_dtype - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.is_fixed_point( + input_col.to_pylibcudf(mode="read"), + dtype_to_pylibcudf_type(dtype), + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx index 7965b588703..5da6e3f10cc 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx @@ -1,18 +1,11 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
-from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_floats cimport ( - is_float as cpp_is_float, -) - from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def is_float(Column source_strings): @@ -20,12 +13,7 @@ def is_float(Column source_strings): Returns a Column of boolean values with True for `source_strings` that have floats. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_float( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_floats.is_float( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx index 73aebf8ab35..3a2cb4bd5c7 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx @@ -1,23 +1,13 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +import pylibcudf as plc from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.convert.convert_lists cimport ( - format_list_column as cpp_format_list_column, -) - from cudf._lib.column cimport Column from cudf._lib.scalar import as_device_scalar -from cudf._lib.scalar cimport DeviceScalar - @acquire_spill_lock() def format_list_column(Column source_list, Column separators): @@ -34,19 +24,9 @@ def format_list_column(Column source_list, Column separators): ------- Formatted strings column """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_list.view() - cdef column_view separators_view = separators.view() - # Use 'None' as null-replacement string - cdef DeviceScalar str_na_rep = as_device_scalar("None") - cdef const string_scalar* string_scalar_na_rep = ( - str_na_rep.get_raw_ptr()) - - with nogil: - c_result = move(cpp_format_list_column( - source_view, string_scalar_na_rep[0], separators_view - )) - - return Column.from_unique_ptr( - move(c_result) + plc_column = plc.strings.convert.convert_lists.format_list_column( + source_list.to_pylibcudf(mode="read"), + as_device_scalar("None").c_value, + separators.to_pylibcudf(mode="read"), ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx index e52116d6247..d5c2f771970 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx @@ -1,17 +1,9 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +import pylibcudf as plc from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_urls cimport ( - url_decode as cpp_url_decode, - url_encode as cpp_url_encode, -) - from cudf._lib.column cimport Column @@ -28,17 +20,10 @@ def url_decode(Column source_strings): ------- URL decoded string column """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_url_decode( - source_view - )) - - return Column.from_unique_ptr( - move(c_result) + plc_column = plc.strings.convert.convert_urls.url_decode( + source_strings.to_pylibcudf(mode="read") ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -57,14 +42,7 @@ def url_encode(Column source_strings): ------- URL encoded string column """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_url_encode( - source_view - )) - - return Column.from_unique_ptr( - move(c_result) + plc_column = plc.strings.convert.convert_urls.url_encode( + source_strings.to_pylibcudf(mode="read") ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/find_multiple.pyx b/python/cudf/cudf/_lib/strings/find_multiple.pyx index 1358f8e3c2c..39e0013769f 100644 --- a/python/cudf/cudf/_lib/strings/find_multiple.pyx +++ b/python/cudf/cudf/_lib/strings/find_multiple.pyx @@ -1,18 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.find_multiple cimport ( - find_multiple as cpp_find_multiple, -) - from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def find_multiple(Column source_strings, Column target_strings): @@ -20,14 +13,8 @@ def find_multiple(Column source_strings, Column target_strings): Returns a column with character position values where each of the `target_strings` are found in each string of `source_strings`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view target_view = target_strings.view() - - with nogil: - c_result = move(cpp_find_multiple( - source_view, - target_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.find_multiple.find_multiple( + source_strings.to_pylibcudf(mode="read"), + target_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx index 0e758d5b322..3e7a504d535 100644 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ b/python/cudf/cudf/_lib/strings/findall.pyx @@ -23,3 +23,19 @@ def findall(Column source_strings, object pattern, uint32_t flags): prog, ) return Column.from_pylibcudf(plc_result) + + +@acquire_spill_lock() +def find_re(Column source_strings, object pattern, uint32_t flags): + """ + Returns character positions where the pattern first matches + the elements in source_strings. 
+ """ + prog = plc.strings.regex_program.RegexProgram.create( + str(pattern), flags + ) + plc_result = plc.strings.findall.find_re( + source_strings.to_pylibcudf(mode="read"), + prog, + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/json.pyx b/python/cudf/cudf/_lib/strings/json.pyx index c9b0bba088d..374a104635a 100644 --- a/python/cudf/cudf/_lib/strings/json.pyx +++ b/python/cudf/cudf/_lib/strings/json.pyx @@ -1,84 +1,26 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +import pylibcudf as plc +from pylibcudf.json cimport GetJsonObjectOptions from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.json cimport ( - get_json_object as cpp_get_json_object, - get_json_object_options, -) - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar @acquire_spill_lock() def get_json_object( - Column col, object py_json_path, GetJsonObjectOptions options): + Column col, + object py_json_path, + GetJsonObjectOptions options +): """ Apply a JSONPath string to all rows in an input column of json strings. """ - cdef unique_ptr[column] c_result - - cdef column_view col_view = col.view() - cdef DeviceScalar json_path = py_json_path.device_value - - cdef const string_scalar* scalar_json_path = ( - json_path.get_raw_ptr() + plc_column = plc.json.get_json_object( + col.to_pylibcudf(mode="read"), + py_json_path.device_value.c_value, + options ) - - with nogil: - c_result = move(cpp_get_json_object( - col_view, - scalar_json_path[0], - options.options, - )) - - return Column.from_unique_ptr(move(c_result)) - - -cdef class GetJsonObjectOptions: - cdef get_json_object_options options - - def __init__( - self, - *, - allow_single_quotes=False, - strip_quotes_from_single_strings=True, - missing_fields_as_nulls=False - ): - self.options.set_allow_single_quotes(allow_single_quotes) - self.options.set_strip_quotes_from_single_strings( - strip_quotes_from_single_strings - ) - self.options.set_missing_fields_as_nulls(missing_fields_as_nulls) - - @property - def allow_single_quotes(self): - return self.options.get_allow_single_quotes() - - @property - def strip_quotes_from_single_strings(self): - return self.options.get_strip_quotes_from_single_strings() - - @property - def missing_fields_as_nulls(self): - return self.options.get_missing_fields_as_nulls() - - @allow_single_quotes.setter - def allow_single_quotes(self, val): - self.options.set_allow_single_quotes(val) - - @strip_quotes_from_single_strings.setter - def strip_quotes_from_single_strings(self, val): - self.options.set_strip_quotes_from_single_strings(val) - - @missing_fields_as_nulls.setter - def missing_fields_as_nulls(self, val): - self.options.set_missing_fields_as_nulls(val) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/padding.pyx b/python/cudf/cudf/_lib/strings/padding.pyx index d0239e91ec3..015a2ebab8a 100644 --- a/python/cudf/cudf/_lib/strings/padding.pyx +++ b/python/cudf/cudf/_lib/strings/padding.pyx @@ -1,64 +1,31 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from enum import IntEnum - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.strings.padding cimport ( - pad as cpp_pad, - zfill as cpp_zfill, -) -from pylibcudf.libcudf.strings.side_type cimport ( - side_type, - underlying_type_t_side_type, -) - - -class SideType(IntEnum): - LEFT = side_type.LEFT - RIGHT = side_type.RIGHT - BOTH = side_type.BOTH +import pylibcudf as plc @acquire_spill_lock() def pad(Column source_strings, size_type width, fill_char, - side=SideType.LEFT): + side=plc.strings.side_type.SideType.LEFT): """ Returns a Column by padding strings in `source_strings` up to the given `width`. Direction of padding is to be specified by `side`. The additional characters being filled can be changed by specifying `fill_char`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() - - cdef side_type pad_direction = ( - side + plc_result = plc.strings.padding.pad( + source_strings.to_pylibcudf(mode="read"), + width, + side, + fill_char, ) - - with nogil: - c_result = move(cpp_pad( - source_view, - width, - pad_direction, - f_char - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -68,19 +35,13 @@ def zfill(Column source_strings, Returns a Column by prepending strings in `source_strings` with '0' characters up to the given `width`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_zfill( - source_view, - width - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.padding.zfill( + source_strings.to_pylibcudf(mode="read"), + width + ) + return Column.from_pylibcudf(plc_result) -@acquire_spill_lock() def center(Column source_strings, size_type width, fill_char): @@ -89,23 +50,9 @@ def center(Column source_strings, in `source_strings` with additional character, `fill_char` up to the given `width`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() - - with nogil: - c_result = move(cpp_pad( - source_view, - width, - side_type.BOTH, - f_char - )) + return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.BOTH) - return Column.from_unique_ptr(move(c_result)) - -@acquire_spill_lock() def ljust(Column source_strings, size_type width, fill_char): @@ -113,23 +60,9 @@ def ljust(Column source_strings, Returns a Column by filling right side of strings in `source_strings` with additional character, `fill_char` up to the given `width`. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() + return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.RIGHT) - with nogil: - c_result = move(cpp_pad( - source_view, - width, - side_type.RIGHT, - f_char - )) - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() def rjust(Column source_strings, size_type width, fill_char): @@ -137,17 +70,4 @@ def rjust(Column source_strings, Returns a Column by filling left side of strings in `source_strings` with additional character, `fill_char` up to the given `width`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() - - with nogil: - c_result = move(cpp_pad( - source_view, - width, - side_type.LEFT, - f_char - )) - - return Column.from_unique_ptr(move(c_result)) + return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.LEFT) diff --git a/python/cudf/cudf/_lib/strings/replace_re.pyx b/python/cudf/cudf/_lib/strings/replace_re.pyx index fffc8b7c3f6..462d5c903e8 100644 --- a/python/cudf/cudf/_lib/strings/replace_re.pyx +++ b/python/cudf/cudf/_lib/strings/replace_re.pyx @@ -1,26 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move -from libcpp.vector cimport vector +from pylibcudf.libcudf.types cimport size_type +import pylibcudf as plc from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program -from pylibcudf.libcudf.strings.replace_re cimport ( - replace_re as cpp_replace_re, - replace_with_backrefs as cpp_replace_with_backrefs, -) -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar @acquire_spill_lock() @@ -34,28 +19,16 @@ def replace_re(Column source_strings, `n` indicates the number of resplacements to be made from start. (-1 indicates all) """ - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string pattern_string = str(pattern).encode() - cdef const string_scalar* scalar_repl = \ - (repl.get_raw_ptr()) - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_replace_re( - source_view, - dereference(c_prog), - scalar_repl[0], - n - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.replace_re.replace_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT + ), + py_repl.device_value.c_value, + n + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -68,50 +41,29 @@ def replace_with_backrefs( new string with the extracted elements found using `pattern` regular expression in `source_strings`. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string pattern_string = str(pattern).encode() - cdef string repl_string = str(repl).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_replace_with_backrefs( - source_view, - dereference(c_prog), - repl_string - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.replace_re.replace_with_backrefs( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT + ), + repl + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() def replace_multi_re(Column source_strings, - object patterns, + list patterns, Column repl_strings): """ Returns a Column after replacing occurrences of multiple regular expressions `patterns` with their corresponding strings in `repl_strings` in `source_strings`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view repl_view = repl_strings.view() - - cdef int pattern_size = len(patterns) - cdef vector[string] patterns_vector - patterns_vector.reserve(pattern_size) - - for pattern in patterns: - patterns_vector.push_back(str.encode(pattern)) - - with nogil: - c_result = move(cpp_replace_re( - source_view, - patterns_vector, - repl_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.replace_re.replace_re( + source_strings.to_pylibcudf(mode="read"), + patterns, + repl_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx index a81fb18e752..5319addc41c 100644 --- a/python/cudf/cudf/_lib/strings/split/partition.pyx +++ b/python/cudf/cudf/_lib/strings/split/partition.pyx @@ -1,21 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.split.partition cimport ( - partition as cpp_partition, - rpartition as cpp_rpartition, -) -from pylibcudf.libcudf.table.table cimport table - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport data_from_unique_ptr + +import pylibcudf as plc @acquire_spill_lock() @@ -25,25 +14,11 @@ def partition(Column source_strings, Returns data by splitting the `source_strings` column at the first occurrence of the specified `py_delimiter`. 
""" - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_partition( - source_view, - scalar_str[0] - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.partition.partition( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -53,22 +28,8 @@ def rpartition(Column source_strings, Returns a Column by splitting the `source_strings` column at the last occurrence of the specified `py_delimiter`. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rpartition( - source_view, - scalar_str[0] - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.partition.rpartition( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx index f481fea4c51..4ec6c7073d8 100644 --- a/python/cudf/cudf/_lib/strings/split/split.pyx +++ b/python/cudf/cudf/_lib/strings/split/split.pyx @@ -1,33 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program -from pylibcudf.libcudf.strings.split.split cimport ( - rsplit as cpp_rsplit, - rsplit_re as cpp_rsplit_re, - rsplit_record as cpp_rsplit_record, - rsplit_record_re as cpp_rsplit_record_re, - split as cpp_split, - split_re as cpp_split_re, - split_record as cpp_split_record, - split_record_re as cpp_split_record_re, -) -from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport data_from_unique_ptr + +import pylibcudf as plc @acquire_spill_lock() @@ -39,26 +18,12 @@ def split(Column source_strings, column around the specified `py_delimiter`. The split happens from beginning. 
""" - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_split( - source_view, - scalar_str[0], - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.split( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -70,25 +35,12 @@ def split_record(Column source_strings, column around the specified `py_delimiter`. The split happens from beginning. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_split_record( - source_view, - scalar_str[0], - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.split_record( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -100,26 +52,12 @@ def rsplit(Column source_strings, column around the specified `py_delimiter`. The split happens from the end. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rsplit( - source_view, - scalar_str[0], - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.rsplit( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -131,25 +69,12 @@ def rsplit_record(Column source_strings, column around the specified `py_delimiter`. The split happens from the end. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rsplit_record( - source_view, - scalar_str[0], - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.rsplit_record( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -160,24 +85,15 @@ def split_re(Column source_strings, Returns data by splitting the `source_strings` column around the delimiters identified by `pattern`. 
""" - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_split_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.split_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -189,24 +105,15 @@ def rsplit_re(Column source_strings, column around the delimiters identified by `pattern`. The delimiters are searched starting from the end of each string. """ - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_rsplit_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.rsplit_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -217,23 +124,15 @@ def split_record_re(Column source_strings, Returns a Column by splitting the `source_strings` column around the delimiters identified by `pattern`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_split_record_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.split_record_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -245,20 +144,12 @@ def rsplit_record_re(Column source_strings, column around the delimiters identified by `pattern`. The delimiters are searched starting from the end of each string. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_rsplit_record_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.rsplit_record_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/strip.pyx b/python/cudf/cudf/_lib/strings/strip.pyx index 38ecb21a94c..982c5a600e7 100644 --- a/python/cudf/cudf/_lib/strings/strip.pyx +++ b/python/cudf/cudf/_lib/strings/strip.pyx @@ -1,18 +1,8 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.side_type cimport side_type -from pylibcudf.libcudf.strings.strip cimport strip as cpp_strip - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar import pylibcudf as plc @@ -24,15 +14,12 @@ def strip(Column source_strings, The set of characters need be stripped from left and right side can be specified by `py_repl`. """ - - cdef DeviceScalar repl = py_repl.device_value - return Column.from_pylibcudf( - plc.strings.strip.strip( - source_strings.to_pylibcudf(mode="read"), - plc.strings.SideType.BOTH, - repl.c_value - ) + plc_result = plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.side_type.SideType.BOTH, + py_repl.device_value.c_value, ) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -43,24 +30,12 @@ def lstrip(Column source_strings, The set of characters need be stripped from left side can be specified by `py_repl`. """ - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() + plc_result = plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.side_type.SideType.LEFT, + py_repl.device_value.c_value, ) - - with nogil: - c_result = move(cpp_strip( - source_view, - side_type.LEFT, - scalar_str[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -71,21 +46,9 @@ def rstrip(Column source_strings, The set of characters need be stripped from right side can be specified by `py_repl`. 
""" - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() + plc_result = plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.side_type.SideType.RIGHT, + py_repl.device_value.c_value, ) - - with nogil: - c_result = move(cpp_strip( - source_view, - side_type.RIGHT, - scalar_str[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/wrap.pyx b/python/cudf/cudf/_lib/strings/wrap.pyx index eed5cf33b10..2b40f01f818 100644 --- a/python/cudf/cudf/_lib/strings/wrap.pyx +++ b/python/cudf/cudf/_lib/strings/wrap.pyx @@ -1,17 +1,13 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.wrap cimport wrap as cpp_wrap from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def wrap(Column source_strings, @@ -21,14 +17,8 @@ def wrap(Column source_strings, in the Column to be formatted in paragraphs with length less than a given `width`. """ - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_wrap( - source_view, - width - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.wrap.wrap( + source_strings.to_pylibcudf(mode="read"), + width + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings_udf.pyx b/python/cudf/cudf/_lib/strings_udf.pyx index 78fc9f08bd8..dd2fafbe07f 100644 --- a/python/cudf/cudf/_lib/strings_udf.pyx +++ b/python/cudf/cudf/_lib/strings_udf.pyx @@ -23,7 +23,8 @@ from pylibcudf.libcudf.strings_udf cimport ( to_string_view_array as cpp_to_string_view_array, udf_string, ) -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from rmm.librmm.device_buffer cimport device_buffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from cudf._lib.column cimport Column diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py index 6e8ad556b08..3b13cc258ab 100644 --- a/python/cudf/cudf/_typing.py +++ b/python/cudf/cudf/_typing.py @@ -1,8 +1,8 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
import sys -from collections.abc import Callable -from typing import TYPE_CHECKING, Any, Dict, Iterable, TypeVar, Union +from collections.abc import Callable, Iterable +from typing import TYPE_CHECKING, Any, TypeVar, Union import numpy as np from pandas import Period, Timedelta, Timestamp @@ -42,7 +42,7 @@ SeriesOrSingleColumnIndex = Union["cudf.Series", "cudf.core.index.Index"] # Groupby aggregation -AggType = Union[str, Callable] -MultiColumnAggType = Union[ - AggType, Iterable[AggType], Dict[Any, Iterable[AggType]] +AggType = Union[str, Callable] # noqa: UP007 +MultiColumnAggType = Union[ # noqa: UP007 + AggType, Iterable[AggType], dict[Any, Iterable[AggType]] ] diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index e2bdecbe67a..871ffc6269d 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -3,7 +3,7 @@ import pandas as pd from packaging import version -PANDAS_CURRENT_SUPPORTED_VERSION = version.parse("2.2.2") +PANDAS_CURRENT_SUPPORTED_VERSION = version.parse("2.2.3") PANDAS_VERSION = version.parse(pd.__version__) diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 32ae8c5ee53..ffa306bf93f 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -6,7 +6,7 @@ import pickle import weakref from types import SimpleNamespace -from typing import Any, Literal, Mapping +from typing import TYPE_CHECKING, Any, Literal import numpy from typing_extensions import Self @@ -18,6 +18,9 @@ from cudf.core.abc import Serializable from cudf.utils.string import format_bytes +if TYPE_CHECKING: + from collections.abc import Mapping + def host_memory_allocation(nbytes: int) -> memoryview: """Allocate host memory using NumPy @@ -284,7 +287,7 @@ def memoryview( """Read-only access to the buffer through host memory.""" size = self._size if size is None else size host_buf = host_memory_allocation(size) - rmm._lib.device_buffer.copy_ptr_to_host( + rmm.pylibrmm.device_buffer.copy_ptr_to_host( self.get_ptr(mode="read") + offset, host_buf ) return memoryview(host_buf).toreadonly() diff --git a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py index 0bd8d6054b3..ecf9807cfc2 100644 --- a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py +++ b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py @@ -2,13 +2,16 @@ from __future__ import annotations -from typing import Literal, Mapping +from typing import TYPE_CHECKING, Literal from typing_extensions import Self import cudf from cudf.core.buffer.buffer import Buffer, BufferOwner +if TYPE_CHECKING: + from collections.abc import Mapping + class ExposureTrackedBuffer(Buffer): """An exposure tracked buffer. 
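The typing changes in buffer.py and exposure_tracked_buffer.py above (and repeated in the column modules below) all apply one idiom: names used only in annotations, such as Mapping, now come from collections.abc and are imported under an `if TYPE_CHECKING:` guard, so type checkers still see them while no import happens at runtime. A minimal, self-contained sketch of that idiom (the total_size function is illustrative, not part of this diff):

from __future__ import annotations  # annotations stay as unevaluated strings

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen by type checkers only; skipped when the module actually runs.
    from collections.abc import Mapping


def total_size(buffers: Mapping[str, int]) -> int:
    # Mapping is never looked up at runtime thanks to postponed evaluation.
    return sum(buffers.values())


print(total_size({"device": 32, "spill": 8}))  # prints 40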
diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index 4c9e524ee05..b40c56c9a6b 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -207,7 +207,7 @@ def spill(self, target: str = "cpu") -> None: domain="cudf_python-spill", ): host_mem = host_memory_allocation(self.size) - rmm._lib.device_buffer.copy_ptr_to_host( + rmm.pylibrmm.device_buffer.copy_ptr_to_host( self._ptr, host_mem ) self._ptr_desc["memoryview"] = host_mem @@ -352,7 +352,7 @@ def memoryview( else: assert self._ptr_desc["type"] == "gpu" ret = host_memory_allocation(size) - rmm._lib.device_buffer.copy_ptr_to_host( + rmm.pylibrmm.device_buffer.copy_ptr_to_host( self._ptr + offset, ret ) return ret diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 06791df7dc0..a1e87d04bc9 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -29,4 +29,3 @@ Decimal128Column, DecimalBaseColumn, ) -from cudf.core.column.interval import IntervalColumn # noqa: F401 diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 864e87b5377..087d0ed65f5 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -4,7 +4,7 @@ import warnings from functools import cached_property -from typing import TYPE_CHECKING, Any, Mapping, Sequence, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np import pandas as pd @@ -26,6 +26,7 @@ if TYPE_CHECKING: from collections import abc + from collections.abc import Mapping, Sequence import numba.cuda diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 7674565e2c3..d2cd6e8ac8f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -4,10 +4,11 @@ import pickle from collections import abc +from collections.abc import MutableSequence, Sequence from functools import cached_property from itertools import chain from types import SimpleNamespace -from typing import TYPE_CHECKING, Any, Literal, MutableSequence, Sequence, cast +from typing import TYPE_CHECKING, Any, Literal, cast import cupy import numpy as np diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index d0ea4612a1b..b6dc250e64d 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -8,7 +8,7 @@ import locale import re from locale import nl_langinfo -from typing import TYPE_CHECKING, Literal, Sequence, cast +from typing import TYPE_CHECKING, Literal, cast import numpy as np import pandas as pd @@ -31,6 +31,8 @@ from cudf.utils.utils import _all_bools_with_nulls if TYPE_CHECKING: + from collections.abc import Sequence + from cudf._typing import ( ColumnBinaryOperand, DatetimeLikeScalar, @@ -480,6 +482,11 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn: if dtype == self.dtype: return self + elif isinstance(dtype, pd.DatetimeTZDtype): + raise TypeError( + "Cannot use .astype to convert from timezone-naive dtype to timezone-aware dtype. " + "Use tz_localize instead." 
+ ) return libcudf.unary.cast(self, dtype=dtype) def as_timedelta_column(self, dtype: Dtype) -> None: # type: ignore[override] @@ -940,6 +947,16 @@ def strftime(self, format: str) -> cudf.core.column.StringColumn: def as_string_column(self) -> cudf.core.column.StringColumn: return self._local_time.as_string_column() + def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn: + if isinstance(dtype, pd.DatetimeTZDtype) and dtype != self.dtype: + if dtype.unit != self.time_unit: + # TODO: Doesn't check that new unit is valid. + casted = self._with_type_metadata(dtype) + else: + casted = self + return casted.tz_convert(str(dtype.tz)) + return super().as_datetime_column(dtype) + def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component( self._local_time, field diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 8803ebd6791..8ae06f72d1e 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -3,8 +3,9 @@ from __future__ import annotations import warnings +from collections.abc import Sequence from decimal import Decimal -from typing import TYPE_CHECKING, Sequence, cast +from typing import TYPE_CHECKING, cast import cupy as cp import numpy as np diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index c6a39199e3b..6b25e568f00 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -3,7 +3,7 @@ from __future__ import annotations from functools import cached_property -from typing import TYPE_CHECKING, Sequence, cast +from typing import TYPE_CHECKING, cast import numpy as np import pandas as pd @@ -11,7 +11,6 @@ from typing_extensions import Self import cudf -from cudf._lib.copying import segmented_gather from cudf._lib.lists import ( concatenate_list_elements, concatenate_rows, @@ -22,6 +21,7 @@ extract_element_scalar, index_of_column, index_of_scalar, + segmented_gather, sort_lists, ) from cudf._lib.strings.convert.convert_lists import format_list_column @@ -34,6 +34,8 @@ from cudf.core.missing import NA if TYPE_CHECKING: + from collections.abc import Sequence + from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 05a0ab2e09a..a91c080fe21 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -2,9 +2,7 @@ from __future__ import annotations -from typing import Union, overload - -from typing_extensions import Literal +from typing import Literal, Union, overload import cudf import cudf.core.column diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 78d2814ed26..620cae65374 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -3,7 +3,7 @@ from __future__ import annotations import functools -from typing import TYPE_CHECKING, Any, Sequence, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np import pandas as pd @@ -28,7 +28,7 @@ from .numerical_base import NumericalBaseColumn if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import Callable, Sequence from cudf._typing import ( ColumnBinaryOperand, diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 
3b8dd05c13a..f6ab91f2f01 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -180,9 +180,12 @@ def var( min_count: int = 0, ddof=1, ): - return self._reduce( + result = self._reduce( "var", skipna=skipna, min_count=min_count, ddof=ddof ) + if result is NA: + return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) + return result def std( self, @@ -190,9 +193,12 @@ def std( min_count: int = 0, ddof=1, ): - return self._reduce( + result = self._reduce( "std", skipna=skipna, min_count=min_count, ddof=ddof ) + if result is NA: + return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) + return result def median(self, skipna: bool | None = None) -> NumericalBaseColumn: skipna = True if skipna is None else skipna diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 4463e3280df..856ce0f75de 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5,12 +5,14 @@ import re import warnings from functools import cached_property -from typing import TYPE_CHECKING, Sequence, cast, overload +from typing import TYPE_CHECKING, cast, overload import numpy as np import pandas as pd import pyarrow as pa +import pylibcudf as plc + import cudf import cudf.api.types from cudf import _lib as libcudf @@ -33,6 +35,8 @@ def str_to_boolean(column: StringColumn): if TYPE_CHECKING: + from collections.abc import Sequence + import cupy import numba.cuda @@ -996,7 +1000,7 @@ def replace( return self._return_or_inplace( libstrings.replace_multi_re( self._column, - pat, + list(pat), column.as_column(repl, dtype="str"), ) if regex @@ -2383,8 +2387,7 @@ def get_json_object( 0 [\n { "category": "reference",\n ... dtype: object """ - - options = libstrings.GetJsonObjectOptions( + options = plc.json.GetJsonObjectOptions( allow_single_quotes=allow_single_quotes, strip_quotes_from_single_strings=( strip_quotes_from_single_strings @@ -2546,9 +2549,9 @@ def split( result_table = {0: self._column.copy()} else: if regex is True: - data, _ = libstrings.split_re(self._column, pat, n) + data = libstrings.split_re(self._column, pat, n) else: - data, _ = libstrings.split( + data = libstrings.split( self._column, cudf.Scalar(pat, "str"), n ) if len(data) == 1 and data[0].null_count == len(self._column): @@ -2719,9 +2722,9 @@ def rsplit( result_table = {0: self._column.copy()} else: if regex is True: - data, _ = libstrings.rsplit_re(self._column, pat, n) + data = libstrings.rsplit_re(self._column, pat, n) else: - data, _ = libstrings.rsplit( + data = libstrings.rsplit( self._column, cudf.Scalar(pat, "str"), n ) if len(data) == 1 and data[0].null_count == len(self._column): @@ -2820,7 +2823,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.partition(self._column, cudf.Scalar(sep, "str"))[0], + libstrings.partition(self._column, cudf.Scalar(sep, "str")), expand=expand, ) @@ -2885,7 +2888,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.rpartition(self._column, cudf.Scalar(sep, "str"))[0], + libstrings.rpartition(self._column, cudf.Scalar(sep, "str")), expand=expand, ) @@ -2966,7 +2969,7 @@ def pad( raise TypeError(msg) try: - side = libstrings.SideType[side.upper()] + side = plc.strings.side_type.SideType[side.upper()] except KeyError: raise ValueError( "side has to be either one of {'left', 'right', 'both'}" @@ -3624,6 
+3627,46 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: data = libstrings.findall(self._column, pat, flags) return self._return_or_inplace(data) + def find_re(self, pat: str, flags: int = 0) -> SeriesOrIndex: + """ + Find first occurrence of pattern or regular expression in the + Series/Index. + + Parameters + ---------- + pat : str + Pattern or regular expression. + flags : int, default 0 (no flags) + Flags to pass through to the regex engine (e.g. re.MULTILINE) + + Returns + ------- + Series + A Series of position values where the pattern first matches + each string. + + Examples + -------- + >>> import cudf + >>> s = cudf.Series(['Lion', 'Monkey', 'Rabbit', 'Cat']) + >>> s.str.find_re('[ti]') + 0 1 + 1 -1 + 2 4 + 3 2 + dtype: int32 + """ + if isinstance(pat, re.Pattern): + flags = pat.flags & ~re.U + pat = pat.pattern + if not _is_supported_regex_flags(flags): + raise NotImplementedError( + "Unsupported value for `flags` parameter" + ) + + data = libstrings.find_re(self._column, pat, flags) + return self._return_or_inplace(data) + def find_multiple(self, patterns: SeriesOrIndex) -> cudf.Series: """ Find all first occurrences of patterns in the Series/Index. diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 2fda3b2c434..8f16ba4e15b 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -68,12 +68,7 @@ def base_size(self): return self.size + self.offset def to_arrow(self) -> pa.Array: - children = [ - pa.nulls(len(child)) - if len(child) == child.null_count - else child.to_arrow() - for child in self.children - ] + children = [child.to_arrow() for child in self.children] pa_type = pa.struct( { diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 6b6f3e517a8..087d6474e7f 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -4,7 +4,7 @@ import datetime import functools -from typing import TYPE_CHECKING, Sequence, cast +from typing import TYPE_CHECKING, cast import numpy as np import pandas as pd @@ -19,6 +19,8 @@ from cudf.utils.utils import _all_bools_with_nulls if TYPE_CHECKING: + from collections.abc import Sequence + from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype _unit_to_nanoseconds_conversion = { diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index bc093fdaa9a..496e86ed709 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -5,8 +5,9 @@ import itertools import sys from collections import abc +from collections.abc import Mapping from functools import cached_property, reduce -from typing import TYPE_CHECKING, Any, Mapping, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np import pandas as pd diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 79ed5a0e187..bf1c39b23da 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -13,8 +13,8 @@ import textwrap import warnings from collections import abc, defaultdict -from collections.abc import Callable, Iterator -from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast +from collections.abc import Callable, Iterator, MutableMapping +from typing import TYPE_CHECKING, Any, Literal, cast import cupy import numba @@ -781,9 +781,15 @@ def __init__( ) elif isinstance(data, ColumnAccessor): raise TypeError( - 
"Use cudf.Series._from_data for constructing a Series from " + "Use cudf.DataFrame._from_data for constructing a DataFrame from " "ColumnAccessor" ) + elif isinstance(data, ColumnBase): + raise TypeError( + "Use cudf.DataFrame._from_arrays for constructing a DataFrame from " + "ColumnBase or Use cudf.DataFrame._from_data by passing a dict " + "of column name and column as key-value pair." + ) elif hasattr(data, "__cuda_array_interface__"): arr_interface = data.__cuda_array_interface__ # descr is an optional field of the _cuda_ary_iface_ @@ -5118,11 +5124,12 @@ def info( useful for big DataFrames and fine-tune memory optimization: >>> import numpy as np - >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) + >>> rng = np.random.default_rng(seed=0) + >>> random_strings_array = rng.choice(['a', 'b', 'c'], 10 ** 6) >>> df = cudf.DataFrame({ - ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), - ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6), - ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6) + ... 'column_1': rng.choice(['a', 'b', 'c'], 10 ** 6), + ... 'column_2': rng.choice(['a', 'b', 'c'], 10 ** 6), + ... 'column_3': rng.choice(['a', 'b', 'c'], 10 ** 6) ... }) >>> df.info(memory_usage='deep') @@ -5883,7 +5890,7 @@ def _from_arrays( f"records dimension expected 1 or 2 but found: {array_data.ndim}" ) - if data.ndim == 2: + if array_data.ndim == 2: num_cols = array_data.shape[1] else: # Since we validate ndim to be either 1 or 2 above, diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 5250a741d3d..aa601a2b322 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -3,7 +3,7 @@ import enum from collections import abc -from typing import Any, Iterable, Mapping, Sequence, Tuple, cast +from typing import TYPE_CHECKING, Any, cast import cupy as cp import numpy as np @@ -20,6 +20,9 @@ build_column, ) +if TYPE_CHECKING: + from collections.abc import Iterable, Mapping, Sequence + # Implementation of interchange protocol classes # ---------------------------------------------- @@ -61,7 +64,7 @@ class _MaskKind(enum.IntEnum): _DtypeKind.BOOL, _DtypeKind.STRING, } -ProtoDtype = Tuple[_DtypeKind, int, str, str] +ProtoDtype = tuple[_DtypeKind, int, str, str] class _CuDFBuffer: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 37ad6b8fabb..205edd91d9d 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,7 +6,7 @@ import pickle import warnings from collections import abc -from typing import TYPE_CHECKING, Any, Literal, MutableMapping +from typing import TYPE_CHECKING, Any, Literal # TODO: The `numpy` import is needed for typing purposes during doc builds # only, need to figure out why the `np` alias is insufficient then remove. 
@@ -36,6 +36,7 @@ from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf if TYPE_CHECKING: + from collections.abc import MutableMapping from types import ModuleType from cudf._typing import Dtype, ScalarLike diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 81b20488d8d..6630bd96c01 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -8,7 +8,7 @@ import warnings from collections import abc from functools import cached_property -from typing import TYPE_CHECKING, Any, Iterable, Literal +from typing import TYPE_CHECKING, Any, Literal import cupy as cp import numpy as np @@ -36,6 +36,8 @@ from cudf.utils.utils import GetAttrGetItemMixin if TYPE_CHECKING: + from collections.abc import Iterable + from cudf._typing import ( AggType, DataFrameOrSeries, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index cd07c58c5d9..1b90e9f9df0 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -5,10 +5,10 @@ import operator import pickle import warnings -from collections.abc import Hashable +from collections.abc import Hashable, MutableMapping from functools import cache, cached_property from numbers import Number -from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast +from typing import TYPE_CHECKING, Any, Literal, cast import cupy import numpy as np diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 5952815deef..e031f2a4e8e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -10,9 +10,7 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, - MutableMapping, TypeVar, cast, ) @@ -63,6 +61,8 @@ from cudf.utils.utils import _warn_no_dask_cudf if TYPE_CHECKING: + from collections.abc import Callable, MutableMapping + from cudf._typing import ( ColumnLike, DataFrameOrSeries, diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 8182e5cede2..ce6a5c960dd 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -3,9 +3,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Any, List, Union - -from typing_extensions import TypeAlias +from typing import Any, TypeAlias import cudf from cudf.api.types import _is_scalar_or_zero_d_array, is_integer @@ -46,11 +44,11 @@ class ScalarIndexer: key: GatherMap -IndexingSpec: TypeAlias = Union[ - EmptyIndexer, MapIndexer, MaskIndexer, ScalarIndexer, SliceIndexer -] +IndexingSpec: TypeAlias = ( + EmptyIndexer | MapIndexer | MaskIndexer | ScalarIndexer | SliceIndexer +) -ColumnLabels: TypeAlias = List[str] +ColumnLabels: TypeAlias = list[str] def destructure_iloc_key( diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 92d094d9de5..bfff62f0a89 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -8,7 +8,7 @@ import pickle import warnings from functools import cached_property -from typing import TYPE_CHECKING, Any, MutableMapping +from typing import TYPE_CHECKING, Any import cupy as cp import numpy as np @@ -36,7 +36,7 @@ from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name if TYPE_CHECKING: - from collections.abc import Generator, Hashable + from collections.abc import Generator, Hashable, MutableMapping from typing_extensions import Self diff --git 
a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 6e5abb2b82b..3d132c92d54 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -681,7 +681,7 @@ def _tile(A, reps): nval = len(value_vars) dtype = min_unsigned_type(nval) - if not var_name: + if var_name is None: var_name = "variable" if not value_vars: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index acd97c2047c..9b60424c924 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -9,7 +9,7 @@ import warnings from collections import abc from shutil import get_terminal_size -from typing import TYPE_CHECKING, Any, Literal, MutableMapping +from typing import TYPE_CHECKING, Any, Literal import cupy import numpy as np @@ -71,6 +71,8 @@ from cudf.utils.performance_tracking import _performance_tracking if TYPE_CHECKING: + from collections.abc import MutableMapping + import pyarrow as pa from cudf._typing import ( @@ -637,10 +639,15 @@ def __init__( column = as_column(data, nan_as_null=nan_as_null, dtype=dtype) if isinstance(data, (pd.Series, Series)): index_from_data = ensure_index(data.index) - elif isinstance(data, (ColumnAccessor, ColumnBase)): + elif isinstance(data, ColumnAccessor): raise TypeError( "Use cudf.Series._from_data for constructing a Series from " - "ColumnAccessor or a ColumnBase" + "ColumnAccessor" + ) + elif isinstance(data, ColumnBase): + raise TypeError( + "Use cudf.Series._from_column for constructing a Series from " + "a ColumnBase" ) elif isinstance(data, dict): if not data: @@ -2943,7 +2950,7 @@ def corr(self, other, method="pearson", min_periods=None): >>> ser1 = cudf.Series([0.9, 0.13, 0.62]) >>> ser2 = cudf.Series([0.12, 0.26, 0.51]) >>> ser1.corr(ser2, method="pearson") - -0.20454263717316112 + -0.20454263717316126 >>> ser1.corr(ser2, method="spearman") -0.5 """ diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 68f34fa28ff..885e7b16644 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -4,7 +4,7 @@ import math import re import warnings -from typing import Literal, Sequence +from typing import TYPE_CHECKING, Literal import numpy as np import pandas as pd @@ -20,6 +20,9 @@ from cudf.core import column from cudf.core.index import ensure_index +if TYPE_CHECKING: + from collections.abc import Sequence + # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112 _unit_map = { "year": "year", diff --git a/python/cudf/cudf/core/udf/strings_typing.py b/python/cudf/cudf/core/udf/strings_typing.py index 43604ab21a7..a0cbe7ada19 100644 --- a/python/cudf/cudf/core/udf/strings_typing.py +++ b/python/cudf/cudf/core/udf/strings_typing.py @@ -99,7 +99,7 @@ def prepare_args(self, ty, val, **kwargs): ty.dtype, (StringView, UDFString) ): return types.uint64, val.ptr if isinstance( - val, rmm._lib.device_buffer.DeviceBuffer + val, rmm.pylibrmm.device_buffer.DeviceBuffer ) else val.get_ptr(mode="read") else: return ty, val diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index c364d55e677..73afde407db 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -10,9 +10,9 @@ import pickle import types import warnings -from collections.abc import Callable, Iterator +from collections.abc import Callable, Iterator, Mapping from enum import IntEnum -from typing import Any, Literal, Mapping 
+from typing import Any, Literal import numpy as np diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index f82e300e83d..38103a71908 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -17,7 +17,7 @@ from abc import abstractmethod from importlib._bootstrap import _ImportLockContext as ImportLock from types import ModuleType -from typing import Any, ContextManager, NamedTuple +from typing import Any, ContextManager, NamedTuple # noqa: UP035 from typing_extensions import Self diff --git a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py index 8870fbc5c28..bb2fc00d9fc 100644 --- a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py +++ b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py @@ -9,6 +9,7 @@ python analyze-test-failures.py Example: +------- python analyze-test-failures.py log.json frame/* """ diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index d12d2697729..59966a5ff0c 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -35,7 +35,7 @@ def null_assert_warnings(*args, **kwargs): @pytest.fixture(scope="session", autouse=True) # type: ignore def patch_testing_functions(): - tm.assert_produces_warning = null_assert_warnings + tm.assert_produces_warning = null_assert_warnings # noqa: F821 pytest.raises = replace_kwargs({"match": None})(pytest.raises) diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py index 4ea0b3b4413..a0ad872e4c7 100644 --- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py +++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py @@ -5,7 +5,8 @@ """ Summarizes the test results per module. 
-Examples: +Examples +-------- python summarize-test-results.py log.json python summarize-test-results.py log.json --output json python summarize-test-results.py log.json --output table diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 8cb9efa873c..a5dc8a5498c 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -92,7 +92,8 @@ def random_bitmask(size): number of bits """ sz = bitmask_allocation_size_bytes(size) - data = np.random.randint(0, 255, dtype="u1", size=sz) + rng = np.random.default_rng(seed=0) + data = rng.integers(0, 255, dtype="u1", size=sz) return data.view("i1") @@ -209,9 +210,10 @@ def _get_args_kwars_for_assert_exceptions(func_args_and_kwargs): def gen_rand(dtype, size, **kwargs): + rng = np.random.default_rng(seed=kwargs.get("seed", 0)) dtype = cudf.dtype(dtype) if dtype.kind == "f": - res = np.random.random(size=size).astype(dtype) + res = rng.random(size=size).astype(dtype) if kwargs.get("positive_only", False): return res else: @@ -219,25 +221,23 @@ def gen_rand(dtype, size, **kwargs): elif dtype == np.int8 or dtype == np.int16: low = kwargs.get("low", -32) high = kwargs.get("high", 32) - return np.random.randint(low=low, high=high, size=size).astype(dtype) + return rng.integers(low=low, high=high, size=size).astype(dtype) elif dtype.kind == "i": low = kwargs.get("low", -10000) high = kwargs.get("high", 10000) - return np.random.randint(low=low, high=high, size=size).astype(dtype) + return rng.integers(low=low, high=high, size=size).astype(dtype) elif dtype == np.uint8 or dtype == np.uint16: low = kwargs.get("low", 0) high = kwargs.get("high", 32) - return np.random.randint(low=low, high=high, size=size).astype(dtype) + return rng.integers(low=low, high=high, size=size).astype(dtype) elif dtype.kind == "u": low = kwargs.get("low", 0) high = kwargs.get("high", 128) - return np.random.randint(low=low, high=high, size=size).astype(dtype) + return rng.integers(low=low, high=high, size=size).astype(dtype) elif dtype.kind == "b": low = kwargs.get("low", 0) high = kwargs.get("high", 2) - return np.random.randint(low=low, high=high, size=size).astype( - np.bool_ - ) + return rng.integers(low=low, high=high, size=size).astype(np.bool_) elif dtype.kind == "M": low = kwargs.get("low", 0) time_unit, _ = np.datetime_data(dtype) @@ -246,14 +246,14 @@ def gen_rand(dtype, size, **kwargs): int(1e18) / _unit_to_nanoseconds_conversion[time_unit], ) return pd.to_datetime( - np.random.randint(low=low, high=high, size=size), unit=time_unit + rng.integers(low=low, high=high, size=size), unit=time_unit ) elif dtype.kind in ("O", "U"): low = kwargs.get("low", 10) high = kwargs.get("high", 11) - nchars = np.random.randint(low=low, high=high, size=1)[0] + nchars = rng.integers(low=low, high=high, size=1)[0] char_options = np.array(list(string.ascii_letters + string.digits)) - all_chars = "".join(np.random.choice(char_options, nchars * size)) + all_chars = "".join(rng.choice(char_options, nchars * size)) return np.array( [all_chars[nchars * i : nchars * (i + 1)] for i in range(size)] ) diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index 13c194d6be0..99b686406fb 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -48,16 +48,22 @@ def __init__( self, cardinality=100, null_frequency=0.1, - generator=lambda: [ - _generate_string(string.ascii_letters, random.randint(4, 8)) - for _ in range(100) - ], + 
generator=None, is_sorted=True, dtype=None, ): self.cardinality = cardinality self.null_frequency = null_frequency - self.generator = generator + if generator is None: + rng = np.random.default_rng(seed=0) + self.generator = lambda: [ + _generate_string( + string.ascii_letters, rng, rng.integers(4, 8).item() + ) + for _ in range(100) + ] + else: + self.generator = generator self.is_sorted = is_sorted self.dtype = dtype @@ -96,7 +102,7 @@ def _write(tbl, path, format): tbl.to_parquet(path, row_group_size=format["row_group_size"]) -def _generate_column(column_params, num_rows): +def _generate_column(column_params, num_rows, rng): # If cardinality is specified, we create a set to sample from. # Otherwise, we simply use the given generator to generate each value. @@ -115,10 +121,8 @@ def _generate_column(column_params, num_rows): ) return pa.DictionaryArray.from_arrays( dictionary=vals, - indices=np.random.randint( - low=0, high=len(vals), size=num_rows - ), - mask=np.random.choice( + indices=rng.integers(low=0, high=len(vals), size=num_rows), + mask=rng.choice( [True, False], size=num_rows, p=[ @@ -142,7 +146,7 @@ def _generate_column(column_params, num_rows): column_params.generator, names=column_params.dtype.fields.keys(), mask=pa.array( - np.random.choice( + rng.choice( [True, False], size=num_rows, p=[ @@ -163,10 +167,10 @@ def _generate_column(column_params, num_rows): type=arrow_type, ) vals = pa.array( - np.random.choice(column_params.generator, size=num_rows) + rng.choice(column_params.generator, size=num_rows) if isinstance(arrow_type, pa.lib.Decimal128Type) - else np.random.choice(vals, size=num_rows), - mask=np.random.choice( + else rng.choice(vals, size=num_rows), + mask=rng.choice( [True, False], size=num_rows, p=[ @@ -189,7 +193,7 @@ def _generate_column(column_params, num_rows): # Generate data for current column return pa.array( column_params.generator, - mask=np.random.choice( + mask=rng.choice( [True, False], size=num_rows, p=[ @@ -233,7 +237,9 @@ def generate( def get_dataframe(parameters, use_threads): # Initialize seeds if parameters.seed is not None: - np.random.seed(parameters.seed) + rng = np.random.default_rng(seed=parameters.seed) # noqa: F841 + else: + rng = np.random.default_rng(seed=0) # noqa: F841 # For each column, invoke the data generator for column_params in parameters.column_parameters: @@ -281,14 +287,16 @@ def get_dataframe(parameters, use_threads): if not use_threads: for i, column_params in enumerate(parameters.column_parameters): column_data[i] = _generate_column( - column_params, parameters.num_rows + column_params, + parameters.num_rows, + rng, ) else: pool = Pool(pa.cpu_count()) column_data = pool.starmap( _generate_column, [ - (column_params, parameters.num_rows) + (column_params, parameters.num_rows, rng) for i, column_params in enumerate(parameters.column_parameters) ], ) @@ -336,7 +344,7 @@ def rand_dataframe( """ # Apply seed random.seed(seed) - np.random.seed(seed) + rng = np.random.default_rng(seed=seed) column_params = [] for meta in dtypes_meta: @@ -348,7 +356,7 @@ def rand_dataframe( lists_max_length = meta["lists_max_length"] nesting_max_depth = meta["nesting_max_depth"] value_type = meta["value_type"] - nesting_depth = np.random.randint(1, nesting_max_depth) + nesting_depth = rng.integers(1, nesting_max_depth) dtype = cudf.core.dtypes.ListDtype(value_type) @@ -368,6 +376,7 @@ def rand_dataframe( size=cardinality, nesting_depth=nesting_depth, lists_max_length=lists_max_length, + rng=rng, ), is_sorted=False, dtype=dtype, @@ -377,10 +386,11 
@@ def rand_dataframe( nesting_max_depth = meta["nesting_max_depth"] max_types_at_each_level = meta["max_types_at_each_level"] max_null_frequency = meta["max_null_frequency"] - nesting_depth = np.random.randint(1, nesting_max_depth) + nesting_depth = rng.integers(1, nesting_max_depth) structDtype = create_nested_struct_type( max_types_at_each_level=max_types_at_each_level, nesting_level=nesting_depth, + rng=rng, ) column_params.append( @@ -392,6 +402,7 @@ def rand_dataframe( cardinality=cardinality, size=rows, max_null_frequency=max_null_frequency, + rng=rng, ), is_sorted=False, dtype=structDtype, @@ -401,14 +412,16 @@ def rand_dataframe( max_precision = meta.get( "max_precision", cudf.Decimal64Dtype.MAX_PRECISION ) - precision = np.random.randint(1, max_precision) - scale = np.random.randint(0, precision) + precision = rng.integers(1, max_precision) + scale = rng.integers(0, precision) dtype = cudf.Decimal64Dtype(precision=precision, scale=scale) column_params.append( ColumnParameters( cardinality=cardinality, null_frequency=null_frequency, - generator=decimal_generator(dtype=dtype, size=cardinality), + generator=decimal_generator( + dtype=dtype, size=cardinality, rng=rng + ), is_sorted=False, dtype=dtype, ) @@ -417,14 +430,16 @@ def rand_dataframe( max_precision = meta.get( "max_precision", cudf.Decimal32Dtype.MAX_PRECISION ) - precision = np.random.randint(1, max_precision) - scale = np.random.randint(0, precision) + precision = rng.integers(1, max_precision) + scale = rng.integers(0, precision) dtype = cudf.Decimal32Dtype(precision=precision, scale=scale) column_params.append( ColumnParameters( cardinality=cardinality, null_frequency=null_frequency, - generator=decimal_generator(dtype=dtype, size=cardinality), + generator=decimal_generator( + dtype=dtype, size=cardinality, rng=rng + ), is_sorted=False, dtype=dtype, ) @@ -433,14 +448,16 @@ def rand_dataframe( max_precision = meta.get( "max_precision", cudf.Decimal128Dtype.MAX_PRECISION ) - precision = np.random.randint(1, max_precision) - scale = np.random.randint(0, precision) + precision = rng.integers(1, max_precision) + scale = rng.integers(0, precision) dtype = cudf.Decimal128Dtype(precision=precision, scale=scale) column_params.append( ColumnParameters( cardinality=cardinality, null_frequency=null_frequency, - generator=decimal_generator(dtype=dtype, size=cardinality), + generator=decimal_generator( + dtype=dtype, size=cardinality, rng=rng + ), is_sorted=False, dtype=dtype, ) @@ -469,6 +486,7 @@ def rand_dataframe( size=cardinality, min_bound=meta.get("min_bound", None), max_bound=meta.get("max_bound", None), + rng=rng, ), is_sorted=False, dtype=dtype, @@ -484,6 +502,7 @@ def rand_dataframe( size=cardinality, min_bound=meta.get("min_bound", None), max_bound=meta.get("max_bound", None), + rng=rng, ), is_sorted=False, dtype=dtype, @@ -497,7 +516,8 @@ def rand_dataframe( generator=lambda cardinality=cardinality: [ _generate_string( string.printable, - np.random.randint( + rng, + rng.integers( low=0, high=meta.get("max_string_length", 1000), size=1, @@ -519,6 +539,7 @@ def rand_dataframe( size=cardinality, min_bound=meta.get("min_bound", None), max_bound=meta.get("max_bound", None), + rng=rng, ), is_sorted=False, dtype=cudf.dtype(dtype), @@ -534,6 +555,7 @@ def rand_dataframe( size=cardinality, min_bound=meta.get("min_bound", None), max_bound=meta.get("max_bound", None), + rng=rng, ), is_sorted=False, dtype=cudf.dtype(dtype), @@ -544,7 +566,7 @@ def rand_dataframe( ColumnParameters( cardinality=cardinality, 
null_frequency=null_frequency, - generator=boolean_generator(cardinality), + generator=boolean_generator(cardinality, rng), is_sorted=False, dtype=cudf.dtype(dtype), ) @@ -567,7 +589,7 @@ def rand_dataframe( return df -def int_generator(dtype, size, min_bound=None, max_bound=None): +def int_generator(dtype, size, rng, min_bound=None, max_bound=None): """ Generator for int data """ @@ -577,7 +599,7 @@ def int_generator(dtype, size, min_bound=None, max_bound=None): iinfo = np.iinfo(dtype) low, high = iinfo.min, iinfo.max - return lambda: np.random.randint( + return lambda: rng.integers( low=low, high=high, size=size, @@ -585,13 +607,13 @@ def int_generator(dtype, size, min_bound=None, max_bound=None): ) -def float_generator(dtype, size, min_bound=None, max_bound=None): +def float_generator(dtype, size, rng, min_bound=None, max_bound=None): """ Generator for float data """ if min_bound is not None and max_bound is not None: low, high = min_bound, max_bound - return lambda: np.random.uniform( + return lambda: rng.uniform( low=low, high=high, size=size, @@ -599,7 +621,7 @@ def float_generator(dtype, size, min_bound=None, max_bound=None): else: finfo = np.finfo(dtype) return ( - lambda: np.random.uniform( + lambda: rng.uniform( low=finfo.min / 2, high=finfo.max / 2, size=size, @@ -608,7 +630,7 @@ def float_generator(dtype, size, min_bound=None, max_bound=None): ) -def datetime_generator(dtype, size, min_bound=None, max_bound=None): +def datetime_generator(dtype, size, rng, min_bound=None, max_bound=None): """ Generator for datetime data """ @@ -618,14 +640,14 @@ def datetime_generator(dtype, size, min_bound=None, max_bound=None): iinfo = np.iinfo("int64") low, high = iinfo.min + 1, iinfo.max - return lambda: np.random.randint( + return lambda: rng.integers( low=np.datetime64(low, "ns").astype(dtype).astype("int"), high=np.datetime64(high, "ns").astype(dtype).astype("int"), size=size, ) -def timedelta_generator(dtype, size, min_bound=None, max_bound=None): +def timedelta_generator(dtype, size, rng, min_bound=None, max_bound=None): """ Generator for timedelta data """ @@ -635,25 +657,25 @@ def timedelta_generator(dtype, size, min_bound=None, max_bound=None): iinfo = np.iinfo("int64") low, high = iinfo.min + 1, iinfo.max - return lambda: np.random.randint( + return lambda: rng.integers( low=np.timedelta64(low, "ns").astype(dtype).astype("int"), high=np.timedelta64(high, "ns").astype(dtype).astype("int"), size=size, ) -def boolean_generator(size): +def boolean_generator(size, rng): """ Generator for bool data """ - return lambda: np.random.choice(a=[False, True], size=size) + return lambda: rng.choice(a=[False, True], size=size) -def decimal_generator(dtype, size): +def decimal_generator(dtype, size, rng): max_integral = 10 ** (dtype.precision - dtype.scale) - 1 max_float = (10**dtype.scale - 1) if dtype.scale != 0 else 0 return lambda: ( - np.random.uniform( + rng.uniform( low=-max_integral, high=max_integral + (max_float / 10**dtype.scale), size=size, @@ -661,32 +683,33 @@ def decimal_generator(dtype, size): ) -def get_values_for_nested_data(dtype, lists_max_length=None, size=None): +def get_values_for_nested_data(dtype, rng, lists_max_length=None, size=None): """ Returns list of values based on dtype. 
""" if size is None: - cardinality = np.random.randint(0, lists_max_length) + cardinality = rng.integers(0, lists_max_length) else: cardinality = size dtype = cudf.dtype(dtype) if dtype.kind in ("i", "u"): - values = int_generator(dtype=dtype, size=cardinality)() + values = int_generator(dtype=dtype, size=cardinality, rng=rng)() elif dtype.kind == "f": - values = float_generator(dtype=dtype, size=cardinality)() + values = float_generator(dtype=dtype, size=cardinality, rng=rng)() elif dtype.kind in ("U", "O"): values = [ _generate_string( string.printable, + rng, 100, ) for _ in range(cardinality) ] elif dtype.kind == "M": - values = datetime_generator(dtype=dtype, size=cardinality)().astype( - dtype - ) + values = datetime_generator( + dtype=dtype, size=cardinality, rng=rng + )().astype(dtype) elif dtype.kind == "m": values = timedelta_generator(dtype=dtype, size=cardinality)().astype( dtype @@ -699,14 +722,14 @@ def get_values_for_nested_data(dtype, lists_max_length=None, size=None): return values -def make_lists(dtype, lists_max_length, nesting_depth, top_level_list): +def make_lists(dtype, lists_max_length, nesting_depth, top_level_list, rng): """ Helper to create random list of lists with `nesting_depth` and specified value type `dtype`. """ nesting_depth -= 1 if nesting_depth >= 0: - L = np.random.randint(1, lists_max_length) + L = rng.integers(1, lists_max_length) for i in range(L): top_level_list.append( make_lists( @@ -714,11 +737,14 @@ def make_lists(dtype, lists_max_length, nesting_depth, top_level_list): lists_max_length=lists_max_length, nesting_depth=nesting_depth, top_level_list=[], + rng=rng, ) ) else: top_level_list = get_values_for_nested_data( - dtype=dtype, lists_max_length=lists_max_length + dtype=dtype, + lists_max_length=lists_max_length, + rng=rng, ) # To ensure numpy arrays are not passed as input to # list constructor, returning a python list object here. @@ -728,22 +754,22 @@ def make_lists(dtype, lists_max_length, nesting_depth, top_level_list): return top_level_list -def make_array_for_struct(dtype, cardinality, size, max_null_frequency): +def make_array_for_struct(dtype, cardinality, size, max_null_frequency, rng): """ Helper to create a pa.array with `size` and `dtype` for a `StructArray`. """ - null_frequency = np.random.uniform(low=0, high=max_null_frequency) - local_cardinality = max(np.random.randint(low=0, high=cardinality), 1) + null_frequency = rng.uniform(low=0, high=max_null_frequency) + local_cardinality = max(rng.integers(low=0, high=cardinality), 1) data = get_values_for_nested_data( - dtype=dtype.type.to_pandas_dtype(), size=local_cardinality + dtype=dtype.type.to_pandas_dtype(), size=local_cardinality, rng=rng ) - vals = np.random.choice(data, size=size) + vals = rng.choice(data, size=size) return pa.array( vals, - mask=np.random.choice( + mask=rng.choice( [True, False], size=size, p=[null_frequency, 1 - null_frequency], @@ -756,7 +782,7 @@ def make_array_for_struct(dtype, cardinality, size, max_null_frequency): ) -def get_nested_lists(dtype, size, nesting_depth, lists_max_length): +def get_nested_lists(dtype, size, nesting_depth, lists_max_length, rng): """ Returns a list of nested lists with random nesting depth and random nested lists length. 
@@ -770,13 +796,14 @@ def get_nested_lists(dtype, size, nesting_depth, lists_max_length): lists_max_length=lists_max_length, nesting_depth=nesting_depth, top_level_list=[], + rng=rng, ) ) return list_of_lists -def get_nested_structs(dtype, cardinality, size, max_null_frequency): +def get_nested_structs(dtype, cardinality, size, max_null_frequency, rng): """ Returns a list of arrays with random data corresponding to the dtype provided. @@ -787,7 +814,7 @@ def get_nested_structs(dtype, cardinality, size, max_null_frequency): for name, col_dtype in dtype.fields.items(): if isinstance(col_dtype, cudf.StructDtype): result_arrays = get_nested_structs( - col_dtype, cardinality, size, max_null_frequency + col_dtype, cardinality, size, max_null_frequency, rng ) result_arrays = pa.StructArray.from_arrays( result_arrays, names=col_dtype.fields.keys() @@ -798,13 +825,14 @@ def get_nested_structs(dtype, cardinality, size, max_null_frequency): cardinality=cardinality, size=size, max_null_frequency=max_null_frequency, + rng=rng, ) list_of_arrays.append(result_arrays) return list_of_arrays -def list_generator(dtype, size, nesting_depth, lists_max_length): +def list_generator(dtype, size, nesting_depth, lists_max_length, rng): """ Generator for list data """ @@ -813,10 +841,11 @@ def list_generator(dtype, size, nesting_depth, lists_max_length): size=size, nesting_depth=nesting_depth, lists_max_length=lists_max_length, + rng=rng, ) -def struct_generator(dtype, cardinality, size, max_null_frequency): +def struct_generator(dtype, cardinality, size, max_null_frequency, rng): """ Generator for struct data """ @@ -825,25 +854,26 @@ def struct_generator(dtype, cardinality, size, max_null_frequency): cardinality=cardinality, size=size, max_null_frequency=max_null_frequency, + rng=rng, ) -def create_nested_struct_type(max_types_at_each_level, nesting_level): +def create_nested_struct_type(max_types_at_each_level, nesting_level, rng): dtypes_list = cudf.utils.dtypes.ALL_TYPES - picked_types = np.random.choice(list(dtypes_list), max_types_at_each_level) + picked_types = rng.choice(list(dtypes_list), max_types_at_each_level) type_dict = {} for name, type_ in enumerate(picked_types): if type_ == "struct": type_dict[str(name)] = create_nested_struct_type( - max_types_at_each_level, nesting_level - 1 + max_types_at_each_level, nesting_level - 1, rng ) else: type_dict[str(name)] = cudf.dtype(type_) return cudf.StructDtype(type_dict) -def _generate_string(str_seq: str, length: int = 10) -> str: - return "".join(random.choices(str_seq, k=length)) +def _generate_string(str_seq: str, rng, length: int = 10) -> str: + return "".join(rng.choice(list(str_seq), size=length)) def _unique_string() -> str: diff --git a/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/vocab-hash.txt b/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/vocab-hash.txt index 84b13c9d946..566ac2c337d 100644 --- a/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/vocab-hash.txt +++ b/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/vocab-hash.txt @@ -1,4382 +1,4382 @@ -26899 -27424 +19535 +9039 875 -7428432802425011718 0 -5054974408289448963 6 -18358444369622338053 9 -5716902217424485892 14 -8236612966193239043 18 -15282833726017872390 21 -15533348956988973570 27 -9001315167781089284 29 -7621090240282984451 33 -15337888141402371590 36 -16169070283077377537 42 -15615300272936709634 43 -12338784885023498756 45 -3175624061711419395 49 -9436392785812228615 52 
[... remainder of the 4382-line regenerated vocab-hash table omitted: uniformly (hash, offset) integer pairs with no individually meaningful entries ...]
-9678333478858482691 3362 -14661606109051090440 3365 -9504123850532876291 3373 -14299233528797568008 3376 -10370491504729965060 3384 -286239823911254530 3388 -7969121812144744451 3390 -16606218867148559880 3393 -11756345184017143302 3401 -8204961944753809412 3407 -12456910480062157316 3411 -7569786299014196739 3415 -3372309516929818119 3418 -16631131943564946948 3425 -4436969913528429575 3429 -14467771002258720772 3436 -15278270405312088583 3440 -6638334178561090565 3447 -8154814430089498114 3452 -17289464348431017987 3454 -13185969354886446085 3457 -4725380864147687429 3462 -14933071000620043778 3467 -12471883028204926466 3469 -13286302152236950530 3471 -12020003522260348419 3473 -11784545509165047810 3476 -10311182359550097412 3478 -2262872037167824902 3482 -15672162207595698690 3488 -8479660175647360516 3490 -543122224331105283 3494 -8738610060644560897 3497 -15969479020845567490 3498 +0 0 +1196190418526572547 0 +3117251964976502276 3 +0 7 +3266452963994632202 7 +6701451810090115586 17 +10156473964989528067 19 +6270220596053033473 22 +8689732391113957377 23 +345423933508452359 24 +9048486634542125058 31 +13000119181766437380 33 +1008808785591799299 37 +12586249368236978177 40 +11161089178393358857 41 +0 50 +6900865085865625094 50 +2615908179610132483 56 +1617129254806601731 59 +1607892326666533378 62 +123501381755693059 64 +17180234710792039941 67 +17345742025318016002 72 +7933590365928361474 74 +16187522989672200717 76 +14893593683284454915 89 +6001767212789422083 92 +1805417936920808451 95 +8589625060174958594 98 +13148488988905702416 100 +6759231203841442819 116 +798806762886474754 119 +13949836854106156034 121 +4277844318153606661 123 +18162360468357982216 128 +17429735113921325570 136 +10428297564837543938 138 +10174389176493224450 140 +4782734429389924866 142 +16828613770926935558 144 +16924367891356487169 150 +15473269356473895940 151 +10277883249583756290 155 +7398921953351034881 157 +15672774546004063755 158 +7032338026028942337 169 +12638648541163088900 170 +11956890857542837252 174 +10813991647348979717 178 +698603259209416204 183 +104155371596876289 195 +8849883347580968451 196 +13523964487472320004 199 +12948374094552270339 203 +16624700721113753096 206 +0 214 +630014773304871940 214 +14669827911540386306 218 +16593543947487157254 220 +16189120489289924617 226 +5936869209199720450 235 +6504800368776816645 237 +17628010111075734529 242 +16073662248530872322 243 +15997624981342335497 245 +13519486007586370049 254 +469623719382726661 255 +10478598590185625089 260 +5239294057556035586 261 +17274642882001730567 263 +7924882265216651266 270 +13138720901108912133 272 +13741737182438464004 277 +14608811194009491970 281 +2489742908982890509 283 +14952279757728973318 296 +13432486964055121926 302 +15397241996877524995 308 +7400937882698838020 311 +13309132794101168654 315 +8519404085542453250 329 +2551722931538879493 331 +4492819152473235971 336 +9634175483270757380 339 +5023439465649179147 343 +2912624940235659267 354 +15615524075652075524 357 +15131856319265032196 361 +7560465986110364673 365 +16393161300057821706 366 +6737538541011470849 376 +6394493716971627523 377 +0 380 +6957953643235488257 380 +7533365794097524234 381 +11551517784611555841 391 +0 392 +14017003685401013761 392 +13868858036311946245 393 +609890416048967688 398 +15853752823436186626 406 +13008887538399190534 408 +275598997711474690 414 +612244017304434692 416 +265561555991638021 420 +0 425 +4771730300985403909 425 +14595656195986303489 430 +13010615142623560194 431 +3520044222049365512 433 +4843556531627173889 
441 +9544321596489038851 442 +18097338319835691009 445 +17588488217883868161 446 +4553739803879796748 447 +12247953831639953411 459 +1685939678565356546 462 +2454121115370725890 464 +7699707784321416706 466 +2322428462912444939 468 +4251948422489921028 479 +8009626371771665409 483 +15830912148611917313 484 +15530208627603713027 485 +14550069280077337095 488 +3074860258671426050 495 +9819565310679728648 497 +0 505 +239920763215632386 505 +4479084686100589069 507 +7541436040510714881 520 +0 521 +18361828565940659201 521 +13943609537766478850 522 +1644071836581560844 524 +3325147442114083333 536 +9121949682662027269 541 +5375060563545179653 546 +11461944020052039682 551 +10205876604940857353 553 +17856338086929782276 562 +3964733248608209412 566 +15252617693956101123 570 +5198588053258159617 573 +7294352613378259976 574 +14274593384918848004 582 +12443356879762990084 586 +15967601366558600195 590 +0 593 +1596502676746638348 593 +3447763432008799745 605 +2154246728958848517 606 +1249748142575979010 611 +12802117032328183298 613 +14720455521613154825 615 +14431397366571454983 624 +8968154969419252739 631 +61922506310515202 634 +17332184019644205571 636 +1580044533016865796 639 +0 643 +16037339623732295172 643 +0 647 +6451385579602643969 647 +2249232807147062791 648 +15969372656029624833 655 +9184080755936318981 656 +10444965622910510594 661 +976846907109217284 663 +15036566770534162954 667 +2852219209756952581 677 +14428186506827194885 682 +0 687 +9583345567128655877 687 +8154021185610424842 692 +7639653587249864197 702 +284400846134645765 707 +5822594207495943172 712 +4666916656146452484 716 +10837424823999667726 720 +7662230599689246212 734 +16769958284715374596 738 +14214321919518354947 742 +7700892892210644993 745 +5647486165416790024 746 +12807160877623480835 754 +17202327424132939777 757 +5849043248643779075 758 +18232796011600235523 761 +4957062118189902859 764 +6105730765254667266 775 +8753292226633308675 777 +14066686889142136835 780 +1047708050925830148 783 +5555751253338228747 787 +8205438979066793987 798 +10100035083082646017 801 +3037731532850264067 802 +16470238215781450756 805 +15841867742103541257 809 +8087512074161331714 818 +15493250668750321668 820 +3797087601271950854 824 +2623502875154101252 830 +15159098560356506121 834 +343051006899596292 843 +16668194639613285891 847 +0 850 +9601059867653113858 850 +1570493927206813191 852 +9118300038493915138 859 +9563382677447647747 861 +5285530497249013763 864 +14598000812816350721 867 +15243372398425255435 868 +9815541045508240385 879 +408899826773384197 880 +7463961818871554565 885 +12980371725716597249 890 +15376403281856848903 891 +0 898 +5841652391326774789 898 +6476912065420260354 903 +3963854010828661252 905 +5784218172655345161 909 +15327721657175197701 918 +13180549833166182403 923 +15904501101973266436 926 +0 930 +14206180323061139974 930 +1106786522797875713 936 +17058832169116321282 937 +721828206256696835 939 +0 942 +8561789411832569355 942 +13374043249168898050 953 +15922789491870388229 955 +0 960 +16131878595889026564 960 +5509499768642979336 964 +12415614990376579585 972 +11304605070154481157 973 +7663245502729528834 978 +2692663086158549507 980 +14133757573751133701 983 +6813598296480126979 988 +13616528755765764611 991 +16303994430841145861 994 +12880492472155407874 999 +14023778603465187338 1001 +1658551813664662018 1011 +8148008758896362498 1013 +10688946549204321795 1015 +13274653424094307841 1018 +10847911221158770190 1019 +0 1033 +4643539771717744131 1033 +4169507947260962821 1036 +3126526255358650372 1041 
+13449815687571241992 1045 +9421207081901200898 1053 +6898163624184020997 1055 +7290174431607841794 1060 +2741902156609523715 1062 +15499057183587255302 1065 +16461426401301993476 1071 +11278211202787295747 1075 +0 1078 +9413985875830324739 1078 +4646548733144616463 1081 +7078801759685020673 1096 +5376123263925219331 1097 +14227335667134915589 1100 +0 1105 +7295351152600562699 1105 +0 1116 +1397641409882635269 1116 +2364632016557825025 1121 +7290779788839345158 1122 +223977268476071945 1128 +13026660262516529667 1137 +17998435953459809796 1140 +8522469059272339460 1144 +16293947433309880833 1148 +4576500186674335749 1149 +0 1154 +4042247147937702403 1154 +3034443556411821057 1157 +13667368622259281923 1158 +15202537810082257934 1161 +15337640185400698372 1175 +8308041085868251649 1179 +8832030889396702722 1180 +10436989792260434949 1182 +14898581533124037641 1187 +9317528159836099585 1196 +1612938252083390982 1197 +6278485319310800898 1203 +10612805446261845508 1205 +13787162434835940874 1209 +12133705386992745478 1219 +5227473436681376774 1225 +5656787771058157057 1231 +4433258109319585794 1232 +6704526927800668169 1234 +17440456789764264962 1243 +6979104089888754689 1245 +10768049747876580866 1246 +15707303682313568257 1248 +15148244407999994380 1249 +2841265161354426373 1261 +5252307512862989316 1266 +13331565891980378113 1270 +18159416118263116290 1271 +501516395825858060 1273 +3867012501081805829 1285 +8267472486312505860 1290 +12872828689431491073 1294 +727773195231890946 1295 +7322382021491738631 1297 +5402024496579473921 1304 +6959655625064837122 1305 +10187142685062514177 1307 +3029360479097259523 1308 +3524388403479357447 1311 +5803404108302127107 1318 +3322880653425492483 1321 +14014789072627667972 1324 +0 1328 +17779075582177396743 1328 +11597164340541700097 1335 +18164718194518923266 1336 +0 1338 +3688441162538457604 1338 +12763684824056344584 1342 +6555198237040291843 1350 +8999497138912988675 1353 +9277828726380557826 1356 +1652226711750231042 1358 +6386464493042135559 1360 +11832103051565904386 1367 +7889400420599073793 1369 +5173699340624307713 1370 +9839391635984425985 1371 +9179189546563518985 1372 +8987610858276033026 1381 +14211262843725043205 1383 +9924217736728436740 1388 +4401850895204555779 1392 +5541709837691148811 1395 +10214740045672277507 1406 +14656675767246138369 1409 +5518164076312088578 1410 +8819194535554354691 1412 +1202694809888231436 1415 +9937648736864647683 1427 +4776509399304216066 1430 +3828150896429232641 1432 +9726415758235178498 1433 +15478358790166008844 1435 +0 1447 +447632828248568324 1447 +10254625284015096321 1451 +9602208154038649858 1452 +7918490636759656966 1454 +4464032935723660291 1460 +517803065456797188 1463 +11296051306811729413 1467 +9559870439106258948 1472 +18140734313948729864 1476 +5761393475703308289 1484 +5817187969532432391 1485 +7214411138154648580 1492 +8556555308704695297 1496 +5517275039512219661 1497 +155198283803470849 1510 +12028807386786979841 1511 +9402878779861331461 1512 +7529466829850301953 1517 +3700043109242268166 1518 +7889220073888590849 1524 +9698905706548099588 1525 +950350740255051780 1529 +16659267722661032455 1533 +11934825441675277832 1540 +1840952787151591937 1548 +3181706929772123141 1549 +13084360636440561667 1554 +7392348362663288323 1557 +11299566685738323463 1560 +11865504406956790788 1567 +470806909387516931 1571 +11392390055026286594 1574 +0 1576 +15250035972710306824 1576 +1841748561073501700 1584 +13959366503388518404 1588 +16383575845586120707 1592 +5993903773214649347 1595 
+12927537188954086928 1598 +6310676060569643522 1614 +6823572598110530053 1616 +0 1621 +10355215107753852930 1621 +12991560131813107723 1623 +6463225875312731650 1634 +444925180768886788 1636 +8287375501749122564 1640 +8102699978355624961 1644 +3217121844483982342 1645 +0 1651 +15310893597687290371 1651 +4651888484278436356 1654 +16622466823413339137 1658 +14426029300798547465 1659 +16208338759425902084 1668 +13384891560853317123 1672 +10542264124115582467 1675 +0 1678 +13404868863569442317 1678 +8380728838811013123 1691 +2656782871938641923 1694 +5621105522992570375 1697 +16165957063051496962 1704 +17183335989224497157 1706 +0 1711 +12377944724210268163 1711 +15698714840429098497 1714 +2063306500131813891 1715 +7135499884796623879 1718 +14916197160702468612 1725 +14565364212611500547 1729 +17109666354199615491 1732 +18420265465448709122 1735 +5039636110599831051 1737 +13648715526743665665 1748 +8648155745742680580 1749 +0 1753 +4128476852805537282 1753 +12229435493123252233 1755 +18671114624524289 1764 +0 1765 +4330985506003776003 1765 +4960636854468069379 1768 +2825174586054641673 1771 +8083214972260871169 1780 +1656668836635006471 1781 +15658718806708214274 1788 +1364137667359422465 1790 +5440910769879224326 1791 +1242060995600047617 1797 +6028285323527704577 1798 +9862524515548398083 1799 +14095132043223516673 1802 +5330121798209797643 1803 +3047808178481674242 1814 +7009881287782938629 1816 +3836453927748870146 1821 +4828562734878493698 1823 +6251707885160171534 1825 +13503013357676597250 1839 +13120060435028427777 1841 +17453157023102628866 1842 +6659266074333195266 1844 +12122449770852231175 1846 +76872493233309186 1853 +10510620038219076100 1855 +3104474465142299652 1859 +15145875800387371010 1863 +14514645157364972555 1865 +5990940750853294082 1876 +9568631318395414530 1878 +13307393937882497539 1880 +0 1883 +13432428898749511691 1883 +2851874300532727813 1894 +16127254686981486084 1899 +11152828733555106817 1903 +8099684063905722369 1904 +10726727557015251463 1905 +0 1912 +16773004137299201537 1912 +0 1913 +1737396243104320517 1913 +12312810570815952904 1918 +8420117868402509825 1926 +4468099455608655362 1927 +17181412210024682497 1929 +7344171998747088899 1930 +11200240032637073926 1933 +9773885730549905922 1939 +2888420847349521921 1941 +0 1942 +3301971714535044611 1942 +6622000068430301708 1945 +14679279568503564291 1957 +15312513401406547971 1960 +11219696574507219971 1963 +15557068645919193090 1966 +14518831268196627465 1968 +11306244334020066818 1977 +445302382600591361 1979 +4798518764725378563 1980 +12833053520101596161 1983 +6569110733351726088 1984 +1133142439547627010 1992 +6020738327851480577 1994 +0 1995 +0 1995 +15123217074875560455 1995 +5146261845254048769 2002 +15577303646915962882 2003 +5068854713026915334 2005 +5662217880612308482 2011 +13584286678752042508 2013 +17647669975855288324 2025 +7092182408195844613 2029 +5243600304614296065 2034 +16379641210199802883 2035 +6541142296931350023 2038 +17648968980389751301 2045 +3633167252938199556 2050 +691728008305302531 2054 +7434042972483105284 2057 +1243474674683616271 2061 +439217426838173186 2076 +10460352595647090183 2078 +5080394082232633345 2085 +7346464481151790597 2086 +8068677175549539843 2091 +4859996294860352513 2094 +12470823893961605122 2095 +10033529424736163842 2097 +10769920382809060357 2099 +16128670331104411146 2104 +2973668094989328385 2114 +16323032859702780931 2115 +12227727930958763521 2118 +7302528030871866371 2119 +8967586997946816013 2122 +13935701471042006020 2135 +15676859696752227844 2139 
+0 2143 +2397906929972799494 2143 +731429270944234509 2149 +14629591375919925252 2162 +14201687141277194244 2166 +8813493889730974725 2170 +4967156306307785221 2175 +12152782138863493635 2180 +5716269545878689795 2183 +12118250850399448070 2186 +10079764034817249795 2192 +9905170822798166018 2195 +7330246949116896272 2197 +4975588281894977539 2213 +2377967791858227715 2216 +1711948357573607427 2219 +15733402191778006532 2222 +13617127880905861132 2226 +5413022680339381252 2238 +12001217113207191043 2242 +605362804928389124 2245 +10888521749365150723 2249 +11742554576381655052 2252 +3591551764774430724 2264 +8647496912976230402 2268 +3843626828621262342 2270 +3921763517492323331 2276 +7707493410895858692 2279 +3920334550068498946 2283 +2658528064200329217 2285 +9038122947820533253 2286 +6952499746958836740 2291 +7951530266135717388 2295 +16076637508890388481 2307 +15187897527562671106 2308 +5520701509759360003 2310 +2598679891400145409 2313 +17512255026679867408 2314 +10995766946592999425 2330 +18117038245928559618 2331 +5391766950501834244 2333 +14461374868186265605 2337 +1273598128050393611 2342 +11820949665032480260 2353 +17841646829021216260 2357 +10200569215461547521 2361 +3670141860910412289 2362 +18396940417538187269 2363 +14261984156631670787 2368 +106960762513502723 2371 +16393357936187300353 2374 +7032931990465729538 2375 +15907195827890083338 2377 +16437195285078765571 2387 +17301257309241798147 2390 +8236593629924756481 2393 +1379157623727557125 2394 +14767417508072398345 2399 +16695407490005887489 2408 +1414009372711604744 2409 +499004129948061185 2417 +5775255721778604547 2418 +16754393591199635469 2421 +10568987941526160386 2434 +3311623553148127749 2436 +10255724520964794369 2441 +3121950734017230849 2442 +2129428121322164230 2443 +5233872436075409922 2449 +5115946926893418500 2451 +298818270766586369 2455 +2534391384903305218 2456 +13962240998865999372 2458 +2858192092257344002 2470 +2246014736733727747 2472 +18208224108542041605 2475 +5900635063125726209 2480 +8459478259862856201 2481 +3106812066263162882 2490 +6016756381746226178 2492 +375597697640802819 2494 +2513762961093744131 2497 +15366269329105501700 2500 +10035949288505144322 2504 +427851159373997574 2506 +4274431321888115714 2512 +5253654952100000770 2514 +16894221500064376839 2516 +14687193167626954754 2523 +13771965837935513090 2525 +8874009193925074945 2527 +4974093839237721093 2528 +741620693598341642 2533 +11991618038806280705 2543 +11116672093208526850 2544 +15807249887587362818 2546 +7323942637968351746 2548 +3660270925885407751 2550 +0 2557 +10684033640943126020 2557 +16989816981004759553 2561 +9001924880900419075 2562 +1998443310251235851 2565 +17567979874939109890 2576 +13652482471668535812 2578 +17509569230481751555 2582 +3182500785161561606 2585 +13325982159032983558 2591 +1923914978402147329 2597 +5589189981371284484 2598 +1161601912578541572 2602 +1916235467976744451 2606 +16280831412119656968 2609 +5531274414859838467 2617 +13599333592024061957 2620 +17989155199582565378 2625 +3030922814179764740 2627 +14644007464957335564 2631 +0 2643 +5497605392959732225 2643 +2032331457863458818 2644 +8100338548587463682 2646 +993329328502006794 2648 +6750921732510502913 2658 +13748899324120622595 2659 +15617703054210413571 2662 +13138109094843573761 2665 +6544485718564688390 2666 +4168731610225209858 2672 +7315066071491735044 2674 +11306658702491732995 2678 +1460741416990041090 2681 +8624484085251326469 2683 +4952143576172173826 2688 +11470130411385533445 2690 +8808161070990530055 2695 +3407659004810532870 2702 
+9761503347061253645 2708 +347929962150473217 2721 +15682869073661250565 2722 +12636859761190001153 2727 +2169559175677957635 2728 +6583723534446631435 2731 +11332478688871909892 2742 +3541912969021597188 2746 +15665073567582359041 2750 +6811971824255872515 2751 +17832657550632072714 2754 +8908928359280249862 2764 +16149194899805562374 2770 +16584564148406323202 2776 +8926638669588577795 2778 +8056234806465729542 2781 +20557314279745028 2787 +1574148835258315780 2791 +0 2795 +5593745704266732037 2795 +8450014032945361420 2800 +7024373575570305540 2812 +11737655816003366406 2816 +4727037432569372673 2822 +8600949146786643459 2823 +9003058529087846919 2826 +14052664559056898 2833 +1424791599736305667 2835 +5413427196124183555 2838 +13050600684981920260 2841 +8589685071512056331 2845 +13186761374251900929 2856 +14090913721681066498 2857 +0 2859 +2742241767433926657 2859 +6309431184810384395 2860 +16867533333923942913 2871 +555261403132789763 2872 +5659601479152637444 2875 +18276768397881284098 2879 +6852010445819064844 2881 +16631838326863331329 2893 +246764640492975110 2894 +1313867708490425347 2900 +8944238870676823556 2903 +1060867472129666057 2907 +16635885715046522883 2916 +13334184179287121921 2919 +1341139991463623173 2920 +0 2925 +6310211216600221189 2925 +3521973268169620995 2930 +1462184866304097281 2933 +8359017763585949185 2934 +14138351761235446785 2935 +6817592922583008262 2936 +0 2942 +6385096150346020868 2942 +0 2946 +5484657660585723395 2946 +10615912620259059212 2949 +11956475177743584771 2961 +14617995947569946629 2964 +16460942815259223553 2969 +9814422111234662404 2970 +4608931955518876683 2974 +8617716815688349187 2985 +17740454941921826819 2988 +0 2991 +10586556775954286081 2991 +11028786367153901576 2992 +7561184979369551368 3000 +10180555287637633027 3008 +262376940139235842 3011 +1252244297117510657 3013 +17286434400127825418 3014 +11940732067173687811 3024 +9446744360256471555 3027 +583923543216445954 3030 +8153426984110241281 3032 +8998238685693393417 3033 +11022193474305971204 3042 +18018779292443289604 3046 +13782486654821986817 3050 +1031535266324627457 3051 +17367371162468022278 3052 +16063095350159409665 3058 +16006913374966627331 3059 +0 3062 +317830424679224322 3062 +14882116247225631239 3064 +9977848214775454210 3071 +15016859152309685763 3073 +1451917599200393219 3076 +14163345466838668289 3079 +7124786413716748809 3080 +8972415547684808706 3089 +17905923295565835779 3091 +11508735911159903238 3094 +1060738927182784515 3100 +3235164743035444235 3103 +7249634886133244929 3114 +13627026919527422469 3115 +804144428748921345 3120 +4260278694170215937 3121 +2554890109424057864 3122 +0 3130 +2939022249034957313 3130 +3727916159743203841 3131 +14170274700031256577 3132 +7153627445263524879 3133 +6798175517396767234 3148 +1899052595905691141 3150 +4651137331222245891 3155 +14020723224952528387 3158 +5768869715157669895 3161 +13394211108659571714 3168 +15788932119193980932 3170 +13584005658508513793 3174 +9286626632069867523 3175 +2398026920081879562 3178 +1285989134179298818 3188 +9371873775174273029 3190 +18182246561705410049 3195 +3627164815665507843 3196 +18002283031389555722 3199 +13723140536667785217 3209 +11940684153082156547 3210 +16151440538186193925 3213 +13475891972713434115 3218 +5932226594251481096 3221 +15508203776273810434 3229 +13958242421862434307 3231 +2178759546197172739 3234 +12536204645038731778 3237 +14021691565090239498 3239 +0 3249 +18424936840617633797 3249 +9515558058741110274 3254 +14427656809453646337 3256 +15295479713001905676 3257 
+6924455800485778945 3269 +5547275743159208965 3270 +15965423529103676930 3275 +6276065480049782274 3277 +923852355669990415 3279 +5171389834127005698 3294 +15756927494767584258 3296 +5380717287071449607 3298 +6048706605171842052 3305 +10493631130929582093 3309 +2792686703001238018 3322 +16318095573166788102 3324 +14961739739381704706 3330 +13885085964549002242 3332 +8803999472247604229 3334 +13681809489997040642 3339 +1274343414475602434 3341 +17525390131260455942 3343 +4637625228183366658 3349 +8313154017818126861 3351 +13090076428282480132 3364 +18133227728108545 3368 +8282473413611347970 3369 +107193099920609282 3371 +8505179371271580173 3373 +11102079825957593602 3386 +10212767298703785475 3388 +5215453497761775618 3391 +3298152084179375111 3393 +1095163960428030473 3400 +16887781145875813889 3409 +14786085928210816520 3410 +8581278387803219458 3418 +6241337607249230852 3420 +9254719800476612099 3424 +2568855290428722689 3427 +1289519920250085381 3428 +14618186241114017793 3433 +9612541243912769538 3434 +13926515287424429066 3436 +11093957915681312769 3446 +12010544601346956290 3447 +11839562359654205442 3449 +6839541636025740804 3451 +6012482217637302795 3455 +0 3466 +5775335776577318914 3466 +2685494297938271233 3468 +18186802079969910787 3469 +3127521196291951624 3472 +6934893239724900866 3480 +11630798772510404609 3482 +2767762624498050052 3483 +14135084772626181124 3487 +11643008759045397001 3491 3500 -5303047073946667464 +14107087915135404740 +3545799512027105927 +32996413518841137 +15568274631689570656 +20587511236070012 +2390363305266056430 +3863606920688567965 210658854139 +9870724567599405 +103154228 +3007753865419557454 493093586 -15289397349632312454 -5941764183477191834 -3477193953305167424 -236453760381 -7470284155521404014 -24445261 -16426766960960540026 -14549236 -817365937 +814220189 +538968856 +45810044 +11403474 +2625321602296383846 +3076135121411313050 +16635669954819197974 +5514354727165429372 +18413391979390173264 +3544953467117898450 +6361518319333476776 +5833854247140395797 +518849275 +2752627 +71565807 +9870724570416301 +163316374 +60096910 +817038254 +18411417877468545037 +5993603989931887912 +1873618523431177265 +14787093348585572176 +18413109988782047308 +1283692271244348427 +17461812017531651650 +13165236096819726043 +14883032307819284131 +2789363538679106064 +11161692095903435283 +62914993 +2365498112266798670 +154665586 +13726068529822894439 +5570718 +544604964 +33560368941433940 +819856323 +1873618458944931675 +1873618489039064439 +6156738032733324876 +10259573046193883986 +6208295848581203181 +5991347927496394467 +2272905061487347697 +8972557000702888938 +15289397384024950845 +4767727591019973374 +10758418391935812957 +2292825785040636736 +1545208828 +219257441372 +5569296714050766113 +2207492642904016905 +12612941966326959190 +12426051065400527122 +18331556280207363 +2785415334835848520 +6156737968247080128 +15292217517958891614 +5780604328577598853 +3188833133853148985 +4078298757842341053 +6051485356288903427 +573178715 +102957618 +91488775 +2625321602296187261 +114426460 +22675774 +11206864 +9870724567402585 +5406444726343502428 +68551110 +515834601 +2431124533 +538772246 +11065179658016983681 +8930986418384079868 +4076606646528706921 1873618471841499416 -71893492 -10694515171064744788 -29330183088506125 -61997475 -4653200 -109445719 -8926052536804313893 -7528330190111771360 -1418462186 -5887104182899575287 -2625321597997091447 -23407864425745813 -1647838213 -6152225753094686522 -14151987057237756511 -18058417591402760409 -538510099 
-17855463731522440261 -240752528220 -27920040887059601 -11078361536363433136 -12517601 -15885957841278600403 -518718202 -805438326 -2621553 -1550910461 -2411070513 -59965836 -13012951802392676509 -97518103 -2625321602295859611 -30277976 -546374457 +3701601059573925529 +16166203682344470241 +6101795981361546864 +15289397371128186695 +7569568047215545466 +18411981910273949729 16759426304739641933 -259654328 -27356063970624739 -1873618458944931675 -6209987959894902621 -5728764444739437994 -18413109988782047308 -13885455448020813663 +48431492 +24535874148371011 +14024943 +59900299 +105775699 +10770155859627543824 +71369196 +9870724570219682 +163119765 +2530739313276357975 +5052785364214352114 +805372789 +5652457623480305518 +644809585 +816841645 +2556016 +4501477955215362649 +4502324021619918399 +2150364451440035988 +6156455943246842659 +1873618497637649718 +12309852946450942075 +3660444556051220001 +11103300151687644832 +8714520725396523830 +5461104765611607541 +27356033875641745 +5352348805862394041 +2012415014 +5151629580948802356 +5374107 +154468975 +108593749 +62718382 +16843031 +28311895 +1107456968073808590 +11490081257974859839 +16633695840000739887 +9386257335747873389 +4959080478982475006 +11408348231855703653 13464164481390611573 -5514354709969504081 -6364097374632348674 -2676033351739376985 -1136798196293306910 -5299098874403555921 -2120987217453057458 -17306856587979066781 -1873618532028844481 -5572365145471912335 -18412263926676652075 -105382480 -5303047039553965447 -9881712940254169714 -152830562 -8610102806501591788 -15524263781940136850 -14282671233461718187 -2857298572705729021 -29330122900898936 -10554335258691243263 -8453377129057749572 -18411417864571256842 -811271050 -1873618489038604579 -4657106642463886071 -2676033356038145381 -514654951 -10757572347027851837 -4237766514325588729 -571999061 -9821766011288487605 -7230168968130792223 -2704904949959166469 -1823671323 -103350839 -46006654 -2755882956846859930 -15289397371128186695 -12662636664722033563 -16318735 -18411417894664929297 -5462796894122411284 -9950019064427710530 -6981729909914862956 -1992588707391932346 -63766972 -6422699 -23407808536904833 -15394822466617412826 -16881139139804531782 -14312300901618944289 -2625321593698061230 -9870724570679212 -5780604289886653255 -3870997034531752803 -2531021389865944442 -10908568553618343357 -1860700038481053299 -196215461 -1801847830 -24183115 -18424247431471827427 -14287090 -417019855960 -71631344 -4391052 -61735328 -18413674012989259870 -2625321597996829544 -17957750408840481687 -9870724568648556 -41943405 -2789363542978135882 -18412827950883864637 -548143940 -22151483 -17257283845880874759 -899112529018292807 -538247952 -69599701 -8510664359869943178 -27356081165698156 -27638084672359236 -12255453 -11400819049620310987 -1321272283 -16881139122607162703 -2359405 -3101815889301670444 -518456056 -9232147856523987724 -3758799212073651272 -3591160524196219107 -154600049 -17946608694533885076 -11500631658516907905 -825323275339564903 -9870724566615620 -39911783 -12318365723907459763 -546112310 -18412827980977537092 -536216330 -2676033351739114988 -11069796553860646809 -7880043043777809442 -451412296787 -18411981918872141859 -11678577273375754735 -8856014234050823647 -105120332 -1309344723 -162464400 -681145240220010584 -2626514825137096412 -6589396841525218018 -356832249381 -6156738032733324876 -11202456151687629452 -27638041680086900 -11243723090649876783 -5726358144768542273 -12498251711624252784 -13702827714901707594 -811008904 +15494005608834598990 +1407386597 8192198 
-8714520725396523830 -514392806 -9960543895307946415 -15287141235608259625 -5727354401416546168 +219257244681 +42598769 +811008904 +2573543610120276856 +5356297048398365877 +7595953279435999504 +5726226297114658480 +2723374776553770162 +1543385872365455415 +11535686880442518166 +15289397379726773461 +5565348488711963913 +504169174 +9870724567205432 +14212253575230457510 +5831598111619679502 +2625321602295990612 +572982104 +813826970 +279448324634 +538575636 +11010253 +68354499 +11243723090649876783 +18331491793766525 +15292781563660995825 +5991347884505304103 +9409295256684857617 +3759645248384009814 +5832726134240118664 +14312300901618944289 +20305615210743190 +13001845694847518363 +2652485274356286816 +6151097653090126690 +2203332276215481610 +18412545964574834746 1808894516123993997 -3686437022462641529 +518456056 +2359405 +1321272283 +71172585 +417019398489 +18895516000586505 +162923155 +9870724570023121 +13828334 +2625321864544389907 +816645035 +8453377129057749572 +11949535972653271176 +1873618467543321286 5249797181178709209 -2625321589399030850 -103088691 -3062219857732765097 -830399540494469985 -530117487457144076 -12454108019635062383 -197984938 -8930986418384079868 -818873277 -16056587 -11526999220155450649 -6160551 -63504826 -7621890105505615217 -11847668763332905754 -10377426660276898779 -1873618519132015281 -18092519415945890646 -15882855708139391266 -7993599274919922706 -2789363538679106064 -2150364451440035988 -9870724570416301 -2625321593697799226 -91161094 -1410073577 -23920969 -7513578521803359945 -22279798815198594 -15520597512816297356 -1023125932615797552 -540017436 -8910392170935354895 -195953314 -644809585 -14024943 -71369196 -1873618476141774348 -816841645 -10906583479868327250 -1454041666728626384 -4128904 -18413392005184749654 -108921430 -468609401971 -16204201012116260706 -99025451 -9870724568385196 -18412545943079354421 -11878630053446878902 +5567604589840172352 +3707523343842937215 +17088205463377873568 +2169005683868174908 +9568723490388248888 +6103488088376871190 +4025969582498383295 +62521771 +18276979644936029994 +154272366 +16646420 +544211744 +28766107292140894 +5177496 +509805280 +1873618519132801026 +1873618544926132491 +7676326001635166459 +7676326031729298383 +869984510486186619 +13146357072728951328 +2000487899013646903 +2449021711964768402 +6155298010574883251 +6098975770044401989 +3189961199463959445 +2676033351739376985 +7995587 +19464489 +547029825 +219257046468 +2021331689141374237 +15288269301218674108 +11705421198335413148 +2508194873 +2625321610894575340 +6097847713031849822 +16064731596255856452 +13701595356116683915 +6364097396127827248 +18413391987988365394 +16364556117061994922 +10296839827164892306 +5403008449516603011 +15858116883009440274 +5833854255738587405 +45220217 +194314911 +10813643 +68157888 +56689033 +114033243 +4287350266942457603 +987047180239768912 +813630359 +18411417886066737167 +18413109997380239438 +11548493110908749415 +6364097387529046615 +5561348123192067576 +5835546388547569431 +5246976935469649046 +13884327378110449525 18204249488608200784 -5566476545725367766 -17951898368652543383 -7558005371879033601 -16542141154387102177 -6316393479032998553 -11694336983993944146 -11427331956784106382 -4662073785906890031 -1873618454645640429 -537985804 -12999620585941961275 -2295119206548507606 -11993306 -1597536180772867045 -5299098844309358384 -8294669686619703163 -69337553 -1873618506235448739 -518193910 -5406444726343502428 -16765215479188031591 -5460499803636172954 -3431717683755289915 -28202117477106938 
-5249797172580910311 -5745384143842643344 -14065038233622153931 -14311172801615955497 -16758489844492275047 -5510538272098551989 -11065487220741573048 -9870724566353399 -5679882735784101879 -259130038 -87097857 -3491703471172619422 -545850164 -18271599167641487963 -5991347923196709309 -1873618458944406678 -7033448275620070919 -812778389 -434977997061097911 -3445982126355516078 -2676033351738852867 -3545799512027105927 -1873618484739311861 -12749251354825264418 -14836382508930370955 -2625321585100000596 -21997756618246082 -8716776809328151764 -15580874176502892132 -3332575624131774585 -4445946672738010859 -5780604328577598853 -2848264744227112681 -1873618441749072804 -257098416 -4930631980557601532 -6877319166685482198 -1005889956380019628 -820642761 -17826079 -23125779236849772 -810746758 -7930050 -8929320279979198383 -9654763076979264499 -11949535972653271176 -1873618514832984063 -514130660 -18066207382028748450 -2573543666009114673 -18613585580197092 -1427238547443354327 -2625321589398768544 -102826544 -5903884228619468800 -4279043148 -7036226112429884975 -818611132 -15794439 -3324580943442478547 -1903640920853056624 -5898403 -1873618497637649718 -1133620887485417426 -10156853965084755435 -63242678 -282723005 -13586095437453200186 -9082058141968173941 -1987794462939089941 -13237708531286474753 -5240852582657493474 -1915314009235720841 -9870724570154139 -90898949 -17090754651615726815 -492307151 -195691169 -11050161621988804687 -23658823 -11623400942792738969 -9304480456320748248 -71107048 -816579498 -23971751058934778 -17869638717220195611 -1873618476141513316 -361675971417279818 -61211034 -1873618501936418049 -3866756 -567411536 -5302201063430292982 -8486888319115725460 -12406930521299355297 -9870724568123690 -11034422950646711803 -4287350254045103750 -5566476545725106758 -1923875870 -547619651 -6366353527348595732 +70975974 +9870724569826462 +816448424 +4211213383 +2162794 +12974919760129952993 +105382480 +5459976661309982295 +21433723812579518 +32432320527074663 +1873618497637255436 +9305858029919208637 +10225919154718574351 8597156797828894009 -13590665243542948895 -13237708561380147208 -4254959725487523541 -2907303882175415846 -1873618454645376983 -9230753948926543533 -11731158 -527827717 -5511666307614640107 -1330643932 -69075405 -28202091681942395 -4727296740454696303 -1992881785902860007 -18301216972081072101 -4076606659425995504 -9870724566091296 +12461042340477994821 +1455946274504313841 +9538952396691934382 +927164962728314711 +5782296426993943791 +9714916684781063078 +16449809 +4980885 +819266496 +2625321589399030850 +10907429529076434052 +257295025 39387493 154075756 -5459976644113468289 -545588016 -12461042340477994821 -223556406340 -32432337723721245 -19595563 -2573543610120276856 -24535874149025753 -5196265237615086368 +62325160 +1495925747 +288043895627 +4504298205224635444 +14835085562484362568 +16881139122607162703 +1839046019115124804 +11923578915473263059 +9388513449772451585 +5247593352907982888 +5153885686374731086 +12020808312486431384 +14848239906707278405 +5405598728725530322 +3653991426073234491 +5566476498435442740 +4333982245204396969 +17007720368052373541 +14458654042895551171 +16885259953617962521 +2676033351739180486 +6877309693745106245 +21997713627284659 +7562235540534921217 +2625321610894378836 +5458848587099997499 +1647838213 +288046714075 +1454859013759438228 +1133620887485417426 +237175467 +810615685 +1418462186 +12162857194684744950 +88080898 +19267879 +7798976 +546833214 +6206321690771522709 +21433680821684597 +1873618480439692390 
+3932922014897081298 +2549492329236335496 +5249797112394286460 +12294570438877711433 +2324121364801391676 +3315661715940248009 +8971880411373045432 +5461104782808583112 +18411981918872141859 +15371922320578972378 +361675971417279818 +90898949 +13390152586296232130 +492307151 +13522668389390157414 +538182415 +10617033 +12498251711624252784 +22085946 +1987794462939089941 +425617786716 +1730937871 +5356297014005859746 +5569296739846327213 +16881139139804531782 +4196703391028741586 +1873618476141710425 +821147663836514852 +3158171969379764633 +30176223702288623 17735566651085687884 -6204347601746593065 -1873618484739049815 -812516243 -6152225714402428442 -15291935501556190620 -15505670362359531298 -451411772583 -9484411285755463284 -161940107 -15292499508566297469 -563348302 -506004186 -11238431078799509026 -18323667541285735009 -2625321610894640833 -103179363763488430 -503001580666 -12769025487284210679 -17785259844527786731 -29612147900877606 -15290243377345399572 -17563932 -7667902 -3186488476490139978 -810484612 -1192315333980326167 -1873618514832721746 -15292499491370961900 -513868514 -5347351719937377689 -45220217 -11775490430040476325 -12240192446106372977 -35324256 -2396555433535145871 -7409502855497715015 -7888341864134085054 -4278781002 -1732546121802517809 -2374936041605498895 -21433680820701635 -12189960762281954023 -869984510486186619 -3598203394278688718 -6103488079777762245 -72876542 -16990917635978692369 -818348984 -15532291 -1146796961722731823 -17761874897365304540 -62980530 -4534407021717882867 -5636255 -32714379920409891 -12552846396214610071 -6262673798361580735 -2528483177756102046 -9870724569894177 -9297735470756268616 -5831598115918776853 -32432303331018178 -6064762127302393958 -6156455943246842659 -23396678 -13500652 -16916327697533962956 -70844900 -816317351 -18411699885273055253 -5884848047378859255 -5837238405281154301 -14311736903207619026 -5141736951422061236 -3604608 -31022281504523376 -3599049409094225259 -577045344 -2974323816123992770 -8021450341214588326 -3577503648415550265 -509805280 -9870724567861628 -11098517635487303139 -7462549834646555859 -98501157 -5779476207078475458 -219257375260 -490013379 -4222974949961697922 +1427238547443354327 +10223260478367337870 +10720606758114626648 +70779363 +105185869 +162529937 +9870724569630759 +24904017 +2681814701524780811 +1320879066 +1584661506 +644219759 +13435115 +6097847786116483627 +12477949191893683608 +6925759835249836137 +27920040887322186 +10003084053115964048 +16253198 +153879145 +2625321589398833886 +257098416 +4784274 +9103100569952650951 +12474564753552836994 +1495729137 +62128549 +9774054990929462949 +5356296971014964874 +6153353870293665804 +9568883447315500158 +1915314009235720841 +16655465042802838677 +14866462842593414402 +2676033351738984017 +546636604 +535167753 +42008942 +30540122 +6365225483234117329 +7602365 +282854078 +2625321610894182276 +13307798926833551183 +10913926882465549337 +15906307047154976446 +6104586261131037638 +8483828720841721486 +15287423226215073909 +17785259896117529586 +2785415278947600352 +9000175594581527004 +14425661002709010016 +5513226652957347114 +805679481429165719 +17859691850682797212 +9181555677596944971 +1363739614 +9870724566615620 +537985804 +572392279 +15175534989820758889 +1873618476141513316 +2152780467316001469 +12601357272775920269 +16765215479188031591 +6534429686359852912 6366353553143235674 -3158171969379764633 -21365044 -27638058876667848 -29330140097217635 -1873618454645114642 -2703776923039566000 -68813257 -279448782049 -814285726 
-12237654319976351671 -517669620 -5779476284463187670 -10375505326587315831 -18411699915366727708 -6205475624366966000 -3307734082 -39125348 -1087507565178193378 -545325868 -15986098390340470919 -223556143025 -19177592590632702 -8865366478519731984 -19333416 -32432337723461001 -812254097 -11305519054433421356 -1873618484738787248 -5105416417023100899 -572982104 -505742040 -563086155 -104333894 -8070528080642443989 -11327137566841769230 -2625321610894378836 -16377260960560187819 -15586729198848181726 -1873618441748546884 -18413109971585663048 -4825924017323379312 -5915592292141435844 +12689613402799605860 +9138963602338286574 +104989258 +644023149 +361131345578 +816055205 +9870724569433729 +70582752 +1309213649 +17634738504986593825 +5639662680184522626 +6316393479032998553 +16340493341965880015 +5344573059048999857 +34124461934314600 +5994450030541998229 +2625321589398637514 +2676819007 +15515140772745448064 +498702419026 +227855238971 +4587663 +16893851890367073119 +14264208198271043974 +555090760 +818873277 +61931938 +16056587 +8821966780582857359 +18411699885273055253 +4861149623842704773 +18413391996586557524 +18115578910873816258 5832726151436896491 -17247780946628644032 +365262179507571896 +16896582888638318388 +4445946672738929841 +17186370630874106258 810222466 7405754 -11549275701007551889 -10161648502327149991 -570950482 -1873618514832459339 -313841222762 -4452458274095237609 -1445774942907271091 -6101795934071424788 -92406286 -5293539447540681024 -18331491793766525 -197198505 -11199980773228349986 -32432320526091507 -818086838 -1997667722089860216 -2524806027085153844 -1964966944 -15270143 -1370042529145686776 -5565348523104797810 -18331539082773742 -62718382 -2012415014 -18413110001679335503 -5374107 -14282027259104724924 -10375505339483621145 -9887461037680036022 -1873618544926132491 -4662355883991631380 -18412263939573940270 -157614716 -3295137431799204142 -9870724569630759 -491782859 -214958343888 -16875205763331852041 -7241607903360452069 -5408471212899110030 -23134531 -18411417877468545037 -27356081166681957 -644023149 -70582752 -816055205 -3342460 -5246976952665638015 -14212253575230457510 -576783198 -1842511416005692464 -806159226 -5566476498435574920 -15292217517958891614 -13516735047310051359 -5728764487730398405 -468608617008 -4025969582498383295 -16044698410490725659 -1519546451849645365 -9870724567599405 -5566476545724581156 -5619444426388998007 -98239009 -547095362 -27356033875641745 -219257112483 -8140646021471143544 -4713167439824750602 -16357059045845960667 -5462796881224795644 -9138963602338286574 -21102898 -10905173367761798655 -13701595356116683915 -2477484405147109478 -1880166538706292058 -11206864 -1283692271244348427 -68551110 -5885543833259674054 -18413673995792875610 -2352415791 -14947075702982868 -5299098870103476096 -681145240220994278 -163447447 -331038328206 -38863202 -96207382 -153551462 -2625321606595348609 -5461104757014004985 -10744889200825601240 -1988559907 -258343605 -6517011693716180143 -535167753 -2530175340657839273 -811991951 -15291935475760762248 -4397798264919820154 -18413674025886548065 -12109395139072755174 -475082778886408323 -104071746 -161415815 -8697110475982376165 -15584540329550678645 -13669583335851559254 -2625321610894116800 -1873618441748286746 -18412827963781152832 -819856323 -6209141854797957852 -1783548230307677653 -18411981901675757599 -637928298 -7143606 -15855332315905657597 -2625321864544389907 -12020808312486431384 -3076135121411313050 -10139438201185111279 -6152225744495577231 -33560368941368890 -210659313158 
-4278256712 -27638024483702949 -24904017 -32432320525830439 -13263754581809432790 -817824692 -15007995 -359800716494834349 -18613516794268696 -9839328478246341893 -62456234 -5111959 -18411981931769430054 -16219982623696489082 -6261827792145090364 -7692717626264324682 -42664306 -13806855580317125108 -9870724569368358 -16269555352897260337 -214958081659 -11214563466575480865 -15636771529559117046 -13271165719268362246 -2652485274356286816 -538968856 -3784724792312663401 -18263821886743185772 -1986666427421953426 -5565348480114297669 -5352348827359053328 -12976359 -1873618476140725820 -421319345246 -70320604 -11703165067112811597 -21715697223994697 -3757107087862401328 -60424594 -3080312 -10697899350700788395 -1873618527730534170 -468608354196 -509280991 -50528646 -1193603335023233930 -16635669954819197974 -15426482629288462533 -5460499803637156023 -2625321602296318353 -9870724567336570 -97976862 -8818864638845060491 -14288223544298637564 -88080898 -6996745855548787140 -5566476571519223063 -546833214 -220421203678071202 -31022238513759415 -1873618458945389823 -6406389097441592980 -20840752 -813761433 -27356085465188671 -68288962 -5865888353649363875 -109394696450803010 -12213481117926952067 -18413391987988365394 -10944716 -517145329 -5723537903358642458 -21715753112570631 -7758478083289188556 -10675690836223986039 -153289315 -95945236 -11547019543992076059 -9649086479758069023 -2625321606595086582 -258081459 -544801575 -5887799994573980828 -2845029447323880298 -18809125 -8510103668314541335 -6205475701751155414 -1990332636357069057 -429916882098 -2673382969485886910 -1873618489039064439 -18413392018082037849 -10914208898869168291 -3773122177597967623 -161153669 -103809598 -14107087915135404740 -6366071515245381876 -18412545955976642616 -15289397371128645360 -5462796868327967227 -1402930148 -28202057290482949 -797695489810761887 -16777494 -18116142943679220675 -5142301044413893172 -17219576355390295334 -5249797112394286460 -13735950183222348532 -6881458 -29048192479791616 -16896582888638318388 -14517406836956661503 -5458848655886518922 -313840698753 -5197393273133271298 -3861350810962691992 -6375653898722412075 -16885380374869314205 -361129707266 -210659050964 -29048123694646491 -3017170418691476659 -1873618450347593089 -15290243360149277503 -14745847 -72090103 -14546784569801180959 -7431889721301470079 -6364097387529111599 -2435475427475262665 -1873618497636600365 -6151097734773868363 -62194086 -17083693200934636558 -32150372909516328 -4849811 -3172873313800750756 -2150364429944620611 -3862478902367620470 -9305858029919208637 -2625321597997287853 -2508194873 -491258567 -1408762855 -5015996636573993090 -2414921941537785811 -538706709 -5734260728554980678 -22610237 -12714212 -70058456 -6208295882974168451 -32714336929384395 -16643035121679272213 -20023641798084435 -4770547828131824981 -2818164 -1930668198955452820 -13726068529822894439 -468608091255 -5569296714050766113 -17490170188584258190 -8694008299851745161 -7073102484926630551 -155058804 -97714714 -40370537 -2625321602296056238 -1703347206 -15895039144349470066 -5352348805862656188 -3068049059797011246 -5880738612678821404 -12309852946450942075 -33560429128451329 -15289397384024950845 -4767727591019973374 -10682570 -10233718743719545342 -850088361543927300 -2792183694107936667 -1107456968073808590 -5759560470823897206 -162923155 -29612216687004362 -5875369269012203157 -95683088 -294416195335096411 -22279760122415532 -5639662680184522626 -17619012653768771484 -13237708544183762948 -8550520059753138843 -27356042474686002 -249849483538007723 
[omitted: large diff hunk over a numeric data file; thousands of removed (-) and added (+) lines, each a single large integer value, with no recoverable structure beyond the raw values]
+14443179094226111164 +2192639020 +9870724569761068 +105316943 +25035091 +162661010 +518193910 +5303047078245827995 +1903640920853056624 +18092519415945890646 +4127600455366674792 +6474545510181176536 +7731877951544692100 +11084138473134491150 +2625321589398965240 +1495860210 +154010219 +16384272 +15043322265989680207 +6204347601746593065 +4915348 +62259623 +468608617008 +1966081057 +1192315299587689576 +17256155806064642777 +1873618489038408278 +12662636664722033563 +1654120425802828663 +25099894056749168 +5299098874402571932 +2676033351739114988 +489423554 +30671195 +5411521012994803182 +42140016 +7733439 +2625321610894313322 +7329667560521271617 +6206321690771457172 +5967447917778832244 +2284412389694637269 +2572415553107265488 +18412827963781152832 +16904712944498838074 +15289397349632182266 +29330122899915877 +27356081166681957 +6173800107753209956 +538116878 +10551496 +3919394969679695174 +9870724578216632 +492241614 +8816997369364548341 +4662355849599126556 +16567374854657149772 +12884708026702235763 +6364097417622914469 +1873618532029106835 +8861613698626554738 6890349946557761313 -1411918553413126104 -162267790 -2474797953316292924 -1694703987789596868 -18172096623373846790 -28766090095429261 -1223976979390989739 -3221822110943152678 -104923721 -15185362616787929146 -10003084053115964048 -2625321585100065781 -437798118096833445 -1815348248 -31304323701802109 -152371807 -14046027923586223423 -2021331689141374237 -20869691006257762 -13044533461223476582 -16778219695595128445 -12057002331826554305 -17465760298758178660 -7576852735584046364 -129168850403198609 -820708298 -17891616 -1873618489038145001 -7995587 -11911353550167017696 -4522983015860209939 -12612941966326959190 -102892081 -2625321589398833886 -45547899 -11548493110908749415 -4076606693818764590 -7851156332894489575 -12779163922391107832 -5991347884505304103 -1095239150174145285 -3863606920688567965 -10771469979967884371 -15859976 -14312864964518020808 -17245750799710423012 -5963940 -10655291933708585535 -4162099616697747321 -63308215 -1873618519131818153 -30176189305784773 +5837238474067478018 +5780604294184830225 +11214563466576463780 +29612216687004362 +5516046782590617836 +10156853965084755435 +6151097683183797493 +11613165301442872555 +1986666427421953426 +6155045882728942511 +7033448275620070919 +2907303882175415846 +1320813529 +1584595969 +105120332 +7465404271946632160 +70713826 +24838480 +162464400 +12451287838412704489 +816186278 +644154222 +3735453693364143828 +9870724569564298 +1309344723 +21715680028329254 +13044533461222491710 +1873618497636993704 +3445982126355516078 +7529998377695250462 +12237654319976351671 +4534407021717882867 +3431353251379022211 +494159375523777824 +1136798196293306910 +16426766960960540026 +819004351 +12356593998396393868 +16187661 +3307734082 +14273081993166850387 +4718737 +434977997061097911 +62063012 +2625321589398768544 +39125348 +30458248699315998 +17858552114457937219 +5903884228619468800 +16872385650894636566 +10504814416598927327 +12213481117926952067 +18413674008690163805 +14101026494875963 +4709060078846741586 +2676033351738918494 +9714916620294556051 +13237708535585570818 +810353539 +2625321610894116800 53412232 +434216307724 +7536828 +41943405 +6770804071406897080 +821822415 318140582948 -15611911946388048179 +6365225453139920066 +4502324038816629924 +4030203865610717075 +18411699906768535578 +15290807392954681807 +11966722661119888545 +8618954206934993224 +12189960762281954023 +32432333423379563 +18413392018082037849 +6004915412369541960 +14546784569801180959 
+745740842898098858 +15289397293744523027 +5299098870104394759 +9257009393629660721 +5900805793554762144 +6155045917120857525 +21823800 +1317798870 +537920267 +1730675726 +1535706104 +9870724566550039 +14648423506775771515 +10531295876509927029 +3973490993339565383 +14312864964518020808 +14824583848163281869 +16940553195690134509 +1873618476141446514 +5778348218852443426 +5758903550139959418 +27356016680241600 +13940760354079114484 +5645056620059298667 +347984565637089693 +815989668 +9870724569368358 +5887799994573980828 +162267790 +517800693 +70517215 +15925946803693423456 +2625321597997353452 +16572875051796793000 +575144796 +104923721 +13172970 +14426056237756189706 +5909364179964069981 +5459976691403654584 +4397798273518472097 +27920040887059601 +1873618527730926929 +1873618467542665344 +18613585580197092 +32714392818354350 +18613499598604650 +5780604289886653255 +3865299049198390675 +22279760122415532 +18412545930182066226 +50397573 +153616999 +2625321589398571980 +1736311827 +15991050 +14665059300281706 +4522126 +7792373120121047026 +30458248699119542 +13951205954302381098 +17785259844527786731 +6444189713816225654 +747829823970020707 +8698802578697685482 +14477731067643038195 +18412263939573940270 +14318336791419094928 +15291371425760087333 +12109395139072755174 +30277976 +99090988 +282591932 +546374457 +490103571663 +15580874172203795679 +810156929 +7340217 +638124907 +259654328 +18809125 +18056758355458722638 +5679882735784101879 +7563081637033018620 +8520914754064026558 +283748271730 +67502526 +9870724566353399 +7242736046355514492 +572130134 +514786024 +214958409445 +29048192479791616 +2625321576501808484 +5354604872597767596 +29048106498198701 +2575517759332551933 +6311975551774360856 +14036340911856223966 +32150286927595340 +17291573824845253535 +14926165161459649868 12640696470018459947 -30176223702288623 -9870724570219682 -33278412725750974 -1409876968 -28766150282773591 -1873618450346674286 +17498716255300421272 +3968978683605551949 +16377260960560187819 +19177532404009207 +2625321597997156982 +24445261 +5245848878456439955 +421319345246 +5510538272098551989 +70320604 +3249068390006196153 +5888081980883929307 +1836516380 +12976359 +236453760381 +2141513421469058406 +1873618497636600365 +11878630053446878902 +6156456003434055463 +27638058877519937 +18413109962987470918 +6288511205539515238 +4770547828131824981 +4160689491693538063 +14836382508930370955 +12751507524739009261 +10427987387505837891 +2605266760616185153 +2524806001290315567 +33560429128451329 +4325515 +669516658 +15794439 +807142269 +5303047104041388600 +818611132 +61669791 +12644080653952551280 +6045857707735386835 +11229983338076703492 +2845029447323880298 +18412827972379344962 +6767393152337644543 +2673382969485886910 +15185362616787929146 +17490170188584258190 +4047541379259827663 +15680489859993767209 +546177847 +7143606 +637928298 +7276444624641068235 +12287601267178473523 +31022238513759415 +17698252132056434004 +1732546160493595960 +7036226112429884975 +2676033644081056812 +548995910 +90243587 +571933524 +812778389 +9870724566156739 +214958212644 +1873618446046923526 +3493083035910933027 +15291935501556190620 +14650572868605052119 +6971710725545264615 +17302333254828493968 +6098975847429179176 +4504298213822565083 +505938649 +3579577413 +2786543383251193103 +70123993 +47186305 +2352415791 +4279174221 +2625321597996960522 +1538130937 +161874570 +17082847207615236134 +6206321707968234614 +8854886129749066875 +10908568553618343357 +2785415326238639918 +1873618527730534170 +1873618441748940565 
+5745384143842643344 +18413674017288355935 +16044698410491643447 +9181531069949872018 +10905173367761798655 +13237708544183762948 +3757107087862401328 +1311572948 +2034107431 +15597828 +2538734651 +5727354392818878727 +4128904 +818414521 +95879699 +5727354422913010657 +5245848874158263187 +9664889374910385451 +18411699915366727708 +14851060135220743194 +17958290734101235336 +9319686106503382840 +89657146100418951 +11349795265056081195 +14540810596246030644 +5779476284463187670 +18415907 +156041850 +259261111 +821232589 +809763710 +98697768 +6946995 +5941764153383128192 +17684252729367202593 +10233694917695638297 +970700105235760464 +21715753112570631 +17953636526298302297 +6262673798361580735 +5847102830955857465 +3313969578832561394 +2974323816123992770 +13271165719268362246 +17083693200934636558 +6101795934071424788 +16990917635978692369 +812581780 +16327183209838150989 +21233971 +1535116279 +214958016090 +2625321606595545096 +3232498753 +1500709877 +514392806 +5831598146013367591 +4502324004423927097 +3099205763721988894 15290243360148359553 -14036340911856223966 -6365225461738636619 -816645035 -417019398489 -6206321673575531611 -12057284352529139627 -71172585 -13828334 -7528870385169533979 -5832726134240118664 -2785415334835848520 -2572415553107265488 +1873618476140856959 +3295137431799204142 +14130457194541352666 +8910392170935354895 +3967850626592737364 +18412545938780258356 +12583138 +505742040 +4278977611 +540148509 +24052042 +196084388 +563086155 +104333894 +2625321597996763849 +16324853745603185849 +13586095437453200186 +15804734059994287439 +18005251247539029895 +13516735047310051359 +3493677603186412637 +10159956468397444373 +5249797099496672683 +17763448248357489818 +18412263948172132400 61276571 +7630443591734791098 3932293 -9870724568188981 +72745468 +95683088 +15401217 +4076606693818764590 +15986098390340470919 +1873618519131556994 +9386257309953099582 +8501910827968825512 +168849237244054062 +6750384 +545784627 +2625321585100000596 +1652810939277510396 +580191075 +98501157 +5198803303557629187 +3297856681506178941 +3935742187522887052 +2601013084734032090 +11500631658516907905 +8021450341214588326 +14977809576148535095 +4127600472563058730 +16965951797418331227 +27356081165698156 +491258567 +12804866717273491655 +1408762855 +2573543666009114673 +2200512120787569683 +2625321606595348609 +21037361 +14462121002204464918 +5619444426388998007 +3973491023432910866 +12103109825679658143 +7260902865540482639 +5566476571519223063 +18413109971585663048 +17791918762976347730 +16365628939578247566 +4449074137450482853 +11214563466575480865 +7239069803025663720 +17952462371364276975 +9512531412808567772 +11075097734987253589 +2373415502940997016 +16874702537456224943 +517014256 +2573543627316201844 +4278781002 +69730775 +9870724568582655 +12386527 +12743882002561631754 +10906583475570214623 +104137283 +35324256 +10167863869407233224 +18412827980977537092 +363084051790629688 +11694336983993944146 +1873618441748546884 +32432320525830439 +12654580528992553525 +7241043922144659849 +9391897706793274792 +152830562 +1402930148 +164299420 +5303047073946667464 +3735682 +61079961 +15204606 1873618549225491555 -2360543918673038210 -98828841 -12512221777814685432 -17939922315943150958 -6045857707735386835 -21692726 -4502324038816629924 -11490081257974859839 -17639632887023929831 -1316357237551401394 -6101795994259359091 -11796695 -69140942 -18411699889572151318 -12074216556992400767 -1320813529 -8618954206934993224 -164037275 -4160546838840674266 -12591757708863407913 -555549513 -9870724566156739 
-154141293 -32714414313178248 -545653553 -223556471268 -12613788024133322735 -812581780 -5778348150066318224 -1500709877 -6741138607599781046 -9227353569080969220 -515965674 -13884327378110449525 -18411699919665823773 -16340493341965880015 -162005644 -620757861 -21997756618049241 -17007720368052373541 -13001845694847518363 -227855238971 -17629469 -1737950228 -9288263741171697848 -20305615210743190 -1873618489037883086 -18613533990193666 -7733439 -313841551493 -15288551330518206781 -17302333254828493968 -6153071832396467338 +3188833116656765520 +31586327206235137 +820839372 +464309454125 +18022689 +545588016 +17205553309096938840 +313838798363 +223556406340 +98304546 +15463390673086056969 +4240022615453076686 +10831084194895235709 +11549275701007551889 +155648632 +6553773 +534119176 +4222974949961697922 +8326286517935867839 +1873618454645114642 +1146796961722731823 +5509410202188647833 +1873618514833377412 +3242943116712479419 +29330157293667421 +8882845388820581451 +12608147700378373379 +14465116522071263669 +5461104757014004985 +9649086479758069023 +2625321606595152102 +513999587 +20840752 +2148672322930150296 +10646954815923686447 +10831360821402142464 +313841615983 +10139438201185111279 +16881311723980129501 +18413674025886548065 +2785415274648570354 +5353476789791099071 2979056014524680527 -8857706336766199103 -2625321589398571980 -45285754 -5991347884505041337 -4502324004423927097 -16874702537456224943 -14911447610171655366 +6366071515245381876 +8610102806501591788 +10333839787251271664 +13237708552781955078 +451412690018 +16101055855214332856 +9870724568385196 +12189916 +23658823 +195691169 +5155859771100236117 +69534164 +35127645 +103940672 +11069796609748044689 13944990587222231178 -3308118261903721908 -18413109975884759113 -8412057600244518110 -15597828 -2538734651 -818414521 -17082847207615236134 -18276979644936029994 -5701792 -63046067 -5882159696614657105 -1410790466305853323 -18412263913779363880 -32714379920475611 -539325825270679628 -1873618519131556994 -13536993689470216 -9870724569957729 -43254135 -5153885686374731086 -9387385384162626351 -8336200085500660803 -5303047104041388600 -5512098595943810546 -5717788221838658971 -2324121364801391676 -12012735189037878155 -2192639020 -1873618476141316771 -70910437 -3670145 -2219404100148201532 -2544580112253650683 -61014424 -6155045921420412650 -18412263943873036335 -1873618549225229533 -9870724567926898 -98566694 +27920101074341046 +17298949057997047589 +2908260051937332390 +6364097413323754682 +12350988444867431112 +1223976979390989739 +5782296431293302176 +11098517635487303139 +13525196865559988902 +2374936041605498895 +15007995 +1574765567 +519635711 +5831598103022077418 +576979807 +817824692 +634323816 +3539071 +2446394423 +6206321673575531611 +2360543918673038210 +27638024484621167 +11340219378265033230 +6366071472254485645 +4562124351240801677 29894215892535509 -155910777 -6366353527348399255 -9956242218935388443 -31586340104504804 -219257441372 -13522668389390157414 -18411417881767641102 -11534547 -279448847671 -7242736046355514492 -68878794 -814351263 -1192315299587689576 -2524775482 -34124461934314600 -507839197 -5539270545646881104 -4974759074281293673 -5337229686545450161 -153879145 -12644080653952551280 -30458205707308380 +6153353844499089111 +13070886371126478108 +9181481831755875838 +18067928196024961188 +6981729909914862956 +63701435 +6357162 +15288269305517836796 +17299513133793348673 545391405 -17877509356004052233 +17826079 +820642761 +98107936 +8854886172739175692 +9082058141968173941 +1873618484739049815 
+11514789185312918199 +5778348197355914873 +11130039777759856047 +294416195335096411 +846140170598090257 +2571498445011814318 +18412545947378450486 +1408369638 +2625321606594955469 +5245848947242502849 +365428082633 +5245848917148372136 +10859426818132543221 +15524263781940136850 +2578187325 +17564225130023161250 +811991951 +1694703987789596868 +1873618450346936800 +12105446909435186010 +14975681650483333306 +32432303330887118 +29612220986426501 +11644189250161151139 17520266449292560845 -11065487246536017596 -2011949215506761725 -6155045882728942511 -812319634 -1130753852548581517 -573047641 -5299098874402571932 -18413674000091971675 -18331556280207363 -17269866578628118199 -15289397293744523027 -161743496 -10649664295314066054 -6051485356288903427 -4347925833116091776 -30458188511970924 -104399431 -10184384893691038634 -7401639761433855789 -1308623824 -563151692 -2625321610894444316 -7239069803025663720 -11434534198373320614 -1873618441748613384 -5622264654903379074 -29330122899915877 -15636380174699072146 -820184006 -2597848126 -10233694917695638297 -14585410861575638263 -7471291 -85348920764927349 -6366353492955694732 -18413674030185644130 -4127600472562141528 -35127645 -5780604337176709161 -541328159 -2524806001290315567 -13850612818404510827 -18412827968080248897 -15335680 -3493395603981665996 -17858552114457937219 -62783919 -3875793754648151904 -5564423899624572258 -292345154665 -3489447322753895731 -18411981905974853664 -5439644 -42991988 -9870724569695611 -12269921124804135698 -559088458 -33278386930321618 -15289397353931868100 -214958409445 -6219166245997316001 -15289397379726773461 -30458248699315998 -23200068 -12163381674616883890 -70648289 -9000175594581527004 -806224763 -89657146100418951 -15475002888547338265 -3407997 -60752278 -18411981936068526119 -14267039342724252928 +92275213 +335336768790 +69337553 +7290339324003420579 +17621268802185464283 +161088132 +9870724568188981 +516621038 +11993306 +507299956084 +210659444315 +103744061 +13151687854617134836 +8659114857360722535 +825323275339564903 +103179363763488430 +684134210602468610 +1873618501936418049 +6205475723246636047 +5516046752497929091 +15885957841278600403 +2477484405147109478 +16875205763331852041 +72155640 +472907842721 +14471968401314024391 +806159226 +1712194570 +576783198 +1815413785 +2446197814 +14811384 +507970270 +8929038315166239946 +3342460 +3220426554520570467 +2625321593698192308 +5677488692584514734 +21433663625497129 +2435475427475262665 +16940455997476965252 +6153071806602085789 +5865888353649363875 +17465760298758178660 +13263754581809432790 +8716776809328151764 +13112992413209136128 +6153353788611431303 +3784724792312663401 +12590629664748537952 +2676033356038342054 +14219872676477209184 +11327137566841769230 +63504826 +97911325 +9339868219275806468 13726068525522684375 -1873618527730862181 -4504298213822565083 -155648632 -98304546 -9870724567665640 -13681696359428851594 -219257178788 -24535844054893958 -50011031689890353 -10532987940533372886 -11272401 -23407795639356361 -68616647 -814089116 -15635925519041823968 -1998521381 -163512984 -797977540607610221 -32150286927595340 -4709060078846741586 -5967447917778832244 -5885976078596834724 -2625321606595414132 -153616999 -1744643526947965735 -17461812017531651650 -987047180239768912 -30740239306197230 -15288833278135765839 -525337347 -5885976155981547843 -18413391992287461459 -10532987970627045461 -56689033 -5722409915131627177 -114033243 -10159956468397444373 -18412545930182066226 -5349367342193968413 -13819010092172884 -104137283 
-17953636526298302297 -2224234517276395067 -2789363555875490728 -2625321610894182276 -12426051065400527122 -9355193091131312182 -30740222110861163 -14361095630442006439 -3137288237381257087 -17105177 -819921860 -7209143 -1727529996 -810025856 -805679481429165719 -17298949057997047589 -21997713627284659 -16120716880803858984 -33560368941433940 -1535706104 -10229733804179524009 -18412545960275738681 -9714916620294556051 -4078298775038527628 -5461104765611607541 -210659378559 -92209676 -13418544886826534789 -14264208172476401284 -1917322269 -197001895 -24969554 -5405598728725530322 -15073532 -817890229 -72417787 -1873618471842024407 -17091318705916150977 -5946696443085589628 -5177496 -5847102830955857465 -62521771 -1873618523431831649 +2011949215506761725 +1737950228 +6160551 +9830100417878166271 +155255415 +17629469 +8140646021471143544 +545194794 +8510103668314541335 +18411417868870352907 5835546371351184527 -14824583848163281869 -42729843 -9870724569433729 -5780604315680310424 -16385074671182940805 -214958147231 -3007753865419557454 -491586249 -17943317531893566468 -1801912319444323213 -22937920 -539034393 -27356055371580547 -1873618476140792146 -5198803303557629187 -6103488088376871190 -13041896 -1733362705 -70386141 -2306802734 -643826540 +18413109980183855178 +5249797172580910311 +10532987940533372886 +32714379920409891 +1873618514832984063 +13702827714901707594 +29330157293274228 +220421203678071202 +5565348467217401524 +313841222762 +570950482 +13012951802393594980 +6209141854797957852 +5717788221838658971 +5460499872422693597 +8444237263823374707 +2544580112253650683 +32432303330691092 +14986955239351847842 +4392112055939237960 +16285378285009240167 +6205475671656957491 +11266915032714840583 +15289397375426759758 +17284241873202253123 +1783548230307677653 +195297952 +69140942 +23265605 +11796695 +210659247559 +17257283845880874759 +451412296787 +92078603 +160891523 +539362075 +103547450 +9870724567992379 +11331649863678691999 +12613788024133322735 +13944415416121166662 +15895039144349470066 +8816997365064994109 +1732546121802517809 +13221120945827219803 +3863606942184311140 +12562453432512743836 +7562235583526800081 +9870724570810095 +71959029 +232154598652 +14614773 3145849 -14637903957824965363 519242494 -60490131 +2625321593697995819 +1133620930477295468 +817431474 805962615 -5784522635265967958 -1873618527730601376 -18301216972082383618 -11644189250161151139 -2625321602296383846 -9870724567402585 -98042399 -15741861301866530650 -494403323033 -6729754102968812754 -546898751 -6208295835683456476 -33560403333875446 -14409153078548760239 -15530271666638163275 -1873618458945456185 +4131548706499659810 +60490131 +503001777494 +6206321673575138470 +1258091056198584472 +3573803894998305775 +10967349376607587326 +1873618523431569790 +6153071806601889790 +12749251354825264418 +9625506809262378259 +2676033356038145381 +15635925519041823968 +5885976078596834724 +9484411285755463284 +532291916112267238 +18411981901675757599 +1703347206 +33560368941827284 +5303047039553965447 +40370537 +97714714 +155058804 +6261263733545503259 +5963940 +63308215 +1130753852548581517 +5570988833963444820 +18157949162008873831 +8021450371307931626 +2861086850442987769 +1873618489039455401 +18413674034484740195 +1873618458945324208 +32714349826081871 +18424247431471827427 +1842511416005692464 +6589396841525218018 +5782296448490276391 +13237708561380147208 +27356055371580547 +5462796868326918190 +1860700038481053299 +5458848587100981064 +3580814869236944221 +5566476545725106758 +28202091681875145 
+5915592292141435844 +11434534198373320614 +15740733274947783803 +10161648502327149991 +15287141235608259625 +12779163922391107832 +68944331 +814416800 +1823671323 +23068994 +210659050964 +46006654 +516227820 +11600084 +103350839 +361129707266 +13750803869111880047 +103179363763095696 +1873618501936022824 +2933734509745341832 +7230168968130792223 +14517406836956661503 +17619012718254098754 +12406930521299355297 +4408861808311732424 +2949238 +9870724570613070 +60293520 +503001580666 +14947075702982868 +1998521381 +2625321593697799226 +14418163 +163512984 +71762418 +5722409915131627177 +11599686562536949325 +1873618493337242815 16951650337051970851 -5144036663261072615 -813826970 -12133908888583014197 -68354499 -11010253 -279448324634 -14749580058850363919 -6633286351216577743 -2089265852158774334 -8929038315166239946 -31586271318836879 -13678484518713821516 -105906772 -96010773 -2625321606595152102 -153354852 -10831360821402142464 -5652457623480305518 -8503320935775669540 -16483453074211931840 -363084051790629688 -544867112 -258146996 -5944020284604679310 -5782296431293302176 -28484176870181368 -23407778443758207 -3973491023432910866 -5778348175860436286 -1873618514834032208 -5438906422044199526 -103875135 -7697026996393675938 -1709507593 -161219206 -13237708548482859013 -3701601059573925529 -879419277503368073 -3822179681402096264 +2676033356037948725 +18412545955976642616 5565348445721659362 -532291916112267238 -256115374 -1460339693 -13351948495571782591 -14665351642484132 -3008657884776564221 -2341393787733871788 -16904712944497920326 -3967850626592737364 -16843031 -4131548702199581670 -6946995 -809763710 -1928986057181235415 -11964228788262537512 -2989761681675848960 -1873618519132801026 -7276444624641068235 -5994450030542718433 -12284124821458521275 -111739480 -4076606646528706921 -13650504529854072320 -15804734059994287439 -14425661019905001872 -2395604016 -14465116522071263669 -210659116497 -15290243360149343057 -15777957523720635747 -10167863869407233224 -18331517588211470 -12884708026702235763 -14811384 -72155640 -7042731044489660311 -15288269305517836796 -5675796551176948530 -14264208198271043974 -1495860210 -5787083718919720300 -25099894056749168 -683965395648908415 -62259623 -4915348 -12974919760129952993 -6155045917120857525 -1873618523431569790 -9013091190501541709 -4392112055939237960 -2625321597997353452 -15897908900500866947 -6177363174264606048 -15872788267758849077 -491324104 -33560399034844286 -22675774 -17542946455516547053 -2431124533 -538772246 -27920040887322186 -8704274751914773568 -12085352355710699032 -6153353775713551670 -70123993 -27356081166223293 -7885152524183078888 -60227983 -2883701 -11700344903086704893 -7329667560521271617 -518980348 -5833854255738521265 -8618954206935976415 -3901910077209972079 -1713308683 -1992881785903908578 -4530582984922301900 -16130159995999161574 -155124341 -2625321602296121720 -1884114794138700522 -5778348218852443426 -97780251 -4240022615453076686 -6097847786116483627 -6361518319333476776 -30540122 -28484146776247610 -546636604 -5741055947585816645 -6100103891543657570 -8807886331112851129 -813564822 -10223260478367337870 -746324852 -15287423226215073909 -11226550812567014265 -1491796976 -8097653480026868144 -5995296157134227520 -1873618532029106835 -1539245050 -48300418 -331037869860 -95748625 -6314795724398267312 -5888081980883929307 -544604964 -34124418943289166 -5245848947242502849 -32432363517642192 -2676033356038407648 -811533196 -1317733333 -8920676095134336910 -17149817495305717193 -918014392040164136 -103612987 
diff --git a/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py
index a009802bab0..a8c5ae3b6a3 100644
--- a/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py
+++ b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py
@@ -14,9 +14,10 @@ def with_nulls(request):
 @pytest.mark.parametrize("nrows", [30, 300, 300_000])
 @pytest.mark.parametrize("nkeys", [1, 2, 4])
 def test_groupby_maintain_order_random(nrows, nkeys, with_nulls):
+    rng = np.random.default_rng(seed=0)
     key_names = [f"key{key}" for key in range(nkeys)]
-    key_values = [np.random.randint(100, size=nrows) for _ in key_names]
-    value = np.random.randint(-100, 100, size=nrows)
+    key_values = [rng.integers(100, size=nrows) for _ in key_names]
+    value = rng.integers(-100, 100, size=nrows)
     df = cudf.DataFrame(dict(zip(key_names, key_values), value=value))
     if with_nulls:
         for key in key_names:
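The hunk above is the first of many in this changeset that replace the legacy global-state `np.random.*` calls with a locally seeded `np.random.default_rng` Generator. As a quick reference for the pattern (an illustrative sketch, not part of the patch), the legacy calls map onto Generator methods roughly as follows:

```python
import numpy as np

# Seeded Generator replaces global seeding via np.random.seed(0).
rng = np.random.default_rng(seed=0)

rng.integers(0, 100, size=10)    # was: np.random.randint(0, 100, 10)
rng.random(10)                   # was: np.random.random(10) / np.random.rand(10)
rng.standard_normal(10)          # was: np.random.randn(10)
rng.choice([True, False], 10)    # was: np.random.choice([True, False], 10)

arr = np.arange(10)
rng.shuffle(arr)                 # was: np.random.shuffle(arr); still shuffles in place
```

Unlike the global API, each Generator carries its own state, so a test that seeds its own `rng` is deterministic regardless of which other tests ran before it.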
diff --git a/python/cudf/cudf/tests/pytest.ini b/python/cudf/cudf/tests/pytest.ini
deleted file mode 100644
index 496a322ff80..00000000000
--- a/python/cudf/cudf/tests/pytest.ini
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
-
-[pytest]
-markers =
-    spilling: mark benchmark a good candidate to run with `CUDF_SPILL=ON`
-xfail_strict = true
-filterwarnings =
-    error
-    ignore:::.*xdist.*
-    ignore:::.*pytest.*
-    # some third-party dependencies (e.g. 'boto3') still using datetime.datetime.utcnow()
-    ignore:.*datetime.*utcnow.*scheduled for removal.*:DeprecationWarning:botocore
-    # Deprecation warning from Pyarrow Table.to_pandas() with pandas-2.2+
-    ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning
-    # PerformanceWarning from cupy warming up the JIT cache
-    ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning
-    # Ignore numba PEP 456 warning specific to arm machines
-    ignore:FNV hashing is not implemented in Numba.*:UserWarning
-addopts = --tb=native
diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py
index cea86a5499e..691da224f44 100644
--- a/python/cudf/cudf/tests/series/test_datetimelike.py
+++ b/python/cudf/cudf/tests/series/test_datetimelike.py
@@ -266,3 +266,25 @@ def test_pandas_compatible_non_zoneinfo_raises(klass):
     with cudf.option_context("mode.pandas_compatible", True):
         with pytest.raises(NotImplementedError):
             cudf.from_pandas(pandas_obj)
+
+
+def test_astype_naive_to_aware_raises():
+    ser = cudf.Series([datetime.datetime(2020, 1, 1)])
+    with pytest.raises(TypeError):
+        ser.astype("datetime64[ns, UTC]")
+    with pytest.raises(TypeError):
+        ser.to_pandas().astype("datetime64[ns, UTC]")
+
+
+@pytest.mark.parametrize("unit", ["ns", "us"])
+def test_astype_aware_to_aware(unit):
+    ser = cudf.Series(
+        [datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc)]
+    )
+    result = ser.astype(f"datetime64[{unit}, US/Pacific]")
+    expected = ser.to_pandas().astype(f"datetime64[{unit}, US/Pacific]")
+    zoneinfo_type = pd.DatetimeTZDtype(
+        expected.dtype.unit, zoneinfo.ZoneInfo(str(expected.dtype.tz))
+    )
+    expected = ser.astype(zoneinfo_type)
+    assert_eq(result, expected)
diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py
index 979c936a182..af9a6c7e696 100644
--- a/python/cudf/cudf/tests/test_array_function.py
+++ b/python/cudf/cudf/tests/test_array_function.py
@@ -33,7 +33,7 @@ def __array_function__(self, *args, **kwargs):
 missing_arrfunc_reason = "NEP-18 support is not available in NumPy"

-np.random.seed(0)
+rng = np.random.default_rng(seed=0)

 @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason)
@@ -49,7 +49,7 @@ def __array_function__(self, *args, **kwargs):
     ],
 )
 def test_array_func_cudf_series(func):
-    np_ar = np.random.random(100)
+    np_ar = rng.random(100)
     cudf_ser = cudf.Series(np_ar)
     expect = func(np_ar)
     got = func(cudf_ser)
@@ -74,7 +74,7 @@ def test_array_func_cudf_series(func):
     ],
 )
 def test_array_func_cudf_dataframe(func):
-    pd_df = pd.DataFrame(np.random.uniform(size=(100, 10)))
+    pd_df = pd.DataFrame(rng.uniform(size=(100, 10)))
     cudf_df = cudf.from_pandas(pd_df)
     expect = func(pd_df)
     got = func(cudf_df)
@@ -91,7 +91,7 @@ def test_array_func_cudf_dataframe(func):
     ],
 )
 def test_array_func_missing_cudf_dataframe(func):
-    pd_df = pd.DataFrame(np.random.uniform(size=(100, 10)))
+    pd_df = pd.DataFrame(rng.uniform(size=(100, 10)))
     cudf_df = cudf.from_pandas(pd_df)
     with pytest.raises(TypeError):
         func(cudf_df)
@@ -105,7 +105,7 @@ def test_array_func_missing_cudf_dataframe(func):
     ],
 )
 def test_array_func_cudf_index(func):
-    np_ar = np.random.random(100)
+    np_ar = rng.random(100)
     cudf_index = cudf.Index(cudf.Series(np_ar))
     expect = func(np_ar)
     got = func(cudf_index)
@@ -125,7 +125,7 @@ def test_array_func_cudf_index(func):
     ],
 )
 def test_array_func_missing_cudf_index(func):
-    np_ar = np.random.random(100)
+    np_ar = rng.random(100)
     cudf_index = cudf.Index(cudf.Series(np_ar))
     with pytest.raises(TypeError):
         func(cudf_index)
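One observation on the `test_array_function.py` hunks above: the seeded Generator is created once at module scope, so the values any given test draws depend on how many tests in the module ran before it (the module is reproducible as a whole; individual tests are not). A per-test fixture, sketched below as a hypothetical alternative rather than what the patch does, would make each test independently reproducible:

```python
import numpy as np
import pytest


@pytest.fixture
def rng():
    # Fresh Generator per test: draws no longer depend on execution order.
    return np.random.default_rng(seed=0)


def test_example(rng):
    assert rng.random(3).shape == (3,)
```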
diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
index 5acdf36de80..17ef033ea9e 100644
--- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
+++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
@@ -600,12 +600,12 @@ def test_avro_reader_multiblock(
     else:
         assert dtype in ("float32", "float64")
         avro_type = "float" if dtype == "float32" else "double"
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)
     # We don't use rand_dataframe() here, because it increases the
     # execution time of each test by a factor of 10 or more (it appears
     # to use a very costly approach to generating random data).
     # See also: https://github.com/rapidsai/cudf/issues/13128
-    values = np.random.rand(total_rows).astype(dtype)
+    values = rng.random(total_rows).astype(dtype)
     bytes_per_row = values.dtype.itemsize

     # The sync_interval is the number of bytes between sync blocks. We know
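A side note on the avro hunk: `rng.random(total_rows)` always draws `float64`, which is then cast with `.astype(dtype)`. `Generator.random` can also fill `float32` directly via its `dtype` argument, avoiding the temporary `float64` array, though the two approaches are not guaranteed to produce bit-identical values (illustrative sketch, not part of the patch):

```python
import numpy as np

rng = np.random.default_rng(seed=0)

# What the patch does: draw float64, then downcast.
a = rng.random(8).astype(np.float32)

# Direct float32 generation; skips the float64 temporary.
b = np.random.default_rng(seed=0).random(8, dtype=np.float32)
```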
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index 2e8519509e2..949fa909b5b 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -2,7 +2,6 @@
 import decimal
 import operator
-import random
 import warnings
 from itertools import combinations_with_replacement, product
@@ -179,7 +178,13 @@
 @pytest.mark.parametrize("obj_class", ["Series", "Index"])
 @pytest.mark.parametrize("binop", _binops)
-def test_series_binop(binop, obj_class):
+def test_series_binop(request, binop, obj_class):
+    request.applymarker(
+        pytest.mark.xfail(
+            binop is operator.floordiv,
+            reason="https://github.com/rapidsai/cudf/issues/17073",
+        )
+    )
     nelem = 1000
     arr1 = utils.gen_rand("float64", nelem) * 10000
     # Keeping a low value because CUDA 'pow' has 2 full range error
@@ -187,13 +192,15 @@
     sr1 = Series(arr1)
     sr2 = Series(arr2)
+    psr1 = sr1.to_pandas()
+    psr2 = sr2.to_pandas()

     if obj_class == "Index":
         sr1 = Index(sr1)
         sr2 = Index(sr2)

+    expect = binop(psr1, psr2)
     result = binop(sr1, sr2)
-    expect = binop(pd.Series(arr1), pd.Series(arr2))

     if obj_class == "Index":
         result = Series(result)
@@ -204,7 +211,8 @@
 @pytest.mark.parametrize("binop", _binops)
 def test_series_binop_concurrent(binop):
     def func(index):
-        arr = np.random.random(100) * 10
+        rng = np.random.default_rng(seed=0)
+        arr = rng.random(100) * 10
         sr = Series(arr)

         result = binop(sr.astype("int32"), sr)
@@ -223,8 +231,9 @@ def func(index):
 @pytest.mark.parametrize("obj_class", ["Series", "Index"])
 @pytest.mark.parametrize("nelem,binop", list(product([1, 2, 100], _binops)))
 def test_series_binop_scalar(nelem, binop, obj_class, use_cudf_scalar):
-    arr = np.random.random(nelem)
-    rhs = random.choice(arr).item()
+    rng = np.random.default_rng(seed=0)
+    arr = rng.random(nelem)
+    rhs = rng.choice(arr).item()

     sr = Series(arr)
     if obj_class == "Index":
@@ -247,10 +256,11 @@ def test_series_binop_scalar(nelem, binop, obj_class, use_cudf_scalar):
     "lhs_dtype,rhs_dtype", list(product(_int_types, _int_types))
 )
 def test_series_bitwise_binop(binop, obj_class, lhs_dtype, rhs_dtype):
-    arr1 = (np.random.random(100) * 100).astype(lhs_dtype)
+    rng = np.random.default_rng(seed=0)
+    arr1 = (rng.random(100) * 100).astype(lhs_dtype)
     sr1 = Series(arr1)

-    arr2 = (np.random.random(100) * 100).astype(rhs_dtype)
+    arr2 = (rng.random(100) * 100).astype(rhs_dtype)
     sr2 = Series(arr2)

     if obj_class == "Index":
@@ -271,8 +281,9 @@ def test_series_bitwise_binop(binop, obj_class, lhs_dtype, rhs_dtype):
     "dtype", ["int8", "int32", "int64", "float32", "float64", "datetime64[ms]"]
 )
 def test_series_compare(cmpop, obj_class, dtype):
-    arr1 = np.random.randint(0, 100, 100).astype(dtype)
-    arr2 = np.random.randint(0, 100, 100).astype(dtype)
+    rng = np.random.default_rng(seed=0)
+    arr1 = rng.integers(0, 100, 100).astype(dtype)
+    arr2 = rng.integers(0, 100, 100).astype(dtype)

     sr1 = Series(arr1)
     sr2 = Series(arr2)
@@ -438,9 +449,10 @@ def test_str_series_compare_num_reflected(
 def test_series_compare_scalar(
     nelem, cmpop, obj_class, dtype, use_cudf_scalar
 ):
-    arr1 = np.random.randint(0, 100, 100).astype(dtype)
+    rng = np.random.default_rng(seed=0)
+    arr1 = rng.integers(0, 100, 100).astype(dtype)
     sr1 = Series(arr1)
-    rhs = random.choice(arr1).item()
+    rhs = rng.choice(arr1).item()

     if use_cudf_scalar:
         rhs = cudf.Scalar(rhs)
@@ -465,9 +477,9 @@ def test_series_compare_scalar(
 @pytest.mark.parametrize("nelem", [1, 7, 8, 9, 32, 64, 128])
 @pytest.mark.parametrize("lhs_nulls,rhs_nulls", list(product(_nulls, _nulls)))
 def test_validity_add(nelem, lhs_nulls, rhs_nulls):
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)
     # LHS
-    lhs_data = np.random.random(nelem)
+    lhs_data = rng.random(nelem)
     if lhs_nulls == "some":
         lhs_mask = utils.random_bitmask(nelem)
         lhs_bitmask = utils.expand_bits_to_bytes(lhs_mask)[:nelem]
@@ -478,7 +490,7 @@ def test_validity_add(nelem, lhs_nulls, rhs_nulls):
     else:
         lhs = Series(lhs_data)
     # RHS
-    rhs_data = np.random.random(nelem)
+    rhs_data = rng.random(nelem)
     if rhs_nulls == "some":
         rhs_mask = utils.random_bitmask(nelem)
         rhs_bitmask = utils.expand_bits_to_bytes(rhs_mask)[:nelem]
@@ -525,8 +537,9 @@ def test_validity_add(nelem, lhs_nulls, rhs_nulls):
 )
 def test_series_binop_mixed_dtype(binop, lhs_dtype, rhs_dtype, obj_class):
     nelem = 10
-    lhs = (np.random.random(nelem) * nelem).astype(lhs_dtype)
-    rhs = (np.random.random(nelem) * nelem).astype(rhs_dtype)
+    rng = np.random.default_rng(seed=0)
+    lhs = (rng.random(nelem) * nelem).astype(lhs_dtype)
+    rhs = (rng.random(nelem) * nelem).astype(rhs_dtype)

     sr1 = Series(lhs)
     sr2 = Series(rhs)
@@ -550,8 +563,9 @@ def test_series_binop_mixed_dtype(binop, lhs_dtype, rhs_dtype, obj_class):
 )
 def test_series_cmpop_mixed_dtype(cmpop, lhs_dtype, rhs_dtype, obj_class):
     nelem = 5
-    lhs = (np.random.random(nelem) * nelem).astype(lhs_dtype)
-    rhs = (np.random.random(nelem) * nelem).astype(rhs_dtype)
+    rng = np.random.default_rng(seed=0)
+    lhs = (rng.random(nelem) * nelem).astype(lhs_dtype)
+    rhs = (rng.random(nelem) * nelem).astype(rhs_dtype)

     sr1 = Series(lhs)
     sr2 = Series(rhs)
@@ -574,8 +588,7 @@ def test_series_cmpop_mixed_dtype(cmpop, lhs_dtype, rhs_dtype, obj_class):
 )
 def test_series_reflected_ops_scalar(func, dtype, obj_class):
     # create random series
-    np.random.seed(12)
-    random_series = utils.gen_rand(dtype, 100, low=10)
+    random_series = utils.gen_rand(dtype, 100, low=10, seed=12)

     # gpu series
     gs = Series(random_series)
@@ -631,8 +644,7 @@ def test_series_reflected_ops_cudf_scalar(funcs, dtype, obj_class):
     cpu_func, gpu_func = funcs

     # create random series
-    np.random.seed(12)
-    random_series = utils.gen_rand(dtype, 100, low=10)
+    random_series = utils.gen_rand(dtype, 100, low=10, seed=12)

     # gpu series
     gs = Series(random_series)
@@ -774,7 +786,8 @@ def test_df_different_index_shape(df2, binop):
 @pytest.mark.parametrize("op", [operator.eq, operator.ne])
 def test_boolean_scalar_binop(op):
-    psr = pd.Series(np.random.choice([True, False], 10))
+    rng = np.random.default_rng(seed=0)
+    psr = pd.Series(rng.choice([True, False], 10))
     gsr = cudf.from_pandas(psr)
     assert_eq(op(psr, True), op(gsr, True))
     assert_eq(op(psr, False), op(gsr, False))
@@ -923,16 +936,17 @@ def test_operator_func_dataframe(func, nulls, fill_value, other):
     num_cols = 3

     def gen_df():
+        rng = np.random.default_rng(seed=0)
         pdf = pd.DataFrame()
         from string import ascii_lowercase

-        cols = np.random.choice(num_cols + 5, num_cols, replace=False)
+        cols = rng.choice(num_cols + 5, num_cols, replace=False)
         for i in range(num_cols):
             colname = ascii_lowercase[cols[i]]
             data = utils.gen_rand("float64", num_rows) * 10000
             if nulls == "some":
-                idx = np.random.choice(
+                idx = rng.choice(
                     num_rows, size=int(num_rows / 2), replace=False
                 )
                 data[idx] = np.nan
@@ -954,21 +968,21 @@ def gen_df():
 @pytest.mark.parametrize("nulls", _nulls)
 @pytest.mark.parametrize("other", ["df", "scalar"])
 def test_logical_operator_func_dataframe(func, nulls, other):
-    np.random.seed(0)
     num_rows = 100
     num_cols = 3

     def gen_df():
+        rng = np.random.default_rng(seed=0)
         pdf = pd.DataFrame()
         from string import ascii_lowercase

-        cols = np.random.choice(num_cols + 5, num_cols, replace=False)
+        cols = rng.choice(num_cols + 5, num_cols, replace=False)
         for i in range(num_cols):
             colname = ascii_lowercase[cols[i]]
             data = utils.gen_rand("float64", num_rows) * 10000
             if nulls == "some":
-                idx = np.random.choice(
+                idx = rng.choice(
                     num_rows, size=int(num_rows / 2), replace=False
                 )
                 data[idx] = np.nan
@@ -977,8 +991,12 @@ def gen_df():
     pdf1 = gen_df()
     pdf2 = gen_df() if other == "df" else 59.0

-    gdf1 = cudf.DataFrame.from_pandas(pdf1)
-    gdf2 = cudf.DataFrame.from_pandas(pdf2) if other == "df" else 59.0
+    gdf1 = cudf.DataFrame.from_pandas(pdf1, nan_as_null=False)
+    gdf2 = (
+        cudf.DataFrame.from_pandas(pdf2, nan_as_null=False)
+        if other == "df"
+        else 59.0
+    )

     got = getattr(gdf1, func)(gdf2)
     expect = getattr(pdf1, func)(pdf2)[list(got._data)]
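The last `test_binops.py` hunk switches `from_pandas` to `nan_as_null=False` so that the cudf frames keep floating-point NaN instead of converting it to a null, which keeps the logical-operator comparison against pandas apples-to-apples. A minimal sketch of the difference (my own example, assuming default, non-pandas-compatibility mode):

```python
import numpy as np
import pandas as pd

import cudf

pdf = pd.DataFrame({"a": [1.0, np.nan]})

# Default behavior: NaN becomes a cudf null.
assert cudf.DataFrame.from_pandas(pdf)["a"].null_count == 1

# nan_as_null=False: NaN stays a float NaN, matching pandas semantics.
assert cudf.DataFrame.from_pandas(pdf, nan_as_null=False)["a"].null_count == 0
```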
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index cd1ad21ae59..db41f689255 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -252,10 +252,10 @@ def test_cat_series_binop_error():
 @pytest.mark.parametrize("num_elements", [10, 100, 1000])
 def test_categorical_unique(num_elements):
     # create categorical series
-    np.random.seed(12)
+    rng = np.random.default_rng(seed=12)
     pd_cat = pd.Categorical(
         pd.Series(
-            np.random.choice(
+            rng.choice(
                 list(string.ascii_letters + string.digits), num_elements
             ),
             dtype="category",
@@ -279,12 +279,10 @@ def test_categorical_unique(num_elements):
 @pytest.mark.parametrize("nelem", [20, 50, 100])
 def test_categorical_unique_count(nelem):
     # create categorical series
-    np.random.seed(12)
+    rng = np.random.default_rng(seed=0)
     pd_cat = pd.Categorical(
         pd.Series(
-            np.random.choice(
-                list(string.ascii_letters + string.digits), nelem
-            ),
+            rng.choice(list(string.ascii_letters + string.digits), nelem),
             dtype="category",
         )
     )
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index 4aa7fb27c9b..65947efc2df 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -31,12 +31,13 @@
 @pytest.fixture(params=dtypes, ids=dtypes)
 def pandas_input(request):
     dtype = request.param
-    rng = np.random.default_rng()
+    rng = np.random.default_rng(seed=0)
     size = 100

     def random_ints(dtype, size):
         dtype_min = np.iinfo(dtype).min
         dtype_max = np.iinfo(dtype).max
+        rng = np.random.default_rng(seed=0)
         return rng.integers(dtype_min, dtype_max, size=size, dtype=dtype)

     try:
@@ -154,7 +155,9 @@ def test_column_slicing(pandas_input, offset, size):
     [cudf.Decimal128Dtype, cudf.Decimal64Dtype, cudf.Decimal32Dtype],
 )
 def test_decimal_column_slicing(offset, size, precision, scale, decimal_type):
-    col = cudf.core.column.as_column(pd.Series(np.random.rand(1000)))
+    col = cudf.core.column.as_column(
+        pd.Series(np.random.default_rng(seed=0).random(1000))
+    )
     col = col.astype(decimal_type(precision, scale))
     column_slicing_test(col, offset, size, True)
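Worth noting in the `test_column.py` fixture above: `random_ints` now constructs its own seeded Generator on every call (shadowing the enclosing fixture's `rng`), so repeated calls return identical data. A small sketch of that behavior, under the same assumptions as the patch:

```python
import numpy as np


def random_ints(dtype, size):
    # Re-seeding on each call makes every call deterministic -- and identical.
    rng = np.random.default_rng(seed=0)
    dtype_min = np.iinfo(dtype).min
    dtype_max = np.iinfo(dtype).max
    return rng.integers(dtype_min, dtype_max, size=size, dtype=dtype)


assert (random_ints(np.int32, 5) == random_ints(np.int32, 5)).all()
```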
diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py
index 8da589ba45b..ab0f1767cd6 100644
--- a/python/cudf/cudf/tests/test_concat.py
+++ b/python/cudf/cudf/tests/test_concat.py
@@ -30,6 +30,7 @@ def _hide_concat_empty_dtype_warning():

 def make_frames(index=None, nulls="none"):
+    rng = np.random.default_rng(seed=0)
     df = pd.DataFrame(
         {
             "x": range(10),
@@ -51,7 +52,7 @@ def make_frames(index=None, nulls="none"):
         df2.y = np.full_like(df2.y, np.nan)
     if nulls == "some":
         mask = np.arange(10)
-        np.random.shuffle(mask)
+        rng.shuffle(mask)
         mask = mask[:5]
         df.loc[mask, "y"] = np.nan
         df2.loc[mask, "y"] = np.nan
@@ -203,10 +204,9 @@ def test_concat_misordered_columns():
 @pytest.mark.parametrize("axis", [1, "columns"])
 def test_concat_columns(axis):
-    pdf1 = pd.DataFrame(np.random.randint(10, size=(5, 3)), columns=[1, 2, 3])
-    pdf2 = pd.DataFrame(
-        np.random.randint(10, size=(5, 4)), columns=[4, 5, 6, 7]
-    )
+    rng = np.random.default_rng(seed=0)
+    pdf1 = pd.DataFrame(rng.integers(10, size=(5, 3)), columns=[1, 2, 3])
+    pdf2 = pd.DataFrame(rng.integers(10, size=(5, 4)), columns=[4, 5, 6, 7])
     gdf1 = cudf.from_pandas(pdf1)
     gdf2 = cudf.from_pandas(pdf2)
@@ -1398,11 +1398,12 @@ def test_concat_single_object(ignore_index, typ):
     ],
 )
 def test_concat_decimal_dataframe(ltype, rtype):
+    rng = np.random.default_rng(seed=0)
     gdf1 = cudf.DataFrame(
-        {"id": np.random.randint(0, 10, 3), "val": ["22.3", "59.5", "81.1"]}
+        {"id": rng.integers(0, 10, 3), "val": ["22.3", "59.5", "81.1"]}
     )
     gdf2 = cudf.DataFrame(
-        {"id": np.random.randint(0, 10, 3), "val": ["2.35", "5.59", "8.14"]}
+        {"id": rng.integers(0, 10, 3), "val": ["2.35", "5.59", "8.14"]}
     )

     gdf1["val"] = gdf1["val"].astype(ltype)
diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py
index 9b6f82ec705..f33cfe268a3 100644
--- a/python/cudf/cudf/tests/test_copying.py
+++ b/python/cudf/cudf/tests/test_copying.py
@@ -16,8 +16,9 @@
 @pytest.mark.parametrize("dtype", NUMERIC_TYPES + OTHER_TYPES)
 def test_repeat(dtype):
-    arr = np.random.rand(10) * 10
-    repeats = np.random.randint(10, size=10)
+    rng = np.random.default_rng(seed=0)
+    arr = rng.random(10) * 10
+    repeats = rng.integers(10, size=10)
     psr = pd.Series(arr).astype(dtype)
     gsr = cudf.from_pandas(psr)

@@ -25,18 +26,20 @@ def test_repeat(dtype):

 def test_repeat_index():
+    rng = np.random.default_rng(seed=0)
     arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
     psr = pd.MultiIndex.from_arrays(arrays, names=("number", "color"))
     gsr = cudf.from_pandas(psr)
-    repeats = np.random.randint(10, size=4)
+    repeats = rng.integers(10, size=4)

     assert_eq(psr.repeat(repeats), gsr.repeat(repeats))


 def test_repeat_dataframe():
+    rng = np.random.default_rng(seed=0)
     psr = pd.DataFrame({"a": [1, 1, 2, 2]})
     gsr = cudf.from_pandas(psr)
-    repeats = np.random.randint(10, size=4)
+    repeats = rng.integers(10, size=4)

     # pd.DataFrame doesn't have repeat() so as a workaround, we are
     # comparing pd.Series.repeat() with cudf.DataFrame.repeat()['a']
@@ -45,7 +48,8 @@ def test_repeat_dataframe():

 @pytest.mark.parametrize("dtype", NUMERIC_TYPES)
 def test_repeat_scalar(dtype):
-    arr = np.random.rand(10) * 10
+    rng = np.random.default_rng(seed=0)
+    arr = rng.random(10) * 10
     repeats = 10
     psr = pd.Series(arr).astype(dtype)
     gsr = cudf.from_pandas(psr)
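Two small semantic points the `test_concat.py` and `test_copying.py` hunks above rely on (sketch for illustration only, not part of the patch): with a single bound, `Generator.integers` behaves like `np.random.randint`, treating the argument as an exclusive upper limit with an implicit low of 0, and `Generator.shuffle`, like its legacy counterpart, permutes in place and returns `None`:

```python
import numpy as np

rng = np.random.default_rng(seed=0)

x = rng.integers(10, size=4)          # values drawn from [0, 10)
assert ((0 <= x) & (x < 10)).all()

mask = np.arange(10)
rng.shuffle(mask)                     # in-place permutation, returns None
assert sorted(mask) == list(range(10))
```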
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index b6efc8ebd88..8800275bf67 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -1764,13 +1764,13 @@ def test_csv_writer_multiindex(tmpdir):
     pdf_df_fname = tmpdir.join("pdf_df_3.csv")
     gdf_df_fname = tmpdir.join("gdf_df_3.csv")

-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)
     gdf = cudf.DataFrame(
         {
-            "a": np.random.randint(0, 5, 20),
-            "b": np.random.randint(0, 5, 20),
+            "a": rng.integers(0, 5, 20),
+            "b": rng.integers(0, 5, 20),
             "c": range(20),
-            "d": np.random.random(20),
+            "d": rng.random(20),
         }
     )
     gdg = gdf.groupby(["a", "b"]).mean()
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 6f88d942746..0f2b41888fa 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -428,7 +428,7 @@ def test_series_init_none():

 def test_dataframe_basic():
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)
     df = cudf.DataFrame()

     # Populate with cuda memory
@@ -437,7 +437,7 @@ def test_dataframe_basic():
     assert len(df) == 10

     # Populate with numpy array
-    rnd_vals = np.random.random(10)
+    rnd_vals = rng.random(10)
     df["vals"] = rnd_vals
     np.testing.assert_equal(df["vals"].to_numpy(), rnd_vals)
     assert len(df) == 10
@@ -1238,8 +1238,9 @@ def test_empty_dataframe_to_cupy():
     df = cudf.DataFrame()

     nelem = 123
+    rng = np.random.default_rng(seed=0)
     for k in "abc":
-        df[k] = np.random.random(nelem)
+        df[k] = rng.random(nelem)

     # Check all columns in empty dataframe.
     mat = df.head(0).to_cupy()
@@ -1250,8 +1251,9 @@ def test_dataframe_to_cupy():
     df = cudf.DataFrame()

     nelem = 123
+    rng = np.random.default_rng(seed=0)
     for k in "abcd":
-        df[k] = np.random.random(nelem)
+        df[k] = rng.random(nelem)

     # Check all columns
     mat = df.to_cupy()
@@ -1279,8 +1281,9 @@ def test_dataframe_to_cupy_null_values():
     na = -10000
     refvalues = {}
+    rng = np.random.default_rng(seed=0)
     for k in "abcd":
-        df[k] = data = np.random.random(nelem)
+        df[k] = data = rng.random(nelem)
         bitmask = utils.random_bitmask(nelem)
         df[k] = df[k]._column.set_mask(bitmask)
         boolmask = np.asarray(
@@ -1321,10 +1324,11 @@ def test_dataframe_append_empty():

 def test_dataframe_setitem_from_masked_object():
-    ary = np.random.randn(100)
+    rng = np.random.default_rng(seed=0)
+    ary = rng.standard_normal(100)
     mask = np.zeros(100, dtype=bool)
     mask[:20] = True
-    np.random.shuffle(mask)
+    rng.shuffle(mask)
     ary[mask] = np.nan

     test1_null = cudf.Series(ary, nan_as_null=True)
@@ -1534,14 +1538,12 @@ def test_dataframe_hash_values_xxhash64():
 @pytest.mark.parametrize("nparts", [1, 2, 8, 13])
 @pytest.mark.parametrize("nkeys", [1, 2])
 def test_dataframe_hash_partition(nrows, nparts, nkeys):
-    np.random.seed(123)
-    gdf = cudf.DataFrame()
-    keycols = []
-    for i in range(nkeys):
-        keyname = f"key{i}"
-        gdf[keyname] = np.random.randint(0, 7 - i, nrows)
-        keycols.append(keyname)
-    gdf["val1"] = np.random.randint(0, nrows * 2, nrows)
+    rng = np.random.default_rng(seed=0)
+    gdf = cudf.DataFrame(
+        {f"key{i}": rng.integers(0, 7 - i, nrows) for i in range(nkeys)}
+    )
+    keycols = gdf.columns.to_list()
+    gdf["val1"] = rng.integers(0, nrows * 2, nrows)

     got = gdf.partition_by_hash(keycols, nparts=nparts)

     # Must return a list
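The rewritten `test_dataframe_hash_partition` above builds its key columns with a dict comprehension and exercises `DataFrame.partition_by_hash`, which splits a frame into `nparts` pieces by hashing the key columns, so equal keys always land in the same piece. A small usage sketch with made-up values:

```python
import numpy as np

import cudf

rng = np.random.default_rng(seed=0)
gdf = cudf.DataFrame(
    {"key0": rng.integers(0, 7, 20), "val1": rng.integers(0, 40, 20)}
)

parts = gdf.partition_by_hash(["key0"], nparts=4)
assert len(parts) == 4                           # one DataFrame per partition
assert sum(len(p) for p in parts) == len(gdf)    # rows are partitioned, not dropped
```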
@@ -1751,8 +1753,9 @@ def test_concat_with_axis():
     assert_eq(concat_cdf_s, concat_s, check_index_type=True)

+    rng = np.random.default_rng(seed=0)
     # concat series and dataframes
-    s3 = pd.Series(np.random.random(5))
+    s3 = pd.Series(rng.random(5))
     cs3 = cudf.Series.from_pandas(s3)

     concat_cdf_all = cudf.concat([cdf1, cs3, cdf2], axis=1)
@@ -1787,13 +1790,14 @@ def test_concat_with_axis():
         check_index_type=True,
     )

+    rng = np.random.default_rng(seed=0)
     # concat groupby multi index
     gdf1 = cudf.DataFrame(
         {
-            "x": np.random.randint(0, 10, 10),
-            "y": np.random.randint(0, 10, 10),
-            "z": np.random.randint(0, 10, 10),
-            "v": np.random.randint(0, 10, 10),
+            "x": rng.integers(0, 10, 10),
+            "y": rng.integers(0, 10, 10),
+            "z": rng.integers(0, 10, 10),
+            "v": rng.integers(0, 10, 10),
         }
     )
     gdf2 = gdf1[5:]
@@ -1833,14 +1837,14 @@ def test_concat_with_axis():
 @pytest.mark.parametrize("nrows", [0, 3, 10, 100, 1000])
 def test_nonmatching_index_setitem(nrows):
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)

     gdf = cudf.DataFrame()
-    gdf["a"] = np.random.randint(2147483647, size=nrows)
-    gdf["b"] = np.random.randint(2147483647, size=nrows)
+    gdf["a"] = rng.integers(2147483647, size=nrows)
+    gdf["b"] = rng.integers(2147483647, size=nrows)
     gdf = gdf.set_index("b")

-    test_values = np.random.randint(2147483647, size=nrows)
+    test_values = rng.integers(2147483647, size=nrows)
     gdf["c"] = test_values
     assert len(test_values) == len(gdf["c"])
     gdf_series = cudf.Series(test_values, index=gdf.index, name="c")
@@ -1974,10 +1978,11 @@ def test_index_in_dataframe_constructor():
 @pytest.mark.parametrize("nelem", [0, 2, 3, 100, 1000])
 @pytest.mark.parametrize("data_type", dtypes)
 def test_from_arrow(nelem, data_type):
+    rng = np.random.default_rng(seed=0)
     df = pd.DataFrame(
         {
-            "a": np.random.randint(0, 1000, nelem).astype(data_type),
-            "b": np.random.randint(0, 1000, nelem).astype(data_type),
+            "a": rng.integers(0, 1000, nelem).astype(data_type),
+            "b": rng.integers(0, 1000, nelem).astype(data_type),
         }
     )
     padf = pa.Table.from_pandas(
"b": np.random.randint(0, 1000, nelem).astype(data_type), + "a": rng.integers(0, 1000, nelem).astype(data_type), + "b": rng.integers(0, 1000, nelem).astype(data_type), } ) padf = pa.Table.from_pandas( @@ -2012,10 +2017,11 @@ def test_from_arrow_chunked_categories(): @pytest.mark.parametrize("nelem", [0, 2, 3, 100, 1000]) @pytest.mark.parametrize("data_type", dtypes) def test_to_arrow(nelem, data_type): + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "a": np.random.randint(0, 1000, nelem).astype(data_type), - "b": np.random.randint(0, 1000, nelem).astype(data_type), + "a": rng.integers(0, 1000, nelem).astype(data_type), + "b": rng.integers(0, 1000, nelem).astype(data_type), } ) gdf = cudf.DataFrame.from_pandas(df) @@ -2119,17 +2125,16 @@ def test_to_arrow_missing_categorical(): @pytest.mark.parametrize("data_type", dtypes) def test_from_scalar_typing(data_type): + rng = np.random.default_rng(seed=0) if data_type == "datetime64[ms]": scalar = ( - np.dtype("int64") - .type(np.random.randint(0, 5)) - .astype("datetime64[ms]") + np.dtype("int64").type(rng.integers(0, 5)).astype("datetime64[ms]") ) elif data_type.startswith("datetime64"): scalar = np.datetime64(datetime.date.today()).astype("datetime64[ms]") data_type = "datetime64[ms]" else: - scalar = np.dtype(data_type).type(np.random.randint(0, 5)) + scalar = np.dtype(data_type).type(rng.integers(0, 5)) gdf = cudf.DataFrame() gdf["a"] = [1, 2, 3, 4, 5] @@ -2140,7 +2145,8 @@ def test_from_scalar_typing(data_type): @pytest.mark.parametrize("data_type", NUMERIC_TYPES) def test_from_python_array(data_type): - np_arr = np.random.randint(0, 100, 10).astype(data_type) + rng = np.random.default_rng(seed=0) + np_arr = rng.integers(0, 100, 10).astype(data_type) data = memoryview(np_arr) data = arr.array(data.format, data) @@ -2220,7 +2226,7 @@ def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): # against pandas nullable types as they are the ones that closely # resemble `cudf` dtypes behavior. 
pdf = pd.DataFrame() - + rng = np.random.default_rng(seed=0) null_rep = np.nan if dtype in ["float32", "float64"] else None np_dtype = dtype dtype = np.dtype(dtype) @@ -2228,13 +2234,11 @@ def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): for i in range(num_cols): colname = string.ascii_lowercase[i] data = pd.Series( - np.random.randint(0, 26, num_rows).astype(np_dtype), + rng.integers(0, 26, num_rows).astype(np_dtype), dtype=dtype, ) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) if len(idx): data[idx] = null_rep elif nulls == "all": @@ -2652,8 +2656,8 @@ def test_unaryops_df(pdf, unaryop, col_name, assign_col_name): def test_df_abs(pdf): - np.random.seed(0) - disturbance = pd.Series(np.random.rand(10)) + rng = np.random.default_rng(seed=0) + disturbance = pd.Series(rng.random(10)) pdf = pdf - 5 + disturbance d = pdf.apply(np.abs) g = cudf.from_pandas(pdf).abs() @@ -2706,8 +2710,9 @@ def test_iteritems(gdf): def test_quantile(q, numeric_only): ts = pd.date_range("2018-08-24", periods=5, freq="D") td = pd.to_timedelta(np.arange(5), unit="h") + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( - {"date": ts, "delta": td, "val": np.random.randn(len(ts))} + {"date": ts, "delta": td, "val": rng.standard_normal(len(ts))} ) gdf = cudf.DataFrame.from_pandas(pdf) @@ -2729,9 +2734,10 @@ def test_quantile(q, numeric_only): [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype], ) def test_decimal_quantile(q, interpolation, decimal_type): + rng = np.random.default_rng(seed=0) data = ["244.8", "32.24", "2.22", "98.14", "453.23", "5.45"] gdf = cudf.DataFrame( - {"id": np.random.randint(0, 10, size=len(data)), "val": data} + {"id": rng.integers(0, 10, size=len(data)), "val": data} ) gdf["id"] = gdf["id"].astype("float64") gdf["val"] = gdf["val"].astype(decimal_type(7, 2)) @@ -2843,9 +2849,9 @@ def test_cuda_array_interface(dtype): @pytest.mark.parametrize("nchunks", [1, 2, 5, 10]) @pytest.mark.parametrize("data_type", dtypes) def test_from_arrow_chunked_arrays(nelem, nchunks, data_type): + rng = np.random.default_rng(seed=0) np_list_data = [ - np.random.randint(0, 100, nelem).astype(data_type) - for i in range(nchunks) + rng.integers(0, 100, nelem).astype(data_type) for i in range(nchunks) ] pa_chunk_array = pa.chunked_array(np_list_data) @@ -2855,8 +2861,7 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type): assert_eq(expect, got) np_list_data2 = [ - np.random.randint(0, 100, nelem).astype(data_type) - for i in range(nchunks) + rng.integers(0, 100, nelem).astype(data_type) for i in range(nchunks) ] pa_chunk_array2 = pa.chunked_array(np_list_data2) pa_table = pa.Table.from_arrays( @@ -2881,11 +2886,13 @@ def query_GPU_memory(note=""): cuda.current_context().deallocations.clear() nRows = int(1e8) nCols = 2 - dataNumpy = np.asfortranarray(np.random.rand(nRows, nCols)) + rng = np.random.default_rng(seed=0) + dataNumpy = np.asfortranarray(rng.random(size=(nRows, nCols))) colNames = ["col" + str(iCol) for iCol in range(nCols)] pandasDF = pd.DataFrame(data=dataNumpy, columns=colNames, dtype=np.float32) cudaDF = cudf.core.DataFrame.from_pandas(pandasDF) - boolmask = cudf.Series(np.random.randint(1, 2, len(cudaDF)).astype("bool")) + rng = np.random.default_rng(seed=0) + boolmask = cudf.Series(rng.integers(1, 2, len(cudaDF)).astype("bool")) memory_used = query_GPU_memory() cudaDF = cudaDF[boolmask] @@ -2903,7 +2910,8 @@ def
test_boolmask(pdf, gdf): - boolmask = np.random.randint(0, 2, len(pdf)) > 0 + rng = np.random.default_rng(seed=0) + boolmask = rng.integers(0, 2, len(pdf)) > 0 gdf = gdf[boolmask] pdf = pdf[boolmask] assert_eq(pdf, gdf) @@ -2922,12 +2930,11 @@ def test_boolmask(pdf, gdf): ], ) def test_dataframe_boolmask(mask_shape): - pdf = pd.DataFrame() - for col in "abc": - pdf[col] = np.random.randint(0, 10, 3) - pdf_mask = pd.DataFrame() - for col in mask_shape[1]: - pdf_mask[col] = np.random.randint(0, 2, mask_shape[0]) > 0 + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame({col: rng.integers(0, 10, 3) for col in "abc"}) + pdf_mask = pd.DataFrame( + {col: rng.integers(0, 2, mask_shape[0]) > 0 for col in mask_shape[1]} + ) gdf = cudf.DataFrame.from_pandas(pdf) gdf_mask = cudf.DataFrame.from_pandas(pdf_mask) gdf = gdf[gdf_mask] @@ -2992,7 +2999,8 @@ def test_arrow_handle_no_index_name(pdf, gdf): def test_pandas_non_contiguious(): - arr1 = np.random.sample([5000, 10]) + rng = np.random.default_rng(seed=0) + arr1 = rng.random(size=(5000, 10)) assert arr1.flags["C_CONTIGUOUS"] is True df = pd.DataFrame(arr1) for col in df.columns: @@ -3052,10 +3060,11 @@ def test_series_rename(): @pytest.mark.parametrize("data_type", dtypes) @pytest.mark.parametrize("nelem", [0, 100]) def test_head_tail(nelem, data_type): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "a": np.random.randint(0, 1000, nelem).astype(data_type), - "b": np.random.randint(0, 1000, nelem).astype(data_type), + "a": rng.integers(0, 1000, nelem).astype(data_type), + "b": rng.integers(0, 1000, nelem).astype(data_type), } ) gdf = cudf.from_pandas(pdf) @@ -3308,15 +3317,15 @@ def test_set_index_verify_integrity(data, index, verify_integrity): @pytest.mark.parametrize("drop", [True, False]) @pytest.mark.parametrize("nelem", [10, 200, 1333]) def test_set_index_multi(drop, nelem): - np.random.seed(0) + rng = np.random.default_rng(seed=0) a = np.arange(nelem) - np.random.shuffle(a) + rng.shuffle(a) df = pd.DataFrame( { "a": a, - "b": np.random.randint(0, 4, size=nelem), - "c": np.random.uniform(low=0, high=4, size=nelem), - "d": np.random.choice(["green", "black", "white"], nelem), + "b": rng.integers(0, 4, size=nelem), + "c": rng.uniform(low=0, high=4, size=nelem), + "d": rng.choice(["green", "black", "white"], nelem), } ) df["e"] = df["d"].astype("category") @@ -3894,13 +3903,13 @@ def test_select_dtype_datetime_with_frequency(): def test_dataframe_describe_exclude(): - np.random.seed(12) + rng = np.random.default_rng(seed=12) data_length = 10000 df = cudf.DataFrame() - df["x"] = np.random.normal(10, 1, data_length) + df["x"] = rng.normal(10, 1, data_length) df["x"] = df.x.astype("int64") - df["y"] = np.random.normal(10, 1, data_length) + df["y"] = rng.normal(10, 1, data_length) pdf = df.to_pandas() gdf_results = df.describe(exclude=["float"]) @@ -3910,13 +3919,13 @@ def test_dataframe_describe_exclude(): def test_dataframe_describe_include(): - np.random.seed(12) + rng = np.random.default_rng(seed=12) data_length = 10000 df = cudf.DataFrame() - df["x"] = np.random.normal(10, 1, data_length) + df["x"] = rng.normal(10, 1, data_length) df["x"] = df.x.astype("int64") - df["y"] = np.random.normal(10, 1, data_length) + df["y"] = rng.normal(10, 1, data_length) pdf = df.to_pandas() gdf_results = df.describe(include=["int"]) pdf_results = pdf.describe(include=["int"]) @@ -3925,12 +3934,12 @@ def test_dataframe_describe_include(): def test_dataframe_describe_default(): - np.random.seed(12) + rng = np.random.default_rng(seed=12) 
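+    # NOTE: unlike np.random.seed, default_rng returns a local Generator and leaves NumPy's global RNG state untouched.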
data_length = 10000 df = cudf.DataFrame() - df["x"] = np.random.normal(10, 1, data_length) - df["y"] = np.random.normal(10, 1, data_length) + df["x"] = rng.normal(10, 1, data_length) + df["y"] = rng.normal(10, 1, data_length) pdf = df.to_pandas() gdf_results = df.describe() pdf_results = pdf.describe() @@ -3939,14 +3948,14 @@ def test_dataframe_describe_default(): def test_series_describe_include_all(): - np.random.seed(12) + rng = np.random.default_rng(seed=12) data_length = 10000 df = cudf.DataFrame() - df["x"] = np.random.normal(10, 1, data_length) + df["x"] = rng.normal(10, 1, data_length) df["x"] = df.x.astype("int64") - df["y"] = np.random.normal(10, 1, data_length) - df["animal"] = np.random.choice(["dog", "cat", "bird"], data_length) + df["y"] = rng.normal(10, 1, data_length) + df["animal"] = rng.choice(["dog", "cat", "bird"], data_length) pdf = df.to_pandas() gdf_results = df.describe(include="all") @@ -3962,13 +3971,13 @@ def test_series_describe_include_all(): def test_dataframe_describe_percentiles(): - np.random.seed(12) + rng = np.random.default_rng(seed=12) data_length = 10000 sample_percentiles = [0.0, 0.1, 0.33, 0.84, 0.4, 0.99] df = cudf.DataFrame() - df["x"] = np.random.normal(10, 1, data_length) - df["y"] = np.random.normal(10, 1, data_length) + df["x"] = rng.normal(10, 1, data_length) + df["y"] = rng.normal(10, 1, data_length) pdf = df.to_pandas() gdf_results = df.describe(percentiles=sample_percentiles) pdf_results = pdf.describe(percentiles=sample_percentiles) @@ -4098,10 +4107,11 @@ def test_ndim(): ], ) def test_dataframe_round(decimals): + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame( { "floats": np.arange(0.5, 10.5, 1), - "ints": np.random.normal(-100, 100, 10), + "ints": rng.normal(-100, 100, 10), "floats_with_na": np.array( [ 14.123, @@ -4117,9 +4127,9 @@ def test_dataframe_round(decimals): ] ), "floats_same": np.repeat([-0.6459412758761901], 10), - "bools": np.random.choice([True, None, False], 10), - "strings": np.random.choice(["abc", "xyz", None], 10), - "struct": np.random.choice([{"abc": 1}, {"xyz": 2}, None], 10), + "bools": rng.choice([True, None, False], 10), + "strings": rng.choice(["abc", "xyz", None], 10), + "struct": rng.choice([{"abc": 1}, {"xyz": 2}, None], 10), "list": [[1], [2], None, [4], [3]] * 2, } ) @@ -5811,10 +5821,11 @@ def test_memory_usage(deep, index, set_index): @pytest_xfail def test_memory_usage_string(): rows = int(100) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { "A": np.arange(rows, dtype="int32"), - "B": np.random.choice(["apple", "banana", "orange"], rows), + "B": rng.choice(["apple", "banana", "orange"], rows), } ) gdf = cudf.from_pandas(df) @@ -5837,10 +5848,11 @@ def test_memory_usage_string(): def test_memory_usage_cat(): rows = int(100) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { "A": np.arange(rows, dtype="int32"), - "B": np.random.choice(["apple", "banana", "orange"], rows), + "B": rng.choice(["apple", "banana", "orange"], rows), } ) df["B"] = df.B.astype("category") @@ -5870,13 +5882,14 @@ def test_memory_usage_list(): def test_memory_usage_multi(rows): # We need to sample without replacement to guarantee that the size of the # levels are always the same. 
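# (rng.choice with replace=False draws each candidate at most once, so every level stays fully distinct.)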
+ rng = np.random.default_rng(seed=0) df = pd.DataFrame( { "A": np.arange(rows, dtype="int32"), - "B": np.random.choice( + "B": rng.choice( np.arange(rows, dtype="int64"), rows, replace=False ), - "C": np.random.choice( + "C": rng.choice( np.arange(rows, dtype="float64"), rows, replace=False ), } @@ -6698,8 +6711,16 @@ def test_dataframe_init_1d_list(data, columns): (cupy.array([11, 123, -2342, 232]), ["z"], [0, 1, 1, 0]), (cupy.array([11, 123, -2342, 232]), ["z"], [1, 2, 3, 4]), (cupy.array([11, 123, -2342, 232]), ["z"], ["a", "z", "d", "e"]), - (np.random.randn(2, 4), ["a", "b", "c", "d"], ["a", "b"]), - (np.random.randn(2, 4), ["a", "b", "c", "d"], [1, 0]), + ( + np.random.default_rng(seed=0).standard_normal(size=(2, 4)), + ["a", "b", "c", "d"], + ["a", "b"], + ), + ( + np.random.default_rng(seed=0).standard_normal(size=(2, 4)), + ["a", "b", "c", "d"], + [1, 0], + ), (cupy.random.randn(2, 4), ["a", "b", "c", "d"], ["a", "b"]), (cupy.random.randn(2, 4), ["a", "b", "c", "d"], [1, 0]), ], @@ -6873,8 +6894,9 @@ def test_dataframe_info_basic(): memory usage: 859.0+ bytes """ ) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - np.random.randn(10, 10), + rng.standard_normal(size=(10, 10)), index=["a", "2", "3", "4", "5", "6", "7", "8", "100", "1111"], ) cudf.from_pandas(df).info(buf=buffer, verbose=True) @@ -9374,8 +9396,8 @@ def test_dataframe_roundtrip_arrow_struct_dtype(gdf): def test_dataframe_setitem_cupy_array(): - np.random.seed(0) - pdf = pd.DataFrame(np.random.randn(10, 2)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.standard_normal(size=(10, 2))) gdf = cudf.from_pandas(pdf) gpu_array = cupy.array([True, False] * 5) @@ -10161,7 +10183,7 @@ def df_eval(request): } ) int_max = 10 - rng = cupy.random.default_rng(0) + rng = cupy.random.default_rng(seed=0) return cudf.DataFrame( { "a": rng.integers(N, size=int_max), @@ -10529,11 +10551,12 @@ def test_dataframe_init_length_error(data, index): def test_dataframe_binop_with_mixed_date_types(): + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - np.random.rand(2, 2), + rng.random(size=(2, 2)), columns=pd.Index(["2000-01-03", "2000-01-04"], dtype="datetime64[ns]"), ) - ser = pd.Series(np.random.rand(3), index=[0, 1, 2]) + ser = pd.Series(rng.random(size=3), index=[0, 1, 2]) gdf = cudf.from_pandas(df) gser = cudf.from_pandas(ser) expected = df - ser @@ -10542,9 +10565,10 @@ def test_dataframe_binop_with_mixed_date_types(): def test_dataframe_binop_with_mixed_string_types(): - df1 = pd.DataFrame(np.random.rand(3, 3), columns=pd.Index([0, 1, 2])) + rng = np.random.default_rng(seed=0) + df1 = pd.DataFrame(rng.random(size=(3, 3)), columns=pd.Index([0, 1, 2])) df2 = pd.DataFrame( - np.random.rand(6, 6), + rng.random(size=(6, 6)), columns=pd.Index([0, 1, 2, "VhDoHxRaqt", "X0NNHBIPfA", "5FbhPtS0D1"]), ) gdf1 = cudf.from_pandas(df1) @@ -10557,7 +10581,8 @@ def test_dataframe_binop_with_mixed_string_types(): def test_dataframe_binop_and_where(): - df = pd.DataFrame(np.random.rand(2, 2), columns=pd.Index([True, False])) + rng = np.random.default_rng(seed=0) + df = pd.DataFrame(rng.random(size=(2, 2)), columns=pd.Index([True, False])) gdf = cudf.from_pandas(df) expected = df > 1 @@ -10572,12 +10597,13 @@ def test_dataframe_binop_and_where(): def test_dataframe_binop_with_datetime_index(): + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - np.random.rand(2, 2), + rng.random(size=(2, 2)), columns=pd.Index(["2000-01-03", "2000-01-04"], dtype="datetime64[ns]"), ) ser = pd.Series( - np.random.rand(2), + rng.random(2), 
index=pd.Index( [ "2000-01-04", @@ -10615,8 +10641,8 @@ def test_dataframe_dict_like_with_columns(columns, index): def test_dataframe_init_columns_named_multiindex(): - np.random.seed(0) - data = np.random.randn(2, 2) + rng = np.random.default_rng(seed=0) + data = rng.standard_normal(size=(2, 2)) columns = cudf.MultiIndex.from_tuples( [("A", "one"), ("A", "two")], names=["y", "z"] ) @@ -10627,8 +10653,8 @@ def test_dataframe_init_columns_named_multiindex(): def test_dataframe_init_columns_named_index(): - np.random.seed(0) - data = np.random.randn(2, 2) + rng = np.random.default_rng(seed=0) + data = rng.standard_normal(size=(2, 2)) columns = pd.Index(["a", "b"], name="custom_name") gdf = cudf.DataFrame(data, columns=columns) pdf = pd.DataFrame(data, columns=columns) @@ -11146,3 +11172,12 @@ def test_from_pandas_preserve_column_dtype(): df = pd.DataFrame([[1, 2]], columns=pd.Index([1, 2], dtype="int8")) result = cudf.DataFrame.from_pandas(df) pd.testing.assert_index_equal(result.columns, df.columns, exact=True) + + +def test_dataframe_init_column(): + s = cudf.Series([1, 2, 3]) + with pytest.raises(TypeError): + cudf.DataFrame(s._column) + expect = cudf.DataFrame({"a": s}) + actual = cudf.DataFrame._from_arrays(s._column, columns=["a"]) + assert_eq(expect, actual) diff --git a/python/cudf/cudf/tests/test_dataframe_copy.py b/python/cudf/cudf/tests/test_dataframe_copy.py index 45bd31ef58e..3aedbf8365b 100644 --- a/python/cudf/cudf/tests/test_dataframe_copy.py +++ b/python/cudf/cudf/tests/test_dataframe_copy.py @@ -93,11 +93,15 @@ def test_dataframe_deep_copy_and_insert(copy_parameters): @pytest.mark.parametrize("ncols", [0, 1, 10]) @pytest.mark.parametrize("data_type", ALL_TYPES) def test_cudf_dataframe_copy(copy_fn, ncols, data_type): - pdf = pd.DataFrame() - for i in range(ncols): - pdf[chr(i + ord("a"))] = pd.Series( - np.random.randint(0, 1000, 20) - ).astype(data_type) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame( + { + chr(i + ord("a")): pd.Series(rng.integers(0, 1000, 20)).astype( + data_type + ) + for i in range(ncols) + } + ) df = DataFrame.from_pandas(pdf) copy_df = copy_fn(df) assert_eq(df, copy_df) @@ -116,18 +120,20 @@ def test_cudf_dataframe_copy(copy_fn, ncols, data_type): @pytest.mark.parametrize("ncols", [0, 1, 10]) @pytest.mark.parametrize("data_type", ALL_TYPES) def test_cudf_dataframe_copy_then_insert(copy_fn, ncols, data_type): - pdf = pd.DataFrame() - for i in range(ncols): - pdf[chr(i + ord("a"))] = pd.Series( - np.random.randint(0, 1000, 20) - ).astype(data_type) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame( + { + chr(i + ord("a")): pd.Series(rng.integers(0, 1000, 20)).astype( + data_type + ) + for i in range(ncols) + } + ) df = DataFrame.from_pandas(pdf) copy_df = copy_fn(df) copy_pdf = copy_fn(pdf) - copy_df["aa"] = pd.Series(np.random.randint(0, 1000, 20)).astype(data_type) - copy_pdf["aa"] = pd.Series(np.random.randint(0, 1000, 20)).astype( - data_type - ) + copy_df["aa"] = pd.Series(rng.integers(0, 1000, 20)).astype(data_type) + copy_pdf["aa"] = pd.Series(rng.integers(0, 1000, 20)).astype(data_type) assert not copy_pdf.to_string().split() == pdf.to_string().split() assert not copy_df.to_string().split() == df.to_string().split() diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 4a2345fc009..b7403c12bcd 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -216,17 +216,21 @@ def test_setitem_datetime(): def test_sort_datetime(): - df = 
pd.DataFrame() - df["date"] = np.array( - [ - np.datetime64("2016-11-20"), - np.datetime64("2020-11-20"), - np.datetime64("2019-11-20"), - np.datetime64("1918-11-20"), - np.datetime64("2118-11-20"), - ] + rng = np.random.default_rng(seed=0) + df = pd.DataFrame( + { + "date": np.array( + [ + np.datetime64("2016-11-20"), + np.datetime64("2020-11-20"), + np.datetime64("2019-11-20"), + np.datetime64("1918-11-20"), + np.datetime64("2118-11-20"), + ] + ), + "vals": rng.random(5), + } ) - df["vals"] = np.random.sample(len(df["date"])) gdf = cudf.from_pandas(df) @@ -432,11 +436,12 @@ def test_datetime_to_arrow(dtype): ) @pytest.mark.parametrize("nulls", ["none", "some"]) def test_datetime_unique(data, nulls): + rng = np.random.default_rng(seed=0) psr = data.copy() if len(data) > 0: if nulls == "some": - p = np.random.randint(0, len(data), 2) + p = rng.integers(0, len(data), 2) psr[p] = None gsr = cudf.from_pandas(psr) @@ -461,10 +466,11 @@ def test_datetime_unique(data, nulls): @pytest.mark.parametrize("nulls", ["none", "some"]) def test_datetime_nunique(data, nulls): psr = data.copy() + rng = np.random.default_rng(seed=0) if len(data) > 0: if nulls == "some": - p = np.random.randint(0, len(data), 2) + p = rng.integers(0, len(data), 2) psr[p] = None gsr = cudf.from_pandas(psr) @@ -2525,23 +2531,7 @@ def test_dti_asi8(): @pytest.mark.parametrize( "method, kwargs", - [ - ["mean", {}], - pytest.param( - "std", - {}, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/16444" - ), - ), - pytest.param( - "std", - {"ddof": 0}, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/16444" - ), - ), - ], + [["mean", {}], ["std", {}], ["std", {"ddof": 0}]], ) def test_dti_reduction(method, kwargs): pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo") diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py index ebcc35784ee..20c24bd7564 100644 --- a/python/cudf/cudf/tests/test_dlpack.py +++ b/python/cudf/cudf/tests/test_dlpack.py @@ -42,9 +42,10 @@ def data_1d(request): nelems = request.param[0] dtype = request.param[1] nulls = request.param[2] - a = np.random.randint(10, size=nelems).astype(dtype) + rng = np.random.default_rng(seed=0) + a = rng.integers(10, size=nelems).astype(dtype) if nulls == "some" and a.size != 0 and np.issubdtype(dtype, np.floating): - idx = np.random.choice(a.size, size=int(a.size * 0.2), replace=False) + idx = rng.choice(a.size, size=int(a.size * 0.2), replace=False) a[idx] = np.nan return a @@ -55,9 +56,10 @@ def data_2d(request): nrows = request.param[1] dtype = request.param[2] nulls = request.param[3] - a = np.random.randint(10, size=(nrows, ncols)).astype(dtype) + rng = np.random.default_rng(seed=0) + a = rng.integers(10, size=(nrows, ncols)).astype(dtype) if nulls == "some" and a.size != 0 and np.issubdtype(dtype, np.floating): - idx = np.random.choice(a.size, size=int(a.size * 0.2), replace=False) + idx = rng.choice(a.size, size=int(a.size * 0.2), replace=False) a.ravel()[idx] = np.nan return np.ascontiguousarray(a) diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index 5b1ee0ffac6..eeac78dbebc 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -22,13 +22,13 @@ @pytest.mark.parametrize("inplace", [True, False]) def test_dropna_series(data, nulls, inplace): psr = pd.Series(data) - + rng = np.random.default_rng(seed=0) if len(data) > 0: if nulls == "one": - p = np.random.randint(0, 4) + p = 
rng.integers(0, 4) psr[p] = None elif nulls == "some": - p1, p2 = np.random.randint(0, 4, (2,)) + p1, p2 = rng.integers(0, 4, (2,)) psr[p1] = None psr[p2] = None elif nulls == "all": diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index 0b4ed52ba96..67dd7a8388b 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -368,9 +368,13 @@ def test_dataframe_drop_duplicates_method(): def test_datetime_drop_duplicates(): - date_df = cudf.DataFrame() - date_df["date"] = pd.date_range("11/20/2018", periods=6, freq="D") - date_df["value"] = np.random.sample(len(date_df)) + rng = np.random.default_rng(seed=0) + date_df = cudf.DataFrame( + { + "date": pd.date_range("11/20/2018", periods=6, freq="D"), + "value": rng.random(6), + } + ) df = concat([date_df, date_df[:4]]) assert_eq(df[:-4], df.drop_duplicates()) @@ -585,7 +589,8 @@ def test_drop_duplicates_multi_index(): ] idx = pd.MultiIndex.from_tuples(list(zip(*arrays)), names=["a", "b"]) - pdf = pd.DataFrame(np.random.randint(0, 2, (8, 4)), index=idx) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.integers(0, 2, (8, 4)), index=idx) gdf = cudf.DataFrame.from_pandas(pdf) expected = pdf.drop_duplicates() diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py index 47f9180dcb1..cfb4ae2c0f8 100644 --- a/python/cudf/cudf/tests/test_factorize.py +++ b/python/cudf/cudf/tests/test_factorize.py @@ -13,13 +13,16 @@ @pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)]) def test_factorize_series_obj(ncats, nelem): df = DataFrame() - np.random.seed(0) + rng = np.random.default_rng(seed=0) # initialize data frame - df["cats"] = arr = np.random.randint(2, size=10, dtype=np.int32) + df["cats"] = arr = rng.integers(2, size=10, dtype=np.int32) uvals, labels = df["cats"].factorize() - np.testing.assert_array_equal(labels.to_numpy(), sorted(set(arr))) + unique_values, indices = np.unique(arr, return_index=True) + expected_values = unique_values[np.argsort(indices)] + + np.testing.assert_array_equal(labels.to_numpy(), expected_values) assert isinstance(uvals, cp.ndarray) assert isinstance(labels, Index) @@ -31,14 +34,17 @@ def test_factorize_series_obj(ncats, nelem): @pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)]) def test_factorize_index_obj(ncats, nelem): df = DataFrame() - np.random.seed(0) + rng = np.random.default_rng(seed=0) # initialize data frame - df["cats"] = arr = np.random.randint(2, size=10, dtype=np.int32) + df["cats"] = arr = rng.integers(2, size=10, dtype=np.int32) df = df.set_index("cats") uvals, labels = df.index.factorize() - np.testing.assert_array_equal(labels.values.get(), sorted(set(arr))) + unique_values, indices = np.unique(arr, return_index=True) + expected_values = unique_values[np.argsort(indices)] + + np.testing.assert_array_equal(labels.values.get(), expected_values) assert isinstance(uvals, cp.ndarray) assert isinstance(labels, Index) diff --git a/python/cudf/cudf/tests/test_feather.py b/python/cudf/cudf/tests/test_feather.py index 7e5523bb8c7..f93bd2c5d32 100644 --- a/python/cudf/cudf/tests/test_feather.py +++ b/python/cudf/cudf/tests/test_feather.py @@ -15,13 +15,14 @@ @pytest.fixture(params=[0, 1, 10, 100]) def pdf(request): + rng = np.random.default_rng(seed=0) types = NUMERIC_TYPES + ["bool"] nrows = request.param # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( { - f"col_{typ}": np.random.randint(0, nrows, 
nrows).astype(typ) + f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ) for typ in types } ) @@ -30,7 +31,7 @@ def pdf(request): test_pdf.index.name = "index" # Create non-numeric categorical data otherwise may get typecasted - data = [ascii_letters[np.random.randint(0, 52)] for i in range(nrows)] + data = [ascii_letters[rng.integers(0, 52)] for i in range(nrows)] test_pdf["col_category"] = pd.Series(data, dtype="category") # Feather can't handle indexes properly diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 14ba9894fd3..6b222841622 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -77,21 +77,21 @@ def make_frame( extra_vals=(), with_datetime=False, ): - np.random.seed(seed) + rng = np.random.default_rng(seed=seed) df = dataframe_class() - df["x"] = np.random.randint(0, 5, nelem) - df["y"] = np.random.randint(0, 3, nelem) + df["x"] = rng.integers(0, 5, nelem) + df["y"] = rng.integers(0, 3, nelem) for lvl in extra_levels: - df[lvl] = np.random.randint(0, 2, nelem) + df[lvl] = rng.integers(0, 2, nelem) - df["val"] = np.random.random(nelem) + df["val"] = rng.random(nelem) for val in extra_vals: - df[val] = np.random.random(nelem) + df[val] = rng.random(nelem) if with_datetime: - df["datetime"] = np.random.randint( + df["datetime"] = rng.integers( _now, _tomorrow, nelem, dtype=np.int64 ).astype("datetime64[ns]") @@ -266,9 +266,10 @@ def test_groupby_getitem_getattr(as_index): def test_groupby_cats(): - df = DataFrame() - df["cats"] = pd.Categorical(list("aabaacaab")) - df["vals"] = np.random.random(len(df)) + rng = np.random.default_rng(seed=0) + df = DataFrame( + {"cats": pd.Categorical(list("aabaacaab")), "vals": rng.random(9)} + ) cats = df["cats"].values_host vals = df["vals"].to_numpy() @@ -285,13 +286,16 @@ def test_groupby_cats(): def test_groupby_iterate_groups(): - np.random.seed(0) - df = DataFrame() + rng = np.random.default_rng(seed=0) nelem = 20 - df["key1"] = np.random.randint(0, 3, nelem) - df["key2"] = np.random.randint(0, 2, nelem) - df["val1"] = np.random.random(nelem) - df["val2"] = np.random.random(nelem) + df = DataFrame( + { + "key1": rng.integers(0, 3, nelem), + "key2": rng.integers(0, 2, nelem), + "val1": rng.random(nelem), + "val2": rng.random(nelem), + } + ) def assert_values_equal(arr): np.testing.assert_array_equal(arr[0], arr) @@ -307,13 +311,16 @@ def assert_values_equal(arr): reason="Fails in older versions of pandas", ) def test_groupby_apply(): - np.random.seed(0) - df = DataFrame() + rng = np.random.default_rng(seed=0) nelem = 20 - df["key1"] = np.random.randint(0, 3, nelem) - df["key2"] = np.random.randint(0, 2, nelem) - df["val1"] = np.random.random(nelem) - df["val2"] = np.random.random(nelem) + df = DataFrame( + { + "key1": rng.integers(0, 3, nelem), + "key2": rng.integers(0, 2, nelem), + "val1": rng.random(nelem), + "val2": rng.random(nelem), + } + ) expect_grpby = df.to_pandas().groupby( ["key1", "key2"], as_index=False, group_keys=False @@ -351,13 +358,16 @@ def f3(df, k, L, m): reason="Fails in older versions of pandas", ) def test_groupby_apply_args(func, args): - np.random.seed(0) - df = DataFrame() + rng = np.random.default_rng(seed=0) nelem = 20 - df["key1"] = np.random.randint(0, 3, nelem) - df["key2"] = np.random.randint(0, 2, nelem) - df["val1"] = np.random.random(nelem) - df["val2"] = np.random.random(nelem) + df = DataFrame( + { + "key1": rng.integers(0, 3, nelem), + "key2": rng.integers(0, 2, nelem), + "val1": rng.random(nelem), + 
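+            # val1/val2 are successive draws from the same seeded stream, so the whole frame is reproducible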
"val2": rng.random(nelem), + } + ) expect_grpby = df.to_pandas().groupby( ["key1", "key2"], as_index=False, group_keys=False @@ -369,7 +379,6 @@ def test_groupby_apply_args(func, args): def test_groupby_apply_grouped(): - np.random.seed(0) df = DataFrame() nelem = 20 df["key1"] = range(nelem) @@ -1010,6 +1019,7 @@ def test_groupby_2keys_agg(nelem, func): # "func", ["min", "max", "idxmin", "idxmax", "count", "sum"], ) def test_groupby_agg_decimal(num_groups, nelem_per_group, func): + rng = np.random.default_rng(seed=0) # The number of digits after the decimal to use. decimal_digits = 2 # The number of digits before the decimal to use. @@ -1026,8 +1036,8 @@ def test_groupby_agg_decimal(num_groups, nelem_per_group, func): # https://github.com/pandas-dev/pandas/issues/40685). However, if that is # ever enabled, then this issue will crop up again so we may as well have # it fixed now. - x = np.unique((np.random.rand(nelem) * scale).round(decimal_digits)) - y = np.unique((np.random.rand(nelem) * scale).round(decimal_digits)) + x = np.unique((rng.random(nelem) * scale).round(decimal_digits)) + y = np.unique((rng.random(nelem) * scale).round(decimal_digits)) if x.size < y.size: total_elements = x.size @@ -1313,9 +1323,9 @@ def test_empty_groupby(func): def test_groupby_unsupported_columns(): - np.random.seed(12) + rng = np.random.default_rng(seed=12) pd_cat = pd.Categorical( - pd.Series(np.random.choice(["a", "b", 1], 3), dtype="category") + pd.Series(rng.choice(["a", "b", 1], 3), dtype="category") ) pdf = pd.DataFrame( { @@ -1421,10 +1431,11 @@ def test_groupby_apply_basic_agg_single_column(): def test_groupby_multi_agg_single_groupby_series(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } ) gdf = cudf.from_pandas(pdf) @@ -1435,12 +1446,13 @@ def test_groupby_multi_agg_single_groupby_series(): def test_groupby_multi_agg_multi_groupby(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "a": np.random.randint(0, 5, 10), - "b": np.random.randint(0, 5, 10), - "c": np.random.randint(0, 5, 10), - "d": np.random.randint(0, 5, 10), + "a": rng.integers(0, 5, 10), + "b": rng.integers(0, 5, 10), + "c": rng.integers(0, 5, 10), + "d": rng.integers(0, 5, 10), } ) gdf = cudf.from_pandas(pdf) @@ -1450,6 +1462,7 @@ def test_groupby_multi_agg_multi_groupby(): def test_groupby_datetime_multi_agg_multi_groupby(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { "a": pd.date_range( @@ -1457,9 +1470,9 @@ def test_groupby_datetime_multi_agg_multi_groupby(): datetime.datetime.now() + datetime.timedelta(9), freq="D", ), - "b": np.random.randint(0, 5, 10), - "c": np.random.randint(0, 5, 10), - "d": np.random.randint(0, 5, 10), + "b": rng.integers(0, 5, 10), + "c": rng.integers(0, 5, 10), + "d": rng.integers(0, 5, 10), } ) gdf = cudf.from_pandas(pdf) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 3f483219423..24d42d9eb4c 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2645,21 +2645,20 @@ def test_isin_multiindex(data, values, level, err): ) -range_data = [ - range(np.random.randint(0, 100)), - range(9, 12, 2), - range(20, 30), - range(100, 1000, 10), - range(0, 10, -2), - range(0, -10, 2), - range(0, -10, -2), -] - - -@pytest.fixture(params=range_data) +@pytest.fixture( + params=[ + range(np.random.default_rng(seed=0).integers(0, 100)), + 
range(9, 12, 2), + range(20, 30), + range(100, 1000, 10), + range(0, 10, -2), + range(0, -10, 2), + range(0, -10, -2), + ] +) def rangeindex(request): """Create a cudf RangeIndex of different `nrows`""" - return RangeIndex(request.param) + return cudf.RangeIndex(request.param) @pytest.mark.parametrize( @@ -2830,21 +2829,20 @@ def test_rangeindex_append_return_rangeindex(): assert_eq(result, expected) -index_data = [ - range(np.random.randint(0, 100)), - range(0, 10, -2), - range(0, -10, 2), - range(0, -10, -2), - range(0, 1), - [1, 2, 3, 1, None, None], - [None, None, 3.2, 1, None, None], - [None, "a", "3.2", "z", None, None], - pd.Series(["a", "b", None], dtype="category"), - np.array([1, 2, 3, None], dtype="datetime64[s]"), -] - - -@pytest.fixture(params=index_data) +@pytest.fixture( + params=[ + range(np.random.default_rng(seed=0).integers(0, 100)), + range(0, 10, -2), + range(0, -10, 2), + range(0, -10, -2), + range(0, 1), + [1, 2, 3, 1, None, None], + [None, None, 3.2, 1, None, None], + [None, "a", "3.2", "z", None, None], + pd.Series(["a", "b", None], dtype="category"), + np.array([1, 2, 3, None], dtype="datetime64[s]"), + ] +) def index(request): """Create a cudf Index of different dtypes""" return cudf.Index(request.param) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 00ae99466bb..421bc0c298b 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -32,7 +32,8 @@ def pdf_gdf(): @pytest.fixture def pdf_gdf_multi(): - pdf = pd.DataFrame(np.random.rand(7, 5)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(7, 5))) pdfIndex = pd.MultiIndex( [ ["a", "b", "c"], @@ -212,12 +213,17 @@ def test_dataframe_column_name_indexing(): df[1].to_numpy(), np.asarray(range(10), dtype=np.int32) ) + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame() nelem = 10 - pdf["key1"] = np.random.randint(0, 5, nelem) - pdf["key2"] = np.random.randint(0, 3, nelem) - pdf[1] = np.arange(1, 1 + nelem) - pdf[2] = np.random.random(nelem) + pdf = pd.DataFrame( + { + "key1": rng.integers(0, 5, nelem), + "key2": rng.integers(0, 3, nelem), + 1: np.arange(1, 1 + nelem), + 2: rng.random(nelem), + } + ) df = cudf.from_pandas(pdf) assert_eq(df[df.columns], df) @@ -239,16 +245,13 @@ def test_dataframe_column_name_indexing(): def test_dataframe_slicing(): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame() size = 123 - df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype( - np.int32 - ) - df["b"] = hb = np.random.random(size).astype(np.float32) - df["c"] = hc = np.random.randint(low=0, high=100, size=size).astype( - np.int64 - ) - df["d"] = hd = np.random.random(size).astype(np.float64) + df["a"] = ha = rng.integers(low=0, high=100, size=size).astype(np.int32) + df["b"] = hb = rng.random(size).astype(np.float32) + df["c"] = hc = rng.integers(low=0, high=100, size=size).astype(np.int64) + df["d"] = hd = rng.random(size).astype(np.float64) # Row slice first 10 first_10 = df[:10] @@ -287,12 +290,13 @@ def test_dataframe_slicing(): @pytest.mark.parametrize("scalar", [0, 20, 100]) def test_dataframe_loc(scalar, step): size = 123 + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "a": np.random.randint(low=0, high=100, size=size), - "b": np.random.random(size).astype(np.float32), - "c": np.random.random(size).astype(np.float64), - "d": np.random.random(size).astype(np.float64), + "a": rng.integers(low=0, high=100, size=size), + "b": rng.random(size).astype(np.float32), + 
"c": rng.random(size).astype(np.float64), + "d": rng.random(size).astype(np.float64), } ) pdf.index.name = "index" @@ -392,12 +396,11 @@ def test_dataframe_loc_mask(mask, arg): def test_dataframe_loc_outbound(): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame() size = 10 - df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype( - np.int32 - ) - df["b"] = hb = np.random.random(size).astype(np.float32) + df["a"] = ha = rng.integers(low=0, high=100, size=size).astype(np.int32) + df["b"] = hb = rng.random(size).astype(np.float32) pdf = pd.DataFrame() pdf["a"] = ha @@ -590,8 +593,8 @@ def test_dataframe_series_loc_multiindex(obj): @pytest.mark.parametrize("nelem", [2, 5, 20, 100]) def test_series_iloc(nelem): # create random cudf.Series - np.random.seed(12) - ps = pd.Series(np.random.sample(nelem)) + rng = np.random.default_rng(seed=0) + ps = pd.Series(rng.random(nelem)) # gpu cudf.Series gs = cudf.Series(ps) @@ -625,12 +628,11 @@ def test_series_iloc(nelem): @pytest.mark.parametrize("nelem", [2, 5, 20, 100]) def test_dataframe_iloc(nelem): + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame() - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( - np.int32 - ) - gdf["b"] = hb = np.random.random(nelem).astype(np.float32) + gdf["a"] = ha = rng.integers(low=0, high=100, size=nelem).astype(np.int32) + gdf["b"] = hb = rng.random(nelem).astype(np.float32) pdf = pd.DataFrame() pdf["a"] = ha @@ -679,12 +681,11 @@ def test_dataframe_iloc(nelem): def test_dataframe_iloc_tuple(): + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame() nelem = 123 - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( - np.int32 - ) - gdf["b"] = hb = np.random.random(nelem).astype(np.float32) + gdf["a"] = ha = rng.integers(low=0, high=100, size=nelem).astype(np.int32) + gdf["b"] = hb = rng.random(nelem).astype(np.float32) pdf = pd.DataFrame() pdf["a"] = ha @@ -695,12 +696,11 @@ def test_dataframe_iloc_tuple(): def test_dataframe_iloc_index_error(): + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame() nelem = 123 - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( - np.int32 - ) - gdf["b"] = hb = np.random.random(nelem).astype(np.float32) + gdf["a"] = ha = rng.integers(low=0, high=100, size=nelem).astype(np.int32) + gdf["b"] = hb = rng.random(nelem).astype(np.float32) pdf = pd.DataFrame() pdf["a"] = ha @@ -714,14 +714,16 @@ def test_dataframe_iloc_index_error(): @pytest.mark.parametrize("ntake", [0, 1, 10, 123, 122, 200]) def test_dataframe_take(ntake): - np.random.seed(0) - df = cudf.DataFrame() - + rng = np.random.default_rng(seed=0) nelem = 123 - df["ii"] = np.random.randint(0, 20, nelem) - df["ff"] = np.random.random(nelem) + df = cudf.DataFrame( + { + "ii": rng.integers(0, 20, nelem), + "ff": rng.random(nelem), + } + ) - take_indices = np.random.randint(0, len(df), ntake) + take_indices = rng.integers(0, len(df), ntake) actual = df.take(take_indices) expected = df.to_pandas().take(take_indices) @@ -733,7 +735,7 @@ def test_dataframe_take(ntake): @pytest.mark.parametrize("ntake", [1, 2, 8, 9]) def test_dataframe_take_with_multiindex(ntake): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = cudf.DataFrame( index=cudf.MultiIndex( levels=[["lama", "cow", "falcon"], ["speed", "weight", "length"]], @@ -742,10 +744,10 @@ def test_dataframe_take_with_multiindex(ntake): ) nelem = 9 - df["ii"] = np.random.randint(0, 20, nelem) - df["ff"] = np.random.random(nelem) + df["ii"] = rng.integers(0, 20, nelem) + df["ff"] = 
rng.random(nelem) - take_indices = np.random.randint(0, len(df), ntake) + take_indices = rng.integers(0, len(df), ntake) actual = df.take(take_indices) expected = df.to_pandas().take(take_indices) @@ -755,13 +757,13 @@ def test_dataframe_take_with_multiindex(ntake): @pytest.mark.parametrize("ntake", [0, 1, 10, 123, 122, 200]) def test_series_take(ntake): - np.random.seed(0) + rng = np.random.default_rng(seed=0) nelem = 123 - psr = pd.Series(np.random.randint(0, 20, nelem)) + psr = pd.Series(rng.integers(0, 20, nelem)) gsr = cudf.Series(psr) - take_indices = np.random.randint(0, len(gsr), ntake) + take_indices = rng.integers(0, len(gsr), ntake) actual = gsr.take(take_indices) expected = psr.take(take_indices) @@ -841,14 +843,15 @@ def test_empty_boolean_mask(dtype): ) @pytest.mark.parametrize("nulls", ["one", "some", "all", "none"]) def test_series_apply_boolean_mask(data, mask, nulls): + rng = np.random.default_rng(seed=0) psr = pd.Series(data) if len(data) > 0: if nulls == "one": - p = np.random.randint(0, 4) + p = rng.integers(0, 4) psr[p] = None elif nulls == "some": - p1, p2 = np.random.randint(0, 4, (2,)) + p1, p2 = rng.integers(0, 4, (2,)) psr[p1] = None psr[p2] = None elif nulls == "all": @@ -1810,13 +1813,14 @@ def test_boolean_mask_columns_iloc_series(): @pytest.mark.parametrize("index_type", ["single", "slice"]) def test_loc_timestamp_issue_8585(index_type): + rng = np.random.default_rng(seed=0) # https://github.com/rapidsai/cudf/issues/8585 start = pd.Timestamp( datetime.strptime("2021-03-12 00:00", "%Y-%m-%d %H:%M") ) end = pd.Timestamp(datetime.strptime("2021-03-12 11:00", "%Y-%m-%d %H:%M")) timestamps = pd.date_range(start, end, periods=12) - value = np.random.normal(size=12) + value = rng.normal(size=12) df = pd.DataFrame(value, index=timestamps, columns=["value"]) cdf = cudf.from_pandas(df) if index_type == "single": @@ -1851,6 +1855,7 @@ def test_loc_timestamp_issue_8585(index_type): ], ) def test_loc_multiindex_timestamp_issue_8585(index_type): + rng = np.random.default_rng(seed=0) # https://github.com/rapidsai/cudf/issues/8585 start = pd.Timestamp( datetime.strptime("2021-03-12 00:00", "%Y-%m-%d %H:%M") @@ -1861,7 +1866,7 @@ def test_loc_multiindex_timestamp_issue_8585(index_type): index = pd.MultiIndex.from_product( [timestamps, labels], names=["timestamp", "label"] ) - value = np.random.normal(size=12) + value = rng.normal(size=12) df = pd.DataFrame(value, index=index, columns=["value"]) cdf = cudf.from_pandas(df) start = pd.Timestamp( diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index b1ce69e58ef..f6941ce7fae 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -22,7 +22,7 @@ def make_params(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) hows = _JOIN_TYPES @@ -39,14 +39,14 @@ def make_params(): yield (aa, bb, how) # Test large random integer inputs - aa = np.random.randint(0, 50, 100) - bb = np.random.randint(0, 50, 100) + aa = rng.integers(0, 50, 100) + bb = rng.integers(0, 50, 100) for how in hows: yield (aa, bb, how) # Test floating point inputs - aa = np.random.random(50) - bb = np.random.random(50) + aa = rng.random(50) + bb = rng.random(50) for how in hows: yield (aa, bb, how) @@ -162,9 +162,9 @@ def _check_series(expect, got): reason="bug in older version of pandas", ) def test_dataframe_join_suffix(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) - df = cudf.DataFrame(np.random.randint(0, 5, (5, 3)), columns=list("abc")) + df = 
cudf.DataFrame(rng.integers(0, 5, (5, 3)), columns=list("abc")) left = df.set_index("a") right = df.set_index("c") @@ -281,19 +281,19 @@ def test_dataframe_join_mismatch_cats(how): @pytest.mark.parametrize("on", ["key1", ["key1", "key2"], None]) def test_dataframe_merge_on(on): - np.random.seed(0) + rng = np.random.default_rng(seed=0) # Make cuDF df_left = cudf.DataFrame() nelem = 500 - df_left["key1"] = np.random.randint(0, 40, nelem) - df_left["key2"] = np.random.randint(0, 50, nelem) + df_left["key1"] = rng.integers(0, 40, nelem) + df_left["key2"] = rng.integers(0, 50, nelem) df_left["left_val"] = np.arange(nelem) df_right = cudf.DataFrame() nelem = 500 - df_right["key1"] = np.random.randint(0, 30, nelem) - df_right["key2"] = np.random.randint(0, 50, nelem) + df_right["key1"] = rng.integers(0, 30, nelem) + df_right["key2"] = rng.integers(0, 50, nelem) df_right["right_val"] = np.arange(nelem) # Make pandas DF @@ -347,19 +347,19 @@ def test_dataframe_merge_on(on): def test_dataframe_merge_on_unknown_column(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) # Make cuDF df_left = cudf.DataFrame() nelem = 500 - df_left["key1"] = np.random.randint(0, 40, nelem) - df_left["key2"] = np.random.randint(0, 50, nelem) + df_left["key1"] = rng.integers(0, 40, nelem) + df_left["key2"] = rng.integers(0, 50, nelem) df_left["left_val"] = np.arange(nelem) df_right = cudf.DataFrame() nelem = 500 - df_right["key1"] = np.random.randint(0, 30, nelem) - df_right["key2"] = np.random.randint(0, 50, nelem) + df_right["key1"] = rng.integers(0, 30, nelem) + df_right["key2"] = rng.integers(0, 50, nelem) df_right["right_val"] = np.arange(nelem) with pytest.raises(KeyError) as raises: @@ -368,19 +368,19 @@ def test_dataframe_merge_on_unknown_column(): def test_dataframe_merge_no_common_column(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) # Make cuDF df_left = cudf.DataFrame() nelem = 500 - df_left["key1"] = np.random.randint(0, 40, nelem) - df_left["key2"] = np.random.randint(0, 50, nelem) + df_left["key1"] = rng.integers(0, 40, nelem) + df_left["key2"] = rng.integers(0, 50, nelem) df_left["left_val"] = np.arange(nelem) df_right = cudf.DataFrame() nelem = 500 - df_right["key3"] = np.random.randint(0, 30, nelem) - df_right["key4"] = np.random.randint(0, 50, nelem) + df_right["key3"] = rng.integers(0, 30, nelem) + df_right["key4"] = rng.integers(0, 50, nelem) df_right["right_val"] = np.arange(nelem) with pytest.raises(ValueError) as raises: @@ -460,14 +460,14 @@ def test_dataframe_merge_order(): @pytest.mark.parametrize("rows", [1, 5, 100]) @pytest.mark.parametrize("how", ["left", "inner", "outer"]) def test_dataframe_pairs_of_triples(pairs, max, rows, how): - np.random.seed(0) + rng = np.random.default_rng(seed=0) pdf_left = pd.DataFrame() pdf_right = pd.DataFrame() for left_column in pairs[0]: - pdf_left[left_column] = np.random.randint(0, max, rows) + pdf_left[left_column] = rng.integers(0, max, rows) for right_column in pairs[1]: - pdf_right[right_column] = np.random.randint(0, max, rows) + pdf_right[right_column] = rng.integers(0, max, rows) gdf_left = cudf.from_pandas(pdf_left) gdf_right = cudf.from_pandas(pdf_right) if not set(pdf_left.columns).intersection(pdf_right.columns): @@ -504,15 +504,15 @@ def test_dataframe_pairs_of_triples(pairs, max, rows, how): def test_safe_merging_with_left_empty(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) pairs = ("bcd", "b") pdf_left = pd.DataFrame() pdf_right = pd.DataFrame() for left_column in pairs[0]: - pdf_left[left_column] = 
np.random.randint(0, 10, 0) + pdf_left[left_column] = rng.integers(0, 10, 0) for right_column in pairs[1]: - pdf_right[right_column] = np.random.randint(0, 10, 5) + pdf_right[right_column] = rng.integers(0, 10, 5) gdf_left = cudf.from_pandas(pdf_left) gdf_right = cudf.from_pandas(pdf_right) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index c81c2d1d94b..47976fc4bac 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -32,13 +32,14 @@ def make_numeric_dataframe(nrows, dtype): @pytest.fixture(params=[0, 1, 10, 100]) def pdf(request): + rng = np.random.default_rng(seed=0) types = NUMERIC_TYPES + DATETIME_TYPES + ["bool"] nrows = request.param # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( { - f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) + f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ) for typ in types } ) diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py index 790e84559a9..a34c89f55d3 100644 --- a/python/cudf/cudf/tests/test_monotonic.py +++ b/python/cudf/cudf/tests/test_monotonic.py @@ -164,7 +164,8 @@ def test_series(testlist): def test_multiindex(): - pdf = pd.DataFrame(np.random.rand(7, 5)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(7, 5))) pdf.index = pd.MultiIndex( [ ["a", "b", "c"], diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index c41be3e4428..ad0e0858c43 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -153,7 +153,8 @@ def test_multiindex_swaplevel(): def test_string_index(): - pdf = pd.DataFrame(np.random.rand(5, 5)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(5, 5))) gdf = cudf.from_pandas(pdf) stringIndex = ["a", "b", "c", "d", "e"] pdf.index = stringIndex @@ -176,7 +177,8 @@ def test_string_index(): def test_multiindex_row_shape(): - pdf = pd.DataFrame(np.random.rand(0, 5)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(0, 5))) gdf = cudf.from_pandas(pdf) pdfIndex = pd.MultiIndex([["a", "b", "c"]], [[0]]) pdfIndex.names = ["alpha"] @@ -193,7 +195,8 @@ def test_multiindex_row_shape(): @pytest.fixture def pdf(): - return pd.DataFrame(np.random.rand(7, 5)) + rng = np.random.default_rng(seed=0) + return pd.DataFrame(rng.random(size=(7, 5))) @pytest.fixture @@ -271,7 +274,8 @@ def test_from_pandas_series(): def test_series_multiindex(pdfIndex): - ps = pd.Series(np.random.rand(7)) + rng = np.random.default_rng(seed=0) + ps = pd.Series(rng.random(7)) gs = cudf.from_pandas(ps) ps.index = pdfIndex gs.index = cudf.from_pandas(pdfIndex) @@ -439,7 +443,8 @@ def test_multiindex_loc_rows_1_1_key(pdf, gdf, pdfIndex): def test_multiindex_column_shape(): - pdf = pd.DataFrame(np.random.rand(5, 0)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(5, 0))) gdf = cudf.from_pandas(pdf) pdfIndex = pd.MultiIndex([["a", "b", "c"]], [[0]]) pdfIndex.names = ["alpha"] @@ -522,9 +527,13 @@ def test_multiindex_from_product(arrays): def test_multiindex_index_and_columns(): - gdf = cudf.DataFrame() - gdf["x"] = np.random.randint(0, 5, 5) - gdf["y"] = np.random.randint(0, 5, 5) + rng = np.random.default_rng(seed=0) + gdf = cudf.DataFrame( + { + "x": rng.integers(0, 5, 5), + "y": rng.integers(0, 5, 5), + } + ) pdf = gdf.to_pandas() mi = cudf.MultiIndex( levels=[[0, 1, 2], [3, 4]], @@ -542,11 
+551,12 @@ def test_multiindex_index_and_columns(): def test_multiindex_multiple_groupby(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { "a": [4, 17, 4, 9, 5], "b": [1, 4, 4, 3, 2], - "x": np.random.normal(size=5), + "x": rng.normal(size=5), } ) gdf = cudf.DataFrame.from_pandas(pdf) @@ -566,11 +576,12 @@ def test_multiindex_multiple_groupby(): ], ) def test_multi_column(func): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "x": np.random.randint(0, 5, size=1000), - "y": np.random.randint(0, 10, size=1000), - "z": np.random.normal(size=1000), + "x": rng.integers(0, 5, size=1000), + "y": rng.integers(0, 10, size=1000), + "z": rng.normal(size=1000), } ) gdf = cudf.DataFrame.from_pandas(pdf) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 1dd732c7191..41c1c3ccb20 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -681,7 +681,6 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): from pyarrow import orc - np.random.seed(0) supported_stat_types = supported_numpy_dtypes + ["str"] # Writing bool columns to multiple row groups is disabled # until #6763 is fixed @@ -704,6 +703,7 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): has_nulls=True, low=0, high=max_char_length, + seed=0, ) for dtype in supported_stat_types } @@ -845,7 +845,6 @@ def test_orc_reader_gmt_timestamps(datadir): def test_orc_bool_encode_fail(): - np.random.seed(0) buffer = BytesIO() # Generate a boolean column longer than a single row group @@ -927,7 +926,6 @@ def test_empty_string_columns(data): [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype], ) def test_orc_writer_decimal(tmpdir, scale, decimal_type): - np.random.seed(0) fname = tmpdir / "decimal.orc" expected = cudf.DataFrame({"dec_val": gen_rand_series("i", 100)}) @@ -988,7 +986,7 @@ def test_orc_string_stream_offset_issue(): def generate_list_struct_buff(size=100_000): rd = random.Random(1) - np.random.seed(seed=1) + rng = np.random.default_rng(seed=1) buff = BytesIO() @@ -999,12 +997,12 @@ def generate_list_struct_buff(size=100_000): [ [ [ - rd.choice([None, np.random.randint(1, 3)]) - for _ in range(np.random.randint(1, 3)) + rd.choice([None, rng.integers(1, 3)]) + for _ in range(rng.integers(1, 3)) ] - for _ in range(np.random.randint(0, 3)) + for _ in range(rng.integers(0, 3)) ] - for _ in range(np.random.randint(0, 3)) + for _ in range(rng.integers(0, 3)) ], ] ) @@ -1012,8 +1010,8 @@ def generate_list_struct_buff(size=100_000): ] lvl1_list = [ [ - rd.choice([None, np.random.randint(0, 3)]) - for _ in range(np.random.randint(1, 4)) + rd.choice([None, rng.integers(0, 3)]) + for _ in range(rng.integers(1, 4)) ] for _ in range(size) ] @@ -1021,7 +1019,7 @@ def generate_list_struct_buff(size=100_000): rd.choice( [ None, - {"a": np.random.randint(0, 3), "b": np.random.randint(0, 3)}, + {"a": rng.integers(0, 3), "b": rng.integers(0, 3)}, ] ) for _ in range(size) @@ -1030,11 +1028,11 @@ def generate_list_struct_buff(size=100_000): rd.choice( [ None, - {"a": rd.choice([None, np.random.randint(0, 3)])}, + {"a": rd.choice([None, rng.integers(0, 3)])}, { "lvl1_struct": { - "c": rd.choice([None, np.random.randint(0, 3)]), - "d": np.random.randint(0, 3), + "c": rd.choice([None, rng.integers(0, 3)]), + "d": rng.integers(0, 3), }, }, ] @@ -1044,7 +1042,7 @@ def generate_list_struct_buff(size=100_000): list_nests_struct = [ [ {"a": 
rd.choice(lvl1_struct), "b": rd.choice(lvl1_struct)} - for _ in range(np.random.randint(1, 4)) + for _ in range(rng.integers(1, 4)) ] for _ in range(size) ] @@ -1135,7 +1133,7 @@ def gen_map_buff(size): from pyarrow import orc rd = random.Random(1) - np.random.seed(seed=1) + rng = np.random.default_rng(seed=1) buff = BytesIO() @@ -1146,7 +1144,7 @@ def gen_map_buff(size): None, { rd.choice(al): rd.choice( - [None, np.random.randint(1, 1500)] + [None, rng.integers(1, 1500)] ), }, ] @@ -1167,7 +1165,7 @@ def gen_map_buff(size): None, [ rd.choice( - [None, np.random.randint(1, 1500)] + [None, rng.integers(1, 1500)] ) for _ in range(5) ], @@ -1194,10 +1192,10 @@ def gen_map_buff(size): None, { "a": rd.choice( - [None, np.random.randint(1, 1500)] + [None, rng.integers(1, 1500)] ), "b": rd.choice( - [None, np.random.randint(1, 1500)] + [None, rng.integers(1, 1500)] ), }, ] diff --git a/python/cudf/cudf/tests/test_pack.py b/python/cudf/cudf/tests/test_pack.py index ad78621c5fa..b474bbe9bd8 100644 --- a/python/cudf/cudf/tests/test_pack.py +++ b/python/cudf/cudf/tests/test_pack.py @@ -24,11 +24,11 @@ def test_sizeof_packed_dataframe(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 1000 df["keys"] = hkeys = np.arange(nelem, dtype=np.float64) - df["vals"] = hvals = np.random.random(nelem) + df["vals"] = hvals = rng.random(nelem) packed = pack(df) nbytes = hkeys.nbytes + hvals.nbytes @@ -67,46 +67,46 @@ def assert_packed_frame_equality(df): def test_packed_dataframe_equality_numeric(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 10 df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) + df["vals"] = rng.random(nelem) check_packed_equality(df) def test_packed_dataframe_equality_categorical(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = pd.Categorical( ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_equality(df) def test_packed_dataframe_equality_list(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_equality(df) def test_packed_dataframe_equality_struct(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series( list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_equality(df) @@ -135,46 +135,46 @@ def assert_packed_frame_unique_pointers(df): def test_packed_dataframe_unique_pointers_numeric(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 10 df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) + df["vals"] = rng.random(nelem) check_packed_unique_pointers(df) def test_packed_dataframe_unique_pointers_categorical(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = pd.Categorical( ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_unique_pointers(df) def test_packed_dataframe_unique_pointers_list(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = 
np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_unique_pointers(df) def test_packed_dataframe_unique_pointers_struct(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series( list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_unique_pointers(df) @@ -208,46 +208,46 @@ def assert_packed_frame_picklable(df): def test_pickle_packed_dataframe_numeric(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 10 df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) + df["vals"] = rng.random(nelem) check_packed_pickled_equality(df) def test_pickle_packed_dataframe_categorical(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = pd.Categorical( ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_pickled_equality(df) def test_pickle_packed_dataframe_list(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_pickled_equality(df) def test_pickle_packed_dataframe_struct(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series( list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_pickled_equality(df) @@ -273,45 +273,45 @@ def assert_packed_frame_serializable(df): def test_serialize_packed_dataframe_numeric(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 10 df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) + df["vals"] = rng.random(nelem) check_packed_serialized_equality(df) def test_serialize_packed_dataframe_categorical(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = pd.Categorical( ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_serialized_equality(df) def test_serialize_packed_dataframe_list(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_serialized_equality(df) def test_serialize_packed_dataframe_struct(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series( list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_serialized_equality(df) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 7f1b0b1cd46..c9ce24d2a5b 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -53,6 +53,7 @@ def datadir(datadir): @pytest.fixture(params=[1, 5, 10, 100000]) def simple_pdf(request): + rng = np.random.default_rng(seed=0) types = [ "bool", "int8", @@ -72,7 +73,7 @@ def simple_pdf(request): # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( { - f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) + 
f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ) for typ in types }, # Need to ensure that this index is not a RangeIndex to get the @@ -92,6 +93,7 @@ def simple_gdf(simple_pdf): def build_pdf(num_columns, day_resolution_timestamps): + rng = np.random.default_rng(seed=0) types = [ "bool", "int8", @@ -114,7 +116,7 @@ def build_pdf(num_columns, day_resolution_timestamps): # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( { - f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) + f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ) for typ in types }, # Need to ensure that this index is not a RangeIndex to get the @@ -142,7 +144,7 @@ def build_pdf(num_columns, day_resolution_timestamps): }, ]: data = [ - np.random.randint(0, (0x7FFFFFFFFFFFFFFF / t["nsDivisor"])) + rng.integers(0, (0x7FFFFFFFFFFFFFFF / t["nsDivisor"])) for i in range(nrows) ] if day_resolution_timestamps: @@ -152,11 +154,11 @@ def build_pdf(num_columns, day_resolution_timestamps): ) # Create non-numeric categorical data otherwise parquet may typecast it - data = [ascii_letters[np.random.randint(0, 52)] for i in range(nrows)] + data = [ascii_letters[rng.integers(0, 52)] for i in range(nrows)] test_pdf["col_category"] = pd.Series(data, dtype="category") # Create non-numeric str data - data = [ascii_letters[np.random.randint(0, 52)] for i in range(nrows)] + data = [ascii_letters[rng.integers(0, 52)] for i in range(nrows)] test_pdf["col_str"] = pd.Series(data, dtype="str") return test_pdf @@ -453,7 +455,9 @@ def test_parquet_read_filtered(tmpdir, rdg_seed): dg.ColumnParameters( 40, 0.2, - lambda: np.random.default_rng().integers(0, 100, size=40), + lambda: np.random.default_rng(seed=None).integers( + 0, 100, size=40 + ), True, ), ], @@ -1909,6 +1913,7 @@ def test_parquet_writer_dictionary_setting(use_dict, max_dict_size): @pytest.mark.parametrize("filename", ["myfile.parquet", None]) @pytest.mark.parametrize("cols", [["b"], ["c", "b"]]) def test_parquet_partitioned(tmpdir_factory, cols, filename): + rng = np.random.default_rng(seed=0) # Checks that write_to_dataset is wrapping to_parquet # as expected gdf_dir = str(tmpdir_factory.mktemp("gdf_dir")) @@ -1917,8 +1922,8 @@ def test_parquet_partitioned(tmpdir_factory, cols, filename): pdf = pd.DataFrame( { "a": np.arange(0, stop=size, dtype="int64"), - "b": np.random.choice(list("abcd"), size=size), - "c": np.random.choice(np.arange(4), size=size), + "b": rng.choice(list("abcd"), size=size), + "c": rng.choice(np.arange(4), size=size), } ) pdf.to_parquet(pdf_dir, index=False, partition_cols=cols) @@ -1954,6 +1959,7 @@ def test_parquet_partitioned(tmpdir_factory, cols, filename): @pytest.mark.parametrize("kwargs", [{"nrows": 1}, {"skip_rows": 1}]) def test_parquet_partitioned_notimplemented(tmpdir_factory, kwargs): + rng = np.random.default_rng(seed=0) # Checks that write_to_dataset is wrapping to_parquet # as expected pdf_dir = str(tmpdir_factory.mktemp("pdf_dir")) @@ -1961,8 +1967,8 @@ def test_parquet_partitioned_notimplemented(tmpdir_factory, kwargs): pdf = pd.DataFrame( { "a": np.arange(0, stop=size, dtype="int64"), - "b": np.random.choice(list("abcd"), size=size), - "c": np.random.choice(np.arange(4), size=size), + "b": rng.choice(list("abcd"), size=size), + "c": rng.choice(np.arange(4), size=size), } ) pdf.to_parquet(pdf_dir, index=False, partition_cols=["b"]) @@ -2127,6 +2133,7 @@ def test_parquet_writer_chunked_partitioned_context(tmpdir_factory): @pytest.mark.parametrize("cols", [None, ["b"]]) 
@pytest.mark.parametrize("store_schema", [True, False]) def test_parquet_write_to_dataset(tmpdir_factory, cols, store_schema): + rng = np.random.default_rng(seed=0) dir1 = tmpdir_factory.mktemp("dir1") dir2 = tmpdir_factory.mktemp("dir2") if cols is None: @@ -2139,7 +2146,7 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols, store_schema): gdf = cudf.DataFrame( { "a": np.arange(0, stop=size), - "b": np.random.choice(np.arange(4), size=size), + "b": rng.choice(np.arange(4), size=size), } ) gdf.to_parquet(dir1, partition_cols=cols, store_schema=store_schema) @@ -3214,11 +3221,12 @@ def test_parquet_nested_struct_list(): def test_parquet_writer_zstd(): size = 12345 + rng = np.random.default_rng(seed=0) expected = cudf.DataFrame( { "a": np.arange(0, stop=size, dtype="float64"), - "b": np.random.choice(list("abcd"), size=size), - "c": np.random.choice(np.arange(4), size=size), + "b": rng.choice(list("abcd"), size=size), + "c": rng.choice(np.arange(4), size=size), } ) diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index 0f13a9e173a..2f10a5dfd74 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -40,33 +40,33 @@ def assert_frame_picklable(df): def test_pickle_dataframe_numeric(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 10 df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) + df["vals"] = rng.random(nelem) check_serialization(df) def test_pickle_dataframe_categorical(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = pd.Categorical( ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_serialization(df) def test_memory_usage_dataframe(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 1000 df["keys"] = hkeys = np.arange(nelem, dtype=np.float64) - df["vals"] = hvals = np.random.random(nelem) + df["vals"] = hvals = rng.random(nelem) nbytes = hkeys.nbytes + hvals.nbytes sizeof = df.memory_usage().sum() @@ -98,11 +98,11 @@ def test_pickle_buffer(): @pytest.mark.parametrize("named", [True, False]) def test_pickle_series(named): - np.random.seed(0) + rng = np.random.default_rng(seed=0) if named: - ser = Series(np.random.random(10), name="a") + ser = Series(rng.random(10), name="a") else: - ser = Series(np.random.random(10)) + ser = Series(rng.random(10)) pickled = pickle.dumps(ser) out = pickle.loads(pickled) diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py index b12209fd3b9..7685d09203e 100644 --- a/python/cudf/cudf/tests/test_query.py +++ b/python/cudf/cudf/tests/test_query.py @@ -45,10 +45,10 @@ def test_query(data, fn, nulls): # prepare nelem, seed = data expect_fn, query_expr = fn - np.random.seed(seed) + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame() pdf["a"] = np.arange(nelem) - pdf["b"] = np.random.random(nelem) * nelem + pdf["b"] = rng.random(nelem) * nelem if nulls: pdf.loc[::2, "a"] = None gdf = cudf.from_pandas(pdf) @@ -71,10 +71,10 @@ def test_query_ref_env(data, fn): # prepare nelem, seed = data expect_fn, query_expr = fn - np.random.seed(seed) + rng = np.random.default_rng(seed=0) df = DataFrame() df["a"] = aa = np.arange(nelem) - df["b"] = bb = np.random.random(nelem) * nelem + df["b"] = bb = rng.random(nelem) * nelem c = 2.3 d = 1.2 # udt @@ -121,9 +121,9 @@ def test_query_local_dict(): def 
test_query_splitted_combine(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=10), "y": np.random.normal(size=10)} + {"x": rng.integers(0, 5, size=10), "y": rng.normal(size=10)} ) gdf = DataFrame.from_pandas(df) diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/test_rank.py index 4c1d8ce92ae..1d9c6690f14 100644 --- a/python/cudf/cudf/tests/test_rank.py +++ b/python/cudf/cudf/tests/test_rank.py @@ -125,32 +125,28 @@ def test_rank_error_arguments(pdf): ) -sort_group_args = [ - np.full((3,), np.nan), - 100 * np.random.random(10), - np.full((3,), np.inf), - np.full((3,), -np.inf), -] -sort_dtype_args = [np.int32, np.int64, np.float32, np.float64] - - @pytest.mark.filterwarnings("ignore:invalid value encountered in cast") @pytest.mark.parametrize( "elem,dtype", list( product( - combinations_with_replacement(sort_group_args, 4), - sort_dtype_args, + combinations_with_replacement( + [ + np.full((3,), np.nan), + 100 * np.random.default_rng(seed=0).random(10), + np.full((3,), np.inf), + np.full((3,), -np.inf), + ], + 4, + ), + [np.int32, np.int64, np.float32, np.float64], ) ), ) def test_series_rank_combinations(elem, dtype): - np.random.seed(0) aa = np.fromiter(chain.from_iterable(elem), np.float64).astype(dtype) - gdf = DataFrame() - df = pd.DataFrame() - gdf["a"] = aa - df["a"] = aa + gdf = DataFrame({"a": aa}) + df = pd.DataFrame({"a": aa}) ranked_gs = gdf["a"].rank(method="first") ranked_ps = df["a"].rank(method="first") # Check diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index f276f394cd0..e0bc8f32c9b 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -62,8 +62,7 @@ def test_sum_string(): ) @pytest.mark.parametrize("nelem", params_sizes) def test_sum_decimal(dtype, nelem): - np.random.seed(0) - data = [str(x) for x in gen_rand("int64", nelem) / 100] + data = [str(x) for x in gen_rand("int64", nelem, seed=0) / 100] expected = pd.Series([Decimal(x) for x in data]).sum() got = cudf.Series(data).astype(dtype).sum() @@ -73,15 +72,13 @@ def test_sum_decimal(dtype, nelem): @pytest.mark.parametrize("dtype,nelem", params) def test_product(dtype, nelem): - np.random.seed(0) + rng = np.random.default_rng(seed=0) dtype = cudf.dtype(dtype).type if cudf.dtype(dtype).kind in {"u", "i"}: data = np.ones(nelem, dtype=dtype) # Set at most 30 items to [0..2) to keep the value within 2^32 for _ in range(30): - data[np.random.randint(low=0, high=nelem, size=1)] = ( - np.random.uniform() * 2 - ) + data[rng.integers(low=0, high=nelem, size=1)] = rng.uniform() * 2 else: data = gen_rand(dtype, nelem) @@ -104,7 +101,6 @@ def test_product(dtype, nelem): ], ) def test_product_decimal(dtype): - np.random.seed(0) data = [str(x) for x in gen_rand("int8", 3) / 10] expected = pd.Series([Decimal(x) for x in data]).product() @@ -153,7 +149,6 @@ def test_sum_of_squares(dtype, nelem): ], ) def test_sum_of_squares_decimal(dtype): - np.random.seed(0) data = [str(x) for x in gen_rand("int8", 3) / 10] expected = pd.Series([Decimal(x) for x in data]).pow(2).sum() @@ -186,7 +181,6 @@ def test_min(dtype, nelem): ) @pytest.mark.parametrize("nelem", params_sizes) def test_min_decimal(dtype, nelem): - np.random.seed(0) data = [str(x) for x in gen_rand("int64", nelem) / 100] expected = pd.Series([Decimal(x) for x in data]).min() @@ -219,7 +213,6 @@ def test_max(dtype, nelem): ) @pytest.mark.parametrize("nelem", params_sizes) def 
test_max_decimal(dtype, nelem): - np.random.seed(0) data = [str(x) for x in gen_rand("int64", nelem) / 100] expected = pd.Series([Decimal(x) for x in data]).max() @@ -256,7 +249,8 @@ def test_sum_boolean(): def test_date_minmax(): - np_data = np.random.normal(size=10**3) + rng = np.random.default_rng(seed=0) + np_data = rng.normal(size=10**3) gdf_data = Series(np_data) np_casted = np_data.astype("datetime64[ms]") diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 95e19fae501..bf0c97adb00 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -25,9 +25,10 @@ @pytest.mark.parametrize("dtype", repr_categories) @pytest.mark.parametrize("nrows", [0, 5, 10]) def test_null_series(nrows, dtype): + rng = np.random.default_rng(seed=0) size = 5 - sr = cudf.Series(np.random.randint(1, 9, size)).astype(dtype) - sr[np.random.choice([False, True], size=size)] = None + sr = cudf.Series(rng.integers(1, 9, size)).astype(dtype) + sr[rng.choice([False, True], size=size)] = None if dtype != "category" and cudf.dtype(dtype).kind in {"u", "i"}: ps = pd.Series( sr._column.data_array_view(mode="read").copy_to_host(), @@ -60,11 +61,12 @@ def test_null_series(nrows, dtype): @pytest.mark.parametrize("ncols", [1, 2, 3, 4, 5, 10]) def test_null_dataframe(ncols): + rng = np.random.default_rng(seed=0) size = 20 gdf = cudf.DataFrame() for idx, dtype in enumerate(dtype_categories): - sr = cudf.Series(np.random.randint(0, 128, size)).astype(dtype) - sr[np.random.choice([False, True], size=size)] = None + sr = cudf.Series(rng.integers(0, 128, size)).astype(dtype) + sr[rng.choice([False, True], size=size)] = None gdf[dtype] = sr pdf = gdf.to_pandas() pd.options.display.max_columns = int(ncols) @@ -77,7 +79,8 @@ def test_null_dataframe(ncols): @pytest.mark.parametrize("nrows", [None, 0, 1, 2, 9, 10, 11, 19, 20, 21]) def test_full_series(nrows, dtype): size = 20 - ps = pd.Series(np.random.randint(0, 100, size)).astype(dtype) + rng = np.random.default_rng(seed=0) + ps = pd.Series(rng.integers(0, 100, size)).astype(dtype) sr = cudf.from_pandas(ps) pd.options.display.max_rows = nrows assert repr(ps) == repr(sr) @@ -89,8 +92,9 @@ def test_full_series(nrows, dtype): @pytest.mark.parametrize("size", [20, 21]) @pytest.mark.parametrize("dtype", repr_categories) def test_full_dataframe_20(dtype, size, nrows, ncols): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( - {idx: np.random.randint(0, 100, size) for idx in range(size)} + {idx: rng.integers(0, 100, size) for idx in range(size)} ).astype(dtype) gdf = cudf.from_pandas(pdf) @@ -178,11 +182,12 @@ def test_mixed_series(mixed_pdf, mixed_gdf): def test_MI(): + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame( { - "a": np.random.randint(0, 4, 10), - "b": np.random.randint(0, 4, 10), - "c": np.random.randint(0, 4, 10), + "a": rng.integers(0, 4, 10), + "b": rng.integers(0, 4, 10), + "c": rng.integers(0, 4, 10), } ) levels = [["a", "b", "c", "d"], ["w", "x", "y", "z"], ["m", "n"]] @@ -223,9 +228,10 @@ def test_groupby_MI(nrows, ncols): @pytest.mark.parametrize("dtype", utils.NUMERIC_TYPES) @pytest.mark.parametrize("length", [0, 1, 10, 100, 1000]) def test_generic_index(length, dtype): + rng = np.random.default_rng(seed=0) psr = pd.Series( range(length), - index=np.random.randint(0, high=100, size=length).astype(dtype), + index=rng.integers(0, high=100, size=length).astype(dtype), dtype="float64" if length == 0 else None, ) gsr = cudf.Series.from_pandas(psr) diff --git 
a/python/cudf/cudf/tests/test_resampling.py b/python/cudf/cudf/tests/test_resampling.py index a61477981f8..5ff0098bcf4 100644 --- a/python/cudf/cudf/tests/test_resampling.py +++ b/python/cudf/cudf/tests/test_resampling.py @@ -50,8 +50,9 @@ def test_series_upsample_simple(): @pytest.mark.parametrize("rule", ["2s", "10s"]) def test_series_resample_ffill(rule): - rng = pd.date_range("1/1/2012", periods=10, freq="5s") - ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) + date_idx = pd.date_range("1/1/2012", periods=10, freq="5s") + rng = np.random.default_rng(seed=0) + ts = pd.Series(rng.integers(0, 500, len(date_idx)), index=date_idx) gts = cudf.from_pandas(ts) assert_resample_results_equal( ts.resample(rule).ffill(), gts.resample(rule).ffill() @@ -60,8 +61,9 @@ def test_series_resample_bfill(rule): - rng = pd.date_range("1/1/2012", periods=10, freq="5s") - ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) + date_idx = pd.date_range("1/1/2012", periods=10, freq="5s") + rng = np.random.default_rng(seed=0) + ts = pd.Series(rng.integers(0, 500, len(date_idx)), index=date_idx) gts = cudf.from_pandas(ts) assert_resample_results_equal( ts.resample(rule).bfill(), gts.resample(rule).bfill() @@ -70,8 +72,9 @@ def test_series_resample_asfreq(rule): - rng = pd.date_range("1/1/2012", periods=100, freq="5s") - ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) + date_idx = pd.date_range("1/1/2012", periods=100, freq="5s") + rng = np.random.default_rng(seed=0) + ts = pd.Series(rng.integers(0, 500, len(date_idx)), index=date_idx) gts = cudf.from_pandas(ts) assert_resample_results_equal( ts.resample(rule).asfreq(), gts.resample(rule).asfreq() @@ -79,8 +82,9 @@ def test_dataframe_resample_aggregation_simple(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( - np.random.randn(1000, 3), + rng.standard_normal(size=(1000, 3)), index=pd.date_range("1/1/2012", freq="s", periods=1000), columns=["A", "B", "C"], ) @@ -91,8 +95,9 @@ def test_dataframe_resample_multiagg(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( - np.random.randn(1000, 3), + rng.standard_normal(size=(1000, 3)), index=pd.date_range("1/1/2012", freq="s", periods=1000), columns=["A", "B", "C"], ) @@ -104,10 +109,11 @@ def test_dataframe_resample_on(): + rng = np.random.default_rng(seed=0) # test resampling on a specified column pdf = pd.DataFrame( { - "x": np.random.randn(1000), + "x": rng.standard_normal(size=1000), "y": pd.date_range("1/1/2012", freq="s", periods=1000), } ) @@ -119,15 +125,16 @@ def test_dataframe_resample_level(): + rng = np.random.default_rng(seed=0) # test resampling on a specific level of a MultIndex pdf = pd.DataFrame( { - "x": np.random.randn(1000), + "x": rng.standard_normal(size=1000), "y": pd.date_range("1/1/2012", freq="s", periods=1000), } ) pdi = pd.MultiIndex.from_frame(pdf) - pdf = pd.DataFrame({"a": np.random.randn(1000)}, index=pdi) + pdf = pd.DataFrame({"a": rng.standard_normal(size=1000)}, index=pdi) gdf = cudf.from_pandas(pdf) assert_resample_results_equal( pdf.resample("3min", level="y").mean(), @@ -153,11 +160,12 @@ def test_dataframe_resample_level(): reason="Fails in older versions of pandas", ) def
test_resampling_frequency_conversion(in_freq, sampling_freq, out_freq): + rng = np.random.default_rng(seed=0) # test that we cast to the appropriate frequency # when resampling: pdf = pd.DataFrame( { - "x": np.random.randn(100), + "x": rng.standard_normal(size=100), "y": pd.date_range("1/1/2012", freq=in_freq, periods=100), } ) diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 4235affd4d1..26386abb05d 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -46,13 +46,12 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): pdf = pd.DataFrame() id_vars = [] + rng = np.random.default_rng(seed=0) for i in range(num_id_vars): colname = "id" + str(i) - data = np.random.randint(0, 26, num_rows).astype(dtype) + data = rng.integers(0, 26, num_rows).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan elif nulls == "all": data[:] = np.nan @@ -62,11 +61,9 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): value_vars = [] for i in range(num_value_vars): colname = "val" + str(i) - data = np.random.randint(0, 26, num_rows).astype(dtype) + data = rng.integers(0, 26, num_rows).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan elif nulls == "all": data[:] = np.nan @@ -119,6 +116,15 @@ def test_melt_str_scalar_id_var(): assert_eq(result, expected) +def test_melt_falsy_var_name(): + df = cudf.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5], "C": [2, 4, 6]}) + result = cudf.melt(df, id_vars=["A"], value_vars=["B"], var_name="") + expected = pd.melt( + df.to_pandas(), id_vars=["A"], value_vars=["B"], var_name="" + ) + assert_eq(result, expected) + + @pytest.mark.parametrize("num_cols", [1, 2, 10]) @pytest.mark.parametrize("num_rows", [1, 2, 1000]) @pytest.mark.parametrize( @@ -130,13 +136,12 @@ def test_df_stack(nulls, num_cols, num_rows, dtype): pytest.skip(reason="nulls not supported in dtype: " + dtype) pdf = pd.DataFrame() + rng = np.random.default_rng(seed=0) for i in range(num_cols): colname = str(i) - data = np.random.randint(0, 26, num_rows).astype(dtype) + data = rng.integers(0, 26, num_rows).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan pdf[colname] = data @@ -271,8 +276,8 @@ def test_df_stack_multiindex_column_axis_pd_example(level): ], names=["exp", "animal", "hair_length"], ) - - df = pd.DataFrame(np.random.randn(4, 4), columns=columns) + rng = np.random.default_rng(seed=0) + df = pd.DataFrame(rng.standard_normal(size=(4, 4)), columns=columns) with expect_warning_if(PANDAS_GE_220, FutureWarning): expect = df.stack(level=level, future_stack=False) @@ -299,14 +304,13 @@ def test_interleave_columns(nulls, num_cols, num_rows, dtype): pytest.skip(reason="nulls not supported in dtype: " + dtype) pdf = pd.DataFrame(dtype=dtype) + rng = np.random.default_rng(seed=0) for i in range(num_cols): colname = str(i) - data = pd.Series(np.random.randint(0, 26, num_rows)).astype(dtype) + data = pd.Series(rng.integers(0, 26, num_rows)).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), 
replace=False - ) + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan pdf[colname] = data @@ -335,16 +339,13 @@ def test_tile(nulls, num_cols, num_rows, dtype, count): pytest.skip(reason="nulls not supported in dtype: " + dtype) pdf = pd.DataFrame(dtype=dtype) + rng = np.random.default_rng(seed=0) for i in range(num_cols): colname = str(i) - data = pd.Series(np.random.randint(num_cols, 26, num_rows)).astype( - dtype - ) + data = pd.Series(rng.integers(num_cols, 26, num_rows)).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan pdf[colname] = data @@ -715,23 +716,20 @@ def test_pivot_duplicate_error(): @pytest.mark.parametrize( - "data", - [ + "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] +) +@pytest.mark.parametrize("fill_value", [0]) +def test_pivot_table_simple(aggfunc, fill_value): + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame( { "A": ["one", "one", "two", "three"] * 6, "B": ["A", "B", "C"] * 8, "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, - "D": np.random.randn(24), - "E": np.random.randn(24), + "D": rng.standard_normal(size=24), + "E": rng.standard_normal(size=24), } - ], -) -@pytest.mark.parametrize( - "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] -) -@pytest.mark.parametrize("fill_value", [0]) -def test_pivot_table_simple(data, aggfunc, fill_value): - pdf = pd.DataFrame(data) + ) expected = pd.pivot_table( pdf, values=["D", "E"], @@ -740,7 +738,7 @@ def test_pivot_table_simple(data, aggfunc, fill_value): aggfunc=aggfunc, fill_value=fill_value, ) - cdf = cudf.DataFrame(data) + cdf = cudf.DataFrame.from_pandas(pdf) actual = cudf.pivot_table( cdf, values=["D", "E"], @@ -753,23 +751,20 @@ def test_pivot_table_simple(data, aggfunc, fill_value): @pytest.mark.parametrize( - "data", - [ + "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] +) +@pytest.mark.parametrize("fill_value", [0]) +def test_dataframe_pivot_table_simple(aggfunc, fill_value): + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame( { "A": ["one", "one", "two", "three"] * 6, "B": ["A", "B", "C"] * 8, "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, - "D": np.random.randn(24), - "E": np.random.randn(24), + "D": rng.standard_normal(size=24), + "E": rng.standard_normal(size=24), } - ], -) -@pytest.mark.parametrize( - "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] -) -@pytest.mark.parametrize("fill_value", [0]) -def test_dataframe_pivot_table_simple(data, aggfunc, fill_value): - pdf = pd.DataFrame(data) + ) expected = pdf.pivot_table( values=["D", "E"], index=["A", "B"], @@ -777,7 +772,7 @@ def test_dataframe_pivot_table_simple(data, aggfunc, fill_value): aggfunc=aggfunc, fill_value=fill_value, ) - cdf = cudf.DataFrame(data) + cdf = cudf.DataFrame.from_pandas(pdf) actual = cdf.pivot_table( values=["D", "E"], index=["A", "B"], diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index 0b892a51895..68f2aaf9cab 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -170,11 +170,15 @@ def test_serialize_dataframe(): def test_serialize_dataframe_with_index(): - df = cudf.DataFrame() - df["a"] = np.arange(100) - df["b"] = np.random.random(100) - df["c"] = pd.Categorical( - ["a", "b", "c", "_", "_"] * 20, categories=["a", "b", "c"] + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + { 
+ "a": np.arange(100), + "b": rng.random(100), + "c": pd.Categorical( + ["a", "b", "c", "_", "_"] * 20, categories=["a", "b", "c"] + ), + } ) df = df.sort_values("b") outdf = cudf.DataFrame.deserialize(*df.serialize()) @@ -200,11 +204,12 @@ def test_serialize_generic_index(): def test_serialize_multi_index(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { "a": [4, 17, 4, 9, 5], "b": [1, 4, 4, 3, 2], - "x": np.random.normal(size=5), + "x": rng.normal(size=5), } ) gdf = cudf.DataFrame.from_pandas(pdf) @@ -218,7 +223,8 @@ def test_serialize_multi_index(): def test_serialize_masked_series(): nelem = 50 - data = np.random.random(nelem) + rng = np.random.default_rng(seed=0) + data = rng.random(nelem) mask = utils.random_bitmask(nelem) bitmask = utils.expand_bits_to_bytes(mask)[:nelem] null_count = utils.count_zero(bitmask) @@ -229,10 +235,14 @@ def test_serialize_masked_series(): def test_serialize_groupby_df(): - df = cudf.DataFrame() - df["key_1"] = np.random.randint(0, 20, 100) - df["key_2"] = np.random.randint(0, 20, 100) - df["val"] = np.arange(100, dtype=np.float32) + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + { + "key_1": rng.integers(0, 20, 100), + "key_2": rng.integers(0, 20, 100), + "val": np.arange(100, dtype=np.float32), + } + ) gb = df.groupby(["key_1", "key_2"], sort=True) outgb = gb.deserialize(*gb.serialize()) expect = gb.mean() @@ -241,9 +251,9 @@ def test_serialize_groupby_df(): def test_serialize_groupby_external(): - df = cudf.DataFrame() - df["val"] = np.arange(100, dtype=np.float32) - gb = df.groupby(cudf.Series(np.random.randint(0, 20, 100))) + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame({"val": np.arange(100, dtype=np.float32)}) + gb = df.groupby(cudf.Series(rng.integers(0, 20, 100))) outgb = gb.deserialize(*gb.serialize()) expect = gb.mean() got = outgb.mean() @@ -262,7 +272,8 @@ def test_serialize_groupby_level(): def test_serialize_groupby_sr(): - sr = cudf.Series(np.random.randint(0, 20, 100)) + rng = np.random.default_rng(seed=0) + sr = cudf.Series(rng.integers(0, 20, 100)) gb = sr.groupby(sr // 2) outgb = gb.deserialize(*gb.serialize()) got = gb.mean() @@ -271,9 +282,10 @@ def test_serialize_groupby_sr(): def test_serialize_datetime(): + rng = np.random.default_rng(seed=0) # Make frame with datetime column df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} + {"x": rng.integers(0, 5, size=20), "y": rng.normal(size=20)} ) ts = np.arange(0, len(df), dtype=np.dtype("datetime64[ms]")) df["timestamp"] = ts @@ -285,9 +297,10 @@ def test_serialize_datetime(): def test_serialize_string(): + rng = np.random.default_rng(seed=0) # Make frame with string column df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=5), "y": np.random.normal(size=5)} + {"x": rng.integers(0, 5, size=5), "y": rng.normal(size=5)} ) str_data = ["a", "bc", "def", "ghij", "klmno"] df["timestamp"] = str_data diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index a24002dc38e..7f0a4902ed1 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -519,13 +519,13 @@ def test_series_factorize_sort(data, sort): @pytest.mark.parametrize("nulls", ["none", "some"]) def test_series_datetime_value_counts(data, nulls, normalize, dropna): psr = data.copy() - + rng = np.random.default_rng(seed=0) if len(data) > 0: if nulls == "one": - p = np.random.randint(0, len(data)) + p = rng.integers(0, len(data)) psr[p] = None elif nulls == "some": - p = 
np.random.randint(0, len(data), 2) + p = rng.integers(0, len(data), 2) psr[p] = None gsr = cudf.from_pandas(psr) @@ -546,10 +546,10 @@ def test_series_datetime_value_counts(data, nulls, normalize, dropna): @pytest.mark.parametrize("num_elements", [10, 100, 1000]) def test_categorical_value_counts(dropna, normalize, num_elements): # create categorical series - np.random.seed(12) + rng = np.random.default_rng(seed=12) pd_cat = pd.Categorical( pd.Series( - np.random.choice(list(ascii_letters + digits), num_elements), + rng.choice(list(ascii_letters + digits), num_elements), dtype="category", ) ) @@ -586,8 +586,9 @@ def test_categorical_value_counts(dropna, normalize, num_elements): @pytest.mark.parametrize("dropna", [True, False]) @pytest.mark.parametrize("normalize", [True, False]) def test_series_value_counts(dropna, normalize): + rng = np.random.default_rng(seed=0) for size in [10**x for x in range(5)]: - arr = np.random.randint(low=-1, high=10, size=size) + arr = rng.integers(low=-1, high=10, size=size) mask = arr != -1 sr = cudf.Series.from_masked_array( arr, cudf.Series(mask)._column.as_mask() @@ -714,8 +715,8 @@ def test_series_mode(gs, dropna): @pytest.mark.parametrize( "arr", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), np.repeat([-0.6459412758761901], 100), np.repeat(np.nan, 100), @@ -731,12 +732,12 @@ def test_series_round(arr, decimals): expected = pser.round(decimals) assert_eq(result, expected) - + rng = np.random.default_rng(seed=0) # with nulls, maintaining existing null mask arr = arr.astype("float64") # for pandas nulls - arr.ravel()[ - np.random.choice(arr.shape[0], arr.shape[0] // 2, replace=False) - ] = np.nan + arr.ravel()[rng.choice(arr.shape[0], arr.shape[0] // 2, replace=False)] = ( + np.nan + ) pser = pd.Series(arr) ser = cudf.Series(arr) @@ -1726,7 +1727,7 @@ def test_series_truncate_datetimeindex(): [], [0, 12, 14], [0, 14, 12, 12, 3, 10, 12, 14], - np.random.randint(-100, 100, 200), + np.random.default_rng(seed=0).integers(-100, 100, 200), pd.Series([0.0, 1.0, None, 10.0]), [None, None, None, None], [np.nan, None, -1, 2, 3], @@ -1735,7 +1736,7 @@ def test_series_truncate_datetimeindex(): @pytest.mark.parametrize( "values", [ - np.random.randint(-100, 100, 10), + np.random.default_rng(seed=0).integers(-100, 100, 10), [], [np.nan, None, -1, 2, 3], [1.0, 12.0, None, None, 120], @@ -1746,7 +1747,8 @@ def test_series_truncate_datetimeindex(): ], ) def test_isin_numeric(data, values): - index = np.random.randint(0, 100, len(data)) + rng = np.random.default_rng(seed=0) + index = rng.integers(0, 100, len(data)) psr = pd.Series(data, index=index) gsr = cudf.Series.from_pandas(psr, nan_as_null=False) @@ -1943,8 +1945,9 @@ def test_diff_many_dtypes(data): @pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) @pytest.mark.parametrize("series_bins", [True, False]) def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): - data = np.random.randint(0, 100, num_rows).astype(dtype) - bins = np.unique(np.sort(np.random.randint(2, 95, num_bins).astype(dtype))) + rng = np.random.default_rng(seed=0) + data = rng.integers(0, 100, num_rows).astype(dtype) + bins = np.unique(np.sort(rng.integers(2, 95, num_bins).astype(dtype))) s = cudf.Series(data) if series_bins: s_bins = cudf.Series(bins) @@ -1957,7 +1960,8 @@ def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): def 
test_series_digitize_invalid_bins(): - s = cudf.Series(np.random.randint(0, 30, 80), dtype="int32") + rng = np.random.default_rng(seed=0) + s = cudf.Series(rng.integers(0, 30, 80), dtype="int32") bins = cudf.Series([2, None, None, 50, 90], dtype="int32") with pytest.raises( @@ -2038,7 +2042,8 @@ def test_default_float_bitwidth_construction(default_float_bitwidth, data): def test_series_ordered_dedup(): # part of https://github.com/rapidsai/cudf/issues/11486 - sr = cudf.Series(np.random.randint(0, 100, 1000)) + rng = np.random.default_rng(seed=0) + sr = cudf.Series(rng.integers(0, 100, 1000)) # pandas unique() preserves order expect = pd.Series(sr.to_pandas().unique()) got = cudf.Series._from_column(sr._column.unique()) diff --git a/python/cudf/cudf/tests/test_seriesmap.py b/python/cudf/cudf/tests/test_seriesmap.py index 3d8b6a79d2a..db1de7d0cf4 100644 --- a/python/cudf/cudf/tests/test_seriesmap.py +++ b/python/cudf/cudf/tests/test_seriesmap.py @@ -47,8 +47,8 @@ def test_series_map_callable_numeric_basic(): @pytest.mark.parametrize("nelem", list(product([2, 10, 100, 1000]))) def test_series_map_callable_numeric_random(nelem): # Generate data - np.random.seed(0) - data = np.random.random(nelem) * 100 + rng = np.random.default_rng(seed=0) + data = rng.random(nelem) * 100 sr = Series(data) pdsr = pd.Series(data) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 2cf2259d9ec..7e5ce713c7e 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -34,10 +34,10 @@ "nelem,dtype", list(product(sort_nelem_args, sort_dtype_args)) ) def test_dataframe_sort_values(nelem, dtype): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() - df["a"] = aa = (100 * np.random.random(nelem)).astype(dtype) - df["b"] = bb = (100 * np.random.random(nelem)).astype(dtype) + df["a"] = aa = (100 * rng.random(nelem)).astype(dtype) + df["b"] = bb = (100 * rng.random(nelem)).astype(dtype) sorted_df = df.sort_values(by="a") # Check sorted_index = np.argsort(aa, kind="mergesort") @@ -85,9 +85,9 @@ def test_series_sort_values_ignore_index(ignore_index): "nelem,sliceobj", list(product([10, 100], sort_slice_args)) ) def test_dataframe_sort_values_sliced(nelem, sliceobj): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame() - df["a"] = np.random.random(nelem) + df["a"] = rng.random(nelem) expect = df[sliceobj]["a"].sort_values() gdf = DataFrame.from_pandas(df) @@ -100,8 +100,8 @@ def test_dataframe_sort_values_sliced(nelem, sliceobj): list(product(sort_nelem_args, sort_dtype_args, [True, False])), ) def test_series_argsort(nelem, dtype, asc): - np.random.seed(0) - sr = Series((100 * np.random.random(nelem)).astype(dtype)) + rng = np.random.default_rng(seed=0) + sr = Series((100 * rng.random(nelem)).astype(dtype)) res = sr.argsort(ascending=asc) if asc: @@ -116,8 +116,8 @@ def test_series_argsort(nelem, dtype, asc): "nelem,asc", list(product(sort_nelem_args, [True, False])) ) def test_series_sort_index(nelem, asc): - np.random.seed(0) - sr = Series(100 * np.random.random(nelem)) + rng = np.random.default_rng(seed=0) + sr = Series(100 * rng.random(nelem)) psr = sr.to_pandas() expected = psr.sort_index(ascending=asc) @@ -167,9 +167,9 @@ def test_series_nsmallest(data, n): @pytest.mark.parametrize("op", ["nsmallest", "nlargest"]) @pytest.mark.parametrize("columns", ["a", ["b", "a"]]) def test_dataframe_nlargest_nsmallest(nelem, n, op, columns): - np.random.seed(0) - aa = np.random.random(nelem) - bb = 
np.random.random(nelem) + rng = np.random.default_rng(seed=0) + aa = rng.random(nelem) + bb = rng.random(nelem) df = DataFrame({"a": aa, "b": bb}) pdf = df.to_pandas() @@ -181,10 +181,10 @@ def test_dataframe_nlargest_nsmallest(nelem, n, op, columns): ) def test_dataframe_nlargest_sliced(counts, sliceobj): nelem, n = counts - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame() - df["a"] = np.random.random(nelem) - df["b"] = np.random.random(nelem) + df["a"] = rng.random(nelem) + df["b"] = rng.random(nelem) expect = df[sliceobj].nlargest(n, "a") gdf = DataFrame.from_pandas(df) @@ -197,10 +197,10 @@ def test_dataframe_nlargest_sliced(counts, sliceobj): ) def test_dataframe_nsmallest_sliced(counts, sliceobj): nelem, n = counts - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame() - df["a"] = np.random.random(nelem) - df["b"] = np.random.random(nelem) + df["a"] = rng.random(nelem) + df["b"] = rng.random(nelem) expect = df[sliceobj].nsmallest(n, "a") gdf = DataFrame.from_pandas(df) @@ -216,13 +216,13 @@ def test_dataframe_nsmallest_sliced(counts, sliceobj): def test_dataframe_multi_column( num_cols, num_rows, dtype, ascending, na_position ): - np.random.seed(0) + rng = np.random.default_rng(seed=0) by = list(string.ascii_lowercase[:num_cols]) pdf = pd.DataFrame() for i in range(5): colname = string.ascii_lowercase[i] - data = np.random.randint(0, 26, num_rows).astype(dtype) + data = rng.integers(0, 26, num_rows).astype(dtype) pdf[colname] = data gdf = DataFrame.from_pandas(pdf) @@ -244,17 +244,17 @@ def test_dataframe_multi_column( def test_dataframe_multi_column_nulls( num_cols, num_rows, dtype, nulls, ascending, na_position ): - np.random.seed(0) + rng = np.random.default_rng(seed=0) by = list(string.ascii_lowercase[:num_cols]) pdf = pd.DataFrame() for i in range(3): colname = string.ascii_lowercase[i] - data = np.random.randint(0, 26, num_rows).astype(dtype) + data = rng.integers(0, 26, num_rows).astype(dtype) if nulls == "some": idx = np.array([], dtype="int64") if num_rows > 0: - idx = np.random.choice( + idx = rng.choice( num_rows, size=int(num_rows / 4), replace=False ) data[idx] = np.nan @@ -295,8 +295,8 @@ def test_dataframe_multi_column_nulls_multiple_ascending( @pytest.mark.parametrize("nelem", [1, 100]) def test_series_nlargest_nelem(nelem): - np.random.seed(0) - elems = np.random.random(nelem) + rng = np.random.default_rng(seed=0) + elems = rng.random(nelem) gds = Series(elems).nlargest(nelem) pds = pd.Series(elems).nlargest(nelem) @@ -308,11 +308,14 @@ def test_series_nlargest_nelem(nelem): @pytest.mark.parametrize("keep", [True, False]) def test_dataframe_scatter_by_map(map_size, nelem, keep): strlist = ["dog", "cat", "fish", "bird", "pig", "fox", "cow", "goat"] - np.random.seed(0) - df = DataFrame() - df["a"] = np.random.choice(strlist[:map_size], nelem) - df["b"] = np.random.uniform(low=0, high=map_size, size=nelem) - df["c"] = np.random.randint(map_size, size=nelem) + rng = np.random.default_rng(seed=0) + df = DataFrame( + { + "a": rng.choice(strlist[:map_size], nelem), + "b": rng.uniform(low=0, high=map_size, size=nelem), + "c": rng.integers(map_size, size=nelem), + } + ) df["d"] = df["a"].astype("category") def _check_scatter_by_map(dfs, col): @@ -381,10 +384,10 @@ def _check_scatter_by_map(dfs, col): "kind", ["quicksort", "mergesort", "heapsort", "stable"] ) def test_dataframe_sort_values_kind(nelem, dtype, kind): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() - df["a"] = aa = (100 * 
np.random.random(nelem)).astype(dtype) - df["b"] = bb = (100 * np.random.random(nelem)).astype(dtype) + df["a"] = aa = (100 * rng.random(nelem)).astype(dtype) + df["b"] = bb = (100 * rng.random(nelem)).astype(dtype) with expect_warning_if(kind != "quicksort", UserWarning): sorted_df = df.sort_values(by="a", kind=kind) # Check diff --git a/python/cudf/cudf/tests/test_sparse_df.py b/python/cudf/cudf/tests/test_sparse_df.py index 3248e7f72c0..8b68ae6480b 100644 --- a/python/cudf/cudf/tests/test_sparse_df.py +++ b/python/cudf/cudf/tests/test_sparse_df.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. import numpy as np @@ -6,7 +6,8 @@ def test_to_dense_array(): - data = np.random.random(8) + rng = np.random.default_rng(seed=0) + data = rng.random(8) mask = np.asarray([0b11010110]).astype(np.byte) sr = Series.from_masked_array(data=data, mask=mask, null_count=3) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index f952cea07f8..27de0ed42e5 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -24,8 +24,8 @@ @pytest.mark.parametrize("dtype", params_dtypes) @pytest.mark.parametrize("skipna", [True, False]) def test_series_reductions(method, dtype, skipna): - np.random.seed(0) - arr = np.random.random(100) + rng = np.random.default_rng(seed=0) + arr = rng.random(100) if np.issubdtype(dtype, np.integer): arr *= 100 mask = arr > 10 @@ -56,8 +56,8 @@ def call_test(sr, skipna): def test_series_reductions_concurrency(method): e = ThreadPoolExecutor(10) - np.random.seed(0) - srs = [cudf.Series(np.random.random(10000)) for _ in range(1)] + rng = np.random.default_rng(seed=0) + srs = [cudf.Series(rng.random(10000)) for _ in range(1)] def call_test(sr): fn = getattr(sr, method) @@ -74,8 +74,8 @@ def f(sr): @pytest.mark.parametrize("ddof", range(3)) def test_series_std(ddof): - np.random.seed(0) - arr = np.random.random(100) - 0.5 + rng = np.random.default_rng(seed=0) + arr = rng.random(100) - 0.5 sr = cudf.Series(arr) pd = sr.to_pandas() got = sr.std(ddof=ddof) @@ -84,8 +84,9 @@ def test_series_std(ddof): def test_series_unique(): + rng = np.random.default_rng(seed=0) for size in [10**x for x in range(5)]: - arr = np.random.randint(low=-1, high=10, size=size) + arr = rng.integers(low=-1, high=10, size=size) mask = arr != -1 sr = cudf.Series(arr) sr[~mask] = None @@ -129,7 +130,8 @@ def test_series_nunique(nan_as_null, dropna): def test_series_scale(): - arr = pd.Series(np.random.randint(low=-10, high=10, size=100)) + rng = np.random.default_rng(seed=0) + arr = pd.Series(rng.integers(low=-10, high=10, size=100)) sr = cudf.Series(arr) vmin = arr.min() @@ -229,8 +231,8 @@ def test_misc_quantiles(data, q): @pytest.mark.parametrize( "data", [ - {"data": np.random.normal(-100, 100, 1000)}, - {"data": np.random.randint(-50, 50, 1000)}, + {"data": np.random.default_rng(seed=0).normal(-100, 100, 1000)}, + {"data": np.random.default_rng(seed=0).integers(-50, 50, 1000)}, {"data": (np.zeros(100))}, {"data": np.repeat(np.nan, 100)}, {"data": np.array([1.123, 2.343, np.nan, 0.0])}, @@ -280,8 +282,8 @@ def test_kurt_skew_error(op): @pytest.mark.parametrize( "data", [ - cudf.Series(np.random.normal(-100, 100, 1000)), - cudf.Series(np.random.randint(-50, 50, 1000)), + cudf.Series(np.random.default_rng(seed=0).normal(-100, 100, 1000)), + cudf.Series(np.random.default_rng(seed=0).integers(-50, 50, 1000)), cudf.Series(np.zeros(100)), cudf.Series(np.repeat(np.nan, 100)), 
cudf.Series(np.array([1.123, 2.343, np.nan, 0.0])), @@ -311,8 +313,8 @@ def test_skew_series(data, null_flag, numeric_only): @pytest.mark.parametrize("dtype", params_dtypes) @pytest.mark.parametrize("num_na", [0, 1, 50, 99, 100]) def test_series_median(dtype, num_na): - np.random.seed(0) - arr = np.random.random(100) + rng = np.random.default_rng(seed=0) + arr = rng.random(100) if np.issubdtype(dtype, np.integer): arr *= 100 mask = np.arange(100) >= num_na @@ -344,8 +346,8 @@ def test_series_median(dtype, num_na): @pytest.mark.parametrize( "data", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), np.array([1.123, 2.343, np.nan, 0.0]), np.array([-2, 3.75, 6, None, None, None, -8.5, None, 4.2]), @@ -379,8 +381,8 @@ def test_series_pct_change(data, periods, fill_method): @pytest.mark.parametrize( "data1", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), @@ -393,8 +395,8 @@ def test_series_pct_change(data, periods, fill_method): @pytest.mark.parametrize( "data2", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), @@ -423,8 +425,8 @@ def test_cov1d(data1, data2): @pytest.mark.parametrize( "data1", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), @@ -437,8 +439,8 @@ def test_cov1d(data1, data2): @pytest.mark.parametrize( "data2", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index cc88cc79769..e25f99d7bee 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -36,6 +36,7 @@ idx_list = [None, [10, 11, 12, 13, 14]] idx_id_list = ["None_index", "Set_index"] +rng = np.random.default_rng(seed=0) def raise_builder(flags, exceptions): @@ -132,9 +133,14 @@ def test_string_get_item(ps_gs, item): np.array([False] * 5), cupy.asarray(np.array([True] * 5)), cupy.asarray(np.array([False] * 5)), - np.random.randint(0, 2, 5).astype("bool").tolist(), - np.random.randint(0, 2, 5).astype("bool"), - cupy.asarray(np.random.randint(0, 2, 5).astype("bool")), + np.random.default_rng(seed=0) + .integers(0, 2, 5) + .astype("bool") + .tolist(), + np.random.default_rng(seed=0).integers(0, 2, 5).astype("bool"), + cupy.asarray( + np.random.default_rng(seed=0).integers(0, 2, 5).astype("bool") + ), ], ) def test_string_bool_mask(ps_gs, item): @@ -1078,7 +1084,8 @@ def test_string_set_scalar(scalar): def test_string_index(): - pdf = pd.DataFrame(np.random.rand(5, 5)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(5, 5))) gdf = cudf.DataFrame.from_pandas(pdf) 
stringIndex = ["a", "b", "c", "d", "e"] pdf.index = stringIndex @@ -1899,6 +1906,26 @@ def test_string_findall(pat, flags): assert_eq(expected, actual) +@pytest.mark.parametrize( + "pat, flags, pos", + [ + ("Monkey", 0, [-1, 0, -1, -1]), + ("on", 0, [2, 1, -1, 1]), + ("bit", 0, [-1, -1, 3, -1]), + ("on$", 0, [2, -1, -1, -1]), + ("on$", re.MULTILINE, [2, -1, -1, 1]), + ("o.*k", re.DOTALL, [-1, 1, -1, 1]), + ], +) +def test_string_find_re(pat, flags, pos): + test_data = ["Lion", "Monkey", "Rabbit", "Don\nkey"] + gs = cudf.Series(test_data) + + expected = pd.Series(pos, dtype=np.int32) + actual = gs.str.find_re(pat, flags) + assert_eq(expected, actual) + + def test_string_replace_multi(): ps = pd.Series(["hello", "goodbye"]) gs = cudf.Series(["hello", "goodbye"]) diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index e91edc9eec6..899d78c999b 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -50,10 +50,14 @@ def test_struct_for_field(key, expect): assert_eq(expect, got) -@pytest.mark.parametrize("input_obj", [[{"a": 1, "b": cudf.NA, "c": 3}]]) -def test_series_construction_with_nulls(input_obj): - expect = pa.array(input_obj, from_pandas=True) - got = cudf.Series(input_obj).to_arrow() +def test_series_construction_with_nulls(): + fields = [ + pa.array([1], type=pa.int64()), + pa.array([None], type=pa.int64()), + pa.array([3], type=pa.int64()), + ] + expect = pa.StructArray.from_arrays(fields, ["a", "b", "c"]) + got = cudf.Series(expect).to_arrow() assert expect == got diff --git a/python/cudf/cudf/tests/test_transform.py b/python/cudf/cudf/tests/test_transform.py index 88938457545..1305022d7fa 100644 --- a/python/cudf/cudf/tests/test_transform.py +++ b/python/cudf/cudf/tests/test_transform.py @@ -24,8 +24,8 @@ def _generic_function(a): ) def test_apply_python_lambda(dtype, udf, testfunc): size = 500 - - lhs_arr = np.random.random(size).astype(dtype) + rng = np.random.default_rng(seed=0) + lhs_arr = rng.random(size).astype(dtype) lhs_ser = Series(lhs_arr) out_ser = lhs_ser.apply(udf) diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index 5f5d79c1dce..b714beb0069 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -17,7 +17,8 @@ @pytest.mark.parametrize("dtype", utils.NUMERIC_TYPES) def test_series_abs(dtype): - arr = (np.random.random(1000) * 100).astype(dtype) + rng = np.random.default_rng(seed=0) + arr = (rng.random(1000) * 100).astype(dtype) sr = Series(arr) np.testing.assert_equal(sr.abs().to_numpy(), np.abs(arr)) np.testing.assert_equal(abs(sr).to_numpy(), abs(arr)) @@ -25,22 +26,24 @@ def test_series_abs(dtype): @pytest.mark.parametrize("dtype", utils.INTEGER_TYPES) def test_series_invert(dtype): - arr = (np.random.random(1000) * 100).astype(dtype) + rng = np.random.default_rng(seed=0) + arr = (rng.random(1000) * 100).astype(dtype) sr = Series(arr) np.testing.assert_equal((~sr).to_numpy(), np.invert(arr)) np.testing.assert_equal((~sr).to_numpy(), ~arr) def test_series_neg(): - arr = np.random.random(100) * 100 + rng = np.random.default_rng(seed=0) + arr = rng.random(100) * 100 sr = Series(arr) np.testing.assert_equal((-sr).to_numpy(), -arr) @pytest.mark.parametrize("mth", ["min", "max", "sum", "product"]) def test_series_pandas_methods(mth): - np.random.seed(0) - arr = (1 + np.random.random(5) * 100).astype(np.int64) + rng = np.random.default_rng(seed=0) + arr = (1 + rng.random(5) * 100).astype(np.int64) sr = 
Series(arr) psr = pd.Series(arr) np.testing.assert_equal(getattr(sr, mth)(), getattr(psr, mth)()) diff --git a/python/cudf/cudf/tests/test_unique.py b/python/cudf/cudf/tests/test_unique.py index 699b3340521..9a1c3b213b8 100644 --- a/python/cudf/cudf/tests/test_unique.py +++ b/python/cudf/cudf/tests/test_unique.py @@ -12,9 +12,9 @@ @pytest.fixture def df(): df = cudf.DataFrame() - np.random.seed(0) + rng = np.random.default_rng(seed=0) - arr = np.random.randint(2, size=10, dtype=np.int64) + arr = rng.integers(2, size=10, dtype=np.int64) df["foo"] = arr df["bar"] = cudf.Series([pd.Timestamp(x) for x in arr]) diff --git a/python/cudf/cudf/utils/hash_vocab_utils.py b/python/cudf/cudf/utils/hash_vocab_utils.py index babe4be2715..896a3809c67 100644 --- a/python/cudf/cudf/utils/hash_vocab_utils.py +++ b/python/cudf/cudf/utils/hash_vocab_utils.py @@ -69,10 +69,10 @@ def _get_space_util(bins, init_bins): return sum(_new_bin_length(len(b)) for b in bins) + 2 * init_bins -def _pick_initial_a_b(data, max_constant, init_bins): +def _pick_initial_a_b(data, max_constant, init_bins, rng): while True: - a = np.random.randint(2**12, 2**15) - b = np.random.randint(2**12, 2**15) + a = rng.integers(2**12, 2**15) + b = rng.integers(2**12, 2**15) bins = _make_bins(data, init_bins, a, b) score = _get_space_util(bins, init_bins) / len(data) @@ -86,18 +86,18 @@ def _pick_initial_a_b(data, max_constant, init_bins): return bins, a, b -def _find_hash_for_internal(hash_bin): +def _find_hash_for_internal(hash_bin, rng): if not hash_bin: return [[], 0, 0] new_length = _new_bin_length(len(hash_bin)) while True: - a = np.random.randint( + a = rng.integers( A_LBOUND_SECOND_LEVEL_HASH, A_HBOUND_SECOND_LEVEL_HASH, ) - b = np.random.randint( + b = rng.integers( B_LBOUND_SECOND_LEVEL_HASH, B_HBOUND_SECOND_LEVEL_HASH ) bins = _make_bins(hash_bin, new_length, a, b) @@ -108,11 +108,11 @@ def _find_hash_for_internal(hash_bin): return bins, a, b -def _perfect_hash(integers, max_constant): +def _perfect_hash(integers, max_constant, rng): num_top_level_bins = len(integers) // 4 init_bins, init_a, init_b = _pick_initial_a_b( - integers, max_constant, num_top_level_bins + integers, max_constant, num_top_level_bins, rng ) flattened_bins = [] @@ -127,7 +127,7 @@ def _perfect_hash(integers, max_constant): for i, b in enumerate(init_bins): if i % 500 == 0: print(f"Processing bin {i} / {len(init_bins)} of size = {len(b)}") - internal_table, coeff_a, coeff_b = _find_hash_for_internal(b) + internal_table, coeff_a, coeff_b = _find_hash_for_internal(b, rng) bin_length = len(internal_table) max_bin_length = max(bin_length, max_bin_length) internal_table_coeffs[i] = ( @@ -245,7 +245,7 @@ def hash_vocab( """ Write the vocab vocabulary hashtable to the output_path """ - np.random.seed(1243342) + rng = np.random.default_rng(seed=1243342) vocab = _load_vocab_dict(vocab_path) keys = list(map(_sdbm_hash, vocab.keys())) @@ -264,7 +264,7 @@ def hash_vocab( hash_table, inner_table_coeffs, offsets_into_ht, - ) = _perfect_hash(keys, 10) + ) = _perfect_hash(keys, 10, rng) _pack_keys_and_values(hash_table, hashed_vocab) _store_func( diff --git a/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb b/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb index c7d39b78810..94904fd83d4 100644 --- a/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb +++ b/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb @@ -18,13 +18,13 @@ "import numpy as np\n", "import pandas as pd\n", "\n", - "np.random.seed(0)\n", + "rng = 
np.random.default_rng(seed=0)\n", "\n", "num_rows = 25_000_000\n", "num_columns = 12\n", "\n", "# Create a DataFrame with random data\n", - "df = pd.DataFrame(np.random.randint(0, 100, size=(num_rows, num_columns)),\n", + "df = pd.DataFrame(rng.integers(0, 100, size=(num_rows, num_columns)),\n", " columns=[f'Column_{i}' for i in range(1, num_columns + 1)])" ] }, diff --git a/python/cudf/cudf_pandas_tests/pytest.ini b/python/cudf/cudf_pandas_tests/pytest.ini new file mode 100644 index 00000000000..46e2448ea24 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/pytest.ini @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +# Note: this config file overrides the default "cudf" test config in +# ../pyproject.toml. We do so deliberately because we have different +# treatment of markers and warnings. +[pytest] +addopts = --tb=native --strict-config --strict-markers +empty_parameter_set_mark = fail_at_collect +xfail_strict = true diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 2bbed40e34e..7aefdc386bb 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import collections +import contextlib import copy import datetime import operator @@ -21,10 +22,15 @@ import pyarrow as pa import pytest from nbconvert.preprocessors import ExecutePreprocessor -from numba import NumbaDeprecationWarning, vectorize +from numba import ( + NumbaDeprecationWarning, + __version__ as numba_version, + vectorize, +) +from packaging import version from pytz import utc -from cudf.core._compat import PANDAS_GE_220 +from cudf.core._compat import PANDAS_GE_210, PANDAS_GE_220, PANDAS_VERSION from cudf.pandas import LOADED, Profiler from cudf.pandas.fast_slow_proxy import ( ProxyFallbackError, @@ -52,8 +58,6 @@ get_calendar, ) -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION # Accelerated pandas has the real pandas and cudf modules as attributes pd = xpd._fsproxy_slow cudf = xpd._fsproxy_fast @@ -622,10 +626,6 @@ def test_array_function_series_fallback(series): tm.assert_equal(expect, got) -@pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) def test_timedeltaproperties(series): psr, sr = series psr, sr = psr.astype("timedelta64[ns]"), sr.astype("timedelta64[ns]") @@ -685,10 +685,6 @@ def test_maintain_container_subclasses(multiindex): assert isinstance(got, xpd.core.indexes.frozen.FrozenList) -@pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas due to unsupported boxcar window type", -) def test_rolling_win_type(): pdf = pd.DataFrame(range(5)) df = xpd.DataFrame(range(5)) result = df[0].rolling(2, win_type="boxcar").mean() expected = pdf[0].rolling(2, win_type="boxcar").mean() tm.assert_equal(result, expected) -@pytest.mark.skip( - reason="Requires Numba 0.59 to fix segfaults on ARM. See https://github.com/numba/llvmlite/pull/1009" +@pytest.mark.skipif( + version.parse(numba_version) < version.parse("0.59"), + reason="Requires Numba 0.59 to fix segfaults on ARM. 
@@ -1135,8 +1142,8 @@ def test_private_method_result_wrapped(): def test_numpy_var(): - np.random.seed(42) - data = np.random.rand(1000) + rng = np.random.default_rng(seed=42) + data = rng.random(1000) psr = pd.Series(data) sr = xpd.Series(data) @@ -1305,7 +1312,7 @@ def max_times_two(self): @pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION < version.parse("2.1"), reason="DatetimeArray.__floordiv__ missing in pandas-2.0.0", ) def test_floordiv_array_vs_df(): @@ -1580,7 +1587,7 @@ def test_numpy_cupy_flatiter(series): @pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION < version.parse("2.1"), reason="pyarrow_numpy storage type was not supported in pandas-2.0.0", ) def test_arrow_string_arrays(): diff --git a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py index a75a20a4681..63fd9601fc1 100644 --- a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py +++ b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py @@ -387,7 +387,8 @@ def test_dir_bound_method( ): """This test will fail because dir for bound methods is currently incorrect, but we have no way to fix it without materializing the slow - type, which is unnecessarily expensive.
+ """ Fast, FastIntermediate = fast_and_intermediate_with_doc Slow, SlowIntermediate = slow_and_intermediate_with_doc diff --git a/python/cudf/cudf_pandas_tests/test_profiler.py b/python/cudf/cudf_pandas_tests/test_profiler.py index 5b7bde06d1d..a5c29bd93a2 100644 --- a/python/cudf/cudf_pandas_tests/test_profiler.py +++ b/python/cudf/cudf_pandas_tests/test_profiler.py @@ -23,12 +23,12 @@ reason="function names change across versions of pandas, so making sure it only runs on latest version of pandas", ) def test_profiler(): - np.random.seed(42) + rng = np.random.default_rng(seed=42) with Profiler() as profiler: df = pd.DataFrame( { - "idx": np.random.randint(0, 10, 1000), - "data": np.random.rand(1000), + "idx": rng.integers(0, 10, 1000), + "data": rng.random(1000), } ) sums = df.groupby("idx").sum() @@ -58,7 +58,7 @@ def test_profiler(): calls = [ "pd.DataFrame", "", - "np.random.randint", + "rng.integers", "np.random.rand", 'df.groupby("idx").sum', 'df.sum()["data"]', diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py index 892d0886596..27eaff87ba0 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py @@ -102,7 +102,7 @@ def test_random_forest(binary_classification_data): def test_clustering(): rng = np.random.default_rng(42) nsamps = 300 - X = rng.random((nsamps, 2)) + X = rng.random(size=(nsamps, 2)) data = pd.DataFrame(X, columns=["x", "y"]) kmeans = KMeans(n_clusters=3, random_state=42) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py index 37e3cc34856..0777d982ac2 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py @@ -31,17 +31,17 @@ def dask_client(): def test_1d_distributed(dask_client): - np.random.seed(42) - ts = pd.Series(np.random.rand(100)) + rng = np.random.default_rng(seed=42) + ts = pd.Series(rng.random(100)) m = 10 return stumpy.stumped(dask_client, ts, m) def test_multidimensional_distributed_timeseries(dask_client): - np.random.seed(42) + rng = np.random.default_rng(seed=42) # Each row represents data from a different dimension while each column represents # data from the same dimension - your_time_series = np.random.rand(3, 1000) + your_time_series = rng.random(3, 1000) # Approximately, how many data points might be found in a pattern window_size = 50 diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 605f9be5a49..80201dd84db 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -28,9 +28,10 @@ dependencies = [ "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", "packaging", - "pandas>=2.0,<2.2.3dev0", + "pandas>=2.0,<2.2.4dev0", "ptxcompiler", - "pyarrow>=14.0.0,<18.0.0a0", + "pyarrow>=14.0.0,<18.0.0a0,!=17.0.0; platform_machine=='aarch64'", + "pyarrow>=14.0.0,<18.0.0a0; platform_machine=='x86_64'", "pylibcudf==24.12.*,>=0.0.0a0", "rich", "rmm==24.12.*,>=0.0.0a0", @@ -80,49 +81,26 @@ cudf-pandas-tests = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true 
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 605f9be5a49..80201dd84db 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -28,9 +28,10 @@ dependencies = [ "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", "packaging", - "pandas>=2.0,<2.2.3dev0", + "pandas>=2.0,<2.2.4dev0", "ptxcompiler", - "pyarrow>=14.0.0,<18.0.0a0", + "pyarrow>=14.0.0,<18.0.0a0,!=17.0.0; platform_machine=='aarch64'", + "pyarrow>=14.0.0,<18.0.0a0; platform_machine=='x86_64'", "pylibcudf==24.12.*,>=0.0.0a0", "rich", "rmm==24.12.*,>=0.0.0a0", @@ -80,49 +81,26 @@ cudf-pandas-tests = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", +[tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" +filterwarnings = [ + "error", + "ignore:::.*xdist.*", + "ignore:::.*pytest.*", + # some third-party dependencies (e.g. 'boto3') still using datetime.datetime.utcnow() + "ignore:.*datetime.*utcnow.*scheduled for removal.*:DeprecationWarning:botocore", + # Deprecation warning from Pyarrow Table.to_pandas() with pandas-2.2+ + "ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning", + # PerformanceWarning from cupy warming up the JIT cache + "ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning", + # Ignore numba PEP 456 warning specific to arm machines + "ignore:FNV hashing is not implemented in Numba.*:UserWarning" ] -known_rapids = [ - "rmm", - "pylibcudf" -] -known_first_party = [ - "cudf", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", +markers = [ + "spilling: mark benchmark a good candidate to run with `CUDF_SPILL=ON`" ] +xfail_strict = true [tool.rapids-build-backend] build-backend = "scikit_build_core.build" @@ -152,3 +130,18 @@ wheel.packages = ["cudf"] provider = "scikit_build_core.metadata.regex" input = "cudf/VERSION" regex = "(?P<value>.*)" + +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["cudf"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm", "pylibcudf"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"]
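
For reference, the section-order configured above reproduces the old grouping with two custom sections sitting between third-party and first-party imports. In a hypothetical cudf module, the sorted import blocks would come out in this order:

from __future__ import annotations  # "future"

import os  # "standard-library"

import numpy as np  # "third-party"

import dask  # "dask" (custom section)

import rmm  # "rapids" (custom section)

import cudf  # "first-party"
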
--strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ "error" ] +xfail_strict = true [tool.scikit-build] build-dir = "build/{wheel_tag}" diff --git a/python/cudf_polars/cudf_polars/containers/__init__.py b/python/cudf_polars/cudf_polars/containers/__init__.py index 06bb08953f1..3b1eff4a0d0 100644 --- a/python/cudf_polars/cudf_polars/containers/__init__.py +++ b/python/cudf_polars/cudf_polars/containers/__init__.py @@ -5,7 +5,7 @@ from __future__ import annotations -__all__: list[str] = ["DataFrame", "Column", "NamedColumn"] +__all__: list[str] = ["DataFrame", "Column"] -from cudf_polars.containers.column import Column, NamedColumn +from cudf_polars.containers.column import Column from cudf_polars.containers.dataframe import DataFrame diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 3fe3e5557cb..00186098e54 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -15,7 +15,7 @@ import polars as pl -__all__: list[str] = ["Column", "NamedColumn"] +__all__: list[str] = ["Column"] class Column: @@ -26,6 +26,9 @@ class Column: order: plc.types.Order null_order: plc.types.NullOrder is_scalar: bool + # Optional name, only ever set by evaluation of NamedExpr nodes + # The internal evaluation should not care about the name. + name: str | None def __init__( self, @@ -34,14 +37,12 @@ def __init__( is_sorted: plc.types.Sorted = plc.types.Sorted.NO, order: plc.types.Order = plc.types.Order.ASCENDING, null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE, + name: str | None = None, ): self.obj = column self.is_scalar = self.obj.size() == 1 - if self.obj.size() <= 1: - is_sorted = plc.types.Sorted.YES - self.is_sorted = is_sorted - self.order = order - self.null_order = null_order + self.name = name + self.set_sorted(is_sorted=is_sorted, order=order, null_order=null_order) @functools.cached_property def obj_scalar(self) -> plc.Scalar: @@ -63,9 +64,26 @@ def obj_scalar(self) -> plc.Scalar: ) return plc.copying.get_element(self.obj, 0) + def rename(self, name: str | None, /) -> Self: + """ + Return a shallow copy with a new name. + + Parameters + ---------- + name + New name + + Returns + ------- + Shallow copy of self with new name set. + """ + new = self.copy() + new.name = name + return new + def sorted_like(self, like: Column, /) -> Self: """ - Copy sortedness properties from a column onto self. + Return a shallow copy with sortedness from like. Parameters ---------- @@ -74,20 +92,23 @@ def sorted_like(self, like: Column, /) -> Self: Returns ------- - Self with metadata set. + Shallow copy of self with metadata set. See Also -------- set_sorted, copy_metadata """ - return self.set_sorted( - is_sorted=like.is_sorted, order=like.order, null_order=like.null_order + return type(self)( + self.obj, + name=self.name, + is_sorted=like.is_sorted, + order=like.order, + null_order=like.null_order, ) - # TODO: Return Column once #16272 is fixed. - def astype(self, dtype: plc.DataType) -> plc.Column: + def astype(self, dtype: plc.DataType) -> Column: """ - Return the backing column as the requested dtype. + Cast the column to as the requested dtype. Parameters ---------- @@ -109,8 +130,10 @@ def astype(self, dtype: plc.DataType) -> plc.Column: the current one. 
""" if self.obj.type() != dtype: - return plc.unary.cast(self.obj, dtype) - return self.obj + return Column(plc.unary.cast(self.obj, dtype), name=self.name).sorted_like( + self + ) + return self def copy_metadata(self, from_: pl.Series, /) -> Self: """ @@ -129,6 +152,7 @@ def copy_metadata(self, from_: pl.Series, /) -> Self: -------- set_sorted, sorted_like """ + self.name = from_.name if len(from_) <= 1: return self ascending = from_.flags["SORTED_ASC"] @@ -192,6 +216,7 @@ def copy(self) -> Self: is_sorted=self.is_sorted, order=self.order, null_order=self.null_order, + name=self.name, ) def mask_nans(self) -> Self: @@ -217,58 +242,3 @@ def nan_count(self) -> int: ) ).as_py() return 0 - - -class NamedColumn(Column): - """A column with a name.""" - - name: str - - def __init__( - self, - column: plc.Column, - name: str, - *, - is_sorted: plc.types.Sorted = plc.types.Sorted.NO, - order: plc.types.Order = plc.types.Order.ASCENDING, - null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE, - ) -> None: - super().__init__( - column, is_sorted=is_sorted, order=order, null_order=null_order - ) - self.name = name - - def copy(self, *, new_name: str | None = None) -> Self: - """ - A shallow copy of the column. - - Parameters - ---------- - new_name - Optional new name for the copied column. - - Returns - ------- - New column sharing data with self. - """ - return type(self)( - self.obj, - self.name if new_name is None else new_name, - is_sorted=self.is_sorted, - order=self.order, - null_order=self.null_order, - ) - - def mask_nans(self) -> Self: - """Return a shallow copy of self with nans masked out.""" - # Annoying, the inheritance is not right (can't call the - # super-type mask_nans), but will sort that by refactoring - # later. - if plc.traits.is_floating_point(self.obj.type()): - old_count = self.obj.null_count() - mask, new_count = plc.transform.nans_to_nulls(self.obj) - result = type(self)(self.obj.with_mask(mask, new_count), self.name) - if old_count == new_count: - return result.sorted_like(self) - return result - return self.copy() diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index f3e3862d0cc..2c195f6637c 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -5,43 +5,50 @@ from __future__ import annotations -import itertools from functools import cached_property -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import pyarrow as pa import pylibcudf as plc import polars as pl -from cudf_polars.containers.column import NamedColumn +from cudf_polars.containers import Column from cudf_polars.utils import dtypes if TYPE_CHECKING: - from collections.abc import Mapping, Sequence, Set + from collections.abc import Iterable, Mapping, Sequence, Set from typing_extensions import Self - from cudf_polars.containers import Column - __all__: list[str] = ["DataFrame"] +# Pacify the type checker. DataFrame init asserts that all the columns +# have a string name, so let's narrow the type. 
+class NamedColumn(Column): + name: str + + class DataFrame: """A representation of a dataframe.""" - columns: list[NamedColumn] + column_map: dict[str, Column] table: plc.Table + columns: list[NamedColumn] - def __init__(self, columns: Sequence[NamedColumn]) -> None: - self.columns = list(columns) - self._column_map = {c.name: c for c in self.columns} - self.table = plc.Table([c.obj for c in columns]) + def __init__(self, columns: Iterable[Column]) -> None: + columns = list(columns) + if any(c.name is None for c in columns): + raise ValueError("All columns must have a name") + self.columns = [cast(NamedColumn, c) for c in columns] + self.column_map = {c.name: c for c in self.columns} + self.table = plc.Table([c.obj for c in self.columns]) def copy(self) -> Self: """Return a shallow copy of self.""" - return type(self)([c.copy() for c in self.columns]) + return type(self)(c.copy() for c in self.columns) def to_polars(self) -> pl.DataFrame: """Convert to a polars DataFrame.""" @@ -51,42 +58,38 @@ def to_polars(self) -> pl.DataFrame: # https://github.com/pola-rs/polars/issues/11632 # To guarantee we produce correct names, we therefore # serialise with names we control and rename with that map. - name_map = {f"column_{i}": c.name for i, c in enumerate(self.columns)} + name_map = {f"column_{i}": name for i, name in enumerate(self.column_map)} table: pa.Table = plc.interop.to_arrow( self.table, [plc.interop.ColumnMetadata(name=name) for name in name_map], ) df: pl.DataFrame = pl.from_arrow(table) return df.rename(name_map).with_columns( - *( - pl.col(c.name).set_sorted( - descending=c.order == plc.types.Order.DESCENDING - ) - if c.is_sorted - else pl.col(c.name) - for c in self.columns - ) + pl.col(c.name).set_sorted(descending=c.order == plc.types.Order.DESCENDING) + if c.is_sorted + else pl.col(c.name) + for c in self.columns ) @cached_property def column_names_set(self) -> frozenset[str]: """Return the column names as a set.""" - return frozenset(c.name for c in self.columns) + return frozenset(self.column_map) @cached_property def column_names(self) -> list[str]: """Return a list of the column names.""" - return [c.name for c in self.columns] + return list(self.column_map) @cached_property def num_columns(self) -> int: """Number of columns.""" - return len(self.columns) + return len(self.column_map) @cached_property def num_rows(self) -> int: """Number of rows.""" - return 0 if len(self.columns) == 0 else self.table.num_rows() + return self.table.num_rows() if self.column_map else 0 @classmethod def from_polars(cls, df: pl.DataFrame) -> Self: @@ -111,12 +114,8 @@ def from_polars(cls, df: pl.DataFrame) -> Self: # No-op if the schema is unchanged. d_table = plc.interop.from_arrow(table.cast(schema)) return cls( - [ - NamedColumn(column, h_col.name).copy_metadata(h_col) - for column, h_col in zip( - d_table.columns(), df.iter_columns(), strict=True - ) - ] + Column(column).copy_metadata(h_col) + for column, h_col in zip(d_table.columns(), df.iter_columns(), strict=True) ) @classmethod @@ -144,17 +143,14 @@ def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: if table.num_columns() != len(names): raise ValueError("Mismatching name and table length.") return cls( - [ - NamedColumn(c, name) - for c, name in zip(table.columns(), names, strict=True) - ] + Column(c, name=name) for c, name in zip(table.columns(), names, strict=True) ) def sorted_like( self, like: DataFrame, /, *, subset: Set[str] | None = None ) -> Self: """ - Copy sortedness from a dataframe onto self. 
+ Return a shallow copy with sortedness copied from like. Parameters ---------- @@ -165,7 +161,7 @@ def sorted_like( Returns ------- - Self with metadata set. + Shallow copy of self with metadata set. Raises ------ @@ -175,13 +171,12 @@ def sorted_like( if like.column_names != self.column_names: raise ValueError("Can only copy from identically named frame") subset = self.column_names_set if subset is None else subset - self.columns = [ + return type(self)( c.sorted_like(other) if c.name in subset else c for c, other in zip(self.columns, like.columns, strict=True) - ] - return self + ) - def with_columns(self, columns: Sequence[NamedColumn]) -> Self: + def with_columns(self, columns: Iterable[Column], *, replace_only=False) -> Self: """ Return a new dataframe with extra columns. @@ -189,6 +184,8 @@ def with_columns(self, columns: Sequence[NamedColumn]) -> Self: ---------- columns Columns to add + replace_only + If true, then only replacements are allowed (matching by name). Returns ------- @@ -196,36 +193,30 @@ def with_columns(self, columns: Sequence[NamedColumn]) -> Self: Notes ----- - If column names overlap, newer names replace older ones. + If column names overlap, newer names replace older ones, and + appear in the same order as the original frame. """ - columns = list( - {c.name: c for c in itertools.chain(self.columns, columns)}.values() - ) - return type(self)(columns) + new = {c.name: c for c in columns} + if replace_only and not self.column_names_set.issuperset(new.keys()): + raise ValueError("Cannot replace with non-existing names") + return type(self)((self.column_map | new).values()) def discard_columns(self, names: Set[str]) -> Self: """Drop columns by name.""" - return type(self)([c for c in self.columns if c.name not in names]) + return type(self)(column for column in self.columns if column.name not in names) def select(self, names: Sequence[str]) -> Self: """Select columns by name returning DataFrame.""" - want = set(names) - if not want.issubset(self.column_names_set): - raise ValueError("Can't select missing names") - return type(self)([self._column_map[name] for name in names]) - - def replace_columns(self, *columns: NamedColumn) -> Self: - """Return a new dataframe with columns replaced by name.""" - new = {c.name: c for c in columns} - if not set(new).issubset(self.column_names_set): - raise ValueError("Cannot replace with non-existing names") - return type(self)([new.get(c.name, c) for c in self.columns]) + try: + return type(self)(self.column_map[name] for name in names) + except KeyError as e: + raise ValueError("Can't select missing names") from e def rename_columns(self, mapping: Mapping[str, str]) -> Self: """Rename some columns.""" - return type(self)([c.copy(new_name=mapping.get(c.name)) for c in self.columns]) + return type(self)(c.rename(mapping.get(c.name, c.name)) for c in self.columns) - def select_columns(self, names: Set[str]) -> list[NamedColumn]: + def select_columns(self, names: Set[str]) -> list[Column]: """Select columns by name.""" return [c for c in self.columns if c.name in names] diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index c401e5a2f17..e748ec16f14 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -15,33 +15,30 @@ from __future__ import annotations -import enum -from enum import IntEnum -from functools import partial, reduce -from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple - -import pyarrow as pa -import 
pyarrow.compute as pc -import pylibcudf as plc - -from polars.exceptions import InvalidOperationError -from polars.polars import _expr_nodes as pl_expr - -from cudf_polars.containers import Column, NamedColumn -from cudf_polars.utils import dtypes, sorting - -if TYPE_CHECKING: - from collections.abc import Mapping, Sequence - - import polars as pl - import polars.type_aliases as pl_types - - from cudf_polars.containers import DataFrame +from cudf_polars.dsl.expressions.aggregation import Agg +from cudf_polars.dsl.expressions.base import ( + AggInfo, + Col, + Expr, + NamedExpr, +) +from cudf_polars.dsl.expressions.binaryop import BinOp +from cudf_polars.dsl.expressions.boolean import BooleanFunction +from cudf_polars.dsl.expressions.datetime import TemporalFunction +from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn +from cudf_polars.dsl.expressions.rolling import GroupedRollingWindow, RollingWindow +from cudf_polars.dsl.expressions.selection import Filter, Gather +from cudf_polars.dsl.expressions.sorting import Sort, SortBy +from cudf_polars.dsl.expressions.string import StringFunction +from cudf_polars.dsl.expressions.ternary import Ternary +from cudf_polars.dsl.expressions.unary import Cast, Len, UnaryFunction __all__ = [ "Expr", "NamedExpr", "Literal", + "LiteralColumn", + "Len", "Col", "BooleanFunction", "StringFunction", @@ -54,1782 +51,8 @@ "GroupedRollingWindow", "Cast", "Agg", + "AggInfo", "Ternary", "BinOp", + "UnaryFunction", ] - - -class ExecutionContext(IntEnum): - FRAME = enum.auto() - GROUPBY = enum.auto() - ROLLING = enum.auto() - - -class AggInfo(NamedTuple): - requests: list[tuple[Expr | None, plc.aggregation.Aggregation, Expr]] - - -class Expr: - """ - An abstract expression object. - - This contains a (potentially empty) tuple of child expressions, - along with non-child data. For uniform reconstruction and - implementation of hashing and equality schemes, child classes need - to provide a certain amount of metadata when they are defined. - Specifically, the ``_non_child`` attribute must list, in-order, - the names of the slots that are passed to the constructor. The - constructor must take arguments in the order ``(*_non_child, - *children).`` - """ - - __slots__ = ("dtype", "_hash_value", "_repr_value") - dtype: plc.DataType - """Data type of the expression.""" - _hash_value: int - """Caching slot for the hash of the expression.""" - _repr_value: str - """Caching slot for repr of the expression.""" - children: tuple[Expr, ...] = () - """Children of the expression.""" - _non_child: ClassVar[tuple[str, ...]] = ("dtype",) - """Names of non-child data (not Exprs) for reconstruction.""" - - # Constructor must take arguments in order (*_non_child, *children) - def __init__(self, dtype: plc.DataType) -> None: - self.dtype = dtype - - def _ctor_arguments(self, children: Sequence[Expr]) -> Sequence: - return (*(getattr(self, attr) for attr in self._non_child), *children) - - def get_hash(self) -> int: - """ - Return the hash of this expr. - - Override this in subclasses, rather than __hash__. - - Returns - ------- - The integer hash value. - """ - return hash((type(self), self._ctor_arguments(self.children))) - - def __hash__(self) -> int: - """Hash of an expression with caching.""" - try: - return self._hash_value - except AttributeError: - self._hash_value = self.get_hash() - return self._hash_value - - def is_equal(self, other: Any) -> bool: - """ - Equality of two expressions. - - Override this in subclasses, rather than __eq__. 
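
The expression machinery being removed here (it now lives in the cudf_polars.dsl.expressions modules imported above) caches each node's hash and funnels equality through the get_hash/is_equal hooks so that repeated dictionary lookups over expression trees stay cheap. A stripped-down sketch of that caching scheme, using generic names rather than the real classes:

from typing import Any, ClassVar


class Node:
    """Minimal analogue of the hash-caching scheme (illustrative only)."""

    __slots__ = ("dtype", "children", "_hash_value")
    _non_child: ClassVar[tuple[str, ...]] = ("dtype",)

    def __init__(self, dtype: str, *children: "Node") -> None:
        self.dtype = dtype
        self.children = children

    def _ctor_arguments(self) -> tuple:
        return (*(getattr(self, a) for a in self._non_child), *self.children)

    def get_hash(self) -> int:
        # Subclasses override this hook, never __hash__ itself.
        return hash((type(self), self._ctor_arguments()))

    def __hash__(self) -> int:
        # Compute once, then serve the cached value from the slot.
        try:
            return self._hash_value
        except AttributeError:
            self._hash_value = self.get_hash()
            return self._hash_value

    def __eq__(self, other: Any) -> bool:
        # Cheap type/hash screen first; structural comparison only on a match.
        if type(self) is not type(other) or hash(self) != hash(other):
            return False
        return self._ctor_arguments() == other._ctor_arguments()


a = Node("int64", Node("int64"))
b = Node("int64", Node("int64"))
assert a == b and hash(a) == hash(b)  # structural equality, cached hashes

Because the cached hash is consulted before any structural comparison, deep traversals only happen when two trees genuinely collide.
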
- - Parameter - --------- - other - object to compare to - - Returns - ------- - True if the two expressions are equal, false otherwise. - """ - if type(self) is not type(other): - return False # pragma: no cover; __eq__ trips first - return self._ctor_arguments(self.children) == other._ctor_arguments( - other.children - ) - - def __eq__(self, other: Any) -> bool: - """Equality of expressions.""" - if type(self) is not type(other) or hash(self) != hash(other): - return False - else: - return self.is_equal(other) - - def __ne__(self, other: Any) -> bool: - """Inequality of expressions.""" - return not self.__eq__(other) - - def __repr__(self) -> str: - """String representation of an expression with caching.""" - try: - return self._repr_value - except AttributeError: - args = ", ".join(f"{arg!r}" for arg in self._ctor_arguments(self.children)) - self._repr_value = f"{type(self).__name__}({args})" - return self._repr_value - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """ - Evaluate this expression given a dataframe for context. - - Parameters - ---------- - df - DataFrame that will provide columns. - context - What context are we performing this evaluation in? - mapping - Substitution mapping from expressions to Columns, used to - override the evaluation of a given expression if we're - performing a simple rewritten evaluation. - - Notes - ----- - Do not call this function directly, but rather - :meth:`evaluate` which handles the mapping lookups. - - Returns - ------- - Column representing the evaluation of the expression. - - Raises - ------ - NotImplementedError - If we couldn't evaluate the expression. Ideally all these - are returned during translation to the IR, but for now we - are not perfect. - """ - raise NotImplementedError( - f"Evaluation of expression {type(self).__name__}" - ) # pragma: no cover; translation of unimplemented nodes trips first - - def evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """ - Evaluate this expression given a dataframe for context. - - Parameters - ---------- - df - DataFrame that will provide columns. - context - What context are we performing this evaluation in? - mapping - Substitution mapping from expressions to Columns, used to - override the evaluation of a given expression if we're - performing a simple rewritten evaluation. - - Notes - ----- - Individual subclasses should implement :meth:`do_evaluate`, - this method provides logic to handle lookups in the - substitution mapping. - - Returns - ------- - Column representing the evaluation of the expression. - - Raises - ------ - NotImplementedError - If we couldn't evaluate the expression. Ideally all these - are returned during translation to the IR, but for now we - are not perfect. - """ - if mapping is None: - return self.do_evaluate(df, context=context, mapping=mapping) - try: - return mapping[self] - except KeyError: - return self.do_evaluate(df, context=context, mapping=mapping) - - def collect_agg(self, *, depth: int) -> AggInfo: - """ - Collect information about aggregations in groupbys. - - Parameters - ---------- - depth - The depth of aggregating (reduction or sampling) - expressions we are currently at. - - Returns - ------- - Aggregation info describing the expression to aggregate in the - groupby. 
- - Raises - ------ - NotImplementedError - If we can't currently perform the aggregation request, for - example nested aggregations like ``a.max().min()``. - """ - raise NotImplementedError( - f"Collecting aggregation info for {type(self).__name__}" - ) # pragma: no cover; check_agg trips first - - -class NamedExpr: - # NamedExpr does not inherit from Expr since it does not appear - # when evaluating expressions themselves, only when constructing - # named return values in dataframe (IR) nodes. - __slots__ = ("name", "value") - value: Expr - name: str - - def __init__(self, name: str, value: Expr) -> None: - self.name = name - self.value = value - - def __hash__(self) -> int: - """Hash of the expression.""" - return hash((type(self), self.name, self.value)) - - def __repr__(self) -> str: - """Repr of the expression.""" - return f"NamedExpr({self.name}, {self.value})" - - def __eq__(self, other: Any) -> bool: - """Equality of two expressions.""" - return ( - type(self) is type(other) - and self.name == other.name - and self.value == other.value - ) - - def __ne__(self, other: Any) -> bool: - """Inequality of expressions.""" - return not self.__eq__(other) - - def evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> NamedColumn: - """ - Evaluate this expression given a dataframe for context. - - Parameters - ---------- - df - DataFrame providing context - context - Execution context - mapping - Substitution mapping - - Returns - ------- - NamedColumn attaching a name to an evaluated Column - - See Also - -------- - :meth:`Expr.evaluate` for details, this function just adds the - name to a column produced from an expression. - """ - obj = self.value.evaluate(df, context=context, mapping=mapping) - return NamedColumn( - obj.obj, - self.name, - is_sorted=obj.is_sorted, - order=obj.order, - null_order=obj.null_order, - ) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - return self.value.collect_agg(depth=depth) - - -class Literal(Expr): - __slots__ = ("value",) - _non_child = ("dtype", "value") - value: pa.Scalar[Any] - children: tuple[()] - - def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None: - super().__init__(dtype) - assert value.type == plc.interop.to_arrow(dtype) - self.value = value - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - # datatype of pyarrow scalar is correct by construction. - return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1)) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - return AggInfo([]) - - -class LiteralColumn(Expr): - __slots__ = ("value",) - _non_child = ("dtype", "value") - value: pa.Array[Any, Any] - children: tuple[()] - - def __init__(self, dtype: plc.DataType, value: pl.Series) -> None: - super().__init__(dtype) - data = value.to_arrow() - self.value = data.cast(dtypes.downcast_arrow_lists(data.type)) - - def get_hash(self) -> int: - """Compute a hash of the column.""" - # This is stricter than necessary, but we only need this hash - # for identity in groupby replacements so it's OK. And this - # way we avoid doing potentially expensive compute. 
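
That comment is the interesting part: hashing the literal's backing array by id() gives up hash stability between equal-but-distinct arrays in exchange for never touching the data. A small illustration of the trade-off (the CachedLiteral class is a made-up stand-in, not the patch's code):

import pyarrow as pa


class CachedLiteral:
    """Hash by object identity: O(1) and data-free, but not structural."""

    def __init__(self, value: pa.Array) -> None:
        self.value = value

    def __hash__(self) -> int:
        # id(...) is fixed for the object's lifetime; no buffers are read.
        return hash((type(self), id(self.value)))

    def __eq__(self, other: object) -> bool:
        return isinstance(other, CachedLiteral) and self.value is other.value


arr = pa.array([1, 2, 3])
assert hash(CachedLiteral(arr)) == hash(CachedLiteral(arr))  # same object
assert CachedLiteral(arr) != CachedLiteral(pa.array([1, 2, 3]))  # equal data, different identity
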
- return hash((type(self), self.dtype, id(self.value))) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - # datatype of pyarrow array is correct by construction. - return Column(plc.interop.from_arrow(self.value)) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - return AggInfo([]) - - -class Col(Expr): - __slots__ = ("name",) - _non_child = ("dtype", "name") - name: str - children: tuple[()] - - def __init__(self, dtype: plc.DataType, name: str) -> None: - self.dtype = dtype - self.name = name - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - return df._column_map[self.name] - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - return AggInfo([(self, plc.aggregation.collect_list(), self)]) - - -class Len(Expr): - children: tuple[()] - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - return Column( - plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar(df.num_rows, type=plc.interop.to_arrow(self.dtype)) - ), - 1, - ) - ) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - # TODO: polars returns a uint, not an int for count - return AggInfo( - [(None, plc.aggregation.count(plc.types.NullPolicy.INCLUDE), self)] - ) - - -class BooleanFunction(Expr): - __slots__ = ("name", "options", "children") - _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] - - def __init__( - self, - dtype: plc.DataType, - name: pl_expr.BooleanFunction, - options: tuple[Any, ...], - *children: Expr, - ) -> None: - super().__init__(dtype) - self.options = options - self.name = name - self.children = children - if self.name == pl_expr.BooleanFunction.IsIn and not all( - c.dtype == self.children[0].dtype for c in self.children - ): - # TODO: If polars IR doesn't put the casts in, we need to - # mimic the supertype promotion rules. 
- raise NotImplementedError("IsIn doesn't support supertype casting") - - @staticmethod - def _distinct( - column: Column, - *, - keep: plc.stream_compaction.DuplicateKeepOption, - source_value: plc.Scalar, - target_value: plc.Scalar, - ) -> Column: - table = plc.Table([column.obj]) - indices = plc.stream_compaction.distinct_indices( - table, - keep, - # TODO: polars doesn't expose options for these - plc.types.NullEquality.EQUAL, - plc.types.NanEquality.ALL_EQUAL, - ) - return Column( - plc.copying.scatter( - [source_value], - indices, - plc.Table([plc.Column.from_scalar(target_value, table.num_rows())]), - ).columns()[0] - ) - - _BETWEEN_OPS: ClassVar[ - dict[ - pl_types.ClosedInterval, - tuple[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator], - ] - ] = { - "none": ( - plc.binaryop.BinaryOperator.GREATER, - plc.binaryop.BinaryOperator.LESS, - ), - "left": ( - plc.binaryop.BinaryOperator.GREATER_EQUAL, - plc.binaryop.BinaryOperator.LESS, - ), - "right": ( - plc.binaryop.BinaryOperator.GREATER, - plc.binaryop.BinaryOperator.LESS_EQUAL, - ), - "both": ( - plc.binaryop.BinaryOperator.GREATER_EQUAL, - plc.binaryop.BinaryOperator.LESS_EQUAL, - ), - } - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - if self.name in ( - pl_expr.BooleanFunction.IsFinite, - pl_expr.BooleanFunction.IsInfinite, - ): - # Avoid evaluating the child if the dtype tells us it's unnecessary. - (child,) = self.children - is_finite = self.name == pl_expr.BooleanFunction.IsFinite - if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64): - value = plc.interop.from_arrow( - pa.scalar(value=is_finite, type=plc.interop.to_arrow(self.dtype)) - ) - return Column(plc.Column.from_scalar(value, df.num_rows)) - needles = child.evaluate(df, context=context, mapping=mapping) - to_search = [-float("inf"), float("inf")] - if is_finite: - # NaN is neither finite not infinite - to_search.append(float("nan")) - haystack = plc.interop.from_arrow( - pa.array( - to_search, - type=plc.interop.to_arrow(needles.obj.type()), - ) - ) - result = plc.search.contains(haystack, needles.obj) - if is_finite: - result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT) - return Column(result) - columns = [ - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ] - # Kleene logic for Any (OR) and All (AND) if ignore_nulls is - # False - if self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All): - (ignore_nulls,) = self.options - (column,) = columns - is_any = self.name == pl_expr.BooleanFunction.Any - agg = plc.aggregation.any() if is_any else plc.aggregation.all() - result = plc.reduce.reduce(column.obj, agg, self.dtype) - if not ignore_nulls and column.obj.null_count() > 0: - # Truth tables - # Any All - # | F U T | F U T - # --+------ --+------ - # F | F U T F | F F F - # U | U U T U | F U U - # T | T T T T | F U T - # - # If the input null count was non-zero, we must - # post-process the result to insert the correct value. 
- h_result = plc.interop.to_arrow(result).as_py() - if is_any and not h_result or not is_any and h_result: - # Any All - # False || Null => Null True && Null => Null - return Column(plc.Column.all_null_like(column.obj, 1)) - return Column(plc.Column.from_scalar(result, 1)) - if self.name == pl_expr.BooleanFunction.IsNull: - (column,) = columns - return Column(plc.unary.is_null(column.obj)) - elif self.name == pl_expr.BooleanFunction.IsNotNull: - (column,) = columns - return Column(plc.unary.is_valid(column.obj)) - elif self.name == pl_expr.BooleanFunction.IsNan: - (column,) = columns - return Column( - plc.unary.is_nan(column.obj).with_mask( - column.obj.null_mask(), column.obj.null_count() - ) - ) - elif self.name == pl_expr.BooleanFunction.IsNotNan: - (column,) = columns - return Column( - plc.unary.is_not_nan(column.obj).with_mask( - column.obj.null_mask(), column.obj.null_count() - ) - ) - elif self.name == pl_expr.BooleanFunction.IsFirstDistinct: - (column,) = columns - return self._distinct( - column, - keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, - source_value=plc.interop.from_arrow( - pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) - ), - target_value=plc.interop.from_arrow( - pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) - ), - ) - elif self.name == pl_expr.BooleanFunction.IsLastDistinct: - (column,) = columns - return self._distinct( - column, - keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST, - source_value=plc.interop.from_arrow( - pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) - ), - target_value=plc.interop.from_arrow( - pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) - ), - ) - elif self.name == pl_expr.BooleanFunction.IsUnique: - (column,) = columns - return self._distinct( - column, - keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, - source_value=plc.interop.from_arrow( - pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) - ), - target_value=plc.interop.from_arrow( - pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) - ), - ) - elif self.name == pl_expr.BooleanFunction.IsDuplicated: - (column,) = columns - return self._distinct( - column, - keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, - source_value=plc.interop.from_arrow( - pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) - ), - target_value=plc.interop.from_arrow( - pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) - ), - ) - elif self.name == pl_expr.BooleanFunction.AllHorizontal: - return Column( - reduce( - partial( - plc.binaryop.binary_operation, - op=plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, - output_type=self.dtype, - ), - (c.obj for c in columns), - ) - ) - elif self.name == pl_expr.BooleanFunction.AnyHorizontal: - return Column( - reduce( - partial( - plc.binaryop.binary_operation, - op=plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, - output_type=self.dtype, - ), - (c.obj for c in columns), - ) - ) - elif self.name == pl_expr.BooleanFunction.IsIn: - needles, haystack = columns - return Column(plc.search.contains(haystack.obj, needles.obj)) - elif self.name == pl_expr.BooleanFunction.Not: - (column,) = columns - return Column( - plc.unary.unary_operation(column.obj, plc.unary.UnaryOperator.NOT) - ) - else: - raise NotImplementedError( - f"BooleanFunction {self.name}" - ) # pragma: no cover; handled by init raising - - -class StringFunction(Expr): - __slots__ = ("name", "options", "children", "_regex_program") - _non_child = ("dtype", "name", "options") - children: 
tuple[Expr, ...] - - def __init__( - self, - dtype: plc.DataType, - name: pl_expr.StringFunction, - options: tuple[Any, ...], - *children: Expr, - ) -> None: - super().__init__(dtype) - self.options = options - self.name = name - self.children = children - self._validate_input() - - def _validate_input(self): - if self.name not in ( - pl_expr.StringFunction.Contains, - pl_expr.StringFunction.EndsWith, - pl_expr.StringFunction.Lowercase, - pl_expr.StringFunction.Replace, - pl_expr.StringFunction.ReplaceMany, - pl_expr.StringFunction.Slice, - pl_expr.StringFunction.Strptime, - pl_expr.StringFunction.StartsWith, - pl_expr.StringFunction.StripChars, - pl_expr.StringFunction.StripCharsStart, - pl_expr.StringFunction.StripCharsEnd, - pl_expr.StringFunction.Uppercase, - ): - raise NotImplementedError(f"String function {self.name}") - if self.name == pl_expr.StringFunction.Contains: - literal, strict = self.options - if not literal: - if not strict: - raise NotImplementedError( - "f{strict=} is not supported for regex contains" - ) - if not isinstance(self.children[1], Literal): - raise NotImplementedError( - "Regex contains only supports a scalar pattern" - ) - pattern = self.children[1].value.as_py() - try: - self._regex_program = plc.strings.regex_program.RegexProgram.create( - pattern, - flags=plc.strings.regex_flags.RegexFlags.DEFAULT, - ) - except RuntimeError as e: - raise NotImplementedError( - f"Unsupported regex {pattern} for GPU engine." - ) from e - elif self.name == pl_expr.StringFunction.Replace: - _, literal = self.options - if not literal: - raise NotImplementedError("literal=False is not supported for replace") - if not all(isinstance(expr, Literal) for expr in self.children[1:]): - raise NotImplementedError("replace only supports scalar target") - target = self.children[1] - if target.value == pa.scalar("", type=pa.string()): - raise NotImplementedError( - "libcudf replace does not support empty strings" - ) - elif self.name == pl_expr.StringFunction.ReplaceMany: - (ascii_case_insensitive,) = self.options - if ascii_case_insensitive: - raise NotImplementedError( - "ascii_case_insensitive not implemented for replace_many" - ) - if not all( - isinstance(expr, (LiteralColumn, Literal)) for expr in self.children[1:] - ): - raise NotImplementedError("replace_many only supports literal inputs") - target = self.children[1] - if pc.any(pc.equal(target.value, "")).as_py(): - raise NotImplementedError( - "libcudf replace_many is implemented differently from polars " - "for empty strings" - ) - elif self.name == pl_expr.StringFunction.Slice: - if not all(isinstance(child, Literal) for child in self.children[1:]): - raise NotImplementedError( - "Slice only supports literal start and stop values" - ) - elif self.name == pl_expr.StringFunction.Strptime: - format, _, exact, cache = self.options - if cache: - raise NotImplementedError("Strptime cache is a CPU feature") - if format is None: - raise NotImplementedError("Strptime format is required") - if not exact: - raise NotImplementedError("Strptime does not support exact=False") - elif self.name in { - pl_expr.StringFunction.StripChars, - pl_expr.StringFunction.StripCharsStart, - pl_expr.StringFunction.StripCharsEnd, - }: - if not isinstance(self.children[1], Literal): - raise NotImplementedError( - "strip operations only support scalar patterns" - ) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this 
expression given a dataframe for context.""" - if self.name == pl_expr.StringFunction.Contains: - child, arg = self.children - column = child.evaluate(df, context=context, mapping=mapping) - - literal, _ = self.options - if literal: - pat = arg.evaluate(df, context=context, mapping=mapping) - pattern = ( - pat.obj_scalar - if pat.is_scalar and pat.obj.size() != column.obj.size() - else pat.obj - ) - return Column(plc.strings.find.contains(column.obj, pattern)) - else: - return Column( - plc.strings.contains.contains_re(column.obj, self._regex_program) - ) - elif self.name == pl_expr.StringFunction.Slice: - child, expr_offset, expr_length = self.children - assert isinstance(expr_offset, Literal) - assert isinstance(expr_length, Literal) - - column = child.evaluate(df, context=context, mapping=mapping) - # libcudf slices via [start,stop). - # polars slices with offset + length where start == offset - # stop = start + length. Negative values for start look backward - # from the last element of the string. If the end index would be - # below zero, an empty string is returned. - # Do this maths on the host - start = expr_offset.value.as_py() - length = expr_length.value.as_py() - - if length == 0: - stop = start - else: - # No length indicates a scan to the end - # The libcudf equivalent is a null stop - stop = start + length if length else None - if length and start < 0 and length >= -start: - stop = None - return Column( - plc.strings.slice.slice_strings( - column.obj, - plc.interop.from_arrow(pa.scalar(start, type=pa.int32())), - plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), - ) - ) - elif self.name in { - pl_expr.StringFunction.StripChars, - pl_expr.StringFunction.StripCharsStart, - pl_expr.StringFunction.StripCharsEnd, - }: - column, chars = ( - c.evaluate(df, context=context, mapping=mapping) for c in self.children - ) - if self.name == pl_expr.StringFunction.StripCharsStart: - side = plc.strings.SideType.LEFT - elif self.name == pl_expr.StringFunction.StripCharsEnd: - side = plc.strings.SideType.RIGHT - else: - side = plc.strings.SideType.BOTH - return Column(plc.strings.strip.strip(column.obj, side, chars.obj_scalar)) - - columns = [ - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ] - if self.name == pl_expr.StringFunction.Lowercase: - (column,) = columns - return Column(plc.strings.case.to_lower(column.obj)) - elif self.name == pl_expr.StringFunction.Uppercase: - (column,) = columns - return Column(plc.strings.case.to_upper(column.obj)) - elif self.name == pl_expr.StringFunction.EndsWith: - column, suffix = columns - return Column( - plc.strings.find.ends_with( - column.obj, - suffix.obj_scalar - if column.obj.size() != suffix.obj.size() and suffix.is_scalar - else suffix.obj, - ) - ) - elif self.name == pl_expr.StringFunction.StartsWith: - column, prefix = columns - return Column( - plc.strings.find.starts_with( - column.obj, - prefix.obj_scalar - if column.obj.size() != prefix.obj.size() and prefix.is_scalar - else prefix.obj, - ) - ) - elif self.name == pl_expr.StringFunction.Strptime: - # TODO: ignores ambiguous - format, strict, exact, cache = self.options - col = self.children[0].evaluate(df, context=context, mapping=mapping) - - is_timestamps = plc.strings.convert.convert_datetime.is_timestamp( - col.obj, format.encode() - ) - - if strict: - if not plc.interop.to_arrow( - plc.reduce.reduce( - is_timestamps, - plc.aggregation.all(), - plc.DataType(plc.TypeId.BOOL8), - ) - ).as_py(): - raise InvalidOperationError("conversion from 
`str` failed.") - else: - not_timestamps = plc.unary.unary_operation( - is_timestamps, plc.unary.UnaryOperator.NOT - ) - - null = plc.interop.from_arrow(pa.scalar(None, type=pa.string())) - res = plc.copying.boolean_mask_scatter( - [null], plc.Table([col.obj]), not_timestamps - ) - return Column( - plc.strings.convert.convert_datetime.to_timestamps( - res.columns()[0], self.dtype, format.encode() - ) - ) - elif self.name == pl_expr.StringFunction.Replace: - column, target, repl = columns - n, _ = self.options - return Column( - plc.strings.replace.replace( - column.obj, target.obj_scalar, repl.obj_scalar, maxrepl=n - ) - ) - elif self.name == pl_expr.StringFunction.ReplaceMany: - column, target, repl = columns - return Column( - plc.strings.replace.replace_multiple(column.obj, target.obj, repl.obj) - ) - raise NotImplementedError( - f"StringFunction {self.name}" - ) # pragma: no cover; handled by init raising - - -class TemporalFunction(Expr): - __slots__ = ("name", "options", "children") - _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = { - pl_expr.TemporalFunction.Year: "year", - pl_expr.TemporalFunction.Month: "month", - pl_expr.TemporalFunction.Day: "day", - pl_expr.TemporalFunction.WeekDay: "weekday", - pl_expr.TemporalFunction.Hour: "hour", - pl_expr.TemporalFunction.Minute: "minute", - pl_expr.TemporalFunction.Second: "second", - pl_expr.TemporalFunction.Millisecond: "millisecond", - pl_expr.TemporalFunction.Microsecond: "microsecond", - pl_expr.TemporalFunction.Nanosecond: "nanosecond", - } - _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] - - def __init__( - self, - dtype: plc.DataType, - name: pl_expr.TemporalFunction, - options: tuple[Any, ...], - *children: Expr, - ) -> None: - super().__init__(dtype) - self.options = options - self.name = name - self.children = children - if self.name not in self._COMPONENT_MAP: - raise NotImplementedError(f"Temporal function {self.name}") - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - columns = [ - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ] - (column,) = columns - if self.name == pl_expr.TemporalFunction.Microsecond: - millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") - micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") - millis_as_micros = plc.binaryop.binary_operation( - millis, - plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), - plc.binaryop.BinaryOperator.MUL, - plc.DataType(plc.TypeId.INT32), - ) - total_micros = plc.binaryop.binary_operation( - micros, - millis_as_micros, - plc.binaryop.BinaryOperator.ADD, - plc.types.DataType(plc.types.TypeId.INT32), - ) - return Column(total_micros) - elif self.name == pl_expr.TemporalFunction.Nanosecond: - millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") - micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") - nanos = plc.datetime.extract_datetime_component(column.obj, "nanosecond") - millis_as_nanos = plc.binaryop.binary_operation( - millis, - plc.interop.from_arrow(pa.scalar(1_000_000, type=pa.int32())), - plc.binaryop.BinaryOperator.MUL, - plc.types.DataType(plc.types.TypeId.INT32), - ) - micros_as_nanos = plc.binaryop.binary_operation( - micros, - plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), - 
plc.binaryop.BinaryOperator.MUL, - plc.types.DataType(plc.types.TypeId.INT32), - ) - total_nanos = plc.binaryop.binary_operation( - nanos, - millis_as_nanos, - plc.binaryop.BinaryOperator.ADD, - plc.types.DataType(plc.types.TypeId.INT32), - ) - total_nanos = plc.binaryop.binary_operation( - total_nanos, - micros_as_nanos, - plc.binaryop.BinaryOperator.ADD, - plc.types.DataType(plc.types.TypeId.INT32), - ) - return Column(total_nanos) - - return Column( - plc.datetime.extract_datetime_component( - column.obj, - self._COMPONENT_MAP[self.name], - ) - ) - - -class UnaryFunction(Expr): - __slots__ = ("name", "options", "children") - _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] - - # Note: log, and pow are handled via translation to binops - _OP_MAPPING: ClassVar[dict[str, plc.unary.UnaryOperator]] = { - "sin": plc.unary.UnaryOperator.SIN, - "cos": plc.unary.UnaryOperator.COS, - "tan": plc.unary.UnaryOperator.TAN, - "arcsin": plc.unary.UnaryOperator.ARCSIN, - "arccos": plc.unary.UnaryOperator.ARCCOS, - "arctan": plc.unary.UnaryOperator.ARCTAN, - "sinh": plc.unary.UnaryOperator.SINH, - "cosh": plc.unary.UnaryOperator.COSH, - "tanh": plc.unary.UnaryOperator.TANH, - "arcsinh": plc.unary.UnaryOperator.ARCSINH, - "arccosh": plc.unary.UnaryOperator.ARCCOSH, - "arctanh": plc.unary.UnaryOperator.ARCTANH, - "exp": plc.unary.UnaryOperator.EXP, - "sqrt": plc.unary.UnaryOperator.SQRT, - "cbrt": plc.unary.UnaryOperator.CBRT, - "ceil": plc.unary.UnaryOperator.CEIL, - "floor": plc.unary.UnaryOperator.FLOOR, - "abs": plc.unary.UnaryOperator.ABS, - "bit_invert": plc.unary.UnaryOperator.BIT_INVERT, - "not": plc.unary.UnaryOperator.NOT, - } - _supported_misc_fns = frozenset( - { - "drop_nulls", - "fill_null", - "mask_nans", - "round", - "set_sorted", - "unique", - } - ) - _supported_cum_aggs = frozenset( - { - "cum_min", - "cum_max", - "cum_prod", - "cum_sum", - } - ) - _supported_fns = frozenset().union( - _supported_misc_fns, _supported_cum_aggs, _OP_MAPPING.keys() - ) - - def __init__( - self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr - ) -> None: - super().__init__(dtype) - self.name = name - self.options = options - self.children = children - - if self.name not in UnaryFunction._supported_fns: - raise NotImplementedError(f"Unary function {name=}") - if self.name in UnaryFunction._supported_cum_aggs: - (reverse,) = self.options - if reverse: - raise NotImplementedError( - "reverse=True is not supported for cumulative aggregations" - ) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - if self.name == "mask_nans": - (child,) = self.children - return child.evaluate(df, context=context, mapping=mapping).mask_nans() - if self.name == "round": - (decimal_places,) = self.options - (values,) = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - return Column( - plc.round.round( - values.obj, decimal_places, plc.round.RoundingMethod.HALF_UP - ) - ).sorted_like(values) - elif self.name == "unique": - (maintain_order,) = self.options - (values,) = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - # Only one column, so keep_any is the same as keep_first - # for stable distinct - keep = plc.stream_compaction.DuplicateKeepOption.KEEP_ANY - if values.is_sorted: - maintain_order = True - result = 
plc.stream_compaction.unique( - plc.Table([values.obj]), - [0], - keep, - plc.types.NullEquality.EQUAL, - ) - else: - distinct = ( - plc.stream_compaction.stable_distinct - if maintain_order - else plc.stream_compaction.distinct - ) - result = distinct( - plc.Table([values.obj]), - [0], - keep, - plc.types.NullEquality.EQUAL, - plc.types.NanEquality.ALL_EQUAL, - ) - (column,) = result.columns() - if maintain_order: - return Column(column).sorted_like(values) - return Column(column) - elif self.name == "set_sorted": - (column,) = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - (asc,) = self.options - order = ( - plc.types.Order.ASCENDING - if asc == "ascending" - else plc.types.Order.DESCENDING - ) - null_order = plc.types.NullOrder.BEFORE - if column.obj.null_count() > 0 and (n := column.obj.size()) > 1: - # PERF: This invokes four stream synchronisations! - has_nulls_first = not plc.copying.get_element(column.obj, 0).is_valid() - has_nulls_last = not plc.copying.get_element( - column.obj, n - 1 - ).is_valid() - if (order == plc.types.Order.DESCENDING and has_nulls_first) or ( - order == plc.types.Order.ASCENDING and has_nulls_last - ): - null_order = plc.types.NullOrder.AFTER - return column.set_sorted( - is_sorted=plc.types.Sorted.YES, - order=order, - null_order=null_order, - ) - elif self.name == "drop_nulls": - (column,) = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - return Column( - plc.stream_compaction.drop_nulls( - plc.Table([column.obj]), [0], 1 - ).columns()[0] - ) - elif self.name == "fill_null": - column = self.children[0].evaluate(df, context=context, mapping=mapping) - if isinstance(self.children[1], Literal): - arg = plc.interop.from_arrow(self.children[1].value) - else: - evaluated = self.children[1].evaluate( - df, context=context, mapping=mapping - ) - arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj - return Column(plc.replace.replace_nulls(column.obj, arg)) - elif self.name in self._OP_MAPPING: - column = self.children[0].evaluate(df, context=context, mapping=mapping) - if column.obj.type().id() != self.dtype.id(): - arg = plc.unary.cast(column.obj, self.dtype) - else: - arg = column.obj - return Column(plc.unary.unary_operation(arg, self._OP_MAPPING[self.name])) - elif self.name in UnaryFunction._supported_cum_aggs: - column = self.children[0].evaluate(df, context=context, mapping=mapping) - plc_col = column.obj - col_type = column.obj.type() - # cum_sum casts - # Int8, UInt8, Int16, UInt16 -> Int64 for overflow prevention - # Bool -> UInt32 - # cum_prod casts integer dtypes < int64 and bool to int64 - # See: - # https://github.com/pola-rs/polars/blob/main/crates/polars-ops/src/series/ops/cum_agg.rs - if ( - self.name == "cum_sum" - and col_type.id() - in { - plc.types.TypeId.INT8, - plc.types.TypeId.UINT8, - plc.types.TypeId.INT16, - plc.types.TypeId.UINT16, - } - ) or ( - self.name == "cum_prod" - and plc.traits.is_integral(col_type) - and plc.types.size_of(col_type) <= 4 - ): - plc_col = plc.unary.cast( - plc_col, plc.types.DataType(plc.types.TypeId.INT64) - ) - elif ( - self.name == "cum_sum" - and column.obj.type().id() == plc.types.TypeId.BOOL8 - ): - plc_col = plc.unary.cast( - plc_col, plc.types.DataType(plc.types.TypeId.UINT32) - ) - if self.name == "cum_sum": - agg = plc.aggregation.sum() - elif self.name == "cum_prod": - agg = plc.aggregation.product() - elif self.name == "cum_min": - agg = plc.aggregation.min() - elif self.name == "cum_max": - agg 
= plc.aggregation.max() - - return Column(plc.reduce.scan(plc_col, agg, plc.reduce.ScanType.INCLUSIVE)) - raise NotImplementedError( - f"Unimplemented unary function {self.name=}" - ) # pragma: no cover; init trips first - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - if self.name in {"unique", "drop_nulls"} | self._supported_cum_aggs: - raise NotImplementedError(f"{self.name} in groupby") - if depth == 1: - # inside aggregation, need to pre-evaluate, groupby - # construction has checked that we don't have nested aggs, - # so stop the recursion and return ourselves for pre-eval - return AggInfo([(self, plc.aggregation.collect_list(), self)]) - else: - (child,) = self.children - return child.collect_agg(depth=depth) - - -class Sort(Expr): - __slots__ = ("options", "children") - _non_child = ("dtype", "options") - children: tuple[Expr] - - def __init__( - self, dtype: plc.DataType, options: tuple[bool, bool, bool], column: Expr - ) -> None: - super().__init__(dtype) - self.options = options - self.children = (column,) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - (child,) = self.children - column = child.evaluate(df, context=context, mapping=mapping) - (stable, nulls_last, descending) = self.options - order, null_order = sorting.sort_order( - [descending], nulls_last=[nulls_last], num_keys=1 - ) - do_sort = plc.sorting.stable_sort if stable else plc.sorting.sort - table = do_sort(plc.Table([column.obj]), order, null_order) - return Column( - table.columns()[0], - is_sorted=plc.types.Sorted.YES, - order=order[0], - null_order=null_order[0], - ) - - -class SortBy(Expr): - __slots__ = ("options", "children") - _non_child = ("dtype", "options") - children: tuple[Expr, ...] 
- - def __init__( - self, - dtype: plc.DataType, - options: tuple[bool, tuple[bool], tuple[bool]], - column: Expr, - *by: Expr, - ) -> None: - super().__init__(dtype) - self.options = options - self.children = (column, *by) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - column, *by = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - (stable, nulls_last, descending) = self.options - order, null_order = sorting.sort_order( - descending, nulls_last=nulls_last, num_keys=len(by) - ) - do_sort = plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key - table = do_sort( - plc.Table([column.obj]), plc.Table([c.obj for c in by]), order, null_order - ) - return Column(table.columns()[0]) - - -class Gather(Expr): - __slots__ = ("children",) - _non_child = ("dtype",) - children: tuple[Expr, Expr] - - def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr) -> None: - super().__init__(dtype) - self.children = (values, indices) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - values, indices = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - lo, hi = plc.reduce.minmax(indices.obj) - lo = plc.interop.to_arrow(lo).as_py() - hi = plc.interop.to_arrow(hi).as_py() - n = df.num_rows - if hi >= n or lo < -n: - raise ValueError("gather indices are out of bounds") - if indices.obj.null_count(): - bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY - obj = plc.replace.replace_nulls( - indices.obj, - plc.interop.from_arrow( - pa.scalar(n, type=plc.interop.to_arrow(indices.obj.type())) - ), - ) - else: - bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK - obj = indices.obj - table = plc.copying.gather(plc.Table([values.obj]), obj, bounds_policy) - return Column(table.columns()[0]) - - -class Filter(Expr): - __slots__ = ("children",) - _non_child = ("dtype",) - children: tuple[Expr, Expr] - - def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): - super().__init__(dtype) - self.children = (values, indices) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - values, mask = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - table = plc.stream_compaction.apply_boolean_mask( - plc.Table([values.obj]), mask.obj - ) - return Column(table.columns()[0]).sorted_like(values) - - -class RollingWindow(Expr): - __slots__ = ("options", "children") - _non_child = ("dtype", "options") - children: tuple[Expr] - - def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None: - super().__init__(dtype) - self.options = options - self.children = (agg,) - raise NotImplementedError("Rolling window not implemented") - - -class GroupedRollingWindow(Expr): - __slots__ = ("options", "children") - _non_child = ("dtype", "options") - children: tuple[Expr, ...] 
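As an orientation aid for the Gather node above: indices are range-checked against the frame length, and null indices are serviced by replacing them with an out-of-range value and gathering under the NULLIFY policy. A minimal sketch using the same pylibcudf entry points the node uses (hypothetical data; assumes a CUDA-capable environment with pylibcudf installed):

import pyarrow as pa
import pylibcudf as plc

values = plc.interop.from_arrow(pa.array([10, 20, 30], type=pa.int32()))
indices = plc.interop.from_arrow(pa.array([0, 2, None], type=pa.int32()))
lo, hi = (plc.interop.to_arrow(s).as_py() for s in plc.reduce.minmax(indices))
n = values.size()
assert -n <= lo and hi < n  # otherwise Gather raises ValueError
# Null indices become n (out of range); NULLIFY turns them into null outputs.
filled = plc.replace.replace_nulls(
    indices, plc.interop.from_arrow(pa.scalar(n, type=pa.int32()))
)
result = plc.copying.gather(
    plc.Table([values]), filled, plc.copying.OutOfBoundsPolicy.NULLIFY
)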
- - def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr) -> None: - super().__init__(dtype) - self.options = options - self.children = (agg, *by) - raise NotImplementedError("Grouped rolling window not implemented") - - -class Cast(Expr): - __slots__ = ("children",) - _non_child = ("dtype",) - children: tuple[Expr] - - def __init__(self, dtype: plc.DataType, value: Expr) -> None: - super().__init__(dtype) - self.children = (value,) - if not dtypes.can_cast(value.dtype, self.dtype): - raise NotImplementedError( - f"Can't cast {self.dtype.id().name} to {value.dtype.id().name}" - ) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - (child,) = self.children - column = child.evaluate(df, context=context, mapping=mapping) - return Column(plc.unary.cast(column.obj, self.dtype)).sorted_like(column) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - # TODO: Could do with sort-based groupby and segmented filter - (child,) = self.children - return child.collect_agg(depth=depth) - - -class Agg(Expr): - __slots__ = ("name", "options", "op", "request", "children") - _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] - - def __init__( - self, dtype: plc.DataType, name: str, options: Any, *children: Expr - ) -> None: - super().__init__(dtype) - self.name = name - self.options = options - self.children = children - if name not in Agg._SUPPORTED: - raise NotImplementedError( - f"Unsupported aggregation {name=}" - ) # pragma: no cover; all valid aggs are supported - # TODO: nan handling in groupby case - if name == "min": - req = plc.aggregation.min() - elif name == "max": - req = plc.aggregation.max() - elif name == "median": - req = plc.aggregation.median() - elif name == "n_unique": - # TODO: datatype of result - req = plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE) - elif name == "first" or name == "last": - req = None - elif name == "mean": - req = plc.aggregation.mean() - elif name == "sum": - req = plc.aggregation.sum() - elif name == "std": - # TODO: handle nans - req = plc.aggregation.std(ddof=options) - elif name == "var": - # TODO: handle nans - req = plc.aggregation.variance(ddof=options) - elif name == "count": - req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE) - elif name == "quantile": - _, quantile = self.children - if not isinstance(quantile, Literal): - raise NotImplementedError("Only support literal quantile values") - req = plc.aggregation.quantile( - quantiles=[quantile.value.as_py()], interp=Agg.interp_mapping[options] - ) - else: - raise NotImplementedError( - f"Unreachable, {name=} is incorrectly listed in _SUPPORTED" - ) # pragma: no cover - self.request = req - op = getattr(self, f"_{name}", None) - if op is None: - op = partial(self._reduce, request=req) - elif name in {"min", "max"}: - op = partial(op, propagate_nans=options) - elif name in {"count", "first", "last"}: - pass - else: - raise NotImplementedError( - f"Unreachable, supported agg {name=} has no implementation" - ) # pragma: no cover - self.op = op - - _SUPPORTED: ClassVar[frozenset[str]] = frozenset( - [ - "min", - "max", - "median", - "n_unique", - "first", - "last", - "mean", - "sum", - "count", - "std", - "var", - "quantile", - ] - ) - - interp_mapping: ClassVar[dict[str, 
plc.types.Interpolation]] = { - "nearest": plc.types.Interpolation.NEAREST, - "higher": plc.types.Interpolation.HIGHER, - "lower": plc.types.Interpolation.LOWER, - "midpoint": plc.types.Interpolation.MIDPOINT, - "linear": plc.types.Interpolation.LINEAR, - } - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - if depth >= 1: - raise NotImplementedError( - "Nested aggregations in groupby" - ) # pragma: no cover; check_agg trips first - if (isminmax := self.name in {"min", "max"}) and self.options: - raise NotImplementedError("Nan propagation in groupby for min/max") - (child,) = self.children - ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests - request = self.request - # These are handled specially here because we don't set up the - # request for the whole-frame agg because we can avoid a - # reduce for these. - if self.name == "first": - request = plc.aggregation.nth_element( - 0, null_handling=plc.types.NullPolicy.INCLUDE - ) - elif self.name == "last": - request = plc.aggregation.nth_element( - -1, null_handling=plc.types.NullPolicy.INCLUDE - ) - if request is None: - raise NotImplementedError( - f"Aggregation {self.name} in groupby" - ) # pragma: no cover; __init__ trips first - if isminmax and plc.traits.is_floating_point(self.dtype): - assert expr is not None - # Ignore nans in these groupby aggs, do this by masking - # nans in the input - expr = UnaryFunction(self.dtype, "mask_nans", (), expr) - return AggInfo([(expr, request, self)]) - - def _reduce( - self, column: Column, *, request: plc.aggregation.Aggregation - ) -> Column: - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, request, self.dtype), - 1, - ) - ) - - def _count(self, column: Column) -> Column: - return Column( - plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar( - column.obj.size() - column.obj.null_count(), - type=plc.interop.to_arrow(self.dtype), - ), - ), - 1, - ) - ) - - def _min(self, column: Column, *, propagate_nans: bool) -> Column: - if propagate_nans and column.nan_count > 0: - return Column( - plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) - ), - 1, - ) - ) - if column.nan_count > 0: - column = column.mask_nans() - return self._reduce(column, request=plc.aggregation.min()) - - def _max(self, column: Column, *, propagate_nans: bool) -> Column: - if propagate_nans and column.nan_count > 0: - return Column( - plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) - ), - 1, - ) - ) - if column.nan_count > 0: - column = column.mask_nans() - return self._reduce(column, request=plc.aggregation.max()) - - def _first(self, column: Column) -> Column: - return Column(plc.copying.slice(column.obj, [0, 1])[0]) - - def _last(self, column: Column) -> Column: - n = column.obj.size() - return Column(plc.copying.slice(column.obj, [n - 1, n])[0]) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - if context is not ExecutionContext.FRAME: - raise NotImplementedError( - f"Agg in context {context}" - ) # pragma: no cover; unreachable - - # Aggregations like quantiles may have additional children that were - # preprocessed into pylibcudf requests. 
- child = self.children[0] - return self.op(child.evaluate(df, context=context, mapping=mapping)) - - -class Ternary(Expr): - __slots__ = ("children",) - _non_child = ("dtype",) - children: tuple[Expr, Expr, Expr] - - def __init__( - self, dtype: plc.DataType, when: Expr, then: Expr, otherwise: Expr - ) -> None: - super().__init__(dtype) - self.children = (when, then, otherwise) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - when, then, otherwise = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - then_obj = then.obj_scalar if then.is_scalar else then.obj - otherwise_obj = otherwise.obj_scalar if otherwise.is_scalar else otherwise.obj - return Column(plc.copying.copy_if_else(then_obj, otherwise_obj, when.obj)) - - -class BinOp(Expr): - __slots__ = ("op", "children") - _non_child = ("dtype", "op") - children: tuple[Expr, Expr] - - def __init__( - self, - dtype: plc.DataType, - op: plc.binaryop.BinaryOperator, - left: Expr, - right: Expr, - ) -> None: - super().__init__(dtype) - if plc.traits.is_boolean(self.dtype): - # For boolean output types, bitand and bitor implement - # boolean logic, so translate. bitxor also does, but the - # default behaviour is correct. - op = BinOp._BOOL_KLEENE_MAPPING.get(op, op) - self.op = op - self.children = (left, right) - if not plc.binaryop.is_supported_operation( - self.dtype, left.dtype, right.dtype, op - ): - raise NotImplementedError( - f"Operation {op.name} not supported " - f"for types {left.dtype.id().name} and {right.dtype.id().name} " - f"with output type {self.dtype.id().name}" - ) - - _BOOL_KLEENE_MAPPING: ClassVar[ - dict[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator] - ] = { - plc.binaryop.BinaryOperator.BITWISE_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, - plc.binaryop.BinaryOperator.BITWISE_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, - plc.binaryop.BinaryOperator.LOGICAL_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, - plc.binaryop.BinaryOperator.LOGICAL_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, - } - - _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { - pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, - pl_expr.Operator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, - pl_expr.Operator.NotEq: plc.binaryop.BinaryOperator.NOT_EQUAL, - pl_expr.Operator.NotEqValidity: plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, - pl_expr.Operator.Lt: plc.binaryop.BinaryOperator.LESS, - pl_expr.Operator.LtEq: plc.binaryop.BinaryOperator.LESS_EQUAL, - pl_expr.Operator.Gt: plc.binaryop.BinaryOperator.GREATER, - pl_expr.Operator.GtEq: plc.binaryop.BinaryOperator.GREATER_EQUAL, - pl_expr.Operator.Plus: plc.binaryop.BinaryOperator.ADD, - pl_expr.Operator.Minus: plc.binaryop.BinaryOperator.SUB, - pl_expr.Operator.Multiply: plc.binaryop.BinaryOperator.MUL, - pl_expr.Operator.Divide: plc.binaryop.BinaryOperator.DIV, - pl_expr.Operator.TrueDivide: plc.binaryop.BinaryOperator.TRUE_DIV, - pl_expr.Operator.FloorDivide: plc.binaryop.BinaryOperator.FLOOR_DIV, - pl_expr.Operator.Modulus: plc.binaryop.BinaryOperator.PYMOD, - pl_expr.Operator.And: plc.binaryop.BinaryOperator.BITWISE_AND, - pl_expr.Operator.Or: plc.binaryop.BinaryOperator.BITWISE_OR, - pl_expr.Operator.Xor: plc.binaryop.BinaryOperator.BITWISE_XOR, - pl_expr.Operator.LogicalAnd: 
plc.binaryop.BinaryOperator.LOGICAL_AND, - pl_expr.Operator.LogicalOr: plc.binaryop.BinaryOperator.LOGICAL_OR, - } - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - left, right = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - lop = left.obj - rop = right.obj - if left.obj.size() != right.obj.size(): - if left.is_scalar: - lop = left.obj_scalar - elif right.is_scalar: - rop = right.obj_scalar - return Column( - plc.binaryop.binary_operation(lop, rop, self.op, self.dtype), - ) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - if depth == 1: - # inside aggregation, need to pre-evaluate, - # groupby construction has checked that we don't have - # nested aggs, so stop the recursion and return ourselves - # for pre-eval - return AggInfo([(self, plc.aggregation.collect_list(), self)]) - else: - left_info, right_info = ( - child.collect_agg(depth=depth) for child in self.children - ) - requests = [*left_info.requests, *right_info.requests] - # TODO: Hack, if there were no reductions inside this - # binary expression then we want to pre-evaluate and - # collect ourselves. Otherwise we want to collect the - # aggregations inside and post-evaluate. This is a bad way - # of checking that we are in case 1. - if all( - agg.kind() == plc.aggregation.Kind.COLLECT_LIST - for _, agg, _ in requests - ): - return AggInfo([(self, plc.aggregation.collect_list(), self)]) - return AggInfo( - [*left_info.requests, *right_info.requests], - ) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/__init__.py b/python/cudf_polars/cudf_polars/dsl/expressions/__init__.py new file mode 100644 index 00000000000..acbea129088 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Implementations of various expressions.""" + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py new file mode 100644 index 00000000000..41b1defab39 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py @@ -0,0 +1,228 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""DSL nodes for aggregations.""" + +from __future__ import annotations + +from functools import partial +from typing import TYPE_CHECKING, Any, ClassVar + +import pyarrow as pa +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ( + AggInfo, + ExecutionContext, + Expr, +) +from cudf_polars.dsl.expressions.literal import Literal +from cudf_polars.dsl.expressions.unary import UnaryFunction + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["Agg"] + + +class Agg(Expr): + __slots__ = ("name", "options", "op", "request") + _non_child = ("dtype", "name", "options") + + def __init__( + self, dtype: plc.DataType, name: str, options: Any, *children: Expr + ) -> None: + self.dtype = dtype + self.name = name + self.options = options + self.children = children + if name not in Agg._SUPPORTED: + raise NotImplementedError( + f"Unsupported aggregation {name=}" + ) # pragma: no cover; all valid aggs are supported + # TODO: nan handling in groupby case + if name == "min": + req = plc.aggregation.min() + elif name == "max": + req = plc.aggregation.max() + elif name == "median": + req = plc.aggregation.median() + elif name == "n_unique": + # TODO: datatype of result + req = plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE) + elif name == "first" or name == "last": + req = None + elif name == "mean": + req = plc.aggregation.mean() + elif name == "sum": + req = plc.aggregation.sum() + elif name == "std": + # TODO: handle nans + req = plc.aggregation.std(ddof=options) + elif name == "var": + # TODO: handle nans + req = plc.aggregation.variance(ddof=options) + elif name == "count": + req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE) + elif name == "quantile": + _, quantile = self.children + if not isinstance(quantile, Literal): + raise NotImplementedError("Only support literal quantile values") + req = plc.aggregation.quantile( + quantiles=[quantile.value.as_py()], interp=Agg.interp_mapping[options] + ) + else: + raise NotImplementedError( + f"Unreachable, {name=} is incorrectly listed in _SUPPORTED" + ) # pragma: no cover + self.request = req + op = getattr(self, f"_{name}", None) + if op is None: + op = partial(self._reduce, request=req) + elif name in {"min", "max"}: + op = partial(op, propagate_nans=options) + elif name in {"count", "first", "last"}: + pass + else: + raise NotImplementedError( + f"Unreachable, supported agg {name=} has no implementation" + ) # pragma: no cover + self.op = op + + _SUPPORTED: ClassVar[frozenset[str]] = frozenset( + [ + "min", + "max", + "median", + "n_unique", + "first", + "last", + "mean", + "sum", + "count", + "std", + "var", + "quantile", + ] + ) + + interp_mapping: ClassVar[dict[str, plc.types.Interpolation]] = { + "nearest": plc.types.Interpolation.NEAREST, + "higher": plc.types.Interpolation.HIGHER, + "lower": plc.types.Interpolation.LOWER, + "midpoint": plc.types.Interpolation.MIDPOINT, + "linear": plc.types.Interpolation.LINEAR, + } + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth >= 1: + raise NotImplementedError( + "Nested aggregations in groupby" + ) # pragma: no cover; check_agg trips first + if (isminmax := self.name in {"min", "max"}) and self.options: + raise NotImplementedError("Nan propagation in groupby for min/max") + 
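The options flag on min/max is polars' propagate_nans; the whole-frame _min/_max below implement it by short-circuiting to a NaN scalar and otherwise masking NaNs before reducing. A pure-Python analogue of that rule (a sketch only, not the GPU path):

import math

def reduce_min(xs: list[float], *, propagate_nans: bool) -> float:
    # propagate_nans=True: any NaN poisons the result.
    if propagate_nans and any(math.isnan(x) for x in xs):
        return float("nan")
    # Otherwise NaNs are masked out before the reduction.
    return min(x for x in xs if not math.isnan(x))

assert reduce_min([2.0, float("nan"), 1.0], propagate_nans=False) == 1.0
assert math.isnan(reduce_min([2.0, float("nan")], propagate_nans=True))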
(child,) = self.children + ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests + request = self.request + # These are handled specially here because we don't set up the + # request for the whole-frame agg because we can avoid a + # reduce for these. + if self.name == "first": + request = plc.aggregation.nth_element( + 0, null_handling=plc.types.NullPolicy.INCLUDE + ) + elif self.name == "last": + request = plc.aggregation.nth_element( + -1, null_handling=plc.types.NullPolicy.INCLUDE + ) + if request is None: + raise NotImplementedError( + f"Aggregation {self.name} in groupby" + ) # pragma: no cover; __init__ trips first + if isminmax and plc.traits.is_floating_point(self.dtype): + assert expr is not None + # Ignore nans in these groupby aggs, do this by masking + # nans in the input + expr = UnaryFunction(self.dtype, "mask_nans", (), expr) + return AggInfo([(expr, request, self)]) + + def _reduce( + self, column: Column, *, request: plc.aggregation.Aggregation + ) -> Column: + return Column( + plc.Column.from_scalar( + plc.reduce.reduce(column.obj, request, self.dtype), + 1, + ) + ) + + def _count(self, column: Column) -> Column: + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar( + column.obj.size() - column.obj.null_count(), + type=plc.interop.to_arrow(self.dtype), + ), + ), + 1, + ) + ) + + def _min(self, column: Column, *, propagate_nans: bool) -> Column: + if propagate_nans and column.nan_count > 0: + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) + ), + 1, + ) + ) + if column.nan_count > 0: + column = column.mask_nans() + return self._reduce(column, request=plc.aggregation.min()) + + def _max(self, column: Column, *, propagate_nans: bool) -> Column: + if propagate_nans and column.nan_count > 0: + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) + ), + 1, + ) + ) + if column.nan_count > 0: + column = column.mask_nans() + return self._reduce(column, request=plc.aggregation.max()) + + def _first(self, column: Column) -> Column: + return Column(plc.copying.slice(column.obj, [0, 1])[0]) + + def _last(self, column: Column) -> Column: + n = column.obj.size() + return Column(plc.copying.slice(column.obj, [n - 1, n])[0]) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if context is not ExecutionContext.FRAME: + raise NotImplementedError( + f"Agg in context {context}" + ) # pragma: no cover; unreachable + + # Aggregations like quantiles may have additional children that were + # preprocessed into pylibcudf requests. + child = self.children[0] + return self.op(child.evaluate(df, context=context, mapping=mapping)) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/base.py b/python/cudf_polars/cudf_polars/dsl/expressions/base.py new file mode 100644 index 00000000000..effe8cb2378 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/base.py @@ -0,0 +1,251 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
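Before moving into the base classes: the aggregation file above bakes the quantile value into a pylibcudf aggregation at construction time, which is why only Literal quantiles are accepted. The supported polars-level shape looks like this (hypothetical data; on a polars build with GPU support one could pass engine="gpu" to collect):

import polars as pl

q = pl.LazyFrame({"a": [1.0, 2.0, 4.0]}).select(
    pl.col("a").quantile(0.5, interpolation="linear")  # literal quantile: supported
)
print(q.collect())  # a column-valued quantile would not be supported by this node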
+# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""Base and common classes for expression DSL nodes.""" + +from __future__ import annotations + +import enum +from enum import IntEnum +from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple + +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.nodebase import Node + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import Column, DataFrame + +__all__ = ["Expr", "NamedExpr", "Col", "AggInfo", "ExecutionContext"] + + +class AggInfo(NamedTuple): + requests: list[tuple[Expr | None, plc.aggregation.Aggregation, Expr]] + + +class ExecutionContext(IntEnum): + FRAME = enum.auto() + GROUPBY = enum.auto() + ROLLING = enum.auto() + + +class Expr(Node["Expr"]): + """An abstract expression object.""" + + __slots__ = ("dtype",) + dtype: plc.DataType + """Data type of the expression.""" + # This annotation is needed because of https://github.com/python/mypy/issues/17981 + _non_child: ClassVar[tuple[str, ...]] = ("dtype",) + """Names of non-child data (not Exprs) for reconstruction.""" + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """ + Evaluate this expression given a dataframe for context. + + Parameters + ---------- + df + DataFrame that will provide columns. + context + What context are we performing this evaluation in? + mapping + Substitution mapping from expressions to Columns, used to + override the evaluation of a given expression if we're + performing a simple rewritten evaluation. + + Notes + ----- + Do not call this function directly, but rather + :meth:`evaluate` which handles the mapping lookups. + + Returns + ------- + Column representing the evaluation of the expression. + + Raises + ------ + NotImplementedError + If we couldn't evaluate the expression. Ideally all these + are returned during translation to the IR, but for now we + are not perfect. + """ + raise NotImplementedError( + f"Evaluation of expression {type(self).__name__}" + ) # pragma: no cover; translation of unimplemented nodes trips first + + def evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """ + Evaluate this expression given a dataframe for context. + + Parameters + ---------- + df + DataFrame that will provide columns. + context + What context are we performing this evaluation in? + mapping + Substitution mapping from expressions to Columns, used to + override the evaluation of a given expression if we're + performing a simple rewritten evaluation. + + Notes + ----- + Individual subclasses should implement :meth:`do_evaluate`, + this method provides logic to handle lookups in the + substitution mapping. + + Returns + ------- + Column representing the evaluation of the expression. + + Raises + ------ + NotImplementedError + If we couldn't evaluate the expression. Ideally all these + are returned during translation to the IR, but for now we + are not perfect. + """ + if mapping is None: + return self.do_evaluate(df, context=context, mapping=mapping) + try: + return mapping[self] + except KeyError: + return self.do_evaluate(df, context=context, mapping=mapping) + + def collect_agg(self, *, depth: int) -> AggInfo: + """ + Collect information about aggregations in groupbys. 
+ + Parameters + ---------- + depth + The depth of aggregating (reduction or sampling) + expressions we are currently at. + + Returns + ------- + Aggregation info describing the expression to aggregate in the + groupby. + + Raises + ------ + NotImplementedError + If we can't currently perform the aggregation request, for + example nested aggregations like ``a.max().min()``. + """ + raise NotImplementedError( + f"Collecting aggregation info for {type(self).__name__}" + ) # pragma: no cover; check_agg trips first + + +class NamedExpr: + # NamedExpr does not inherit from Expr since it does not appear + # when evaluating expressions themselves, only when constructing + # named return values in dataframe (IR) nodes. + __slots__ = ("name", "value") + value: Expr + name: str + + def __init__(self, name: str, value: Expr) -> None: + self.name = name + self.value = value + + def __hash__(self) -> int: + """Hash of the expression.""" + return hash((type(self), self.name, self.value)) + + def __repr__(self) -> str: + """Repr of the expression.""" + return f"NamedExpr({self.name}, {self.value})" + + def __eq__(self, other: Any) -> bool: + """Equality of two expressions.""" + return ( + type(self) is type(other) + and self.name == other.name + and self.value == other.value + ) + + def __ne__(self, other: Any) -> bool: + """Inequality of expressions.""" + return not self.__eq__(other) + + def evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """ + Evaluate this expression given a dataframe for context. + + Parameters + ---------- + df + DataFrame providing context + context + Execution context + mapping + Substitution mapping + + Returns + ------- + Evaluated Column with name attached. + + See Also + -------- + :meth:`Expr.evaluate` for details, this function just adds the + name to a column produced from an expression. + """ + return self.value.evaluate(df, context=context, mapping=mapping).rename( + self.name + ) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return self.value.collect_agg(depth=depth) + + +class Col(Expr): + __slots__ = ("name",) + _non_child = ("dtype", "name") + name: str + + def __init__(self, dtype: plc.DataType, name: str) -> None: + self.dtype = dtype + self.name = name + self.children = () + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + # Deliberately remove the name here so that we guarantee + # evaluation of the IR produces names. + return df.column_map[self.name].rename(None) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([(self, plc.aggregation.collect_list(), self)]) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py new file mode 100644 index 00000000000..11a47e7ea51 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py @@ -0,0 +1,134 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
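One behaviour from base.py above worth a spot check: Expr.evaluate consults the substitution mapping before recomputing, so rewritten plans can inject pre-computed columns. A self-contained analogue of that lookup (an invented toy class, not the real Expr):

class TinyExpr:
    # Stand-in for Expr: a mapping hit short-circuits do_evaluate.
    def do_evaluate(self) -> str:
        return "computed"

    def evaluate(self, mapping=None) -> str:
        if mapping is None:
            return self.do_evaluate()
        try:
            return mapping[self]
        except KeyError:
            return self.do_evaluate()

e = TinyExpr()
assert e.evaluate() == "computed"
assert e.evaluate(mapping={e: "cached"}) == "cached"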
+# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""BinaryOp DSL nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, ClassVar + +import pylibcudf as plc + +from polars.polars import _expr_nodes as pl_expr + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["BinOp"] + + +class BinOp(Expr): + __slots__ = ("op",) + _non_child = ("dtype", "op") + + def __init__( + self, + dtype: plc.DataType, + op: plc.binaryop.BinaryOperator, + left: Expr, + right: Expr, + ) -> None: + self.dtype = dtype + if plc.traits.is_boolean(self.dtype): + # For boolean output types, bitand and bitor implement + # boolean logic, so translate. bitxor also does, but the + # default behaviour is correct. + op = BinOp._BOOL_KLEENE_MAPPING.get(op, op) + self.op = op + self.children = (left, right) + if not plc.binaryop.is_supported_operation( + self.dtype, left.dtype, right.dtype, op + ): + raise NotImplementedError( + f"Operation {op.name} not supported " + f"for types {left.dtype.id().name} and {right.dtype.id().name} " + f"with output type {self.dtype.id().name}" + ) + + _BOOL_KLEENE_MAPPING: ClassVar[ + dict[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator] + ] = { + plc.binaryop.BinaryOperator.BITWISE_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.BITWISE_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + plc.binaryop.BinaryOperator.LOGICAL_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.LOGICAL_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + } + + _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { + pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, + pl_expr.Operator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, + pl_expr.Operator.NotEq: plc.binaryop.BinaryOperator.NOT_EQUAL, + pl_expr.Operator.NotEqValidity: plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, + pl_expr.Operator.Lt: plc.binaryop.BinaryOperator.LESS, + pl_expr.Operator.LtEq: plc.binaryop.BinaryOperator.LESS_EQUAL, + pl_expr.Operator.Gt: plc.binaryop.BinaryOperator.GREATER, + pl_expr.Operator.GtEq: plc.binaryop.BinaryOperator.GREATER_EQUAL, + pl_expr.Operator.Plus: plc.binaryop.BinaryOperator.ADD, + pl_expr.Operator.Minus: plc.binaryop.BinaryOperator.SUB, + pl_expr.Operator.Multiply: plc.binaryop.BinaryOperator.MUL, + pl_expr.Operator.Divide: plc.binaryop.BinaryOperator.DIV, + pl_expr.Operator.TrueDivide: plc.binaryop.BinaryOperator.TRUE_DIV, + pl_expr.Operator.FloorDivide: plc.binaryop.BinaryOperator.FLOOR_DIV, + pl_expr.Operator.Modulus: plc.binaryop.BinaryOperator.PYMOD, + pl_expr.Operator.And: plc.binaryop.BinaryOperator.BITWISE_AND, + pl_expr.Operator.Or: plc.binaryop.BinaryOperator.BITWISE_OR, + pl_expr.Operator.Xor: plc.binaryop.BinaryOperator.BITWISE_XOR, + pl_expr.Operator.LogicalAnd: plc.binaryop.BinaryOperator.LOGICAL_AND, + pl_expr.Operator.LogicalOr: plc.binaryop.BinaryOperator.LOGICAL_OR, + } + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + left, right = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + lop = left.obj + rop = right.obj + if 
left.obj.size() != right.obj.size(): + if left.is_scalar: + lop = left.obj_scalar + elif right.is_scalar: + rop = right.obj_scalar + return Column( + plc.binaryop.binary_operation(lop, rop, self.op, self.dtype), + ) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth == 1: + # inside aggregation, need to pre-evaluate, + # groupby construction has checked that we don't have + # nested aggs, so stop the recursion and return ourselves + # for pre-eval + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + else: + left_info, right_info = ( + child.collect_agg(depth=depth) for child in self.children + ) + requests = [*left_info.requests, *right_info.requests] + # TODO: Hack, if there were no reductions inside this + # binary expression then we want to pre-evaluate and + # collect ourselves. Otherwise we want to collect the + # aggregations inside and post-evaluate. This is a bad way + # of checking that we are in case 1. + if all( + agg.kind() == plc.aggregation.Kind.COLLECT_LIST + for _, agg, _ in requests + ): + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + return AggInfo( + [*left_info.requests, *right_info.requests], + ) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py new file mode 100644 index 00000000000..9c14a8386f3 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py @@ -0,0 +1,268 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""Boolean DSL nodes.""" + +from __future__ import annotations + +from functools import partial, reduce +from typing import TYPE_CHECKING, Any, ClassVar + +import pyarrow as pa +import pylibcudf as plc + +from polars.polars import _expr_nodes as pl_expr + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ( + ExecutionContext, + Expr, +) + +if TYPE_CHECKING: + from collections.abc import Mapping + + import polars.type_aliases as pl_types + + from cudf_polars.containers import DataFrame + +__all__ = ["BooleanFunction"] + + +class BooleanFunction(Expr): + __slots__ = ("name", "options") + _non_child = ("dtype", "name", "options") + + def __init__( + self, + dtype: plc.DataType, + name: pl_expr.BooleanFunction, + options: tuple[Any, ...], + *children: Expr, + ) -> None: + self.dtype = dtype + self.options = options + self.name = name + self.children = children + if self.name == pl_expr.BooleanFunction.IsIn and not all( + c.dtype == self.children[0].dtype for c in self.children + ): + # TODO: If polars IR doesn't put the casts in, we need to + # mimic the supertype promotion rules. 
+ raise NotImplementedError("IsIn doesn't support supertype casting") + + @staticmethod + def _distinct( + column: Column, + *, + keep: plc.stream_compaction.DuplicateKeepOption, + source_value: plc.Scalar, + target_value: plc.Scalar, + ) -> Column: + table = plc.Table([column.obj]) + indices = plc.stream_compaction.distinct_indices( + table, + keep, + # TODO: polars doesn't expose options for these + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + return Column( + plc.copying.scatter( + [source_value], + indices, + plc.Table([plc.Column.from_scalar(target_value, table.num_rows())]), + ).columns()[0] + ) + + _BETWEEN_OPS: ClassVar[ + dict[ + pl_types.ClosedInterval, + tuple[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator], + ] + ] = { + "none": ( + plc.binaryop.BinaryOperator.GREATER, + plc.binaryop.BinaryOperator.LESS, + ), + "left": ( + plc.binaryop.BinaryOperator.GREATER_EQUAL, + plc.binaryop.BinaryOperator.LESS, + ), + "right": ( + plc.binaryop.BinaryOperator.GREATER, + plc.binaryop.BinaryOperator.LESS_EQUAL, + ), + "both": ( + plc.binaryop.BinaryOperator.GREATER_EQUAL, + plc.binaryop.BinaryOperator.LESS_EQUAL, + ), + } + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if self.name in ( + pl_expr.BooleanFunction.IsFinite, + pl_expr.BooleanFunction.IsInfinite, + ): + # Avoid evaluating the child if the dtype tells us it's unnecessary. + (child,) = self.children + is_finite = self.name == pl_expr.BooleanFunction.IsFinite + if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64): + value = plc.interop.from_arrow( + pa.scalar(value=is_finite, type=plc.interop.to_arrow(self.dtype)) + ) + return Column(plc.Column.from_scalar(value, df.num_rows)) + needles = child.evaluate(df, context=context, mapping=mapping) + to_search = [-float("inf"), float("inf")] + if is_finite: + # NaN is neither finite not infinite + to_search.append(float("nan")) + haystack = plc.interop.from_arrow( + pa.array( + to_search, + type=plc.interop.to_arrow(needles.obj.type()), + ) + ) + result = plc.search.contains(haystack, needles.obj) + if is_finite: + result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT) + return Column(result) + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] + # Kleene logic for Any (OR) and All (AND) if ignore_nulls is + # False + if self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All): + (ignore_nulls,) = self.options + (column,) = columns + is_any = self.name == pl_expr.BooleanFunction.Any + agg = plc.aggregation.any() if is_any else plc.aggregation.all() + result = plc.reduce.reduce(column.obj, agg, self.dtype) + if not ignore_nulls and column.obj.null_count() > 0: + # Truth tables + # Any All + # | F U T | F U T + # --+------ --+------ + # F | F U T F | F F F + # U | U U T U | F U U + # T | T T T T | F U T + # + # If the input null count was non-zero, we must + # post-process the result to insert the correct value. 
+ h_result = plc.interop.to_arrow(result).as_py() + if is_any and not h_result or not is_any and h_result: + # Any All + # False || Null => Null True && Null => Null + return Column(plc.Column.all_null_like(column.obj, 1)) + return Column(plc.Column.from_scalar(result, 1)) + if self.name == pl_expr.BooleanFunction.IsNull: + (column,) = columns + return Column(plc.unary.is_null(column.obj)) + elif self.name == pl_expr.BooleanFunction.IsNotNull: + (column,) = columns + return Column(plc.unary.is_valid(column.obj)) + elif self.name == pl_expr.BooleanFunction.IsNan: + (column,) = columns + return Column( + plc.unary.is_nan(column.obj).with_mask( + column.obj.null_mask(), column.obj.null_count() + ) + ) + elif self.name == pl_expr.BooleanFunction.IsNotNan: + (column,) = columns + return Column( + plc.unary.is_not_nan(column.obj).with_mask( + column.obj.null_mask(), column.obj.null_count() + ) + ) + elif self.name == pl_expr.BooleanFunction.IsFirstDistinct: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, + source_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), + ) + elif self.name == pl_expr.BooleanFunction.IsLastDistinct: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST, + source_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), + ) + elif self.name == pl_expr.BooleanFunction.IsUnique: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, + source_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), + ) + elif self.name == pl_expr.BooleanFunction.IsDuplicated: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, + source_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + ) + elif self.name == pl_expr.BooleanFunction.AllHorizontal: + return Column( + reduce( + partial( + plc.binaryop.binary_operation, + op=plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + output_type=self.dtype, + ), + (c.obj for c in columns), + ) + ) + elif self.name == pl_expr.BooleanFunction.AnyHorizontal: + return Column( + reduce( + partial( + plc.binaryop.binary_operation, + op=plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + output_type=self.dtype, + ), + (c.obj for c in columns), + ) + ) + elif self.name == pl_expr.BooleanFunction.IsIn: + needles, haystack = columns + return Column(plc.search.contains(haystack.obj, needles.obj)) + elif self.name == pl_expr.BooleanFunction.Not: + (column,) = columns + return Column( + plc.unary.unary_operation(column.obj, plc.unary.UnaryOperator.NOT) + ) + else: + raise NotImplementedError( + f"BooleanFunction {self.name}" + ) # pragma: no cover; handled by init raising diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py new file mode 100644 
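Before the datetime nodes: the Kleene truth tables above agree with what polars reports on the CPU. A quick check of the Null row (assumes a recent polars where Series.any/all take an ignore_nulls keyword):

import polars as pl

s = pl.Series([False, None])
assert s.any(ignore_nulls=False) is None  # False || Null => Null
assert s.any() is False                   # nulls ignored by default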
index 00000000000..596e193d8fe --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py @@ -0,0 +1,131 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""DSL nodes for datetime operations.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, ClassVar + +import pyarrow as pa +import pylibcudf as plc + +from polars.polars import _expr_nodes as pl_expr + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ExecutionContext, Expr + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["TemporalFunction"] + + +class TemporalFunction(Expr): + __slots__ = ("name", "options") + _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, plc.datetime.DatetimeComponent]] = { + pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR, + pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH, + pl_expr.TemporalFunction.Day: plc.datetime.DatetimeComponent.DAY, + pl_expr.TemporalFunction.WeekDay: plc.datetime.DatetimeComponent.WEEKDAY, + pl_expr.TemporalFunction.Hour: plc.datetime.DatetimeComponent.HOUR, + pl_expr.TemporalFunction.Minute: plc.datetime.DatetimeComponent.MINUTE, + pl_expr.TemporalFunction.Second: plc.datetime.DatetimeComponent.SECOND, + pl_expr.TemporalFunction.Millisecond: plc.datetime.DatetimeComponent.MILLISECOND, + pl_expr.TemporalFunction.Microsecond: plc.datetime.DatetimeComponent.MICROSECOND, + pl_expr.TemporalFunction.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND, + } + _non_child = ("dtype", "name", "options") + + def __init__( + self, + dtype: plc.DataType, + name: pl_expr.TemporalFunction, + options: tuple[Any, ...], + *children: Expr, + ) -> None: + self.dtype = dtype + self.options = options + self.name = name + self.children = children + if self.name not in self._COMPONENT_MAP: + raise NotImplementedError(f"Temporal function {self.name}") + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] + (column,) = columns + if self.name == pl_expr.TemporalFunction.Microsecond: + millis = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MILLISECOND + ) + micros = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MICROSECOND + ) + millis_as_micros = plc.binaryop.binary_operation( + millis, + plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.DataType(plc.TypeId.INT32), + ) + total_micros = plc.binaryop.binary_operation( + micros, + millis_as_micros, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + return Column(total_micros) + elif self.name == pl_expr.TemporalFunction.Nanosecond: + millis = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MILLISECOND + ) + micros = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MICROSECOND + ) + nanos = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.NANOSECOND + ) + millis_as_nanos = plc.binaryop.binary_operation( + millis, +
plc.interop.from_arrow(pa.scalar(1_000_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.types.DataType(plc.types.TypeId.INT32), + ) + micros_as_nanos = plc.binaryop.binary_operation( + micros, + plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.types.DataType(plc.types.TypeId.INT32), + ) + total_nanos = plc.binaryop.binary_operation( + nanos, + millis_as_nanos, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + total_nanos = plc.binaryop.binary_operation( + total_nanos, + micros_as_nanos, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + return Column(total_nanos) + + return Column( + plc.datetime.extract_datetime_component( + column.obj, + self._COMPONENT_MAP[self.name], + ) + ) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py new file mode 100644 index 00000000000..c8aa993b994 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py @@ -0,0 +1,88 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""Literal DSL nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +import pyarrow as pa +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr +from cudf_polars.utils import dtypes + +if TYPE_CHECKING: + from collections.abc import Hashable, Mapping + + import pyarrow as pa + + import polars as pl + + from cudf_polars.containers import DataFrame + +__all__ = ["Literal", "LiteralColumn"] + + +class Literal(Expr): + __slots__ = ("value",) + _non_child = ("dtype", "value") + value: pa.Scalar[Any] + + def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None: + self.dtype = dtype + assert value.type == plc.interop.to_arrow(dtype) + self.value = value + self.children = () + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + # datatype of pyarrow scalar is correct by construction. + return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1)) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([]) + + +class LiteralColumn(Expr): + __slots__ = ("value",) + _non_child = ("dtype", "value") + value: pa.Array[Any, Any] + + def __init__(self, dtype: plc.DataType, value: pl.Series) -> None: + self.dtype = dtype + data = value.to_arrow() + self.value = data.cast(dtypes.downcast_arrow_lists(data.type)) + self.children = () + + def get_hashable(self) -> Hashable: + """Compute a hash of the column.""" + # This is stricter than necessary, but we only need this hash + # for identity in groupby replacements so it's OK. And this + # way we avoid doing potentially expensive compute. + return (type(self), self.dtype, id(self.value)) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + # datatype of pyarrow array is correct by construction. 
+ return Column(plc.interop.from_arrow(self.value)) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([]) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py new file mode 100644 index 00000000000..fa68bcb9426 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""Rolling DSL nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from cudf_polars.dsl.expressions.base import Expr + +if TYPE_CHECKING: + import pylibcudf as plc + +__all__ = ["RollingWindow", "GroupedRollingWindow"] + + +class RollingWindow(Expr): + __slots__ = ("options",) + _non_child = ("dtype", "options") + + def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None: + self.dtype = dtype + self.options = options + self.children = (agg,) + raise NotImplementedError("Rolling window not implemented") + + +class GroupedRollingWindow(Expr): + __slots__ = ("options",) + _non_child = ("dtype", "options") + + def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr) -> None: + self.dtype = dtype + self.options = options + self.children = (agg, *by) + raise NotImplementedError("Grouped rolling window not implemented") diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py new file mode 100644 index 00000000000..0247256e507 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py @@ -0,0 +1,89 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
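Looking back at literal.py above: hashing LiteralColumn on id(self.value) is stricter than value equality, which is safe for its groupby-replacement use; the worst case is a missed cache hit, never a wrong substitution. The distinction, illustrated with pyarrow (hypothetical data):

import pyarrow as pa

a, b = pa.array([1, 2]), pa.array([1, 2])
assert a.equals(b)      # equal payloads...
assert id(a) != id(b)   # ...but distinct objects, so they hash differently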
+# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""DSL nodes for selection operations.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pyarrow as pa +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ExecutionContext, Expr + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["Gather", "Filter"] + + +class Gather(Expr): + __slots__ = () + _non_child = ("dtype",) + + def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr) -> None: + self.dtype = dtype + self.children = (values, indices) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + values, indices = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + lo, hi = plc.reduce.minmax(indices.obj) + lo = plc.interop.to_arrow(lo).as_py() + hi = plc.interop.to_arrow(hi).as_py() + n = df.num_rows + if hi >= n or lo < -n: + raise ValueError("gather indices are out of bounds") + if indices.obj.null_count(): + bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY + obj = plc.replace.replace_nulls( + indices.obj, + plc.interop.from_arrow( + pa.scalar(n, type=plc.interop.to_arrow(indices.obj.type())) + ), + ) + else: + bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK + obj = indices.obj + table = plc.copying.gather(plc.Table([values.obj]), obj, bounds_policy) + return Column(table.columns()[0]) + + +class Filter(Expr): + __slots__ = () + _non_child = ("dtype",) + + def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): + self.dtype = dtype + self.children = (values, indices) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + values, mask = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + table = plc.stream_compaction.apply_boolean_mask( + plc.Table([values.obj]), mask.obj + ) + return Column(table.columns()[0]).sorted_like(values) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py b/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py new file mode 100644 index 00000000000..99512e2ef52 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py @@ -0,0 +1,95 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
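The Filter node just above is a thin wrapper over apply_boolean_mask, keeping rows where the mask is true (on the GPU path a null mask entry also drops its row). A list analogue of the semantics:

values = [10, 20, 30]
mask = [True, False, True]  # a None entry here would likewise drop its row
assert [v for v, keep in zip(values, mask) if keep] == [10, 30]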
+# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""Sorting DSL nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ExecutionContext, Expr +from cudf_polars.utils import sorting + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["Sort", "SortBy"] + + +class Sort(Expr): + __slots__ = ("options",) + _non_child = ("dtype", "options") + + def __init__( + self, dtype: plc.DataType, options: tuple[bool, bool, bool], column: Expr + ) -> None: + self.dtype = dtype + self.options = options + self.children = (column,) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + (child,) = self.children + column = child.evaluate(df, context=context, mapping=mapping) + (stable, nulls_last, descending) = self.options + order, null_order = sorting.sort_order( + [descending], nulls_last=[nulls_last], num_keys=1 + ) + do_sort = plc.sorting.stable_sort if stable else plc.sorting.sort + table = do_sort(plc.Table([column.obj]), order, null_order) + return Column( + table.columns()[0], + is_sorted=plc.types.Sorted.YES, + order=order[0], + null_order=null_order[0], + ) + + +class SortBy(Expr): + __slots__ = ("options",) + _non_child = ("dtype", "options") + + def __init__( + self, + dtype: plc.DataType, + options: tuple[bool, tuple[bool], tuple[bool]], + column: Expr, + *by: Expr, + ) -> None: + self.dtype = dtype + self.options = options + self.children = (column, *by) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + column, *by = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + (stable, nulls_last, descending) = self.options + order, null_order = sorting.sort_order( + descending, nulls_last=nulls_last, num_keys=len(by) + ) + do_sort = plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key + table = do_sort( + plc.Table([column.obj]), plc.Table([c.obj for c in by]), order, null_order + ) + return Column(table.columns()[0]) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py new file mode 100644 index 00000000000..62b54c63a8d --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -0,0 +1,282 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0
+# TODO: remove need for this
+# ruff: noqa: D101
+"""DSL nodes for string operations."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
+
+from polars.exceptions import InvalidOperationError
+from polars.polars import _expr_nodes as pl_expr
+
+from cudf_polars.containers import Column
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
+from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn
+
+if TYPE_CHECKING:
+ from collections.abc import Mapping
+
+ from cudf_polars.containers import DataFrame
+
+__all__ = ["StringFunction"]
+
+
+class StringFunction(Expr):
+ __slots__ = ("name", "options", "_regex_program")
+ _non_child = ("dtype", "name", "options")
+
+ def __init__(
+ self,
+ dtype: plc.DataType,
+ name: pl_expr.StringFunction,
+ options: tuple[Any, ...],
+ *children: Expr,
+ ) -> None:
+ self.dtype = dtype
+ self.options = options
+ self.name = name
+ self.children = children
+ self._validate_input()
+
+ def _validate_input(self):
+ if self.name not in (
+ pl_expr.StringFunction.Contains,
+ pl_expr.StringFunction.EndsWith,
+ pl_expr.StringFunction.Lowercase,
+ pl_expr.StringFunction.Replace,
+ pl_expr.StringFunction.ReplaceMany,
+ pl_expr.StringFunction.Slice,
+ pl_expr.StringFunction.Strptime,
+ pl_expr.StringFunction.StartsWith,
+ pl_expr.StringFunction.StripChars,
+ pl_expr.StringFunction.StripCharsStart,
+ pl_expr.StringFunction.StripCharsEnd,
+ pl_expr.StringFunction.Uppercase,
+ ):
+ raise NotImplementedError(f"String function {self.name}")
+ if self.name == pl_expr.StringFunction.Contains:
+ literal, strict = self.options
+ if not literal:
+ if not strict:
+ raise NotImplementedError(
+ f"{strict=} is not supported for regex contains"
+ )
+ if not isinstance(self.children[1], Literal):
+ raise NotImplementedError(
+ "Regex contains only supports a scalar pattern"
+ )
+ pattern = self.children[1].value.as_py()
+ try:
+ self._regex_program = plc.strings.regex_program.RegexProgram.create(
+ pattern,
+ flags=plc.strings.regex_flags.RegexFlags.DEFAULT,
+ )
+ except RuntimeError as e:
+ raise NotImplementedError(
+ f"Unsupported regex {pattern} for GPU engine."
+ ) from e + elif self.name == pl_expr.StringFunction.Replace: + _, literal = self.options + if not literal: + raise NotImplementedError("literal=False is not supported for replace") + if not all(isinstance(expr, Literal) for expr in self.children[1:]): + raise NotImplementedError("replace only supports scalar target") + target = self.children[1] + if target.value == pa.scalar("", type=pa.string()): + raise NotImplementedError( + "libcudf replace does not support empty strings" + ) + elif self.name == pl_expr.StringFunction.ReplaceMany: + (ascii_case_insensitive,) = self.options + if ascii_case_insensitive: + raise NotImplementedError( + "ascii_case_insensitive not implemented for replace_many" + ) + if not all( + isinstance(expr, (LiteralColumn, Literal)) for expr in self.children[1:] + ): + raise NotImplementedError("replace_many only supports literal inputs") + target = self.children[1] + if pc.any(pc.equal(target.value, "")).as_py(): + raise NotImplementedError( + "libcudf replace_many is implemented differently from polars " + "for empty strings" + ) + elif self.name == pl_expr.StringFunction.Slice: + if not all(isinstance(child, Literal) for child in self.children[1:]): + raise NotImplementedError( + "Slice only supports literal start and stop values" + ) + elif self.name == pl_expr.StringFunction.Strptime: + format, _, exact, cache = self.options + if cache: + raise NotImplementedError("Strptime cache is a CPU feature") + if format is None: + raise NotImplementedError("Strptime format is required") + if not exact: + raise NotImplementedError("Strptime does not support exact=False") + elif self.name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + if not isinstance(self.children[1], Literal): + raise NotImplementedError( + "strip operations only support scalar patterns" + ) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if self.name == pl_expr.StringFunction.Contains: + child, arg = self.children + column = child.evaluate(df, context=context, mapping=mapping) + + literal, _ = self.options + if literal: + pat = arg.evaluate(df, context=context, mapping=mapping) + pattern = ( + pat.obj_scalar + if pat.is_scalar and pat.obj.size() != column.obj.size() + else pat.obj + ) + return Column(plc.strings.find.contains(column.obj, pattern)) + else: + return Column( + plc.strings.contains.contains_re(column.obj, self._regex_program) + ) + elif self.name == pl_expr.StringFunction.Slice: + child, expr_offset, expr_length = self.children + assert isinstance(expr_offset, Literal) + assert isinstance(expr_length, Literal) + + column = child.evaluate(df, context=context, mapping=mapping) + # libcudf slices via [start,stop). + # polars slices with offset + length where start == offset + # stop = start + length. Negative values for start look backward + # from the last element of the string. If the end index would be + # below zero, an empty string is returned. 
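+ # A few illustrative cases (assumed polars semantics, worked
+ # through the mapping below; not part of the original comment):
+ # slice(-3, 2) on "hello" -> start=-3, stop=-1 -> "ll"
+ # slice(1) on "hello" -> start=1, stop=None -> "ello"
+ # slice(-3, 5) on "hello" -> length overruns the end, stop=None -> "llo"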
+ # Do this maths on the host + start = expr_offset.value.as_py() + length = expr_length.value.as_py() + + if length == 0: + stop = start + else: + # No length indicates a scan to the end + # The libcudf equivalent is a null stop + stop = start + length if length else None + if length and start < 0 and length >= -start: + stop = None + return Column( + plc.strings.slice.slice_strings( + column.obj, + plc.interop.from_arrow(pa.scalar(start, type=pa.int32())), + plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), + ) + ) + elif self.name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + column, chars = ( + c.evaluate(df, context=context, mapping=mapping) for c in self.children + ) + if self.name == pl_expr.StringFunction.StripCharsStart: + side = plc.strings.SideType.LEFT + elif self.name == pl_expr.StringFunction.StripCharsEnd: + side = plc.strings.SideType.RIGHT + else: + side = plc.strings.SideType.BOTH + return Column(plc.strings.strip.strip(column.obj, side, chars.obj_scalar)) + + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] + if self.name == pl_expr.StringFunction.Lowercase: + (column,) = columns + return Column(plc.strings.case.to_lower(column.obj)) + elif self.name == pl_expr.StringFunction.Uppercase: + (column,) = columns + return Column(plc.strings.case.to_upper(column.obj)) + elif self.name == pl_expr.StringFunction.EndsWith: + column, suffix = columns + return Column( + plc.strings.find.ends_with( + column.obj, + suffix.obj_scalar + if column.obj.size() != suffix.obj.size() and suffix.is_scalar + else suffix.obj, + ) + ) + elif self.name == pl_expr.StringFunction.StartsWith: + column, prefix = columns + return Column( + plc.strings.find.starts_with( + column.obj, + prefix.obj_scalar + if column.obj.size() != prefix.obj.size() and prefix.is_scalar + else prefix.obj, + ) + ) + elif self.name == pl_expr.StringFunction.Strptime: + # TODO: ignores ambiguous + format, strict, exact, cache = self.options + col = self.children[0].evaluate(df, context=context, mapping=mapping) + + is_timestamps = plc.strings.convert.convert_datetime.is_timestamp( + col.obj, format + ) + + if strict: + if not plc.interop.to_arrow( + plc.reduce.reduce( + is_timestamps, + plc.aggregation.all(), + plc.DataType(plc.TypeId.BOOL8), + ) + ).as_py(): + raise InvalidOperationError("conversion from `str` failed.") + else: + not_timestamps = plc.unary.unary_operation( + is_timestamps, plc.unary.UnaryOperator.NOT + ) + + null = plc.interop.from_arrow(pa.scalar(None, type=pa.string())) + res = plc.copying.boolean_mask_scatter( + [null], plc.Table([col.obj]), not_timestamps + ) + return Column( + plc.strings.convert.convert_datetime.to_timestamps( + res.columns()[0], self.dtype, format + ) + ) + elif self.name == pl_expr.StringFunction.Replace: + column, target, repl = columns + n, _ = self.options + return Column( + plc.strings.replace.replace( + column.obj, target.obj_scalar, repl.obj_scalar, maxrepl=n + ) + ) + elif self.name == pl_expr.StringFunction.ReplaceMany: + column, target, repl = columns + return Column( + plc.strings.replace.replace_multiple(column.obj, target.obj, repl.obj) + ) + raise NotImplementedError( + f"StringFunction {self.name}" + ) # pragma: no cover; handled by init raising diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py new file mode 100644 index 00000000000..d2b5d6bae29 
--- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""DSL nodes for ternary operations.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ( + ExecutionContext, + Expr, +) + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + + +__all__ = ["Ternary"] + + +class Ternary(Expr): + __slots__ = () + _non_child = ("dtype",) + + def __init__( + self, dtype: plc.DataType, when: Expr, then: Expr, otherwise: Expr + ) -> None: + self.dtype = dtype + self.children = (when, then, otherwise) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + when, then, otherwise = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + then_obj = then.obj_scalar if then.is_scalar else then.obj + otherwise_obj = otherwise.obj_scalar if otherwise.is_scalar else otherwise.obj + return Column(plc.copying.copy_if_else(then_obj, otherwise_obj, when.obj)) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py new file mode 100644 index 00000000000..53f6ed29239 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -0,0 +1,328 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0
+# TODO: remove need for this
+"""DSL nodes for unary operations."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, ClassVar
+
+import pyarrow as pa
+import pylibcudf as plc
+
+from cudf_polars.containers import Column
+from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr
+from cudf_polars.dsl.expressions.literal import Literal
+from cudf_polars.utils import dtypes
+
+if TYPE_CHECKING:
+ from collections.abc import Mapping
+
+ from cudf_polars.containers import DataFrame
+
+__all__ = ["Cast", "UnaryFunction", "Len"]
+
+
+class Cast(Expr):
+ """Class representing a cast of an expression."""
+
+ __slots__ = ()
+ _non_child = ("dtype",)
+
+ def __init__(self, dtype: plc.DataType, value: Expr) -> None:
+ self.dtype = dtype
+ self.children = (value,)
+ if not dtypes.can_cast(value.dtype, self.dtype):
+ raise NotImplementedError(
+ f"Can't cast {value.dtype.id().name} to {self.dtype.id().name}"
+ )
+
+ def do_evaluate(
+ self,
+ df: DataFrame,
+ *,
+ context: ExecutionContext = ExecutionContext.FRAME,
+ mapping: Mapping[Expr, Column] | None = None,
+ ) -> Column:
+ """Evaluate this expression given a dataframe for context."""
+ (child,) = self.children
+ column = child.evaluate(df, context=context, mapping=mapping)
+ return Column(plc.unary.cast(column.obj, self.dtype)).sorted_like(column)
+
+ def collect_agg(self, *, depth: int) -> AggInfo:
+ """Collect information about aggregations in groupbys."""
+ # TODO: Could do with sort-based groupby and segmented filter
+ (child,) = self.children
+ return child.collect_agg(depth=depth)
+
+
+class Len(Expr):
+ """Class representing the length of an expression."""
+
+ def __init__(self, dtype: plc.DataType) -> None:
+ self.dtype = dtype
+ self.children = ()
+
+ def do_evaluate(
+ self,
+ df: DataFrame,
+ *,
+ context: ExecutionContext = ExecutionContext.FRAME,
+ mapping: Mapping[Expr, Column] | None = None,
+ ) -> Column:
+ """Evaluate this expression given a dataframe for context."""
+ return Column(
+ plc.Column.from_scalar(
+ plc.interop.from_arrow(
+ pa.scalar(df.num_rows, type=plc.interop.to_arrow(self.dtype))
+ ),
+ 1,
+ )
+ )
+
+ def collect_agg(self, *, depth: int) -> AggInfo:
+ """Collect information about aggregations in groupbys."""
+ # TODO: polars returns a uint, not an int for count
+ return AggInfo(
+ [(None, plc.aggregation.count(plc.types.NullPolicy.INCLUDE), self)]
+ )
+
+
+class UnaryFunction(Expr):
+ """Class representing unary functions of an expression."""
+
+ __slots__ = ("name", "options")
+ _non_child = ("dtype", "name", "options")
+
+ # Note: log and pow are handled via translation to binops
+ _OP_MAPPING: ClassVar[dict[str, plc.unary.UnaryOperator]] = {
+ "sin": plc.unary.UnaryOperator.SIN,
+ "cos": plc.unary.UnaryOperator.COS,
+ "tan": plc.unary.UnaryOperator.TAN,
+ "arcsin": plc.unary.UnaryOperator.ARCSIN,
+ "arccos": plc.unary.UnaryOperator.ARCCOS,
+ "arctan": plc.unary.UnaryOperator.ARCTAN,
+ "sinh": plc.unary.UnaryOperator.SINH,
+ "cosh": plc.unary.UnaryOperator.COSH,
+ "tanh": plc.unary.UnaryOperator.TANH,
+ "arcsinh": plc.unary.UnaryOperator.ARCSINH,
+ "arccosh": plc.unary.UnaryOperator.ARCCOSH,
+ "arctanh": plc.unary.UnaryOperator.ARCTANH,
+ "exp": plc.unary.UnaryOperator.EXP,
+ "sqrt": plc.unary.UnaryOperator.SQRT,
+ "cbrt": plc.unary.UnaryOperator.CBRT,
+ "ceil": plc.unary.UnaryOperator.CEIL,
+ "floor": plc.unary.UnaryOperator.FLOOR,
+ "abs": plc.unary.UnaryOperator.ABS,
+ "bit_invert": plc.unary.UnaryOperator.BIT_INVERT,
+ "not": plc.unary.UnaryOperator.NOT, + } + _supported_misc_fns = frozenset( + { + "drop_nulls", + "fill_null", + "mask_nans", + "round", + "set_sorted", + "unique", + } + ) + _supported_cum_aggs = frozenset( + { + "cum_min", + "cum_max", + "cum_prod", + "cum_sum", + } + ) + _supported_fns = frozenset().union( + _supported_misc_fns, _supported_cum_aggs, _OP_MAPPING.keys() + ) + + def __init__( + self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr + ) -> None: + self.dtype = dtype + self.name = name + self.options = options + self.children = children + + if self.name not in UnaryFunction._supported_fns: + raise NotImplementedError(f"Unary function {name=}") + if self.name in UnaryFunction._supported_cum_aggs: + (reverse,) = self.options + if reverse: + raise NotImplementedError( + "reverse=True is not supported for cumulative aggregations" + ) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if self.name == "mask_nans": + (child,) = self.children + return child.evaluate(df, context=context, mapping=mapping).mask_nans() + if self.name == "round": + (decimal_places,) = self.options + (values,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + return Column( + plc.round.round( + values.obj, decimal_places, plc.round.RoundingMethod.HALF_UP + ) + ).sorted_like(values) + elif self.name == "unique": + (maintain_order,) = self.options + (values,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + # Only one column, so keep_any is the same as keep_first + # for stable distinct + keep = plc.stream_compaction.DuplicateKeepOption.KEEP_ANY + if values.is_sorted: + maintain_order = True + result = plc.stream_compaction.unique( + plc.Table([values.obj]), + [0], + keep, + plc.types.NullEquality.EQUAL, + ) + else: + distinct = ( + plc.stream_compaction.stable_distinct + if maintain_order + else plc.stream_compaction.distinct + ) + result = distinct( + plc.Table([values.obj]), + [0], + keep, + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + (column,) = result.columns() + if maintain_order: + return Column(column).sorted_like(values) + return Column(column) + elif self.name == "set_sorted": + (column,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + (asc,) = self.options + order = ( + plc.types.Order.ASCENDING + if asc == "ascending" + else plc.types.Order.DESCENDING + ) + null_order = plc.types.NullOrder.BEFORE + if column.obj.null_count() > 0 and (n := column.obj.size()) > 1: + # PERF: This invokes four stream synchronisations! 
+ has_nulls_first = not plc.copying.get_element(column.obj, 0).is_valid() + has_nulls_last = not plc.copying.get_element( + column.obj, n - 1 + ).is_valid() + if (order == plc.types.Order.DESCENDING and has_nulls_first) or ( + order == plc.types.Order.ASCENDING and has_nulls_last + ): + null_order = plc.types.NullOrder.AFTER + return column.set_sorted( + is_sorted=plc.types.Sorted.YES, + order=order, + null_order=null_order, + ) + elif self.name == "drop_nulls": + (column,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + return Column( + plc.stream_compaction.drop_nulls( + plc.Table([column.obj]), [0], 1 + ).columns()[0] + ) + elif self.name == "fill_null": + column = self.children[0].evaluate(df, context=context, mapping=mapping) + if isinstance(self.children[1], Literal): + arg = plc.interop.from_arrow(self.children[1].value) + else: + evaluated = self.children[1].evaluate( + df, context=context, mapping=mapping + ) + arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj + return Column(plc.replace.replace_nulls(column.obj, arg)) + elif self.name in self._OP_MAPPING: + column = self.children[0].evaluate(df, context=context, mapping=mapping) + if column.obj.type().id() != self.dtype.id(): + arg = plc.unary.cast(column.obj, self.dtype) + else: + arg = column.obj + return Column(plc.unary.unary_operation(arg, self._OP_MAPPING[self.name])) + elif self.name in UnaryFunction._supported_cum_aggs: + column = self.children[0].evaluate(df, context=context, mapping=mapping) + plc_col = column.obj + col_type = column.obj.type() + # cum_sum casts + # Int8, UInt8, Int16, UInt16 -> Int64 for overflow prevention + # Bool -> UInt32 + # cum_prod casts integer dtypes < int64 and bool to int64 + # See: + # https://github.com/pola-rs/polars/blob/main/crates/polars-ops/src/series/ops/cum_agg.rs + if ( + self.name == "cum_sum" + and col_type.id() + in { + plc.types.TypeId.INT8, + plc.types.TypeId.UINT8, + plc.types.TypeId.INT16, + plc.types.TypeId.UINT16, + } + ) or ( + self.name == "cum_prod" + and plc.traits.is_integral(col_type) + and plc.types.size_of(col_type) <= 4 + ): + plc_col = plc.unary.cast( + plc_col, plc.types.DataType(plc.types.TypeId.INT64) + ) + elif ( + self.name == "cum_sum" + and column.obj.type().id() == plc.types.TypeId.BOOL8 + ): + plc_col = plc.unary.cast( + plc_col, plc.types.DataType(plc.types.TypeId.UINT32) + ) + if self.name == "cum_sum": + agg = plc.aggregation.sum() + elif self.name == "cum_prod": + agg = plc.aggregation.product() + elif self.name == "cum_min": + agg = plc.aggregation.min() + elif self.name == "cum_max": + agg = plc.aggregation.max() + + return Column(plc.reduce.scan(plc_col, agg, plc.reduce.ScanType.INCLUSIVE)) + raise NotImplementedError( + f"Unimplemented unary function {self.name=}" + ) # pragma: no cover; init trips first + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if self.name in {"unique", "drop_nulls"} | self._supported_cum_aggs: + raise NotImplementedError(f"{self.name} in groupby") + if depth == 1: + # inside aggregation, need to pre-evaluate, groupby + # construction has checked that we don't have nested aggs, + # so stop the recursion and return ourselves for pre-eval + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + else: + (child,) = self.children + return child.collect_agg(depth=depth) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 
1c61075be22..f79e229d3f3 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -13,8 +13,8 @@
 from __future__ import annotations
-import dataclasses
 import itertools
+import json
 from functools import cache
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar
@@ -26,11 +26,12 @@
 import polars as pl
 import cudf_polars.dsl.expr as expr
-from cudf_polars.containers import DataFrame, NamedColumn
-from cudf_polars.utils import dtypes, sorting
+from cudf_polars.containers import Column, DataFrame
+from cudf_polars.dsl.nodebase import Node
+from cudf_polars.utils import dtypes
 if TYPE_CHECKING:
- from collections.abc import Callable, MutableMapping
+ from collections.abc import Callable, Hashable, MutableMapping, Sequence
 from typing import Literal
 from cudf_polars.typing import Schema
@@ -57,9 +58,7 @@
 ]
-def broadcast(
- *columns: NamedColumn, target_length: int | None = None
-) -> list[NamedColumn]:
+def broadcast(*columns: Column, target_length: int | None = None) -> list[Column]:
 """
 Broadcast a sequence of columns to a common length.
@@ -112,27 +111,38 @@ def broadcast(
 return [
 column
 if column.obj.size() != 1
- else NamedColumn(
+ else Column(
 plc.Column.from_scalar(column.obj_scalar, nrows),
- column.name,
 is_sorted=plc.types.Sorted.YES,
 order=plc.types.Order.ASCENDING,
 null_order=plc.types.NullOrder.BEFORE,
+ name=column.name,
 )
 for column in columns
 ]
-@dataclasses.dataclass
-class IR:
+class IR(Node["IR"]):
 """Abstract plan node, representing an unevaluated dataframe."""
+ __slots__ = ("schema",)
+ # This annotation is needed because of https://github.com/python/mypy/issues/17981
+ _non_child: ClassVar[tuple[str, ...]] = ("schema",)
 schema: Schema
 """Mapping from column names to their data types."""
- def __post_init__(self):
- """Validate preconditions."""
- pass # noqa: PIE790
+ def get_hashable(self) -> Hashable:
+ """
+ Hashable representation of the node, with special handling of the schema dictionary.
+
+ Since the schema is a dictionary, even though it is morally
+ immutable, it is not hashable. We therefore convert it to
+ tuples for hashing purposes.
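+
+ For example, a schema ``{"a": int64, "b": float64}`` participates
+ in the hash as ``(("a", int64), ("b", float64))`` (types shown
+ schematically here).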
+ """ + # Schema is the first constructor argument + args = self._ctor_arguments(self.children)[1:] + schema_hash = tuple(self.schema.items()) + return (type(self), schema_hash, args) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """ @@ -161,24 +171,50 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ) # pragma: no cover -@dataclasses.dataclass class PythonScan(IR): """Representation of input from a python function.""" + __slots__ = ("options", "predicate") + _non_child = ("schema", "options", "predicate") options: Any """Arbitrary options.""" predicate: expr.NamedExpr | None """Filter to apply to the constructed dataframe before returning it.""" - def __post_init__(self): - """Validate preconditions.""" + def __init__(self, schema: Schema, options: Any, predicate: expr.NamedExpr | None): + self.schema = schema + self.options = options + self.predicate = predicate + self.children = () raise NotImplementedError("PythonScan not implemented") -@dataclasses.dataclass class Scan(IR): """Input from files.""" + __slots__ = ( + "typ", + "reader_options", + "cloud_options", + "paths", + "with_columns", + "skip_rows", + "n_rows", + "row_index", + "predicate", + ) + _non_child = ( + "schema", + "typ", + "reader_options", + "cloud_options", + "paths", + "with_columns", + "skip_rows", + "n_rows", + "row_index", + "predicate", + ) typ: str """What type of file are we reading? Parquet, CSV, etc...""" reader_options: dict[str, Any] @@ -187,7 +223,7 @@ class Scan(IR): """Cloud-related authentication options, currently ignored.""" paths: list[str] """List of paths to read from.""" - with_columns: list[str] + with_columns: list[str] | None """Projected columns to return.""" skip_rows: int """Rows to skip at the start when reading.""" @@ -198,9 +234,30 @@ class Scan(IR): predicate: expr.NamedExpr | None """Mask to apply to the read dataframe.""" - def __post_init__(self) -> None: - """Validate preconditions.""" - super().__post_init__() + def __init__( + self, + schema: Schema, + typ: str, + reader_options: dict[str, Any], + cloud_options: dict[str, Any] | None, + paths: list[str], + with_columns: list[str] | None, + skip_rows: int, + n_rows: int, + row_index: tuple[str, int] | None, + predicate: expr.NamedExpr | None, + ): + self.schema = schema + self.typ = typ + self.reader_options = reader_options + self.cloud_options = cloud_options + self.paths = paths + self.with_columns = with_columns + self.skip_rows = skip_rows + self.n_rows = n_rows + self.row_index = row_index + self.predicate = predicate + self.children = () if self.typ not in ("csv", "parquet", "ndjson"): # pragma: no cover # This line is unhittable ATM since IPC/Anonymous scan raise # on the polars side @@ -260,6 +317,28 @@ def __post_init__(self) -> None: "Reading only parquet metadata to produce row index." ) + def get_hashable(self) -> Hashable: + """ + Hashable representation of the node. + + The options dictionaries are serialised for hashing purposes + as json strings. 
+ """ + schema_hash = tuple(self.schema.items()) + return ( + type(self), + schema_hash, + self.typ, + json.dumps(self.reader_options), + json.dumps(self.cloud_options), + tuple(self.paths), + tuple(self.with_columns) if self.with_columns is not None else None, + self.skip_rows, + self.n_rows, + self.row_index, + self.predicate, + ) + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" with_columns = self.with_columns @@ -385,15 +464,17 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: init = plc.interop.from_arrow( pa.scalar(offset, type=plc.interop.to_arrow(dtype)) ) - index = NamedColumn( + index = Column( plc.filling.sequence(df.num_rows, init, step), - name, is_sorted=plc.types.Sorted.YES, order=plc.types.Order.ASCENDING, null_order=plc.types.NullOrder.AFTER, + name=name, ) df = DataFrame([index, *df.columns]) - assert all(c.obj.type() == self.schema[c.name] for c in df.columns) + assert all( + c.obj.type() == self.schema[name] for name, c in df.column_map.items() + ) if self.predicate is None: return df else: @@ -401,7 +482,6 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return df.filter(mask) -@dataclasses.dataclass class Cache(IR): """ Return a cached plan node. @@ -409,20 +489,25 @@ class Cache(IR): Used for CSE at the plan level. """ + __slots__ = ("key",) + _non_child = ("schema", "key") key: int """The cache key.""" - value: IR - """The unevaluated node to cache.""" + + def __init__(self, schema: Schema, key: int, value: IR): + self.schema = schema + self.key = key + self.children = (value,) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" try: return cache[self.key] except KeyError: - return cache.setdefault(self.key, self.value.evaluate(cache=cache)) + (value,) = self.children + return cache.setdefault(self.key, value.evaluate(cache=cache)) -@dataclasses.dataclass class DataFrameScan(IR): """ Input from an existing polars DataFrame. @@ -430,13 +515,38 @@ class DataFrameScan(IR): This typically arises from ``q.collect().lazy()`` """ + __slots__ = ("df", "projection", "predicate") + _non_child = ("schema", "df", "projection", "predicate") df: Any """Polars LazyFrame object.""" - projection: list[str] + projection: tuple[str, ...] | None """List of columns to project out.""" predicate: expr.NamedExpr | None """Mask to apply.""" + def __init__( + self, + schema: Schema, + df: Any, + projection: Sequence[str] | None, + predicate: expr.NamedExpr | None, + ): + self.schema = schema + self.df = df + self.projection = tuple(projection) if projection is not None else None + self.predicate = predicate + self.children = () + + def get_hashable(self) -> Hashable: + """ + Hashable representation of the node. + + The (heavy) dataframe object is hashed as its id, so this is + not stable across runs, or repeat instances of the same equal dataframes. 
+ """ + schema_hash = tuple(self.schema.items()) + return (type(self), schema_hash, id(self.df), self.projection, self.predicate) + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" pdf = pl.DataFrame._from_pydf(self.df) @@ -454,28 +564,39 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return df -@dataclasses.dataclass class Select(IR): """Produce a new dataframe selecting given expressions from an input.""" - df: IR - """Input dataframe.""" - expr: list[expr.NamedExpr] + __slots__ = ("exprs", "should_broadcast") + _non_child = ("schema", "exprs", "should_broadcast") + exprs: tuple[expr.NamedExpr, ...] """List of expressions to evaluate to form the new dataframe.""" should_broadcast: bool """Should columns be broadcast?""" + def __init__( + self, + schema: Schema, + exprs: Sequence[expr.NamedExpr], + should_broadcast: bool, # noqa: FBT001 + df: IR, + ): + self.schema = schema + self.exprs = tuple(exprs) + self.should_broadcast = should_broadcast + self.children = (df,) + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) # Handle any broadcasting - columns = [e.evaluate(df) for e in self.expr] + columns = [e.evaluate(df) for e in self.exprs] if self.should_broadcast: columns = broadcast(*columns) return DataFrame(columns) -@dataclasses.dataclass class Reduce(IR): """ Produce a new dataframe selecting given expressions from an input. @@ -483,36 +604,73 @@ class Reduce(IR): This is a special case of :class:`Select` where all outputs are a single row. """ - df: IR - """Input dataframe.""" - expr: list[expr.NamedExpr] + __slots__ = ("exprs",) + _non_child = ("schema", "exprs") + exprs: tuple[expr.NamedExpr, ...] """List of expressions to evaluate to form the new dataframe.""" + def __init__( + self, schema: Schema, exprs: Sequence[expr.NamedExpr], df: IR + ): # pragma: no cover; polars doesn't emit this node yet + self.schema = schema + self.exprs = tuple(exprs) + self.children = (df,) + def evaluate( self, *, cache: MutableMapping[int, DataFrame] ) -> DataFrame: # pragma: no cover; polars doesn't emit this node yet """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) - columns = broadcast(*(e.evaluate(df) for e in self.expr)) + (child,) = self.children + df = child.evaluate(cache=cache) + columns = broadcast(*(e.evaluate(df) for e in self.exprs)) assert all(column.obj.size() == 1 for column in columns) return DataFrame(columns) -@dataclasses.dataclass class GroupBy(IR): """Perform a groupby.""" - df: IR - """Input dataframe.""" - agg_requests: list[expr.NamedExpr] - """List of expressions to evaluate groupwise.""" - keys: list[expr.NamedExpr] - """List of expressions forming the keys.""" + __slots__ = ( + "agg_requests", + "keys", + "maintain_order", + "options", + "agg_infos", + ) + _non_child = ("schema", "keys", "agg_requests", "maintain_order", "options") + keys: tuple[expr.NamedExpr, ...] + """Grouping keys.""" + agg_requests: tuple[expr.NamedExpr, ...] 
+ """Aggregation expressions.""" maintain_order: bool - """Should the order of the input dataframe be maintained?""" + """Preserve order in groupby.""" options: Any - """Options controlling style of groupby.""" - agg_infos: list[expr.AggInfo] = dataclasses.field(init=False) + """Arbitrary options.""" + + def __init__( + self, + schema: Schema, + keys: Sequence[expr.NamedExpr], + agg_requests: Sequence[expr.NamedExpr], + maintain_order: bool, # noqa: FBT001 + options: Any, + df: IR, + ): + self.schema = schema + self.keys = tuple(keys) + self.agg_requests = tuple(agg_requests) + self.maintain_order = maintain_order + self.options = options + self.children = (df,) + if self.options.rolling: + raise NotImplementedError( + "rolling window/groupby" + ) # pragma: no cover; rollingwindow constructor has already raised + if self.options.dynamic: + raise NotImplementedError("dynamic group by") + if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests): + raise NotImplementedError("Nested aggregations in groupby") + self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] @staticmethod def check_agg(agg: expr.Expr) -> int: @@ -542,22 +700,10 @@ def check_agg(agg: expr.Expr) -> int: else: raise NotImplementedError(f"No handler for {agg=}") - def __post_init__(self) -> None: - """Check whether all the aggregations are implemented.""" - super().__post_init__() - if self.options.rolling: - raise NotImplementedError( - "rolling window/groupby" - ) # pragma: no cover; rollingwindow constructor has already raised - if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests): - raise NotImplementedError("Nested aggregations in groupby") - self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] - if len(self.keys) == 0: - raise NotImplementedError("dynamic groupby") - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) keys = broadcast( *(k.evaluate(df) for k in self.keys), target_length=df.num_rows ) @@ -588,15 +734,14 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: requests.append(plc.groupby.GroupByRequest(col, [req])) replacements.append(rep) group_keys, raw_tables = grouper.aggregate(requests) - # TODO: names - raw_columns: list[NamedColumn] = [] + raw_columns: list[Column] = [] for i, table in enumerate(raw_tables): (column,) = table.columns() - raw_columns.append(NamedColumn(column, f"tmp{i}")) + raw_columns.append(Column(column, name=f"tmp{i}")) mapping = dict(zip(replacements, raw_columns, strict=True)) result_keys = [ - NamedColumn(gk, k.name) - for gk, k in zip(group_keys.columns(), keys, strict=True) + Column(grouped_key, name=key.name) + for key, grouped_key in zip(keys, group_keys.columns(), strict=True) ] result_subs = DataFrame(raw_columns) results = [ @@ -639,31 +784,28 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) broadcasted = [ - NamedColumn(reordered, b.name) - for reordered, b in zip( + Column(reordered, name=old.name) + for reordered, old in zip( ordered_table.columns(), broadcasted, strict=True ) ] return DataFrame(broadcasted).slice(self.options.slice) -@dataclasses.dataclass class Join(IR): """A join of two dataframes.""" - left: IR - """Left frame.""" - right: IR - """Right frame.""" - left_on: list[expr.NamedExpr] + __slots__ = ("left_on", "right_on", "options") + 
_non_child = ("schema", "left_on", "right_on", "options") + left_on: tuple[expr.NamedExpr, ...] """List of expressions used as keys in the left frame.""" - right_on: list[expr.NamedExpr] + right_on: tuple[expr.NamedExpr, ...] """List of expressions used as keys in the right frame.""" options: tuple[ - Literal["inner", "left", "right", "full", "leftsemi", "leftanti", "cross"], + Literal["inner", "left", "right", "full", "semi", "anti", "cross"], bool, tuple[int, int] | None, - str | None, + str, bool, ] """ @@ -675,9 +817,20 @@ class Join(IR): - coalesce: should key columns be coalesced (only makes sense for outer joins) """ - def __post_init__(self) -> None: - """Validate preconditions.""" - super().__post_init__() + def __init__( + self, + schema: Schema, + left_on: Sequence[expr.NamedExpr], + right_on: Sequence[expr.NamedExpr], + options: Any, + left: IR, + right: IR, + ): + self.schema = schema + self.left_on = tuple(left_on) + self.right_on = tuple(right_on) + self.options = options + self.children = (left, right) if any( isinstance(e.value, expr.Literal) for e in itertools.chain(self.left_on, self.right_on) @@ -687,7 +840,7 @@ def __post_init__(self) -> None: @staticmethod @cache def _joiners( - how: Literal["inner", "left", "right", "full", "leftsemi", "leftanti"], + how: Literal["inner", "left", "right", "full", "semi", "anti"], ) -> tuple[ Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None ]: @@ -709,13 +862,13 @@ def _joiners( plc.copying.OutOfBoundsPolicy.NULLIFY, plc.copying.OutOfBoundsPolicy.NULLIFY, ) - elif how == "leftsemi": + elif how == "semi": return ( plc.join.left_semi_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, None, ) - elif how == "leftanti": + elif how == "anti": return ( plc.join.left_anti_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, @@ -778,32 +931,30 @@ def _reorder_maps( def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - left = self.left.evaluate(cache=cache) - right = self.right.evaluate(cache=cache) + left, right = (c.evaluate(cache=cache) for c in self.children) how, join_nulls, zlice, suffix, coalesce = self.options - suffix = "_right" if suffix is None else suffix if how == "cross": # Separate implementation, since cross_join returns the # result, not the gather maps columns = plc.join.cross_join(left.table, right.table).columns() left_cols = [ - NamedColumn(new, old.name).sorted_like(old) + Column(new, name=old.name).sorted_like(old) for new, old in zip( columns[: left.num_columns], left.columns, strict=True ) ] right_cols = [ - NamedColumn( + Column( new, - old.name - if old.name not in left.column_names_set - else f"{old.name}{suffix}", + name=name + if name not in left.column_names_set + else f"{name}{suffix}", ) - for new, old in zip( - columns[left.num_columns :], right.columns, strict=True + for new, name in zip( + columns[left.num_columns :], right.column_names, strict=True ) ] - return DataFrame([*left_cols, *right_cols]) + return DataFrame([*left_cols, *right_cols]).slice(zlice) # TODO: Waiting on clarity based on https://github.com/pola-rs/polars/issues/17184 left_on = DataFrame(broadcast(*(e.evaluate(left) for e in self.left_on))) right_on = DataFrame(broadcast(*(e.evaluate(right) for e in self.right_on))) @@ -838,18 +989,19 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: plc.copying.gather(right.table, rg, right_policy), right.column_names ) if coalesce and how != "inner": - left = left.replace_columns( - *( - 
NamedColumn( + left = left.with_columns( + ( + Column( plc.replace.replace_nulls(left_col.obj, right_col.obj), - left_col.name, + name=left_col.name, ) for left_col, right_col in zip( left.select_columns(left_on.column_names_set), right.select_columns(right_on.column_names_set), strict=True, ) - ) + ), + replace_only=True, ) right = right.discard_columns(right_on.column_names_set) if how == "right": @@ -866,20 +1018,30 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return result.slice(zlice) -@dataclasses.dataclass class HStack(IR): """Add new columns to a dataframe.""" - df: IR - """Input dataframe.""" - columns: list[expr.NamedExpr] - """List of expressions to produce new columns.""" + __slots__ = ("columns", "should_broadcast") + _non_child = ("schema", "columns", "should_broadcast") should_broadcast: bool - """Should columns be broadcast?""" + """Should the resulting evaluated columns be broadcast to the same length.""" + + def __init__( + self, + schema: Schema, + columns: Sequence[expr.NamedExpr], + should_broadcast: bool, # noqa: FBT001 + df: IR, + ): + self.schema = schema + self.columns = tuple(columns) + self.should_broadcast = should_broadcast + self.children = (df,) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) columns = [c.evaluate(df) for c in self.columns] if self.should_broadcast: columns = broadcast(*columns, target_length=df.num_rows) @@ -895,20 +1057,36 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return df.with_columns(columns) -@dataclasses.dataclass class Distinct(IR): """Produce a new dataframe with distinct rows.""" - df: IR - """Input dataframe.""" + __slots__ = ("keep", "subset", "zlice", "stable") + _non_child = ("schema", "keep", "subset", "zlice", "stable") keep: plc.stream_compaction.DuplicateKeepOption - """Which rows to keep.""" - subset: set[str] | None - """Which columns to inspect when computing distinct rows.""" + """Which distinct value to keep.""" + subset: frozenset[str] | None + """Which columns should be used to define distinctness. 
If None, + then all columns are used.""" zlice: tuple[int, int] | None - """Optional slice to perform after compaction.""" + """Optional slice to apply to the result.""" stable: bool - """Should order be preserved?""" + """Should the result maintain ordering.""" + + def __init__( + self, + schema: Schema, + keep: plc.stream_compaction.DuplicateKeepOption, + subset: frozenset[str] | None, + zlice: tuple[int, int] | None, + stable: bool, # noqa: FBT001 + df: IR, + ): + self.schema = schema + self.keep = keep + self.subset = subset + self.zlice = zlice + self.stable = stable + self.children = (df,) _KEEP_MAP: ClassVar[dict[str, plc.stream_compaction.DuplicateKeepOption]] = { "first": plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, @@ -917,23 +1095,16 @@ class Distinct(IR): "any": plc.stream_compaction.DuplicateKeepOption.KEEP_ANY, } - def __init__(self, schema: Schema, df: IR, options: Any) -> None: - self.schema = schema - self.df = df - (keep, subset, maintain_order, zlice) = options - self.keep = Distinct._KEEP_MAP[keep] - self.subset = set(subset) if subset is not None else None - self.stable = maintain_order - self.zlice = zlice - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) if self.subset is None: indices = list(range(df.num_columns)) + keys_sorted = all(c.is_sorted for c in df.column_map.values()) else: indices = [i for i, k in enumerate(df.column_names) if k in self.subset] - keys_sorted = all(df.columns[i].is_sorted for i in indices) + keys_sorted = all(df.column_map[name].is_sorted for name in self.subset) if keys_sorted: table = plc.stream_compaction.unique( df.table, @@ -954,10 +1125,11 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: plc.types.NullEquality.EQUAL, plc.types.NanEquality.ALL_EQUAL, ) + # TODO: Is this sortedness setting correct result = DataFrame( [ - NamedColumn(c, old.name).sorted_like(old) - for c, old in zip(table.columns(), df.columns, strict=True) + Column(new, name=old.name).sorted_like(old) + for new, old in zip(table.columns(), df.columns, strict=True) ] ) if keys_sorted or self.stable: @@ -965,136 +1137,151 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return result.slice(self.zlice) -@dataclasses.dataclass class Sort(IR): """Sort a dataframe.""" - df: IR - """Input.""" - by: list[expr.NamedExpr] - """List of expressions to produce sort keys.""" - do_sort: Callable[..., plc.Table] - """pylibcudf sorting function.""" + __slots__ = ("by", "order", "null_order", "stable", "zlice") + _non_child = ("schema", "by", "order", "null_order", "stable", "zlice") + by: tuple[expr.NamedExpr, ...] + """Sort keys.""" + order: tuple[plc.types.Order, ...] + """Sort order for each sort key.""" + null_order: tuple[plc.types.NullOrder, ...] 
+ """Null sorting location for each sort key.""" + stable: bool + """Should the sort be stable?""" zlice: tuple[int, int] | None - """Optional slice to apply after sorting.""" - order: list[plc.types.Order] - """Order keys should be sorted in.""" - null_order: list[plc.types.NullOrder] - """Where nulls sort to.""" + """Optional slice to apply to the result.""" def __init__( self, schema: Schema, - df: IR, - by: list[expr.NamedExpr], - options: Any, + by: Sequence[expr.NamedExpr], + order: Sequence[plc.types.Order], + null_order: Sequence[plc.types.NullOrder], + stable: bool, # noqa: FBT001 zlice: tuple[int, int] | None, - ) -> None: + df: IR, + ): self.schema = schema - self.df = df - self.by = by + self.by = tuple(by) + self.order = tuple(order) + self.null_order = tuple(null_order) + self.stable = stable self.zlice = zlice - stable, nulls_last, descending = options - self.order, self.null_order = sorting.sort_order( - descending, nulls_last=nulls_last, num_keys=len(by) - ) - self.do_sort = ( - plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key - ) + self.children = (df,) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) sort_keys = broadcast( *(k.evaluate(df) for k in self.by), target_length=df.num_rows ) - names = {c.name: i for i, c in enumerate(df.columns)} # TODO: More robust identification here. - keys_in_result = [ - i - for k in sort_keys - if (i := names.get(k.name)) is not None and k.obj is df.columns[i].obj - ] - table = self.do_sort( + keys_in_result = { + k.name: i + for i, k in enumerate(sort_keys) + if k.name in df.column_map and k.obj is df.column_map[k.name].obj + } + do_sort = ( + plc.sorting.stable_sort_by_key if self.stable else plc.sorting.sort_by_key + ) + table = do_sort( df.table, plc.Table([k.obj for k in sort_keys]), - self.order, - self.null_order, + list(self.order), + list(self.null_order), ) - columns = [ - NamedColumn(c, old.name) - for c, old in zip(table.columns(), df.columns, strict=True) - ] - # If a sort key is in the result table, set the sortedness property - for k, i in enumerate(keys_in_result): - columns[i] = columns[i].set_sorted( - is_sorted=plc.types.Sorted.YES, - order=self.order[k], - null_order=self.null_order[k], - ) + columns: list[Column] = [] + for name, c in zip(df.column_map, table.columns(), strict=True): + column = Column(c, name=name) + # If a sort key is in the result table, set the sortedness property + if name in keys_in_result: + i = keys_in_result[name] + column = column.set_sorted( + is_sorted=plc.types.Sorted.YES, + order=self.order[i], + null_order=self.null_order[i], + ) + columns.append(column) return DataFrame(columns).slice(self.zlice) -@dataclasses.dataclass class Slice(IR): """Slice a dataframe.""" - df: IR - """Input.""" + __slots__ = ("offset", "length") + _non_child = ("schema", "offset", "length") offset: int """Start of the slice.""" length: int """Length of the slice.""" + def __init__(self, schema: Schema, offset: int, length: int, df: IR): + self.schema = schema + self.offset = offset + self.length = length + self.children = (df,) + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) return df.slice((self.offset, self.length)) -@dataclasses.dataclass class Filter(IR): 
"""Filter a dataframe with a boolean mask.""" - df: IR - """Input.""" + __slots__ = ("mask",) + _non_child = ("schema", "mask") mask: expr.NamedExpr - """Expression evaluating to a mask.""" + """Expression to produce the filter mask.""" + + def __init__(self, schema: Schema, mask: expr.NamedExpr, df: IR): + self.schema = schema + self.mask = mask + self.children = (df,) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) (mask,) = broadcast(self.mask.evaluate(df), target_length=df.num_rows) return df.filter(mask) -@dataclasses.dataclass class Projection(IR): """Select a subset of columns from a dataframe.""" - df: IR - """Input.""" + __slots__ = () + _non_child = ("schema",) + + def __init__(self, schema: Schema, df: IR): + self.schema = schema + self.children = (df,) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) # This can reorder things. columns = broadcast( - *df.select(list(self.schema.keys())).columns, target_length=df.num_rows + *(df.column_map[name] for name in self.schema), target_length=df.num_rows ) return DataFrame(columns) -@dataclasses.dataclass class MapFunction(IR): """Apply some function to a dataframe.""" - df: IR - """Input.""" + __slots__ = ("name", "options") + _non_child = ("schema", "name", "options") name: str - """Function name.""" + """Name of the function to apply""" options: Any - """Arbitrary options, interpreted per function.""" + """Arbitrary name-specific options""" _NAMES: ClassVar[frozenset[str]] = frozenset( [ @@ -1109,9 +1296,11 @@ class MapFunction(IR): ] ) - def __post_init__(self) -> None: - """Validate preconditions.""" - super().__post_init__() + def __init__(self, schema: Schema, name: str, options: Any, df: IR): + self.schema = schema + self.name = name + self.options = options + self.children = (df,) if self.name not in MapFunction._NAMES: raise NotImplementedError(f"Unhandled map function {self.name}") if self.name == "explode": @@ -1125,7 +1314,7 @@ def __post_init__(self) -> None: old, new, _ = self.options # TODO: perhaps polars should validate renaming in the IR? 
if len(new) != len(set(new)) or ( - set(new) & (set(self.df.schema.keys() - set(old))) + set(new) & (set(df.schema.keys()) - set(old)) ): raise NotImplementedError("Duplicate new names in rename.") elif self.name == "unpivot": @@ -1134,31 +1323,31 @@ def __post_init__(self) -> None: variable_name = "variable" if variable_name is None else variable_name if len(pivotees) == 0: index = frozenset(indices) - pivotees = [name for name in self.df.schema if name not in index] + pivotees = [name for name in df.schema if name not in index] if not all( - dtypes.can_cast(self.df.schema[p], self.schema[value_name]) - for p in pivotees + dtypes.can_cast(df.schema[p], self.schema[value_name]) for p in pivotees ): raise NotImplementedError( "Unpivot cannot cast all input columns to " f"{self.schema[value_name].id()}" ) - self.options = (indices, pivotees, variable_name, value_name) + self.options = (tuple(indices), tuple(pivotees), variable_name, value_name) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" + (child,) = self.children if self.name == "rechunk": # No-op in our data model # Don't think this appears in a plan tree from python - return self.df.evaluate(cache=cache) # pragma: no cover + return child.evaluate(cache=cache) # pragma: no cover elif self.name == "rename": - df = self.df.evaluate(cache=cache) + df = child.evaluate(cache=cache) # final tag is "swapping" which is useful for the # optimiser (it blocks some pushdown operations) old, new, _ = self.options return df.rename_columns(dict(zip(old, new, strict=True))) elif self.name == "explode": - df = self.df.evaluate(cache=cache) + df = child.evaluate(cache=cache) ((to_explode,),) = self.options index = df.column_names.index(to_explode) subset = df.column_names_set - {to_explode} @@ -1168,9 +1357,9 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: elif self.name == "unpivot": indices, pivotees, variable_name, value_name = self.options npiv = len(pivotees) - df = self.df.evaluate(cache=cache) + df = child.evaluate(cache=cache) index_columns = [ - NamedColumn(col, name) + Column(col, name=name) for col, name in zip( plc.reshape.tile(df.select(indices).table, npiv).columns(), indices, @@ -1191,50 +1380,56 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: df.num_rows, ).columns() value_column = plc.concatenate.concatenate( - [c.astype(self.schema[value_name]) for c in df.select(pivotees).columns] + [ + df.column_map[pivotee].astype(self.schema[value_name]).obj + for pivotee in pivotees + ] ) return DataFrame( [ *index_columns, - NamedColumn(variable_column, variable_name), - NamedColumn(value_column, value_name), + Column(variable_column, name=variable_name), + Column(value_column, name=value_name), ] ) else: raise AssertionError("Should never be reached") # pragma: no cover -@dataclasses.dataclass class Union(IR): """Concatenate dataframes vertically.""" - dfs: list[IR] - """List of inputs.""" + __slots__ = ("zlice",) + _non_child = ("schema", "zlice") zlice: tuple[int, int] | None - """Optional slice to apply after concatenation.""" + """Optional slice to apply to the result.""" - def __post_init__(self) -> None: - """Validate preconditions.""" - super().__post_init__() - schema = self.dfs[0].schema - if not all(s.schema == schema for s in self.dfs[1:]): + def __init__(self, schema: Schema, zlice: tuple[int, int] | None, *children: IR): + self.schema = schema + self.zlice = zlice + self.children = children + schema = 
self.children[0].schema
+ if not all(s.schema == schema for s in self.children[1:]):
 raise NotImplementedError("Schema mismatch")
 def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
 """Evaluate and return a dataframe."""
 # TODO: only evaluate what we need if we have a slice
- dfs = [df.evaluate(cache=cache) for df in self.dfs]
+ dfs = [df.evaluate(cache=cache) for df in self.children]
 return DataFrame.from_table(
 plc.concatenate.concatenate([df.table for df in dfs]), dfs[0].column_names
 ).slice(self.zlice)
-@dataclasses.dataclass
 class HConcat(IR):
 """Concatenate dataframes horizontally."""
- dfs: list[IR]
- """List of inputs."""
+ __slots__ = ()
+ _non_child = ("schema",)
+
+ def __init__(self, schema: Schema, *children: IR):
+ self.schema = schema
+ self.children = children
 @staticmethod
 def _extend_with_nulls(table: plc.Table, *, nrows: int) -> plc.Table:
@@ -1266,7 +1461,7 @@ def _extend_with_nulls(table: plc.Table, *, nrows: int) -> plc.Table:
 def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
 """Evaluate and return a dataframe."""
- dfs = [df.evaluate(cache=cache) for df in self.dfs]
+ dfs = [df.evaluate(cache=cache) for df in self.children]
 max_rows = max(df.num_rows for df in dfs)
 # Horizontal concatenation extends shorter tables with nulls
 dfs = [
@@ -1278,6 +1473,4 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
 )
 for df in dfs
 ]
- return DataFrame(
- list(itertools.chain.from_iterable(df.columns for df in dfs)),
- )
+ return DataFrame(itertools.chain.from_iterable(df.columns for df in dfs)) diff --git a/python/cudf_polars/cudf_polars/dsl/nodebase.py b/python/cudf_polars/cudf_polars/dsl/nodebase.py new file mode 100644 index 00000000000..228d300f467
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/dsl/nodebase.py
@@ -0,0 +1,152 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Base class for IR nodes, and utilities."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar
+
+if TYPE_CHECKING:
+ from collections.abc import Hashable, Sequence
+
+ from typing_extensions import Self
+
+
+__all__: list[str] = ["Node"]
+
+T = TypeVar("T", bound="Node[Any]")
+
+
+class Node(Generic[T]):
+ """
+ An abstract node type.
+
+ Nodes are immutable!
+
+ This contains a (potentially empty) tuple of child nodes,
+ along with non-child data. For uniform reconstruction and
+ implementation of hashing and equality schemes, child classes need
+ to provide a certain amount of metadata when they are defined.
+ Specifically, the ``_non_child`` attribute must list, in-order,
+ the names of the slots that are passed to the constructor. The
+ constructor must take arguments in the order ``(*_non_child,
+ *children)``.
+ """
+
+ __slots__ = ("_hash_value", "_repr_value", "children")
+ _hash_value: int
+ _repr_value: str
+ children: tuple[T, ...]
+ _non_child: ClassVar[tuple[str, ...]] = ()
+
+ def _ctor_arguments(self, children: Sequence[T]) -> Sequence[Any | T]:
+ return (*(getattr(self, attr) for attr in self._non_child), *children)
+
+ def reconstruct(
+ self, children: Sequence[T]
+ ) -> Self: # pragma: no cover; not yet used
+ """
+ Rebuild this node with new children.
+
+ Parameters
+ ----------
+ children
+ New children
+
+ Returns
+ -------
+ New node with new children. Non-child data is shared with the input.
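+
+ For example, ``node.reconstruct(list(node.children))`` produces
+ a copy equal to ``node`` (illustrative usage).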
+        """
+        return type(self)(*self._ctor_arguments(children))
+
+    def get_hashable(self) -> Hashable:
+        """
+        Return a hashable object for the node.
+
+        Returns
+        -------
+        Hashable object.
+
+        Notes
+        -----
+        This method is used by the :meth:`__hash__` implementation
+        (which does caching). If your node type needs special-case
+        handling for some of its attributes, override this method, not
+        :meth:`__hash__`.
+        """
+        return (type(self), self._ctor_arguments(self.children))
+
+    def __hash__(self) -> int:
+        """
+        Hash of a node with caching.
+
+        See Also
+        --------
+        get_hashable
+        """
+        try:
+            return self._hash_value
+        except AttributeError:
+            self._hash_value = hash(self.get_hashable())
+            return self._hash_value
+
+    def is_equal(self, other: Self) -> bool:
+        """
+        Equality of two nodes of equal type.
+
+        Override this in subclasses, rather than :meth:`__eq__`.
+
+        Parameters
+        ----------
+        other
+            Object of the same type to compare to.
+
+        Notes
+        -----
+        Since nodes are immutable, this does common subexpression
+        elimination when two nodes are determined to be equal.
+
+        :meth:`__eq__` handles the case where the objects being
+        compared are not of the same type, so in this method, we only
+        need to implement equality of equal types.
+
+        Returns
+        -------
+        True if the two nodes are equal, False otherwise.
+        """
+        if self is other:
+            return True
+        result = self._ctor_arguments(self.children) == other._ctor_arguments(
+            other.children
+        )
+        # Eager CSE for nodes that match.
+        if result:
+            self.children = other.children
+        return result
+
+    def __eq__(self, other: Any) -> bool:
+        """
+        Equality of nodes.
+
+        See Also
+        --------
+        is_equal
+        """
+        if type(self) is not type(other) or hash(self) != hash(other):
+            return False
+        else:
+            return self.is_equal(other)
+
+    def __ne__(self, other: Any) -> bool:
+        """Inequality of nodes."""
+        return not self.__eq__(other)
+
+    def __repr__(self) -> str:
+        """String representation of a node with caching."""
+        try:
+            return self._repr_value
+        except AttributeError:
+            args = ", ".join(f"{arg!r}" for arg in self._ctor_arguments(self.children))
+            self._repr_value = f"{type(self).__name__}({args})"
+            return self._repr_value
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index a0291037f01..c28f2c2651a 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -5,10 +5,11 @@
 
 from __future__ import annotations
 
+import functools
 import json
 from contextlib import AbstractContextManager, nullcontext
 from functools import singledispatch
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import pyarrow as pa
 import pylibcudf as plc
@@ -19,8 +20,12 @@
 from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
 from cudf_polars.dsl import expr, ir
+from cudf_polars.dsl.traversal import make_recursive, reuse_if_unchanged
 from cudf_polars.typing import NodeTraverser
-from cudf_polars.utils import dtypes
+from cudf_polars.utils import dtypes, sorting
+
+if TYPE_CHECKING:
+    from cudf_polars.typing import ExprTransformer
 
 __all__ = ["translate_ir", "translate_named_expr"]
 
@@ -148,7 +153,7 @@ def _(
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
         exprs = [translate_named_expr(visitor, n=e) for e in node.expr]
-    return ir.Select(schema, inp, exprs, node.should_broadcast)
+    return ir.Select(schema, exprs, node.should_broadcast, inp)
 
 
 @_translate_ir.register
@@ -161,11 +166,11 @@ def _(
     keys = 
[translate_named_expr(visitor, n=e) for e in node.keys] return ir.GroupBy( schema, - inp, - aggs, keys, + aggs, node.maintain_order, node.options, + inp, ) @@ -182,7 +187,71 @@ def _( with set_node(visitor, node.input_right): inp_right = translate_ir(visitor, n=None) right_on = [translate_named_expr(visitor, n=e) for e in node.right_on] - return ir.Join(schema, inp_left, inp_right, left_on, right_on, node.options) + if (how := node.options[0]) in { + "inner", + "left", + "right", + "full", + "cross", + "semi", + "anti", + }: + return ir.Join(schema, left_on, right_on, node.options, inp_left, inp_right) + else: + how, op1, op2 = how + if how != "ie_join": + raise NotImplementedError( + f"Unsupported join type {how}" + ) # pragma: no cover; asof joins not yet exposed + # No exposure of mixed/conditional joins in pylibcudf yet, so in + # the first instance, implement by doing a cross join followed by + # a filter. + _, join_nulls, zlice, suffix, coalesce = node.options + cross = ir.Join( + schema, + [], + [], + ("cross", join_nulls, None, suffix, coalesce), + inp_left, + inp_right, + ) + dtype = plc.DataType(plc.TypeId.BOOL8) + if op2 is None: + ops = [op1] + else: + ops = [op1, op2] + suffix = cross.options[3] + + # Column references in the right table refer to the post-join + # names, so with suffixes. + def _rename(e: expr.Expr, rec: ExprTransformer) -> expr.Expr: + if isinstance(e, expr.Col) and e.name in inp_left.schema: + return type(e)(e.dtype, f"{e.name}{suffix}") + return reuse_if_unchanged(e, rec) + + mapper = make_recursive(_rename) + right_on = [ + expr.NamedExpr( + f"{old.name}{suffix}" if old.name in inp_left.schema else old.name, new + ) + for new, old in zip( + (mapper(e.value) for e in right_on), right_on, strict=True + ) + ] + mask = functools.reduce( + functools.partial( + expr.BinOp, dtype, plc.binaryop.BinaryOperator.LOGICAL_AND + ), + ( + expr.BinOp(dtype, expr.BinOp._MAPPING[op], left.value, right.value) + for op, left, right in zip(ops, left_on, right_on, strict=True) + ), + ) + filtered = ir.Filter(schema, expr.NamedExpr("mask", mask), cross) + if zlice is not None: + offset, length = zlice + return ir.Slice(schema, offset, length, filtered) + return filtered @_translate_ir.register @@ -192,7 +261,7 @@ def _( with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) exprs = [translate_named_expr(visitor, n=e) for e in node.exprs] - return ir.HStack(schema, inp, exprs, node.should_broadcast) + return ir.HStack(schema, exprs, node.should_broadcast, inp) @_translate_ir.register @@ -202,17 +271,23 @@ def _( with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) exprs = [translate_named_expr(visitor, n=e) for e in node.expr] - return ir.Reduce(schema, inp, exprs) + return ir.Reduce(schema, exprs, inp) @_translate_ir.register def _( node: pl_ir.Distinct, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: + (keep, subset, maintain_order, zlice) = node.options + keep = ir.Distinct._KEEP_MAP[keep] + subset = frozenset(subset) if subset is not None else None return ir.Distinct( schema, + keep, + subset, + zlice, + maintain_order, translate_ir(visitor, n=node.input), - node.options, ) @@ -223,14 +298,18 @@ def _( with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) by = [translate_named_expr(visitor, n=e) for e in node.by_column] - return ir.Sort(schema, inp, by, node.sort_options, node.slice) + stable, nulls_last, descending = node.sort_options + order, null_order = sorting.sort_order( + descending, 
nulls_last=nulls_last, num_keys=len(by) + ) + return ir.Sort(schema, by, order, null_order, stable, node.slice, inp) @_translate_ir.register def _( node: pl_ir.Slice, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.Slice(schema, translate_ir(visitor, n=node.input), node.offset, node.len) + return ir.Slice(schema, node.offset, node.len, translate_ir(visitor, n=node.input)) @_translate_ir.register @@ -240,7 +319,7 @@ def _( with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) mask = translate_named_expr(visitor, n=node.predicate) - return ir.Filter(schema, inp, mask) + return ir.Filter(schema, mask, inp) @_translate_ir.register @@ -259,10 +338,10 @@ def _( name, *options = node.function return ir.MapFunction( schema, - # TODO: merge_sorted breaks this pattern - translate_ir(visitor, n=node.input), name, options, + # TODO: merge_sorted breaks this pattern + translate_ir(visitor, n=node.input), ) @@ -271,7 +350,7 @@ def _( node: pl_ir.Union, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: return ir.Union( - schema, [translate_ir(visitor, n=n) for n in node.inputs], node.options + schema, node.options, *(translate_ir(visitor, n=n) for n in node.inputs) ) @@ -279,7 +358,7 @@ def _( def _( node: pl_ir.HConcat, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.HConcat(schema, [translate_ir(visitor, n=n) for n in node.inputs]) + return ir.HConcat(schema, *(translate_ir(visitor, n=n) for n in node.inputs)) def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR: @@ -309,8 +388,7 @@ def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR: # IR is versioned with major.minor, minor is bumped for backwards # compatible changes (e.g. adding new nodes), major is bumped for # incompatible changes (e.g. renaming nodes). - # Polars 1.7 changes definition of the CSV reader options schema name. - if (version := visitor.version()) >= (3, 0): + if (version := visitor.version()) >= (4, 0): raise NotImplementedError( f"No support for polars IR {version=}" ) # pragma: no cover; no such version for now. diff --git a/python/cudf_polars/cudf_polars/dsl/traversal.py b/python/cudf_polars/cudf_polars/dsl/traversal.py new file mode 100644 index 00000000000..be8338cb9a9 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/traversal.py @@ -0,0 +1,175 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Traversal and visitor utilities for nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Generic + +from cudf_polars.typing import U_contra, V_co + +if TYPE_CHECKING: + from collections.abc import Callable, Generator, Mapping, MutableMapping + + from cudf_polars.typing import GenericTransformer, NodeT + + +__all__: list[str] = [ + "traversal", + "reuse_if_unchanged", + "make_recursive", + "CachingVisitor", +] + + +def traversal(node: NodeT) -> Generator[NodeT, None, None]: + """ + Pre-order traversal of nodes in an expression. + + Parameters + ---------- + node + Root of expression to traverse. + + Yields + ------ + Unique nodes in the expression, parent before child, children + in-order from left to right. 
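+
+    Notes
+    -----
+    Each node is yielded at most once: the traversal remembers nodes it
+    has already seen, so shared children in a DAG are visited on first
+    encounter only.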
+    """
+    seen = {node}
+    lifo = [node]
+
+    while lifo:
+        node = lifo.pop()
+        yield node
+        for child in reversed(node.children):
+            if child not in seen:
+                seen.add(child)
+                lifo.append(child)
+
+
+def reuse_if_unchanged(node: NodeT, fn: GenericTransformer[NodeT, NodeT]) -> NodeT:
+    """
+    Recipe for transforming nodes that returns the old object if unchanged.
+
+    Parameters
+    ----------
+    node
+        Node to recurse on.
+    fn
+        Function to transform children.
+
+    Notes
+    -----
+    This can be used as a generic "base case" handler when
+    writing transforms that take nodes and produce new nodes.
+
+    Returns
+    -------
+    The existing node if the transformed children are unchanged,
+    otherwise a reconstructed node with the new children.
+    """
+    new_children = [fn(c) for c in node.children]
+    if all(new == old for new, old in zip(new_children, node.children, strict=True)):
+        return node
+    return node.reconstruct(new_children)
+
+
+def make_recursive(
+    fn: Callable[[U_contra, GenericTransformer[U_contra, V_co]], V_co],
+    *,
+    state: Mapping[str, Any] | None = None,
+) -> GenericTransformer[U_contra, V_co]:
+    """
+    No-op wrapper for recursive visitors.
+
+    Facilitates using visitors that don't need caching but are written
+    in the same style.
+
+    Parameters
+    ----------
+    fn
+        Function to transform inputs to outputs. Should take as its
+        second argument a callable from input to output.
+    state
+        Arbitrary *immutable* state that should be accessible to the
+        visitor through the `state` property.
+
+    Notes
+    -----
+    All transformation functions *must* be free of side-effects.
+
+    Usually, prefer a :class:`CachingVisitor`, but if we know that we
+    don't need caching in a transformation, then this no-op approach
+    is slightly cheaper.
+
+    Returns
+    -------
+    Recursive function without caching.
+
+    See Also
+    --------
+    CachingVisitor
+    """
+
+    def rec(node: U_contra) -> V_co:
+        return fn(node, rec)  # type: ignore[arg-type]
+
+    rec.state = state if state is not None else {}  # type: ignore[attr-defined]
+    return rec  # type: ignore[return-value]
+
+
+class CachingVisitor(Generic[U_contra, V_co]):
+    """
+    Caching wrapper for recursive visitors.
+
+    Facilitates writing visitors where already computed results should
+    be cached and reused. The cache is managed automatically, and is
+    tied to the lifetime of the wrapper.
+
+    Parameters
+    ----------
+    fn
+        Function to transform inputs to outputs. Should take as its
+        second argument the recursive cache manager.
+    state
+        Arbitrary *immutable* state that should be accessible to the
+        visitor through the `state` property.
+
+    Notes
+    -----
+    All transformation functions *must* be free of side-effects.
+
+    Returns
+    -------
+    Recursive function with caching.
+    """
+
+    def __init__(
+        self,
+        fn: Callable[[U_contra, GenericTransformer[U_contra, V_co]], V_co],
+        *,
+        state: Mapping[str, Any] | None = None,
+    ) -> None:
+        self.fn = fn
+        self.cache: MutableMapping[U_contra, V_co] = {}
+        self.state = state if state is not None else {}
+
+    def __call__(self, value: U_contra) -> V_co:
+        """
+        Apply the function to a value.
+
+        Parameters
+        ----------
+        value
+            The value to transform.
+
+        Returns
+        -------
+        A transformed value.
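+
+        Notes
+        -----
+        Results are memoised: if `value` has been transformed before,
+        the cached result is returned and the wrapped function is not
+        called again.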
+ """ + try: + return self.cache[value] + except KeyError: + return self.cache.setdefault(value, self.fn(value, self)) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 05b76d76808..a3607159e01 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -53,12 +53,34 @@ def pytest_configure(config: pytest.Config): "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match", "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match", "tests/unit/io/test_lazy_parquet.py::test_parquet_different_schema[False]": "Needs cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-columns]": "Correctly raises but different error", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-row_groups]": "Correctly raises but different error", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-prefiltered]": "Correctly raises but different error", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-none]": "Correctly raises but different error", "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_mismatch_panic_17067[False]": "Needs cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[False-False]": "Needs some variant of cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[True-False]": "Needs some variant of cudf#16394", "tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?", "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394", "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_dtype_mismatch[False]": "Different exception raised, but correctly raises an exception", "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_missing_cols_from_first[False]": "Different exception raised, but correctly raises an exception", "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-columns]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-columns]": "Mismatching column read cudf#16394", + 
"tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-columns]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-columns]": "Mismatching column read cudf#16394", "tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match", "tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match", "tests/unit/io/test_scan.py::test_scan_with_filter[single-csv-async]": "Debug output on stderr doesn't match", @@ -107,6 +129,14 @@ def pytest_configure(config: pytest.Config): "tests/unit/operations/aggregation/test_aggregations.py::test_sum_empty_and_null_set": "libcudf sums column of all nulls to null, not zero", "tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list", "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func0-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func1-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func2-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func3-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func0-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func1-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func2-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func3-none]": "cudf-polars doesn't nullify division by zero", "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg 
for a particular dtype", @@ -124,13 +154,6 @@ def pytest_configure(config: pytest.Config): "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg", "tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg", "tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_by_monday_and_offset_5444": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[left-expected0]": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[right-expected1]": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[datapoint-expected2]": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_rolling_dynamic_sortedness_check": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_validation": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_15225": "IR needs to expose groupby-dynamic information", "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins", "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU", "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU", @@ -140,6 +163,7 @@ def pytest_configure(config: pytest.Config): "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics": "Debug output on stderr doesn't match", "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match", "tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852", + "tests/unit/test_errors.py::test_error_on_empty_group_by": "Incorrect exception raised", # Maybe flaky, order-dependent? 
"tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py index 240b11bdf59..a27a3395c35 100644 --- a/python/cudf_polars/cudf_polars/typing/__init__.py +++ b/python/cudf_polars/cudf_polars/typing/__init__.py @@ -5,8 +5,8 @@ from __future__ import annotations -from collections.abc import Mapping -from typing import TYPE_CHECKING, Literal, Protocol, Union +from collections.abc import Hashable, Mapping +from typing import TYPE_CHECKING, Any, Literal, Protocol, TypeVar, Union import pylibcudf as plc @@ -18,7 +18,19 @@ import polars as pl -IR: TypeAlias = Union[ + from cudf_polars.dsl import expr, ir, nodebase + +__all__: list[str] = [ + "PolarsIR", + "PolarsExpr", + "NodeTraverser", + "OptimizationArgs", + "GenericTransformer", + "ExprTransformer", + "IRTransformer", +] + +PolarsIR: TypeAlias = Union[ pl_ir.PythonScan, pl_ir.Scan, pl_ir.Cache, @@ -38,7 +50,7 @@ pl_ir.ExtContext, ] -Expr: TypeAlias = Union[ +PolarsExpr: TypeAlias = Union[ pl_expr.Function, pl_expr.Window, pl_expr.Literal, @@ -68,7 +80,7 @@ def set_node(self, n: int) -> None: """Set the current plan node to n.""" ... - def view_current_node(self) -> IR: + def view_current_node(self) -> PolarsIR: """Convert current plan node to python rep.""" ... @@ -80,7 +92,7 @@ def get_dtype(self, n: int) -> pl.DataType: """Get the datatype of the given expression id.""" ... - def view_expression(self, n: int) -> Expr: + def view_expression(self, n: int) -> PolarsExpr: """Convert the given expression to python rep.""" ... @@ -107,3 +119,29 @@ def set_udf( "cluster_with_columns", "no_optimization", ] + + +U_contra = TypeVar("U_contra", bound=Hashable, contravariant=True) +V_co = TypeVar("V_co", covariant=True) +NodeT = TypeVar("NodeT", bound="nodebase.Node[Any]") + + +class GenericTransformer(Protocol[U_contra, V_co]): + """Abstract protocol for recursive visitors.""" + + def __call__(self, __value: U_contra) -> V_co: + """Apply the visitor to the node.""" + ... + + @property + def state(self) -> Mapping[str, Any]: + """Arbitrary immutable state.""" + ... + + +# Quotes to avoid circular import +ExprTransformer: TypeAlias = GenericTransformer["expr.Expr", "expr.Expr"] +"""Protocol for transformation of Expr nodes.""" + +IRTransformer: TypeAlias = GenericTransformer["ir.IR", "ir.IR"] +"""Protocol for transformation of IR nodes.""" diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index bff44af1468..74b2cd4e5de 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -11,14 +11,17 @@ You will need: environment](https://github.com/rapidsai/cudf/blob/branch-24.12/CONTRIBUTING.md#setting-up-your-build-environment). The combined devcontainer works, or whatever your favourite approach is. -> ![NOTE] These instructions will get simpler as we merge code in. +:::{note} +These instructions will get simpler as we merge code in. +::: ## Installing polars -`cudf-polars` works with polars >= 1.3, as long as the internal IR -version doesn't get a major version bump. So `pip install polars>=1.3` -should work. 
For development, if we're adding things to the polars
-side of things, we will need to build polars from source:
+The `cudf-polars` `pyproject.toml` advertises which polars versions it
+works with. So for pure `cudf-polars` development, installing as
+normal and satisfying the dependencies in the repository is
+sufficient. For development, if we're adding things to the polars side
+of things, we will need to build polars from source:
 
 ```sh
 git clone https://github.com/pola-rs/polars
@@ -36,7 +39,9 @@ pip install --upgrade uv
 uv pip install --upgrade -r py-polars/requirements-dev.txt
 ```
 
-> ![NOTE] plain `pip install` works fine, but `uv` is _much_ faster!
+:::{note}
+Plain `pip install` works fine, but `uv` is _much_ faster!
+:::
 
 Now we have the necessary machinery to build polars:
 ```sh
@@ -83,7 +88,7 @@ representation (IR). Second, an execution phase which executes
 using our IR.
 
 The translation phase receives a low-level Rust `NodeTraverser`
-object which delivers Python representations of the plan nodes (and
+object that delivers Python representations of the plan nodes (and
 expressions) one at a time. During translation, we endeavour to raise
 `NotImplementedError` for any unsupported functionality. This way, if
 we can't execute something, we just don't modify the logical plan at
@@ -126,7 +131,6 @@ arguments, at the moment, `raise_on_fail` is also supported, which
 raises, rather than falling back, during translation:
 
 ```python
-
 result = q.collect(engine=pl.GPUEngine(raise_on_fail=True))
 ```
 
@@ -144,13 +148,73 @@ changes. We can therefore attempt to detect the IR version
 appropriately. This should be done during IR translation in
 `translate.py`.
 
-## Adding a handler for a new plan node
+# IR design
+
+As noted, we translate the polars DSL into our own IR. This is both so
+that we can smooth out minor version differences (advertised by
+`NodeTraverser` version changes) within `cudf-polars`, and so that we
+have the freedom to introduce new IR nodes and rewrite rules as might
+be appropriate for GPU execution.
+
+To that end, we provide facilities for definition of nodes as well as
+writing traversals and rewrite rules. The abstract base class `Node`
+in `dsl/nodebase.py` defines the interface for implementing new nodes,
+and provides many useful default methods. See also the docstrings of
+the `Node` class.
+
+:::{note}
+This generic implementation relies on nodes being treated as
+*immutable*. Do not implement in-place modification of nodes; bad
+things will happen.
+:::
+
+## Defining nodes
+
+A concrete node type (`cudf-polars` has expression nodes, `Expr`;
+and plan nodes, `IR`) should inherit from `Node`. Nodes have
+two types of data:
+
+1. `children`: a tuple (possibly empty) of concrete nodes;
+2. non-child: arbitrary data attached to the node that is _not_ a
+   concrete node.
+
+The base `Node` class requires that one advertise the names of the
+non-child attributes in the `_non_child` class variable. The
+constructor of the concrete node should take its arguments in the
+order `*_non_child` (ordered as the class variable does) and then
+`*children`. For example, the `Sort` node, which sorts a column
+generated by an expression, has this definition:
+
+```python
+class Expr(Node):
+    children: tuple[Expr, ...]
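+
+# `Sort` stores the expression producing the column to sort as its
+# only child; `dtype` and `options` are non-child data, named in
+# `_non_child`.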
+
+class Sort(Expr):
+    _non_child = ("dtype", "options")
+    children: tuple[Expr]
+    def __init__(self, dtype, options, column: Expr):
+        self.dtype = dtype
+        self.options = options
+        self.children = (column,)
+```
+
+By following this pattern, we get an automatic (caching)
+implementation of `__hash__` and `__eq__`, as well as a useful
+`reconstruct` method that will rebuild the node with new children.
+
+If you want to control the behaviour of `__hash__` and `__eq__` for a
+single node, override (respectively) the `get_hashable` and `is_equal`
+methods.
+
+## Adding new translation rules from the polars IR
+
+### Plan nodes
 
-Plan node definitions live in `cudf_polars/dsl/ir.py`, these are
-`dataclasses` that inherit from the base `IR` node. The evaluation of
-a plan node is done by implementing the `evaluate` method.
+Plan node definitions live in `cudf_polars/dsl/ir.py`; these all
+inherit from the base `IR` node. The evaluation of a plan node is done
+by implementing the `evaluate` method.
 
-To translate the plan node, add a case handler in `translate_ir` which
+To translate the plan node, add a case handler in `translate_ir` that
 lives in `cudf_polars/dsl/translate.py`.
 
 As well as child nodes that are plans, most plan nodes contain child
@@ -163,25 +227,12 @@ translating a `Join` node, the left keys (expressions) should be
 translated with the left input active (and right keys with right
 input). To facilitate this, use the `set_node` context manager.
 
-## Adding a handler for a new expression node
+### Expression nodes
 
 Adding a handler for an expression node is very similar to adding one
 for a plan node.
-Expressions are all defined in `cudf_polars/dsl/expr.py` and inherit
-from `Expr`. Unlike plan nodes, these are not `dataclasses`, since it
-is simpler for us to implement efficient hashing, repr, and equality if we
-can write that ourselves.
-
-Every expression consists of two types of data:
-1. child data (other `Expr`s)
-2. non-child data (anything other than an `Expr`)
-The generic implementations of special methods in the base `Expr` base
-class require that the subclasses advertise which arguments to the
-constructor are non-child in a `_non_child` class slot. The
-constructor should then take arguments:
-```python
-def __init__(self, *non_child_data: Any, *children: Expr):
-```
-Read the docstrings in the `Expr` class for more details.
+Expressions are defined in `cudf_polars/dsl/expressions/` and exported
+into the `dsl` namespace via `expr.py`. They inherit
+from `Expr`.
 
 Expressions are evaluated by implementing a `do_evaluate` method that
 takes a `DataFrame` as context (this provides columns) along with an
@@ -198,24 +249,142 @@ To simplify state tracking, all columns should be considered immutable
 on construction. This matches the "functional" description coming from
 the logical plan in any case, so is reasonably natural.
 
+## Traversing and transforming nodes
+
+In addition to representing and evaluating nodes, we also provide
+facilities for traversing a tree of nodes and defining transformation
+rules in `dsl/traversal.py`. The simplest is `traversal`, a
+[pre-order](https://en.wikipedia.org/wiki/Tree_traversal) visit of all
+unique nodes in an expression. Use this if you want to know some
+specific thing about an expression.
+For example, to determine if an
+expression contains a `Literal` node:
+
+```python
+def has_literal(node: Expr) -> bool:
+    return any(isinstance(e, Literal) for e in traversal(node))
+```
+
+It is often convenient to provide (immutable) state to a visitor, as
+well as some facility to perform DAG-aware rewrites (reusing a
+transformation for an expression if we have already seen it). We
+therefore adopt the following pattern of writing DAG-aware visitors.
+Suppose we want a rewrite rule (`rewrite`) between expressions
+(`Expr`) and some new type `T`. We define our general transformation
+function `rewrite` with type `Expr -> (Expr -> T) -> T`:
+
+```python
+from cudf_polars.typing import GenericTransformer
+
+@singledispatch
+def rewrite(e: Expr, rec: GenericTransformer[Expr, T]) -> T:
+    ...
+```
+
+Note in particular that the function to perform the recursion is
+passed as the second argument. Rather than defining methods on each
+node in turn for a particular rewrite rule, we prefer free functions
+and use `functools.singledispatch` to provide dispatching. We now, in
+the usual fashion, register handlers for different expression types.
+To use this function, we need to be able to provide both the
+expression to convert and the recursive function itself. To do this we
+must convert our `rewrite` function into something that only takes a
+single argument (the expression to rewrite), but carries around
+information about how to perform the recursion. To this end, we have
+two utilities in `traversal.py`:
+
+- `make_recursive` and
+- `CachingVisitor`.
+
+These both implement the `GenericTransformer` protocol, and can be
+wrapped around a transformation function like `rewrite` to provide a
+function `Expr -> T`. They also allow us to attach arbitrary
+*immutable* state to our visitor by passing a `state` dictionary. This
+dictionary can then be inspected by the concrete transformation
+function. `make_recursive` is very simple, and provides no caching of
+intermediate results (so any DAGs that are visited will be viewed as
+trees). `CachingVisitor` provides the same interface, but maintains a
+cache of intermediate results, and reuses them if the same expression
+is seen again.
+
+Finally, for writing transformations that take nodes and deliver new
+nodes (e.g. rewrite rules), we have a further utility
+`reuse_if_unchanged` that can be used as a base case transformation
+for node-to-node rewrites. It is a depth-first visit that transforms
+children but only returns a new node with new children if the rewrite
+of children returned new nodes.
+
+To see how these pieces fit together, let us consider writing a
+`rename` function that takes an expression (potentially with
+references to columns) along with a mapping defining a renaming
+between (some subset of) column names. The goal is to deliver a new
+expression with appropriate columns renamed.
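+
+Concretely, the behaviour we are after is sketched below. (A sketch
+only: `Col(dtype, name)` and `BinOp(dtype, op, left, right)` are the
+expression constructors, `plc` is `pylibcudf`, and `dt` stands for
+some `plc.DataType`.)
+
+```python
+e = BinOp(dt, plc.binaryop.BinaryOperator.ADD, Col(dt, "a"), Col(dt, "b"))
+renamed = rename(e, {"a": "c"})  # rename is defined below
+# renamed references Col(dt, "c") wherever e referenced Col(dt, "a"),
+# while the untouched Col(dt, "b") subtree is reused, not rebuilt.
+```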
+
+To start, we define the dispatch function:
+```python
+from collections.abc import Mapping
+from functools import singledispatch
+from cudf_polars.dsl.traversal import (
+    CachingVisitor, make_recursive, reuse_if_unchanged
+)
+from cudf_polars.dsl.expr import Col, Expr
+from cudf_polars.typing import ExprTransformer
+
+
+@singledispatch
+def _rename(e: Expr, rec: ExprTransformer) -> Expr:
+    raise NotImplementedError(f"No handler for {type(e)}")
+```
+Then we register specific handlers, first for columns:
+```python
+@_rename.register
+def _(e: Col, rec: ExprTransformer) -> Expr:
+    mapping = rec.state["mapping"]  # state set on rec
+    if e.name in mapping:
+        # If we have a rename, return a new Col reference
+        # with a new name
+        return type(e)(e.dtype, mapping[e.name])
+    return e
+```
+and then for the remaining expressions:
+```python
+_rename.register(Expr)(reuse_if_unchanged)
+```
+
+:::{note}
+In this case, we could have put the generic handler in the `_rename`
+function; however, then we would not get a nice error message if we
+accidentally sent in an object of the incorrect type.
+:::
+
+Finally, we tie everything together with a public function:
+
+```python
+def rename(e: Expr, mapping: Mapping[str, str]) -> Expr:
+    """Rename column references in an expression."""
+    mapper = CachingVisitor(_rename, state={"mapping": mapping})
+    # or
+    # mapper = make_recursive(_rename, state={"mapping": mapping})
+    return mapper(e)
+```
+
 # Containers
 
 Containers should be constructed as relatively lightweight objects
-around their pylibcudf counterparts. We have four (in
+around their pylibcudf counterparts. We have three (in
 `cudf_polars/containers/`):
 
 1. `Scalar` (a wrapper around a pylibcudf `Scalar`)
 2. `Column` (a wrapper around a pylibcudf `Column`)
-3. `NamedColumn` (a `Column` with an additional name)
-4. `DataFrame` (a wrapper around a pylibcudf `Table`)
+3. `DataFrame` (a wrapper around a pylibcudf `Table`)
 
 The interfaces offered by these are somewhat in flux, but broadly
-speaking, a `DataFrame` is just a list of `NamedColumn`s which each
-hold a `Column` plus a string `name`. `NamedColumn`s are only ever
-constructed via `NamedExpr`s, which are the top-level expression node
-that lives inside an `IR` node. This means that the expression
-evaluator never has to concern itself with column names: columns are
-only ever decorated with names when constructing a `DataFrame`.
+speaking, a `DataFrame` is just a mapping from string `name`s to
+`Column`s, and thus also holds a pylibcudf `Table`. Names are only
+attached to `Column`s, and hence inserted into `DataFrame`s, via
+`NamedExpr`s, which are the top-level expression nodes that live
+inside an `IR` node. This means that the expression evaluator never
+has to concern itself with column names: columns are only ever
+decorated with names when constructing a `DataFrame`.
 
 The columns keep track of metadata (for example, whether or not they
 are sorted). We could imagine tracking more metadata, like minimum and
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index f55031e0826..2afdab1be4b 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -19,7 +19,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
-    "polars>=1.8,<1.9",
+    "polars>=1.11,<1.12",
     "pylibcudf==24.12.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
classifiers = [ @@ -50,12 +50,17 @@ license-files = ["LICENSE"] version = {file = "cudf_polars/VERSION"} [tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" +filterwarnings = [ + "error" +] xfail_strict = true [tool.coverage.report] exclude_also = [ "if TYPE_CHECKING:", - "class .*\\bProtocol\\):", + "class .*\\bProtocol(?:\\[[^]]+\\])?\\):", "assert_never\\(" ] # The cudf_polars test suite doesn't exercise the plugin, so we omit diff --git a/python/cudf_polars/tests/containers/test_column.py b/python/cudf_polars/tests/containers/test_column.py index 19919877f84..1f26ab1af9f 100644 --- a/python/cudf_polars/tests/containers/test_column.py +++ b/python/cudf_polars/tests/containers/test_column.py @@ -3,13 +3,11 @@ from __future__ import annotations -from functools import partial - import pyarrow import pylibcudf as plc import pytest -from cudf_polars.containers import Column, NamedColumn +from cudf_polars.containers import Column def test_non_scalar_access_raises(): @@ -55,11 +53,10 @@ def test_shallow_copy(): @pytest.mark.parametrize("typeid", [plc.TypeId.INT8, plc.TypeId.FLOAT32]) -@pytest.mark.parametrize("constructor", [Column, partial(NamedColumn, name="name")]) -def test_mask_nans(typeid, constructor): +def test_mask_nans(typeid): dtype = plc.DataType(typeid) values = pyarrow.array([0, 0, 0], type=plc.interop.to_arrow(dtype)) - column = constructor(plc.interop.from_arrow(values)) + column = Column(plc.interop.from_arrow(values)) masked = column.mask_nans() assert column.obj.null_count() == masked.obj.null_count() diff --git a/python/cudf_polars/tests/containers/test_dataframe.py b/python/cudf_polars/tests/containers/test_dataframe.py index 39fb44d55a5..5c68fb8f0aa 100644 --- a/python/cudf_polars/tests/containers/test_dataframe.py +++ b/python/cudf_polars/tests/containers/test_dataframe.py @@ -8,18 +8,18 @@ import polars as pl -from cudf_polars.containers import DataFrame, NamedColumn +from cudf_polars.containers import Column, DataFrame from cudf_polars.testing.asserts import assert_gpu_result_equal def test_select_missing_raises(): df = DataFrame( [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID ), - "a", + name="a", ) ] ) @@ -30,17 +30,17 @@ def test_select_missing_raises(): def test_replace_missing_raises(): df = DataFrame( [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID ), - "a", + name="a", ) ] ) - replacement = df.columns[0].copy(new_name="b") + replacement = df.column_map["a"].copy().rename("b") with pytest.raises(ValueError): - df.replace_columns(replacement) + df.with_columns([replacement], replace_only=True) def test_from_table_wrong_names(): @@ -55,14 +55,23 @@ def test_from_table_wrong_names(): DataFrame.from_table(table, ["a", "b"]) +def test_unnamed_column_raise(): + payload = plc.column_factories.make_numeric_column( + plc.DataType(plc.TypeId.INT8), 0, plc.MaskState.ALL_VALID + ) + + with pytest.raises(ValueError): + DataFrame([Column(payload, name="a"), Column(payload)]) + + def test_sorted_like_raises_mismatching_names(): df = DataFrame( [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID ), - "a", + name="a", ) ] ) @@ -72,11 +81,11 @@ def test_sorted_like_raises_mismatching_names(): def test_shallow_copy(): - column = NamedColumn( + column = Column( 
plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID ), - "a", + name="a", ) column.set_sorted( is_sorted=plc.types.Sorted.YES, @@ -85,13 +94,13 @@ def test_shallow_copy(): ) df = DataFrame([column]) copy = df.copy() - copy.columns[0].set_sorted( + copy.column_map["a"].set_sorted( is_sorted=plc.types.Sorted.NO, order=plc.types.Order.ASCENDING, null_order=plc.types.NullOrder.AFTER, ) - assert df.columns[0].is_sorted == plc.types.Sorted.YES - assert copy.columns[0].is_sorted == plc.types.Sorted.NO + assert df.column_map["a"].is_sorted == plc.types.Sorted.YES + assert copy.column_map["a"].is_sorted == plc.types.Sorted.NO def test_sorted_flags_preserved_empty(): @@ -100,7 +109,7 @@ def test_sorted_flags_preserved_empty(): gf = DataFrame.from_polars(df) - (a,) = gf.columns + a = gf.column_map["a"] assert a.is_sorted == plc.types.Sorted.YES diff --git a/python/cudf_polars/tests/dsl/test_expr.py b/python/cudf_polars/tests/dsl/test_expr.py index b7d4672daca..84e33262869 100644 --- a/python/cudf_polars/tests/dsl/test_expr.py +++ b/python/cudf_polars/tests/dsl/test_expr.py @@ -73,3 +73,24 @@ def test_namedexpr_repr_stable(): b2 = expr.NamedExpr("b1", expr.Col(plc.DataType(plc.TypeId.INT8), "a")) assert repr(b1) == repr(b2) + + +def test_equality_cse(): + dt = plc.DataType(plc.TypeId.INT8) + + def make_expr(n1, n2): + a = expr.Col(plc.DataType(plc.TypeId.INT8), n1) + b = expr.Col(plc.DataType(plc.TypeId.INT8), n2) + + return expr.BinOp(dt, plc.binaryop.BinaryOperator.ADD, a, b) + + e1 = make_expr("a", "b") + e2 = make_expr("a", "b") + e3 = make_expr("a", "c") + + assert e1.children is not e2.children + assert e1 == e2 + assert e1.children is e2.children + assert e1 == e2 + assert e1 != e3 + assert e2 != e3 diff --git a/python/cudf_polars/tests/dsl/test_traversal.py b/python/cudf_polars/tests/dsl/test_traversal.py new file mode 100644 index 00000000000..6505a786855 --- /dev/null +++ b/python/cudf_polars/tests/dsl/test_traversal.py @@ -0,0 +1,229 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from functools import singledispatch + +import pylibcudf as plc + +import polars as pl +from polars.testing import assert_frame_equal + +from cudf_polars import translate_ir +from cudf_polars.dsl import expr, ir +from cudf_polars.dsl.traversal import ( + CachingVisitor, + make_recursive, + reuse_if_unchanged, + traversal, +) +from cudf_polars.typing import ExprTransformer, IRTransformer + + +def make_expr(dt, n1, n2): + a1 = expr.Col(dt, n1) + a2 = expr.Col(dt, n2) + + return expr.BinOp(dt, plc.binaryop.BinaryOperator.MUL, a1, a2) + + +def test_traversal_unique(): + dt = plc.DataType(plc.TypeId.INT8) + + e1 = make_expr(dt, "a", "a") + unique_exprs = list(traversal(e1)) + + assert len(unique_exprs) == 2 + assert set(unique_exprs) == {expr.Col(dt, "a"), e1} + assert unique_exprs == [e1, expr.Col(dt, "a")] + + e2 = make_expr(dt, "a", "b") + unique_exprs = list(traversal(e2)) + + assert len(unique_exprs) == 3 + assert set(unique_exprs) == {expr.Col(dt, "a"), expr.Col(dt, "b"), e2} + assert unique_exprs == [e2, expr.Col(dt, "a"), expr.Col(dt, "b")] + + e3 = make_expr(dt, "b", "a") + unique_exprs = list(traversal(e3)) + + assert len(unique_exprs) == 3 + assert set(unique_exprs) == {expr.Col(dt, "a"), expr.Col(dt, "b"), e3} + assert unique_exprs == [e3, expr.Col(dt, "b"), expr.Col(dt, "a")] + + +def rename(e, rec): + mapping = rec.state["mapping"] + if isinstance(e, expr.Col) and e.name in mapping: + return type(e)(e.dtype, mapping[e.name]) + return reuse_if_unchanged(e, rec) + + +def test_caching_visitor(): + dt = plc.DataType(plc.TypeId.INT8) + + e1 = make_expr(dt, "a", "b") + + mapper = CachingVisitor(rename, state={"mapping": {"b": "c"}}) + + renamed = mapper(e1) + assert renamed == make_expr(dt, "a", "c") + assert len(mapper.cache) == 3 + + e2 = make_expr(dt, "a", "a") + mapper = CachingVisitor(rename, state={"mapping": {"b": "c"}}) + + renamed = mapper(e2) + assert renamed == make_expr(dt, "a", "a") + assert len(mapper.cache) == 2 + mapper = CachingVisitor(rename, state={"mapping": {"a": "c"}}) + + renamed = mapper(e2) + assert renamed == make_expr(dt, "c", "c") + assert len(mapper.cache) == 2 + + +def test_noop_visitor(): + dt = plc.DataType(plc.TypeId.INT8) + + e1 = make_expr(dt, "a", "b") + + mapper = make_recursive(rename, state={"mapping": {"b": "c"}}) + + renamed = mapper(e1) + assert renamed == make_expr(dt, "a", "c") + + e2 = make_expr(dt, "a", "a") + mapper = make_recursive(rename, state={"mapping": {"b": "c"}}) + + renamed = mapper(e2) + assert renamed == make_expr(dt, "a", "a") + mapper = make_recursive(rename, state={"mapping": {"a": "c"}}) + + renamed = mapper(e2) + assert renamed == make_expr(dt, "c", "c") + + +def test_rewrite_ir_node(): + df = pl.LazyFrame({"a": [1, 2, 1], "b": [1, 3, 4]}) + q = df.group_by("a").agg(pl.col("b").sum()).sort("b") + + orig = translate_ir(q._ldf.visit()) + + new_df = pl.DataFrame({"a": [1, 1, 2], "b": [-1, -2, -4]}) + + def replace_df(node, rec): + if isinstance(node, ir.DataFrameScan): + return ir.DataFrameScan( + node.schema, new_df._df, node.projection, node.predicate + ) + return reuse_if_unchanged(node, rec) + + mapper = CachingVisitor(replace_df) + + new = mapper(orig) + + result = new.evaluate(cache={}).to_polars() + + expect = pl.DataFrame({"a": [2, 1], "b": [-4, -3]}) + + assert_frame_equal(result, expect) + + +def test_rewrite_scan_node(tmp_path): + left = pl.LazyFrame({"a": [1, 2, 3], "b": [1, 3, 4]}) + right = pl.DataFrame({"a": [1, 4, 2], "c": [1, 2, 
3]}) + + right.write_parquet(tmp_path / "right.pq") + + right_s = pl.scan_parquet(tmp_path / "right.pq") + + q = left.join(right_s, on="a", how="inner") + + def replace_scan(node, rec): + if isinstance(node, ir.Scan): + return ir.DataFrameScan( + node.schema, right._df, node.with_columns, node.predicate + ) + return reuse_if_unchanged(node, rec) + + mapper = CachingVisitor(replace_scan) + + orig = translate_ir(q._ldf.visit()) + new = mapper(orig) + + result = new.evaluate(cache={}).to_polars() + + expect = q.collect() + + assert_frame_equal(result, expect, check_row_order=False) + + +def test_rewrite_names_and_ops(): + df = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5], "c": [5, 6, 7], "d": [7, 9, 8]}) + + q = df.select(pl.col("a") - (pl.col("b") + pl.col("c") * 2), pl.col("d")).sort("d") + + # We will replace a -> d, c -> d, and addition with multiplication + expect = ( + df.select( + (pl.col("d") - (pl.col("b") * pl.col("d") * 2)).alias("a"), pl.col("d") + ) + .sort("d") + .collect() + ) + + qir = translate_ir(q._ldf.visit()) + + @singledispatch + def _transform(e: expr.Expr, fn: ExprTransformer) -> expr.Expr: + raise NotImplementedError("Unhandled") + + @_transform.register + def _(e: expr.Col, fn: ExprTransformer): + mapping = fn.state["mapping"] + if e.name in mapping: + return type(e)(e.dtype, mapping[e.name]) + return e + + @_transform.register + def _(e: expr.BinOp, fn: ExprTransformer): + if e.op == plc.binaryop.BinaryOperator.ADD: + return type(e)( + e.dtype, plc.binaryop.BinaryOperator.MUL, *map(fn, e.children) + ) + return reuse_if_unchanged(e, fn) + + _transform.register(expr.Expr)(reuse_if_unchanged) + + @singledispatch + def _rewrite(node: ir.IR, fn: IRTransformer) -> ir.IR: + raise NotImplementedError("Unhandled") + + @_rewrite.register + def _(node: ir.Select, fn: IRTransformer): + expr_mapper = fn.state["expr_mapper"] + return type(node)( + node.schema, + [expr.NamedExpr(e.name, expr_mapper(e.value)) for e in node.exprs], + node.should_broadcast, + fn(node.children[0]), + ) + + _rewrite.register(ir.IR)(reuse_if_unchanged) + + rewriter = CachingVisitor( + _rewrite, + state={ + "expr_mapper": CachingVisitor( + _transform, state={"mapping": {"a": "d", "c": "d"}} + ) + }, + ) + + new_ir = rewriter(qir) + + got = new_ir.evaluate(cache={}).to_polars() + + assert_frame_equal(expect, got) diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 56055f4c6c2..86cb2352dcc 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -93,10 +93,10 @@ def test_bool_agg(agg, request): expr = getattr(pl.col("a"), agg)() q = df.select(expr) - assert_gpu_result_equal(q) + assert_gpu_result_equal(q, check_exact=False) -@pytest.mark.parametrize("cum_agg", expr.UnaryFunction._supported_cum_aggs) +@pytest.mark.parametrize("cum_agg", sorted(expr.UnaryFunction._supported_cum_aggs)) def test_cum_agg_reverse_unsupported(cum_agg): df = pl.LazyFrame({"a": [1, 2, 3]}) expr = getattr(pl.col("a"), cum_agg)(reverse=True) diff --git a/python/cudf_polars/tests/expressions/test_sort.py b/python/cudf_polars/tests/expressions/test_sort.py index 76c7648813a..2a37683478b 100644 --- a/python/cudf_polars/tests/expressions/test_sort.py +++ b/python/cudf_polars/tests/expressions/test_sort.py @@ -69,7 +69,7 @@ def test_setsorted(descending, nulls_last, with_nulls): df = translate_ir(q._ldf.visit()).evaluate(cache={}) - (a,) = df.columns + a = df.column_map["a"] assert a.is_sorted == 
plc.types.Sorted.YES null_order = ( diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py index 3c3986be19b..9900f598e5f 100644 --- a/python/cudf_polars/tests/test_config.py +++ b/python/cudf_polars/tests/test_config.py @@ -10,7 +10,7 @@ import rmm -from cudf_polars.dsl.ir import IR +from cudf_polars.dsl.ir import DataFrameScan from cudf_polars.testing.asserts import ( assert_gpu_result_equal, assert_ir_translation_raises, @@ -18,10 +18,10 @@ def test_polars_verbose_warns(monkeypatch): - def raise_unimplemented(self): + def raise_unimplemented(self, *args): raise NotImplementedError("We don't support this") - monkeypatch.setattr(IR, "__post_init__", raise_unimplemented) + monkeypatch.setattr(DataFrameScan, "__init__", raise_unimplemented) q = pl.LazyFrame({}) # Ensure that things raise assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 7d9ec98db97..501560d15b8 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -2,9 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +from contextlib import nullcontext + import pytest import polars as pl +from polars.testing import assert_frame_equal from cudf_polars.testing.asserts import ( assert_gpu_result_equal, @@ -22,6 +25,11 @@ def how(request): return request.param +@pytest.fixture(params=[None, (1, 5), (1, None), (0, 2), (0, None)]) +def zlice(request): + return request.param + + @pytest.fixture def left(): return pl.LazyFrame( @@ -37,8 +45,9 @@ def left(): def right(): return pl.LazyFrame( { - "a": [1, 4, 3, 7, None, None], - "c": [2, 3, 4, 5, 6, 7], + "a": [1, 4, 3, 7, None, None, 1], + "c": [2, 3, 4, 5, 6, 7, 8], + "d": [6, None, 7, 8, -1, 2, 4], } ) @@ -70,11 +79,31 @@ def test_coalesce_join(left, right, how, join_nulls, join_expr): query = left.join( right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=True ) - assert_gpu_result_equal(query, check_row_order=False) + assert_gpu_result_equal(query, check_row_order=how == "left") -def test_cross_join(left, right): +def test_left_join_with_slice(left, right, join_nulls, zlice): + q = left.join(right, on="a", how="left", join_nulls=join_nulls, coalesce=True) + ctx = nullcontext() + if zlice is not None: + q_expect = q.collect().slice(*zlice) + q = q.slice(*zlice) + if zlice == (1, 5) or zlice == (0, 2): + # https://github.com/pola-rs/polars/issues/19403 + # https://github.com/pola-rs/polars/issues/19405 + ctx = pytest.raises(AssertionError) + assert_frame_equal( + q_expect, q.collect(engine=pl.GPUEngine(raise_on_fail=True)) + ) + + with ctx: + assert_gpu_result_equal(q) + + +def test_cross_join(left, right, zlice): q = left.join(right, how="cross") + if zlice is not None: + q = q.slice(*zlice) assert_gpu_result_equal(q) @@ -86,3 +115,26 @@ def test_join_literal_key_unsupported(left, right, left_on, right_on): q = left.join(right, left_on=left_on, right_on=right_on, how="inner") assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize( + "conditions", + [ + [pl.col("a") < pl.col("a_right")], + [pl.col("a_right") <= pl.col("a") * 2], + [pl.col("b") * 2 > pl.col("a_right"), pl.col("a") == pl.col("c_right")], + [pl.col("b") * 2 <= pl.col("a_right"), pl.col("a") < pl.col("c_right")], + [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2], + ], +) +def test_join_where(left, right, conditions, zlice): + q = left.join_where(right, 
*conditions) + + assert_gpu_result_equal(q, check_row_order=False) + + if zlice is not None: + q_len = q.slice(*zlice).select(pl.len()) + # Can't compare result, since row order is not guaranteed and + # therefore we only check the length + + assert_gpu_result_equal(q_len) diff --git a/python/cudf_polars/tests/utils/test_broadcast.py b/python/cudf_polars/tests/utils/test_broadcast.py index 35aaef44e1f..e7770bfadac 100644 --- a/python/cudf_polars/tests/utils/test_broadcast.py +++ b/python/cudf_polars/tests/utils/test_broadcast.py @@ -6,34 +6,35 @@ import pylibcudf as plc import pytest -from cudf_polars.containers import NamedColumn +from cudf_polars.containers import Column from cudf_polars.dsl.ir import broadcast @pytest.mark.parametrize("target", [4, None]) def test_broadcast_all_scalar(target): columns = [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 1, plc.MaskState.ALL_VALID ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] result = broadcast(*columns, target_length=target) expected = 1 if target is None else target + assert [c.name for c in result] == [f"col{i}" for i in range(3)] assert all(column.obj.size() == expected for column in result) def test_invalid_target_length(): columns = [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 4, plc.MaskState.ALL_VALID ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] @@ -43,11 +44,11 @@ def test_invalid_target_length(): def test_broadcast_mismatching_column_lengths(): columns = [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), i + 1, plc.MaskState.ALL_VALID ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] @@ -58,16 +59,17 @@ def test_broadcast_mismatching_column_lengths(): @pytest.mark.parametrize("nrows", [0, 5]) def test_broadcast_with_scalars(nrows): columns = [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), nrows if i == 0 else 1, plc.MaskState.ALL_VALID, ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] result = broadcast(*columns) + assert [c.name for c in result] == [f"col{i}" for i in range(3)] assert all(column.obj.size() == nrows for column in result) diff --git a/python/custreamz/README.md b/python/custreamz/README.md index 8da17ef09dc..e81fc35c544 100644 --- a/python/custreamz/README.md +++ b/python/custreamz/README.md @@ -26,7 +26,7 @@ tips_df = consumer.read_gdf(topic="custreamz_tips", partition=0, start=0, end=10000, - message_format="CSV") + message_format="csv") print(tips_df.head()) tips_df['tip_percentage'] = tips_df['tip'] / tips_df['total_bill'] * 100 diff --git a/python/custreamz/custreamz/kafka.py b/python/custreamz/custreamz/kafka.py index 0def0ba746e..4cbd7244751 100644 --- a/python/custreamz/custreamz/kafka.py +++ b/python/custreamz/custreamz/kafka.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import confluent_kafka as ck from cudf_kafka._lib.kafka import KafkaDatasource @@ -288,4 +288,4 @@ def poll(self, timeout=None): (default: infinite (None translated into -1 in the library)). 
(Seconds) """ - return self.ck.poll(timeout) + return self.ck_consumer.poll(timeout) diff --git a/python/custreamz/custreamz/tests/conftest.py b/python/custreamz/custreamz/tests/conftest.py index 1cda9b71387..c5135bc6414 100644 --- a/python/custreamz/custreamz/tests/conftest.py +++ b/python/custreamz/custreamz/tests/conftest.py @@ -2,6 +2,7 @@ import socket import pytest + from custreamz import kafka diff --git a/python/custreamz/custreamz/tests/pytest.ini b/python/custreamz/custreamz/tests/pytest.ini deleted file mode 100644 index 7b0a9f29fb1..00000000000 --- a/python/custreamz/custreamz/tests/pytest.ini +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -[pytest] -addopts = --tb=native diff --git a/python/custreamz/custreamz/tests/test_dataframes.py b/python/custreamz/custreamz/tests/test_dataframes.py index bae4b051cae..8c0130d2818 100644 --- a/python/custreamz/custreamz/tests/test_dataframes.py +++ b/python/custreamz/custreamz/tests/test_dataframes.py @@ -377,24 +377,16 @@ def test_setitem_overwrites(stream): [ ({}, "sum"), ({}, "mean"), - pytest.param({}, "min"), + ({}, "min"), pytest.param( {}, "median", marks=pytest.mark.xfail(reason="Unavailable for rolling objects"), ), - pytest.param({}, "max"), - pytest.param( - {}, - "var", - marks=pytest.mark.xfail(reason="Unavailable for rolling objects"), - ), - pytest.param({}, "count"), - pytest.param( - {"ddof": 0}, - "std", - marks=pytest.mark.xfail(reason="Unavailable for rolling objects"), - ), + ({}, "max"), + ({}, "var"), + ({}, "count"), + ({"ddof": 0}, "std"), pytest.param( {"quantile": 0.5}, "quantile", diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 85ab0024bb5..a8ab05a3922 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -65,52 +65,24 @@ include = [ ] exclude = ["*tests*"] -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", -] -known_rapids = [ - "rmm", - "cudf", - "dask_cudf", -] -known_first_party = [ - "streamz", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", -] +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["streamz"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm", "cudf", "dask_cudf"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] [tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ "error", "ignore:unclosed dict: + dropna = True if self.dropna is None else self.dropna + return { + "gb_cols": self._by_columns, + "aggs": self.spec_info["aggs"], + "columns": self.spec_info["columns"], + "dropna": dropna, + "sort": self.sort, + "sep": self.sep, + } + + @property + def combine_kwargs(self) -> dict: + dropna = True if self.dropna is None else self.dropna + return { + "gb_cols": self._by_columns, + "dropna": dropna, + "sort": self.sort, + "sep": self.sep, + } + + @property 
+ def aggregate_kwargs(self) -> dict: + dropna = True if self.dropna is None else self.dropna + final_columns = self._slice or self._meta.columns + return { + "gb_cols": self._by_columns, + "aggs": self.spec_info["aggs"], + "columns": self.spec_info["columns"], + "final_columns": final_columns, + "as_index": True, + "dropna": dropna, + "sort": self.sort, + "sep": self.sep, + "str_cols_out": self.spec_info["str_cols_out"], + "aggs_renames": self.spec_info["aggs_renames"], + } + + +class CudfGroupbyAgg(GroupbyAggregation): + @functools.cached_property + def spec_info(self): + return _get_spec_info(self) + + @functools.cached_property + def _meta(self): + return _get_meta(self) + + def _lower(self): + return DecomposableCudfGroupbyAgg( + self.frame, + self.arg, + self.observed, + self.dropna, + self.split_every, + self.split_out, + self.sort, + self.shuffle_method, + self._slice, + *self.by, + ) + + +def _maybe_get_custom_expr( + gb, + aggs, + split_every=None, + split_out=None, + shuffle_method=None, + **kwargs, +): + from dask_cudf.groupby import ( + OPTIMIZED_AGGS, + _aggs_optimized, + _redirect_aggs, + ) + + if kwargs: + # Unsupported keyword arguments + return None + + if not hasattr(gb.obj._meta, "to_pandas"): + # Not cuDF-backed data + return None + + _aggs = _redirect_aggs(aggs) + if not _aggs_optimized(_aggs, OPTIMIZED_AGGS): + # One or more aggregations are unsupported + return None + + return CudfGroupbyAgg( + gb.obj.expr, + _aggs, + gb.observed, + gb.dropna, + split_every, + split_out, + gb.sort, + shuffle_method, + gb._slice, + *gb.by, + ) + + class CudfFusedParquetIO(FusedParquetIO): @staticmethod def _load_multiple_files( @@ -89,16 +302,34 @@ def _dataset_info(self): return dataset_info @staticmethod - def _table_to_pandas( - table, - index_name, - *args, - ): + def _table_to_pandas(table, index_name): df = cudf.DataFrame.from_arrow(table) if index_name is not None: df = df.set_index(index_name) return df + def _filtered_task(self, index: int): + columns = self.columns.copy() + index_name = None + if self.index is not None: + index_name = self.index.name + schema = self._dataset_info["schema"].remove_metadata() + if index_name: + if columns is None: + columns = list(schema.names) + columns.append(index_name) + return ( + self._table_to_pandas, + ( + self._fragment_to_table, + FragmentWrapper(self.fragments[index], filesystem=self.fs), + self.filters, + columns, + schema, + ), + index_name, + ) + def _tune_up(self, parent): if self._fusion_compression_factor >= 1: return diff --git a/python/dask_cudf/dask_cudf/expr/_groupby.py b/python/dask_cudf/dask_cudf/expr/_groupby.py index 65688115b59..8a16fe7615d 100644 --- a/python/dask_cudf/dask_cudf/expr/_groupby.py +++ b/python/dask_cudf/dask_cudf/expr/_groupby.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from dask_expr._collection import new_collection from dask_expr._groupby import ( GroupBy as DXGroupBy, SeriesGroupBy as DXSeriesGroupBy, @@ -11,6 +12,8 @@ from cudf.core.groupby.groupby import _deprecate_collect +from dask_cudf.expr._expr import _maybe_get_custom_expr + ## ## Custom groupby classes ## @@ -54,9 +57,16 @@ def _translate_arg(arg): return arg -# TODO: These classes are mostly a work-around for missing -# `observed=False` support.
-# See: https://github.com/rapidsai/cudf/issues/15173 +# We define our own GroupBy classes in Dask cuDF for +# the following reasons: +# (1) We want to use a custom `aggregate` algorithm +# that performs multiple aggregations on the +# same dataframe partition at once. The upstream +# algorithm breaks distinct aggregations into +# separate tasks. +# (2) We need to work around missing `observed=False` +# support: +# https://github.com/rapidsai/cudf/issues/15173 class GroupBy(DXGroupBy): @@ -89,8 +99,15 @@ def collect(self, **kwargs): _deprecate_collect() return self._single_agg(ListAgg, **kwargs) - def aggregate(self, arg, **kwargs): - return super().aggregate(_translate_arg(arg), **kwargs) + def aggregate(self, arg, fused=True, **kwargs): + if ( + fused + and (expr := _maybe_get_custom_expr(self, arg, **kwargs)) + is not None + ): + return new_collection(expr) + else: + return super().aggregate(_translate_arg(arg), **kwargs) class SeriesGroupBy(DXSeriesGroupBy): diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py index 76bb2ea99b4..0421bd755f4 100644 --- a/python/dask_cudf/dask_cudf/io/__init__.py +++ b/python/dask_cudf/dask_cudf/io/__init__.py @@ -1,11 +1,11 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. -from .csv import read_csv -from .json import read_json -from .orc import read_orc, to_orc -from .text import read_text +from .csv import read_csv # noqa: F401 +from .json import read_json # noqa: F401 +from .orc import read_orc, to_orc # noqa: F401 +from .text import read_text # noqa: F401 try: - from .parquet import read_parquet, to_parquet + from .parquet import read_parquet, to_parquet # noqa: F401 except ImportError: pass diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 620a917109e..ae5ca480e31 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -15,7 +15,11 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr, xfail_dask_expr +from dask_cudf.tests.utils import ( + require_dask_expr, + skip_dask_expr, + xfail_dask_expr, +) # Check if create_metadata_file is supported by # the current dask.dataframe version @@ -371,12 +375,12 @@ def test_split_row_groups(tmpdir, row_groups, index): row_group_size = 5 file_row_groups = 10 # Known apriori npartitions_expected = math.ceil(file_row_groups / row_groups) * 2 - + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "a": np.random.choice(["apple", "banana", "carrot"], size=df_size), - "b": np.random.random(size=df_size), - "c": np.random.randint(1, 5, size=df_size), + "a": rng.choice(["apple", "banana", "carrot"], size=df_size), + "b": rng.random(size=df_size), + "c": rng.integers(1, 5, size=df_size), "index": np.arange(0, df_size), } ) @@ -615,3 +619,28 @@ def test_timezone_column(tmpdir): got = dask_cudf.read_parquet(path) expect = cudf.read_parquet(path) dd.assert_eq(got, expect) + + +@require_dask_expr() +@pytest.mark.skipif( + not dask_cudf.backends.PYARROW_GE_15, + reason="Requires pyarrow 15", +) +@pytest.mark.parametrize("min_part_size", ["1B", "1GB"]) +def test_read_parquet_arrow_filesystem(tmpdir, min_part_size): + tmp_path = str(tmpdir) + with dask.config.set( + { + "dataframe.backend": "cudf", + "dataframe.parquet.minimum-partition-size": min_part_size, + } + ): + dd.from_dict( + {"x": range(1000), "y": ["a", "b", "c", "d"] * 250}, + npartitions=10, + 
).to_parquet(tmp_path, write_index=False) + df = cudf.read_parquet(tmp_path) + ddf = dask_cudf.read_parquet(tmp_path, filesystem="arrow") + dd.assert_eq(df, ddf, check_index=False) + assert isinstance(ddf._meta, cudf.DataFrame) + assert isinstance(ddf.compute(), cudf.DataFrame) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py index cf8af82e112..90907f6fb99 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py @@ -11,6 +11,8 @@ from dask.dataframe import assert_eq +import cudf + import dask_cudf from dask_cudf.tests.utils import QUERY_PLANNING_ON @@ -168,6 +170,8 @@ def test_read_parquet_filesystem(s3_base, s3so, pdf, filesystem): filesystem=filesystem, ) assert df.b.sum().compute() == 9 + assert isinstance(df._meta, cudf.DataFrame) + assert isinstance(df.compute(), cudf.DataFrame) def test_read_parquet_filesystem_explicit(s3_base, s3so, pdf): diff --git a/python/dask_cudf/dask_cudf/tests/pytest.ini b/python/dask_cudf/dask_cudf/tests/pytest.ini deleted file mode 100644 index 7b0a9f29fb1..00000000000 --- a/python/dask_cudf/dask_cudf/tests/pytest.ini +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -[pytest] -addopts = --tb=native diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 6f04b5737da..3fbb2aacd2c 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -25,7 +25,7 @@ def data_dt_1(): def data_dt_2(): - return np.random.randn(100) + return np.random.default_rng(seed=0).standard_normal(size=100) dt_fields = ["year", "month", "day", "hour", "minute", "second"] diff --git a/python/dask_cudf/dask_cudf/tests/test_binops.py b/python/dask_cudf/dask_cudf/tests/test_binops.py index 87bd401accd..8c51f950765 100644 --- a/python/dask_cudf/dask_cudf/tests/test_binops.py +++ b/python/dask_cudf/dask_cudf/tests/test_binops.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
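The test changes above and below all follow one pattern: module-level `np.random.seed(...)` plus calls on the global `np.random` module are replaced by a locally constructed, seeded Generator. A minimal sketch of the mapping (variable names here are illustrative, not from the diff):

```python
import numpy as np

# Replaces np.random.seed(0) followed by calls on the global np.random module.
rng = np.random.default_rng(seed=0)

ints = rng.integers(0, 5, size=10)            # was np.random.randint(0, 5, size=10)
uniforms = rng.random(size=10)                # was np.random.random(size=10) / np.random.rand(10)
normals = rng.standard_normal(size=10)        # was np.random.randn(10)
picks = rng.choice(["a", "b", "c"], size=10)  # was np.random.choice([...], size=10)
```

Seeding the Generator at construction keeps each test deterministic without mutating global RNG state shared across tests.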
import operator @@ -21,10 +21,11 @@ def _make_empty_frame(npartitions=2): def _make_random_frame_float(nelem, npartitions=2): + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": np.random.randint(0, 5, size=nelem), - "y": np.random.normal(size=nelem) + 1, + "x": rng.integers(0, 5, size=nelem), + "y": rng.normal(size=nelem) + 1, } ) gdf = cudf.from_pandas(df) @@ -51,7 +52,6 @@ def _make_random_frame_float(nelem, npartitions=2): @pytest.mark.parametrize("binop", _binops) def test_series_binops_integer(binop): - np.random.seed(0) size = 1000 lhs_df, lhs_gdf = _make_random_frame(size) rhs_df, rhs_gdf = _make_random_frame(size) @@ -62,7 +62,6 @@ def test_series_binops_integer(binop): @pytest.mark.parametrize("binop", _binops) def test_series_binops_float(binop): - np.random.seed(0) size = 1000 lhs_df, lhs_gdf = _make_random_frame_float(size) rhs_df, rhs_gdf = _make_random_frame_float(size) @@ -73,10 +72,10 @@ def test_series_binops_float(binop): @pytest.mark.parametrize("operator", _binops) def test_df_series_bind_ops(operator): - np.random.seed(0) + rng = np.random.default_rng(seed=0) size = 1000 lhs_df, lhs_gdf = _make_random_frame_float(size) - rhs = np.random.rand() + rhs = rng.random() for col in lhs_gdf.columns: got = getattr(lhs_gdf[col], operator.__name__)(rhs) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 5f0fae86691..8e42c847ddf 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -22,13 +22,15 @@ xfail_dask_expr, ) +rng = np.random.default_rng(seed=0) + def test_from_dict_backend_dispatch(): # Test ddf.from_dict cudf-backend dispatch - np.random.seed(0) + rng = np.random.default_rng(seed=0) data = { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } expect = cudf.DataFrame(data) with dask.config.set({"dataframe.backend": "cudf"}): @@ -62,10 +64,10 @@ def test_from_dask_dataframe_deprecated(): def test_to_backend(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) data = { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } with dask.config.set({"dataframe.backend": "pandas"}): ddf = dd.from_dict(data, npartitions=2) @@ -114,12 +116,12 @@ def test_to_backend_kwargs(): def test_from_pandas(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } ) @@ -169,10 +171,10 @@ def _fragmented_gdf(df, nsplit): def test_query(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=10), "y": np.random.normal(size=10)} + {"x": rng.integers(0, 5, size=10), "y": rng.normal(size=10)} ) gdf = cudf.DataFrame.from_pandas(df) expr = "x > 2" @@ -188,9 +190,9 @@ def test_query(): def test_query_local_dict(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=10), "y": np.random.normal(size=10)} + {"x": rng.integers(0, 5, size=10), "y": rng.normal(size=10)} ) gdf = cudf.DataFrame.from_pandas(df) ddf = dask_cudf.from_cudf(gdf, npartitions=2) @@ -204,11 +206,11 @@ def test_query_local_dict(): def test_head(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) 
df = pd.DataFrame( { - "x": np.random.randint(0, 5, size=100), - "y": np.random.normal(size=100), + "x": rng.integers(0, 5, size=100), + "y": rng.normal(size=100), } ) gdf = cudf.DataFrame.from_pandas(df) @@ -220,13 +222,11 @@ def test_head(): @pytest.mark.parametrize("nelem", [10, 200, 1333]) def test_set_index(nelem): with dask.config.set(scheduler="single-threaded"): - np.random.seed(0) + rng = np.random.default_rng(seed=0) # Use unique index range as the sort may not be stable-ordering x = np.arange(nelem) - np.random.shuffle(x) - df = pd.DataFrame( - {"x": x, "y": np.random.randint(0, nelem, size=nelem)} - ) + rng.shuffle(x) + df = pd.DataFrame({"x": x, "y": rng.integers(0, nelem, size=nelem)}) ddf = dd.from_pandas(df, npartitions=2) ddf2 = ddf.to_backend("cudf") @@ -242,7 +242,7 @@ def test_set_index(nelem): def test_set_index_quantile(nelem, nparts, by): df = cudf.DataFrame() df["a"] = np.ascontiguousarray(np.arange(nelem)[::-1]) - df["b"] = np.random.choice(cudf.datasets.names, size=nelem) + df["b"] = rng.choice(cudf.datasets.names, size=nelem) ddf = dd.from_pandas(df, npartitions=nparts) with pytest.warns(FutureWarning, match="deprecated"): @@ -270,11 +270,11 @@ def assert_frame_equal_by_index_group(expect, got): @pytest.mark.parametrize("nelem", [10, 200, 1333]) def test_set_index_2(nelem): with dask.config.set(scheduler="single-threaded"): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": 100 + np.random.randint(0, nelem // 2, size=nelem), - "y": np.random.normal(size=nelem), + "x": 100 + rng.integers(0, nelem // 2, size=nelem), + "y": rng.normal(size=nelem), } ) expect = df.set_index("x").sort_index() @@ -289,11 +289,11 @@ def test_set_index_2(nelem): def test_set_index_w_series(): with dask.config.set(scheduler="single-threaded"): nelem = 20 - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": 100 + np.random.randint(0, nelem // 2, size=nelem), - "y": np.random.normal(size=nelem), + "x": 100 + rng.integers(0, nelem // 2, size=nelem), + "y": rng.normal(size=nelem), } ) expect = df.set_index(df.x).sort_index() @@ -327,12 +327,12 @@ def test_set_index_sorted(): @pytest.mark.parametrize("index", [None, "myindex"]) def test_rearrange_by_divisions(nelem, index): with dask.config.set(scheduler="single-threaded"): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": np.random.randint(0, 20, size=nelem), - "y": np.random.normal(size=nelem), - "z": np.random.choice(["dog", "cat", "bird"], nelem), + "x": rng.integers(0, 20, size=nelem), + "y": rng.normal(size=nelem), + "z": rng.choice(["dog", "cat", "bird"], nelem), } ) df["z"] = df["z"].astype("category") @@ -355,9 +355,9 @@ def test_rearrange_by_divisions(nelem, index): def test_assign(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} + {"x": rng.integers(0, 5, size=20), "y": rng.normal(size=20)} ) dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) @@ -372,10 +372,10 @@ def test_assign(): @pytest.mark.parametrize("data_type", ["int8", "int16", "int32", "int64"]) def test_setitem_scalar_integer(data_type): - np.random.seed(0) - scalar = np.random.randint(0, 100, dtype=data_type) + rng = np.random.default_rng(seed=0) + scalar = rng.integers(0, 100, dtype=data_type) df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} + {"x": rng.integers(0, 5, size=20), "y": rng.normal(size=20)} 
) dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) @@ -388,10 +388,10 @@ def test_setitem_scalar_integer(data_type): @pytest.mark.parametrize("data_type", ["float32", "float64"]) def test_setitem_scalar_float(data_type): - np.random.seed(0) - scalar = np.random.randn(1).astype(data_type)[0] + rng = np.random.default_rng(seed=0) + scalar = rng.standard_normal(size=1).astype(data_type)[0] df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} + {"x": rng.integers(0, 5, size=20), "y": rng.normal(size=20)} ) dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) @@ -403,10 +403,10 @@ def test_setitem_scalar_float(data_type): def test_setitem_scalar_datetime(): - np.random.seed(0) - scalar = np.int64(np.random.randint(0, 100)).astype("datetime64[ms]") + rng = np.random.default_rng(seed=0) + scalar = np.int64(rng.integers(0, 100)).astype("datetime64[ms]") df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} + {"x": rng.integers(0, 5, size=20), "y": rng.normal(size=20)} ) dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) @@ -422,12 +422,12 @@ def test_setitem_scalar_datetime(): "func", [ lambda: pd.DataFrame( - {"A": np.random.rand(10), "B": np.random.rand(10)}, + {"A": rng.random(10), "B": rng.random(10)}, index=list("abcdefghij"), ), lambda: pd.DataFrame( { - "A": np.random.rand(10), + "A": rng.random(10), "B": list("a" * 10), "C": pd.Series( [str(20090101 + i) for i in range(10)], @@ -438,7 +438,7 @@ def test_setitem_scalar_datetime(): ), lambda: pd.Series(list("abcdefghijklmnop")), lambda: pd.Series( - np.random.rand(10), + rng.random(10), index=pd.Index( [str(20090101 + i) for i in range(10)], dtype="datetime64[ns]" ), @@ -497,10 +497,11 @@ def test_repartition_hash_staged(npartitions): by = ["b"] datarange = 35 size = 100 + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame( { "a": np.arange(size, dtype="int64"), - "b": np.random.randint(datarange, size=size), + "b": rng.integers(datarange, size=size), } ) # WARNING: Specific npartitions-max_branch combination @@ -537,12 +538,13 @@ def test_repartition_hash(by, npartitions, max_branch): npartitions_i = 4 datarange = 26 size = 100 + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame( { "a": np.arange(0, stop=size, dtype="int64"), - "b": np.random.randint(datarange, size=size), - "c": np.random.choice(list("abcdefgh"), size=size), - "d": np.random.choice(np.arange(26), size=size), + "b": rng.integers(datarange, size=size), + "c": rng.choice(list("abcdefgh"), size=size), + "d": rng.choice(np.arange(26), size=size), } ) gdf.d = gdf.d.astype("datetime64[ms]") @@ -768,6 +770,7 @@ def test_dataframe_series_replace(data): def test_dataframe_assign_col(): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame(list(range(100))) pdf = pd.DataFrame(list(range(100))) @@ -780,7 +783,7 @@ def test_dataframe_assign_col(): pddf = dd.from_pandas(pdf, npartitions=4) pddf["fold"] = 0 pddf["fold"] = pddf["fold"].map_partitions( - lambda p_df: pd.Series(np.random.randint(0, 4, len(p_df))) + lambda p_df: pd.Series(rng.integers(0, 4, len(p_df))) ) dd.assert_eq(ddf[0], pddf[0]) @@ -1015,10 +1018,11 @@ def test_to_backend_simplify(): @pytest.mark.parametrize("numeric_only", [True, False]) @pytest.mark.parametrize("op", ["corr", "cov"]) def test_cov_corr(op, numeric_only): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame.from_dict( { - "x": np.random.randint(0, 5, size=10), - "y": np.random.normal(size=10), + "x": 
rng.integers(0, 5, size=10), + "y": rng.normal(size=10), } ) ddf = dd.from_pandas(df, npartitions=2) diff --git a/python/dask_cudf/dask_cudf/tests/test_delayed_io.py b/python/dask_cudf/dask_cudf/tests/test_delayed_io.py index e6fb58ad6df..84ed3b46598 100644 --- a/python/dask_cudf/dask_cudf/tests/test_delayed_io.py +++ b/python/dask_cudf/dask_cudf/tests/test_delayed_io.py @@ -51,9 +51,13 @@ def test_series_from_delayed(): def test_dataframe_to_delayed(): nelem = 100 - df = cudf.DataFrame() - df["x"] = np.arange(nelem) - df["y"] = np.random.randint(nelem, size=nelem) + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + { + "x": np.arange(nelem), + "y": rng.integers(nelem, size=nelem), + } + ) ddf = dask_cudf.from_cudf(df, npartitions=5) @@ -80,8 +84,8 @@ def test_dataframe_to_delayed(): def test_series_to_delayed(): nelem = 100 - - sr = cudf.Series(np.random.randint(nelem, size=nelem)) + rng = np.random.default_rng(seed=0) + sr = cudf.Series(rng.integers(nelem, size=nelem)) dsr = dask_cudf.from_cudf(sr, npartitions=5) @@ -109,11 +113,13 @@ def test_series_to_delayed(): def test_mixing_series_frame_error(): nelem = 20 - - df = cudf.DataFrame() - df["x"] = np.arange(nelem) - df["y"] = np.random.randint(nelem, size=nelem) - + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + { + "x": np.arange(nelem), + "y": rng.integers(nelem, size=nelem), + } + ) ddf = dask_cudf.from_cudf(df, npartitions=5) delay_frame = ddf.to_delayed() @@ -128,10 +134,13 @@ def test_mixing_series_frame_error(): def test_frame_extra_columns_error(): nelem = 20 - - df = cudf.DataFrame() - df["x"] = np.arange(nelem) - df["y"] = np.random.randint(nelem, size=nelem) + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + { + "x": np.arange(nelem), + "y": rng.integers(nelem, size=nelem), + } + ) ddf1 = dask_cudf.from_cudf(df, npartitions=5) df["z"] = np.arange(nelem) diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py index a12481a7bb4..fe57d4a4f00 100644 --- a/python/dask_cudf/dask_cudf/tests/test_dispatch.py +++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py @@ -32,8 +32,9 @@ def test_pyarrow_conversion_dispatch(preserve_index, index): to_pyarrow_table_dispatch, ) + rng = np.random.default_rng(seed=0) df1 = cudf.DataFrame( - np.random.randn(10, 3), columns=list("abc"), index=index + rng.standard_normal(size=(10, 3)), columns=list("abc"), index=index ) df2 = from_pyarrow_table_dispatch( df1, to_pyarrow_table_dispatch(df1, preserve_index=preserve_index) @@ -108,7 +109,8 @@ def test_pyarrow_schema_dispatch(preserve_index): to_pyarrow_table_dispatch, ) - df = cudf.DataFrame(np.random.randn(10, 3), columns=list("abc")) + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame(rng.standard_normal(size=(10, 3)), columns=list("abc")) df["d"] = cudf.Series(["cat", "dog"] * 5) table = to_pyarrow_table_dispatch(df, preserve_index=preserve_index) schema = pyarrow_schema_dispatch(df, preserve_index=preserve_index) diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 7b9f0ca328a..042e69d86f4 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -14,7 +14,11 @@ import dask_cudf from dask_cudf.groupby import OPTIMIZED_AGGS, _aggs_optimized -from dask_cudf.tests.utils import QUERY_PLANNING_ON, xfail_dask_expr +from dask_cudf.tests.utils import ( + QUERY_PLANNING_ON, + require_dask_expr, + xfail_dask_expr, +) def 
assert_cudf_groupby_layers(ddf): @@ -30,21 +34,21 @@ def assert_cudf_groupby_layers(ddf): @pytest.fixture(params=["non_null", "null"]) def pdf(request): - np.random.seed(0) + rng = np.random.default_rng(seed=0) # note that column name "x" is a substring of the groupby key; # this gives us coverage for cudf#10829 pdf = pd.DataFrame( { - "xx": np.random.randint(0, 5, size=10000), - "x": np.random.normal(size=10000), - "y": np.random.normal(size=10000), + "xx": rng.integers(0, 5, size=10000), + "x": rng.normal(size=10000), + "y": rng.normal(size=10000), } ) # insert nulls into dataframe at random if request.param == "null": - pdf = pdf.mask(np.random.choice([True, False], size=pdf.shape)) + pdf = pdf.mask(rng.choice([True, False], size=pdf.shape)) return pdf @@ -173,11 +177,12 @@ def test_groupby_agg_empty_partition(tmpdir, split_out): ], ) def test_groupby_multi_column(func): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "a": np.random.randint(0, 20, size=1000), - "b": np.random.randint(0, 5, size=1000), - "x": np.random.normal(size=1000), + "a": rng.integers(0, 20, size=1000), + "b": rng.integers(0, 5, size=1000), + "x": rng.normal(size=1000), } ) @@ -371,11 +376,12 @@ def test_groupby_string_index_name(myindex): ], ) def test_groupby_split_out_multiindex(agg_func): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame( { - "a": np.random.randint(0, 10, 100), - "b": np.random.randint(0, 5, 100), - "c": np.random.random(100), + "a": rng.integers(0, 10, 100), + "b": rng.integers(0, 5, 100), + "c": rng.random(100), } ) ddf = dask_cudf.from_cudf(df, 5) @@ -419,12 +425,13 @@ def test_groupby_multiindex_reset_index(npartitions): ], ) def test_groupby_reset_index_multiindex(groupby_keys, agg_func): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame( { - "a": np.random.randint(0, 10, 10), - "b": np.random.randint(0, 5, 10), - "c": np.random.randint(0, 5, 10), - "dd": np.random.randint(0, 5, 10), + "a": rng.integers(0, 10, 10), + "b": rng.integers(0, 5, 10), + "c": rng.integers(0, 5, 10), + "dd": rng.integers(0, 5, 10), } ) ddf = dask_cudf.from_cudf(df, 5) @@ -437,8 +444,9 @@ def test_groupby_reset_index_multiindex(groupby_keys, agg_func): def test_groupby_reset_index_drop_True(): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame( - {"a": np.random.randint(0, 10, 10), "b": np.random.randint(0, 5, 10)} + {"a": rng.integers(0, 10, 10), "b": rng.integers(0, 5, 10)} ) ddf = dask_cudf.from_cudf(df, 5) pddf = dd.from_pandas(df.to_pandas(), 5) @@ -552,10 +560,22 @@ def test_groupby_categorical_key(): ), ], ) +@pytest.mark.parametrize( + "fused", + [ + True, + pytest.param( + False, + marks=require_dask_expr("Not supported by legacy API"), + ), + ], +) @pytest.mark.parametrize("split_out", ["use_dask_default", 1, 2]) @pytest.mark.parametrize("split_every", [False, 4]) @pytest.mark.parametrize("npartitions", [1, 10]) -def test_groupby_agg_params(npartitions, split_every, split_out, as_index): +def test_groupby_agg_params( + npartitions, split_every, split_out, fused, as_index +): df = cudf.datasets.randomdata( nrows=150, dtypes={"name": str, "a": int, "b": int, "c": float}, @@ -570,6 +590,7 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): "c": ["mean", "std", "var"], } + fused_kwarg = {"fused": fused} if QUERY_PLANNING_ON else {} split_kwargs = {"split_every": split_every, "split_out": split_out} if split_out == "use_dask_default": split_kwargs.pop("split_out") @@ -589,6 +610,7 @@ def test_groupby_agg_params(npartitions, split_every, split_out, 
as_index): ddf.groupby(["name", "a"], sort=True, **maybe_as_index) .aggregate( agg_dict, + **fused_kwarg, **split_kwargs, ) .compute() @@ -610,6 +632,7 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): # Full check (`sort=False`) gr = ddf.groupby(["name", "a"], sort=False, **maybe_as_index).aggregate( agg_dict, + **fused_kwarg, **split_kwargs, ) pr = pddf.groupby(["name", "a"], sort=False).agg( @@ -653,10 +676,11 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): "aggregations", [(sum, "sum"), (max, "max"), (min, "min")] ) def test_groupby_agg_redirect(aggregations): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } ) @@ -758,10 +782,11 @@ def test_groupby_with_list_of_series(): ], ) def test_groupby_nested_dict(func): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } ) @@ -794,10 +819,11 @@ def test_groupby_nested_dict(func): ], ) def test_groupby_all_columns(func): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } ) diff --git a/python/dask_cudf/dask_cudf/tests/test_join.py b/python/dask_cudf/dask_cudf/tests/test_join.py index 3e078c47cdd..61d0f8d7eb9 100644 --- a/python/dask_cudf/dask_cudf/tests/test_join.py +++ b/python/dask_cudf/dask_cudf/tests/test_join.py @@ -22,18 +22,18 @@ def test_join_inner(left_nrows, right_nrows, left_nkeys, right_nkeys): chunksize = 50 - np.random.seed(0) + rng = np.random.default_rng(seed=0) # cuDF left = cudf.DataFrame( { - "x": np.random.randint(0, left_nkeys, size=left_nrows), + "x": rng.integers(0, left_nkeys, size=left_nrows), "a": np.arange(left_nrows), } ) right = cudf.DataFrame( { - "x": np.random.randint(0, right_nkeys, size=right_nrows), + "x": rng.integers(0, right_nkeys, size=right_nrows), "a": 1000 * np.arange(right_nrows), } ) @@ -84,18 +84,18 @@ def gather(df, grows): def test_join_left(left_nrows, right_nrows, left_nkeys, right_nkeys, how): chunksize = 50 - np.random.seed(0) + rng = np.random.default_rng(seed=0) # cuDF left = cudf.DataFrame( { - "x": np.random.randint(0, left_nkeys, size=left_nrows), + "x": rng.integers(0, left_nkeys, size=left_nrows), "a": np.arange(left_nrows, dtype=np.float64), } ) right = cudf.DataFrame( { - "x": np.random.randint(0, right_nkeys, size=right_nrows), + "x": rng.integers(0, right_nkeys, size=right_nrows), "a": 1000 * np.arange(right_nrows, dtype=np.float64), } ) @@ -153,20 +153,20 @@ def test_merge_left( ): chunksize = 3 - np.random.seed(0) + rng = np.random.default_rng(seed=42) # cuDF left = cudf.DataFrame( { - "x": np.random.randint(0, left_nkeys, size=left_nrows), - "y": np.random.randint(0, left_nkeys, size=left_nrows), + "x": rng.integers(0, left_nkeys, size=left_nrows), + "y": rng.integers(0, left_nkeys, size=left_nrows), "a": np.arange(left_nrows, dtype=np.float64), } ) right = cudf.DataFrame( { - "x": np.random.randint(0, right_nkeys, size=right_nrows), - "y": np.random.randint(0, right_nkeys, size=right_nrows), + "x": rng.integers(0, right_nkeys, size=right_nrows), + "y": rng.integers(0, right_nkeys, size=right_nrows), "a": 1000 * np.arange(right_nrows, 
dtype=np.float64), } ) @@ -200,18 +200,18 @@ def test_merge_1col_left( ): chunksize = 3 - np.random.seed(0) + rng = np.random.default_rng(seed=0) # cuDF left = cudf.DataFrame( { - "x": np.random.randint(0, left_nkeys, size=left_nrows), + "x": rng.integers(0, left_nkeys, size=left_nrows), "a": np.arange(left_nrows, dtype=np.float64), } ) right = cudf.DataFrame( { - "x": np.random.randint(0, right_nkeys, size=right_nrows), + "x": rng.integers(0, right_nkeys, size=right_nrows), "a": 1000 * np.arange(right_nrows, dtype=np.float64), } ) @@ -238,13 +238,19 @@ def test_merge_1col_left( def test_merge_should_fail(): # Expected failure cases described in #2694 - df1 = cudf.DataFrame() - df1["a"] = [1, 2, 3, 4, 5, 6] * 2 - df1["b"] = np.random.randint(0, 12, 12) - - df2 = cudf.DataFrame() - df2["a"] = [7, 2, 3, 8, 5, 9] * 2 - df2["c"] = np.random.randint(0, 12, 12) + rng = np.random.default_rng(seed=0) + df1 = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6] * 2, + "b": rng.integers(0, 12, 12), + } + ) + df2 = pd.DataFrame( + { + "a": [7, 2, 3, 8, 5, 9] * 2, + "c": rng.integers(0, 12, 12), + } + ) left = dask_cudf.from_cudf(df1, 1).groupby("a").b.min().to_frame() right = dask_cudf.from_cudf(df2, 1).groupby("a").c.min().to_frame() @@ -257,7 +263,7 @@ def test_merge_should_fail(): left.merge(right, how="left", on=["c"]) # Same column names - df2["b"] = np.random.randint(0, 12, 12) + df2["b"] = np.random.default_rng(seed=0).integers(0, 12, 12) right = dask_cudf.from_cudf(df2, 1).groupby("a").b.min().to_frame() with pytest.raises(KeyError): diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py index d03e92319be..4351b672151 100644 --- a/python/dask_cudf/dask_cudf/tests/test_reductions.py +++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py @@ -13,11 +13,11 @@ def _make_random_frame(nelem, npartitions=2): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": np.random.randint(0, 5, size=nelem), - "y": np.random.normal(size=nelem) + 1, + "x": rng.integers(0, 5, size=nelem), + "y": rng.normal(loc=1.0, scale=1.0, size=nelem), } ) gdf = cudf.DataFrame.from_pandas(df) diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py index 9bbbbc79561..02c815427f3 100644 --- a/python/dask_cudf/dask_cudf/tests/test_sort.py +++ b/python/dask_cudf/dask_cudf/tests/test_sort.py @@ -28,7 +28,7 @@ @pytest.mark.parametrize("nelem", [10, 500]) @pytest.mark.parametrize("nparts", [1, 10]) def test_sort_values(nelem, nparts, by, ascending): - np.random.seed(0) + _ = np.random.default_rng(seed=0) df = cudf.DataFrame() df["a"] = np.ascontiguousarray(np.arange(nelem)[::-1]) df["b"] = np.arange(100, nelem + 100) @@ -82,7 +82,7 @@ def test_sort_repartition(): ], ) def test_sort_values_with_nulls(data, by, ascending, na_position): - np.random.seed(0) + _ = np.random.default_rng(seed=0) cp.random.seed(0) df = cudf.DataFrame(data) ddf = dd.from_pandas(df, npartitions=5) diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py index cc0c6899804..9aaf6dc8420 100644 --- a/python/dask_cudf/dask_cudf/tests/utils.py +++ b/python/dask_cudf/dask_cudf/tests/utils.py @@ -19,8 +19,9 @@ def _make_random_frame(nelem, npartitions=2, include_na=False): + rng = np.random.default_rng(seed=None) df = pd.DataFrame( - {"x": np.random.random(size=nelem), "y": np.random.random(size=nelem)} + {"x": rng.random(size=nelem), "y": rng.random(size=nelem)} ) if include_na: diff 
--git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 76e47b50c3b..862e8f36eaa 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "numpy>=1.23,<3.0a0", - "pandas>=2.0,<2.2.3dev0", + "pandas>=2.0,<2.2.4dev0", "rapids-dask-dependency==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -69,52 +69,21 @@ version = {file = "dask_cudf/VERSION"} [tool.setuptools.packages.find] exclude = ["*tests*"] -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true +[tool.ruff] +extend = "../../pyproject.toml" -known_dask = [ - "dask", - "distributed", - "dask_cuda", -] -known_rapids = [ - "rmm", - "cudf", -] -known_first_party = [ - "dask_cudf", -] +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["dask_cudf"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", -] +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm", "cudf"] [tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ "error::FutureWarning", "error::DeprecationWarning", @@ -124,4 +93,8 @@ filterwarnings = [ # https://github.com/dask/partd/blob/main/partd/pandas.py#L198 "ignore:Passing a BlockManager to DataFrame is deprecated and will raise in a future version. Use public APIs instead.:DeprecationWarning", "ignore:String support for `aggregate_files` is experimental. Behavior may change in the future.:FutureWarning:dask", + # Dask now loudly throws warnings: https://github.com/dask/dask/pull/11437 + # When the legacy implementation is removed, we can drop this filter and stop running pytest with `DASK_DATAFRAME__QUERY_PLANNING=False` + "ignore:The legacy Dask DataFrame implementation is deprecated and will be removed in a future version.*:FutureWarning", ] +xfail_strict = true diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt index 0a8f5c4807d..5f9a04d3cee 100644 --- a/python/libcudf/CMakeLists.txt +++ b/python/libcudf/CMakeLists.txt @@ -22,6 +22,8 @@ project( LANGUAGES CXX ) +option(USE_NVCOMP_RUNTIME_WHEEL "Use the nvcomp wheel at runtime instead of the system library" OFF) + # Check if cudf is already available. If so, it is the user's responsibility to ensure that the # CMake package is also available at build time of the Python cudf package. find_package(cudf "${RAPIDS_VERSION}") @@ -39,14 +41,20 @@ set(BUILD_TESTS OFF) set(BUILD_BENCHMARKS OFF) set(CUDF_BUILD_TESTUTIL OFF) set(CUDF_BUILD_STREAMS_TEST_UTIL OFF) +if(USE_NVCOMP_RUNTIME_WHEEL) + set(CUDF_EXPORT_NVCOMP OFF) +endif() set(CUDA_STATIC_RUNTIME ON) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) add_subdirectory(../../cpp cudf-cpp) -# Ensure other libraries needed by libcudf.so get installed alongside it.
-include(cmake/Modules/WheelHelpers.cmake) -install_aliased_imported_targets( - TARGETS cudf nvcomp::nvcomp DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY} -) +if(USE_NVCOMP_RUNTIME_WHEEL) + set(rpaths "$ORIGIN/../../nvidia/nvcomp") + set_property( + TARGET cudf + PROPERTY INSTALL_RPATH ${rpaths} + APPEND + ) +endif() diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml index 5bffe9fd96c..84660cbc276 100644 --- a/python/libcudf/pyproject.toml +++ b/python/libcudf/pyproject.toml @@ -37,6 +37,9 @@ classifiers = [ "Programming Language :: C++", "Environment :: GPU :: NVIDIA CUDA", ] +dependencies = [ + "nvidia-nvcomp==4.0.1", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] Homepage = "https://github.com/rapidsai/cudf" diff --git a/python/pylibcudf/LICENSE b/python/pylibcudf/LICENSE new file mode 120000 index 00000000000..30cff7403da --- /dev/null +++ b/python/pylibcudf/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index a7cb66d7b16..15dd2b4c34f 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -17,6 +17,7 @@ set(cython_sources binaryop.pyx column.pyx column_factories.pyx + contiguous_split.pyx concatenate.pyx copying.pyx datetime.pyx @@ -27,6 +28,7 @@ set(cython_sources groupby.pyx interop.pyx join.pyx + json.pyx labeling.pyx lists.pyx merge.pyx @@ -66,3 +68,4 @@ target_link_libraries(pylibcudf_interop PUBLIC nanoarrow) add_subdirectory(libcudf) add_subdirectory(strings) add_subdirectory(io) +add_subdirectory(nvtext) diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index a384edd456d..9bdfdab97c2 100644 --- a/python/pylibcudf/pylibcudf/__init__.pxd +++ b/python/pylibcudf/pylibcudf/__init__.pxd @@ -6,17 +6,21 @@ from . 
cimport ( binaryop, column_factories, concatenate, + contiguous_split, copying, datetime, experimental, expressions, filling, groupby, + interop, join, + json, labeling, lists, merge, null_mask, + nvtext, partitioning, quantiles, reduce, @@ -50,6 +54,7 @@ __all__ = [ "aggregation", "binaryop", "column_factories", + "contiguous_split", "concatenate", "copying", "datetime", @@ -58,7 +63,9 @@ __all__ = [ "filling", "gpumemoryview", "groupby", + "interop", "join", + "json", "lists", "merge", "null_mask", @@ -78,4 +85,5 @@ __all__ = [ "transpose", "types", "unary", + "nvtext", ] diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index 2a5365e8fad..4033062b7e2 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -15,6 +15,7 @@ binaryop, column_factories, concatenate, + contiguous_split, copying, datetime, experimental, @@ -24,10 +25,12 @@ interop, io, join, + json, labeling, lists, merge, null_mask, + nvtext, partitioning, quantiles, reduce, @@ -61,6 +64,7 @@ "aggregation", "binaryop", "column_factories", + "contiguous_split", "concatenate", "copying", "datetime", @@ -72,6 +76,7 @@ "interop", "io", "join", + "json", "labeling", "lists", "merge", @@ -92,4 +97,5 @@ "transpose", "types", "unary", + "nvtext", ] diff --git a/python/pylibcudf/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx index 5f9d145139a..51b2b4cfaa3 100644 --- a/python/pylibcudf/pylibcudf/binaryop.pyx +++ b/python/pylibcudf/pylibcudf/binaryop.pyx @@ -52,33 +52,27 @@ cpdef Column binary_operation( if LeftBinaryOperand is Column and RightBinaryOperand is Column: with nogil: - result = move( - cpp_binaryop.binary_operation( - lhs.view(), - rhs.view(), - op, - output_type.c_obj - ) + result = cpp_binaryop.binary_operation( + lhs.view(), + rhs.view(), + op, + output_type.c_obj ) elif LeftBinaryOperand is Column and RightBinaryOperand is Scalar: with nogil: - result = move( - cpp_binaryop.binary_operation( - lhs.view(), - dereference(rhs.c_obj), - op, - output_type.c_obj - ) + result = cpp_binaryop.binary_operation( + lhs.view(), + dereference(rhs.c_obj), + op, + output_type.c_obj ) elif LeftBinaryOperand is Scalar and RightBinaryOperand is Column: with nogil: - result = move( - cpp_binaryop.binary_operation( - dereference(lhs.c_obj), - rhs.view(), - op, - output_type.c_obj - ) + result = cpp_binaryop.binary_operation( + dereference(lhs.c_obj), + rhs.view(), + op, + output_type.c_obj ) else: raise ValueError(f"Invalid arguments {lhs} and {rhs}") diff --git a/python/pylibcudf/pylibcudf/column.pyx b/python/pylibcudf/pylibcudf/column.pyx index a37a12fc7e1..4e5698566d0 100644 --- a/python/pylibcudf/pylibcudf/column.pyx +++ b/python/pylibcudf/pylibcudf/column.pyx @@ -8,7 +8,7 @@ from pylibcudf.libcudf.column.column_factories cimport make_column_from_scalar from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from .gpumemoryview cimport gpumemoryview from .scalar cimport Scalar @@ -138,7 +138,7 @@ cdef class Column: cdef size_type null_count = libcudf_col.get().null_count() - cdef column_contents contents = move(libcudf_col.get().release()) + cdef column_contents contents = libcudf_col.get().release() # Note that when converting to cudf Column objects we'll need to pull # out the base object. 
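For orientation on the `binary_operation` overloads touched above, here is a minimal sketch of the Column/Column branch driven from Python; it assumes the `pylibcudf.interop` pyarrow helpers for building and reading back device columns:

```python
import pyarrow as pa
import pylibcudf as plc

# Two small INT32 device columns, built via the pyarrow interop helpers.
lhs = plc.interop.from_arrow(pa.array([1, 2, 3], type=pa.int32()))
rhs = plc.interop.from_arrow(pa.array([10, 20, 30], type=pa.int32()))

# Column/Column overload; Scalar operands dispatch to the other branches
# shown in the hunk above.
out = plc.binaryop.binary_operation(
    lhs,
    rhs,
    plc.binaryop.BinaryOperator.ADD,
    plc.types.DataType(plc.types.TypeId.INT32),
)
print(plc.interop.to_arrow(out))  # [11, 22, 33]
```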
@@ -247,7 +247,7 @@ cdef class Column: cdef const scalar* c_scalar = slr.get() cdef unique_ptr[column] c_result with nogil: - c_result = move(make_column_from_scalar(dereference(c_scalar), size)) + c_result = make_column_from_scalar(dereference(c_scalar), size) return Column.from_libcudf(move(c_result)) @staticmethod @@ -269,7 +269,7 @@ cdef class Column: cdef Scalar slr = Scalar.empty_like(like) cdef unique_ptr[column] c_result with nogil: - c_result = move(make_column_from_scalar(dereference(slr.get()), size)) + c_result = make_column_from_scalar(dereference(slr.get()), size) return Column.from_libcudf(move(c_result)) @staticmethod @@ -373,7 +373,7 @@ cdef class Column: """Create a copy of the column.""" cdef unique_ptr[column] c_result with nogil: - c_result = move(make_unique[column](self.view())) + c_result = make_unique[column](self.view()) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/column_factories.pxd b/python/pylibcudf/pylibcudf/column_factories.pxd index fef02359240..d556085ab64 100644 --- a/python/pylibcudf/pylibcudf/column_factories.pxd +++ b/python/pylibcudf/pylibcudf/column_factories.pxd @@ -1,7 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move -from pylibcudf.libcudf.types cimport mask_state, size_type +from pylibcudf.libcudf.types cimport mask_state from .column cimport Column from .types cimport DataType, size_type, type_id diff --git a/python/pylibcudf/pylibcudf/column_factories.pyx b/python/pylibcudf/pylibcudf/column_factories.pyx index e9085e3ea02..ac942a620b5 100644 --- a/python/pylibcudf/pylibcudf/column_factories.pyx +++ b/python/pylibcudf/pylibcudf/column_factories.pyx @@ -39,29 +39,17 @@ cpdef Column make_empty_column(MakeEmptyColumnOperand type_or_id): if isinstance(type_or_id, TypeId): id = type_or_id with nogil: - result = move( - cpp_make_empty_column( - id - ) - ) + result = cpp_make_empty_column(id) else: raise TypeError( "Must pass a TypeId or DataType" ) elif MakeEmptyColumnOperand is DataType: with nogil: - result = move( - cpp_make_empty_column( - type_or_id.c_obj - ) - ) + result = cpp_make_empty_column(type_or_id.c_obj) elif MakeEmptyColumnOperand is type_id: with nogil: - result = move( - cpp_make_empty_column( - type_or_id - ) - ) + result = cpp_make_empty_column(type_or_id) else: raise TypeError( "Must pass a TypeId or DataType" @@ -92,12 +80,10 @@ cpdef Column make_numeric_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_numeric_column( - type_.c_obj, - size, - state - ) + result = cpp_make_numeric_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) @@ -121,12 +107,10 @@ cpdef Column make_fixed_point_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_fixed_point_column( - type_.c_obj, - size, - state - ) + result = cpp_make_fixed_point_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) @@ -151,12 +135,10 @@ cpdef Column make_timestamp_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_timestamp_column( - type_.c_obj, - size, - state - ) + result = cpp_make_timestamp_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) @@ -181,12 +163,10 @@ cpdef Column make_duration_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_duration_column( - type_.c_obj, - size, - state - ) + result 
= cpp_make_duration_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) @@ -211,12 +191,10 @@ cpdef Column make_fixed_width_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_fixed_width_column( - type_.c_obj, - size, - state - ) + result = cpp_make_fixed_width_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/concatenate.pyx b/python/pylibcudf/pylibcudf/concatenate.pyx index 8bdcc086e0f..10c860d97bb 100644 --- a/python/pylibcudf/pylibcudf/concatenate.pyx +++ b/python/pylibcudf/pylibcudf/concatenate.pyx @@ -40,14 +40,14 @@ cpdef concatenate(list objects): c_tables.push_back((tbl).view()) with nogil: - c_tbl_result = move(cpp_concatenate.concatenate(c_tables)) + c_tbl_result = cpp_concatenate.concatenate(c_tables) return Table.from_libcudf(move(c_tbl_result)) elif isinstance(objects[0], Column): for column in objects: c_columns.push_back((column).view()) with nogil: - c_col_result = move(cpp_concatenate.concatenate(c_columns)) + c_col_result = cpp_concatenate.concatenate(c_columns) return Column.from_libcudf(move(c_col_result)) else: raise ValueError("input must be a list of Columns or Tables") diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pxd b/python/pylibcudf/pylibcudf/contiguous_split.pxd new file mode 100644 index 00000000000..2a10cb5b3d5 --- /dev/null +++ b/python/pylibcudf/pylibcudf/contiguous_split.pxd @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from pylibcudf.libcudf.contiguous_split cimport packed_columns + +from .gpumemoryview cimport gpumemoryview +from .table cimport Table + + +cdef class PackedColumns: + cdef unique_ptr[packed_columns] c_obj + + @staticmethod + cdef PackedColumns from_libcudf(unique_ptr[packed_columns] data) + +cpdef PackedColumns pack(Table input) + +cpdef Table unpack(PackedColumns input) + +cpdef Table unpack_from_memoryviews(memoryview metadata, gpumemoryview gpu_data) diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx new file mode 100644 index 00000000000..ed926a3fcc0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx @@ -0,0 +1,198 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from cython.operator cimport dereference +from libc.stdint cimport uint8_t +from libcpp.memory cimport make_unique, unique_ptr +from libcpp.utility cimport move +from libcpp.vector cimport vector +from pylibcudf.libcudf.contiguous_split cimport ( + pack as cpp_pack, + packed_columns, + unpack as cpp_unpack, +) +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view + +from rmm.pylibrmm.device_buffer cimport DeviceBuffer + +from .gpumemoryview cimport gpumemoryview +from .table cimport Table +from .utils cimport int_to_void_ptr + + +cdef class HostBuffer: + """Owning host buffer that implements the buffer protocol""" + cdef unique_ptr[vector[uint8_t]] c_obj + cdef size_t nbytes + cdef Py_ssize_t[1] shape + cdef Py_ssize_t[1] strides + + @staticmethod + cdef HostBuffer from_unique_ptr( + unique_ptr[vector[uint8_t]] vec + ): + cdef HostBuffer out = HostBuffer() + out.c_obj = move(vec) + out.nbytes = dereference(out.c_obj).size() + out.shape[0] = out.nbytes + out.strides[0] = 1 + return out + + def __getbuffer__(self, Py_buffer *buffer, int flags): + buffer.buf = dereference(self.c_obj).data() + buffer.format = NULL # byte + buffer.internal = NULL + buffer.itemsize = 1 + buffer.len = self.nbytes + buffer.ndim = 1 + buffer.obj = self + buffer.readonly = 0 + buffer.shape = self.shape + buffer.strides = self.strides + buffer.suboffsets = NULL + + def __releasebuffer__(self, Py_buffer *buffer): + pass + +cdef class PackedColumns: + """Column data in a serialized format. + + Contains data from an array of columns in two contiguous buffers: + one on host, which contains the table metadata, and one on device, + which contains the table data. + + For details, see :cpp:class:`cudf::packed_columns`. + """ + def __init__(self): + raise ValueError( + "PackedColumns should not be constructed directly. " + "Use one of the factories." + ) + + @staticmethod + cdef PackedColumns from_libcudf(unique_ptr[packed_columns] data): + """Create a Python PackedColumns from a libcudf packed_columns.""" + cdef PackedColumns out = PackedColumns.__new__(PackedColumns) + out.c_obj = move(data) + return out + + def release(self): + """Releases and returns the underlying serialized metadata and gpu data. + + Ownership of the memory is transferred to the returned buffers. After + this call, `self` is empty. + + Returns + ------- + memoryview (of a HostBuffer) + The serialized metadata as contiguous host memory. + gpumemoryview (of a rmm.DeviceBuffer) + The serialized gpu data as contiguous device memory. + """ + if not (dereference(self.c_obj).metadata and dereference(self.c_obj).gpu_data): + raise ValueError("Cannot release empty PackedColumns") + + return ( + memoryview( + HostBuffer.from_unique_ptr(move(dereference(self.c_obj).metadata)) + ), + gpumemoryview( + DeviceBuffer.c_from_unique_ptr(move(dereference(self.c_obj).gpu_data)) + ) + ) + + +cpdef PackedColumns pack(Table input): + """Deep-copy a table into a serialized contiguous memory format. + + Later use `unpack` or `unpack_from_memoryviews` to unpack the serialized + data back into the table. + + Examples + -------- + >>> packed = pylibcudf.contiguous_split.pack(...) + >>> # Either unpack the whole `PackedColumns` at once. + >>> pylibcudf.contiguous_split.unpack(packed) + >>> # Or unpack the two serialized buffers in `PackedColumns`. + >>> metadata, gpu_data = packed.release() + >>> pylibcudf.contiguous_split.unpack_from_memoryviews(metadata, gpu_data) + + For details, see :cpp:func:`cudf::pack`.
+ + Parameters + ---------- + input : Table + Table to pack. + + Returns + ------- + PackedColumns + The packed columns. + """ + return PackedColumns.from_libcudf( + make_unique[packed_columns](cpp_pack(input.view())) + ) + + +cpdef Table unpack(PackedColumns input): + """Deserialize the result of `pack`. + + Copies the result of a serialized table into a table. + Unlike the libcudf C++ function, the returned table is a copy + of the serialized data. + + For details, see :cpp:func:`cudf::unpack`. + + Parameters + ---------- + input : PackedColumns + The packed columns to unpack. + + Returns + ------- + Table + Copy of the packed columns. + """ + cdef table_view v = cpp_unpack(dereference(input.c_obj)) + # Since `Table.from_table_view` doesn't support an arbitrary owning object, + # we copy the table, see . + cdef unique_ptr[table] t = make_unique[table](v) + return Table.from_libcudf(move(t)) + + +cpdef Table unpack_from_memoryviews(memoryview metadata, gpumemoryview gpu_data): + """Deserialize the result of `pack`. + + Copies the result of a serialized table into a table. + Unlike the libcudf C++ function, the returned table is a copy + of the serialized data. + + For details, see :cpp:func:`cudf::unpack`. + + Parameters + ---------- + metadata : memoryview + The packed metadata to unpack. + gpu_data : gpumemoryview + The packed gpu_data to unpack. + + Returns + ------- + Table + Copy of the packed columns. + """ + if metadata.nbytes == 0: + if gpu_data.__cuda_array_interface__["data"][0] != 0: + raise ValueError("Expected an empty gpu_data from unpacking an empty table") + return Table.from_libcudf(make_unique[table](table_view())) + + # Extract the raw data pointers + cdef const uint8_t[::1] _metadata = metadata + cdef const uint8_t* metadata_ptr = &_metadata[0] + cdef const uint8_t* gpu_data_ptr = int_to_void_ptr(gpu_data.ptr) + + cdef table_view v = cpp_unpack(metadata_ptr, gpu_data_ptr) + # Since `Table.from_table_view` doesn't support an arbitrary owning object, + # we copy the table, see .
+ cdef unique_ptr[table] t = make_unique[table](v) + return Table.from_libcudf(move(t)) diff --git a/python/pylibcudf/pylibcudf/copying.pyx b/python/pylibcudf/pylibcudf/copying.pyx index 9743119d92a..4938f1a3dda 100644 --- a/python/pylibcudf/pylibcudf/copying.pyx +++ b/python/pylibcudf/pylibcudf/copying.pyx @@ -67,13 +67,12 @@ cpdef Table gather( """ cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_copying.gather( - source_table.view(), - gather_map.view(), - bounds_policy - ) + c_result = cpp_copying.gather( + source_table.view(), + gather_map.view(), + bounds_policy ) + return Table.from_libcudf(move(c_result)) @@ -121,22 +120,18 @@ cpdef Table scatter( cdef vector[reference_wrapper[const scalar]] source_scalars if TableOrListOfScalars is Table: with nogil: - c_result = move( - cpp_copying.scatter( - source.view(), - scatter_map.view(), - target_table.view(), - ) + c_result = cpp_copying.scatter( + source.view(), + scatter_map.view(), + target_table.view(), ) else: source_scalars = _as_vector(source) with nogil: - c_result = move( - cpp_copying.scatter( - source_scalars, - scatter_map.view(), - target_table.view(), - ) + c_result = cpp_copying.scatter( + source_scalars, + scatter_map.view(), + target_table.view(), ) return Table.from_libcudf(move(c_result)) @@ -160,11 +155,11 @@ cpdef ColumnOrTable empty_like(ColumnOrTable input): cdef unique_ptr[column] c_col_result if ColumnOrTable is Column: with nogil: - c_col_result = move(cpp_copying.empty_like(input.view())) + c_col_result = cpp_copying.empty_like(input.view()) return Column.from_libcudf(move(c_col_result)) else: with nogil: - c_tbl_result = move(cpp_copying.empty_like(input.view())) + c_tbl_result = cpp_copying.empty_like(input.view()) return Table.from_libcudf(move(c_tbl_result)) @@ -195,13 +190,11 @@ cpdef Column allocate_like( cdef size_type c_size = size if size is not None else input_column.size() with nogil: - c_result = move( - cpp_copying.allocate_like( + c_result = cpp_copying.allocate_like( input_column.view(), c_size, policy, ) - ) return Column.from_libcudf(move(c_result)) @@ -298,12 +291,12 @@ cpdef Column copy_range( cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_copying.copy_range( + c_result = cpp_copying.copy_range( input_column.view(), target_column.view(), input_begin, input_end, - target_begin) + target_begin ) return Column.from_libcudf(move(c_result)) @@ -337,13 +330,11 @@ cpdef Column shift(Column input, size_type offset, Scalar fill_value): """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_copying.shift( + c_result = cpp_copying.shift( input.view(), offset, dereference(fill_value.c_obj) ) - ) return Column.from_libcudf(move(c_result)) @@ -378,7 +369,7 @@ cpdef list slice(ColumnOrTable input, list indices): cdef int i if ColumnOrTable is Column: with nogil: - c_col_result = move(cpp_copying.slice(input.view(), c_indices)) + c_col_result = cpp_copying.slice(input.view(), c_indices) return [ Column.from_column_view(c_col_result[i], input) @@ -386,7 +377,7 @@ cpdef list slice(ColumnOrTable input, list indices): ] else: with nogil: - c_tbl_result = move(cpp_copying.slice(input.view(), c_indices)) + c_tbl_result = cpp_copying.slice(input.view(), c_indices) return [ Table.from_table_view(c_tbl_result[i], input) @@ -418,7 +409,7 @@ cpdef list split(ColumnOrTable input, list splits): if ColumnOrTable is Column: with nogil: - c_col_result = move(cpp_copying.split(input.view(), c_splits)) + c_col_result = cpp_copying.split(input.view(), c_splits) return 
[ Column.from_column_view(c_col_result[i], input) @@ -426,7 +417,7 @@ cpdef list split(ColumnOrTable input, list splits): ] else: with nogil: - c_tbl_result = move(cpp_copying.split(input.view(), c_splits)) + c_tbl_result = cpp_copying.split(input.view(), c_splits) return [ Table.from_table_view(c_tbl_result[i], input) @@ -472,29 +463,25 @@ cpdef Column copy_if_else( if LeftCopyIfElseOperand is Column and RightCopyIfElseOperand is Column: with nogil: - result = move( - cpp_copying.copy_if_else(lhs.view(), rhs.view(), boolean_mask.view()) + result = cpp_copying.copy_if_else( + lhs.view(), + rhs.view(), + boolean_mask.view() ) elif LeftCopyIfElseOperand is Column and RightCopyIfElseOperand is Scalar: with nogil: - result = move( - cpp_copying.copy_if_else( - lhs.view(), dereference(rhs.c_obj), boolean_mask.view() - ) + result = cpp_copying.copy_if_else( + lhs.view(), dereference(rhs.c_obj), boolean_mask.view() ) elif LeftCopyIfElseOperand is Scalar and RightCopyIfElseOperand is Column: with nogil: - result = move( - cpp_copying.copy_if_else( - dereference(lhs.c_obj), rhs.view(), boolean_mask.view() - ) + result = cpp_copying.copy_if_else( + dereference(lhs.c_obj), rhs.view(), boolean_mask.view() ) else: with nogil: - result = move( - cpp_copying.copy_if_else( - dereference(lhs.c_obj), dereference(rhs.c_obj), boolean_mask.view() - ) + result = cpp_copying.copy_if_else( + dereference(lhs.c_obj), dereference(rhs.c_obj), boolean_mask.view() ) return Column.from_libcudf(move(result)) @@ -541,22 +528,18 @@ cpdef Table boolean_mask_scatter( if TableOrListOfScalars is Table: with nogil: - result = move( - cpp_copying.boolean_mask_scatter( - input.view(), - target.view(), - boolean_mask.view() - ) + result = cpp_copying.boolean_mask_scatter( + input.view(), + target.view(), + boolean_mask.view() ) else: source_scalars = _as_vector(input) with nogil: - result = move( - cpp_copying.boolean_mask_scatter( - source_scalars, - target.view(), - boolean_mask.view(), - ) + result = cpp_copying.boolean_mask_scatter( + source_scalars, + target.view(), + boolean_mask.view(), ) return Table.from_libcudf(move(result)) @@ -586,8 +569,6 @@ cpdef Scalar get_element(Column input_column, size_type index): """ cdef unique_ptr[scalar] c_output with nogil: - c_output = move( - cpp_copying.get_element(input_column.view(), index) - ) + c_output = cpp_copying.get_element(input_column.view(), index) return Scalar.from_libcudf(move(c_output)) diff --git a/python/pylibcudf/pylibcudf/datetime.pxd b/python/pylibcudf/pylibcudf/datetime.pxd index 2fce48cf1b4..72ce680ba7a 100644 --- a/python/pylibcudf/pylibcudf/datetime.pxd +++ b/python/pylibcudf/pylibcudf/datetime.pxd @@ -1,8 +1,15 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
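Taken together, the new `contiguous_split.pyx` bindings above give Python callers a pack/release/unpack round trip. A minimal sketch of that flow, assuming pyarrow is available and using `pylibcudf.interop.from_arrow` for input conversion (the column values are hypothetical):

    import pyarrow as pa
    import pylibcudf as plc

    # pack() deep-copies the table into one host (metadata) buffer and
    # one contiguous device (data) buffer.
    tbl = plc.interop.from_arrow(pa.table({"a": [1, 2, 3]}))
    packed = plc.contiguous_split.pack(tbl)

    # release() transfers ownership of both buffers to the caller and
    # leaves `packed` empty; HostBuffer's buffer protocol is what lets
    # the metadata travel as a plain memoryview.
    metadata, gpu_data = packed.release()

    # unpack_from_memoryviews() rebuilds a table (as a copy) from the
    # two buffers.
    unpacked = plc.contiguous_split.unpack_from_memoryviews(metadata, gpu_data)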
+from pylibcudf.libcudf.datetime cimport datetime_component + from .column cimport Column cpdef Column extract_year( Column col ) + +cpdef Column extract_datetime_component( + Column col, + datetime_component component +) diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx index e8e0caaf42d..ac4335cca56 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyx +++ b/python/pylibcudf/pylibcudf/datetime.pyx @@ -3,19 +3,14 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.datetime cimport ( - day_of_year as cpp_day_of_year, - extract_day as cpp_extract_day, - extract_hour as cpp_extract_hour, - extract_microsecond_fraction as cpp_extract_microsecond_fraction, - extract_millisecond_fraction as cpp_extract_millisecond_fraction, - extract_minute as cpp_extract_minute, - extract_month as cpp_extract_month, - extract_nanosecond_fraction as cpp_extract_nanosecond_fraction, - extract_second as cpp_extract_second, - extract_weekday as cpp_extract_weekday, + datetime_component, + extract_datetime_component as cpp_extract_datetime_component, extract_year as cpp_extract_year, ) +from pylibcudf.libcudf.datetime import \ + datetime_component as DatetimeComponent # no-cython-lint + from .column cimport Column @@ -38,44 +33,32 @@ cpdef Column extract_year( cdef unique_ptr[column] result with nogil: - result = move(cpp_extract_year(values.view())) + result = cpp_extract_year(values.view()) return Column.from_libcudf(move(result)) +cpdef Column extract_datetime_component( + Column values, + datetime_component component +): + """ + Extract a datetime component from a datetime column. -def extract_datetime_component(Column col, str field): + For details, see :cpp:func:`cudf::extract_datetime_component`. - cdef unique_ptr[column] c_result + Parameters + ---------- + values : Column + The column to extract the component from. + component : DatetimeComponent + The datetime component to extract. - with nogil: - if field == "year": - c_result = move(cpp_extract_year(col.view())) - elif field == "month": - c_result = move(cpp_extract_month(col.view())) - elif field == "day": - c_result = move(cpp_extract_day(col.view())) - elif field == "weekday": - c_result = move(cpp_extract_weekday(col.view())) - elif field == "hour": - c_result = move(cpp_extract_hour(col.view())) - elif field == "minute": - c_result = move(cpp_extract_minute(col.view())) - elif field == "second": - c_result = move(cpp_extract_second(col.view())) - elif field == "millisecond": - c_result = move( - cpp_extract_millisecond_fraction(col.view()) - ) - elif field == "microsecond": - c_result = move( - cpp_extract_microsecond_fraction(col.view()) - ) - elif field == "nanosecond": - c_result = move( - cpp_extract_nanosecond_fraction(col.view()) - ) - elif field == "day_of_year": - c_result = move(cpp_day_of_year(col.view())) - else: - raise ValueError(f"Invalid datetime field: '{field}'") + Returns + ------- + Column + Column with the extracted component. 
+ """ + cdef unique_ptr[column] result - return Column.from_libcudf(move(c_result)) + with nogil: + result = cpp_extract_datetime_component(values.view(), component) + return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/filling.pyx b/python/pylibcudf/pylibcudf/filling.pyx index 61b430e64aa..0372e1132cc 100644 --- a/python/pylibcudf/pylibcudf/filling.pyx +++ b/python/pylibcudf/pylibcudf/filling.pyx @@ -48,13 +48,11 @@ cpdef Column fill( cdef unique_ptr[column] result with nogil: - result = move( - cpp_fill( - destination.view(), - begin, - end, - dereference(( value).c_obj) - ) + result = cpp_fill( + destination.view(), + begin, + end, + dereference(( value).c_obj) ) return Column.from_libcudf(move(result)) @@ -112,12 +110,10 @@ cpdef Column sequence(size_type size, Scalar init, Scalar step): cdef unique_ptr[column] result cdef size_type c_size = size with nogil: - result = move( - cpp_sequence( - c_size, - dereference(init.c_obj), - dereference(step.c_obj), - ) + result = cpp_sequence( + c_size, + dereference(init.c_obj), + dereference(step.c_obj), ) return Column.from_libcudf(move(result)) @@ -152,18 +148,14 @@ cpdef Table repeat( if ColumnOrSize is Column: with nogil: - result = move( - cpp_repeat( - input_table.view(), - count.view() - ) + result = cpp_repeat( + input_table.view(), + count.view() ) if ColumnOrSize is size_type: with nogil: - result = move( - cpp_repeat( - input_table.view(), - count - ) + result = cpp_repeat( + input_table.view(), + count ) return Table.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/groupby.pyx b/python/pylibcudf/pylibcudf/groupby.pyx index afb95dba5b3..71f9ecb0453 100644 --- a/python/pylibcudf/pylibcudf/groupby.pyx +++ b/python/pylibcudf/pylibcudf/groupby.pyx @@ -176,7 +176,7 @@ cdef class GroupBy: # We rely on libcudf to tell us this rather than checking the types beforehand # ourselves. 
with nogil: - c_res = move(dereference(self.c_obj).aggregate(c_requests)) + c_res = dereference(self.c_obj).aggregate(c_requests) return GroupBy._parse_outputs(move(c_res)) cpdef tuple scan(self, list requests): @@ -205,7 +205,7 @@ cdef class GroupBy: cdef pair[unique_ptr[table], vector[aggregation_result]] c_res with nogil: - c_res = move(dereference(self.c_obj).scan(c_requests)) + c_res = dereference(self.c_obj).scan(c_requests) return GroupBy._parse_outputs(move(c_res)) cpdef tuple shift(self, Table values, list offset, list fill_values): @@ -234,10 +234,11 @@ cdef class GroupBy: cdef vector[size_type] c_offset = offset cdef pair[unique_ptr[table], unique_ptr[table]] c_res with nogil: - c_res = move( - dereference(self.c_obj).shift(values.view(), c_offset, c_fill_values) + c_res = dereference(self.c_obj).shift( + values.view(), + c_offset, + c_fill_values ) - return ( Table.from_libcudf(move(c_res.first)), Table.from_libcudf(move(c_res.second)), @@ -264,10 +265,10 @@ cdef class GroupBy: cdef pair[unique_ptr[table], unique_ptr[table]] c_res cdef vector[replace_policy] c_replace_policies = replace_policies with nogil: - c_res = move( - dereference(self.c_obj).replace_nulls(value.view(), c_replace_policies) + c_res = dereference(self.c_obj).replace_nulls( + value.view(), + c_replace_policies ) - return ( Table.from_libcudf(move(c_res.first)), Table.from_libcudf(move(c_res.second)), diff --git a/python/pylibcudf/pylibcudf/interop.pxd b/python/pylibcudf/pylibcudf/interop.pxd new file mode 100644 index 00000000000..2a0a8c15fdd --- /dev/null +++ b/python/pylibcudf/pylibcudf/interop.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.table cimport Table + + +cpdef Table from_dlpack(object managed_tensor) + +cpdef object to_dlpack(Table input) diff --git a/python/pylibcudf/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx index 1a03fa5b45b..61e812353b7 100644 --- a/python/pylibcudf/pylibcudf/interop.pyx +++ b/python/pylibcudf/pylibcudf/interop.pyx @@ -1,6 +1,11 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. -from cpython.pycapsule cimport PyCapsule_GetPointer, PyCapsule_New +from cpython.pycapsule cimport ( + PyCapsule_GetPointer, + PyCapsule_IsValid, + PyCapsule_New, + PyCapsule_SetName, +) from libc.stdlib cimport free from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -16,11 +21,14 @@ from pylibcudf.libcudf.interop cimport ( ArrowArray, ArrowArrayStream, ArrowSchema, + DLManagedTensor, column_metadata, from_arrow_column as cpp_from_arrow_column, from_arrow_stream as cpp_from_arrow_stream, + from_dlpack as cpp_from_dlpack, to_arrow_host_raw, to_arrow_schema_raw, + to_dlpack as cpp_to_dlpack, ) from pylibcudf.libcudf.table.table cimport table @@ -131,7 +139,7 @@ def _from_arrow_table(pyarrow_object, *, DataType data_type=None): cdef unique_ptr[table] c_result with nogil: # The libcudf function here will release the stream. - c_result = move(cpp_from_arrow_stream(c_stream)) + c_result = cpp_from_arrow_stream(c_stream) return Table.from_libcudf(move(c_result)) @@ -166,7 +174,7 @@ def _from_arrow_column(pyarrow_object, *, DataType data_type=None): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_from_arrow_column(c_schema, c_array)) + c_result = cpp_from_arrow_column(c_schema, c_array) # The capsule destructors should release automatically for us, but we # choose to do it explicitly here for clarity. 
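The `datetime.pyx` change above replaces the string-dispatched `extract_datetime_component` with a binding driven by the new `datetime_component` enum. A small sketch of the new calling convention, assuming pyarrow-based conversion helpers (input values hypothetical):

    import datetime
    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(
        pa.array([datetime.datetime(2024, 10, 9, 12, 34, 56)])
    )

    # Components are now selected with DatetimeComponent members rather
    # than free-form strings such as "year".
    years = plc.datetime.extract_datetime_component(
        col, plc.datetime.DatetimeComponent.YEAR
    )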
@@ -315,3 +323,87 @@ def _to_arrow_scalar(cudf_object, metadata=None):
     # Note that metadata for scalars is primarily important for preserving
     # information on nested types since names are otherwise irrelevant.
     return to_arrow(Column.from_scalar(cudf_object, 1), metadata=metadata)[0]
+
+
+cpdef Table from_dlpack(object managed_tensor):
+    """
+    Convert a DLPack DLTensor into a cudf table.
+
+    For details, see :cpp:func:`cudf::from_dlpack`
+
+    Parameters
+    ----------
+    managed_tensor : PyCapsule
+        A 1D or 2D column-major (Fortran order) tensor.
+
+    Returns
+    -------
+    Table
+        Table with a copy of the tensor data.
+    """
+    if not PyCapsule_IsValid(managed_tensor, "dltensor"):
+        raise ValueError("Invalid PyCapsule object")
+    cdef unique_ptr[table] c_result
+    cdef DLManagedTensor* dlpack_tensor = <DLManagedTensor*>PyCapsule_GetPointer(
+        managed_tensor, "dltensor"
+    )
+    if dlpack_tensor is NULL:
+        raise ValueError("PyCapsule object contained a NULL pointer")
+    PyCapsule_SetName(managed_tensor, "used_dltensor")
+
+    # Note: A copy is always performed when converting the dlpack
+    # data to a libcudf table. We also delete the dlpack_tensor pointer
+    # as the pointer is not deleted by libcudf's from_dlpack function.
+    # TODO: https://github.com/rapidsai/cudf/issues/10874
+    # TODO: https://github.com/rapidsai/cudf/issues/10849
+    with nogil:
+        c_result = cpp_from_dlpack(dlpack_tensor)
+
+    cdef Table result = Table.from_libcudf(move(c_result))
+    dlpack_tensor.deleter(dlpack_tensor)
+    return result
+
+
+cpdef object to_dlpack(Table input):
+    """
+    Convert a cudf table into a DLPack DLTensor.
+
+    For details, see :cpp:func:`cudf::to_dlpack`
+
+    Parameters
+    ----------
+    input : Table
+        Table to convert to a 1D or 2D column-major (Fortran order) tensor.
+
+    Returns
+    -------
+    PyCapsule
+        1D or 2D DLPack tensor with a copy of the table data, or nullptr.
+    """
+    for col in input._columns:
+        if col.null_count():
+            raise ValueError(
+                "Cannot create a DLPack tensor with null values. "
+                "Input is required to have null count as zero."
+ ) + cdef DLManagedTensor *dlpack_tensor + + with nogil: + dlpack_tensor = cpp_to_dlpack(input.view()) + + return PyCapsule_New( + dlpack_tensor, + "dltensor", + dlmanaged_tensor_pycapsule_deleter + ) + + +cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept: + if PyCapsule_IsValid(pycap_obj, "used_dltensor"): + # we do not call a used capsule's deleter + return + cdef DLManagedTensor* dlpack_tensor = PyCapsule_GetPointer( + pycap_obj, "dltensor" + ) + if dlpack_tensor is not NULL: + dlpack_tensor.deleter(dlpack_tensor) diff --git a/python/pylibcudf/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx index 438b0ff1634..fe765b34f82 100644 --- a/python/pylibcudf/pylibcudf/io/avro.pyx +++ b/python/pylibcudf/pylibcudf/io/avro.pyx @@ -45,7 +45,7 @@ cpdef TableWithMetadata read_avro( for col in columns: c_columns.push_back(str(col).encode()) - cdef avro_reader_options avro_opts = move( + cdef avro_reader_options avro_opts = ( avro_reader_options.builder(source_info.c_obj) .columns(c_columns) .skip_rows(skip_rows) diff --git a/python/pylibcudf/pylibcudf/io/csv.pyx b/python/pylibcudf/pylibcudf/io/csv.pyx index b53d6771cd6..2c61cc42d82 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyx +++ b/python/pylibcudf/pylibcudf/io/csv.pyx @@ -168,7 +168,7 @@ def read_csv( cdef vector[data_type] c_dtypes_list cdef map[string, data_type] c_dtypes_map - cdef csv_reader_options options = move( + cdef csv_reader_options options = ( csv_reader_options.builder(source_info.c_obj) .compression(compression) .mangle_dupe_cols(mangle_dupe_cols) diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx index 29e49083bc6..65f78f830f1 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyx +++ b/python/pylibcudf/pylibcudf/io/json.pyx @@ -59,7 +59,7 @@ cdef json_reader_options _setup_json_reader_options( json_recovery_mode_t recovery_mode): cdef vector[data_type] types_vec - cdef json_reader_options opts = move( + cdef json_reader_options opts = ( json_reader_options.builder(source_info.c_obj) .compression(compression) .lines(lines) diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx index 01a5e4b04a1..70e0a7995a2 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyx +++ b/python/pylibcudf/pylibcudf/io/orc.pyx @@ -252,7 +252,7 @@ cpdef TableWithMetadata read_orc( """ cdef orc_reader_options opts cdef vector[vector[size_type]] c_stripes - opts = move( + opts = ( orc_reader_options.builder(source_info.c_obj) .use_index(use_index) .build() diff --git a/python/pylibcudf/pylibcudf/io/timezone.pyx b/python/pylibcudf/pylibcudf/io/timezone.pyx index e02239d7252..f120b65fb2c 100644 --- a/python/pylibcudf/pylibcudf/io/timezone.pyx +++ b/python/pylibcudf/pylibcudf/io/timezone.pyx @@ -33,11 +33,9 @@ cpdef Table make_timezone_transition_table(str tzif_dir, str timezone_name): cdef string c_tzname = timezone_name.encode() with nogil: - c_result = move( - cpp_make_timezone_transition_table( - make_optional[string](c_tzdir), - c_tzname - ) + c_result = cpp_make_timezone_transition_table( + make_optional[string](c_tzdir), + c_tzname ) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/join.pyx b/python/pylibcudf/pylibcudf/join.pyx index 25664286f19..bc72647ea8e 100644 --- a/python/pylibcudf/pylibcudf/join.pyx +++ b/python/pylibcudf/pylibcudf/join.pyx @@ -9,7 +9,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport 
null_equality -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer from .column cimport Column from .table cimport Table @@ -212,5 +212,5 @@ cpdef Table cross_join(Table left, Table right): """ cdef unique_ptr[table] result with nogil: - result = move(cpp_join.cross_join(left.view(), right.view())) + result = cpp_join.cross_join(left.view(), right.view()) return Table.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/json.pxd b/python/pylibcudf/pylibcudf/json.pxd new file mode 100644 index 00000000000..87a87349b8a --- /dev/null +++ b/python/pylibcudf/pylibcudf/json.pxd @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.json cimport get_json_object_options +from pylibcudf.scalar cimport Scalar + + +cdef class GetJsonObjectOptions: + cdef get_json_object_options options + + +cpdef Column get_json_object( + Column col, + Scalar json_path, + GetJsonObjectOptions options=* +) diff --git a/python/pylibcudf/pylibcudf/json.pyx b/python/pylibcudf/pylibcudf/json.pyx new file mode 100644 index 00000000000..ebb82f80408 --- /dev/null +++ b/python/pylibcudf/pylibcudf/json.pyx @@ -0,0 +1,152 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cython.operator cimport dereference +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf cimport json as cpp_json +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.scalar cimport Scalar + + +cdef class GetJsonObjectOptions: + """Settings for ``get_json_object()``""" + def __init__( + self, + *, + allow_single_quotes=False, + strip_quotes_from_single_strings=True, + missing_fields_as_nulls=False + ): + self.set_allow_single_quotes(allow_single_quotes) + self.set_strip_quotes_from_single_strings( + strip_quotes_from_single_strings + ) + self.set_missing_fields_as_nulls(missing_fields_as_nulls) + + def get_allow_single_quotes(self): + """ + Returns true/false depending on whether single-quotes for representing strings + are allowed. + + Returns + ------- + bool + true if single-quotes are allowed, false otherwise. + """ + return self.options.get_allow_single_quotes() + + def get_strip_quotes_from_single_strings(self): + """ + Returns true/false depending on whether individually returned string values have + their quotes stripped. + + Returns + ------- + bool + true if individually returned string values have their quotes stripped. + """ + return self.options.get_strip_quotes_from_single_strings() + + def get_missing_fields_as_nulls(self): + """ + Whether a field not contained by an object is to be interpreted as null. + + Returns + ------- + bool + true if missing fields are interpreted as null. + """ + return self.options.get_missing_fields_as_nulls() + + def set_allow_single_quotes(self, bool val): + """ + Set whether single-quotes for strings are allowed. + + Parameters + ---------- + val : bool + Whether to allow single quotes + + Returns + ------- + None + """ + self.options.set_allow_single_quotes(val) + + def set_strip_quotes_from_single_strings(self, bool val): + """ + Set whether individually returned string values have their quotes stripped. + + Parameters + ---------- + val : bool + Whether to strip quotes from single strings. 
+ + Returns + ------- + None + """ + self.options.set_strip_quotes_from_single_strings(val) + + def set_missing_fields_as_nulls(self, bool val): + """ + Set whether missing fields are interpreted as null. + + Parameters + ---------- + val : bool + Whether to treat missing fields as nulls. + + Returns + ------- + None + """ + self.options.set_missing_fields_as_nulls(val) + + +cpdef Column get_json_object( + Column col, + Scalar json_path, + GetJsonObjectOptions options=None +): + """ + Apply a JSONPath string to all rows in an input strings column. + + For details, see :cpp:func:`cudf::get_json_object` + + Parameters + ---------- + col : Column + The input strings column. Each row must contain a valid json string. + + json_path : Scalar + The JSONPath string to be applied to each row. + + options : GetJsonObjectOptions + Options for controlling the behavior of the function. + + Returns + ------- + Column + New strings column containing the retrieved json object strings. + """ + cdef unique_ptr[column] c_result + cdef string_scalar* c_json_path = ( + json_path.c_obj.get() + ) + if options is None: + options = GetJsonObjectOptions() + + cdef cpp_json.get_json_object_options c_options = options.options + + with nogil: + c_result = cpp_json.get_json_object( + col.view(), + dereference(c_json_path), + c_options + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/labeling.pyx b/python/pylibcudf/pylibcudf/labeling.pyx index b3f6a92d85c..226a9e14172 100644 --- a/python/pylibcudf/pylibcudf/labeling.pyx +++ b/python/pylibcudf/pylibcudf/labeling.pyx @@ -54,14 +54,12 @@ cpdef Column label_bins( ) with nogil: - c_result = move( - cpp_labeling.label_bins( - input.view(), - left_edges.view(), - c_left_inclusive, - right_edges.view(), - c_right_inclusive, - ) + c_result = cpp_labeling.label_bins( + input.view(), + left_edges.view(), + c_left_inclusive, + right_edges.view(), + c_right_inclusive, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt index 2167616690f..15beaee47d4 100644 --- a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt @@ -12,8 +12,9 @@ # the License. 
# ============================================================================= -set(cython_sources aggregation.pyx binaryop.pyx copying.pyx expressions.pyx labeling.pyx reduce.pyx - replace.pyx round.pyx stream_compaction.pyx types.pyx unary.pyx +set(cython_sources + aggregation.pyx binaryop.pyx copying.pyx datetime.pyx expressions.pyx labeling.pyx reduce.pyx + replace.pyx round.pyx stream_compaction.pyx types.pyx unary.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/libcudf/column/column.pxd b/python/pylibcudf/pylibcudf/libcudf/column/column.pxd index 7a369701bbd..76f35cbba71 100644 --- a/python/pylibcudf/pylibcudf/libcudf/column/column.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/column/column.pxd @@ -9,7 +9,7 @@ from pylibcudf.libcudf.column.column_view cimport ( ) from pylibcudf.libcudf.types cimport data_type, size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/column/column.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd b/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd index f1a326bcd40..b2388858127 100644 --- a/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd @@ -11,7 +11,7 @@ from pylibcudf.libcudf.types cimport ( type_id, ) -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd index 92f5a185a54..def292148c5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd @@ -4,9 +4,9 @@ from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector from pylibcudf.libcudf.column.column cimport column, column_view from pylibcudf.libcudf.table.table cimport table, table_view -from pylibcudf.libcudf.utilities.host_span cimport host_span +from pylibcudf.libcudf.utilities.span cimport host_span -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/concatenate.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd index cadac6a0022..12090af16cc 100644 --- a/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd @@ -6,7 +6,7 @@ from libcpp.vector cimport vector from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/contiguous_split.hpp" namespace "cudf" nogil: @@ -26,3 +26,8 @@ cdef extern from "cudf/contiguous_split.hpp" namespace "cudf" nogil: cdef packed_columns pack (const table_view& input) except + cdef table_view unpack (const packed_columns& input) except + + + cdef table_view unpack ( + const uint8_t* metadata, + const uint8_t* gpu_data + ) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/copying.pxd b/python/pylibcudf/pylibcudf/libcudf/copying.pxd index 4d4a4ba9b89..e6e719d6436 100644 --- a/python/pylibcudf/pylibcudf/libcudf/copying.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/copying.pxd @@ 
-16,7 +16,7 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer ctypedef const scalar constscalar diff --git a/python/pylibcudf/pylibcudf/libcudf/datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd index a4465343197..73cdfb96af5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/datetime.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from libc.stdint cimport uint8_t from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -7,6 +8,18 @@ from pylibcudf.libcudf.scalar.scalar cimport scalar cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: + cpdef enum class datetime_component(uint8_t): + YEAR + MONTH + DAY + WEEKDAY + HOUR + MINUTE + SECOND + MILLISECOND + MICROSECOND + NANOSECOND + cdef unique_ptr[column] extract_year(const column_view& column) except + cdef unique_ptr[column] extract_month(const column_view& column) except + cdef unique_ptr[column] extract_day(const column_view& column) except + @@ -23,6 +36,10 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: cdef unique_ptr[column] extract_nanosecond_fraction( const column_view& column ) except + + cdef unique_ptr[column] extract_datetime_component( + const column_view& column, + datetime_component component + ) except + ctypedef enum rounding_frequency "cudf::datetime::rounding_frequency": DAY "cudf::datetime::rounding_frequency::DAY" diff --git a/python/pylibcudf/pylibcudf/libcudf/datetime.pyx b/python/pylibcudf/pylibcudf/libcudf/datetime.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/fixed_point/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/fixed_point/__init__.pxd new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/fixed_point/fixed_point.pxd b/python/pylibcudf/pylibcudf/libcudf/fixed_point/fixed_point.pxd new file mode 100644 index 00000000000..e55574020f4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/libcudf/fixed_point/fixed_point.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
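For the DLPack interop added in `interop.pyx` above, a round trip looks like the following sketch; it assumes a table whose columns share one numeric type and contain no nulls, which is what `to_dlpack` requires (values hypothetical):

    import pyarrow as pa
    import pylibcudf as plc

    tbl = plc.interop.from_arrow(pa.table({
        "x": pa.array([1.0, 2.0], type=pa.float64()),
        "y": pa.array([3.0, 4.0], type=pa.float64()),
    }))

    # to_dlpack() wraps the copied data in a capsule named "dltensor";
    # from_dlpack() consumes it and renames it "used_dltensor" so the
    # deleter is not run twice.
    capsule = plc.interop.to_dlpack(tbl)
    roundtrip = plc.interop.from_dlpack(capsule)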
+ +from libc.stdint cimport int32_t + + +cdef extern from "cudf/fixed_point/fixed_point.hpp" namespace "numeric" nogil: + cdef cppclass scale_type: + scale_type(int32_t) diff --git a/python/pylibcudf/pylibcudf/libcudf/groupby.pxd b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd index 848462131fe..17ea33a2066 100644 --- a/python/pylibcudf/pylibcudf/libcudf/groupby.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd @@ -22,7 +22,6 @@ from pylibcudf.libcudf.types cimport ( size_type, sorted, ) -from pylibcudf.libcudf.utilities.host_span cimport host_span # workaround for https://github.com/cython/cython/issues/3885 ctypedef const scalar constscalar diff --git a/python/pylibcudf/pylibcudf/libcudf/interop.pxd b/python/pylibcudf/pylibcudf/libcudf/interop.pxd index 30b97fdec34..b75e9ca7001 100644 --- a/python/pylibcudf/pylibcudf/libcudf/interop.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/interop.pxd @@ -32,11 +32,13 @@ cdef extern from "cudf/interop.hpp" nogil: cdef extern from "cudf/interop.hpp" namespace "cudf" \ nogil: - cdef unique_ptr[table] from_dlpack(const DLManagedTensor* tensor - ) except + + cdef unique_ptr[table] from_dlpack( + const DLManagedTensor* managed_tensor + ) except + - DLManagedTensor* to_dlpack(table_view input_table - ) except + + DLManagedTensor* to_dlpack( + const table_view& input + ) except + cdef cppclass column_metadata: column_metadata() except + diff --git a/python/pylibcudf/pylibcudf/libcudf/join.pxd b/python/pylibcudf/pylibcudf/libcudf/join.pxd index 6f6c145b23c..21033a0284e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/join.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/join.pxd @@ -9,7 +9,7 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport null_equality, size_type -from rmm._lib.device_uvector cimport device_uvector +from rmm.librmm.device_uvector cimport device_uvector ctypedef unique_ptr[device_uvector[size_type]] gather_map_type ctypedef pair[gather_map_type, gather_map_type] gather_map_pair_type diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/json.pxd b/python/pylibcudf/pylibcudf/libcudf/json.pxd similarity index 100% rename from python/pylibcudf/pylibcudf/libcudf/strings/json.pxd rename to python/pylibcudf/pylibcudf/libcudf/json.pxd diff --git a/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd index 5f582091b06..27af4a3bdb1 100644 --- a/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd @@ -6,7 +6,7 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport bitmask_type, mask_state, size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/null_mask.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index f2dd22f43aa..41250037dcf 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -1,13 +1,21 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. 
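The new `json.pyx` above exposes JSONPath queries with configurable parsing. A short sketch of `get_json_object` with `GetJsonObjectOptions`, again assuming pyarrow conversion helpers (input rows hypothetical):

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array(["{'a': 1}", "{'a': 2}"]))
    path = plc.interop.from_arrow(pa.scalar("$.a"))

    # Single-quoted JSON is rejected by default; opt in via the options.
    options = plc.json.GetJsonObjectOptions(allow_single_quotes=True)
    result = plc.json.get_json_object(col, path, options)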
+from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar from pylibcudf.libcudf.types cimport size_type cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: + cdef unique_ptr[column] minhash( + const column_view &strings, + const numeric_scalar[uint32_t] seed, + const size_type width, + ) except + + cdef unique_ptr[column] minhash( const column_view &strings, const column_view &seeds, @@ -20,6 +28,12 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const size_type width, ) except + + cdef unique_ptr[column] minhash64( + const column_view &strings, + const numeric_scalar[uint64_t] seed, + const size_type width, + ) except + + cdef unique_ptr[column] word_minhash( const column_view &input, const column_view &seeds diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd index 673bffa28ae..be3a2d75718 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. from libc.stdint cimport int32_t +from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -8,9 +9,9 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "nvtext/stemmer.hpp" namespace "nvtext" nogil: - ctypedef enum letter_type: - CONSONANT 'nvtext::letter_type::CONSONANT' - VOWEL 'nvtext::letter_type::VOWEL' + cpdef enum class letter_type: + CONSONANT + VOWEL cdef unique_ptr[column] porter_stemmer_measure( const column_view & strings diff --git a/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd index 4b40a8a26f6..a51413669c5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd @@ -4,9 +4,9 @@ from libc.stdint cimport int32_t, int64_t from libcpp cimport bool from libcpp.string cimport string from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.fixed_point.fixed_point cimport scale_type from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport data_type -from pylibcudf.libcudf.wrappers.decimals cimport scale_type cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt index b8b4343173e..f5f2113332a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= -set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx translate.pyx) +set(cython_sources char_types.pyx combine.pyx regex_flags.pyx side_type.pyx translate.pyx) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd index e4c9fa5817a..e659993b834 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from libcpp cimport int from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -9,21 +10,29 @@ from pylibcudf.libcudf.table.table_view cimport table_view cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil: - ctypedef enum separator_on_nulls: - YES 'cudf::strings::separator_on_nulls::YES' - NO 'cudf::strings::separator_on_nulls::NO' + cpdef enum class separator_on_nulls(int): + YES + NO - ctypedef enum output_if_empty_list: - EMPTY_STRING 'cudf::strings::output_if_empty_list::EMPTY_STRING' - NULL_ELEMENT 'cudf::strings::output_if_empty_list::NULL_ELEMENT' + cpdef enum class output_if_empty_list(int): + EMPTY_STRING + NULL_ELEMENT cdef unique_ptr[column] concatenate( - table_view source_strings, + table_view strings_columns, string_scalar separator, - string_scalar narep) except + + string_scalar narep, + separator_on_nulls separate_nulls) except + + + cdef unique_ptr[column] concatenate( + table_view strings_columns, + column_view separators, + string_scalar separator_narep, + string_scalar col_narep, + separator_on_nulls separate_nulls) except + cdef unique_ptr[column] join_strings( - column_view source_strings, + column_view input, string_scalar separator, string_scalar narep) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd index 83a9573baad..e6688cfff81 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd @@ -8,10 +8,10 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar cdef extern from "cudf/strings/convert/convert_booleans.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_booleans( - column_view input_col, + column_view input, string_scalar true_string) except + cdef unique_ptr[column] from_booleans( - column_view input_col, + column_view booleans, string_scalar true_string, string_scalar false_string) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd index fa8975c4df9..fceddd58df0 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd @@ -10,14 +10,14 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_datetime.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_timestamps( - column_view input_col, + column_view input, data_type 
timestamp_type, string format) except + cdef unique_ptr[column] from_timestamps( - column_view input_col, + column_view timestamps, string format, - column_view input_strings_names) except + + column_view names) except + cdef unique_ptr[column] is_timestamp( column_view input_col, diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd index ebe10574353..43ffad1d89f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd @@ -10,7 +10,7 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_durations.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_durations( - const column_view & strings_col, + const column_view & input, data_type duration_type, const string & format) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd index 6f820f3c9a4..72ab329f2dd 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd @@ -9,13 +9,13 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_fixed_point.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_fixed_point( - column_view input_col, + column_view input, data_type output_type) except + cdef unique_ptr[column] from_fixed_point( - column_view input_col) except + + column_view input) except + cdef unique_ptr[column] is_fixed_point( - column_view source_strings, - data_type output_type + column_view input, + data_type decimal_type ) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd index f4fc4674506..a45c7f9979e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd @@ -9,12 +9,12 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_floats.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_floats( - column_view input_col, + column_view strings, data_type output_type) except + cdef unique_ptr[column] from_floats( - column_view input_col) except + + column_view floats) except + cdef unique_ptr[column] is_float( - column_view source_strings + column_view input ) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd index f12aab0a2e4..69d566b8c49 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport data_type @@ -9,23 +10,28 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_integers( - column_view input_col, - data_type output_type) except + + column_view input, + data_type output_type) except +libcudf_exception_handler cdef unique_ptr[column] from_integers( - column_view input_col) except + + column_view integers) except +libcudf_exception_handler + + cdef unique_ptr[column] is_integer( + column_view input + ) except +libcudf_exception_handler cdef unique_ptr[column] is_integer( - column_view source_strings - ) except + + column_view input, + data_type int_type + ) except +libcudf_exception_handler cdef unique_ptr[column] hex_to_integers( - column_view input_col, + column_view input, data_type output_type) except + cdef unique_ptr[column] is_hex( - column_view source_strings - ) except + + column_view input + ) except +libcudf_exception_handler cdef unique_ptr[column] integers_to_hex( - column_view input_col) except + + column_view input) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd index fe571cfced6..801db438e92 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd @@ -8,11 +8,11 @@ from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "cudf/strings/convert/convert_ipv4.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] ipv4_to_integers( - column_view input_col) except + + column_view input) except + cdef unique_ptr[column] integers_to_ipv4( - column_view input_col) except + + column_view integers) except + cdef unique_ptr[column] is_ipv4( - column_view source_strings + column_view input ) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd index 109111568d8..6e1ecd30539 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd @@ -9,6 +9,6 @@ cdef extern from "cudf/strings/convert/convert_lists.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] format_list_column( - column_view input_col, + column_view input, string_scalar na_rep, column_view separators) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd index 5c07b698454..cb319ad143b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd @@ -8,7 +8,7 @@ from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "cudf/strings/convert/convert_urls.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] url_encode( - column_view input_col) except + + column_view input) except + cdef unique_ptr[column] url_decode( - column_view input_col) except + + column_view input) except + diff --git 
a/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd index 0491644a10a..3d048c1f50b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd @@ -9,5 +9,5 @@ cdef extern from "cudf/strings/find_multiple.hpp" namespace "cudf::strings" \ nogil: cdef unique_ptr[column] find_multiple( - column_view source_strings, + column_view input, column_view targets) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd index e0a8b776465..0d286c36446 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd @@ -11,3 +11,7 @@ cdef extern from "cudf/strings/findall.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] findall( column_view input, regex_program prog) except + + + cdef unique_ptr[column] find_re( + column_view input, + regex_program prog) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd index 657fe61eb14..875f8cafd14 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd @@ -12,11 +12,11 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/strings/padding.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] pad( - column_view source_strings, + column_view input, size_type width, side_type side, string fill_char) except + cdef unique_ptr[column] zfill( - column_view source_strings, + column_view input, size_type width) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd index 40f0e2fa50c..6b0c90d0acc 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd @@ -6,6 +6,7 @@ from libcpp.vector cimport vector from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.regex_flags cimport regex_flags from pylibcudf.libcudf.strings.regex_program cimport regex_program from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport size_type @@ -14,17 +15,18 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/strings/replace_re.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] replace_re( - column_view source_strings, - regex_program, - string_scalar repl, - size_type maxrepl) except + - - cdef unique_ptr[column] replace_with_backrefs( - column_view source_strings, - regex_program, - string repl) except + + column_view input, + regex_program prog, + string_scalar replacement, + size_type max_replace_count) except + cdef unique_ptr[column] replace_re( - column_view source_strings, + column_view input, vector[string] patterns, - column_view repls) except + + column_view replacements, + regex_flags flags) except + + + cdef unique_ptr[column] replace_with_backrefs( + column_view input, + regex_program prog, + string replacement) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd index 019ff3f17ba..e92c5dc1d66 100644 --- 
a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd @@ -1,12 +1,10 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. -from libc.stdint cimport int32_t +from libcpp cimport int cdef extern from "cudf/strings/side_type.hpp" namespace "cudf::strings" nogil: - cpdef enum class side_type(int32_t): - LEFT 'cudf::strings::side_type::LEFT' - RIGHT 'cudf::strings::side_type::RIGHT' - BOTH 'cudf::strings::side_type::BOTH' - -ctypedef int32_t underlying_type_t_side_type + cpdef enum class side_type(int): + LEFT + RIGHT + BOTH diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd index 4162e886a7d..4299cf62e99 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd @@ -12,9 +12,9 @@ cdef extern from "cudf/strings/split/partition.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] partition( - column_view source_strings, + column_view input, string_scalar delimiter) except + cdef unique_ptr[table] rpartition( - column_view source_strings, + column_view input, string_scalar delimiter) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd index 3046149aebb..a22a79fc7d7 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd @@ -14,22 +14,22 @@ cdef extern from "cudf/strings/split/split.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] split( - column_view source_strings, + column_view strings_column, string_scalar delimiter, size_type maxsplit) except + cdef unique_ptr[table] rsplit( - column_view source_strings, + column_view strings_column, string_scalar delimiter, size_type maxsplit) except + cdef unique_ptr[column] split_record( - column_view source_strings, + column_view strings, string_scalar delimiter, size_type maxsplit) except + cdef unique_ptr[column] rsplit_record( - column_view source_strings, + column_view strings, string_scalar delimiter, size_type maxsplit) except + @@ -38,21 +38,21 @@ cdef extern from "cudf/strings/split/split_re.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] split_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + cdef unique_ptr[table] rsplit_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + cdef unique_ptr[column] split_record_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + cdef unique_ptr[column] rsplit_record_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd index b0ca771762d..dd527a78e7f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd @@ -10,6 +10,6 @@ from pylibcudf.libcudf.strings.side_type cimport side_type cdef extern from "cudf/strings/strip.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] strip( - column_view source_strings, - side_type stype, + column_view input, 
+ side_type side, string_scalar to_strip) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd index c0053391328..abc1bd43ad2 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd @@ -9,5 +9,5 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/strings/wrap.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] wrap( - column_view source_strings, + column_view input, size_type width) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd b/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd index 0c8fe1060ac..2eca043e451 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd @@ -8,7 +8,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/strings/udf/udf_string.hpp" namespace \ diff --git a/python/pylibcudf/pylibcudf/libcudf/transform.pxd b/python/pylibcudf/pylibcudf/libcudf/transform.pxd index 38298a7c1f1..d21510bd731 100644 --- a/python/pylibcudf/pylibcudf/libcudf/transform.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/transform.pxd @@ -11,7 +11,7 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport bitmask_type, data_type, size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd similarity index 100% rename from python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd rename to python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd diff --git a/python/pylibcudf/pylibcudf/libcudf/wrappers/decimals.pxd b/python/pylibcudf/pylibcudf/libcudf/wrappers/decimals.pxd deleted file mode 100644 index 558299501d6..00000000000 --- a/python/pylibcudf/pylibcudf/libcudf/wrappers/decimals.pxd +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- -from libc.stdint cimport int32_t, int64_t -from pylibcudf.libcudf.types cimport int128 - - -cdef extern from "cudf/fixed_point/fixed_point.hpp" namespace "numeric" nogil: - # cython type stub to help resolve to numeric::decimal64 - ctypedef int64_t decimal64 - # cython type stub to help resolve to numeric::decimal32 - ctypedef int64_t decimal32 - # cython type stub to help resolve to numeric::decimal128 - ctypedef int128 decimal128 - - cdef cppclass scale_type: - scale_type(int32_t) diff --git a/python/pylibcudf/pylibcudf/lists.pyx b/python/pylibcudf/pylibcudf/lists.pyx index 6f82124d06e..ecaf62d6895 100644 --- a/python/pylibcudf/pylibcudf/lists.pyx +++ b/python/pylibcudf/pylibcudf/lists.pyx @@ -69,7 +69,7 @@ cpdef Table explode_outer(Table input, size_type explode_column_idx): cdef unique_ptr[table] c_result with nogil: - c_result = move(cpp_explode.explode_outer(input.view(), explode_column_idx)) + c_result = cpp_explode.explode_outer(input.view(), explode_column_idx) return Table.from_libcudf(move(c_result)) @@ -92,7 +92,7 @@ cpdef Column concatenate_rows(Table input): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_concatenate_rows(input.view())) + c_result = cpp_concatenate_rows(input.view()) return Column.from_libcudf(move(c_result)) @@ -123,10 +123,7 @@ cpdef Column concatenate_list_elements(Column input, bool dropna): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_concatenate_list_elements( - input.view(), - null_policy, - )) + c_result = cpp_concatenate_list_elements(input.view(), null_policy) return Column.from_libcudf(move(c_result)) @@ -161,12 +158,12 @@ cpdef Column contains(Column input, ColumnOrScalar search_key): raise TypeError("Must pass a Column or Scalar") with nogil: - c_result = move(cpp_contains.contains( + c_result = cpp_contains.contains( list_view.view(), search_key.view() if ColumnOrScalar is Column else dereference( search_key.get() ), - )) + ) return Column.from_libcudf(move(c_result)) @@ -190,7 +187,7 @@ cpdef Column contains_nulls(Column input): cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() with nogil: - c_result = move(cpp_contains.contains_nulls(list_view.view())) + c_result = cpp_contains.contains_nulls(list_view.view()) return Column.from_libcudf(move(c_result)) @@ -229,13 +226,13 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o ) with nogil: - c_result = move(cpp_contains.index_of( + c_result = cpp_contains.index_of( list_view.view(), search_key.view() if ColumnOrScalar is Column else dereference( search_key.get() ), find_option, - )) + ) return Column.from_libcudf(move(c_result)) @@ -258,9 +255,7 @@ cpdef Column reverse(Column input): cdef ListColumnView list_view = input.list_view() with nogil: - c_result = move(cpp_reverse.reverse( - list_view.view(), - )) + c_result = cpp_reverse.reverse(list_view.view()) return Column.from_libcudf(move(c_result)) @@ -288,10 +283,10 @@ cpdef Column segmented_gather(Column input, Column gather_map_list): cdef ListColumnView list_view2 = gather_map_list.list_view() with nogil: - c_result = move(cpp_gather.segmented_gather( + c_result = cpp_gather.segmented_gather( list_view1.view(), list_view2.view(), - )) + ) return Column.from_libcudf(move(c_result)) @@ -316,10 +311,10 @@ cpdef Column extract_list_element(Column input, ColumnOrSizeType index): cdef ListColumnView list_view = input.list_view() with nogil: - c_result = move(cpp_extract_list_element( + c_result = cpp_extract_list_element( list_view.view(), 
index.view() if ColumnOrSizeType is Column else index, - )) + ) return Column.from_libcudf(move(c_result)) @@ -344,7 +339,7 @@ cpdef Column count_elements(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_count_elements(list_view.view())) + c_result = cpp_count_elements(list_view.view()) return Column.from_libcudf(move(c_result)) @@ -373,17 +368,14 @@ cpdef Column sequences(Column starts, Column sizes, Column steps = None): if steps is not None: with nogil: - c_result = move(cpp_filling.sequences( + c_result = cpp_filling.sequences( starts.view(), steps.view(), sizes.view(), - )) + ) else: with nogil: - c_result = move(cpp_filling.sequences( - starts.view(), - sizes.view(), - )) + c_result = cpp_filling.sequences(starts.view(), sizes.view()) return Column.from_libcudf(move(c_result)) cpdef Column sort_lists( @@ -423,17 +415,17 @@ cpdef Column sort_lists( with nogil: if stable: - c_result = move(cpp_stable_sort_lists( + c_result = cpp_stable_sort_lists( list_view.view(), c_sort_order, na_position, - )) + ) else: - c_result = move(cpp_sort_lists( + c_result = cpp_sort_lists( list_view.view(), c_sort_order, na_position, - )) + ) return Column.from_libcudf(move(c_result)) @@ -477,12 +469,12 @@ cpdef Column difference_distinct( ) with nogil: - c_result = move(cpp_set_operations.difference_distinct( + c_result = cpp_set_operations.difference_distinct( lhs_view.view(), rhs_view.view(), c_nulls_equal, c_nans_equal, - )) + ) return Column.from_libcudf(move(c_result)) @@ -525,12 +517,12 @@ cpdef Column have_overlap( ) with nogil: - c_result = move(cpp_set_operations.have_overlap( + c_result = cpp_set_operations.have_overlap( lhs_view.view(), rhs_view.view(), c_nulls_equal, c_nans_equal, - )) + ) return Column.from_libcudf(move(c_result)) @@ -573,12 +565,12 @@ cpdef Column intersect_distinct( ) with nogil: - c_result = move(cpp_set_operations.intersect_distinct( + c_result = cpp_set_operations.intersect_distinct( lhs_view.view(), rhs_view.view(), c_nulls_equal, c_nans_equal, - )) + ) return Column.from_libcudf(move(c_result)) @@ -622,12 +614,12 @@ cpdef Column union_distinct( ) with nogil: - c_result = move(cpp_set_operations.union_distinct( + c_result = cpp_set_operations.union_distinct( lhs_view.view(), rhs_view.view(), c_nulls_equal, c_nans_equal, - )) + ) return Column.from_libcudf(move(c_result)) @@ -652,10 +644,10 @@ cpdef Column apply_boolean_mask(Column input, Column boolean_mask): cdef ListColumnView list_view = input.list_view() cdef ListColumnView mask_view = boolean_mask.list_view() with nogil: - c_result = move(cpp_apply_boolean_mask( + c_result = cpp_apply_boolean_mask( list_view.view(), mask_view.view(), - )) + ) return Column.from_libcudf(move(c_result)) @@ -690,9 +682,9 @@ cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal): ) with nogil: - c_result = move(cpp_distinct( + c_result = cpp_distinct( list_view.view(), c_nulls_equal, c_nans_equal, - )) + ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/merge.pyx b/python/pylibcudf/pylibcudf/merge.pyx index 6d707b67449..61a21aafdb2 100644 --- a/python/pylibcudf/pylibcudf/merge.pyx +++ b/python/pylibcudf/pylibcudf/merge.pyx @@ -47,12 +47,10 @@ cpdef Table merge ( cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_merge.merge( - c_tables_to_merge, - c_key_cols, - c_column_order, - c_null_precedence, - ) + c_result = cpp_merge.merge( + c_tables_to_merge, + c_key_cols, + c_column_order, + c_null_precedence, ) return 
Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/null_mask.pxd index ab5c0080312..9bdfaee2842 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pxd +++ b/python/pylibcudf/pylibcudf/null_mask.pxd @@ -2,7 +2,7 @@ from pylibcudf.libcudf.types cimport mask_state, size_type -from rmm._lib.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from .column cimport Column diff --git a/python/pylibcudf/pylibcudf/null_mask.pyx b/python/pylibcudf/pylibcudf/null_mask.pyx index 5bdde06f21f..74180951562 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pyx +++ b/python/pylibcudf/pylibcudf/null_mask.pyx @@ -6,7 +6,8 @@ from libcpp.utility cimport move from pylibcudf.libcudf cimport null_mask as cpp_null_mask from pylibcudf.libcudf.types cimport mask_state, size_type -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from rmm.librmm.device_buffer cimport device_buffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from pylibcudf.libcudf.types import mask_state as MaskState # no-cython-lint @@ -31,13 +32,13 @@ cpdef DeviceBuffer copy_bitmask(Column col): Returns ------- rmm.DeviceBuffer - A ``DeviceBuffer`` containing ``col``'s bitmask, or an empty ``DeviceBuffer`` - if ``col`` is not nullable + A ``DeviceBuffer`` containing ``col``'s bitmask, or an empty + ``DeviceBuffer`` if ``col`` is not nullable """ cdef device_buffer db with nogil: - db = move(cpp_null_mask.copy_bitmask(col.view())) + db = cpp_null_mask.copy_bitmask(col.view()) return buffer_to_python(move(db)) @@ -89,7 +90,7 @@ cpdef DeviceBuffer create_null_mask( cdef device_buffer db with nogil: - db = move(cpp_null_mask.create_null_mask(size, state)) + db = cpp_null_mask.create_null_mask(size, state) return buffer_to_python(move(db)) @@ -113,7 +114,7 @@ cpdef tuple bitmask_and(list columns): cdef pair[device_buffer, size_type] c_result with nogil: - c_result = move(cpp_null_mask.bitmask_and(c_table.view())) + c_result = cpp_null_mask.bitmask_and(c_table.view()) return buffer_to_python(move(c_result.first)), c_result.second @@ -137,6 +138,6 @@ cpdef tuple bitmask_or(list columns): cdef pair[device_buffer, size_type] c_result with nogil: - c_result = move(cpp_null_mask.bitmask_or(c_table.view())) + c_result = cpp_null_mask.bitmask_or(c_table.view()) return buffer_to_python(move(c_result.first)), c_result.second diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt new file mode 100644 index 00000000000..d97c0a73267 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -0,0 +1,24 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx + ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx +) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_nvtext_ ASSOCIATED_TARGETS cudf +) diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd new file mode 100644 index 00000000000..a658e57018e --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . cimport ( + edit_distance, + generate_ngrams, + jaccard, + minhash, + ngrams_tokenize, + normalize, + replace, + stemmer, +) + +__all__ = [ + "edit_distance", + "generate_ngrams", + "jaccard", + "minhash", + "ngrams_tokenize", + "normalize", + "replace", + "stemmer", +] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py new file mode 100644 index 00000000000..2c1feb089a2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . import ( + edit_distance, + generate_ngrams, + jaccard, + minhash, + ngrams_tokenize, + normalize, + replace, + stemmer, +) + +__all__ = [ + "edit_distance", + "generate_ngrams", + "jaccard", + "minhash", + "ngrams_tokenize", + "normalize", + "replace", + "stemmer", +] diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd new file mode 100644 index 00000000000..446b95afabb --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column edit_distance(Column input, Column targets) + +cpdef Column edit_distance_matrix(Column input) diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx new file mode 100644 index 00000000000..dcacb2e1267 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx @@ -0,0 +1,63 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.edit_distance cimport ( + edit_distance as cpp_edit_distance, + edit_distance_matrix as cpp_edit_distance_matrix, +) + + +cpdef Column edit_distance(Column input, Column targets): + """ + Returns the edit distance between individual strings in two strings columns + + For details, see :cpp:func:`edit_distance` + + Parameters + ---------- + input : Column + Input strings + targets : Column + Strings to compute edit distance against + + Returns + ------- + Column + New column of edit distance values + """ + cdef column_view c_strings = input.view() + cdef column_view c_targets = targets.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_edit_distance(c_strings, c_targets) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column edit_distance_matrix(Column input): + """ + Returns the edit distance between all strings in the input strings column + + For details, see :cpp:func:`edit_distance_matrix` + + Parameters + ---------- + input : Column + Input strings + + Returns + ------- + Column + New column of edit distance values + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_edit_distance_matrix(c_strings) + + return Column.from_libcudf(move(c_result))
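For orientation, a minimal usage sketch of the new edit_distance binding; the input values are hypothetical and it assumes pyarrow is available for pylibcudf.interop:

import pyarrow as pa
import pylibcudf as plc

# Hypothetical inputs: two strings columns of equal length
strings = plc.interop.from_arrow(pa.array(["kitten", "saturday"]))
targets = plc.interop.from_arrow(pa.array(["sitting", "sunday"]))

# Levenshtein distance of each row pair; expected [3, 3] on these inputs
dist = plc.nvtext.edit_distance.edit_distance(strings, targets)
print(plc.interop.to_arrow(dist))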
diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd new file mode 100644 index 00000000000..f15eb1f25e9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator) + +cpdef Column generate_character_ngrams(Column input, size_type ngrams=*) + +cpdef Column hash_character_ngrams(Column input, size_type ngrams=*) diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx new file mode 100644 index 00000000000..09859d09e9e --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx @@ -0,0 +1,105 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.generate_ngrams cimport ( + generate_character_ngrams as cpp_generate_character_ngrams, + generate_ngrams as cpp_generate_ngrams, + hash_character_ngrams as cpp_hash_character_ngrams, +) +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator): + """ + Returns a single column of strings by generating ngrams from a strings column. + + For details, see :cpp:func:`generate_ngrams` + + Parameters + ---------- + input : Column + Input strings + ngrams : size_type + The ngram number to generate + separator : Scalar + The string to use for separating ngram tokens + + Returns + ------- + Column + New strings column of tokens + """ + cdef column_view c_strings = input.view() + cdef const string_scalar* c_separator = separator.c_obj.get() + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_generate_ngrams( + c_strings, + ngrams, + c_separator[0] + ) + return Column.from_libcudf(move(c_result)) + + +cpdef Column generate_character_ngrams(Column input, size_type ngrams = 2): + """ + Returns a lists column of ngrams of characters within each string. + + For details, see :cpp:func:`generate_character_ngrams` + + Parameters + ---------- + input : Column + Input strings + ngrams : size_type + The ngram number to generate + + Returns + ------- + Column + Lists column of strings + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_generate_character_ngrams( + c_strings, + ngrams, + ) + return Column.from_libcudf(move(c_result)) + +cpdef Column hash_character_ngrams(Column input, size_type ngrams = 2): + """ + Returns a lists column of hash values of the characters in each string + + For details, see :cpp:func:`hash_character_ngrams` + + Parameters + ---------- + input : Column + Input strings + ngrams : size_type + The ngram number to generate + + Returns + ------- + Column + Lists column of hash values + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_hash_character_ngrams( + c_strings, + ngrams, + ) + return Column.from_libcudf(move(c_result))
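A quick sketch of what the ngram generators produce, with hypothetical data and the same pyarrow-interop assumption:

import pyarrow as pa
import pylibcudf as plc

words = plc.interop.from_arrow(pa.array(["apple", "banana", "cherry"]))
sep = plc.interop.from_arrow(pa.scalar("_"))

# Word bigrams across rows; expected ["apple_banana", "banana_cherry"]
bigrams = plc.nvtext.generate_ngrams.generate_ngrams(words, 2, sep)

# Character bigrams within each row; each output row is a list of strings
char_bigrams = plc.nvtext.generate_ngrams.generate_character_ngrams(words, 2)
print(plc.interop.to_arrow(bigrams), plc.interop.to_arrow(char_bigrams))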
diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd b/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd new file mode 100644 index 00000000000..a4d4a15335b --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column jaccard_index(Column input1, Column input2, size_type width) diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx new file mode 100644 index 00000000000..3d8669865d9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx @@ -0,0 +1,45 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.jaccard cimport ( + jaccard_index as cpp_jaccard_index, +) +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column jaccard_index(Column input1, Column input2, size_type width): + """ + Returns the Jaccard similarity between individual rows in two strings columns. + + For details, see :cpp:func:`jaccard_index` + + Parameters + ---------- + input1 : Column + Input strings column + input2 : Column + Input strings column + width : size_type + The character width of the substrings (ngrams) to compare + + Returns + ------- + Column + New column of Jaccard index values + """ + cdef column_view c_input1 = input1.view() + cdef column_view c_input2 = input2.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_jaccard_index( + c_input1, + c_input2, + width + ) + + return Column.from_libcudf(move(c_result))
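An illustrative sketch of the Jaccard binding; the width argument is the character-ngram size used to build each row's substring set, and the values here are hypothetical:

import pyarrow as pa
import pylibcudf as plc

a = plc.interop.from_arrow(pa.array(["pylibcudf", "rapids"]))
b = plc.interop.from_arrow(pa.array(["pylibcuda", "cudf"]))

# Row-wise Jaccard similarity over 5-character substrings, in [0.0, 1.0]
sim = plc.nvtext.jaccard.jaccard_index(a, b, 5)
print(plc.interop.to_arrow(sim))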
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd new file mode 100644 index 00000000000..97e8c9dc83c --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libc.stdint cimport uint32_t, uint64_t +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + +ctypedef fused ColumnOrScalar: + Column + Scalar + +cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*) + +cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*) + +cpdef Column word_minhash(Column input, Column seeds) + +cpdef Column word_minhash64(Column input, Column seeds) diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx new file mode 100644 index 00000000000..f1e012e60e5 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -0,0 +1,152 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libc.stdint cimport uint32_t, uint64_t +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.minhash cimport ( + minhash as cpp_minhash, + minhash64 as cpp_minhash64, + word_minhash as cpp_word_minhash, + word_minhash64 as cpp_word_minhash64, +) +from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + +from cython.operator import dereference + + +cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): + """ + Returns the minhash values for each string per seed. + This function uses MurmurHash3_x86_32 for the hash algorithm. + + For details, see :cpp:func:`minhash`. + + Parameters + ---------- + input : Column + Strings column to compute minhash + seeds : Column or Scalar + Seed value(s) used for the hash algorithm. + width : size_type + Character width used for applying substrings; + Default is 4 characters. + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + if not isinstance(seeds, (Column, Scalar)): + raise TypeError("Must pass a Column or Scalar") + + with nogil: + c_result = cpp_minhash( + input.view(), + seeds.view() if ColumnOrScalar is Column else + dereference(seeds.c_obj.get()), + width + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): + """ + Returns the minhash values for each string per seed. + This function uses MurmurHash3_x64_128 for the hash algorithm. + + For details, see :cpp:func:`minhash64`. + + Parameters + ---------- + input : Column + Strings column to compute minhash + seeds : Column or Scalar + Seed value(s) used for the hash algorithm. + width : size_type + Character width used for applying substrings; + Default is 4 characters. + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + if not isinstance(seeds, (Column, Scalar)): + raise TypeError("Must pass a Column or Scalar") + + with nogil: + c_result = cpp_minhash64( + input.view(), + seeds.view() if ColumnOrScalar is Column else + dereference(seeds.c_obj.get()), + width + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column word_minhash(Column input, Column seeds): + """ + Returns the minhash values for each row of strings per seed. + This function uses MurmurHash3_x86_32 for the hash algorithm. + + For details, see :cpp:func:`word_minhash`. + + Parameters + ---------- + input : Column + Lists column of strings to compute minhash + seeds : Column + Seed values used for the hash algorithm. + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_word_minhash( + input.view(), + seeds.view() + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column word_minhash64(Column input, Column seeds): + """ + Returns the minhash values for each row of strings per seed. + This function uses MurmurHash3_x64_128 for the hash algorithm though + only the first 64 bits of the hash are used in computing the output. + + For details, see :cpp:func:`word_minhash64`. + + Parameters + ---------- + input : Column + Lists column of strings to compute minhash + seeds : Column + Seed values used for the hash algorithm. + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_word_minhash64( + input.view(), + seeds.view() + ) + + return Column.from_libcudf(move(c_result))
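The fused ColumnOrScalar argument lets minhash accept either a single seed or a column of them; a sketch under the same interop assumption (minhash seeds must be uint32):

import pyarrow as pa
import pylibcudf as plc

strings = plc.interop.from_arrow(pa.array(["deep learning", "deep sea"]))
seeds = plc.interop.from_arrow(pa.array([0, 42], type=pa.uint32()))

# One list row per input string, one hash value per seed (width-4 substrings)
hashes = plc.nvtext.minhash.minhash(strings, seeds, 4)
print(plc.interop.to_arrow(hashes))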
diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pxd b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pxd new file mode 100644 index 00000000000..4f791ba1ee9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pxd @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column ngrams_tokenize( + Column input, + size_type ngrams, + Scalar delimiter, + Scalar separator +) diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx new file mode 100644 index 00000000000..8a1854c5f0d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cython.operator cimport dereference +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.ngrams_tokenize cimport ( + ngrams_tokenize as cpp_ngrams_tokenize, +) +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column ngrams_tokenize( + Column input, + size_type ngrams, + Scalar delimiter, + Scalar separator +): + """ + Returns a single column of strings by tokenizing the input strings column + and then producing ngrams of each string. + + For details, see :cpp:func:`ngrams_tokenize` + + Parameters + ---------- + input : Column + Input strings + ngrams : size_type + The ngram number to generate + delimiter : Scalar + UTF-8 characters used to separate each string into tokens. + An empty string will separate tokens using whitespace. + separator : Scalar + The string to use for separating ngram tokens + + Returns + ------- + Column + New strings column of tokens + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_ngrams_tokenize( + input.view(), + ngrams, + dereference(delimiter.get()), + dereference(separator.get()), + ) + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd new file mode 100644 index 00000000000..90676145afa --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from pylibcudf.column cimport Column + + +cpdef Column normalize_spaces(Column input) + +cpdef Column normalize_characters(Column input, bool do_lower_case) diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx new file mode 100644 index 00000000000..637d900b659 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx @@ -0,0 +1,64 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.normalize cimport ( + normalize_characters as cpp_normalize_characters, + normalize_spaces as cpp_normalize_spaces, +) + + +cpdef Column normalize_spaces(Column input): + """ + Returns a new strings column by normalizing the whitespace in + each string in the input column. + + For details, see :cpp:func:`normalize_spaces` + + Parameters + ---------- + input : Column + Input strings + + Returns + ------- + Column + New strings column of normalized strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_normalize_spaces(input.view()) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column normalize_characters(Column input, bool do_lower_case): + """ + Normalizes string characters for tokenizing. + + For details, see :cpp:func:`normalize_characters` + + Parameters + ---------- + input : Column + Input strings + do_lower_case : bool + If true, upper-case characters are converted to lower-case + and accents are stripped from those characters. If false, + accented and upper-case characters are not transformed. + + Returns + ------- + Column + Normalized strings column + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_normalize_characters(input.view(), do_lower_case) + + return Column.from_libcudf(move(c_result))
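A sketch combining the new normalize and ngrams_tokenize bindings, again with hypothetical data and the pyarrow-interop assumption:

import pyarrow as pa
import pylibcudf as plc

text = plc.interop.from_arrow(pa.array(["the  quick   fox"]))

# Collapse runs of whitespace first; expected ["the quick fox"]
clean = plc.nvtext.normalize.normalize_spaces(text)

empty = plc.interop.from_arrow(pa.scalar(""))  # empty delimiter: split on whitespace
sep = plc.interop.from_arrow(pa.scalar("_"))

# Token bigrams per row; expected ["the_quick", "quick_fox"]
out = plc.nvtext.ngrams_tokenize.ngrams_tokenize(clean, 2, empty, sep)
print(plc.interop.to_arrow(out))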
diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pxd b/python/pylibcudf/pylibcudf/nvtext/replace.pxd new file mode 100644 index 00000000000..624f90e7486 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/replace.pxd @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column replace_tokens( + Column input, + Column targets, + Column replacements, + Scalar delimiter=*, +) + +cpdef Column filter_tokens( + Column input, + size_type min_token_length, + Scalar replacement=*, + Scalar delimiter=* +) diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pyx b/python/pylibcudf/pylibcudf/nvtext/replace.pyx new file mode 100644 index 00000000000..b65348ce14d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/replace.pyx @@ -0,0 +1,109 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cython.operator cimport dereference +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.replace cimport ( + filter_tokens as cpp_filter_tokens, + replace_tokens as cpp_replace_tokens, +) +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column replace_tokens( + Column input, + Column targets, + Column replacements, + Scalar delimiter=None, +): + """ + Replaces specified tokens with corresponding replacement strings. + + For details, see :cpp:func:`replace_tokens` + + Parameters + ---------- + input : Column + Strings column whose tokens will be replaced + targets : Column + Strings to compare against tokens found in ``input`` + replacements : Column + Replacement strings for each string in ``targets`` + delimiter : Scalar, optional + Characters used to separate each string into tokens. + The default of empty string will identify tokens using whitespace. + + Returns + ------- + Column + New strings column with replaced strings + """ + cdef unique_ptr[column] c_result + if delimiter is None: + delimiter = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + with nogil: + c_result = cpp_replace_tokens( + input.view(), + targets.view(), + replacements.view(), + dereference(delimiter.get()), + ) + return Column.from_libcudf(move(c_result))
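A short sketch of whole-token replacement with the default whitespace delimiter (hypothetical data; pyarrow interop assumed):

import pyarrow as pa
import pylibcudf as plc

docs = plc.interop.from_arrow(pa.array(["hello world", "goodbye world"]))
targets = plc.interop.from_arrow(pa.array(["world"]))
repls = plc.interop.from_arrow(pa.array(["gpu"]))

# Only whole-token matches are replaced; expected ["hello gpu", "goodbye gpu"]
out = plc.nvtext.replace.replace_tokens(docs, targets, repls)
print(plc.interop.to_arrow(out))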
+ + +cpdef Column filter_tokens( + Column input, + size_type min_token_length, + Scalar replacement=None, + Scalar delimiter=None +): + """ + Removes tokens whose lengths are less than a specified number of characters. + + For details, see :cpp:func:`filter_tokens` + + Parameters + ---------- + input : Column + Strings column to filter + min_token_length : size_type + The minimum number of characters to retain a + token in the output string + replacement : Scalar, optional + Optional replacement string to be used in place of removed tokens + delimiter : Scalar, optional + Characters used to separate each string into tokens. + The default of empty string will identify tokens using whitespace. + + Returns + ------- + Column + New strings column of filtered strings + """ + cdef unique_ptr[column] c_result + if delimiter is None: + delimiter = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + if replacement is None: + replacement = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + with nogil: + c_result = cpp_filter_tokens( + input.view(), + min_token_length, + dereference(replacement.get()), + dereference(delimiter.get()), + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd b/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd new file mode 100644 index 00000000000..48762efc01f --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from pylibcudf.column cimport Column +from pylibcudf.libcudf.nvtext.stemmer cimport letter_type +from pylibcudf.libcudf.types cimport size_type + +ctypedef fused ColumnOrSize: + Column + size_type + +cpdef Column is_letter(Column input, bool check_vowels, ColumnOrSize indices) + +cpdef Column porter_stemmer_measure(Column input) diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx new file mode 100644 index 00000000000..854d1053624 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx @@ -0,0 +1,76 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.stemmer cimport ( + is_letter as cpp_is_letter, + letter_type, + porter_stemmer_measure as cpp_porter_stemmer_measure, +) +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column is_letter( + Column input, + bool check_vowels, + ColumnOrSize indices +): + """ + Returns a boolean column indicating whether the characters at the + provided character index or indices are vowels or consonants + + For details, see :cpp:func:`is_letter` + + Parameters + ---------- + input : Column + Input strings + check_vowels : bool + If true, the check is for vowels. Otherwise the check is + for consonants. + indices : Union[Column, size_type] + The character position(s) to check in each string + + Returns + ------- + Column + New boolean column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_is_letter( + input.view(), + letter_type.VOWEL if check_vowels else letter_type.CONSONANT, + indices if ColumnOrSize is size_type else indices.view() + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column porter_stemmer_measure(Column input): + """ + Returns the Porter Stemmer measurements of a strings column.
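A brief sketch of the stemmer helpers (hypothetical data; pyarrow interop assumed):

import pyarrow as pa
import pylibcudf as plc

words = plc.interop.from_arrow(pa.array(["trouble", "tree"]))

# Is the character at index 1 a vowel? Expected [False, False] ('r' in both rows)
flags = plc.nvtext.stemmer.is_letter(words, True, 1)

# Porter measure per word (the vowel-consonant pattern count used by the Porter algorithm)
measure = plc.nvtext.stemmer.porter_stemmer_measure(words)
print(plc.interop.to_arrow(flags), plc.interop.to_arrow(measure))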
+ + For details, see :cpp:func:`porter_stemmer_measure` + + Parameters + ---------- + input : Column + Strings column of words to measure + + Returns + ------- + Column + New column of measure values + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_porter_stemmer_measure(input.view()) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/partitioning.pyx b/python/pylibcudf/pylibcudf/partitioning.pyx index 8fa70daab5a..3cff4843735 100644 --- a/python/pylibcudf/pylibcudf/partitioning.pyx +++ b/python/pylibcudf/pylibcudf/partitioning.pyx @@ -41,10 +41,10 @@ cpdef tuple[Table, list] hash_partition( cdef int c_num_partitions = num_partitions with nogil: - c_result = move( - cpp_partitioning.hash_partition( - input.view(), c_columns_to_hash, c_num_partitions - ) + c_result = cpp_partitioning.hash_partition( + input.view(), + c_columns_to_hash, + c_num_partitions ) return Table.from_libcudf(move(c_result.first)), list(c_result.second) @@ -74,8 +74,10 @@ cpdef tuple[Table, list] partition(Table t, Column partition_map, int num_partit cdef int c_num_partitions = num_partitions with nogil: - c_result = move( - cpp_partitioning.partition(t.view(), partition_map.view(), c_num_partitions) + c_result = cpp_partitioning.partition( + t.view(), + partition_map.view(), + c_num_partitions ) return Table.from_libcudf(move(c_result.first)), list(c_result.second) @@ -111,10 +113,8 @@ cpdef tuple[Table, list] round_robin_partition( cdef int c_start_partition = start_partition with nogil: - c_result = move( - cpp_partitioning.round_robin_partition( - input.view(), c_num_partitions, c_start_partition - ) + c_result = cpp_partitioning.round_robin_partition( + input.view(), c_num_partitions, c_start_partition ) return Table.from_libcudf(move(c_result.first)), list(c_result.second) diff --git a/python/pylibcudf/pylibcudf/quantiles.pyx b/python/pylibcudf/pylibcudf/quantiles.pyx index 3a771fbe7ef..7d92b598bd0 100644 --- a/python/pylibcudf/pylibcudf/quantiles.pyx +++ b/python/pylibcudf/pylibcudf/quantiles.pyx @@ -66,14 +66,12 @@ cpdef Column quantile( ordered_indices_view = ordered_indices.view() with nogil: - c_result = move( - cpp_quantile( - input.view(), - q, - interp, - ordered_indices_view, - exact, - ) + c_result = cpp_quantile( + input.view(), + q, + interp, + ordered_indices_view, + exact, ) return Column.from_libcudf(move(c_result)) @@ -141,15 +139,13 @@ cpdef Table quantiles( null_precedence_vec = null_precedence with nogil: - c_result = move( - cpp_quantiles( - input.view(), - q, - interp, - is_input_sorted, - column_order_vec, - null_precedence_vec, - ) + c_result = cpp_quantiles( + input.view(), + q, + interp, + is_input_sorted, + column_order_vec, + null_precedence_vec, ) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/reduce.pyx b/python/pylibcudf/pylibcudf/reduce.pyx index b0212a5b9c1..d9ec3a9bdc4 100644 --- a/python/pylibcudf/pylibcudf/reduce.pyx +++ b/python/pylibcudf/pylibcudf/reduce.pyx @@ -39,12 +39,10 @@ cpdef Scalar reduce(Column col, Aggregation agg, DataType data_type): cdef unique_ptr[scalar] result cdef const reduce_aggregation *c_agg = agg.view_underlying_as_reduce() with nogil: - result = move( - cpp_reduce.cpp_reduce( - col.view(), - dereference(c_agg), - data_type.c_obj - ) + result = cpp_reduce.cpp_reduce( + col.view(), + dereference(c_agg), + data_type.c_obj ) return Scalar.from_libcudf(move(result)) @@ -71,12 +69,10 @@ cpdef Column scan(Column col, Aggregation agg, scan_type inclusive): cdef 
unique_ptr[column] result cdef const scan_aggregation *c_agg = agg.view_underlying_as_scan() with nogil: - result = move( - cpp_reduce.cpp_scan( - col.view(), - dereference(c_agg), - inclusive, - ) + result = cpp_reduce.cpp_scan( + col.view(), + dereference(c_agg), + inclusive, ) return Column.from_libcudf(move(result)) @@ -99,7 +95,7 @@ cpdef tuple minmax(Column col): """ cdef pair[unique_ptr[scalar], unique_ptr[scalar]] result with nogil: - result = move(cpp_reduce.cpp_minmax(col.view())) + result = cpp_reduce.cpp_minmax(col.view()) return ( Scalar.from_libcudf(move(result.first)), diff --git a/python/pylibcudf/pylibcudf/replace.pyx b/python/pylibcudf/pylibcudf/replace.pyx index 115dee132fd..f77eba7ace5 100644 --- a/python/pylibcudf/pylibcudf/replace.pyx +++ b/python/pylibcudf/pylibcudf/replace.pyx @@ -56,28 +56,23 @@ cpdef Column replace_nulls(Column source_column, ReplacementType replacement): if isinstance(replacement, ReplacePolicy): policy = replacement with nogil: - c_result = move( - cpp_replace.replace_nulls(source_column.view(), policy) - ) + c_result = cpp_replace.replace_nulls(source_column.view(), policy) return Column.from_libcudf(move(c_result)) else: raise TypeError("replacement must be a Column, Scalar, or replace_policy") with nogil: if ReplacementType is Column: - c_result = move( - cpp_replace.replace_nulls(source_column.view(), replacement.view()) + c_result = cpp_replace.replace_nulls( + source_column.view(), + replacement.view() ) elif ReplacementType is Scalar: - c_result = move( - cpp_replace.replace_nulls( - source_column.view(), dereference(replacement.c_obj) - ) + c_result = cpp_replace.replace_nulls( + source_column.view(), dereference(replacement.c_obj) ) elif ReplacementType is replace_policy: - c_result = move( - cpp_replace.replace_nulls(source_column.view(), replacement) - ) + c_result = cpp_replace.replace_nulls(source_column.view(), replacement) else: assert False, "Internal error. 
Please contact pylibcudf developers" return Column.from_libcudf(move(c_result)) @@ -109,12 +104,10 @@ cpdef Column find_and_replace_all( """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_replace.find_and_replace_all( - source_column.view(), - values_to_replace.view(), - replacement_values.view(), - ) + c_result = cpp_replace.find_and_replace_all( + source_column.view(), + values_to_replace.view(), + replacement_values.view(), ) return Column.from_libcudf(move(c_result)) @@ -156,22 +149,18 @@ cpdef Column clamp( cdef unique_ptr[column] c_result with nogil: if lo_replace is None: - c_result = move( - cpp_replace.clamp( - source_column.view(), - dereference(lo.c_obj), - dereference(hi.c_obj), - ) + c_result = cpp_replace.clamp( + source_column.view(), + dereference(lo.c_obj), + dereference(hi.c_obj), ) else: - c_result = move( - cpp_replace.clamp( - source_column.view(), - dereference(lo.c_obj), - dereference(hi.c_obj), - dereference(lo_replace.c_obj), - dereference(hi_replace.c_obj), - ) + c_result = cpp_replace.clamp( + source_column.view(), + dereference(lo.c_obj), + dereference(hi.c_obj), + dereference(lo_replace.c_obj), + dereference(hi_replace.c_obj), ) return Column.from_libcudf(move(c_result)) @@ -199,9 +188,7 @@ cpdef Column normalize_nans_and_zeros(Column source_column, bool inplace=False): if inplace: cpp_replace.normalize_nans_and_zeros(source_column.mutable_view()) else: - c_result = move( - cpp_replace.normalize_nans_and_zeros(source_column.view()) - ) + c_result = cpp_replace.normalize_nans_and_zeros(source_column.view()) if not inplace: return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/reshape.pyx b/python/pylibcudf/pylibcudf/reshape.pyx index eb1499ebbea..6540b5198ab 100644 --- a/python/pylibcudf/pylibcudf/reshape.pyx +++ b/python/pylibcudf/pylibcudf/reshape.pyx @@ -38,7 +38,7 @@ cpdef Column interleave_columns(Table source_table): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_interleave_columns(source_table.view())) + c_result = cpp_interleave_columns(source_table.view()) return Column.from_libcudf(move(c_result)) @@ -63,6 +63,6 @@ cpdef Table tile(Table source_table, size_type count): cdef unique_ptr[table] c_result with nogil: - c_result = move(cpp_tile(source_table.view(), count)) + c_result = cpp_tile(source_table.view(), count) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/rolling.pyx b/python/pylibcudf/pylibcudf/rolling.pyx index a46540d7ffa..4fd0b005431 100644 --- a/python/pylibcudf/pylibcudf/rolling.pyx +++ b/python/pylibcudf/pylibcudf/rolling.pyx @@ -49,24 +49,21 @@ cpdef Column rolling_window( cdef const rolling_aggregation *c_agg = agg.view_underlying_as_rolling() if WindowType is Column: with nogil: - result = move( - cpp_rolling.rolling_window( - source.view(), - preceding_window.view(), - following_window.view(), - min_periods, - dereference(c_agg), - ) + result = cpp_rolling.rolling_window( + source.view(), + preceding_window.view(), + following_window.view(), + min_periods, + dereference(c_agg), ) else: with nogil: - result = move( - cpp_rolling.rolling_window( - source.view(), - preceding_window, - following_window, - min_periods, - dereference(c_agg), - ) + result = cpp_rolling.rolling_window( + source.view(), + preceding_window, + following_window, + min_periods, + dereference(c_agg), ) + return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/round.pyx b/python/pylibcudf/pylibcudf/round.pyx index 
dc60d53b07e..689363e652d 100644 --- a/python/pylibcudf/pylibcudf/round.pyx +++ b/python/pylibcudf/pylibcudf/round.pyx @@ -39,12 +39,10 @@ cpdef Column round( """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_round( - source.view(), - decimal_places, - round_method - ) + c_result = cpp_round( + source.view(), + decimal_places, + round_method ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/scalar.pxd b/python/pylibcudf/pylibcudf/scalar.pxd index 8664dfa4b7e..a273647c98d 100644 --- a/python/pylibcudf/pylibcudf/scalar.pxd +++ b/python/pylibcudf/pylibcudf/scalar.pxd @@ -4,7 +4,7 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.scalar.scalar cimport scalar -from rmm._lib.memory_resource cimport DeviceMemoryResource +from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from .column cimport Column from .types cimport DataType diff --git a/python/pylibcudf/pylibcudf/scalar.pyx b/python/pylibcudf/pylibcudf/scalar.pyx index 3e20938af0c..d4888a62ad1 100644 --- a/python/pylibcudf/pylibcudf/scalar.pyx +++ b/python/pylibcudf/pylibcudf/scalar.pyx @@ -6,7 +6,7 @@ from libcpp.utility cimport move from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.scalar.scalar_factories cimport make_empty_scalar_like -from rmm._lib.memory_resource cimport get_current_device_resource +from rmm.pylibrmm.memory_resource cimport get_current_device_resource from .column cimport Column from .types cimport DataType diff --git a/python/pylibcudf/pylibcudf/search.pyx b/python/pylibcudf/pylibcudf/search.pyx index 814bc6553d8..1a870248046 100644 --- a/python/pylibcudf/pylibcudf/search.pyx +++ b/python/pylibcudf/pylibcudf/search.pyx @@ -41,13 +41,11 @@ cpdef Column lower_bound( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_search.lower_bound( - haystack.view(), - needles.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_search.lower_bound( + haystack.view(), + needles.view(), + c_orders, + c_null_precedence, ) return Column.from_libcudf(move(c_result)) @@ -82,13 +80,11 @@ cpdef Column upper_bound( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_search.upper_bound( - haystack.view(), - needles.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_search.upper_bound( + haystack.view(), + needles.view(), + c_orders, + c_null_precedence, ) return Column.from_libcudf(move(c_result)) @@ -112,10 +108,8 @@ cpdef Column contains(Column haystack, Column needles): """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_search.contains( - haystack.view(), - needles.view(), - ) + c_result = cpp_search.contains( + haystack.view(), + needles.view(), ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/sorting.pyx b/python/pylibcudf/pylibcudf/sorting.pyx index 42289d54bca..fc40f03e1fd 100644 --- a/python/pylibcudf/pylibcudf/sorting.pyx +++ b/python/pylibcudf/pylibcudf/sorting.pyx @@ -36,12 +36,10 @@ cpdef Column sorted_order(Table source_table, list column_order, list null_prece cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.sorted_order( - source_table.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.sorted_order( + source_table.view(), + 
c_orders, + c_null_precedence, ) return Column.from_libcudf(move(c_result)) @@ -74,12 +72,10 @@ cpdef Column stable_sorted_order( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.stable_sorted_order( - source_table.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.stable_sorted_order( + source_table.view(), + c_orders, + c_null_precedence, ) return Column.from_libcudf(move(c_result)) @@ -118,15 +114,13 @@ cpdef Column rank( """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_sorting.rank( - input_view.view(), - method, - column_order, - null_handling, - null_precedence, - percentage, - ) + c_result = cpp_sorting.rank( + input_view.view(), + method, + column_order, + null_handling, + null_precedence, + percentage, ) return Column.from_libcudf(move(c_result)) @@ -154,12 +148,10 @@ cpdef bool is_sorted(Table tbl, list column_order, list null_precedence): cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.is_sorted( - tbl.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.is_sorted( + tbl.view(), + c_orders, + c_null_precedence, ) return c_result @@ -197,14 +189,12 @@ cpdef Table segmented_sort_by_key( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.segmented_sort_by_key( - values.view(), - keys.view(), - segment_offsets.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.segmented_sort_by_key( + values.view(), + keys.view(), + segment_offsets.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -243,14 +233,12 @@ cpdef Table stable_segmented_sort_by_key( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.stable_segmented_sort_by_key( - values.view(), - keys.view(), - segment_offsets.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.stable_segmented_sort_by_key( + values.view(), + keys.view(), + segment_offsets.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -285,13 +273,11 @@ cpdef Table sort_by_key( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.sort_by_key( - values.view(), - keys.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.sort_by_key( + values.view(), + keys.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -326,13 +312,11 @@ cpdef Table stable_sort_by_key( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.stable_sort_by_key( - values.view(), - keys.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.stable_sort_by_key( + values.view(), + keys.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -360,12 +344,10 @@ cpdef Table sort(Table source_table, list column_order, list null_precedence): cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.sort( - source_table.view(), - c_orders, - c_null_precedence, - ) + c_result = 
cpp_sorting.sort( + source_table.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -393,11 +375,9 @@ cpdef Table stable_sort(Table source_table, list column_order, list null_precede cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.stable_sort( - source_table.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.stable_sort( + source_table.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyx b/python/pylibcudf/pylibcudf/stream_compaction.pyx index d5475ea79d5..2145398a191 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pyx +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyx @@ -44,10 +44,8 @@ cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold): cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.drop_nulls( - source_table.view(), c_keys, keep_threshold - ) + c_result = cpp_stream_compaction.drop_nulls( + source_table.view(), c_keys, keep_threshold ) return Table.from_libcudf(move(c_result)) @@ -74,10 +72,8 @@ cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold): cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.drop_nulls( - source_table.view(), c_keys, keep_threshold - ) + c_result = cpp_stream_compaction.drop_nulls( + source_table.view(), c_keys, keep_threshold ) return Table.from_libcudf(move(c_result)) @@ -101,10 +97,8 @@ cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask): """ cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_stream_compaction.apply_boolean_mask( - source_table.view(), boolean_mask.view() - ) + c_result = cpp_stream_compaction.apply_boolean_mask( + source_table.view(), boolean_mask.view() ) return Table.from_libcudf(move(c_result)) @@ -144,10 +138,8 @@ cpdef Table unique( cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.unique( - input.view(), c_keys, keep, nulls_equal - ) + c_result = cpp_stream_compaction.unique( + input.view(), c_keys, keep, nulls_equal ) return Table.from_libcudf(move(c_result)) @@ -185,10 +177,8 @@ cpdef Table distinct( cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.distinct( - input.view(), c_keys, keep, nulls_equal, nans_equal - ) + c_result = cpp_stream_compaction.distinct( + input.view(), c_keys, keep, nulls_equal, nans_equal ) return Table.from_libcudf(move(c_result)) @@ -221,10 +211,8 @@ cpdef Column distinct_indices( """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_stream_compaction.distinct_indices( - input.view(), keep, nulls_equal, nans_equal - ) + c_result = cpp_stream_compaction.distinct_indices( + input.view(), keep, nulls_equal, nans_equal ) return Column.from_libcudf(move(c_result)) @@ -262,10 +250,8 @@ cpdef Table stable_distinct( cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.stable_distinct( - input.view(), c_keys, keep, nulls_equal, nans_equal - ) + c_result = cpp_stream_compaction.stable_distinct( + input.view(), c_keys, keep, nulls_equal, nans_equal ) return 
Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index 052a0cf3c56..5d7fbd24b91 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -18,17 +18,22 @@ set(cython_sources case.pyx char_types.pyx contains.pyx + combine.pyx extract.pyx find.pyx + find_multiple.pyx findall.pyx + padding.pyx regex_flags.pyx regex_program.pyx repeat.pyx replace.pyx + replace_re.pyx side_type.pyx slice.pyx strip.pyx translate.pyx + wrap.pyx ) set(linked_libraries cudf::cudf) @@ -39,3 +44,4 @@ rapids_cython_create_modules( ) add_subdirectory(convert) +add_subdirectory(split) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index 142637ff577..da1c1c576c0 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -5,17 +5,25 @@ from . cimport ( capitalize, case, char_types, + combine, contains, convert, extract, find, + find_multiple, findall, + padding, regex_flags, regex_program, + repeat, replace, + replace_re, + side_type, slice, + split, strip, translate, + wrap, ) from .side_type cimport side_type @@ -28,12 +36,18 @@ __all__ = [ "convert", "extract", "find", + "find_multiple", "findall", + "padding", "regex_flags", "regex_program", + "repeat", "replace", + "replace_re", "slice", "strip", + "split", "side_type", "translate", + "wrap", ] diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index decfadd63a4..40fa8261905 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -5,18 +5,25 @@ capitalize, case, char_types, + combine, contains, convert, extract, find, + find_multiple, findall, + padding, regex_flags, regex_program, repeat, replace, + replace_re, + side_type, slice, + split, strip, translate, + wrap, ) from .side_type import SideType @@ -29,12 +36,18 @@ "convert", "extract", "find", + "find_multiple", "findall", + "padding", "regex_flags", "regex_program", + "repeat", "replace", + "replace_re", "slice", "strip", + "split", "SideType", "translate", + "wrap", ] diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pyx b/python/pylibcudf/pylibcudf/strings/attributes.pyx index 36bee7bd1d9..8e46a32835d 100644 --- a/python/pylibcudf/pylibcudf/strings/attributes.pyx +++ b/python/pylibcudf/pylibcudf/strings/attributes.pyx @@ -25,7 +25,7 @@ cpdef Column count_characters(Column source_strings): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_attributes.count_characters(source_strings.view())) + c_result = cpp_attributes.count_characters(source_strings.view()) return Column.from_libcudf(move(c_result)) @@ -48,7 +48,7 @@ cpdef Column count_bytes(Column source_strings): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_attributes.count_bytes(source_strings.view())) + c_result = cpp_attributes.count_bytes(source_strings.view()) return Column.from_libcudf(move(c_result)) @@ -71,6 +71,6 @@ cpdef Column code_points(Column source_strings): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_attributes.code_points(source_strings.view())) + c_result = cpp_attributes.code_points(source_strings.view()) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyx b/python/pylibcudf/pylibcudf/strings/char_types.pyx index 
6a24d79bc4b..cb04efe5e8f 100644 --- a/python/pylibcudf/pylibcudf/strings/char_types.pyx +++ b/python/pylibcudf/pylibcudf/strings/char_types.pyx @@ -38,12 +38,10 @@ cpdef Column all_characters_of_type( cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_char_types.all_characters_of_type( - source_strings.view(), - types, - verify_types, - ) + c_result = cpp_char_types.all_characters_of_type( + source_strings.view(), + types, + verify_types, ) return Column.from_libcudf(move(c_result)) @@ -81,13 +79,11 @@ cpdef Column filter_characters_of_type( cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_char_types.filter_characters_of_type( - source_strings.view(), - types_to_remove, - dereference(c_replacement), - types_to_keep, - ) + c_result = cpp_char_types.filter_characters_of_type( + source_strings.view(), + types_to_remove, + dereference(c_replacement), + types_to_keep, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/combine.pxd b/python/pylibcudf/pylibcudf/strings/combine.pxd new file mode 100644 index 00000000000..ea22f626973 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/combine.pxd @@ -0,0 +1,33 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.strings.combine cimport ( + output_if_empty_list, + separator_on_nulls, +) +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + +ctypedef fused ColumnOrScalar: + Column + Scalar + +cpdef Column concatenate( + Table strings_columns, + ColumnOrScalar separator, + Scalar narep=*, + Scalar col_narep=*, + separator_on_nulls separate_nulls=*, +) + +cpdef Column join_strings(Column input, Scalar separator, Scalar narep) + + +cpdef Column join_list_elements( + Column source_strings, + ColumnOrScalar separator, + Scalar separator_narep, + Scalar string_narep, + separator_on_nulls separate_nulls, + output_if_empty_list empty_list_policy, +) diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/strings/combine.pyx new file mode 100644 index 00000000000..f17d5265ab4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/combine.pyx @@ -0,0 +1,223 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.strings cimport combine as cpp_combine +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + +from cython.operator import dereference +from pylibcudf.libcudf.strings.combine import \ + output_if_empty_list as OutputIfEmptyList # no-cython-lint +from pylibcudf.libcudf.strings.combine import \ + separator_on_nulls as SeparatorOnNulls # no-cython-lint + + +cpdef Column concatenate( + Table strings_columns, + ColumnOrScalar separator, + Scalar narep=None, + Scalar col_narep=None, + separator_on_nulls separate_nulls=separator_on_nulls.YES, +): + """ + Concatenate all columns in the table horizontally into one new string + delimited by an optional separator string. + + Parameters + ---------- + strings_columns : Table + Strings for this operation + + separator : Column or Scalar + Separator(s) for a given row + + narep : Scalar + String to replace a null separator for a given row. 
+ + col_narep : Scalar + String that should be used in place of any null strings found in any column. + Cannot be specified when separator is a Scalar; doing so raises a ValueError. + + separate_nulls : SeparatorOnNulls + If YES, then the separator is included for null rows. + + Returns + ------- + Column + New column with concatenated results + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_col_narep + cdef const string_scalar* c_separator + + if narep is None: + narep = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + cdef const string_scalar* c_narep = <const string_scalar*>( + narep.c_obj.get() + ) + + if ColumnOrScalar is Column: + if col_narep is None: + col_narep = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + c_col_narep = <const string_scalar*>( + col_narep.c_obj.get() + ) + with nogil: + c_result = move( + cpp_combine.concatenate( + strings_columns.view(), + separator.view(), + dereference(c_narep), + dereference(c_col_narep), + separate_nulls + ) + ) + elif ColumnOrScalar is Scalar: + if col_narep is not None: + raise ValueError( + "col_narep cannot be specified when separator is a Scalar" + ) + c_separator = <const string_scalar*>(separator.c_obj.get()) + with nogil: + c_result = move( + cpp_combine.concatenate( + strings_columns.view(), + dereference(c_separator), + dereference(c_narep), + separate_nulls + ) + ) + else: + raise ValueError("separator must be a Column or a Scalar") + return Column.from_libcudf(move(c_result)) + + +cpdef Column join_strings(Column input, Scalar separator, Scalar narep): + """ + Concatenates all strings in the column into one new string delimited + by an optional separator string. + + Parameters + ---------- + input : Column + Strings column to concatenate into a single string + + separator : Scalar + String to insert between each string + + narep : Scalar + String to replace any null strings found. + + Returns + ------- + Column + New column containing one string + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_separator = <const string_scalar*>( + separator.c_obj.get() + ) + cdef const string_scalar* c_narep = <const string_scalar*>( + narep.c_obj.get() + ) + with nogil: + c_result = move( + cpp_combine.join_strings( + input.view(), + dereference(c_separator), + dereference(c_narep), + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column join_list_elements( + Column lists_strings_column, + ColumnOrScalar separator, + Scalar separator_narep, + Scalar string_narep, + separator_on_nulls separate_nulls, + output_if_empty_list empty_list_policy, +): + """ + Given a lists column of strings (each row is a list of strings), + concatenates the strings within each row and returns a single strings + column result. + + Parameters + ---------- + lists_strings_column : Column + Column containing lists of strings to concatenate + + separator : Column or Scalar + String(s) that should be inserted between each string from each row. + + separator_narep : Scalar + String that should be used to replace a null separator. + + string_narep : Scalar + String to replace null strings in any non-null list row. + Ignored if separator is a Scalar. + + separate_nulls : SeparatorOnNulls + If YES, then the separator is included for null rows + if `separator_narep` is valid + + empty_list_policy : OutputIfEmptyList + If set to EMPTY_STRING, any input row that is an empty + list will result in an empty string. Otherwise, it will + result in a null.
+ + Returns + ------- + Column + New strings column with concatenated results + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_separator_narep = <const string_scalar*>( + separator_narep.c_obj.get() + ) + cdef const string_scalar* c_string_narep = <const string_scalar*>( + string_narep.c_obj.get() + ) + cdef const string_scalar* c_separator + + if ColumnOrScalar is Column: + with nogil: + c_result = move( + cpp_combine.join_list_elements( + lists_strings_column.view(), + separator.view(), + dereference(c_separator_narep), + dereference(c_string_narep), + separate_nulls, + empty_list_policy, + ) + ) + elif ColumnOrScalar is Scalar: + c_separator = <const string_scalar*>(separator.c_obj.get()) + with nogil: + c_result = move( + cpp_combine.join_list_elements( + lists_strings_column.view(), + dereference(c_separator), + dereference(c_separator_narep), + separate_nulls, + empty_list_policy, + ) + ) + else: + raise ValueError("separator must be a Column or a Scalar") + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyx b/python/pylibcudf/pylibcudf/strings/contains.pyx index 82bd1fbea32..d4b1130241d 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pyx +++ b/python/pylibcudf/pylibcudf/strings/contains.pyx @@ -38,10 +38,10 @@ cpdef Column contains_re( cdef unique_ptr[column] result with nogil: - result = move(cpp_contains.contains_re( + result = cpp_contains.contains_re( input.view(), prog.c_obj.get()[0] - )) + ) return Column.from_libcudf(move(result)) @@ -71,10 +71,10 @@ cpdef Column count_re( cdef unique_ptr[column] result with nogil: - result = move(cpp_contains.count_re( + result = cpp_contains.count_re( input.view(), prog.c_obj.get()[0] - )) + ) return Column.from_libcudf(move(result)) @@ -105,10 +105,10 @@ cpdef Column matches_re( cdef unique_ptr[column] result with nogil: - result = move(cpp_contains.matches_re( + result = cpp_contains.matches_re( input.view(), prog.c_obj.get()[0] - )) + ) return Column.from_libcudf(move(result)) @@ -149,19 +149,19 @@ cpdef Column like(Column input, ColumnOrScalar pattern, Scalar escape_character= if ColumnOrScalar is Column: with nogil: - result = move(cpp_contains.like( + result = cpp_contains.like( input.view(), pattern.view(), dereference(c_escape_character) - )) + ) elif ColumnOrScalar is Scalar: c_pattern = <const string_scalar*>(pattern.c_obj.get()) with nogil: - result = move(cpp_contains.like( + result = cpp_contains.like( input.view(), dereference(c_pattern), dereference(c_escape_character) - )) + ) else: raise ValueError("pattern must be a Column or a Scalar") diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index 175c9b3738e..8ba84ba7d50 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -12,7 +12,10 @@ # the License.
# ============================================================================= -set(cython_sources convert_durations.pyx convert_datetime.pyx) +set(cython_sources + convert_booleans.pyx convert_datetime.pyx convert_durations.pyx convert_fixed_point.pyx + convert_floats.pyx convert_integers.pyx convert_ipv4.pyx convert_lists.pyx convert_urls.pyx +) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index 05324cb49df..85300936e4d 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -1,2 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport convert_datetime, convert_durations +from . cimport ( + convert_booleans, + convert_datetime, + convert_durations, + convert_fixed_point, + convert_floats, + convert_integers, + convert_ipv4, + convert_lists, + convert_urls, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index d803399d53c..aa27a7c8929 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -1,2 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import convert_datetime, convert_durations +from . import ( + convert_booleans, + convert_datetime, + convert_durations, + convert_fixed_point, + convert_floats, + convert_integers, + convert_ipv4, + convert_lists, + convert_urls, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd new file mode 100644 index 00000000000..312ac3c0ca0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar + + +cpdef Column to_booleans(Column input, Scalar true_string) + +cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_string) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx new file mode 100644 index 00000000000..dc12b291b11 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx @@ -0,0 +1,87 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.convert cimport ( + convert_booleans as cpp_convert_booleans, +) +from pylibcudf.scalar cimport Scalar + +from cython.operator import dereference + + +cpdef Column to_booleans(Column input, Scalar true_string): + """ + Returns a new bool column by parsing boolean values from the strings + in the provided strings column. + + For details, see :cpp:func:`cudf::strings::to_booleans`. + + Parameters + ---------- + input : Column + Strings instance for this operation + + true_string : Scalar + String to expect for true. Non-matching strings are false + + Returns + ------- + Column + New bool column converted from strings. 
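The new boolean conversions keep libcudf's single-sentinel parsing model: only exact matches of `true_string` parse as True, and `from_booleans` (shown just below) reverses the mapping. A minimal usage sketch, assuming the usual `pylibcudf.interop.from_arrow` helper for building the column and scalars (the interop spelling is an assumption, not part of this diff):

```python
import pyarrow as pa
import pylibcudf as plc

s = plc.interop.from_arrow(pa.array(["true", "false", "maybe"]))
true_str = plc.interop.from_arrow(pa.scalar("true"))

# Only exact matches of true_string become True; "false" and "maybe" are False.
bools = plc.strings.convert.convert_booleans.to_booleans(s, true_str)

# Round-trip back to strings with explicit representations for both values.
false_str = plc.interop.from_arrow(pa.scalar("false"))
strs = plc.strings.convert.convert_booleans.from_booleans(bools, true_str, false_str)
```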
+ """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_true_string = ( + true_string.c_obj.get() + ) + + with nogil: + c_result = cpp_convert_booleans.to_booleans( + input.view(), + dereference(c_true_string) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_string): + """ + Returns a new strings column converting the boolean values from the + provided column into strings. + + For details, see :cpp:func:`cudf::strings::from_booleans`. + + Parameters + ---------- + booleans : Column + Boolean column to convert. + + true_string : Scalar + String to use for true in the output column. + + false_string : Scalar + String to use for false in the output column. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_true_string = ( + true_string.c_obj.get() + ) + cdef const string_scalar* c_false_string = ( + false_string.c_obj.get() + ) + + with nogil: + c_result = cpp_convert_booleans.from_booleans( + booleans.view(), + dereference(c_true_string), + dereference(c_false_string), + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd index 07c84d263d6..80ec168644b 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd @@ -8,11 +8,16 @@ from pylibcudf.types cimport DataType cpdef Column to_timestamps( Column input, DataType timestamp_type, - const string& format + str format ) cpdef Column from_timestamps( - Column input, - const string& format, + Column timestamps, + str format, Column input_strings_names ) + +cpdef Column is_timestamp( + Column input, + str format, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx index fcacb096f87..0ee60812e00 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx @@ -15,28 +15,74 @@ from pylibcudf.types import DataType cpdef Column to_timestamps( Column input, DataType timestamp_type, - const string& format + str format ): + """ + Returns a new timestamp column converting a strings column into + timestamps using the provided format pattern. + + For details, see cpp:`cudf::strings::to_timestamps`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + timestamp_type : DataType + The timestamp type used for creating the output column. + + format : str + String specifying the timestamp format in strings. + + Returns + ------- + Column + New datetime column + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.to_timestamps( input.view(), timestamp_type.c_obj, - format + c_format ) return Column.from_libcudf(move(c_result)) cpdef Column from_timestamps( - Column input, - const string& format, + Column timestamps, + str format, Column input_strings_names ): + """ + Returns a new strings column converting a timestamp column into + strings using the provided format pattern. + + For details, see cpp:`cudf::strings::from_timestamps`. + + Parameters + ---------- + timestamps : Column + Timestamp values to convert + + format : str + The string specifying output format. 
+ + input_strings_names : Column + The string names to use for weekdays ("%a", "%A") and months ("%b", "%B"). + + Returns + ------- + Column + New strings column with formatted timestamps. + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.from_timestamps( - input.view(), - format, + timestamps.view(), + c_format, input_strings_names.view() ) @@ -44,13 +90,33 @@ cpdef Column from_timestamps( cpdef Column is_timestamp( Column input, - const string& format + str format ): + """ + Verifies the given strings column can be parsed to timestamps + using the provided format pattern. + + For details, see :cpp:func:`cudf::strings::is_timestamp`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + format : str + String specifying the timestamp format in strings. + + Returns + ------- + Column + New bool column. + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.is_timestamp( input.view(), - format + c_format ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd index ac11b8959ed..eecdade4ef9 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd @@ -8,10 +8,10 @@ from pylibcudf.types cimport DataType cpdef Column to_durations( Column input, DataType duration_type, - const string& format + str format ) cpdef Column from_durations( - Column input, - const string& format + Column durations, + str format=* ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx index f3e0b7c9c8e..31980ace418 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx @@ -15,27 +15,76 @@ from pylibcudf.types import DataType cpdef Column to_durations( Column input, DataType duration_type, - const string& format + str format ): + """ + Returns a new duration column converting a strings column into + durations using the provided format pattern. + + For details, see :cpp:func:`cudf::strings::to_durations`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + duration_type : DataType + The duration type used for creating the output column. + + format : str + String specifying the duration format in strings. + + Returns + ------- + Column + New duration column. + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() + + with nogil: c_result = cpp_convert_durations.to_durations( input.view(), duration_type.c_obj, - format + c_format ) return Column.from_libcudf(move(c_result)) cpdef Column from_durations( - Column input, - const string& format + Column durations, + str format=None ): + """ + Returns a new strings column converting a duration column into + strings using the provided format pattern. + + For details, see :cpp:func:`cudf::strings::from_durations`. + + Parameters + ---------- + durations : Column + Duration values to convert. + + format : str + The string specifying output format. + Default format is "%D days %H:%M:%S". + + Returns + ------- + Column + New strings column with formatted durations.
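With the format argument now a Python `str` (encoded to a C++ `std::string` inside the binding), the datetime and duration converters can be driven directly from Python. A sketch, assuming Arrow interop via `pylibcudf.interop.from_arrow` and the usual `DataType`/`TypeId` spellings (both assumptions here):

```python
import pyarrow as pa
import pylibcudf as plc

dates = plc.interop.from_arrow(pa.array(["2024-10-09", "2024-01-31"]))
# Validate before parsing when the input is untrusted.
ok = plc.strings.convert.convert_datetime.is_timestamp(dates, "%Y-%m-%d")
ts = plc.strings.convert.convert_datetime.to_timestamps(
    dates, plc.DataType(plc.TypeId.TIMESTAMP_SECONDS), "%Y-%m-%d"
)

durs = plc.strings.convert.convert_durations.to_durations(
    plc.interop.from_arrow(pa.array(["0 days 01:23:45"])),
    plc.DataType(plc.TypeId.DURATION_SECONDS),
    "%D days %H:%M:%S",
)
# from_durations now defaults its format to "%D days %H:%M:%S".
back = plc.strings.convert.convert_durations.from_durations(durs)
```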
+ """ cdef unique_ptr[column] c_result + + if format is None: + format = "%D days %H:%M:%S" + cdef string c_format = format.encode() + with nogil: c_result = cpp_convert_durations.from_durations( - input.view(), - format + durations.view(), + c_format ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd new file mode 100644 index 00000000000..049b9b3fffe --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_fixed_point(Column input, DataType output_type) + +cpdef Column from_fixed_point(Column input) + +cpdef Column is_fixed_point(Column input, DataType decimal_type=*) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx new file mode 100644 index 00000000000..962a47dfadf --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx @@ -0,0 +1,99 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_fixed_point as cpp_fixed_point, +) +from pylibcudf.types cimport DataType, type_id + + +cpdef Column to_fixed_point(Column input, DataType output_type): + """ + Returns a new fixed-point column parsing decimal values from the + provided strings column. + + For details, see :cpp:func:`cudf::strings::to_fixed_point` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + output_type : DataType + Type of fixed-point column to return including the scale value. + + Returns + ------- + Column + New column of output_type. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_fixed_point.to_fixed_point( + input.view(), + output_type.c_obj, + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_fixed_point(Column input): + """ + Returns a new strings column converting the fixed-point values + into a strings column. + + For details, see :cpp:func:`cudf::strings::from_fixed_point` + + Parameters + ---------- + input : Column + Fixed-point column to convert. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_fixed_point.from_fixed_point(input.view()) + + return Column.from_libcudf(move(c_result)) + +cpdef Column is_fixed_point(Column input, DataType decimal_type=None): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to fixed-point. + + For details, see :cpp:func:`cudf::strings::is_fixed_point` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + decimal_type : DataType + Fixed-point type (with scale) used only for checking overflow. + Defaults to Decimal64 + + Returns + ------- + Column + New column of boolean results for each string. 
+ """ + cdef unique_ptr[column] c_result + + if decimal_type is None: + decimal_type = DataType(type_id.DECIMAL64) + + with nogil: + c_result = cpp_fixed_point.is_fixed_point( + input.view(), + decimal_type.c_obj, + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd new file mode 100644 index 00000000000..1284ff552aa --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_floats(Column strings, DataType output_type) + +cpdef Column from_floats(Column floats) + +cpdef Column is_float(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx new file mode 100644 index 00000000000..1296f4f9db5 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx @@ -0,0 +1,91 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_floats as cpp_convert_floats, +) +from pylibcudf.types cimport DataType + + +cpdef Column to_floats(Column strings, DataType output_type): + """ + Returns a new numeric column by parsing float values from each string + in the provided strings column. + + For details, see cpp:func:`cudf::strings::to_floats` + + Parameters + ---------- + strings : Column + Strings instance for this operation. + + output_type : DataType + Type of float numeric column to return. + + Returns + ------- + Column + New column with floats converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_floats.to_floats( + strings.view(), + output_type.c_obj, + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column from_floats(Column floats): + """ + Returns a new strings column converting the float values from the + provided column into strings. + + For details, see cpp:func:`cudf::strings::from_floats` + + Parameters + ---------- + floats : Column + Numeric column to convert. + + Returns + ------- + Column + New strings column with floats as strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_floats.from_floats(floats.view()) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_float(Column input): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to floats. + + For details, see cpp:func:`cudf::strings::is_float` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_floats.is_float(input.view()) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd new file mode 100644 index 00000000000..eff2e080c27 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_integers(Column input, DataType output_type) + +cpdef Column from_integers(Column integers) + +cpdef Column is_integer(Column input, DataType int_type=*) + +cpdef Column hex_to_integers(Column input, DataType output_type) + +cpdef Column is_hex(Column input) + +cpdef Column integers_to_hex(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx new file mode 100644 index 00000000000..5558683a502 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx @@ -0,0 +1,206 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_integers as cpp_convert_integers, +) +from pylibcudf.types cimport DataType + + +cpdef Column to_integers(Column input, DataType output_type): + """ + Returns a new integer numeric column parsing integer values from the + provided strings column. + + For details, cpp:func:`cudf::strings::to_integers`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + output_type : DataType + Type of integer numeric column to return. + + Returns + ------- + Column + New column with integers converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.to_integers( + input.view(), + output_type.c_obj + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column from_integers(Column integers): + """ + Returns a new strings column converting the integer values from the + provided column into strings. + + For details, cpp:func:`cudf::strings::from_integers`. + + Parameters + ---------- + integers : Column + Strings instance for this operation. + + Returns + ------- + Column + New strings column with integers as strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.from_integers( + integers.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_integer(Column input, DataType int_type=None): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to integers. + + For details, cpp:func:`cudf::strings::is_integer`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + int_type : DataType + Integer type used for checking underflow and overflow. + By default, does not check an integer type for underflow + or overflow. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + if int_type is None: + with nogil: + c_result = move( + cpp_convert_integers.is_integer( + input.view(), + ) + ) + else: + with nogil: + c_result = move( + cpp_convert_integers.is_integer( + input.view(), + int_type.c_obj + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column hex_to_integers(Column input, DataType output_type): + """ + Returns a new integer numeric column parsing hexadecimal values + from the provided strings column. + + For details, cpp:func:`cudf::strings::hex_to_integers`. + + Parameters + ---------- + input : Column + Strings instance for this operation. 
+ + output_type : DataType + Type of integer numeric column to return. + + Returns + ------- + Column + New column with integers converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.hex_to_integers( + input.view(), + output_type.c_obj + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_hex(Column input): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to integers from hex. + + For details, cpp:func:`cudf::strings::is_hex`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.is_hex( + input.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column integers_to_hex(Column input): + """ + Returns a new strings column converting integer columns to hexadecimal + characters. + + For details, cpp:func:`cudf::strings::integers_to_hex`. + + Parameters + ---------- + input : Column + Integer column to convert to hex. + + Returns + ------- + Column + New strings column with hexadecimal characters. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.integers_to_hex( + input.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd new file mode 100644 index 00000000000..c61f5c0bdca --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column ipv4_to_integers(Column input) + +cpdef Column integers_to_ipv4(Column integers) + +cpdef Column is_ipv4(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx new file mode 100644 index 00000000000..834781f95f3 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx @@ -0,0 +1,80 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport convert_ipv4 as cpp_convert_ipv4 + + +cpdef Column ipv4_to_integers(Column input): + """ + Converts IPv4 addresses into integers. + + For details, see cpp:func:`cudf::strings::ipv4_to_integers` + + Parameters + ---------- + input : Column + Strings instance for this operation + + Returns + ------- + Column + New uint32 column converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_ipv4.ipv4_to_integers(input.view()) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column integers_to_ipv4(Column integers): + """ + Converts integers into IPv4 addresses as strings. + + For details, see cpp:func:`cudf::strings::integers_to_ipv4` + + Parameters + ---------- + integers : Column + Integer (uint32) column to convert. + + Returns + ------- + Column + New strings column. 
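The integer converters also gain hexadecimal support, parsed without any `0x` prefix convention; a sketch (interop helpers assumed as before):

```python
import pyarrow as pa
import pylibcudf as plc

h = plc.interop.from_arrow(pa.array(["1A", "ff", "cafe"]))

# is_hex validates, hex_to_integers parses (no "0x" prefix expected),
# and integers_to_hex renders the values back out as hex strings.
mask = plc.strings.convert.convert_integers.is_hex(h)
ints = plc.strings.convert.convert_integers.hex_to_integers(
    h, plc.DataType(plc.TypeId.INT64)
)
hex_back = plc.strings.convert.convert_integers.integers_to_hex(ints)
```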
+ """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_ipv4.integers_to_ipv4(integers.view()) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_ipv4(Column input): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to integers from IPv4 format. + + For details, see cpp:func:`cudf::strings::is_ipv4` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_ipv4.is_ipv4(input.view()) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd new file mode 100644 index 00000000000..1ba4272afa2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar + + +cpdef Column format_list_column( + Column input, + Scalar na_rep=*, + Column separators=* +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx new file mode 100644 index 00000000000..cbfe5f5aa8b --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx @@ -0,0 +1,70 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.column_factories cimport make_empty_column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.strings.convert cimport ( + convert_lists as cpp_convert_lists, +) +from pylibcudf.scalar cimport Scalar +from pylibcudf.types cimport type_id + +from cython.operator import dereference + + +cpdef Column format_list_column( + Column input, + Scalar na_rep=None, + Column separators=None +): + """ + Convert a list column of strings into a formatted strings column. + + For details, see :cpp:func`cudf::strings::format_list_column` + + Parameters + ---------- + input : Column + Lists column to format + + na_rep : Scalar + Replacement string for null elements. + Default, empty string + + separators : Column + Strings to use for enclosing list components and separating elements. + Default, ``,``, ``[``, ``]`` + + Returns + ------- + Column + New strings column + """ + cdef unique_ptr[column] c_result + + if na_rep is None: + na_rep = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + cdef const string_scalar* c_na_rep = ( + na_rep.c_obj.get() + ) + + if separators is None: + separators = make_empty_column(type_id.STRING) + + with nogil: + c_result = cpp_convert_lists.format_list_column( + input.view(), + dereference(c_na_rep), + separators.view() + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd new file mode 100644 index 00000000000..da05ce93426 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column cimport Column + + +cpdef Column url_encode(Column input) + +cpdef Column url_decode(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx new file mode 100644 index 00000000000..82f8a75f1d9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport convert_urls as cpp_convert_urls + + +cpdef Column url_encode(Column input): + """ + Encodes each string using URL encoding. + + For details, see :cpp:func:`cudf::strings::url_encode` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_urls.url_encode(input.view()) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column url_decode(Column input): + """ + Decodes each string from URL encoding. + + For details, see :cpp:func:`cudf::strings::url_decode` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_urls.url_decode(input.view()) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyx b/python/pylibcudf/pylibcudf/strings/extract.pyx index dcb11ca10ce..b56eccc8287 100644 --- a/python/pylibcudf/pylibcudf/strings/extract.pyx +++ b/python/pylibcudf/pylibcudf/strings/extract.pyx @@ -33,11 +33,9 @@ cpdef Table extract(Column input, RegexProgram prog): cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_extract.extract( - input.view(), - prog.c_obj.get()[0] - ) + c_result = cpp_extract.extract( + input.view(), + prog.c_obj.get()[0] ) return Table.from_libcudf(move(c_result)) @@ -66,11 +64,9 @@ cpdef Column extract_all_record(Column input, RegexProgram prog): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_extract.extract_all_record( - input.view(), - prog.c_obj.get()[0] - ) + c_result = cpp_extract.extract_all_record( + input.view(), + prog.c_obj.get()[0] ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/find.pyx b/python/pylibcudf/pylibcudf/strings/find.pyx index 22d370bf7e8..6fc6dca24fd 100644 --- a/python/pylibcudf/pylibcudf/strings/find.pyx +++ b/python/pylibcudf/pylibcudf/strings/find.pyx @@ -50,22 +50,18 @@ cpdef Column find( cdef unique_ptr[column] result if ColumnOrScalar is Column: with nogil: - result = move( - cpp_find.find( - input.view(), - target.view(), - start - ) + result = cpp_find.find( + input.view(), + target.view(), + start ) elif ColumnOrScalar is Scalar: with nogil: - result = move( - cpp_find.find( - input.view(), - dereference((target.c_obj.get())), - start, - stop - ) + result = cpp_find.find( + input.view(), + dereference((target.c_obj.get())), + start, + stop ) else: raise ValueError(f"Invalid target {target}") @@ -104,13 +100,11 @@ cpdef Column rfind( """ cdef unique_ptr[column] result with nogil: - result = move( - cpp_find.rfind( - input.view(), - dereference((target.c_obj.get())), - start, - stop - ) + result = cpp_find.rfind( + input.view(), +
dereference((target.c_obj.get())), + start, + stop ) return Column.from_libcudf(move(result)) @@ -149,19 +143,15 @@ cpdef Column contains( cdef unique_ptr[column] result if ColumnOrScalar is Column: with nogil: - result = move( - cpp_find.contains( - input.view(), - target.view() - ) + result = cpp_find.contains( + input.view(), + target.view() ) elif ColumnOrScalar is Scalar: with nogil: - result = move( - cpp_find.contains( - input.view(), - dereference((target.c_obj.get())) - ) + result = cpp_find.contains( + input.view(), + dereference((target.c_obj.get())) ) else: raise ValueError(f"Invalid target {target}") @@ -204,19 +194,15 @@ cpdef Column starts_with( if ColumnOrScalar is Column: with nogil: - result = move( - cpp_find.starts_with( - input.view(), - target.view() - ) + result = cpp_find.starts_with( + input.view(), + target.view() ) elif ColumnOrScalar is Scalar: with nogil: - result = move( - cpp_find.starts_with( - input.view(), - dereference((target.c_obj.get())) - ) + result = cpp_find.starts_with( + input.view(), + dereference((target.c_obj.get())) ) else: raise ValueError(f"Invalid target {target}") @@ -256,19 +242,15 @@ cpdef Column ends_with( cdef unique_ptr[column] result if ColumnOrScalar is Column: with nogil: - result = move( - cpp_find.ends_with( - input.view(), - target.view() - ) + result = cpp_find.ends_with( + input.view(), + target.view() ) elif ColumnOrScalar is Scalar: with nogil: - result = move( - cpp_find.ends_with( - input.view(), - dereference((target.c_obj.get())) - ) + result = cpp_find.ends_with( + input.view(), + dereference((target.c_obj.get())) ) else: raise ValueError(f"Invalid target {target}") diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/strings/find_multiple.pxd new file mode 100644 index 00000000000..b7b3aefa336 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pxd @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column find_multiple(Column input, Column targets) diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx new file mode 100644 index 00000000000..672aa606bd0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx @@ -0,0 +1,37 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport find_multiple as cpp_find_multiple + + +cpdef Column find_multiple(Column input, Column targets): + """ + Returns a lists column with character position values where each + of the target strings are found in each string. + + For details, see :cpp:func:`cudf::strings::find_multiple`. 
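Most of the churn across extract.pyx, find.pyx, and the other touched bindings is one mechanical cleanup: under Cython 3, assigning a C++ function's return value to a typed local already compiles to an rvalue (move) assignment, so wrapping the call itself in `move()` was redundant. A self-contained sketch of the idiom, with a hypothetical `make_widget` API standing in for the libcudf calls:

```cython
# distutils: language = c++
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

cdef extern from "widget.hpp" nogil:  # hypothetical header
    cdef cppclass widget
    unique_ptr[widget] make_widget()

cdef unique_ptr[widget] w
with nogil:
    # The call expression is a temporary, so Cython move-assigns it;
    # the old `w = move(make_widget())` spelling added nothing.
    w = make_widget()

# An explicit move() is still required to transfer ownership out of a
# named local, which is why `move(c_result)` survives in every binding.
cdef unique_ptr[widget] w2 = move(w)
```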
+ + Parameters + ---------- + input : Column + Strings instance for this operation + targets : Column + Strings to search for in each string + + Returns + ------- + Column + Lists column with character position values + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_find_multiple.find_multiple( + input.view(), + targets.view() + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/strings/findall.pxd index 54afa088141..3c35a9c9aa9 100644 --- a/python/pylibcudf/pylibcudf/strings/findall.pxd +++ b/python/pylibcudf/pylibcudf/strings/findall.pxd @@ -4,4 +4,5 @@ from pylibcudf.column cimport Column from pylibcudf.strings.regex_program cimport RegexProgram +cpdef Column find_re(Column input, RegexProgram pattern) cpdef Column findall(Column input, RegexProgram pattern) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyx b/python/pylibcudf/pylibcudf/strings/findall.pyx index 3a6b87504b3..89fa4302824 100644 --- a/python/pylibcudf/pylibcudf/strings/findall.pyx +++ b/python/pylibcudf/pylibcudf/strings/findall.pyx @@ -30,11 +30,39 @@ cpdef Column findall(Column input, RegexProgram pattern): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_findall.findall( - input.view(), - pattern.c_obj.get()[0] - ) + c_result = cpp_findall.findall( + input.view(), + pattern.c_obj.get()[0] + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column find_re(Column input, RegexProgram pattern): + """ + Returns character positions where the pattern first matches + the elements in input strings. + + For details, see :cpp:func:`cudf::strings::find_re` + + Parameters + ---------- + input : Column + Strings instance for this operation + pattern : RegexProgram + Regex pattern + + Returns + ------- + Column + New column of integers + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_findall.find_re( + input.view(), + pattern.c_obj.get()[0] ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/padding.pxd b/python/pylibcudf/pylibcudf/strings/padding.pxd new file mode 100644 index 00000000000..a035a5ad187 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/padding.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.string cimport string +from pylibcudf.column cimport Column +from pylibcudf.libcudf.strings.side_type cimport side_type +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column pad(Column input, size_type width, side_type side, str fill_char) + +cpdef Column zfill(Column input, size_type width) diff --git a/python/pylibcudf/pylibcudf/strings/padding.pyx b/python/pylibcudf/pylibcudf/strings/padding.pyx new file mode 100644 index 00000000000..f6950eecf60 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/padding.pyx @@ -0,0 +1,71 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport padding as cpp_padding +from pylibcudf.libcudf.strings.side_type cimport side_type + + +cpdef Column pad(Column input, size_type width, side_type side, str fill_char): + """ + Add padding to each string using a provided character. + + For details, see :cpp:func:`cudf::strings::pad`. 
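`find_re` rounds out the regex family: where `findall` materializes every match per row as a lists column, `find_re` returns only the position of the first match. A sketch; the `RegexProgram.create(pattern, flags)` factory is assumed to be pylibcudf's existing spelling:

```python
import pyarrow as pa
import pylibcudf as plc

s = plc.interop.from_arrow(pa.array(["a1b22", "no digits"]))
prog = plc.strings.regex_program.RegexProgram.create(
    r"\d+", plc.strings.regex_flags.RegexFlags.DEFAULT
)

matches = plc.strings.findall.findall(s, prog)  # lists column: [["1", "22"], []]
first = plc.strings.findall.find_re(s, prog)    # first match position per row
```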
+ + Parameters + ---------- + input : Column + Strings instance for this operation + width : int + The minimum number of characters for each string. + side : SideType + Where to place the padding characters. + fill_char : str + Single UTF-8 character to use for padding + + Returns + ------- + Column + New column with padded strings. + """ + cdef unique_ptr[column] c_result + cdef string c_fill_char = fill_char.encode("utf-8") + + with nogil: + c_result = cpp_padding.pad( + input.view(), + width, + side, + c_fill_char, + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column zfill(Column input, size_type width): + """ + Add '0' as padding to the left of each string. + + For details, see :cpp:func:`cudf::strings::zfill`. + + Parameters + ---------- + input : Column + Strings instance for this operation + width : int + The minimum number of characters for each string. + + Returns + ------- + Column + New column of strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_padding.zfill( + input.view(), + width, + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyx b/python/pylibcudf/pylibcudf/strings/repeat.pyx index 5f627218f6e..fb2bb13c666 100644 --- a/python/pylibcudf/pylibcudf/strings/repeat.pyx +++ b/python/pylibcudf/pylibcudf/strings/repeat.pyx @@ -31,19 +31,15 @@ cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times): if ColumnorSizeType is Column: with nogil: - c_result = move( - cpp_repeat.repeat_strings( - input.view(), - repeat_times.view() - ) + c_result = cpp_repeat.repeat_strings( + input.view(), + repeat_times.view() ) elif ColumnorSizeType is size_type: with nogil: - c_result = move( - cpp_repeat.repeat_strings( - input.view(), - repeat_times - ) + c_result = cpp_repeat.repeat_strings( + input.view(), + repeat_times ) else: raise ValueError("repeat_times must be size_type or integer") diff --git a/python/pylibcudf/pylibcudf/strings/replace.pyx b/python/pylibcudf/pylibcudf/strings/replace.pyx index 9d0ebf4a814..6db7f04fcbb 100644 --- a/python/pylibcudf/pylibcudf/strings/replace.pyx +++ b/python/pylibcudf/pylibcudf/strings/replace.pyx @@ -55,12 +55,12 @@ cpdef Column replace( repl_str = (repl.c_obj.get()) with nogil: - c_result = move(cpp_replace( + c_result = cpp_replace( input.view(), target_str[0], repl_str[0], maxrepl, - )) + ) return Column.from_libcudf(move(c_result)) @@ -98,11 +98,11 @@ cpdef Column replace_multiple( cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_replace_multiple( + c_result = cpp_replace_multiple( input.view(), target.view(), repl.view(), - )) + ) return Column.from_libcudf(move(c_result)) @@ -151,11 +151,11 @@ cpdef Column replace_slice( cdef const string_scalar* scalar_str = (repl.c_obj.get()) with nogil: - c_result = move(cpp_replace_slice( + c_result = cpp_replace_slice( input.view(), scalar_str[0], start, stop - )) + ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pxd b/python/pylibcudf/pylibcudf/strings/replace_re.pxd new file mode 100644 index 00000000000..e27ccd55f7d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/replace_re.pxd @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
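A short sketch of the new padding entry points, which mirror `str.ljust`/`str.rjust`/`str.zfill` (interop helpers assumed as before):

```python
import pyarrow as pa
import pylibcudf as plc

s = plc.interop.from_arrow(pa.array(["7", "42"]))

# Pad on the left with '*' up to width 5, or zero-fill like str.zfill.
padded = plc.strings.padding.pad(s, 5, plc.strings.side_type.SideType.LEFT, "*")
zeroed = plc.strings.padding.zfill(s, 5)  # ["00007", "00042"]
```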
+ +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.regex_flags cimport regex_flags +from pylibcudf.strings.regex_program cimport RegexProgram + +ctypedef fused Replacement: + Column + Scalar + +ctypedef fused Patterns: + RegexProgram + list + + +cpdef Column replace_re( + Column input, + Patterns patterns, + Replacement replacement=*, + size_type max_replace_count=*, + regex_flags flags=* +) + +cpdef Column replace_with_backrefs( + Column input, + RegexProgram prog, + str replacement +) diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pyx b/python/pylibcudf/pylibcudf/strings/replace_re.pyx new file mode 100644 index 00000000000..ccc33fd4425 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/replace_re.pyx @@ -0,0 +1,134 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from cython.operator cimport dereference +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.strings cimport replace_re as cpp_replace_re +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.regex_flags cimport regex_flags +from pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column replace_re( + Column input, + Patterns patterns, + Replacement replacement=None, + size_type max_replace_count=-1, + regex_flags flags=regex_flags.DEFAULT, +): + """ + For each string, replaces any character sequence matching the given patterns + with the provided replacement. + + For details, see :cpp:func:`cudf::strings::replace_re` + + Parameters + ---------- + input : Column + Strings instance for this operation. + patterns: RegexProgram or list[str] + If RegexProgram, the regex to match to each string. + If list[str], a list of regex strings to search within each string. + replacement : Scalar or Column + If Scalar, the string used to replace the matched sequence in each string. + ``patterns`` must be a RegexProgram. + If Column, the strings used for replacement. + ``patterns`` must be a list[str]. + max_replace_count : int + The maximum number of times to replace the matched pattern + within each string. ``patterns`` must be a RegexProgram. + Default replaces every substring that is matched. + flags : RegexFlags + Regex flags for interpreting special characters in the patterns. 
+ ``patterns`` must be a list[str] + + Returns + ------- + Column + New strings column + """ + cdef unique_ptr[column] c_result + cdef vector[string] c_patterns + + if Patterns is RegexProgram and Replacement is Scalar: + if replacement is None: + replacement = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + with nogil: + c_result = move( + cpp_replace_re.replace_re( + input.view(), + patterns.c_obj.get()[0], + dereference((replacement.get())), + max_replace_count + ) + ) + + return Column.from_libcudf(move(c_result)) + elif Patterns is list and Replacement is Column: + c_patterns.reserve(len(patterns)) + for pattern in patterns: + c_patterns.push_back(pattern.encode()) + + with nogil: + c_result = move( + cpp_replace_re.replace_re( + input.view(), + c_patterns, + replacement.view(), + flags, + ) + ) + + return Column.from_libcudf(move(c_result)) + else: + raise TypeError("Must pass either a RegexProgram and a Scalar or a list") + + +cpdef Column replace_with_backrefs( + Column input, + RegexProgram prog, + str replacement +): + """ + For each string, replaces any character sequence matching the given regex + using the replacement template for back-references. + + For details, see :cpp:func:`cudf::strings::replace_with_backrefs` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + prog: RegexProgram + Regex program instance. + + replacement : str + The replacement template for creating the output string. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + cdef string c_replacement = replacement.encode() + + with nogil: + c_result = cpp_replace_re.replace_with_backrefs( + input.view(), + prog.c_obj.get()[0], + c_replacement, + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/strings/side_type.pxd index 34b7a580380..34b03e9bc27 100644 --- a/python/pylibcudf/pylibcudf/strings/side_type.pxd +++ b/python/pylibcudf/pylibcudf/strings/side_type.pxd @@ -1,3 +1,2 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - from pylibcudf.libcudf.strings.side_type cimport side_type diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/strings/side_type.pyx index acdc7d6ff1f..cf0c770cc11 100644 --- a/python/pylibcudf/pylibcudf/strings/side_type.pyx +++ b/python/pylibcudf/pylibcudf/strings/side_type.pyx @@ -1,4 +1,3 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - from pylibcudf.libcudf.strings.side_type import \ side_type as SideType # no-cython-lint diff --git a/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt new file mode 100644 index 00000000000..8f544f6f537 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +set(cython_sources partition.pyx split.pyx) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf +) diff --git a/python/cudf_kafka/cudf_kafka/tests/pytest.ini b/python/pylibcudf/pylibcudf/strings/split/__init__.pxd similarity index 56% rename from python/cudf_kafka/cudf_kafka/tests/pytest.ini rename to python/pylibcudf/pylibcudf/strings/split/__init__.pxd index 7b0a9f29fb1..72086e57d9f 100644 --- a/python/cudf_kafka/cudf_kafka/tests/pytest.ini +++ b/python/pylibcudf/pylibcudf/strings/split/__init__.pxd @@ -1,4 +1,2 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - -[pytest] -addopts = --tb=native +from . cimport partition, split diff --git a/python/cudf_polars/tests/pytest.ini b/python/pylibcudf/pylibcudf/strings/split/__init__.py similarity index 56% rename from python/cudf_polars/tests/pytest.ini rename to python/pylibcudf/pylibcudf/strings/split/__init__.py index 7b0a9f29fb1..2033e5e275b 100644 --- a/python/cudf_polars/tests/pytest.ini +++ b/python/pylibcudf/pylibcudf/strings/split/__init__.py @@ -1,4 +1,2 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - -[pytest] -addopts = --tb=native +from . import partition, split diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/strings/split/partition.pxd new file mode 100644 index 00000000000..c18257a4787 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + + +cpdef Table partition(Column input, Scalar delimiter=*) + +cpdef Table rpartition(Column input, Scalar delimiter=*) diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyx b/python/pylibcudf/pylibcudf/strings/split/partition.pyx new file mode 100644 index 00000000000..0fb4f186c41 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyx @@ -0,0 +1,91 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.strings.split cimport partition as cpp_partition +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + +from cython.operator import dereference + + +cpdef Table partition(Column input, Scalar delimiter=None): + """ + Returns a set of 3 columns by splitting each string using the + specified delimiter. + + For details, see :cpp:func:`cudf::strings::partition`. + + Parameters + ---------- + input : Column + Strings instance for this operation + + delimiter : Scalar + UTF-8 encoded string indicating where to split each string. 
+
+    Returns
+    -------
+    Table
+        New table of strings columns
+    """
+    cdef unique_ptr[table] c_result
+    cdef const string_scalar* c_delimiter
+
+    if delimiter is None:
+        delimiter = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+
+    c_delimiter = <const string_scalar*>(delimiter.c_obj.get())
+
+    with nogil:
+        c_result = cpp_partition.partition(
+            input.view(),
+            dereference(c_delimiter)
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+cpdef Table rpartition(Column input, Scalar delimiter=None):
+    """
+    Returns a set of 3 columns by splitting each string using the
+    specified delimiter starting from the end of each string.
+
+    For details, see :cpp:func:`cudf::strings::rpartition`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+
+    delimiter : Scalar
+        UTF-8 encoded string indicating where to split each string.
+
+    Returns
+    -------
+    Table
+        New table of strings columns
+    """
+    cdef unique_ptr[table] c_result
+    cdef const string_scalar* c_delimiter
+
+    if delimiter is None:
+        delimiter = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+
+    c_delimiter = <const string_scalar*>(delimiter.c_obj.get())
+
+    with nogil:
+        c_result = cpp_partition.rpartition(
+            input.view(),
+            dereference(c_delimiter)
+        )
+
+    return Table.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/strings/split/split.pxd
new file mode 100644
index 00000000000..355a1874298
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/split.pxd
@@ -0,0 +1,24 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.strings.regex_program cimport RegexProgram
+from pylibcudf.table cimport Table
+
+
+cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit)
+
+cpdef Table rsplit(Column strings_column, Scalar delimiter, size_type maxsplit)
+
+cpdef Column split_record(Column strings, Scalar delimiter, size_type maxsplit)
+
+cpdef Column rsplit_record(Column strings, Scalar delimiter, size_type maxsplit)
+
+cpdef Table split_re(Column input, RegexProgram prog, size_type maxsplit)
+
+cpdef Table rsplit_re(Column input, RegexProgram prog, size_type maxsplit)
+
+cpdef Column split_record_re(Column input, RegexProgram prog, size_type maxsplit)
+
+cpdef Column rsplit_record_re(Column input, RegexProgram prog, size_type maxsplit)
diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyx b/python/pylibcudf/pylibcudf/strings/split/split.pyx
new file mode 100644
index 00000000000..e3827f6645e
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/split.pyx
@@ -0,0 +1,310 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.split cimport split as cpp_split
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.strings.regex_program cimport RegexProgram
+from pylibcudf.table cimport Table
+
+from cython.operator import dereference
+
+
+cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit):
+    """
+    Returns a list of columns by splitting each string using the
+    specified delimiter.
+
+    For details, see :cpp:func:`cudf::strings::split`.
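+
+    As an illustrative sketch of the expected behavior (not captured
+    output): splitting ``"a b c"`` on the delimiter ``" "`` with
+    ``maxsplit=1`` yields the two column values ``"a"`` and ``"b c"``.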
+
+    Parameters
+    ----------
+    strings_column : Column
+        Strings instance for this operation
+
+    delimiter : Scalar
+        UTF-8 encoded string indicating the split points in each string.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Table
+        New table of strings columns
+    """
+    cdef unique_ptr[table] c_result
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    with nogil:
+        c_result = cpp_split.split(
+            strings_column.view(),
+            dereference(c_delimiter),
+            maxsplit,
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+
+cpdef Table rsplit(Column strings_column, Scalar delimiter, size_type maxsplit):
+    """
+    Returns a list of columns by splitting each string using the
+    specified delimiter starting from the end of each string.
+
+    For details, see :cpp:func:`cudf::strings::rsplit`.
+
+    Parameters
+    ----------
+    strings_column : Column
+        Strings instance for this operation
+
+    delimiter : Scalar
+        UTF-8 encoded string indicating the split points in each string.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Table
+        New table of strings columns.
+    """
+    cdef unique_ptr[table] c_result
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    with nogil:
+        c_result = cpp_split.rsplit(
+            strings_column.view(),
+            dereference(c_delimiter),
+            maxsplit,
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+cpdef Column split_record(Column strings, Scalar delimiter, size_type maxsplit):
+    """
+    Splits individual strings elements into a list of strings.
+
+    For details, see :cpp:func:`cudf::strings::split_record`.
+
+    Parameters
+    ----------
+    strings : Column
+        A column of string elements to be split.
+
+    delimiter : Scalar
+        The string to identify split points in each string.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Column
+        Lists column of strings.
+    """
+    cdef unique_ptr[column] c_result
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    with nogil:
+        c_result = cpp_split.split_record(
+            strings.view(),
+            dereference(c_delimiter),
+            maxsplit,
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column rsplit_record(Column strings, Scalar delimiter, size_type maxsplit):
+    """
+    Splits individual strings elements into a list of strings starting
+    from the end of each string.
+
+    For details, see :cpp:func:`cudf::strings::rsplit_record`.
+
+    Parameters
+    ----------
+    strings : Column
+        A column of string elements to be split.
+
+    delimiter : Scalar
+        The string to identify split points in each string.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Column
+        Lists column of strings.
+    """
+    cdef unique_ptr[column] c_result
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    with nogil:
+        c_result = cpp_split.rsplit_record(
+            strings.view(),
+            dereference(c_delimiter),
+            maxsplit,
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Table split_re(Column input, RegexProgram prog, size_type maxsplit):
+    """
+    Splits strings elements into a table of strings columns
+    using a regex_program's pattern to delimit each string.
+
+    For details, see :cpp:func:`cudf::strings::split_re`.
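+
+    For example (a sketch of the expected behavior): with the pattern
+    ``"[_,]"``, the single string ``"a_b,c"`` splits into the three
+    column values ``"a"``, ``"b"`` and ``"c"``.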
+ + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Table + A table of columns of strings. + """ + cdef unique_ptr[table] c_result + + with nogil: + c_result = cpp_split.split_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + + return Table.from_libcudf(move(c_result)) + +cpdef Table rsplit_re(Column input, RegexProgram prog, size_type maxsplit): + """ + Splits strings elements into a table of strings columns + using a regex_program's pattern to delimit each string starting from + the end of the string. + + For details, see :cpp:func:`cudf::strings::rsplit_re`. + + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Table + A table of columns of strings. + """ + cdef unique_ptr[table] c_result + + with nogil: + c_result = cpp_split.rsplit_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + + return Table.from_libcudf(move(c_result)) + +cpdef Column split_record_re(Column input, RegexProgram prog, size_type maxsplit): + """ + Splits strings elements into a list column of strings using the given + regex_program to delimit each string. + + For details, see :cpp:func:`cudf::strings::split_record_re`. + + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Column + Lists column of strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_split.split_record_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column rsplit_record_re(Column input, RegexProgram prog, size_type maxsplit): + """ + Splits strings elements into a list column of strings using the given + regex_program to delimit each string starting from the end of the string. + + For details, see :cpp:func:`cudf::strings::rsplit_record_re`. + + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Column + Lists column of strings. 
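+
+    Examples
+    --------
+    A minimal usage sketch (assuming round-tripping through
+    ``plc.interop`` and pyarrow; the result is shown as a host list):
+
+    >>> import pyarrow as pa
+    >>> import pylibcudf as plc
+    >>> col = plc.interop.from_arrow(pa.array(["a1b2c3"]))
+    >>> prog = plc.strings.regex_program.RegexProgram.create(
+    ...     "[0-9]", plc.strings.regex_flags.RegexFlags.DEFAULT
+    ... )
+    >>> result = plc.strings.split.split.rsplit_record_re(col, prog, 1)
+    >>> plc.interop.to_arrow(result).to_pylist()
+    [['a1b2c', '']]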
+ """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_split.rsplit_record_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyx b/python/pylibcudf/pylibcudf/strings/translate.pyx index a62c7ec4528..d85da8e6cdd 100644 --- a/python/pylibcudf/pylibcudf/strings/translate.pyx +++ b/python/pylibcudf/pylibcudf/strings/translate.pyx @@ -62,11 +62,9 @@ cpdef Column translate(Column input, dict chars_table): ) with nogil: - c_result = move( - cpp_translate.translate( - input.view(), - c_chars_table - ) + c_result = cpp_translate.translate( + input.view(), + c_chars_table ) return Column.from_libcudf(move(c_result)) @@ -111,12 +109,10 @@ cpdef Column filter_characters( ) with nogil: - c_result = move( - cpp_translate.filter_characters( - input.view(), - c_characters_to_filter, - keep_characters, - dereference(c_replacement), - ) + c_result = cpp_translate.filter_characters( + input.view(), + c_characters_to_filter, + keep_characters, + dereference(c_replacement), ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pxd b/python/pylibcudf/pylibcudf/strings/wrap.pxd new file mode 100644 index 00000000000..fcc86650acf --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/wrap.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column wrap(Column input, size_type width) diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pyx b/python/pylibcudf/pylibcudf/strings/wrap.pyx new file mode 100644 index 00000000000..2ced250f837 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/wrap.pyx @@ -0,0 +1,40 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport wrap as cpp_wrap +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column wrap(Column input, size_type width): + """ + Wraps strings onto multiple lines shorter than `width` by + replacing appropriate white space with + new-line characters (ASCII 0x0A). + + For details, see :cpp:func:`cudf::strings::wrap`. + + Parameters + ---------- + input : Column + String column + + width : int + Maximum character width of a line within each string + + Returns + ------- + Column + Column of wrapped strings + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_wrap.wrap( + input.view(), + width, + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/table.pyx b/python/pylibcudf/pylibcudf/table.pyx index 5f77b89a605..d0d6f2343d0 100644 --- a/python/pylibcudf/pylibcudf/table.pyx +++ b/python/pylibcudf/pylibcudf/table.pyx @@ -49,9 +49,7 @@ cdef class Table: calling libcudf algorithms, and should generally not be needed by users (even direct pylibcudf Cython users). 
""" - cdef vector[unique_ptr[column]] c_columns = move( - dereference(libcudf_tbl).release() - ) + cdef vector[unique_ptr[column]] c_columns = dereference(libcudf_tbl).release() cdef vector[unique_ptr[column]].size_type i return Table([ diff --git a/python/pylibcudf/pylibcudf/tests/common/utils.py b/python/pylibcudf/pylibcudf/tests/common/utils.py index 9f389fa42c4..d95849ef371 100644 --- a/python/pylibcudf/pylibcudf/tests/common/utils.py +++ b/python/pylibcudf/pylibcudf/tests/common/utils.py @@ -7,10 +7,11 @@ import numpy as np import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from pyarrow.orc import write_table as orc_write_table from pyarrow.parquet import write_table as pq_write_table + +import pylibcudf as plc from pylibcudf.io.types import CompressionType diff --git a/python/pylibcudf/pylibcudf/tests/conftest.py b/python/pylibcudf/pylibcudf/tests/conftest.py index fdce6f353ca..a19a8835498 100644 --- a/python/pylibcudf/pylibcudf/tests/conftest.py +++ b/python/pylibcudf/pylibcudf/tests/conftest.py @@ -8,8 +8,9 @@ import numpy as np import pyarrow as pa -import pylibcudf as plc import pytest + +import pylibcudf as plc from pylibcudf.io.types import CompressionType sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_avro.py b/python/pylibcudf/pylibcudf/tests/io/test_avro.py index 0cd5064a697..3d9d99ffa61 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_avro.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_avro.py @@ -5,10 +5,11 @@ import fastavro import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_table_and_meta_eq +import pylibcudf as plc + avro_dtype_pairs = [ ("boolean", pa.bool_()), ("int", pa.int32()), diff --git a/python/pylibcudf/pylibcudf/tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py index ab26f23418d..22c83acc47c 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_csv.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py @@ -5,9 +5,7 @@ import pandas as pd import pyarrow as pa -import pylibcudf as plc import pytest -from pylibcudf.io.types import CompressionType from utils import ( _convert_types, assert_table_and_meta_eq, @@ -15,6 +13,9 @@ write_source_str, ) +import pylibcudf as plc +from pylibcudf.io.types import CompressionType + # Shared kwargs to pass to make_source _COMMON_CSV_SOURCE_KWARGS = { "format": "csv", diff --git a/python/pylibcudf/pylibcudf/tests/io/test_json.py b/python/pylibcudf/pylibcudf/tests/io/test_json.py index 9d976fedf00..453e5ce32a8 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_json.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_json.py @@ -3,9 +3,7 @@ import pandas as pd import pyarrow as pa -import pylibcudf as plc import pytest -from pylibcudf.io.types import CompressionType from utils import ( assert_table_and_meta_eq, make_source, @@ -13,6 +11,9 @@ write_source_str, ) +import pylibcudf as plc +from pylibcudf.io.types import CompressionType + # Shared kwargs to pass to make_source _COMMON_JSON_SOURCE_KWARGS = {"format": "json", "orient": "records"} diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py index 42b14b1feff..5ed660ba6cf 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_orc.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from utils import _convert_types, assert_table_and_meta_eq, make_source +import pylibcudf as plc + # Shared kwargs to pass to make_source _COMMON_ORC_SOURCE_KWARGS = {"format": "orc"} diff --git a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py index f6e843ccf66..41298601539 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py @@ -1,9 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from pyarrow.parquet import read_table +from utils import assert_table_and_meta_eq, make_source + +import pylibcudf as plc from pylibcudf.expressions import ( ASTOperator, ColumnNameReference, @@ -11,7 +13,6 @@ Literal, Operation, ) -from utils import assert_table_and_meta_eq, make_source # Shared kwargs to pass to make_source _COMMON_PARQUET_SOURCE_KWARGS = {"format": "parquet"} diff --git a/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py index 747f58ec8cf..0c43c363e55 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py @@ -2,9 +2,10 @@ import io -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo]) def io_class(request): diff --git a/python/pylibcudf/pylibcudf/tests/io/test_timezone.py b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py index 76b0424b2af..b3555013927 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_timezone.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import zoneinfo -import pylibcudf as plc import pytest +import pylibcudf as plc + def test_make_timezone_transition_table(): if len(zoneinfo.TZPATH) == 0: diff --git a/python/pylibcudf/pylibcudf/tests/pytest.ini b/python/pylibcudf/pylibcudf/tests/pytest.ini deleted file mode 100644 index f572f85ca49..00000000000 --- a/python/pylibcudf/pylibcudf/tests/pytest.ini +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -[pytest] -xfail_strict = true -filterwarnings = - error - ignore:::.*xdist.* - ignore:::.*pytest.* -addopts = --tb=native diff --git a/python/pylibcudf/pylibcudf/tests/test_binaryops.py b/python/pylibcudf/pylibcudf/tests/test_binaryops.py index f784cb3c191..bbb08e8b95a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_binaryops.py +++ b/python/pylibcudf/pylibcudf/tests/test_binaryops.py @@ -4,10 +4,11 @@ import numpy as np import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + def idfn(param): ltype, rtype, outtype, plc_op, _ = param diff --git a/python/pylibcudf/pylibcudf/tests/test_column_factories.py b/python/pylibcudf/pylibcudf/tests/test_column_factories.py index 8cedbc6d42f..e317362a76b 100644 --- a/python/pylibcudf/pylibcudf/tests/test_column_factories.py +++ b/python/pylibcudf/pylibcudf/tests/test_column_factories.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from utils import DEFAULT_STRUCT_TESTING_TYPE, assert_column_eq +import pylibcudf as plc + EMPTY_COL_SIZE = 3 NUMERIC_TYPES = [ diff --git a/python/pylibcudf/pylibcudf/tests/test_column_from_device.py b/python/pylibcudf/pylibcudf/tests/test_column_from_device.py index 0e129fdf0ef..24cd6b9e35f 100644 --- a/python/pylibcudf/pylibcudf/tests/test_column_from_device.py +++ b/python/pylibcudf/pylibcudf/tests/test_column_from_device.py @@ -1,12 +1,13 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq import rmm +import pylibcudf as plc + VALID_TYPES = [ pa.int8(), pa.int16(), diff --git a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py new file mode 100644 index 00000000000..6d8b5993964 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py @@ -0,0 +1,50 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_table_eq + +import pylibcudf as plc + +param_pyarrow_tables = [ + pa.table([]), + pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}), + pa.table({"a": [1, 2, 3]}), + pa.table({"a": [1], "b": [2], "c": [3]}), + pa.table({"a": ["a", "bb", "ccc"]}), + pa.table({"a": [1, 2, None], "b": [None, 3, 4]}), + pa.table( + { + "a": [["a", "b"], ["cde"]], + "b": [ + {"alpha": [1, 2], "beta": None}, + {"alpha": [3, 4], "beta": 5}, + ], + } + ), +] + + +@pytest.mark.parametrize("arrow_tbl", param_pyarrow_tables) +def test_pack_and_unpack(arrow_tbl): + plc_tbl = plc.interop.from_arrow(arrow_tbl) + packed = plc.contiguous_split.pack(plc_tbl) + + res = plc.contiguous_split.unpack(packed) + assert_table_eq(arrow_tbl, res) + + +@pytest.mark.parametrize("arrow_tbl", param_pyarrow_tables) +def test_pack_and_unpack_from_memoryviews(arrow_tbl): + plc_tbl = plc.interop.from_arrow(arrow_tbl) + packed = plc.contiguous_split.pack(plc_tbl) + + metadata, gpudata = packed.release() + + with pytest.raises(ValueError, match="Cannot release empty"): + packed.release() + + del packed # `metadata` and `gpudata` will survive + + res = plc.contiguous_split.unpack_from_memoryviews(metadata, gpudata) + assert_table_eq(arrow_tbl, res) diff --git a/python/pylibcudf/pylibcudf/tests/test_copying.py b/python/pylibcudf/pylibcudf/tests/test_copying.py index 628682d0a66..c0a41b96b1a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_copying.py +++ b/python/pylibcudf/pylibcudf/tests/test_copying.py @@ -2,7 +2,6 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import ( DEFAULT_STRUCT_TESTING_TYPE, @@ -16,6 +15,8 @@ metadata_from_arrow_type, ) +import pylibcudf as plc + # TODO: consider moving this to conftest and "pairing" # it with pa_type, so that they don't get out of sync diff --git a/python/pylibcudf/pylibcudf/tests/test_datetime.py b/python/pylibcudf/pylibcudf/tests/test_datetime.py index 89c96829e71..a80ab8d9f65 100644 --- a/python/pylibcudf/pylibcudf/tests/test_datetime.py +++ b/python/pylibcudf/pylibcudf/tests/test_datetime.py @@ -1,26 +1,13 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import datetime -import functools import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq - -@pytest.fixture -def date_column(has_nulls): - values = [ - datetime.date(1999, 1, 1), - datetime.date(2024, 10, 12), - datetime.date(1, 1, 1), - datetime.date(9999, 1, 1), - ] - if has_nulls: - values[2] = None - return plc.interop.from_arrow(pa.array(values, type=pa.date32())) +import pylibcudf as plc @pytest.fixture(scope="module", params=["s", "ms", "us", "ns"]) @@ -40,24 +27,35 @@ def datetime_column(has_nulls, request): ) -@pytest.mark.parametrize( - "component, pc_fun", - [ - ("year", pc.year), - ("month", pc.month), - ("day", pc.day), - ("weekday", functools.partial(pc.day_of_week, count_from_zero=False)), - ("hour", pc.hour), - ("minute", pc.minute), - ("second", pc.second), - ("millisecond", pc.millisecond), - ("microsecond", pc.microsecond), - ("nanosecond", pc.nanosecond), +@pytest.fixture( + params=[ + ("year", plc.datetime.DatetimeComponent.YEAR), + ("month", plc.datetime.DatetimeComponent.MONTH), + ("day", plc.datetime.DatetimeComponent.DAY), + ("day_of_week", plc.datetime.DatetimeComponent.WEEKDAY), + ("hour", plc.datetime.DatetimeComponent.HOUR), + ("minute", plc.datetime.DatetimeComponent.MINUTE), + ("second", plc.datetime.DatetimeComponent.SECOND), + ("millisecond", plc.datetime.DatetimeComponent.MILLISECOND), + ("microsecond", plc.datetime.DatetimeComponent.MICROSECOND), + ("nanosecond", plc.datetime.DatetimeComponent.NANOSECOND), ], + ids=lambda x: x[0], ) -def test_extraction(datetime_column, component, pc_fun): +def component(request): + return request.param + + +def test_extract_datetime_component(datetime_column, component): + attr, component = component + kwargs = {} + if attr == "day_of_week": + kwargs = {"count_from_zero": False} got = plc.datetime.extract_datetime_component(datetime_column, component) # libcudf produces an int16, arrow produces an int64 - expect = pc_fun(plc.interop.to_arrow(datetime_column)).cast(pa.int16()) + + expect = getattr(pc, attr)( + plc.interop.to_arrow(datetime_column), **kwargs + ).cast(pa.int16()) assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_expressions.py b/python/pylibcudf/pylibcudf/tests/test_expressions.py index 5894ef4624c..6eabd6db617 100644 --- a/python/pylibcudf/pylibcudf/tests/test_expressions.py +++ b/python/pylibcudf/pylibcudf/tests/test_expressions.py @@ -1,8 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest +import pylibcudf as plc + # We can't really evaluate these expressions, so just make sure # construction works properly diff --git a/python/pylibcudf/pylibcudf/tests/test_interop.py b/python/pylibcudf/pylibcudf/tests/test_interop.py index 01c998f16d4..af80b6e5978 100644 --- a/python/pylibcudf/pylibcudf/tests/test_interop.py +++ b/python/pylibcudf/pylibcudf/tests/test_interop.py @@ -1,8 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
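+# The tests added below exercise the DLPack interop: a table (or a
+# __dlpack__-capable numpy/cupy array) round-trips through a DLPack
+# capsule, while columns containing nulls cannot be represented and
+# raise a ValueError.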
+import cupy as cp
+import numpy as np
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
+from utils import assert_table_eq
+
+import pylibcudf as plc
 
 
 def test_list_dtype_roundtrip():
@@ -66,3 +70,31 @@ def test_decimal_other(data_type):
     arrow_type = plc.interop.to_arrow(data_type, precision=precision)
 
     assert arrow_type == pa.decimal128(precision, 0)
+
+
+def test_round_trip_dlpack_plc_table():
+    expected = pa.table({"a": [1, 2, 3], "b": [5, 6, 7]})
+    plc_table = plc.interop.from_arrow(expected)
+    result = plc.interop.from_dlpack(plc.interop.to_dlpack(plc_table))
+    assert_table_eq(expected, result)
+
+
+@pytest.mark.parametrize("array", [np.array, cp.array])
+def test_round_trip_dlpack_array(array):
+    arr = array([1, 2, 3])
+    result = plc.interop.from_dlpack(arr.__dlpack__())
+    expected = pa.table({"a": [1, 2, 3]})
+    assert_table_eq(expected, result)
+
+
+def test_to_dlpack_error():
+    plc_table = plc.interop.from_arrow(
+        pa.table({"a": [1, None, 3], "b": [5, 6, 7]})
+    )
+    with pytest.raises(ValueError, match="Cannot create a DLPack tensor"):
+        plc.interop.from_dlpack(plc.interop.to_dlpack(plc_table))
+
+
+def test_from_dlpack_error():
+    with pytest.raises(ValueError, match="Invalid PyCapsule object"):
+        plc.interop.from_dlpack(1)
diff --git a/python/pylibcudf/pylibcudf/tests/test_join.py b/python/pylibcudf/pylibcudf/tests/test_join.py
index 61e02f4d28d..f43a56046a4 100644
--- a/python/pylibcudf/pylibcudf/tests/test_join.py
+++ b/python/pylibcudf/pylibcudf/tests/test_join.py
@@ -2,9 +2,10 @@
 
 import numpy as np
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_table_eq
 
+import pylibcudf as plc
+
 
 def test_cross_join():
     left = pa.Table.from_arrays([[0, 1, 2], [3, 4, 5]], names=["a", "b"])
diff --git a/python/pylibcudf/pylibcudf/tests/test_json.py b/python/pylibcudf/pylibcudf/tests/test_json.py
new file mode 100644
index 00000000000..486a9524e92
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_json.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import pylibcudf as plc
+
+
+@pytest.fixture(scope="module")
+def plc_col():
+    arr = pa.array(
+        ['{"foo": {"bar": [{"a": 1, "b": 2}, {"a": 3, "b": 4}]}}', None]
+    )
+    return plc.interop.from_arrow(arr)
+
+
+@pytest.fixture(scope="module")
+def json_path():
+    slr = pa.scalar("$.foo.bar")
+    return plc.interop.from_arrow(slr)
+
+
+@pytest.mark.parametrize("allow_single_quotes", [True, False])
+@pytest.mark.parametrize("strip_quotes_from_single_strings", [True, False])
+@pytest.mark.parametrize("missing_fields_as_nulls", [True, False])
+def test_get_json_object(
+    plc_col,
+    json_path,
+    allow_single_quotes,
+    strip_quotes_from_single_strings,
+    missing_fields_as_nulls,
+):
+    result = plc.json.get_json_object(
+        plc_col,
+        json_path,
+        plc.json.GetJsonObjectOptions(
+            allow_single_quotes=allow_single_quotes,
+            strip_quotes_from_single_strings=strip_quotes_from_single_strings,
+            missing_fields_as_nulls=missing_fields_as_nulls,
+        ),
+    )
+    expected = pa.array(['[{"a": 1, "b": 2}, {"a": 3, "b": 4}]', None])
+    assert_column_eq(result, expected)
diff --git a/python/pylibcudf/pylibcudf/tests/test_labeling.py b/python/pylibcudf/pylibcudf/tests/test_labeling.py
index f7fb7463b50..beacfc63ce5 100644
--- a/python/pylibcudf/pylibcudf/tests/test_labeling.py
+++ b/python/pylibcudf/pylibcudf/tests/test_labeling.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
import pyarrow as pa -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.mark.parametrize("left_inclusive", [True, False]) @pytest.mark.parametrize("right_inclusive", [True, False]) diff --git a/python/pylibcudf/pylibcudf/tests/test_lists.py b/python/pylibcudf/pylibcudf/tests/test_lists.py index 2353a6ff8f9..f3ef555f11d 100644 --- a/python/pylibcudf/pylibcudf/tests/test_lists.py +++ b/python/pylibcudf/pylibcudf/tests/test_lists.py @@ -3,10 +3,11 @@ import numpy as np import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture def test_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_null_mask.py b/python/pylibcudf/pylibcudf/tests/test_null_mask.py index 3edcae59edc..cd3da856de2 100644 --- a/python/pylibcudf/pylibcudf/tests/test_null_mask.py +++ b/python/pylibcudf/pylibcudf/tests/test_null_mask.py @@ -1,12 +1,13 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest -from pylibcudf.null_mask import MaskState import rmm +import pylibcudf as plc +from pylibcudf.null_mask import MaskState + @pytest.fixture(params=[False, True]) def nullable(request): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py new file mode 100644 index 00000000000..8b14e0db576 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def edit_distance_data(): + arr1 = ["hallo", "goodbye", "world"] + arr2 = ["hello", "", "world"] + return pa.array(arr1), pa.array(arr2) + + +def test_edit_distance(edit_distance_data): + input_col, targets = edit_distance_data + result = plc.nvtext.edit_distance.edit_distance( + plc.interop.from_arrow(input_col), + plc.interop.from_arrow(targets), + ) + expected = pa.array([1, 7, 0], type=pa.int32()) + assert_column_eq(result, expected) + + +def test_edit_distance_matrix(edit_distance_data): + input_col, _ = edit_distance_data + result = plc.nvtext.edit_distance.edit_distance_matrix( + plc.interop.from_arrow(input_col) + ) + expected = pa.array( + [[0, 7, 4], [7, 0, 6], [4, 6, 0]], type=pa.list_(pa.int32()) + ) + assert_column_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py new file mode 100644 index 00000000000..fae4685f81b --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
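+#
+# A string of length L has max(0, L - n + 1) character n-grams, e.g. "cde"
+# has the 2-grams ["cd", "de"] and the single 3-gram ["cde"]; the expected
+# values below encode exactly that.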
+ +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["ab", "cde", "fgh"] + return pa.array(arr) + + +@pytest.mark.parametrize("ngram", [2, 3]) +@pytest.mark.parametrize("sep", ["_", "**", ","]) +def test_generate_ngrams(input_col, ngram, sep): + result = plc.nvtext.generate_ngrams.generate_ngrams( + plc.interop.from_arrow(input_col), + ngram, + plc.interop.from_arrow(pa.scalar(sep)), + ) + expected = pa.array([f"ab{sep}cde", f"cde{sep}fgh"]) + if ngram == 3: + expected = pa.array([f"ab{sep}cde{sep}fgh"]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("ngram", [2, 3]) +def test_generate_character_ngrams(input_col, ngram): + result = plc.nvtext.generate_ngrams.generate_character_ngrams( + plc.interop.from_arrow(input_col), + ngram, + ) + expected = pa.array([["ab"], ["cd", "de"], ["fg", "gh"]]) + if ngram == 3: + expected = pa.array([[], ["cde"], ["fgh"]]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("ngram", [2, 3]) +def test_hash_character_ngrams(input_col, ngram): + result = plc.nvtext.generate_ngrams.hash_character_ngrams( + plc.interop.from_arrow(input_col), + ngram, + ) + pa_result = plc.interop.to_arrow(result) + assert all( + len(got) == max(0, len(s.as_py()) - ngram + 1) + for got, s in zip(pa_result, input_col) + ) + assert pa_result.type == pa.list_( + pa.field("element", pa.uint32(), nullable=False) + ) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py new file mode 100644 index 00000000000..05fe7b53c16 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_data(): + input1 = ["the fuzzy dog", "little piggy", "funny bunny", "chatty parrot"] + input2 = ["the fuzzy cat", "bitty piggy", "funny bunny", "silent partner"] + return pa.array(input1), pa.array(input2) + + +@pytest.mark.parametrize("width", [2, 3]) +def test_jaccard_index(input_data, width): + def get_tokens(s, width): + return [s[i : i + width] for i in range(len(s) - width + 1)] + + def jaccard_index(s1, s2, width): + x = set(get_tokens(s1, width)) + y = set(get_tokens(s2, width)) + return len(x & y) / len(x | y) + + input1, input2 = input_data + result = plc.nvtext.jaccard.jaccard_index( + plc.interop.from_arrow(input1), plc.interop.from_arrow(input2), width + ) + expected = pa.array( + [ + jaccard_index(s1.as_py(), s2.as_py(), width) + for s1, s2 in zip(input1, input2) + ], + type=pa.float32(), + ) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py new file mode 100644 index 00000000000..ead9ee094af --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
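+#
+# minhash/minhash64 emit one hash per seed for every input row, so each
+# output list below must have len(seeds) entries, typed uint32 for minhash
+# and uint64 for minhash64; width is the substring window that is hashed.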
+ +import pyarrow as pa +import pytest + +import pylibcudf as plc + + +@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()]) +def minhash_input_data(request): + input_arr = pa.array(["foo", "bar", "foo foo", "bar bar"]) + seeds = pa.array([2, 3, 4, 5], request.param) + return input_arr, seeds, request.param + + +@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()]) +def word_minhash_input_data(request): + input_arr = pa.array([["foo", "bar"], ["foo foo", "bar bar"]]) + seeds = pa.array([2, 3, 4, 5], request.param) + return input_arr, seeds, request.param + + +@pytest.mark.parametrize("width", [5, 12]) +def test_minhash(minhash_input_data, width): + input_arr, seeds, seed_type = minhash_input_data + minhash_func = ( + plc.nvtext.minhash.minhash + if seed_type == pa.uint32() + else plc.nvtext.minhash.minhash64 + ) + result = minhash_func( + plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds), width + ) + pa_result = plc.interop.to_arrow(result) + assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr)) + assert pa_result.type == pa.list_( + pa.field("element", seed_type, nullable=False) + ) + + +def test_word_minhash(word_minhash_input_data): + input_arr, seeds, seed_type = word_minhash_input_data + word_minhash_func = ( + plc.nvtext.minhash.word_minhash + if seed_type == pa.uint32() + else plc.nvtext.minhash.word_minhash64 + ) + result = word_minhash_func( + plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds) + ) + pa_result = plc.interop.to_arrow(result) + assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr)) + assert pa_result.type == pa.list_( + pa.field("element", seed_type, nullable=False) + ) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py new file mode 100644 index 00000000000..84748b5597e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["a*b*c*d", "a b c d", "a-b-c-d", "a*b c-d"] + return pa.array(arr) + + +@pytest.mark.parametrize("ngrams", [2, 3]) +@pytest.mark.parametrize("delim", ["*", " ", "-"]) +@pytest.mark.parametrize("sep", ["_", "&", ","]) +def test_ngrams_tokenize(input_col, ngrams, delim, sep): + def ngrams_tokenize(strings, ngrams, delim, sep): + tokens = [] + for s in strings: + ss = s.split(delim) + for i in range(len(ss) - ngrams + 1): + token = sep.join(ss[i : i + ngrams]) + tokens.append(token) + return tokens + + result = plc.nvtext.ngrams_tokenize.ngrams_tokenize( + plc.interop.from_arrow(input_col), + ngrams, + plc.interop.from_arrow(pa.scalar(delim)), + plc.interop.from_arrow(pa.scalar(sep)), + ) + expected = pa.array( + ngrams_tokenize(input_col.to_pylist(), ngrams, delim, sep) + ) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py new file mode 100644 index 00000000000..25b6d1389ec --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
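+#
+# normalize_spaces collapses runs of whitespace into single spaces and trims
+# the ends; normalize_characters pads punctuation with spaces and, when
+# do_lower is set, also lowercases and strips accents (compare the two
+# expected arrays below).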
+ +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def norm_spaces_input_data(): + arr = ["a b", " c d\n", "e \t f "] + return pa.array(arr) + + +@pytest.fixture(scope="module") +def norm_chars_input_data(): + arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"] + return pa.array(arr) + + +def test_normalize_spaces(norm_spaces_input_data): + result = plc.nvtext.normalize.normalize_spaces( + plc.interop.from_arrow(norm_spaces_input_data) + ) + expected = pa.array(["a b", "c d", "e f"]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("do_lower", [True, False]) +def test_normalize_characters(norm_chars_input_data, do_lower): + result = plc.nvtext.normalize.normalize_characters( + plc.interop.from_arrow(norm_chars_input_data), + do_lower, + ) + expected = pa.array( + ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "] + ) + if not do_lower: + expected = pa.array( + ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "] + ) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py new file mode 100644 index 00000000000..65687f31c85 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py @@ -0,0 +1,64 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["the quick", "brown fox", "jumps*over the", "lazy dog"] + return pa.array(arr) + + +@pytest.fixture(scope="module") +def targets(): + arr = ["the quick", "brown fox", "jumps*over the", "lazy dog"] + return pa.array(arr) + + +@pytest.mark.parametrize("delim", ["*", None]) +def test_replace_tokens(input_col, targets, delim): + replacements = pa.array(["slow", "cat", "looked", "rat"]) + result = plc.nvtext.replace.replace_tokens( + plc.interop.from_arrow(input_col), + plc.interop.from_arrow(targets), + plc.interop.from_arrow(replacements), + plc.interop.from_arrow(pa.scalar(delim)) if delim else None, + ) + expected = pa.array(["slow", "cat", "jumps*over the", "rat"]) + if not delim: + expected = pa.array( + ["the quick", "brown fox", "jumps*over the", "lazy dog"] + ) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("min_token_length", [4, 5]) +@pytest.mark.parametrize("replace", ["---", None]) +@pytest.mark.parametrize("delim", ["*", None]) +def test_filter_tokens(input_col, min_token_length, replace, delim): + result = plc.nvtext.replace.filter_tokens( + plc.interop.from_arrow(input_col), + min_token_length, + plc.interop.from_arrow(pa.scalar(replace)) if replace else None, + plc.interop.from_arrow(pa.scalar(delim)) if delim else None, + ) + expected = pa.array( + ["the quick", "brown fox", "jumps*over the", "lazy dog"] + ) + if not delim and not replace and min_token_length == 4: + expected = pa.array([" quick", "brown ", "jumps*over ", "lazy "]) + if not delim and not replace and min_token_length == 5: + expected = pa.array([" quick", "brown ", "jumps*over ", " "]) + if not delim and replace == "---" and min_token_length == 4: + expected = pa.array( + ["--- quick", "brown ---", "jumps*over ---", "lazy ---"] + ) + if not delim and replace == "---" and min_token_length == 5: + expected = pa.array( + ["--- quick", "brown ---", "jumps*over ---", "--- ---"] + ) + assert_column_eq(result, expected) diff --git 
a/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py new file mode 100644 index 00000000000..e7f4a971f08 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py @@ -0,0 +1,48 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["trouble", "toy", "syzygy"] + return pa.array(arr) + + +@pytest.mark.parametrize("check_vowels", [True, False]) +@pytest.mark.parametrize("indices", [[3, 1, 4], 1]) +def test_is_letter(input_col, check_vowels, indices): + def is_letter(s, i, check): + vowels = "aeiouy" + return (s[i] in vowels) == check + + result = plc.nvtext.stemmer.is_letter( + plc.interop.from_arrow(input_col), + check_vowels, + plc.interop.from_arrow(pa.array(indices)) + if isinstance(indices, list) + else indices, + ) + expected = pa.array( + [ + is_letter( + s, + indices[i] if isinstance(indices, list) else indices, + check_vowels, + ) + for i, s in enumerate(input_col.to_pylist()) + ] + ) + assert_column_eq(result, expected) + + +def test_porter_stemmer_measure(input_col): + result = plc.nvtext.stemmer.porter_stemmer_measure( + plc.interop.from_arrow(input_col), + ) + expected = pa.array([1, 1, 2], type=pa.int32()) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_partitioning.py b/python/pylibcudf/pylibcudf/tests/test_partitioning.py index 444d0089d2c..c55e54cebc6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_partitioning.py +++ b/python/pylibcudf/pylibcudf/tests/test_partitioning.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_table_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def partitioning_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_quantiles.py b/python/pylibcudf/pylibcudf/tests/test_quantiles.py index bac56691306..e4a24fb1c98 100644 --- a/python/pylibcudf/pylibcudf/tests/test_quantiles.py +++ b/python/pylibcudf/pylibcudf/tests/test_quantiles.py @@ -3,10 +3,11 @@ import numpy as np import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq, assert_table_eq +import pylibcudf as plc + # Map pylibcudf interpolation options to pyarrow options interp_mapping = { plc.types.Interpolation.LINEAR: "linear", diff --git a/python/pylibcudf/pylibcudf/tests/test_regex_program.py b/python/pylibcudf/pylibcudf/tests/test_regex_program.py index 777315df538..52598f2c462 100644 --- a/python/pylibcudf/pylibcudf/tests/test_regex_program.py +++ b/python/pylibcudf/pylibcudf/tests/test_regex_program.py @@ -1,8 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.mark.parametrize("pat", ["(", "*", "\\"]) def test_regex_program_invalid(pat): diff --git a/python/pylibcudf/pylibcudf/tests/test_reshape.py b/python/pylibcudf/pylibcudf/tests/test_reshape.py index 01115bc363a..ef23e23766a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_reshape.py +++ b/python/pylibcudf/pylibcudf/tests/test_reshape.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq, assert_table_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def reshape_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_round.py b/python/pylibcudf/pylibcudf/tests/test_round.py index 0b30316b9a0..2526580bc13 100644 --- a/python/pylibcudf/pylibcudf/tests/test_round.py +++ b/python/pylibcudf/pylibcudf/tests/test_round.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(params=["float32", "float64"]) def column(request, has_nulls): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py index a1820def0b1..f461657281a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture() def str_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py b/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py index 176ccc55b96..3e31c75c38a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def str_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_case.py b/python/pylibcudf/pylibcudf/tests/test_string_case.py index 233cc253b14..08ac371fd96 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_case.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_case.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def string_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_char_types.py b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py index bcd030c019e..06b44210d74 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_char_types.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py @@ -2,9 +2,10 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_all_characters_of_type(): pa_array = pa.array(["1", "A"]) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_combine.py b/python/pylibcudf/pylibcudf/tests/test_string_combine.py new file mode 100644 index 00000000000..eea3ac68e84 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_combine.py @@ -0,0 +1,84 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
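+#
+# Semantics exercised below: concatenate joins each row's columns with the
+# separator. With a scalar separator, narep substitutes for null entries;
+# with a column separator, narep replaces null separators and col_narep
+# replaces null entries; col_narep together with a scalar separator raises
+# ValueError.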
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pytest
+from utils import assert_column_eq
+
+import pylibcudf as plc
+
+
+def test_concatenate_scalar_separator():
+    plc_table = plc.interop.from_arrow(
+        pa.table({"a": ["a", None, "c"], "b": ["a", "b", None]})
+    )
+    sep = plc.interop.from_arrow(pa.scalar("-"))
+    result = plc.strings.combine.concatenate(
+        plc_table,
+        sep,
+    )
+    expected = pa.array(["a-a", "-b", "c-"])
+    assert_column_eq(result, expected)
+
+    result = plc.strings.combine.concatenate(
+        plc_table, sep, narep=plc.interop.from_arrow(pa.scalar("!"))
+    )
+    expected = pa.array(["a-a", "!-b", "c-!"])
+    assert_column_eq(result, expected)
+
+    with pytest.raises(ValueError):
+        plc.strings.combine.concatenate(
+            plc_table,
+            sep,
+            narep=plc.interop.from_arrow(pa.scalar("!")),
+            col_narep=plc.interop.from_arrow(pa.scalar("?")),
+        )
+
+
+def test_concatenate_column_separator():
+    plc_table = plc.interop.from_arrow(
+        pa.table({"a": ["a", None, "c"], "b": ["a", "b", None]})
+    )
+    sep = plc.interop.from_arrow(pa.array(["-", "?", ","]))
+    result = plc.strings.combine.concatenate(
+        plc_table,
+        sep,
+    )
+    expected = pa.array(["a-a", "?b", "c,"])
+    assert_column_eq(result, expected)
+
+    result = plc.strings.combine.concatenate(
+        plc_table,
+        plc.interop.from_arrow(pa.array([None, "?", ","])),
+        narep=plc.interop.from_arrow(pa.scalar("1")),
+        col_narep=plc.interop.from_arrow(pa.scalar("*")),
+    )
+    expected = pa.array(["a1a", "*?b", "c,*"])
+    assert_column_eq(result, expected)
+
+
+def test_join_strings():
+    pa_arr = pa.array(list("abc"))
+    sep = pa.scalar("")
+    result = plc.strings.combine.join_strings(
+        plc.interop.from_arrow(pa_arr),
+        plc.interop.from_arrow(sep),
+        plc.interop.from_arrow(pa.scalar("")),
+    )
+    expected = pa.array(["abc"])
+    assert_column_eq(result, expected)
+
+
+def test_join_list_elements():
+    pa_arr = pa.array([["a", "a"], ["b", "b"]])
+    sep = pa.scalar("")
+    result = plc.strings.combine.join_list_elements(
+        plc.interop.from_arrow(pa_arr),
+        plc.interop.from_arrow(sep),
+        plc.interop.from_arrow(pa.scalar("")),
+        plc.interop.from_arrow(pa.scalar("")),
+        plc.strings.combine.SeparatorOnNulls.YES,
+        plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT,
+    )
+    expected = pc.binary_join(pa.array([["a", "a"], ["b", "b"]]), sep)
+    assert_column_eq(result, expected)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_contains.py b/python/pylibcudf/pylibcudf/tests/test_string_contains.py
index 4e4dd7cbb00..ba9a4a7d3b8 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_contains.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_contains.py
@@ -2,10 +2,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def target_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert.py b/python/pylibcudf/pylibcudf/tests/test_string_convert.py
index e9e95459d0e..3f3f452c4f6 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert.py
@@ -1,12 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
-from datetime import datetime - import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture( scope="module", @@ -21,39 +20,16 @@ def timestamp_type(request): return request.param -@pytest.fixture( - scope="module", - params=[ - pa.duration("ns"), - pa.duration("us"), - pa.duration("ms"), - pa.duration("s"), - ], -) -def duration_type(request): - return request.param - - @pytest.fixture(scope="module") def pa_timestamp_col(): return pa.array(["2011-01-01", "2011-01-02", "2011-01-03"]) -@pytest.fixture(scope="module") -def pa_duration_col(): - return pa.array(["05:20:25"]) - - @pytest.fixture(scope="module") def plc_timestamp_col(pa_timestamp_col): return plc.interop.from_arrow(pa_timestamp_col) -@pytest.fixture(scope="module") -def plc_duration_col(pa_duration_col): - return plc.interop.from_arrow(pa_duration_col) - - @pytest.mark.parametrize("format", ["%Y-%m-%d"]) def test_to_datetime( pa_timestamp_col, plc_timestamp_col, timestamp_type, format @@ -62,24 +38,6 @@ def test_to_datetime( got = plc.strings.convert.convert_datetime.to_timestamps( plc_timestamp_col, plc.interop.from_arrow(timestamp_type), - format.encode(), - ) - assert_column_eq(expect, got) - - -@pytest.mark.parametrize("format", ["%H:%M:%S"]) -def test_to_duration(pa_duration_col, plc_duration_col, duration_type, format): - def to_timedelta(duration_str): - date = datetime.strptime(duration_str, format) - return date - datetime(1900, 1, 1) # "%H:%M:%S" zero date - - expect = pa.array([to_timedelta(d.as_py()) for d in pa_duration_col]).cast( - duration_type - ) - - got = plc.strings.convert.convert_durations.to_durations( - plc_duration_col, - plc.interop.from_arrow(duration_type), - format.encode(), + format, ) assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py new file mode 100644 index 00000000000..b391d2b290e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_to_booleans(): + pa_array = pa.array(["true", None, "True"]) + result = plc.strings.convert.convert_booleans.to_booleans( + plc.interop.from_arrow(pa_array), + plc.interop.from_arrow(pa.scalar("True")), + ) + expected = pa.array([False, None, True]) + assert_column_eq(result, expected) + + +def test_from_booleans(): + pa_array = pa.array([True, None, False]) + result = plc.strings.convert.convert_booleans.from_booleans( + plc.interop.from_arrow(pa_array), + plc.interop.from_arrow(pa.scalar("A")), + plc.interop.from_arrow(pa.scalar("B")), + ) + expected = pa.array(["A", None, "B"]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py new file mode 100644 index 00000000000..c9368d858a4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py @@ -0,0 +1,47 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import datetime + +import pyarrow as pa +import pyarrow.compute as pc +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture +def fmt(): + return "%Y-%m-%dT%H:%M:%S" + + +def test_to_timestamp(fmt): + arr = pa.array(["2020-01-01T01:01:01", None]) + result = plc.strings.convert.convert_datetime.to_timestamps( + plc.interop.from_arrow(arr), + plc.DataType(plc.TypeId.TIMESTAMP_SECONDS), + fmt, + ) + expected = pc.strptime(arr, fmt, "s") + assert_column_eq(result, expected) + + +def test_from_timestamp(fmt): + arr = pa.array([datetime.datetime(2020, 1, 1, 1, 1, 1), None]) + result = plc.strings.convert.convert_datetime.from_timestamps( + plc.interop.from_arrow(arr), + fmt, + plc.interop.from_arrow(pa.array([], type=pa.string())), + ) + # pc.strftime will add the extra %f + expected = pa.array(["2020-01-01T01:01:01", None]) + assert_column_eq(result, expected) + + +def test_is_timestamp(fmt): + arr = pa.array(["2020-01-01T01:01:01", None, "2020-01-01"]) + result = plc.strings.convert.convert_datetime.is_timestamp( + plc.interop.from_arrow(arr), + fmt, + ) + expected = pa.array([True, None, False]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py new file mode 100644 index 00000000000..2d3578e4e71 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py @@ -0,0 +1,62 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from datetime import datetime, timedelta + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture( + params=[ + pa.duration("ns"), + pa.duration("us"), + pa.duration("ms"), + pa.duration("s"), + ], +) +def duration_type(request): + return request.param + + +@pytest.fixture(scope="module") +def pa_duration_col(): + return pa.array(["05:20:25"]) + + +@pytest.fixture(scope="module") +def plc_duration_col(pa_duration_col): + return plc.interop.from_arrow(pa_duration_col) + + +def test_to_duration(pa_duration_col, plc_duration_col, duration_type): + format = "%H:%M:%S" + + def to_timedelta(duration_str): + date = datetime.strptime(duration_str, format) + return date - datetime(1900, 1, 1) # "%H:%M:%S" zero date + + expect = pa.array([to_timedelta(d.as_py()) for d in pa_duration_col]).cast( + duration_type + ) + + got = plc.strings.convert.convert_durations.to_durations( + plc_duration_col, + plc.interop.from_arrow(duration_type), + format, + ) + assert_column_eq(expect, got) + + +@pytest.mark.parametrize("format", [None, "%D days %H:%M:%S"]) +def test_from_durations(format): + pa_array = pa.array( + [timedelta(days=1, hours=1, minutes=1, seconds=1), None] + ) + result = plc.strings.convert.convert_durations.from_durations( + plc.interop.from_arrow(pa_array), format + ) + expected = pa.array(["1 days 01:01:01", None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py new file mode 100644 index 00000000000..012e722038e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
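+#
+# decimal128(38, 2) below means 38 significant digits with 2 of them after
+# the decimal point; strings like "1.2.3" or "" are not parseable as
+# fixed-point, which is what is_fixed_point reports.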
+import decimal + +import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_to_fixed_point(): + typ = pa.decimal128(38, 2) + arr = pa.array(["123", "1.23", None]) + result = plc.strings.convert.convert_fixed_point.to_fixed_point( + plc.interop.from_arrow(arr), plc.interop.from_arrow(typ) + ) + expected = arr.cast(typ) + assert_column_eq(result, expected) + + +def test_from_fixed_point(): + arr = pa.array([decimal.Decimal("1.1"), None]) + result = plc.strings.convert.convert_fixed_point.from_fixed_point( + plc.interop.from_arrow(arr), + ) + expected = pa.array(["1.1", None]) + assert_column_eq(result, expected) + + +def test_is_fixed_point(): + arr = pa.array(["123", "1.23", "1.2.3", "", None]) + result = plc.strings.convert.convert_fixed_point.is_fixed_point( + plc.interop.from_arrow(arr), + ) + expected = pa.array([True, True, False, False, None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py new file mode 100644 index 00000000000..8ee2b5075af --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_to_floats(): + typ = pa.float32() + arr = pa.array(["-1.23", "1", None]) + result = plc.strings.convert.convert_floats.to_floats( + plc.interop.from_arrow(arr), plc.interop.from_arrow(typ) + ) + expected = arr.cast(typ) + assert_column_eq(result, expected) + + +def test_from_floats(): + arr = pa.array([-1.23, 1, None]) + result = plc.strings.convert.convert_floats.from_floats( + plc.interop.from_arrow(arr), + ) + expected = pa.array(["-1.23", "1.0", None]) + assert_column_eq(result, expected) + + +def test_is_float(): + arr = pa.array(["-1.23", "1", "1.2.3", "A", None]) + result = plc.strings.convert.convert_floats.is_float( + plc.interop.from_arrow(arr), + ) + expected = pa.array([True, True, False, False, None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py new file mode 100644 index 00000000000..01192c2d1f8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py @@ -0,0 +1,70 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
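+# Covers plc.strings.convert.convert_integers, including the hex helpers:
+# is_integer optionally takes a target type to check range fit (uint8 below),
+# hex_to_integers parses "0x"-style strings, and integers_to_hex renders
+# negatives in two's complement, so -42 becomes "FFFFFFFFFFFFFFD6".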
+import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_to_integers(): + typ = pa.int8() + arr = pa.array(["1", "-1", None]) + result = plc.strings.convert.convert_integers.to_integers( + plc.interop.from_arrow(arr), plc.interop.from_arrow(typ) + ) + expected = arr.cast(typ) + assert_column_eq(result, expected) + + +def test_from_integers(): + arr = pa.array([1, -1, None]) + result = plc.strings.convert.convert_integers.from_integers( + plc.interop.from_arrow(arr) + ) + expected = pa.array(["1", "-1", None]) + assert_column_eq(result, expected) + + +def test_is_integer(): + arr = pa.array(["1", "-1", "1.2", "A", None]) + plc_column = plc.interop.from_arrow(arr) + result = plc.strings.convert.convert_integers.is_integer(plc_column) + expected = pa.array([True, True, False, False, None]) + assert_column_eq(result, expected) + + result = plc.strings.convert.convert_integers.is_integer( + plc_column, plc.interop.from_arrow(pa.uint8()) + ) + expected = pa.array([True, False, False, False, None]) + assert_column_eq(result, expected) + + +def test_hex_to_integers(): + typ = pa.int32() + data = ["0xff", "0x2a", None] + result = plc.strings.convert.convert_integers.hex_to_integers( + plc.interop.from_arrow(pa.array(data)), plc.interop.from_arrow(typ) + ) + expected = pa.array( + [int(val, 16) if isinstance(val, str) else val for val in data], + type=typ, + ) + assert_column_eq(result, expected) + + +def test_is_hex(): + arr = pa.array(["0xff", "123", "!", None]) + result = plc.strings.convert.convert_integers.is_hex( + plc.interop.from_arrow(arr) + ) + expected = pa.array([True, True, False, None]) + assert_column_eq(result, expected) + + +def test_integers_to_hex(): + data = [255, -42, None] + arr = pa.array(data) + result = plc.strings.convert.convert_integers.integers_to_hex( + plc.interop.from_arrow(arr) + ) + expected = pa.array(["FF", "FFFFFFFFFFFFFFD6", None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py new file mode 100644 index 00000000000..b533809f106 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_ipv4_to_integers(): + arr = pa.array(["123.45.67.890", None]) + result = plc.strings.convert.convert_ipv4.ipv4_to_integers( + plc.interop.from_arrow(arr) + ) + expected = pa.array([2066564730, None], type=pa.uint32()) + assert_column_eq(result, expected) + + +def test_integers_to_ipv4(): + arr = pa.array([1, 0, None], type=pa.uint32()) + result = plc.strings.convert.convert_ipv4.integers_to_ipv4( + plc.interop.from_arrow(arr) + ) + expected = pa.array(["0.0.0.1", "0.0.0.0", None]) + assert_column_eq(result, expected) + + +def test_is_ipv4(): + arr = pa.array(["0.0.0.1", "1.2.34", "A", None]) + result = plc.strings.convert.convert_ipv4.is_ipv4( + plc.interop.from_arrow(arr) + ) + expected = pa.array([True, False, False, None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py new file mode 100644 index 00000000000..737036a4f0f --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py @@ -0,0 +1,22 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
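+# format_list_column renders a LIST-of-strings column as a strings column;
+# na_rep and separators are optional, so both None and the explicit values
+# used in the expected output are parametrized below.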
+ +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.mark.parametrize("na_rep", [None, pa.scalar("")]) +@pytest.mark.parametrize("separators", [None, pa.array([",", "[", "]"])]) +def test_format_list_column(na_rep, separators): + arr = pa.array([["1", "A"], None]) + result = plc.strings.convert.convert_lists.format_list_column( + plc.interop.from_arrow(arr), + na_rep if na_rep is None else plc.interop.from_arrow(na_rep), + separators + if separators is None + else plc.interop.from_arrow(separators), + ) + expected = pa.array(["[1,A]", ""]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py new file mode 100644 index 00000000000..528736798c7 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py @@ -0,0 +1,37 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import urllib + +import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_url_encode(): + data = ["/home/nfs", None] + arr = pa.array(data) + result = plc.strings.convert.convert_urls.url_encode( + plc.interop.from_arrow(arr) + ) + expected = pa.array( + [ + urllib.parse.quote(url, safe="") if isinstance(url, str) else url + for url in data + ] + ) + assert_column_eq(result, expected) + + +def test_url_decode(): + data = ["%2Fhome%2fnfs", None] + arr = pa.array(data) + result = plc.strings.convert.convert_urls.url_decode( + plc.interop.from_arrow(arr) + ) + expected = pa.array( + [ + urllib.parse.unquote(url) if isinstance(url, str) else url + for url in data + ] + ) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_extract.py b/python/pylibcudf/pylibcudf/tests/test_string_extract.py index 788b86423c4..e70edf4fb33 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_extract.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_extract.py @@ -2,6 +2,7 @@ import pyarrow as pa import pyarrow.compute as pc + import pylibcudf as plc diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find.py b/python/pylibcudf/pylibcudf/tests/test_string_find.py index db3b13a5aae..82ec18832a9 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_find.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_find.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def data_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py new file mode 100644 index 00000000000..fa9eee3594b --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
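+# find_multiple returns one LIST<INT32> row per input string holding the
+# first-match position of each target, with -1 where a target is absent,
+# mirroring per-target str.find as the expected value shows.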
+ +import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_find_multiple(): + arr = pa.array(["abc", "def"]) + targets = pa.array(["a", "c", "e"]) + result = plc.strings.find_multiple.find_multiple( + plc.interop.from_arrow(arr), + plc.interop.from_arrow(targets), + ) + expected = pa.array( + [ + [elem.find(target) for target in targets.to_pylist()] + for elem in arr.to_pylist() + ], + type=pa.list_(pa.int32()), + ) + assert_column_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_findall.py b/python/pylibcudf/pylibcudf/tests/test_string_findall.py index 994552fa276..b73d812c898 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_findall.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_findall.py @@ -2,9 +2,10 @@ import re import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_findall(): arr = pa.array(["bunny", "rabbit", "hare", "dog"]) @@ -21,3 +22,20 @@ def test_findall(): type=pa_result.type, ) assert_column_eq(result, expected) + + +def test_find_re(): + arr = pa.array(["bunny", "rabbit", "hare", "dog"]) + pattern = "[eb]" + result = plc.strings.findall.find_re( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + pa_result = plc.interop.to_arrow(result) + expected = pa.array( + [0, 2, 3, -1], + type=pa_result.type, + ) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_padding.py b/python/pylibcudf/pylibcudf/tests/test_string_padding.py new file mode 100644 index 00000000000..79498132097 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_padding.py @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
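+# pad with SideType.LEFT and zfill both fill to the requested width from
+# the left, so each is compared against pyarrow.compute.utf8_lpad with the
+# matching fill character.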
+ +import pyarrow as pa +import pyarrow.compute as pc + +import pylibcudf as plc + + +def test_pad(): + arr = pa.array(["a", "1", None]) + plc_result = plc.strings.padding.pad( + plc.interop.from_arrow(arr), + 2, + plc.strings.side_type.SideType.LEFT, + "!", + ) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array(pc.utf8_lpad(arr, 2, padding="!")) + assert result.equals(expected) + + +def test_zfill(): + arr = pa.array(["a", "1", None]) + plc_result = plc.strings.padding.zfill(plc.interop.from_arrow(arr), 2) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array(pc.utf8_lpad(arr, 2, padding="0")) + assert result.equals(expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py index 18b5d8bf4d0..c06c06be7c6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py @@ -2,9 +2,10 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.mark.parametrize("repeats", [pa.array([2, 2]), 2]) def test_repeat_strings(repeats): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_replace.py b/python/pylibcudf/pylibcudf/tests/test_string_replace.py index 5a9c2007b73..2c7d25133de 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_replace.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_replace.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def data_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py b/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py new file mode 100644 index 00000000000..511f826441a --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py @@ -0,0 +1,72 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pyarrow.compute as pc +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.mark.parametrize("max_replace_count", [-1, 1]) +def test_replace_re_regex_program_scalar(max_replace_count): + arr = pa.array(["foo", "fuz", None]) + pat = "f." 
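+    # "f." matches the leading two characters of "foo" and "fuz";
+    # max_replace_count=-1 replaces every occurrence, while 1 stops after
+    # the first, mirrored below via pyarrow's max_replacements.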
+ repl = "ba" + result = plc.strings.replace_re.replace_re( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pat, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + plc.interop.from_arrow(pa.scalar(repl)), + max_replace_count=max_replace_count, + ) + expected = pc.replace_substring_regex( + arr, + pat, + repl, + max_replacements=max_replace_count + if max_replace_count != -1 + else None, + ) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize( + "flags", + [ + plc.strings.regex_flags.RegexFlags.DEFAULT, + plc.strings.regex_flags.RegexFlags.DOTALL, + ], +) +def test_replace_re_list_str_columns(flags): + arr = pa.array(["foo", "fuz", None]) + pats = ["oo", "uz"] + repls = ["a", "b"] + result = plc.strings.replace_re.replace_re( + plc.interop.from_arrow(arr), + pats, + plc.interop.from_arrow(pa.array(repls)), + flags=flags, + ) + expected = arr + for pat, repl in zip(pats, repls): + expected = pc.replace_substring_regex( + expected, + pat, + repl, + ) + assert_column_eq(result, expected) + + +def test_replace_with_backrefs(): + arr = pa.array(["Z756", None]) + result = plc.strings.replace_re.replace_with_backrefs( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + "(\\d)(\\d)", plc.strings.regex_flags.RegexFlags.DEFAULT + ), + "V\\2\\1", + ) + expected = pa.array(["ZV576", None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_slice.py b/python/pylibcudf/pylibcudf/tests/test_string_slice.py index d9ce5591b98..1759f739e31 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_slice.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_slice.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def pa_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py new file mode 100644 index 00000000000..4e80f19b814 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py @@ -0,0 +1,44 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_table_eq + +import pylibcudf as plc + + +@pytest.fixture +def data_col(): + pa_arr = pa.array(["ab_cd", "def_g_h", None]) + plc_column = plc.interop.from_arrow(pa_arr) + return pa_arr, plc_column + + +def test_partition(data_col): + pa_arr, plc_column = data_col + result = plc.strings.split.partition.partition( + plc_column, plc.interop.from_arrow(pa.scalar("_")) + ) + expected = pa.table( + { + "a": ["ab", "def", None], + "b": ["_", "_", None], + "c": ["cd", "g_h", None], + } + ) + assert_table_eq(expected, result) + + +def test_rpartition(data_col): + pa_arr, plc_column = data_col + result = plc.strings.split.partition.rpartition( + plc_column, plc.interop.from_arrow(pa.scalar("_")) + ) + expected = pa.table( + { + "a": ["ab", "def_g", None], + "b": ["_", "_", None], + "c": ["cd", "h", None], + } + ) + assert_table_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_split.py b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py new file mode 100644 index 00000000000..450b336ce65 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py @@ -0,0 +1,131 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
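+# split/rsplit produce a Table with one column per piece, while the
+# *_record variants produce a LIST column per row; the *_re forms take a
+# compiled RegexProgram in place of a scalar delimiter.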
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pytest
+from utils import assert_column_eq, assert_table_eq
+
+import pylibcudf as plc
+
+
+@pytest.fixture
+def data_col():
+    pa_array = pa.array(["a_b_c", "d-e-f", None])
+    plc_column = plc.interop.from_arrow(pa_array)
+    return pa_array, plc_column
+
+
+@pytest.fixture
+def delimiter():
+    delimiter = "_"
+    plc_delimiter = plc.interop.from_arrow(pa.scalar(delimiter))
+    return delimiter, plc_delimiter
+
+
+@pytest.fixture
+def re_delimiter():
+    return "[_-]"
+
+
+def test_split(data_col, delimiter):
+    _, plc_column = data_col
+    _, plc_delimiter = delimiter
+    result = plc.strings.split.split.split(plc_column, plc_delimiter, 1)
+    expected = pa.table(
+        {
+            "a": ["a", "d-e-f", None],
+            "b": ["b_c", None, None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_rsplit(data_col, delimiter):
+    _, plc_column = data_col
+    _, plc_delimiter = delimiter
+    result = plc.strings.split.split.rsplit(plc_column, plc_delimiter, 1)
+    expected = pa.table(
+        {
+            "a": ["a_b", "d-e-f", None],
+            "b": ["c", None, None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_split_record(data_col, delimiter):
+    pa_array, plc_column = data_col
+    delim, plc_delim = delimiter
+    result = plc.strings.split.split.split_record(plc_column, plc_delim, 1)
+    expected = pc.split_pattern(pa_array, delim, max_splits=1)
+    assert_column_eq(expected, result)
+
+
+def test_rsplit_record(data_col, delimiter):
+    pa_array, plc_column = data_col
+    delim, plc_delim = delimiter
+    result = plc.strings.split.split.rsplit_record(plc_column, plc_delim, 1)
+    expected = pc.split_pattern(pa_array, delim, max_splits=1, reverse=True)
+    assert_column_eq(expected, result)
+
+
+def test_split_re(data_col, re_delimiter):
+    _, plc_column = data_col
+    result = plc.strings.split.split.split_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        1,
+    )
+    expected = pa.table(
+        {
+            "a": ["a", "d", None],
+            "b": ["b_c", "e-f", None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_rsplit_re(data_col, re_delimiter):
+    _, plc_column = data_col
+    result = plc.strings.split.split.rsplit_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        1,
+    )
+    expected = pa.table(
+        {
+            "a": ["a_b", "d-e", None],
+            "b": ["c", "f", None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_split_record_re(data_col, re_delimiter):
+    pa_array, plc_column = data_col
+    result = plc.strings.split.split.split_record_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        1,
+    )
+    expected = pc.split_pattern_regex(pa_array, re_delimiter, max_splits=1)
+    assert_column_eq(expected, result)
+
+
+def test_rsplit_record_re(data_col, re_delimiter):
+    pa_array, plc_column = data_col
+    result = plc.strings.split.split.rsplit_record_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        -1,
+    )
+    expected = pc.split_pattern_regex(pa_array, re_delimiter)
+    assert_column_eq(expected, result)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_strip.py b/python/pylibcudf/pylibcudf/tests/test_string_strip.py
index 005e5e4a405..5869e5f4920 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_strip.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_strip.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA
CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + data_strings = [ "AbC", "123abc", diff --git a/python/pylibcudf/pylibcudf/tests/test_string_translate.py b/python/pylibcudf/pylibcudf/tests/test_string_translate.py index 2ae893e69fb..84fd3354ac6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_translate.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_translate.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture def data_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py new file mode 100644 index 00000000000..00442d866e9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import textwrap + +import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_wrap(): + width = 12 + pa_array = pa.array( + [ + "the quick brown fox jumped over the lazy brown dog", + "hello, world", + None, + ] + ) + result = plc.strings.wrap.wrap(plc.interop.from_arrow(pa_array), width) + expected = pa.array( + [ + textwrap.fill(val, width) if isinstance(val, str) else val + for val in pa_array.to_pylist() + ] + ) + assert_column_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_table.py b/python/pylibcudf/pylibcudf/tests/test_table.py index e822d6a97a8..ac39ef4c5c9 100644 --- a/python/pylibcudf/pylibcudf/tests/test_table.py +++ b/python/pylibcudf/pylibcudf/tests/test_table.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.mark.parametrize( "arrow_tbl", diff --git a/python/pylibcudf/pylibcudf/tests/test_transform.py b/python/pylibcudf/pylibcudf/tests/test_transform.py index d5c618f07e4..49802fe64ac 100644 --- a/python/pylibcudf/pylibcudf/tests/test_transform.py +++ b/python/pylibcudf/pylibcudf/tests/test_transform.py @@ -3,9 +3,10 @@ import math import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_nans_to_nulls(has_nans): if has_nans: diff --git a/python/pylibcudf/pylibcudf/tests/test_transpose.py b/python/pylibcudf/pylibcudf/tests/test_transpose.py index ac11123f680..b0c0bc72ead 100644 --- a/python/pylibcudf/pylibcudf/tests/test_transpose.py +++ b/python/pylibcudf/pylibcudf/tests/test_transpose.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from packaging.version import parse +import pylibcudf as plc + @pytest.mark.skipif( parse(pa.__version__) < parse("16.0.0"), diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx index de425a27c15..bce9702752a 100644 --- a/python/pylibcudf/pylibcudf/transform.pyx +++ b/python/pylibcudf/pylibcudf/transform.pyx @@ -9,7 +9,8 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport bitmask_type, size_type -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from rmm.librmm.device_buffer cimport device_buffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from .column cimport Column from .gpumemoryview cimport gpumemoryview @@ -34,7 +35,7 @@ cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): cdef pair[unique_ptr[device_buffer], size_type] c_result with nogil: - c_result = move(cpp_transform.nans_to_nulls(input.view())) + c_result = cpp_transform.nans_to_nulls(input.view()) return ( gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), @@ -58,7 +59,7 @@ cpdef tuple[gpumemoryview, int] bools_to_mask(Column input): cdef pair[unique_ptr[device_buffer], size_type] c_result with nogil: - c_result = move(cpp_transform.bools_to_mask(input.view())) + c_result = cpp_transform.bools_to_mask(input.view()) return ( gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), @@ -87,7 +88,7 @@ cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit): cdef bitmask_type * bitmask_ptr = int_to_bitmask_ptr(bitmask) with nogil: - c_result = move(cpp_transform.mask_to_bools(bitmask_ptr, begin_bit, end_bit)) + c_result = cpp_transform.mask_to_bools(bitmask_ptr, begin_bit, end_bit) return Column.from_libcudf(move(c_result)) @@ -118,10 +119,8 @@ cpdef Column transform(Column input, str unary_udf, DataType output_type, bool i cdef bool c_is_ptx = is_ptx with nogil: - c_result = move( - cpp_transform.transform( - input.view(), c_unary_udf, output_type.c_obj, c_is_ptx - ) + c_result = cpp_transform.transform( + input.view(), c_unary_udf, output_type.c_obj, c_is_ptx ) return Column.from_libcudf(move(c_result)) @@ -143,7 +142,7 @@ cpdef tuple[Table, Column] encode(Table input): cdef pair[unique_ptr[table], unique_ptr[column]] c_result with nogil: - c_result = move(cpp_transform.encode(input.view())) + c_result = cpp_transform.encode(input.view()) return ( Table.from_libcudf(move(c_result.first)), @@ -171,7 +170,7 @@ cpdef Table one_hot_encode(Column input, Column categories): cdef Table owner_table with nogil: - c_result = move(cpp_transform.one_hot_encode(input.view(), categories.view())) + c_result = cpp_transform.one_hot_encode(input.view(), categories.view()) owner_table = Table( [Column.from_libcudf(move(c_result.first))] * c_result.second.num_columns() diff --git a/python/pylibcudf/pylibcudf/transpose.pyx b/python/pylibcudf/pylibcudf/transpose.pyx index a708f6cc37f..a24f937ced3 100644 --- a/python/pylibcudf/pylibcudf/transpose.pyx +++ b/python/pylibcudf/pylibcudf/transpose.pyx @@ -29,7 +29,7 @@ cpdef Table transpose(Table input_table): cdef Table owner_table with nogil: - c_result = move(cpp_transpose.transpose(input_table.view())) + c_result = cpp_transpose.transpose(input_table.view()) owner_table = Table( [Column.from_libcudf(move(c_result.first))] * c_result.second.num_columns() diff --git a/python/pylibcudf/pylibcudf/unary.pyx 
b/python/pylibcudf/pylibcudf/unary.pyx index 839360ef406..53e8c382b5e 100644 --- a/python/pylibcudf/pylibcudf/unary.pyx +++ b/python/pylibcudf/pylibcudf/unary.pyx @@ -34,7 +34,7 @@ cpdef Column unary_operation(Column input, unary_operator op): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.unary_operation(input.view(), op)) + result = cpp_unary.unary_operation(input.view(), op) return Column.from_libcudf(move(result)) @@ -57,7 +57,7 @@ cpdef Column is_null(Column input): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.is_null(input.view())) + result = cpp_unary.is_null(input.view()) return Column.from_libcudf(move(result)) @@ -80,7 +80,7 @@ cpdef Column is_valid(Column input): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.is_valid(input.view())) + result = cpp_unary.is_valid(input.view()) return Column.from_libcudf(move(result)) @@ -105,7 +105,7 @@ cpdef Column cast(Column input, DataType data_type): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.cast(input.view(), data_type.c_obj)) + result = cpp_unary.cast(input.view(), data_type.c_obj) return Column.from_libcudf(move(result)) @@ -128,7 +128,7 @@ cpdef Column is_nan(Column input): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.is_nan(input.view())) + result = cpp_unary.is_nan(input.view()) return Column.from_libcudf(move(result)) @@ -151,7 +151,7 @@ cpdef Column is_not_nan(Column input): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.is_not_nan(input.view())) + result = cpp_unary.is_not_nan(input.view()) return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index a8224f54e1c..a80c85a1fa8 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -22,7 +22,8 @@ dependencies = [ "libcudf==24.12.*,>=0.0.0a0", "nvtx>=0.2.1", "packaging", - "pyarrow>=14.0.0,<18.0.0a0", + "pyarrow>=14.0.0,<18.0.0a0,!=17.0.0; platform_machine=='aarch64'", + "pyarrow>=14.0.0,<18.0.0a0; platform_machine=='x86_64'", "rmm==24.12.*,>=0.0.0a0", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
@@ -52,48 +53,31 @@ test = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", -] -known_rapids = [ - "rmm", -] -known_first_party = [ - "cudf", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["cudf"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] + +[tool.pytest.ini_options] +# --import-mode=importlib because two test_json.py exists and tests directory is not a structured module +addopts = "--tb=native --strict-config --strict-markers --import-mode=importlib" +empty_parameter_set_mark = "fail_at_collect" +filterwarnings = [ + "error", + "ignore:::.*xdist.*", + "ignore:::.*pytest.*" ] +xfail_strict = true [tool.rapids-build-backend] build-backend = "scikit_build_core.build"
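The [tool.ruff.lint.isort] settings above mirror the removed isort configuration: the custom dask and rapids sections sort between third-party and first-party imports, the same scheme the import-reorder hunks throughout this diff apply (moving `import pylibcudf as plc` into its own later group). A minimal sketch of the resulting order, using only module names declared in the config above:

from __future__ import annotations  # future

import os  # standard-library

import pyarrow as pa  # third-party

import dask  # custom "dask" section (dask, distributed, dask_cuda)

import rmm  # custom "rapids" section

import cudf  # first-party via known-first-party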