From ff2480b6c2e8acb1c6bccfb197b11a53d229699c Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 18 Sep 2024 02:48:09 +0000 Subject: [PATCH 01/37] Add the new multithreaded parquet example --- cpp/examples/parquet_io/CMakeLists.txt | 13 +- .../parquet_io/{parquet_io.hpp => common.hpp} | 6 +- cpp/examples/parquet_io/parquet_io.cpp | 4 +- .../parquet_io/parquet_io_multithreaded.cpp | 290 ++++++++++++++++++ 4 files changed, 306 insertions(+), 7 deletions(-) rename cpp/examples/parquet_io/{parquet_io.hpp => common.hpp} (97%) create mode 100644 cpp/examples/parquet_io/parquet_io_multithreaded.cpp diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt index d8e9205ffd4..1e1d2c3516f 100644 --- a/cpp/examples/parquet_io/CMakeLists.txt +++ b/cpp/examples/parquet_io/CMakeLists.txt @@ -16,10 +16,17 @@ project( include(../fetch_dependencies.cmake) -# Configure your project here +# Build and install parquet_io add_executable(parquet_io parquet_io.cpp) target_link_libraries(parquet_io PRIVATE cudf::cudf) target_compile_features(parquet_io PRIVATE cxx_std_17) - install(TARGETS parquet_io DESTINATION bin/examples/libcudf) -install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.parquet DESTINATION bin/examples/libcudf) + +# Build and install parquet_io_multithreaded +add_executable(parquet_io_multithreaded parquet_io_multithreaded.cpp) +target_link_libraries(parquet_io_multithreaded PRIVATE cudf::cudf) +target_compile_features(parquet_io_multithreaded PRIVATE cxx_std_17) +install(TARGETS parquet_io_multithreaded DESTINATION bin/examples/libcudf) + +# Install the example.parquet file +install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.parquet DESTINATION bin/examples/libcudf) \ No newline at end of file diff --git a/cpp/examples/parquet_io/parquet_io.hpp b/cpp/examples/parquet_io/common.hpp similarity index 97% rename from cpp/examples/parquet_io/parquet_io.hpp rename to cpp/examples/parquet_io/common.hpp index e27cbec4fce..2095a0b237c 100644 --- 
a/cpp/examples/parquet_io/parquet_io.hpp +++ b/cpp/examples/parquet_io/common.hpp @@ -16,12 +16,16 @@ #pragma once +#include "../utilities/timer.hpp" + +#include #include #include #include #include #include +#include #include #include #include @@ -123,4 +127,4 @@ std::shared_ptr create_memory_resource(bool is_ } return std::nullopt; -} +} \ No newline at end of file diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 442731694fa..cfd230d3751 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -14,9 +14,7 @@ * limitations under the License. */ -#include "parquet_io.hpp" - -#include "../utilities/timer.hpp" +#include "common.hpp" /** * @file parquet_io.cpp diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp new file mode 100644 index 00000000000..8f1b08754a9 --- /dev/null +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "common.hpp" + +#include + +#include + +/** + * @file parquet_io_multithreaded.cpp + * @brief Demonstrates usage of the libcudf APIs to read and write + * parquet file format with different encodings and compression types + * using multiple threads. 
+ * + * The following encoding and compression types are demonstrated: + * Encoding Types: DEFAULT, DICTIONARY, PLAIN, DELTA_BINARY_PACKED, + * DELTA_LENGTH_BYTE_ARRAY, DELTA_BYTE_ARRAY + * + * Compression Types: NONE, AUTO, SNAPPY, LZ4, ZSTD + * + */ + +using table_t = std::unique_ptr; + +struct read_fn { + std::vector const& input_files; + std::vector& tables; + int const thread_id; + int const thread_count; + rmm::cuda_stream_view stream; + + void operator()() + { + std::vector tables_this_thread; + for (auto curr_file_idx = thread_id; curr_file_idx < input_files.size(); + curr_file_idx += thread_count) { + auto const source_info = cudf::io::source_info(input_files[curr_file_idx]); + auto builder = cudf::io::parquet_reader_options::builder(source_info); + auto const options = builder.build(); + tables_this_thread.push_back(cudf::io::read_parquet(options, stream).tbl); + } + + // Concatenate all tables read by this thread. + auto table = std::move(tables_this_thread[0]); + std::for_each(tables_this_thread.begin() + 1, tables_this_thread.end(), [&](auto& tbl) { + std::vector const table_views{table->view(), tbl->view()}; + table = cudf::concatenate(table_views, stream); + }); + + // Done with this stream + stream.synchronize_no_throw(); + + tables[thread_id] = std::move(table); + } +}; + +struct write_fn { + std::string const& output_path; + std::vector const& tables; + cudf::io::column_encoding const encoding; + cudf::io::compression_type const compression; + std::optional const stats_level; + int const thread_id; + + void operator()() + { + // write the data for inspection + auto sink_info = + cudf::io::sink_info(output_path + "/table_" + std::to_string(thread_id) + ".parquet"); + auto builder = cudf::io::parquet_writer_options::builder(sink_info, tables[thread_id]->view()) + .compression(compression) + .stats_level(stats_level.value_or(cudf::io::statistics_freq::STATISTICS_NONE)); + auto table_metadata = 
cudf::io::table_input_metadata{tables[thread_id]->view()}; + + std::for_each(table_metadata.column_metadata.begin(), + table_metadata.column_metadata.end(), + [=](auto& col_meta) { col_meta.set_encoding(encoding); }); + + builder.metadata(table_metadata); + auto options = builder.build(); + // Write parquet data + cudf::io::write_parquet(options); + } +}; + +int main(int argc, char const** argv) +{ + std::string input_paths; + std::string output_path; + cudf::io::column_encoding encoding; + cudf::io::compression_type compression; + std::optional page_stats; + int thread_count; + + switch (argc) { + case 1: + input_paths = "example.parquet"; + output_path = "output.parquet"; + encoding = get_encoding_type("DELTA_BINARY_PACKED"); + compression = get_compression_type("ZSTD"); + thread_count = 2; + break; + case 7: page_stats = get_page_size_stats(argv[6]); [[fallthrough]]; + case 6: + input_paths = std::string{argv[1]}; + output_path = std::string{argv[2]}; + encoding = get_encoding_type(argv[3]); + compression = get_compression_type(argv[4]); + thread_count = std::stoi(std::string(argv[5])); + break; + default: + throw std::runtime_error( + "Either provide all command-line arguments, or none to use defaults\n" + "Use: parquet_io_multithreaded " + " " + "\n"); + } + + // Process and extract all input files + auto const input_files = [&]() { + std::vector parquet_files; + std::vector delimited_paths = [&]() { + std::vector paths_list; + std::stringstream stream{input_paths}; + std::string path; + // extract the delimited paths. + while (std::getline(stream, path, char{','})) { + paths_list.push_back(path); // Add each token to the vector + } + return paths_list; + }(); + + std::for_each(delimited_paths.cbegin(), delimited_paths.cend(), [&](auto const& path_string) { + std::filesystem::path path{path_string}; + // If this is a parquet file, add it. 
+ if (std::filesystem::is_regular_file(path)) { + parquet_files.push_back(path_string); + } + // If this is a directory, add all files at this path + else if (std::filesystem::is_directory(path)) { + for (auto const& file : std::filesystem::directory_iterator(path)) { + if (std::filesystem::is_regular_file(file.path())) { + parquet_files.push_back(file.path().string()); + } + } + } else { + throw std::runtime_error("Encountered an invalid input path\n"); + } + }); + + // Add parquet files from existing ones if less than thread_count + for (size_t idx = 0, initial_size = parquet_files.size(); + thread_count > static_cast(parquet_files.size()); + idx++) { + parquet_files.push_back(parquet_files[idx % initial_size]); + } + + return parquet_files; + }(); + + // Exit early if nothing to do. + if (not input_files.size()) { return 0; } + + // Check if output path is a directory. + if (not std::filesystem::is_directory(std::filesystem::path{output_path})) { + throw std::runtime_error("The provided output path is not a directory\n"); + } + + auto const is_pool_used = true; + auto resource = create_memory_resource(is_pool_used); + auto default_stream = cudf::get_default_stream(); + auto stream_pool = rmm::cuda_stream_pool(thread_count); + auto stats_mr = + rmm::mr::statistics_resource_adaptor(resource.get()); + rmm::mr::set_current_device_resource(&stats_mr); + + // Lambda function to setup and launch multithread parquet read + auto const read_parquet_multithreaded = [&]() { + // Tables read by each thread + std::vector tables(thread_count); + + // Tasks to read each parquet file + std::vector read_tasks; + read_tasks.reserve(thread_count); + std::for_each(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(thread_count), + [&](auto tid) { + read_tasks.emplace_back( + read_fn{input_files, tables, tid, thread_count, stream_pool.get_stream()}); + }); + + std::vector threads; + threads.reserve(thread_count); + for (auto& c : read_tasks) { + 
threads.emplace_back(std::thread{c}); + } + for (auto& t : threads) { + t.join(); + } + return tables; + }; + + // Lambda function to setup and launch multithread parquet write + auto const write_parquet_multithreaded = [&](std::vector const& tables) { + // Tasks to read each parquet file + std::vector write_tasks; + write_tasks.reserve(thread_count); + std::for_each(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(thread_count), + [&](auto tid) { + write_tasks.emplace_back( + write_fn{output_path, tables, encoding, compression, page_stats, tid}); + }); + + std::vector threads; + threads.reserve(thread_count); + for (auto& c : write_tasks) { + threads.emplace_back(std::thread{c}); + } + for (auto& t : threads) { + t.join(); + } + }; + + // Read the parquet files with multiple threads + { + std::cout << "Note: Not timing the initial parquet read as it may include\n" + "times for nvcomp, cufile loading and RMM growth." + << std::endl + << std::endl; + + // tables read by each thread + auto const tables = read_parquet_multithreaded(); + + // In case some kernels are still running on the default stre + default_stream.synchronize(); + + // Write parquet file with the specified encoding and compression + auto const page_stat_string = (page_stats.has_value()) ? "page stats" : "no page stats"; + std::cout << "Writing at: " << output_path << " with encoding, compression and " + << page_stat_string << ".." << std::endl; + + // Write tables using multiple threads + cudf::examples::timer timer; + write_parquet_multithreaded(tables); + + // In case some kernels are still running on the default stream + default_stream.synchronize(); + + // Print elapsed time + timer.print_elapsed_millis(); + } + + // Re-read the parquet files with multiple threads + { + std::cout << "Reading for the second time using " << thread_count << " threads..." 
<< std::endl; + cudf::examples::timer timer; + auto tables = read_parquet_multithreaded(); + + // Construct the final table + auto table = std::move(tables[0]); + std::for_each(tables.begin() + 1, tables.end(), [&](auto& tbl) { + std::vector const table_views{table->view(), tbl->view()}; + table = cudf::concatenate(table_views, default_stream); + }); + + // In case some kernels are still running on the default stream + default_stream.synchronize(); + + // Print elapsed time and peak memory + timer.print_elapsed_millis(); + std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n"; + } + + return 0; +} From d06f7f2b584b53f0a24c8fbd1ff5c78875a8f4be Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 18 Sep 2024 02:55:07 +0000 Subject: [PATCH 02/37] Set the default output path to the current path --- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 8f1b08754a9..8b90bce68c1 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -109,7 +109,7 @@ int main(int argc, char const** argv) switch (argc) { case 1: input_paths = "example.parquet"; - output_path = "output.parquet"; + output_path = std::filesystem::current_path().string(); encoding = get_encoding_type("DELTA_BINARY_PACKED"); compression = get_compression_type("ZSTD"); thread_count = 2; From c13a4087407bcb6e7f85bce60450f6e84eca7245 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 18 Sep 2024 18:48:28 +0000 Subject: [PATCH 03/37] Style fix --- cpp/examples/parquet_io/CMakeLists.txt | 2 +- cpp/examples/parquet_io/common.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt index 1e1d2c3516f..28ade3666bf 100644 --- 
a/cpp/examples/parquet_io/CMakeLists.txt +++ b/cpp/examples/parquet_io/CMakeLists.txt @@ -29,4 +29,4 @@ target_compile_features(parquet_io_multithreaded PRIVATE cxx_std_17) install(TARGETS parquet_io_multithreaded DESTINATION bin/examples/libcudf) # Install the example.parquet file -install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.parquet DESTINATION bin/examples/libcudf) \ No newline at end of file +install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.parquet DESTINATION bin/examples/libcudf) diff --git a/cpp/examples/parquet_io/common.hpp b/cpp/examples/parquet_io/common.hpp index 2095a0b237c..57c6a8b4f0f 100644 --- a/cpp/examples/parquet_io/common.hpp +++ b/cpp/examples/parquet_io/common.hpp @@ -127,4 +127,4 @@ std::shared_ptr create_memory_resource(bool is_ } return std::nullopt; -} \ No newline at end of file +} From 12adeebd6900e7bf203108a2b5a4f0a1d0c1ed11 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 18 Sep 2024 19:03:51 +0000 Subject: [PATCH 04/37] Use stream pool for parquet write as well --- .../parquet_io/parquet_io_multithreaded.cpp | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 8b90bce68c1..f39f86b7e08 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -75,6 +75,7 @@ struct write_fn { cudf::io::compression_type const compression; std::optional const stats_level; int const thread_id; + rmm::cuda_stream_view stream; void operator()() { @@ -92,8 +93,12 @@ struct write_fn { builder.metadata(table_metadata); auto options = builder.build(); + // Write parquet data - cudf::io::write_parquet(options); + cudf::io::write_parquet(options, stream); + + // Done with this stream + stream.synchronize_no_throw(); } }; @@ -189,7 +194,7 @@ int main(int argc, char const** argv) rmm::mr::set_current_device_resource(&stats_mr); // Lambda 
function to setup and launch multithread parquet read - auto const read_parquet_multithreaded = [&]() { + auto const read_parquet_multithreaded = [&](std::vector const& files) { // Tables read by each thread std::vector tables(thread_count); @@ -200,7 +205,7 @@ int main(int argc, char const** argv) thrust::make_counting_iterator(thread_count), [&](auto tid) { read_tasks.emplace_back( - read_fn{input_files, tables, tid, thread_count, stream_pool.get_stream()}); + read_fn{files, tables, tid, thread_count, stream_pool.get_stream()}); }); std::vector threads; @@ -219,12 +224,13 @@ int main(int argc, char const** argv) // Tasks to read each parquet file std::vector write_tasks; write_tasks.reserve(thread_count); - std::for_each(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(thread_count), - [&](auto tid) { - write_tasks.emplace_back( - write_fn{output_path, tables, encoding, compression, page_stats, tid}); - }); + std::for_each( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(thread_count), + [&](auto tid) { + write_tasks.emplace_back(write_fn{ + output_path, tables, encoding, compression, page_stats, tid, stream_pool.get_stream()}); + }); std::vector threads; threads.reserve(thread_count); @@ -244,7 +250,7 @@ int main(int argc, char const** argv) << std::endl; // tables read by each thread - auto const tables = read_parquet_multithreaded(); + auto const tables = read_parquet_multithreaded(input_files); // In case some kernels are still running on the default stre default_stream.synchronize(); @@ -265,12 +271,11 @@ int main(int argc, char const** argv) timer.print_elapsed_millis(); } - // Re-read the parquet files with multiple threads + // Re-read the same parquet files with multiple threads { std::cout << "Reading for the second time using " << thread_count << " threads..." 
<< std::endl; cudf::examples::timer timer; - auto tables = read_parquet_multithreaded(); - + auto tables = read_parquet_multithreaded(input_files); // Construct the final table auto table = std::move(tables[0]); std::for_each(tables.begin() + 1, tables.end(), [&](auto& tbl) { From a8ae50a42060f79904b7217d0a9eb4daabd046fd Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 18 Sep 2024 20:10:46 +0000 Subject: [PATCH 05/37] Add more details to the example --- .../parquet_io/parquet_io_multithreaded.cpp | 81 +++++++++++++++---- 1 file changed, 64 insertions(+), 17 deletions(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index f39f86b7e08..f46f02966f1 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -16,8 +16,11 @@ #include "common.hpp" +#include #include +#include + #include /** @@ -136,15 +139,15 @@ int main(int argc, char const** argv) } // Process and extract all input files - auto const input_files = [&]() { + auto const extract_input_files = [thread_count = thread_count](std::string const& paths) { std::vector parquet_files; std::vector delimited_paths = [&]() { std::vector paths_list; - std::stringstream stream{input_paths}; + std::stringstream stream{paths}; std::string path; - // extract the delimited paths. + // Extract the delimited paths. 
while (std::getline(stream, path, char{','})) { - paths_list.push_back(path); // Add each token to the vector + paths_list.push_back(path); } return paths_list; }(); @@ -175,13 +178,37 @@ int main(int argc, char const** argv) } return parquet_files; - }(); + }; + + // Concatenate a vector of tables and return + auto const concatenate_tables = [](std::vector& tables, rmm::cuda_stream_view stream) { + // Construct the final table + auto table = std::move(tables[0]); + std::for_each(tables.begin() + 1, tables.end(), [&](auto& tbl) { + std::vector const table_views{table->view(), tbl->view()}; + table = cudf::concatenate(table_views, stream); + }); + return table; + }; + + // make input files from the input_paths string. + auto const input_files = extract_input_files(input_paths); // Exit early if nothing to do. - if (not input_files.size()) { return 0; } + if (not input_files.size()) { + std::cerr << "No input files to read. Exiting early.\n"; + return 0; + } - // Check if output path is a directory. - if (not std::filesystem::is_directory(std::filesystem::path{output_path})) { + // Check if output path is a valid + if (std::filesystem::is_directory({output_path})) { + // Create a new directory in output path if not empty. + if (not std::filesystem::is_empty({output_path})) { + output_path += + "/output_" + fmt::format("{:%Y-%m-%d-%H-%M-%S}", std::chrono::system_clock::now()); + std::filesystem::create_directory({output_path}); + } + } else { throw std::runtime_error("The provided output path is not a directory\n"); } @@ -275,20 +302,40 @@ int main(int argc, char const** argv) { std::cout << "Reading for the second time using " << thread_count << " threads..." 
<< std::endl; cudf::examples::timer timer; - auto tables = read_parquet_multithreaded(input_files); - // Construct the final table - auto table = std::move(tables[0]); - std::for_each(tables.begin() + 1, tables.end(), [&](auto& tbl) { - std::vector const table_views{table->view(), tbl->view()}; - table = cudf::concatenate(table_views, default_stream); - }); - + auto tables = read_parquet_multithreaded(input_files); + auto const table = concatenate_tables(tables, default_stream); // In case some kernels are still running on the default stream default_stream.synchronize(); + // Print elapsed time and peak memory + timer.print_elapsed_millis(); + std::cout << "Reading transcoded files using " << thread_count << " threads..." << std::endl; + timer.reset(); + auto transcoded_tables = read_parquet_multithreaded(extract_input_files(output_path)); + auto const transcoded_table = concatenate_tables(transcoded_tables, default_stream); // Print elapsed time and peak memory timer.print_elapsed_millis(); - std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n"; + + // In case some kernels are still running on the default stream + default_stream.synchronize(); + + std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n\n"; + + // Check for validity + try { + // Left anti-join the original and transcoded tables + // identical tables should not throw an exception and + // return an empty indices vector + auto const indices = cudf::left_anti_join( + table->view(), transcoded_table->view(), cudf::null_equality::EQUAL, resource.get()); + + // No exception thrown, check indices + auto const valid = indices->size() == 0; + std::cout << "Transcoding valid: " << std::boolalpha << valid << std::endl; + } catch (std::exception& e) { + std::cerr << e.what() << std::endl << std::endl; + std::cout << "Transcoding valid: false" << std::endl; + } } return 0; From 6679f89196615361763357058dc1293d78ed88b3 Mon Sep 17 00:00:00 
2001 From: Muhammad Haseeb Date: Wed, 18 Sep 2024 20:26:19 +0000 Subject: [PATCH 06/37] Minor improvements --- cpp/examples/parquet_io/common.hpp | 24 +++++++++++++ cpp/examples/parquet_io/parquet_io.cpp | 15 +------- .../parquet_io/parquet_io_multithreaded.cpp | 36 +++++-------------- 3 files changed, 34 insertions(+), 41 deletions(-) diff --git a/cpp/examples/parquet_io/common.hpp b/cpp/examples/parquet_io/common.hpp index 57c6a8b4f0f..f4e5757412a 100644 --- a/cpp/examples/parquet_io/common.hpp +++ b/cpp/examples/parquet_io/common.hpp @@ -128,3 +128,27 @@ std::shared_ptr create_memory_resource(bool is_ return std::nullopt; } + +/** + * @brief Check if two tables are identical, throw an error otherwise + * + * @param lhs_table View to lhs table + * @param rhs_table View to rhs table + */ +inline void check_identical_tables(cudf::table_view const& lhs_table, + cudf::table_view const& rhs_table) +{ + try { + // Left anti-join the original and transcoded tables + // identical tables should not throw an exception and + // return an empty indices vector + auto const indices = cudf::left_anti_join(lhs_table, rhs_table, cudf::null_equality::EQUAL); + + // No exception thrown, check indices + auto const valid = indices->size() == 0; + std::cout << "Transcoding valid: " << std::boolalpha << valid << std::endl; + } catch (std::exception& e) { + std::cerr << e.what() << std::endl << std::endl; + throw std::runtime_error("Transcoding valid: false\n"); + } +} diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index cfd230d3751..c981928e8f2 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -153,20 +153,7 @@ int main(int argc, char const** argv) timer.print_elapsed_millis(); // Check for validity - try { - // Left anti-join the original and transcoded tables - // identical tables should not throw an exception and - // return an empty indices vector - auto const indices = 
cudf::left_anti_join( - input->view(), transcoded_input->view(), cudf::null_equality::EQUAL, resource.get()); - - // No exception thrown, check indices - auto const valid = indices->size() == 0; - std::cout << "Transcoding valid: " << std::boolalpha << valid << std::endl; - } catch (std::exception& e) { - std::cerr << e.what() << std::endl << std::endl; - std::cout << "Transcoding valid: false" << std::endl; - } + check_identical_tables(input->view(), transcoded_input->view()); return 0; } diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index f46f02966f1..6664eccb496 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -181,7 +181,7 @@ int main(int argc, char const** argv) }; // Concatenate a vector of tables and return - auto const concatenate_tables = [](std::vector& tables, rmm::cuda_stream_view stream) { + auto const concatenate_tables = [](std::vector tables, rmm::cuda_stream_view stream) { // Construct the final table auto table = std::move(tables[0]); std::for_each(tables.begin() + 1, tables.end(), [&](auto& tbl) { @@ -275,10 +275,8 @@ int main(int argc, char const** argv) "times for nvcomp, cufile loading and RMM growth." 
<< std::endl << std::endl; - - // tables read by each thread + // Tables read by each thread auto const tables = read_parquet_multithreaded(input_files); - // In case some kernels are still running on the default stre default_stream.synchronize(); @@ -290,10 +288,8 @@ int main(int argc, char const** argv) // Write tables using multiple threads cudf::examples::timer timer; write_parquet_multithreaded(tables); - // In case some kernels are still running on the default stream default_stream.synchronize(); - // Print elapsed time timer.print_elapsed_millis(); } @@ -302,8 +298,8 @@ int main(int argc, char const** argv) { std::cout << "Reading for the second time using " << thread_count << " threads..." << std::endl; cudf::examples::timer timer; - auto tables = read_parquet_multithreaded(input_files); - auto const table = concatenate_tables(tables, default_stream); + auto const input_table = + concatenate_tables(read_parquet_multithreaded(input_files), default_stream); // In case some kernels are still running on the default stream default_stream.synchronize(); // Print elapsed time and peak memory @@ -311,31 +307,17 @@ int main(int argc, char const** argv) std::cout << "Reading transcoded files using " << thread_count << " threads..." 
<< std::endl; timer.reset(); - auto transcoded_tables = read_parquet_multithreaded(extract_input_files(output_path)); - auto const transcoded_table = concatenate_tables(transcoded_tables, default_stream); - // Print elapsed time and peak memory - timer.print_elapsed_millis(); - + auto const transcoded_table = concatenate_tables( + read_parquet_multithreaded(extract_input_files(output_path)), default_stream); // In case some kernels are still running on the default stream default_stream.synchronize(); + // Print elapsed time and peak memory + timer.print_elapsed_millis(); std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n\n"; // Check for validity - try { - // Left anti-join the original and transcoded tables - // identical tables should not throw an exception and - // return an empty indices vector - auto const indices = cudf::left_anti_join( - table->view(), transcoded_table->view(), cudf::null_equality::EQUAL, resource.get()); - - // No exception thrown, check indices - auto const valid = indices->size() == 0; - std::cout << "Transcoding valid: " << std::boolalpha << valid << std::endl; - } catch (std::exception& e) { - std::cerr << e.what() << std::endl << std::endl; - std::cout << "Transcoding valid: false" << std::endl; - } + check_identical_tables(input_table->view(), transcoded_table->view()); } return 0; From e04602c34c42982efaffc78a5c3b7bcfb02d74b1 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 18 Sep 2024 20:58:18 +0000 Subject: [PATCH 07/37] Minor improvement --- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 6664eccb496..5c9be0892cb 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -128,7 +128,7 @@ int main(int argc, char const** argv) 
output_path = std::string{argv[2]}; encoding = get_encoding_type(argv[3]); compression = get_compression_type(argv[4]); - thread_count = std::stoi(std::string(argv[5])); + thread_count = std::max(thread_count, std::stoi(std::string{argv[5]})); break; default: throw std::runtime_error( From 21ce7c7cc42596880ca82de4cd56d24323275122 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 18 Sep 2024 23:09:32 +0000 Subject: [PATCH 08/37] Minor improvements --- cpp/examples/parquet_io/common.hpp | 8 +++- cpp/examples/parquet_io/parquet_io.cpp | 14 +++--- .../parquet_io/parquet_io_multithreaded.cpp | 44 ++++++++++++------- 3 files changed, 39 insertions(+), 27 deletions(-) diff --git a/cpp/examples/parquet_io/common.hpp b/cpp/examples/parquet_io/common.hpp index f4e5757412a..25f81022d07 100644 --- a/cpp/examples/parquet_io/common.hpp +++ b/cpp/examples/parquet_io/common.hpp @@ -31,6 +31,8 @@ #include #include +#include + #include #include #include @@ -146,9 +148,11 @@ inline void check_identical_tables(cudf::table_view const& lhs_table, // No exception thrown, check indices auto const valid = indices->size() == 0; - std::cout << "Transcoding valid: " << std::boolalpha << valid << std::endl; + fmt::print( + fmt::emphasis::bold | fg(fmt::color::green_yellow), "Transcoding valid: {}\n", valid); } catch (std::exception& e) { std::cerr << e.what() << std::endl << std::endl; - throw std::runtime_error("Transcoding valid: false\n"); + throw std::runtime_error( + fmt::format(fmt::emphasis::bold | fg(fmt::color::red), "Transcoding valid: false\n")); } } diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index c981928e8f2..06505016ab9 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -126,18 +126,16 @@ int main(int argc, char const** argv) // Read input parquet file // We do not want to time the initial read time as it may include // time for nvcomp, cufile loading and RMM growth - 
std::cout << std::endl << "Reading " << input_filepath << "..." << std::endl; - std::cout << "Note: Not timing the initial parquet read as it may include\n" - "times for nvcomp, cufile loading and RMM growth." - << std::endl - << std::endl; + fmt::print("\nReading {}...", input_filepath); + fmt::print( + "Note: Not timing the initial parquet read as it may include\n" + "times for nvcomp, cufile loading and RMM growth.\n\n"); auto [input, metadata] = read_parquet(input_filepath); // Status string to indicate if page stats are set to be written or not auto page_stat_string = (page_stats.has_value()) ? "page stats" : "no page stats"; // Write parquet file with the specified encoding and compression - std::cout << "Writing " << output_filepath << " with encoding, compression and " - << page_stat_string << ".." << std::endl; + fmt::print("Writing {} with encoding, compression and {}..\n", output_filepath, page_stat_string); // `timer` is automatically started here cudf::examples::timer timer; @@ -145,7 +143,7 @@ int main(int argc, char const** argv) timer.print_elapsed_millis(); // Read the parquet file written with encoding and compression - std::cout << "Reading " << output_filepath << "..." 
<< std::endl; + fmt::print("Reading {}...\n", output_filepath); // Reset the timer timer.reset(); diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 5c9be0892cb..361683c0e9e 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -73,7 +73,7 @@ struct read_fn { struct write_fn { std::string const& output_path; - std::vector const& tables; + std::vector const& table_views; cudf::io::column_encoding const encoding; cudf::io::compression_type const compression; std::optional const stats_level; @@ -85,10 +85,10 @@ struct write_fn { // write the data for inspection auto sink_info = cudf::io::sink_info(output_path + "/table_" + std::to_string(thread_id) + ".parquet"); - auto builder = cudf::io::parquet_writer_options::builder(sink_info, tables[thread_id]->view()) + auto builder = cudf::io::parquet_writer_options::builder(sink_info, table_views[thread_id]) .compression(compression) .stats_level(stats_level.value_or(cudf::io::statistics_freq::STATISTICS_NONE)); - auto table_metadata = cudf::io::table_input_metadata{tables[thread_id]->view()}; + auto table_metadata = cudf::io::table_input_metadata{table_views[thread_id]}; std::for_each(table_metadata.column_metadata.begin(), table_metadata.column_metadata.end(), @@ -246,8 +246,8 @@ int main(int argc, char const** argv) return tables; }; - // Lambda function to setup and launch multithread parquet write - auto const write_parquet_multithreaded = [&](std::vector const& tables) { + // Lambda function to setup and launch multithreaded parquet writes + auto const write_parquet_multithreaded = [&](std::vector const& tables) { // Tasks to read each parquet file std::vector write_tasks; write_tasks.reserve(thread_count); @@ -271,23 +271,33 @@ int main(int argc, char const** argv) // Read the parquet files with multiple threads { - std::cout << "Note: Not timing the initial parquet read 
as it may include\n" - "times for nvcomp, cufile loading and RMM growth." - << std::endl - << std::endl; + fmt::print( + "Note: Not timing the initial parquet read as it may include\n" + "times for nvcomp, cufile loading and RMM growth.\n\n"); // Tables read by each thread auto const tables = read_parquet_multithreaded(input_files); // In case some kernels are still running on the default stre default_stream.synchronize(); - // Write parquet file with the specified encoding and compression + // Construct a vector of table views for write_parquet_multithreaded + auto const table_views = [&tables]() { + std::vector table_views; + table_views.reserve(tables.size()); + + std::transform( + tables.cbegin(), tables.cend(), std::back_inserter(table_views), [](auto const& tbl) { + return tbl->view(); + }); + return table_views; + }(); + + // Write tables to parquet with the specified encoding and compression auto const page_stat_string = (page_stats.has_value()) ? "page stats" : "no page stats"; - std::cout << "Writing at: " << output_path << " with encoding, compression and " - << page_stat_string << ".." << std::endl; + fmt::print( + "Writing at: {} with encoding, compression and {}..\n", output_path, page_stat_string); - // Write tables using multiple threads cudf::examples::timer timer; - write_parquet_multithreaded(tables); + write_parquet_multithreaded(table_views); // In case some kernels are still running on the default stream default_stream.synchronize(); // Print elapsed time @@ -296,7 +306,7 @@ int main(int argc, char const** argv) // Re-read the same parquet files with multiple threads { - std::cout << "Reading for the second time using " << thread_count << " threads..." 
<< std::endl; + fmt::print("Reading for the second time using {} threads...\n", thread_count); cudf::examples::timer timer; auto const input_table = concatenate_tables(read_parquet_multithreaded(input_files), default_stream); @@ -305,7 +315,7 @@ int main(int argc, char const** argv) // Print elapsed time and peak memory timer.print_elapsed_millis(); - std::cout << "Reading transcoded files using " << thread_count << " threads..." << std::endl; + fmt::print("Reading transcoded files using {} threads...\n", thread_count); timer.reset(); auto const transcoded_table = concatenate_tables( read_parquet_multithreaded(extract_input_files(output_path)), default_stream); @@ -314,7 +324,7 @@ int main(int argc, char const** argv) // Print elapsed time and peak memory timer.print_elapsed_millis(); - std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n\n"; + fmt::print("Peak memory: {} MB\n\n", (stats_mr.get_bytes_counter().peak / 1048576.0)); // Check for validity check_identical_tables(input_table->view(), transcoded_table->view()); From b8b8bb954ab3d529ffab2e31fbfe2a58b1194965 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 19 Sep 2024 00:36:59 +0000 Subject: [PATCH 09/37] Move the vector to concatenate tables --- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 361683c0e9e..30fc4ec9354 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -309,7 +309,7 @@ int main(int argc, char const** argv) fmt::print("Reading for the second time using {} threads...\n", thread_count); cudf::examples::timer timer; auto const input_table = - concatenate_tables(read_parquet_multithreaded(input_files), default_stream); + concatenate_tables(std::move(read_parquet_multithreaded(input_files)), 
default_stream); // In case some kernels are still running on the default stream default_stream.synchronize(); // Print elapsed time and peak memory @@ -318,7 +318,7 @@ int main(int argc, char const** argv) fmt::print("Reading transcoded files using {} threads...\n", thread_count); timer.reset(); auto const transcoded_table = concatenate_tables( - read_parquet_multithreaded(extract_input_files(output_path)), default_stream); + std::move(read_parquet_multithreaded(extract_input_files(output_path))), default_stream); // In case some kernels are still running on the default stream default_stream.synchronize(); // Print elapsed time and peak memory From 188ce11900072121145ce1ba554ec15be9eedd82 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Mon, 23 Sep 2024 18:09:25 +0000 Subject: [PATCH 10/37] Minor improvement --- .../parquet_io/parquet_io_multithreaded.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 30fc4ec9354..0a05e22f8d7 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -180,15 +180,16 @@ int main(int argc, char const** argv) return parquet_files; }; - // Concatenate a vector of tables and return + // Lambda to concatenate a vector of tables auto const concatenate_tables = [](std::vector tables, rmm::cuda_stream_view stream) { + std::vector table_views; + table_views.reserve(tables.size()); + std::transform( + tables.begin(), tables.end(), std::back_inserter(table_views), [&](auto const& tbl) { + return tbl->view(); + }); // Construct the final table - auto table = std::move(tables[0]); - std::for_each(tables.begin() + 1, tables.end(), [&](auto& tbl) { - std::vector const table_views{table->view(), tbl->view()}; - table = cudf::concatenate(table_views, stream); - }); - return table; + return cudf::concatenate(table_views, stream); 
}; // make input files from the input_paths string. From 990f2bbacdfbb96a283000b7612587271244464d Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 27 Sep 2024 00:30:05 +0000 Subject: [PATCH 11/37] Make multithreaded parquet io example more sophisticated --- cpp/examples/parquet_io/common.hpp | 111 ++++- cpp/examples/parquet_io/parquet_io.cpp | 4 +- .../parquet_io/parquet_io_multithreaded.cpp | 440 ++++++++++-------- 3 files changed, 334 insertions(+), 221 deletions(-) diff --git a/cpp/examples/parquet_io/common.hpp b/cpp/examples/parquet_io/common.hpp index 25f81022d07..eaff77708e6 100644 --- a/cpp/examples/parquet_io/common.hpp +++ b/cpp/examples/parquet_io/common.hpp @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -64,7 +65,7 @@ std::shared_ptr create_memory_resource(bool is_ { using encoding_type = cudf::io::column_encoding; - static const std::unordered_map map = { + static const std::unordered_map map = { {"DEFAULT", encoding_type::USE_DEFAULT}, {"DICTIONARY", encoding_type::DICTIONARY}, {"PLAIN", encoding_type::PLAIN}, @@ -79,9 +80,7 @@ std::shared_ptr create_memory_resource(bool is_ " is not a valid encoding type.\n\n" "Available encoding types: DEFAULT, DICTIONARY, PLAIN,\n" "DELTA_BINARY_PACKED, DELTA_LENGTH_BYTE_ARRAY,\n" - "DELTA_BYTE_ARRAY\n" - "\n" - "Exiting...\n"); + "DELTA_BYTE_ARRAY\n\n"); } /** @@ -94,7 +93,7 @@ std::shared_ptr create_memory_resource(bool is_ { using compression_type = cudf::io::compression_type; - static const std::unordered_map map = { + static const std::unordered_map map = { {"NONE", compression_type::NONE}, {"AUTO", compression_type::AUTO}, {"SNAPPY", compression_type::SNAPPY}, @@ -106,29 +105,26 @@ std::shared_ptr create_memory_resource(bool is_ throw std::invalid_argument("FATAL: " + std::string(name) + " is not a valid compression type.\n\n" "Available compression_type types: NONE, AUTO, SNAPPY,\n" - "LZ4, ZSTD\n" - "\n" - "Exiting...\n"); 
+ "LZ4, ZSTD\n\n"); } /** - * @brief Get the optional page size stat frequency from they keyword + * @brief Get boolean from they keyword * - * @param use_stats keyword affirmation string such as: Y, T, YES, TRUE, ON - * @return optional page statistics frequency set to full (STATISTICS_COLUMN) + * @param input keyword affirmation string such as: Y, T, YES, TRUE, ON + * @return true or false */ -[[nodiscard]] std::optional get_page_size_stats(std::string use_stats) +[[nodiscard]] bool get_boolean(std::string input) { - std::transform(use_stats.begin(), use_stats.end(), use_stats.begin(), ::toupper); + std::transform(input.begin(), input.end(), input.begin(), ::toupper); // Check if the input string matches to any of the following - if (not use_stats.compare("ON") or not use_stats.compare("TRUE") or - not use_stats.compare("YES") or not use_stats.compare("Y") or not use_stats.compare("T")) { - // Full column and offset indices - STATISTICS_COLUMN - return std::make_optional(cudf::io::statistics_freq::STATISTICS_COLUMN); + if (not input.compare("ON") or not input.compare("TRUE") or not input.compare("YES") or + not input.compare("Y") or not input.compare("T")) { + return true; + } else { + return false; } - - return std::nullopt; } /** @@ -149,10 +145,83 @@ inline void check_identical_tables(cudf::table_view const& lhs_table, // No exception thrown, check indices auto const valid = indices->size() == 0; fmt::print( - fmt::emphasis::bold | fg(fmt::color::green_yellow), "Transcoding valid: {}\n", valid); + fmt::emphasis::bold | fg(fmt::color::green_yellow), "Transcoding valid: {}\n\n", valid); } catch (std::exception& e) { std::cerr << e.what() << std::endl << std::endl; throw std::runtime_error( - fmt::format(fmt::emphasis::bold | fg(fmt::color::red), "Transcoding valid: false\n")); + fmt::format(fmt::emphasis::bold | fg(fmt::color::red), "Transcoding valid: false\n\n")); + } +} + +/** + * @brief Get io sink type from the string keyword argumnet + * + * @param name io 
sink type keyword name + * @return corresponding io sink type type + */ +[[nodiscard]] std::optional get_io_sink_type(std::string name) +{ + using io_type = cudf::io::io_type; + + static const std::unordered_map map = { + {"FILEPATH", io_type::FILEPATH}, + {"HOST_BUFFER", io_type::HOST_BUFFER}, + {"PINNED_BUFFER", io_type::HOST_BUFFER}, + {"DEVICE_BUFFER", io_type::DEVICE_BUFFER}}; + + std::transform(name.begin(), name.end(), name.begin(), ::toupper); + if (map.find(name) != map.end()) { + return {map.at(name)}; + } else { + fmt::print( + "{} is not a valid io sink type. Available: FILEPATH,\n" + "HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER. Ignoring\n\n", + name); + return std::nullopt; + } +} + +/** + * @brief Concatenate a vector of tables and return the resultant table + * + * @param tables Vector of tables to concatenate + * @param stream CUDA stream to use + * + * @return Unique pointer to the resultant concatenated table. + */ +std::unique_ptr concatenate_tables(std::vector> tables, + rmm::cuda_stream_view stream) +{ + if (tables.size() == 1) { return std::move(tables[0]); } + + std::vector table_views; + table_views.reserve(tables.size()); + std::transform( + tables.begin(), tables.end(), std::back_inserter(table_views), [&](auto const& tbl) { + return tbl->view(); + }); + // Construct the final table + return cudf::concatenate(table_views, stream); +} + +/** + * @brief Thread unsafe function to create a directory for FILEPATH io sink type and return its path + * + * @return File path of the created directory + */ +[[nodiscard]] std::string get_default_output_path() +{ + static std::string output_path = std::filesystem::current_path().string(); + if (output_path == std::filesystem::current_path().string()) { + // Check if output path is a valid directory + if (std::filesystem::is_directory({output_path})) { + // Create a new directory in output path if not empty. 
+ if (not std::filesystem::is_empty({output_path})) { + output_path += + "/output_" + fmt::format("{:%Y-%m-%d-%H-%M-%S}", std::chrono::system_clock::now()); + std::filesystem::create_directory({output_path}); + } + } } + return output_path; } diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 5ea41a8fc67..a4ee550b0e4 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -108,7 +108,9 @@ int main(int argc, char const** argv) encoding = get_encoding_type("DELTA_BINARY_PACKED"); compression = get_compression_type("ZSTD"); break; - case 6: page_stats = get_page_size_stats(argv[5]); [[fallthrough]]; + case 6: + if (get_boolean(argv[5])) { page_stats = cudf::io::statistics_freq::STATISTICS_COLUMN; }; + [[fallthrough]]; case 5: input_filepath = argv[1]; output_filepath = argv[2]; diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 0a05e22f8d7..419cef23d33 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -21,24 +21,34 @@ #include -#include - /** * @file parquet_io_multithreaded.cpp - * @brief Demonstrates usage of the libcudf APIs to read and write - * parquet file format with different encodings and compression types - * using multiple threads. + * @brief Demonstrates multithreaded read of parquet files and optionally + * multithreaded writing the read tables to the specified io sink source type. * - * The following encoding and compression ztypes are demonstrated: - * Encoding Types: DEFAULT, DICTIONARY, PLAIN, DELTA_BINARY_PACKED, - * DELTA_LENGTH_BYTE_ARRAY, DELTA_BYTE_ARRAY + * Run: ``parquet_io_multithreaded -h`` to see help with input args and more. 
* - * Compression Types: NONE, AUTO, SNAPPY, LZ4, ZSTD + * The following io sink types are supported: + * IO sink types: FILEPATH, HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER * */ +// Type alias for unique ptr to cudf table using table_t = std::unique_ptr; +/** + * @brief Behavior when handling the read tables by multiple threads + */ +enum class read_mode { + NOWORK, ///< Only read and discard tables + CONCATENATE_THREAD, ///< Read and concatenate tables from each thread + CONCATENATE_ALL, ///< Read and concatenate everything to a single table +}; + +/** + * @brief Functor for multithreaded parquet reading based on the provided read_mode + */ +template struct read_fn { std::vector const& input_files; std::vector& tables; @@ -48,52 +58,99 @@ struct read_fn { void operator()() { + // Tables read by this thread std::vector tables_this_thread; + + // Sweep the available input files for (auto curr_file_idx = thread_id; curr_file_idx < input_files.size(); curr_file_idx += thread_count) { auto const source_info = cudf::io::source_info(input_files[curr_file_idx]); auto builder = cudf::io::parquet_reader_options::builder(source_info); auto const options = builder.build(); - tables_this_thread.push_back(cudf::io::read_parquet(options, stream).tbl); + if constexpr (READ_FN != read_mode::NOWORK) { + tables_this_thread.push_back(cudf::io::read_parquet(options, stream).tbl); + } else { + cudf::io::read_parquet(options, stream); + } } - // Concatenate all tables read by this thread. - auto table = std::move(tables_this_thread[0]); - std::for_each(tables_this_thread.begin() + 1, tables_this_thread.end(), [&](auto& tbl) { - std::vector const table_views{table->view(), tbl->view()}; - table = cudf::concatenate(table_views, stream); + // Concatenate the tables read by this thread if not NOWORK read_mode. 
+ if constexpr (READ_FN != read_mode::NOWORK) { + auto table = concatenate_tables(std::move(tables_this_thread), stream); + stream.synchronize_no_throw(); + tables[thread_id] = std::move(table); + } else { + // Just synchronize this stream and exit + stream.synchronize_no_throw(); + } + } +}; + +/** + * @brief Function to setup and launch multithreaded parquet reading. + */ +template +std::vector read_parquet_multithreaded(std::vector const& files, + int32_t thread_count, + rmm::cuda_stream_pool& stream_pool) +{ + // Tables read by each thread + std::vector tables(thread_count); + + // Table reading tasks + std::vector> read_tasks; + read_tasks.reserve(thread_count); + + // Create the read tasks + std::for_each( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(thread_count), [&](auto tid) { + read_tasks.emplace_back( + read_fn{files, tables, tid, thread_count, stream_pool.get_stream()}); }); - // Done with this stream - stream.synchronize_no_throw(); + // Create threads with tasks + std::vector threads; + threads.reserve(thread_count); + for (auto& c : read_tasks) { + threads.emplace_back(std::thread{c}); + } + for (auto& t : threads) { + t.join(); + } - tables[thread_id] = std::move(table); + // If CONCATENATE_ALL mode, then concatenate to a vector of one final table. 
+ if (read_mode == read_mode::CONCATENATE_ALL) { + auto stream = stream_pool.get_stream(); + auto final_tbl = concatenate_tables(std::move(tables), stream); + stream.synchronize(); + tables.clear(); + tables.emplace_back(std::move(final_tbl)); } -}; + return tables; +} + +/** + * @brief Functor for multithreaded parquet writing + */ struct write_fn { - std::string const& output_path; + cudf::io::io_type io_sink_type; std::vector const& table_views; - cudf::io::column_encoding const encoding; - cudf::io::compression_type const compression; - std::optional const stats_level; int const thread_id; rmm::cuda_stream_view stream; void operator()() { - // write the data for inspection - auto sink_info = - cudf::io::sink_info(output_path + "/table_" + std::to_string(thread_id) + ".parquet"); - auto builder = cudf::io::parquet_writer_options::builder(sink_info, table_views[thread_id]) - .compression(compression) - .stats_level(stats_level.value_or(cudf::io::statistics_freq::STATISTICS_NONE)); + // Create a sink + auto const sink_info = [io_sink_type = io_sink_type, thread_id = thread_id]() { + return cudf::io::sink_info(get_default_output_path() + "/table_" + std::to_string(thread_id) + + ".parquet"); + }(); + // Writer options builder + auto builder = cudf::io::parquet_writer_options::builder(sink_info, table_views[thread_id]); + // Create a new metadata for the table auto table_metadata = cudf::io::table_input_metadata{table_views[thread_id]}; - std::for_each(table_metadata.column_metadata.begin(), - table_metadata.column_metadata.end(), - [=](auto& col_meta) { col_meta.set_encoding(encoding); }); - builder.metadata(table_metadata); auto options = builder.build(); @@ -105,43 +162,53 @@ struct write_fn { } }; -int main(int argc, char const** argv) +/** + * @brief The main function + */ +int32_t main(int argc, char const** argv) { - std::string input_paths; - std::string output_path; - cudf::io::column_encoding encoding; - cudf::io::compression_type compression; - 
std::optional page_stats; - int thread_count; + // Set arguments to defaults + std::string input_paths = "example.parquet"; + int32_t input_multiplier = 1; + int32_t thread_count = 2; + std::optional io_type = std::nullopt; + bool validate_output = false; + + // Function to print example usage + auto const print_usage = [] { + fmt::print(fg(fmt::color::yellow), + "\nUsage: parquet_io_multithreaded \n" + " \n" + " \n\n"); + fmt::print( + fg(fmt::color::light_sky_blue), + "Note: Provide as many arguments as you like in the above order. Default values\n" + " for the unprovided arguments will be used. No output parquet will be written\n" + " if isn't provided.\n\n"); + }; + // Set to the provided args switch (argc) { - case 1: - input_paths = "example.parquet"; - output_path = std::filesystem::current_path().string(); - encoding = get_encoding_type("DELTA_BINARY_PACKED"); - compression = get_compression_type("ZSTD"); - thread_count = 2; - break; - case 7: page_stats = get_page_size_stats(argv[6]); [[fallthrough]]; - case 6: - input_paths = std::string{argv[1]}; - output_path = std::string{argv[2]}; - encoding = get_encoding_type(argv[3]); - compression = get_compression_type(argv[4]); - thread_count = std::max(thread_count, std::stoi(std::string{argv[5]})); - break; - default: - throw std::runtime_error( - "Either provide all command-line arguments, or none to use defaults\n" - "Use: parquet_io_multithreaded " - " " - "\n"); + case 6: validate_output = get_boolean(argv[5]); [[fallthrough]]; + case 5: io_type = get_io_sink_type(argv[4]); [[fallthrough]]; + case 4: thread_count = std::max(thread_count, std::stoi(std::string{argv[3]})); [[fallthrough]]; + case 3: + input_multiplier = std::max(input_multiplier, std::stoi(std::string{argv[2]})); + [[fallthrough]]; + case 2: + if (auto arg = std::string{argv[1]}; arg == "-h" or arg == "--help") { + print_usage(); + return 0; + } else + input_paths = std::string{argv[1]}; + [[fallthrough]]; + case 1: break; + default: 
print_usage(); throw std::runtime_error(""); } - // Process and extract all input files - auto const extract_input_files = [thread_count = thread_count](std::string const& paths) { - std::vector parquet_files; - std::vector delimited_paths = [&]() { + // Lambda function to process and extract all input files + auto const extract_input_files = [thread_count, input_multiplier](std::string const& paths) { + std::vector const delimited_paths = [&]() { std::vector paths_list; std::stringstream stream{paths}; std::string path; @@ -152,114 +219,60 @@ int main(int argc, char const** argv) return paths_list; }(); - std::for_each(delimited_paths.cbegin(), delimited_paths.cend(), [&](auto const& path_string) { - std::filesystem::path path{path_string}; - // If this is a parquet file, add it. - if (std::filesystem::is_regular_file(path)) { - parquet_files.push_back(path_string); - } - // If this is a directory, add all files at this path - else if (std::filesystem::is_directory(path)) { - for (auto const& file : std::filesystem::directory_iterator(path)) { - if (std::filesystem::is_regular_file(file.path())) { - parquet_files.push_back(file.path().string()); - } - } - } else { - throw std::runtime_error("Encountered an invalid input path\n"); - } - }); + // The final list of parquet files to be read. + std::vector parquet_files; + parquet_files.reserve( + std::max(thread_count, input_multiplier * delimited_paths.size())); + // Append the input files by input_multiplier times + std::for_each( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input_multiplier), + [&](auto i) { + std::for_each( + delimited_paths.cbegin(), delimited_paths.cend(), [&](auto const& path_string) { + std::filesystem::path path{path_string}; + // If this is a parquet file, add it. 
+ if (std::filesystem::is_regular_file(path)) { + parquet_files.emplace_back(path_string); + } + // If this is a directory, add all files at this path + else if (std::filesystem::is_directory(path)) { + for (auto const& file : std::filesystem::directory_iterator(path)) { + if (std::filesystem::is_regular_file(file.path())) { + parquet_files.emplace_back(file.path().string()); + } + } + } else { + throw std::runtime_error("Encountered an invalid input path\n"); + } + }); + }); - // Add parquet files from existing ones if less than thread_count + // Cycle append parquet files from the existing ones if less than the thread_count for (size_t idx = 0, initial_size = parquet_files.size(); thread_count > static_cast(parquet_files.size()); idx++) { - parquet_files.push_back(parquet_files[idx % initial_size]); + parquet_files.emplace_back(parquet_files[idx % initial_size]); } return parquet_files; }; - // Lambda to concatenate a vector of tables - auto const concatenate_tables = [](std::vector tables, rmm::cuda_stream_view stream) { - std::vector table_views; - table_views.reserve(tables.size()); - std::transform( - tables.begin(), tables.end(), std::back_inserter(table_views), [&](auto const& tbl) { - return tbl->view(); - }); - // Construct the final table - return cudf::concatenate(table_views, stream); - }; - - // make input files from the input_paths string. - auto const input_files = extract_input_files(input_paths); - - // Exit early if nothing to do. - if (not input_files.size()) { - std::cerr << "No input files to read. Exiting early.\n"; - return 0; - } - - // Check if output path is a valid - if (std::filesystem::is_directory({output_path})) { - // Create a new directory in output path if not empty. 
- if (not std::filesystem::is_empty({output_path})) { - output_path += - "/output_" + fmt::format("{:%Y-%m-%d-%H-%M-%S}", std::chrono::system_clock::now()); - std::filesystem::create_directory({output_path}); - } - } else { - throw std::runtime_error("The provided output path is not a directory\n"); - } - - auto const is_pool_used = true; - auto resource = create_memory_resource(is_pool_used); - auto default_stream = cudf::get_default_stream(); - auto stream_pool = rmm::cuda_stream_pool(thread_count); - auto stats_mr = - rmm::mr::statistics_resource_adaptor(resource.get()); - rmm::mr::set_current_device_resource(&stats_mr); - - // Lambda function to setup and launch multithread parquet read - auto const read_parquet_multithreaded = [&](std::vector const& files) { - // Tables read by each thread - std::vector tables(thread_count); - - // Tasks to read each parquet file - std::vector read_tasks; - read_tasks.reserve(thread_count); - std::for_each(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(thread_count), - [&](auto tid) { - read_tasks.emplace_back( - read_fn{files, tables, tid, thread_count, stream_pool.get_stream()}); - }); - - std::vector threads; - threads.reserve(thread_count); - for (auto& c : read_tasks) { - threads.emplace_back(std::thread{c}); - } - for (auto& t : threads) { - t.join(); - } - return tables; - }; - // Lambda function to setup and launch multithreaded parquet writes - auto const write_parquet_multithreaded = [&](std::vector const& tables) { - // Tasks to read each parquet file + auto const write_parquet_multithreaded = [&](std::vector const& tables, + int32_t thread_count, + rmm::cuda_stream_pool& stream_pool) { + // Table writing tasks std::vector write_tasks; write_tasks.reserve(thread_count); std::for_each( thrust::make_counting_iterator(0), thrust::make_counting_iterator(thread_count), [&](auto tid) { - write_tasks.emplace_back(write_fn{ - output_path, tables, encoding, compression, page_stats, tid, 
stream_pool.get_stream()}); + write_tasks.emplace_back(write_fn{io_type.value(), tables, tid, stream_pool.get_stream()}); }); + // Writer threads std::vector threads; threads.reserve(thread_count); for (auto& c : write_tasks) { @@ -270,66 +283,95 @@ int main(int argc, char const** argv) } }; + // Make a list of input files from the input_paths string. + auto const input_files = extract_input_files(input_paths); + auto const is_pool_used = true; + auto resource = create_memory_resource(is_pool_used); + auto default_stream = cudf::get_default_stream(); + auto stream_pool = rmm::cuda_stream_pool(thread_count); + auto stats_mr = + rmm::mr::statistics_resource_adaptor(resource.get()); + rmm::mr::set_current_device_resource(&stats_mr); + + // Exit early if nothing to do. + if (not input_files.size()) { + std::cerr << "No input files to read. Exiting early.\n"; + return 0; + } + // Read the parquet files with multiple threads { - fmt::print( - "Note: Not timing the initial parquet read as it may include\n" - "times for nvcomp, cufile loading and RMM growth.\n\n"); - // Tables read by each thread - auto const tables = read_parquet_multithreaded(input_files); - // In case some kernels are still running on the default stre + fmt::print(fg(fmt::color::yellow), + "\nNote: Not timing the initial parquet read as it may include\n" + "times for nvcomp, cufile loading and RMM growth.\n\n"); + // Tasks to read each parquet file + auto const tables = read_parquet_multithreaded( + input_files, thread_count, stream_pool); default_stream.synchronize(); - // Construct a vector of table views for write_parquet_multithreaded - auto const table_views = [&tables]() { - std::vector table_views; - table_views.reserve(tables.size()); - - std::transform( - tables.cbegin(), tables.cend(), std::back_inserter(table_views), [](auto const& tbl) { - return tbl->view(); - }); - return table_views; - }(); + if (io_type.has_value()) { + // Initialize the default output path to avoid race condition with 
multiple writer threads. + std::ignore = get_default_output_path(); + + // Construct a vector of table views for write_parquet_multithreaded + auto const table_views = [&tables]() { + std::vector table_views; + table_views.reserve(tables.size()); + + std::transform( + tables.cbegin(), tables.cend(), std::back_inserter(table_views), [](auto const& tbl) { + return tbl->view(); + }); + return table_views; + }(); + + // Write tables to parquet + fmt::print("Writing parquet output to sink type: {}\n", std::string{argv[4]}); + cudf::examples::timer timer; + write_parquet_multithreaded(table_views, thread_count, stream_pool); + default_stream.synchronize(); + timer.print_elapsed_millis(); + } + } - // Write tables to parquet with the specified encoding and compression - auto const page_stat_string = (page_stats.has_value()) ? "page stats" : "no page stats"; + // Re-read the same parquet files with multiple threads and discard the read tables + { fmt::print( - "Writing at: {} with encoding, compression and {}..\n", output_path, page_stat_string); - + "Reading {} input files for the second time using {} threads and discarding output " + "tables...\n", + input_files.size(), + thread_count); cudf::examples::timer timer; - write_parquet_multithreaded(table_views); - // In case some kernels are still running on the default stream + // Read parquet files and discard the tables + std::ignore = + read_parquet_multithreaded(input_files, thread_count, stream_pool); default_stream.synchronize(); - // Print elapsed time timer.print_elapsed_millis(); } - // Re-read the same parquet files with multiple threads - { - fmt::print("Reading for the second time using {} threads...\n", thread_count); - cudf::examples::timer timer; - auto const input_table = - concatenate_tables(std::move(read_parquet_multithreaded(input_files)), default_stream); - // In case some kernels are still running on the default stream - default_stream.synchronize(); - // Print elapsed time and peak memory - 
timer.print_elapsed_millis(); + // Verify the output files if requested + if (validate_output and io_type.has_value()) { + fmt::print("Verifying transcoding...\n"); - fmt::print("Reading transcoded files using {} threads...\n", thread_count); - timer.reset(); - auto const transcoded_table = concatenate_tables( - std::move(read_parquet_multithreaded(extract_input_files(output_path))), default_stream); - // In case some kernels are still running on the default stream - default_stream.synchronize(); - // Print elapsed time and peak memory - timer.print_elapsed_millis(); + // CONCATENATE_ALL returns a vector of 1 table + auto const input_table = std::move( + read_parquet_multithreaded(input_files, thread_count, stream_pool) + .back()); - fmt::print("Peak memory: {} MB\n\n", (stats_mr.get_bytes_counter().peak / 1048576.0)); + auto const transcoded_table = + std::move(read_parquet_multithreaded( + extract_input_files(get_default_output_path()), thread_count, stream_pool) + .back()); + default_stream.synchronize(); // Check for validity check_identical_tables(input_table->view(), transcoded_table->view()); } + // Print peak memory + fmt::print(fmt::emphasis::bold | fg(fmt::color::medium_purple), + "Peak memory: {} MB\n\n", + (stats_mr.get_bytes_counter().peak / 1048576.0)); + return 0; -} +} \ No newline at end of file From 06817d05b644464582e173decefb67d2bdd0eba6 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 27 Sep 2024 01:27:47 +0000 Subject: [PATCH 12/37] Minor updates --- cpp/examples/parquet_io/common.hpp | 2 +- .../parquet_io/parquet_io_multithreaded.cpp | 40 ++++++++++++++----- 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/cpp/examples/parquet_io/common.hpp b/cpp/examples/parquet_io/common.hpp index eaff77708e6..c4cbd6a589a 100644 --- a/cpp/examples/parquet_io/common.hpp +++ b/cpp/examples/parquet_io/common.hpp @@ -154,7 +154,7 @@ inline void check_identical_tables(cudf::table_view const& 
lhs_table, } /** - * @brief Get io sink type from the string keyword argumnet + * @brief Get io sink type from the string keyword argument * * @param name io sink type keyword name * @return corresponding io sink type type diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 419cef23d33..02e4c772e5b 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -88,6 +88,15 @@ struct read_fn { /** * @brief Function to setup and launch multithreaded parquet reading. + * + * @tparam read_mode Specifies if to concatenate and return the actual + * tables or discard them and return an empty vector + * + * @param files List of files to read + * @param thread_count Number of threads + * @param stream_pool CUDA stream pool to use for threads + * + * @return Vector of read tables. */ template std::vector read_parquet_multithreaded(std::vector const& files, @@ -302,14 +311,17 @@ int32_t main(int argc, char const** argv) // Read the parquet files with multiple threads { fmt::print(fg(fmt::color::yellow), - "\nNote: Not timing the initial parquet read as it may include\n" - "times for nvcomp, cufile loading and RMM growth.\n\n"); - // Tasks to read each parquet file - auto const tables = read_parquet_multithreaded( - input_files, thread_count, stream_pool); - default_stream.synchronize(); + "\nReading {} input files using {} threads without timing it as \n" + "it may include times for nvcomp, cufile loading and RMM growth.\n\n", + input_files.size(), + thread_count); + // If we are writing output then read with CONCATENATE_THREAD if (io_type.has_value()) { + // Launch + auto const tables = read_parquet_multithreaded( + input_files, thread_count, stream_pool); + default_stream.synchronize(); // Initialize the default output path to avoid race condition with multiple writer threads. 
std::ignore = get_default_output_path(); @@ -326,19 +338,25 @@ int32_t main(int argc, char const** argv) }(); // Write tables to parquet - fmt::print("Writing parquet output to sink type: {}\n", std::string{argv[4]}); + fmt::print("Writing parquet output to sink type: {}..\n", std::string{argv[4]}); cudf::examples::timer timer; write_parquet_multithreaded(table_views, thread_count, stream_pool); default_stream.synchronize(); timer.print_elapsed_millis(); } + // Else simply read with NOWORK mode + else { + std::ignore = + read_parquet_multithreaded(input_files, thread_count, stream_pool); + default_stream.synchronize(); + } } // Re-read the same parquet files with multiple threads and discard the read tables { fmt::print( - "Reading {} input files for the second time using {} threads and discarding output " - "tables...\n", + "Re-reading {} input files using {} threads and discarding output " + "tables..\n", input_files.size(), thread_count); cudf::examples::timer timer; @@ -351,7 +369,7 @@ int32_t main(int argc, char const** argv) // Verify the output files if requested if (validate_output and io_type.has_value()) { - fmt::print("Verifying transcoding...\n"); + fmt::print("Verifying output..\n"); // CONCATENATE_ALL returns a vector of 1 table auto const input_table = std::move( @@ -374,4 +392,4 @@ int32_t main(int argc, char const** argv) (stats_mr.get_bytes_counter().peak / 1048576.0)); return 0; -} \ No newline at end of file +} From af8ec6a9476654de9c7aece60cee2e9cb304ddc1 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 27 Sep 2024 19:13:39 +0000 Subject: [PATCH 13/37] Minor improvements --- cpp/examples/parquet_io/common.hpp | 4 +- .../parquet_io/parquet_io_multithreaded.cpp | 134 +++++++++--------- 2 files changed, 66 insertions(+), 72 deletions(-) diff --git a/cpp/examples/parquet_io/common.hpp b/cpp/examples/parquet_io/common.hpp index c4cbd6a589a..37eb138640a 100644 --- 
a/cpp/examples/parquet_io/common.hpp +++ b/cpp/examples/parquet_io/common.hpp @@ -145,11 +145,11 @@ inline void check_identical_tables(cudf::table_view const& lhs_table, // No exception thrown, check indices auto const valid = indices->size() == 0; fmt::print( - fmt::emphasis::bold | fg(fmt::color::green_yellow), "Transcoding valid: {}\n\n", valid); + fmt::emphasis::bold | fg(fmt::color::green_yellow), "Tables identical: {}\n\n", valid); } catch (std::exception& e) { std::cerr << e.what() << std::endl << std::endl; throw std::runtime_error( - fmt::format(fmt::emphasis::bold | fg(fmt::color::red), "Transcoding valid: false\n\n")); + fmt::format(fmt::emphasis::bold | fg(fmt::color::red), "Tables identical: false\n\n")); } } diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 02e4c772e5b..7728c91cbb7 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -179,16 +179,19 @@ int32_t main(int argc, char const** argv) // Set arguments to defaults std::string input_paths = "example.parquet"; int32_t input_multiplier = 1; + int32_t num_reads = 1; int32_t thread_count = 2; std::optional io_type = std::nullopt; bool validate_output = false; // Function to print example usage auto const print_usage = [] { - fmt::print(fg(fmt::color::yellow), - "\nUsage: parquet_io_multithreaded \n" - " \n" - " \n\n"); + fmt::print( + fg(fmt::color::yellow), + "\nUsage: parquet_io_multithreaded \n" + " \n" + " \n\n"); fmt::print( fg(fmt::color::light_sky_blue), "Note: Provide as many arguments as you like in the above order. 
Default values\n" @@ -198,9 +201,10 @@ int32_t main(int argc, char const** argv) // Set to the provided args switch (argc) { - case 6: validate_output = get_boolean(argv[5]); [[fallthrough]]; - case 5: io_type = get_io_sink_type(argv[4]); [[fallthrough]]; - case 4: thread_count = std::max(thread_count, std::stoi(std::string{argv[3]})); [[fallthrough]]; + case 7: validate_output = get_boolean(argv[6]); [[fallthrough]]; + case 6: io_type = get_io_sink_type(argv[5]); [[fallthrough]]; + case 5: thread_count = std::max(thread_count, std::stoi(std::string{argv[4]})); [[fallthrough]]; + case 4: num_reads = std::max(1, std::stoi(std::string{argv[3]})); [[fallthrough]]; case 3: input_multiplier = std::max(input_multiplier, std::stoi(std::string{argv[2]})); [[fallthrough]]; @@ -308,82 +312,72 @@ int32_t main(int argc, char const** argv) return 0; } - // Read the parquet files with multiple threads - { - fmt::print(fg(fmt::color::yellow), - "\nReading {} input files using {} threads without timing it as \n" - "it may include times for nvcomp, cufile loading and RMM growth.\n\n", - input_files.size(), - thread_count); - - // If we are writing output then read with CONCATENATE_THREAD - if (io_type.has_value()) { - // Launch - auto const tables = read_parquet_multithreaded( - input_files, thread_count, stream_pool); - default_stream.synchronize(); - // Initialize the default output path to avoid race condition with multiple writer threads. 
- std::ignore = get_default_output_path(); - - // Construct a vector of table views for write_parquet_multithreaded - auto const table_views = [&tables]() { - std::vector table_views; - table_views.reserve(tables.size()); - - std::transform( - tables.cbegin(), tables.cend(), std::back_inserter(table_views), [](auto const& tbl) { - return tbl->view(); - }); - return table_views; - }(); - - // Write tables to parquet - fmt::print("Writing parquet output to sink type: {}..\n", std::string{argv[4]}); - cudf::examples::timer timer; - write_parquet_multithreaded(table_views, thread_count, stream_pool); - default_stream.synchronize(); - timer.print_elapsed_millis(); - } - // Else simply read with NOWORK mode - else { - std::ignore = - read_parquet_multithreaded(input_files, thread_count, stream_pool); - default_stream.synchronize(); - } - } - - // Re-read the same parquet files with multiple threads and discard the read tables + // Read the same parquet files specified times with multiple threads and discard the read tables { fmt::print( - "Re-reading {} input files using {} threads and discarding output " + "\nReading {} input files {} times using {} threads and discarding output " "tables..\n", input_files.size(), + num_reads, thread_count); + fmt::print( + fg(fmt::color::yellow), + "Note that the first read may include times for nvcomp, cufile loading and RMM growth.\n\n"); cudf::examples::timer timer; - // Read parquet files and discard the tables - std::ignore = - read_parquet_multithreaded(input_files, thread_count, stream_pool); + std::for_each(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_reads), + [&](auto i) { // Read parquet files and discard the tables + std::ignore = read_parquet_multithreaded( + input_files, thread_count, stream_pool); + }); default_stream.synchronize(); timer.print_elapsed_millis(); } - // Verify the output files if requested - if (validate_output and io_type.has_value()) { - fmt::print("Verifying output..\n"); - - 
// CONCATENATE_ALL returns a vector of 1 table - auto const input_table = std::move( - read_parquet_multithreaded(input_files, thread_count, stream_pool) - .back()); + // Do we need to write parquet as well? + if (io_type.has_value()) { + // Read input files with CONCATENATE_THREADS mode + auto const tables = read_parquet_multithreaded( + input_files, thread_count, stream_pool); + default_stream.synchronize(); + // Initialize the default output path to avoid race condition with multiple writer threads. + std::ignore = get_default_output_path(); + + // Construct a vector of table views for write_parquet_multithreaded + auto const table_views = [&tables]() { + std::vector table_views; + table_views.reserve(tables.size()); + + std::transform( + tables.cbegin(), tables.cend(), std::back_inserter(table_views), [](auto const& tbl) { + return tbl->view(); + }); + return table_views; + }(); - auto const transcoded_table = - std::move(read_parquet_multithreaded( - extract_input_files(get_default_output_path()), thread_count, stream_pool) - .back()); + // Write tables to parquet + fmt::print("Writing parquet output to sink type: {}..\n", std::string{argv[5]}); + cudf::examples::timer timer; + write_parquet_multithreaded(table_views, thread_count, stream_pool); default_stream.synchronize(); + timer.print_elapsed_millis(); + + // Verify the output if requested + if (validate_output) { + fmt::print("Verifying output..\n"); + + // CONCATENATE_ALL returns a vector of 1 table + auto const input_table = cudf::concatenate(table_views, default_stream); - // Check for validity - check_identical_tables(input_table->view(), transcoded_table->view()); + auto const transcoded_table = + std::move(read_parquet_multithreaded( + extract_input_files(get_default_output_path()), thread_count, stream_pool) + .back()); + default_stream.synchronize(); + + // Check if the tables are identical + check_identical_tables(input_table->view(), transcoded_table->view()); + } } // Print peak memory From 
d3778cc33d9a6e00a881344e9c35c2429859c6b5 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 27 Sep 2024 19:14:44 +0000 Subject: [PATCH 14/37] Set default thread count = 1 instead of 2 --- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 7728c91cbb7..1b4b342d1f5 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -180,7 +180,7 @@ int32_t main(int argc, char const** argv) std::string input_paths = "example.parquet"; int32_t input_multiplier = 1; int32_t num_reads = 1; - int32_t thread_count = 2; + int32_t thread_count = 1; std::optional io_type = std::nullopt; bool validate_output = false; From c2b39ccb9ede6300056bd5717a81727d69d6bf13 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 27 Sep 2024 19:16:45 +0000 Subject: [PATCH 15/37] Minor improvement --- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 1b4b342d1f5..354b399c050 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -307,7 +307,7 @@ int32_t main(int argc, char const** argv) rmm::mr::set_current_device_resource(&stats_mr); // Exit early if nothing to do. - if (not input_files.size()) { + if (input_files.empty()) { std::cerr << "No input files to read. 
Exiting early.\n"; return 0; } From 8f39fb22897f4aaf21ffd9cc0283ae306473f5de Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 1 Oct 2024 01:37:43 +0000 Subject: [PATCH 16/37] Add io source types --- cpp/examples/parquet_io/common.hpp | 85 ++---- cpp/examples/parquet_io/io_source.hpp | 145 ++++++++++ .../parquet_io/parquet_io_multithreaded.cpp | 272 ++++++++++-------- 3 files changed, 316 insertions(+), 186 deletions(-) create mode 100644 cpp/examples/parquet_io/io_source.hpp diff --git a/cpp/examples/parquet_io/common.hpp b/cpp/examples/parquet_io/common.hpp index 37eb138640a..16fd16ee7c1 100644 --- a/cpp/examples/parquet_io/common.hpp +++ b/cpp/examples/parquet_io/common.hpp @@ -25,20 +25,21 @@ #include #include -#include -#include #include #include #include #include -#include #include -#include -#include #include +/** + * @file common.hpp + * @brief Common utilities for `parquet_io` examples + * + */ + /** * @brief Create memory resource for libcudf functions * @@ -65,7 +66,7 @@ std::shared_ptr create_memory_resource(bool is_ { using encoding_type = cudf::io::column_encoding; - static const std::unordered_map map = { + static std::unordered_map const map = { {"DEFAULT", encoding_type::USE_DEFAULT}, {"DICTIONARY", encoding_type::DICTIONARY}, {"PLAIN", encoding_type::PLAIN}, @@ -76,11 +77,12 @@ std::shared_ptr create_memory_resource(bool is_ std::transform(name.begin(), name.end(), name.begin(), ::toupper); if (map.find(name) != map.end()) { return map.at(name); } - throw std::invalid_argument("FATAL: " + std::string(name) + - " is not a valid encoding type.\n\n" - "Available encoding types: DEFAULT, DICTIONARY, PLAIN,\n" - "DELTA_BINARY_PACKED, DELTA_LENGTH_BYTE_ARRAY,\n" - "DELTA_BYTE_ARRAY\n\n"); + throw std::invalid_argument(fmt::format(fmt::emphasis::bold | fg(fmt::color::red), + "{} is not a valid encoding type.\n\n" + "Available encoding types: DEFAULT, DICTIONARY, PLAIN,\n" + "DELTA_BINARY_PACKED, 
DELTA_LENGTH_BYTE_ARRAY,\n" + "DELTA_BYTE_ARRAY\n\n", + name)); } /** @@ -93,7 +95,7 @@ std::shared_ptr create_memory_resource(bool is_ { using compression_type = cudf::io::compression_type; - static const std::unordered_map map = { + static std::unordered_map const map = { {"NONE", compression_type::NONE}, {"AUTO", compression_type::AUTO}, {"SNAPPY", compression_type::SNAPPY}, @@ -102,10 +104,11 @@ std::shared_ptr create_memory_resource(bool is_ std::transform(name.begin(), name.end(), name.begin(), ::toupper); if (map.find(name) != map.end()) { return map.at(name); } - throw std::invalid_argument("FATAL: " + std::string(name) + - " is not a valid compression type.\n\n" - "Available compression_type types: NONE, AUTO, SNAPPY,\n" - "LZ4, ZSTD\n\n"); + throw std::invalid_argument(fmt::format(fmt::emphasis::bold | fg(fmt::color::red), + "{} is not a valid compression type.\n\n" + "Available compression types: NONE, AUTO, SNAPPY,\n" + "LZ4, ZSTD\n\n", + name)); } /** @@ -153,34 +156,6 @@ inline void check_identical_tables(cudf::table_view const& lhs_table, } } -/** - * @brief Get io sink type from the string keyword argument - * - * @param name io sink type keyword name - * @return corresponding io sink type type - */ -[[nodiscard]] std::optional get_io_sink_type(std::string name) -{ - using io_type = cudf::io::io_type; - - static const std::unordered_map map = { - {"FILEPATH", io_type::FILEPATH}, - {"HOST_BUFFER", io_type::HOST_BUFFER}, - {"PINNED_BUFFER", io_type::HOST_BUFFER}, - {"DEVICE_BUFFER", io_type::DEVICE_BUFFER}}; - - std::transform(name.begin(), name.end(), name.begin(), ::toupper); - if (map.find(name) != map.end()) { - return {map.at(name)}; - } else { - fmt::print( - "{} is not a valid io sink type. Available: FILEPATH,\n" - "HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER. 
Ignoring\n\n", - name); - return std::nullopt; - } -} - /** * @brief Concatenate a vector of tables and return the resultant table * @@ -203,25 +178,3 @@ std::unique_ptr concatenate_tables(std::vector +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +/** + * @file io_source.hpp + * @brief Utilities for construction IO sources from the input parquet files. + * + */ + +/** + * @brief Available IO source types + */ +enum class io_source_type { FILEPATH, HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER }; + +/** + * @brief Create and return a reference to a static pinned memory pool + * + * @return Reference to a static pinned memory pool + */ +rmm::host_async_resource_ref pinned_memory_resource() +{ + static auto mr = rmm::mr::pinned_host_memory_resource{}; + return mr; +} + +/** + * @brief Get io source type from the string keyword argument + * + * @param name io source type keyword name + * @return io source type + */ +[[nodiscard]] io_source_type get_io_source_type(std::string name) +{ + static std::unordered_map const map = { + {"FILEPATH", io_source_type::FILEPATH}, + {"HOST_BUFFER", io_source_type::HOST_BUFFER}, + {"PINNED_BUFFER", io_source_type::PINNED_BUFFER}, + {"DEVICE_BUFFER", io_source_type::DEVICE_BUFFER}}; + + std::transform(name.begin(), name.end(), name.begin(), ::toupper); + if (map.find(name) != map.end()) { + return map.at(name); + } else { + throw std::invalid_argument( + fmt::format(fmt::emphasis::bold | fg(fmt::color::red), + "{} is not a valid io source type. 
Available: FILEPATH,\n" + "HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER.\n\n", + name)); + } +} + +/** + * @brief Class to create a cudf::io::source_info of given type from the input parquet file + * + */ +class io_source { + public: + io_source(std::string_view file_path, io_source_type io_type, rmm::cuda_stream_view stream) + : type{io_type}, + file_name{file_path}, + file_size{std::filesystem::file_size(file_name)}, + pinned_buffer({pinned_memory_resource(), stream}), + d_buffer{0, stream} + { + // For filepath make a quick source_info and return early + if (type == io_source_type::FILEPATH) { + source_info = cudf::io::source_info(file_name); + return; + } + + std::ifstream file{file_name, std::ifstream::binary}; + + // Copy file contents to the specified io source buffer + switch (type) { + case io_source_type::HOST_BUFFER: { + h_buffer.resize(file_size); + file.read(h_buffer.data(), file_size); + source_info = cudf::io::source_info(h_buffer.data(), h_buffer.size()); + break; + } + case io_source_type::PINNED_BUFFER: { + pinned_buffer.resize(file_size); + file.read(pinned_buffer.data(), file_size); + source_info = cudf::io::source_info(pinned_buffer.data(), pinned_buffer.size()); + break; + } + case io_source_type::DEVICE_BUFFER: { + h_buffer.resize(file_size); + file.read(h_buffer.data(), file_size); + d_buffer.resize(file_size, stream); + CUDF_CUDA_TRY(cudaMemcpyAsync( + d_buffer.data(), h_buffer.data(), h_buffer.size(), cudaMemcpyDefault, stream.value())); + + source_info = cudf::io::source_info(d_buffer); + break; + } + default: { + throw std::runtime_error(fmt::format(fmt::emphasis::bold | fg(fmt::color::red), + "Encountered unexpected source type\n\n")); + } + } + } + + // Get the internal source info + [[nodiscard]] cudf::io::source_info get_source_info() const { return source_info; } + + private: + io_source_type const type; + std::string const file_name; + size_t const file_size; + cudf::io::source_info source_info; + std::vector h_buffer; + 
cudf::detail::host_vector pinned_buffer; + rmm::device_uvector d_buffer; +}; \ No newline at end of file diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 354b399c050..4204e50c271 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -15,21 +15,26 @@ */ #include "common.hpp" +#include "io_source.hpp" -#include +#include #include #include /** * @file parquet_io_multithreaded.cpp - * @brief Demonstrates multithreaded read of parquet files and optionally - * multithreaded writing the read tables to the specified io sink source type. + * @brief Demonstrates reading parquet data from the specified io source using multiple threads. * - * Run: ``parquet_io_multithreaded -h`` to see help with input args and more. + * The input parquet data is provided via files which are converted to the specified io source type + * to be read using multiple threads. Optionally, the parquet data read by each thread can be + * written to corresponding files and checked for validity of the output files against the input + * data. + * + * Run: ``parquet_io_multithreaded -h`` to see help with input args and more information. 
+ * + * The following io source types are supported: + * IO source types: FILEPATH, HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER * */ @@ -50,7 +55,7 @@ enum class read_mode { */ template struct read_fn { - std::vector const& input_files; + std::vector const& input_sources; std::vector& tables; int const thread_id; int const thread_count; @@ -62,11 +67,11 @@ struct read_fn { std::vector tables_this_thread; // Sweep the available input files - for (auto curr_file_idx = thread_id; curr_file_idx < input_files.size(); + for (auto curr_file_idx = thread_id; curr_file_idx < input_sources.size(); curr_file_idx += thread_count) { - auto const source_info = cudf::io::source_info(input_files[curr_file_idx]); - auto builder = cudf::io::parquet_reader_options::builder(source_info); - auto const options = builder.build(); + auto builder = + cudf::io::parquet_reader_options::builder(input_sources[curr_file_idx].get_source_info()); + auto const options = builder.build(); if constexpr (READ_FN != read_mode::NOWORK) { tables_this_thread.push_back(cudf::io::read_parquet(options, stream).tbl); } else { @@ -99,7 +104,7 @@ struct read_fn { * @return Vector of read tables. 
*/ template -std::vector read_parquet_multithreaded(std::vector const& files, +std::vector read_parquet_multithreaded(std::vector const& input_sources, int32_t thread_count, rmm::cuda_stream_pool& stream_pool) { @@ -114,7 +119,7 @@ std::vector read_parquet_multithreaded(std::vector const& std::for_each( thrust::make_counting_iterator(0), thrust::make_counting_iterator(thread_count), [&](auto tid) { read_tasks.emplace_back( - read_fn{files, tables, tid, thread_count, stream_pool.get_stream()}); + read_fn{input_sources, tables, tid, thread_count, stream_pool.get_stream()}); }); // Create threads with tasks @@ -143,7 +148,7 @@ std::vector read_parquet_multithreaded(std::vector const& * @brief Functor for multithreaded parquet writing */ struct write_fn { - cudf::io::io_type io_sink_type; + std::string const& output_path; std::vector const& table_views; int const thread_id; rmm::cuda_stream_view stream; @@ -151,10 +156,8 @@ struct write_fn { void operator()() { // Create a sink - auto const sink_info = [io_sink_type = io_sink_type, thread_id = thread_id]() { - return cudf::io::sink_info(get_default_output_path() + "/table_" + std::to_string(thread_id) + - ".parquet"); - }(); + cudf::io::sink_info const sink_info{output_path + "/table_" + std::to_string(thread_id) + + ".parquet"}; // Writer options builder auto builder = cudf::io::parquet_writer_options::builder(sink_info, table_views[thread_id]); // Create a new metadata for the table @@ -171,40 +174,46 @@ struct write_fn { } }; +/** + * @brief Function to print example usage + */ +void print_usage() +{ + fmt::print( + fg(fmt::color::yellow), + "\nUsage: parquet_io_multithreaded \n" + " \n" + " \n\n"); + fmt::print( + "Available IO source types: FILEPATH, HOST_BUFFER, {}, DEVICE_BUFFER\n\n", + fmt::format(fmt::emphasis::bold | fg(fmt::color::green_yellow), "PINNED_BUFFER (Default)")); + fmt::print(fg(fmt::color::light_sky_blue), + "Note: Provide as many arguments as you like in the above order. 
Default values\n" + " for the unprovided arguments will be used. All input parquet files will\n" + " be converted to the specified before reading\n\n"); +} + /** * @brief The main function */ int32_t main(int argc, char const** argv) { // Set arguments to defaults - std::string input_paths = "example.parquet"; - int32_t input_multiplier = 1; - int32_t num_reads = 1; - int32_t thread_count = 1; - std::optional io_type = std::nullopt; - bool validate_output = false; - - // Function to print example usage - auto const print_usage = [] { - fmt::print( - fg(fmt::color::yellow), - "\nUsage: parquet_io_multithreaded \n" - " \n" - " \n\n"); - fmt::print( - fg(fmt::color::light_sky_blue), - "Note: Provide as many arguments as you like in the above order. Default values\n" - " for the unprovided arguments will be used. No output parquet will be written\n" - " if isn't provided.\n\n"); - }; + std::string input_paths = "example.parquet"; + int32_t input_multiplier = 1; + int32_t num_reads = 1; + int32_t thread_count = 1; + io_source_type io_source_type = io_source_type::PINNED_BUFFER; + bool write_and_validate = false; // Set to the provided args switch (argc) { - case 7: validate_output = get_boolean(argv[6]); [[fallthrough]]; - case 6: io_type = get_io_sink_type(argv[5]); [[fallthrough]]; - case 5: thread_count = std::max(thread_count, std::stoi(std::string{argv[4]})); [[fallthrough]]; - case 4: num_reads = std::max(1, std::stoi(std::string{argv[3]})); [[fallthrough]]; + case 7: write_and_validate = get_boolean(argv[6]); [[fallthrough]]; + case 6: thread_count = std::max(thread_count, std::stoi(std::string{argv[5]})); [[fallthrough]]; + case 5: num_reads = std::max(1, std::stoi(argv[4])); [[fallthrough]]; + case 4: io_source_type = get_io_source_type(argv[3]); [[fallthrough]]; case 3: input_multiplier = std::max(input_multiplier, std::stoi(std::string{argv[2]})); [[fallthrough]]; @@ -220,59 +229,68 @@ int32_t main(int argc, char const** argv) } // Lambda function to process 
and extract all input files - auto const extract_input_files = [thread_count, input_multiplier](std::string const& paths) { - std::vector const delimited_paths = [&]() { - std::vector paths_list; - std::stringstream stream{paths}; - std::string path; - // Extract the delimited paths. - while (std::getline(stream, path, char{','})) { - paths_list.push_back(path); - } - return paths_list; - }(); - - // The final list of parquet files to be read. - std::vector parquet_files; - parquet_files.reserve( - std::max(thread_count, input_multiplier * delimited_paths.size())); - // Append the input files by input_multiplier times - std::for_each( - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input_multiplier), - [&](auto i) { - std::for_each( - delimited_paths.cbegin(), delimited_paths.cend(), [&](auto const& path_string) { - std::filesystem::path path{path_string}; - // If this is a parquet file, add it. - if (std::filesystem::is_regular_file(path)) { - parquet_files.emplace_back(path_string); - } - // If this is a directory, add all files at this path - else if (std::filesystem::is_directory(path)) { - for (auto const& file : std::filesystem::directory_iterator(path)) { - if (std::filesystem::is_regular_file(file.path())) { - parquet_files.emplace_back(file.path().string()); + auto const extract_input_sources_async = + [thread_count, input_multiplier, io_source_type = io_source_type]( + std::string const& paths, rmm::cuda_stream_view stream) { + std::vector const delimited_paths = [&]() { + std::vector paths_list; + std::stringstream strstream{paths}; + std::string path; + // Extract the delimited paths. + while (std::getline(strstream, path, char{','})) { + paths_list.push_back(path); + } + return paths_list; + }(); + + // The final list of parquet files to be read. 
+ std::vector parquet_files; + parquet_files.reserve( + std::max(thread_count, input_multiplier * delimited_paths.size())); + // Append the input files by input_multiplier times + std::for_each( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input_multiplier), + [&](auto i) { + std::for_each( + delimited_paths.cbegin(), delimited_paths.cend(), [&](auto const& path_string) { + std::filesystem::path path{path_string}; + // If this is a parquet file, add it. + if (std::filesystem::is_regular_file(path)) { + parquet_files.emplace_back(path_string); + } + // If this is a directory, add all files at this path + else if (std::filesystem::is_directory(path)) { + for (auto const& file : std::filesystem::directory_iterator(path)) { + if (std::filesystem::is_regular_file(file.path())) { + parquet_files.emplace_back(file.path().string()); + } } + } else { + throw std::runtime_error("Encountered an invalid input path\n"); } - } else { - throw std::runtime_error("Encountered an invalid input path\n"); - } - }); - }); + }); + }); - // Cycle append parquet files from the existing ones if less than the thread_count - for (size_t idx = 0, initial_size = parquet_files.size(); - thread_count > static_cast(parquet_files.size()); - idx++) { - parquet_files.emplace_back(parquet_files[idx % initial_size]); - } + // Cycle append parquet files from the existing ones if less than the thread_count + for (size_t idx = 0, initial_size = parquet_files.size(); + thread_count > static_cast(parquet_files.size()); + idx++) { + parquet_files.emplace_back(parquet_files[idx % initial_size]); + } - return parquet_files; - }; + std::vector input_sources; + input_sources.reserve(parquet_files.size()); + std::transform(parquet_files.begin(), + parquet_files.end(), + std::back_inserter(input_sources), + [&](auto& file_name) { return io_source(file_name, io_source_type, stream); }); + return input_sources; + }; // Lambda function to setup and launch multithreaded parquet writes - auto 
const write_parquet_multithreaded = [&](std::vector const& tables, + auto const write_parquet_multithreaded = [&](std::string const& output_path, + std::vector const& tables, int32_t thread_count, rmm::cuda_stream_pool& stream_pool) { // Table writing tasks @@ -282,7 +300,7 @@ int32_t main(int argc, char const** argv) thrust::make_counting_iterator(0), thrust::make_counting_iterator(thread_count), [&](auto tid) { - write_tasks.emplace_back(write_fn{io_type.value(), tables, tid, stream_pool.get_stream()}); + write_tasks.emplace_back(write_fn{output_path, tables, tid, stream_pool.get_stream()}); }); // Writer threads @@ -296,8 +314,7 @@ int32_t main(int argc, char const** argv) } }; - // Make a list of input files from the input_paths string. - auto const input_files = extract_input_files(input_paths); + // Initialize mr, default stream and stream pool auto const is_pool_used = true; auto resource = create_memory_resource(is_pool_used); auto default_stream = cudf::get_default_stream(); @@ -306,42 +323,54 @@ int32_t main(int argc, char const** argv) rmm::mr::statistics_resource_adaptor(resource.get()); rmm::mr::set_current_device_resource(&stats_mr); + // Make a list of input sources from the input_paths string. + auto const input_sources = extract_input_sources_async(input_paths, default_stream); + default_stream.synchronize(); + // Exit early if nothing to do. - if (input_files.empty()) { + if (input_sources.empty()) { std::cerr << "No input files to read. 
Exiting early.\n"; return 0; } // Read the same parquet files specified times with multiple threads and discard the read tables { + // Print status fmt::print( - "\nReading {} input files {} times using {} threads and discarding output " + "\nReading {} input sources {} time(s) using {} threads and discarding output " "tables..\n", - input_files.size(), + input_sources.size(), num_reads, thread_count); - fmt::print( - fg(fmt::color::yellow), - "Note that the first read may include times for nvcomp, cufile loading and RMM growth.\n\n"); + + if (io_source_type == io_source_type::FILEPATH) { + fmt::print(fg(fmt::color::yellow), + "Note that the first read may include times for nvcomp, cufile loading and RMM " + "growth.\n\n"); + } + cudf::examples::timer timer; std::for_each(thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_reads), [&](auto i) { // Read parquet files and discard the tables std::ignore = read_parquet_multithreaded( - input_files, thread_count, stream_pool); + input_sources, thread_count, stream_pool); }); default_stream.synchronize(); timer.print_elapsed_millis(); } - // Do we need to write parquet as well? - if (io_type.has_value()) { + // Do we need to write parquet files and validate? + if (write_and_validate) { // Read input files with CONCATENATE_THREADS mode auto const tables = read_parquet_multithreaded( - input_files, thread_count, stream_pool); + input_sources, thread_count, stream_pool); default_stream.synchronize(); - // Initialize the default output path to avoid race condition with multiple writer threads. - std::ignore = get_default_output_path(); + + // Create a directory at the tmpdir path. 
+ std::string output_path = std::filesystem::temp_directory_path().string() + "/output_" + + fmt::format("{:%Y-%m-%d-%H-%M-%S}", std::chrono::system_clock::now()); + std::filesystem::create_directory({output_path}); // Construct a vector of table views for write_parquet_multithreaded auto const table_views = [&tables]() { @@ -356,28 +385,31 @@ int32_t main(int argc, char const** argv) }(); // Write tables to parquet - fmt::print("Writing parquet output to sink type: {}..\n", std::string{argv[5]}); + fmt::print("Writing parquet output files..\n"); cudf::examples::timer timer; - write_parquet_multithreaded(table_views, thread_count, stream_pool); + write_parquet_multithreaded(output_path, table_views, thread_count, stream_pool); default_stream.synchronize(); timer.print_elapsed_millis(); - // Verify the output if requested - if (validate_output) { - fmt::print("Verifying output..\n"); + // Verify the output + fmt::print("Verifying output..\n"); + + // CONCATENATE_ALL returns a vector of 1 table + auto const input_table = cudf::concatenate(table_views, default_stream); - // CONCATENATE_ALL returns a vector of 1 table - auto const input_table = cudf::concatenate(table_views, default_stream); + auto const transcoded_input_sources = extract_input_sources_async(output_path, default_stream); + default_stream.synchronize(); + + auto const transcoded_table = std::move(read_parquet_multithreaded( + transcoded_input_sources, thread_count, stream_pool) + .back()); + default_stream.synchronize(); - auto const transcoded_table = - std::move(read_parquet_multithreaded( - extract_input_files(get_default_output_path()), thread_count, stream_pool) - .back()); - default_stream.synchronize(); + // Check if the tables are identical + check_identical_tables(input_table->view(), transcoded_table->view()); - // Check if the tables are identical - check_identical_tables(input_table->view(), transcoded_table->view()); - } + // Remove the created temp directory and parquet data. 
+ std::filesystem::remove_all(output_path); } // Print peak memory From d0c2a62cfc230e463a73495a70e4d6a962e34cb2 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 1 Oct 2024 01:38:28 +0000 Subject: [PATCH 17/37] Minor comment updates --- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 4204e50c271..868195eb256 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -192,7 +192,7 @@ void print_usage() fmt::print(fg(fmt::color::light_sky_blue), "Note: Provide as many arguments as you like in the above order. Default values\n" " for the unprovided arguments will be used. All input parquet files will\n" - " be converted to the specified before reading\n\n"); + " be converted to the specified IO source type before reading\n\n"); } /** From 945c0c008226f5410052e90741568fb42474b70c Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 1 Oct 2024 01:52:55 +0000 Subject: [PATCH 18/37] Style fix and add to CI. 
--- ci/run_cudf_examples.sh | 3 +++ cpp/examples/parquet_io/io_source.hpp | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh index 0819eacf636..830bb610cc8 100755 --- a/ci/run_cudf_examples.sh +++ b/ci/run_cudf_examples.sh @@ -26,4 +26,7 @@ compute-sanitizer --tool memcheck custom_with_malloc names.csv compute-sanitizer --tool memcheck parquet_io compute-sanitizer --tool memcheck parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD TRUE +compute-sanitizer --tool memcheck parquet_io_multithreaded +compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet 4 PINNED_BUFFER 2 2 + exit ${EXITCODE} diff --git a/cpp/examples/parquet_io/io_source.hpp b/cpp/examples/parquet_io/io_source.hpp index 677cc99385c..d9a6e0ee608 100644 --- a/cpp/examples/parquet_io/io_source.hpp +++ b/cpp/examples/parquet_io/io_source.hpp @@ -142,4 +142,4 @@ class io_source { std::vector h_buffer; cudf::detail::host_vector pinned_buffer; rmm::device_uvector d_buffer; -}; \ No newline at end of file +}; From f30c80168d0479e2d4ee72dc47b4b2e199ab0ab0 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 1 Oct 2024 02:01:19 +0000 Subject: [PATCH 19/37] Minor improvement --- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 868195eb256..732609d1ad2 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -184,7 +184,7 @@ void print_usage() "\nUsage: parquet_io_multithreaded \n" " \n" - " \n\n"); fmt::print( "Available IO source types: FILEPATH, HOST_BUFFER, {}, DEVICE_BUFFER\n\n", From 719bfb6e25ce9af60b5fb21b6b6533fb9cd6f7a1 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb 
<14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 1 Oct 2024 22:26:10 +0000 Subject: [PATCH 20/37] Updates --- cpp/examples/parquet_io/CMakeLists.txt | 7 +- .../{common.hpp => common_utils.cpp} | 21 +- cpp/examples/parquet_io/common_utils.hpp | 81 ++++++ cpp/examples/parquet_io/io_source.hpp | 44 ++- cpp/examples/parquet_io/parquet_io.cpp | 10 +- .../parquet_io/parquet_io_multithreaded.cpp | 256 +++++++++++------- 6 files changed, 291 insertions(+), 128 deletions(-) rename cpp/examples/parquet_io/{common.hpp => common_utils.cpp} (91%) create mode 100644 cpp/examples/parquet_io/common_utils.hpp diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt index 28ade3666bf..9d81a726217 100644 --- a/cpp/examples/parquet_io/CMakeLists.txt +++ b/cpp/examples/parquet_io/CMakeLists.txt @@ -16,15 +16,18 @@ project( include(../fetch_dependencies.cmake) +add_library(parquet_io_common_utils OBJECT common_utils.cpp) +target_link_libraries(parquet_io_common_utils PRIVATE cudf::cudf) + # Build and install parquet_io add_executable(parquet_io parquet_io.cpp) -target_link_libraries(parquet_io PRIVATE cudf::cudf) +target_link_libraries(parquet_io PRIVATE cudf::cudf nvToolsExt $) target_compile_features(parquet_io PRIVATE cxx_std_17) install(TARGETS parquet_io DESTINATION bin/examples/libcudf) # Build and install parquet_io_multithreaded add_executable(parquet_io_multithreaded parquet_io_multithreaded.cpp) -target_link_libraries(parquet_io_multithreaded PRIVATE cudf::cudf) +target_link_libraries(parquet_io_multithreaded PRIVATE cudf::cudf nvToolsExt $) target_compile_features(parquet_io_multithreaded PRIVATE cxx_std_17) install(TARGETS parquet_io_multithreaded DESTINATION bin/examples/libcudf) diff --git a/cpp/examples/parquet_io/common.hpp b/cpp/examples/parquet_io/common_utils.cpp similarity index 91% rename from cpp/examples/parquet_io/common.hpp rename to cpp/examples/parquet_io/common_utils.cpp index 16fd16ee7c1..aa3d4c922e4 100644 --- 
a/cpp/examples/parquet_io/common.hpp +++ b/cpp/examples/parquet_io/common_utils.cpp @@ -1,3 +1,4 @@ + /* * Copyright (c) 2024, NVIDIA CORPORATION. * @@ -14,29 +15,24 @@ * limitations under the License. */ -#pragma once - -#include "../utilities/timer.hpp" +#include "common_utils.hpp" #include -#include #include #include #include -#include #include #include #include #include -#include #include /** - * @file commons.hpp - * @brief Common utilities for `parquet_io` examples + * @file commons.cpp + * @brief Definitions for common utilities for `parquet_io` examples * */ @@ -62,7 +58,7 @@ std::shared_ptr create_memory_resource(bool is_ * @param name encoding keyword name * @return corresponding column encoding type */ -[[nodiscard]] cudf::io::column_encoding get_encoding_type(std::string name) +cudf::io::column_encoding get_encoding_type(std::string name) { using encoding_type = cudf::io::column_encoding; @@ -91,7 +87,7 @@ std::shared_ptr create_memory_resource(bool is_ * @param name compression keyword name * @return corresponding compression type */ -[[nodiscard]] cudf::io::compression_type get_compression_type(std::string name) +cudf::io::compression_type get_compression_type(std::string name) { using compression_type = cudf::io::compression_type; @@ -117,7 +113,7 @@ std::shared_ptr create_memory_resource(bool is_ * @param input keyword affirmation string such as: Y, T, YES, TRUE, ON * @return true or false */ -[[nodiscard]] bool get_boolean(std::string input) +bool get_boolean(std::string input) { std::transform(input.begin(), input.end(), input.begin(), ::toupper); @@ -136,8 +132,7 @@ std::shared_ptr create_memory_resource(bool is_ * @param lhs_table View to lhs table * @param rhs_table View to rhs table */ -inline void check_identical_tables(cudf::table_view const& lhs_table, - cudf::table_view const& rhs_table) +void check_identical_tables(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table) { try { // Left anti-join the original and transcoded 
tables diff --git a/cpp/examples/parquet_io/common_utils.hpp b/cpp/examples/parquet_io/common_utils.hpp new file mode 100644 index 00000000000..135b40a09a3 --- /dev/null +++ b/cpp/examples/parquet_io/common_utils.hpp @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +#include + +/** + * @file commons.hpp + * @brief Common utilities for `parquet_io` examples + * + */ + +/** + * @brief Create memory resource for libcudf functions + * + * @param pool Whether to use a pool memory resource. 
+ * @return Memory resource instance + */ +std::shared_ptr create_memory_resource(bool is_pool_used); + +/** + * @brief Get encoding type from the keyword + * + * @param name encoding keyword name + * @return corresponding column encoding type + */ +[[nodiscard]] cudf::io::column_encoding get_encoding_type(std::string name); + +/** + * @brief Get compression type from the keyword + * + * @param name compression keyword name + * @return corresponding compression type + */ +[[nodiscard]] cudf::io::compression_type get_compression_type(std::string name); + +/** + * @brief Get boolean from they keyword + * + * @param input keyword affirmation string such as: Y, T, YES, TRUE, ON + * @return true or false + */ +[[nodiscard]] bool get_boolean(std::string input); + +/** + * @brief Check if two tables are identical, throw an error otherwise + * + * @param lhs_table View to lhs table + * @param rhs_table View to rhs table + */ +void check_identical_tables(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table); + +/** + * @brief Concatenate a vector of tables and return the resultant table + * + * @param tables Vector of tables to concatenate + * @param stream CUDA stream to use + * + * @return Unique pointer to the resultant concatenated table. + */ +std::unique_ptr concatenate_tables(std::vector> tables, + rmm::cuda_stream_view stream); diff --git a/cpp/examples/parquet_io/io_source.hpp b/cpp/examples/parquet_io/io_source.hpp index d9a6e0ee608..3900877a4f7 100644 --- a/cpp/examples/parquet_io/io_source.hpp +++ b/cpp/examples/parquet_io/io_source.hpp @@ -16,15 +16,14 @@ #pragma once -#include #include #include #include -#include -#include #include +#include + #include #include @@ -53,6 +52,33 @@ rmm::host_async_resource_ref pinned_memory_resource() return mr; } +/** + * @brief Custom allocator for pinned_buffer via RMM. 
+ */ +template +struct pinned_allocator : public std::allocator { + pinned_allocator(rmm::host_async_resource_ref _mr, rmm::cuda_stream_view _stream) + : mr{_mr}, stream{_stream} + { + } + + T* allocate(std::size_t n) + { + auto ptr = mr.allocate_async(n * sizeof(T), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); + stream.synchronize(); + return static_cast(ptr); + } + + void deallocate(T* ptr, std::size_t n) + { + mr.deallocate_async(ptr, n * sizeof(T), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); + } + + private: + rmm::host_async_resource_ref mr; + rmm::cuda_stream_view stream; +}; + /** * @brief Get io source type from the string keyword argument * @@ -105,13 +131,13 @@ class io_source { case io_source_type::HOST_BUFFER: { h_buffer.resize(file_size); file.read(h_buffer.data(), file_size); - source_info = cudf::io::source_info(h_buffer.data(), h_buffer.size()); + source_info = cudf::io::source_info(h_buffer.data(), file_size); break; } case io_source_type::PINNED_BUFFER: { pinned_buffer.resize(file_size); file.read(pinned_buffer.data(), file_size); - source_info = cudf::io::source_info(pinned_buffer.data(), pinned_buffer.size()); + source_info = cudf::io::source_info(pinned_buffer.data(), file_size); break; } case io_source_type::DEVICE_BUFFER: { @@ -119,7 +145,7 @@ class io_source { file.read(h_buffer.data(), file_size); d_buffer.resize(file_size, stream); CUDF_CUDA_TRY(cudaMemcpyAsync( - d_buffer.data(), h_buffer.data(), h_buffer.size(), cudaMemcpyDefault, stream.value())); + d_buffer.data(), h_buffer.data(), file_size, cudaMemcpyDefault, stream.value())); source_info = cudf::io::source_info(d_buffer); break; @@ -135,11 +161,15 @@ class io_source { [[nodiscard]] cudf::io::source_info get_source_info() const { return source_info; } private: + // alias for pinned vector + template + using pinned_vector = thrust::host_vector>; + io_source_type const type; std::string const file_name; size_t const file_size; cudf::io::source_info source_info; std::vector h_buffer; - 
cudf::detail::host_vector pinned_buffer; + pinned_vector pinned_buffer; rmm::device_uvector d_buffer; }; diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index a4ee550b0e4..08dbaa0bdd6 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -14,9 +14,15 @@ * limitations under the License. */ -#include "common.hpp" +#include "../utilities/timer.hpp" +#include "common_utils.hpp" +#include "io_source.hpp" -#include +#include +#include +#include + +#include /** * @file parquet_io.cpp diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 732609d1ad2..95ade08c791 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -14,13 +14,25 @@ * limitations under the License. */ -#include "common.hpp" +#include "../utilities/timer.hpp" +#include "common_utils.hpp" #include "io_source.hpp" +#include +#include +#include +#include + #include +#include #include #include +#include + +#include +#include +#include /** * @file parquet_io_multithreaded.cpp @@ -175,7 +187,40 @@ struct write_fn { }; /** - * @brief Function to print example usage + * @brief Function to setup and launch multithreaded writing parquet files. + * + * @param output_path Path to output directory + * @param tables List of at least table views to be written + * @param thread_count Number of threads to use for writing tables. 
+ * @param stream_pool CUDA stream pool to use for threads + * + */ +void write_parquet_multithreaded(std::string const& output_path, + std::vector const& tables, + int32_t thread_count, + rmm::cuda_stream_pool& stream_pool) +{ + // Table writing tasks + std::vector write_tasks; + write_tasks.reserve(thread_count); + std::for_each( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(thread_count), [&](auto tid) { + write_tasks.emplace_back(write_fn{output_path, tables, tid, stream_pool.get_stream()}); + }); + + // Writer threads + std::vector threads; + threads.reserve(thread_count); + for (auto& c : write_tasks) { + threads.emplace_back(std::thread{c}); + } + for (auto& t : threads) { + t.join(); + } +} + +/** + * @brief Function to print example usage and argument information. */ void print_usage() { @@ -195,6 +240,94 @@ void print_usage() " be converted to the specified IO source type before reading\n\n"); } +/** + * @brief Function to process comma delimited input paths string to parquet files and/or dirs + * and asynchronously convert them to specified io sources. + * + * Process the input path string containing directories (of parquet files) and/or individual + * parquet files into a list of input parquet files, multiple the list by `input_multiplier`, + * make sure to have at least `thread_count` files to satisfy at least file per parallel thread, + * and asynchronously convert the final list of files to a list of `io_source` and return. 
+ * + * @param paths Comma delimited input paths string + * @param input_multiplier Multiplier for the input files list + * @param thread_count Number of threads being used in the example + * @param io_source_type Specified IO source type to convert input files to + * @param stream CUDA stream to use + * + */ +std::vector extract_input_sources_async(std::string const& paths, + int32_t input_multiplier, + int32_t thread_count, + io_source_type io_source_type, + rmm::cuda_stream_view stream) +{ + // Get the delimited paths to directory and/or files. + std::vector const delimited_paths = [&]() { + std::vector paths_list; + std::stringstream strstream{paths}; + std::string path; + // Extract the delimited paths. + while (std::getline(strstream, path, char{','})) { + paths_list.push_back(path); + } + return paths_list; + }(); + + // List of parquet files + std::vector parquet_files; + std::for_each(delimited_paths.cbegin(), delimited_paths.cend(), [&](auto const& path_string) { + std::filesystem::path path{path_string}; + // If this is a parquet file, add it. + if (std::filesystem::is_regular_file(path)) { + parquet_files.push_back(path_string); + } + // If this is a directory, add all files in the directory. 
+ else if (std::filesystem::is_directory(path)) { + for (auto const& file : std::filesystem::directory_iterator(path)) { + if (std::filesystem::is_regular_file(file.path())) { + parquet_files.push_back(file.path().string()); + } + } + } else { + throw std::runtime_error("Encountered an invalid input path\n"); + } + }); + + // Current size of list of parquet files + auto const initial_size = parquet_files.size(); + if (initial_size == 0) { return {}; } + + // Reserve space + parquet_files.reserve(std::max(thread_count, input_multiplier * parquet_files.size())); + + // Append the input files by input_multiplier times + std::for_each(thrust::make_counting_iterator(1), + thrust::make_counting_iterator(input_multiplier), + [&](auto i) { + parquet_files.insert(parquet_files.end(), + parquet_files.begin(), + parquet_files.begin() + initial_size); + }); + + // Cycle append parquet files from the existing ones if less than the thread_count + for (size_t idx = 0; thread_count > static_cast(parquet_files.size()); idx++) { + parquet_files.emplace_back(parquet_files[idx % initial_size]); + } + + // Vector of io sources + std::vector input_sources; + input_sources.reserve(parquet_files.size()); + // Transform input files to the specified io sources + std::transform(parquet_files.begin(), + parquet_files.end(), + std::back_inserter(input_sources), + [&](auto const& file_name) { + return io_source{file_name, io_source_type, stream}; + }); + return input_sources; +} + /** * @brief The main function */ @@ -228,92 +361,6 @@ int32_t main(int argc, char const** argv) default: print_usage(); throw std::runtime_error(""); } - // Lambda function to process and extract all input files - auto const extract_input_sources_async = - [thread_count, input_multiplier, io_source_type = io_source_type]( - std::string const& paths, rmm::cuda_stream_view stream) { - std::vector const delimited_paths = [&]() { - std::vector paths_list; - std::stringstream strstream{paths}; - std::string path; - // 
Extract the delimited paths. - while (std::getline(strstream, path, char{','})) { - paths_list.push_back(path); - } - return paths_list; - }(); - - // The final list of parquet files to be read. - std::vector parquet_files; - parquet_files.reserve( - std::max(thread_count, input_multiplier * delimited_paths.size())); - // Append the input files by input_multiplier times - std::for_each( - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input_multiplier), - [&](auto i) { - std::for_each( - delimited_paths.cbegin(), delimited_paths.cend(), [&](auto const& path_string) { - std::filesystem::path path{path_string}; - // If this is a parquet file, add it. - if (std::filesystem::is_regular_file(path)) { - parquet_files.emplace_back(path_string); - } - // If this is a directory, add all files at this path - else if (std::filesystem::is_directory(path)) { - for (auto const& file : std::filesystem::directory_iterator(path)) { - if (std::filesystem::is_regular_file(file.path())) { - parquet_files.emplace_back(file.path().string()); - } - } - } else { - throw std::runtime_error("Encountered an invalid input path\n"); - } - }); - }); - - // Cycle append parquet files from the existing ones if less than the thread_count - for (size_t idx = 0, initial_size = parquet_files.size(); - thread_count > static_cast(parquet_files.size()); - idx++) { - parquet_files.emplace_back(parquet_files[idx % initial_size]); - } - - std::vector input_sources; - input_sources.reserve(parquet_files.size()); - std::transform(parquet_files.begin(), - parquet_files.end(), - std::back_inserter(input_sources), - [&](auto& file_name) { return io_source(file_name, io_source_type, stream); }); - return input_sources; - }; - - // Lambda function to setup and launch multithreaded parquet writes - auto const write_parquet_multithreaded = [&](std::string const& output_path, - std::vector const& tables, - int32_t thread_count, - rmm::cuda_stream_pool& stream_pool) { - // Table writing tasks - 
std::vector write_tasks; - write_tasks.reserve(thread_count); - std::for_each( - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(thread_count), - [&](auto tid) { - write_tasks.emplace_back(write_fn{output_path, tables, tid, stream_pool.get_stream()}); - }); - - // Writer threads - std::vector threads; - threads.reserve(thread_count); - for (auto& c : write_tasks) { - threads.emplace_back(std::thread{c}); - } - for (auto& t : threads) { - t.join(); - } - }; - // Initialize mr, default stream and stream pool auto const is_pool_used = true; auto resource = create_memory_resource(is_pool_used); @@ -323,14 +370,14 @@ int32_t main(int argc, char const** argv) rmm::mr::statistics_resource_adaptor(resource.get()); rmm::mr::set_current_device_resource(&stats_mr); - // Make a list of input sources from the input_paths string. - auto const input_sources = extract_input_sources_async(input_paths, default_stream); + // List of input sources from the input_paths string. + auto const input_sources = extract_input_sources_async( + input_paths, input_multiplier, thread_count, io_source_type, default_stream); default_stream.synchronize(); - // Exit early if nothing to do. + // Check if there is nothing to do if (input_sources.empty()) { - std::cerr << "No input files to read. Exiting early.\n"; - return 0; + throw std::runtime_error("No input files to read. Exiting early.\n"); } // Read the same parquet files specified times with multiple threads and discard the read tables @@ -362,21 +409,15 @@ int32_t main(int argc, char const** argv) // Do we need to write parquet files and validate? if (write_and_validate) { - // Read input files with CONCATENATE_THREADS mode + // read_mode::CONCATENATE_THREADS returns a vector of `thread_count` tables auto const tables = read_parquet_multithreaded( input_sources, thread_count, stream_pool); default_stream.synchronize(); - // Create a directory at the tmpdir path. 
- std::string output_path = std::filesystem::temp_directory_path().string() + "/output_" + - fmt::format("{:%Y-%m-%d-%H-%M-%S}", std::chrono::system_clock::now()); - std::filesystem::create_directory({output_path}); - // Construct a vector of table views for write_parquet_multithreaded auto const table_views = [&tables]() { std::vector table_views; table_views.reserve(tables.size()); - std::transform( tables.cbegin(), tables.cend(), std::back_inserter(table_views), [](auto const& tbl) { return tbl->view(); @@ -386,6 +427,10 @@ int32_t main(int argc, char const** argv) // Write tables to parquet fmt::print("Writing parquet output files..\n"); + // Create a directory at the tmpdir path. + std::string output_path = std::filesystem::temp_directory_path().string() + "/output_" + + fmt::format("{:%Y-%m-%d-%H-%M-%S}", std::chrono::system_clock::now()); + std::filesystem::create_directory({output_path}); cudf::examples::timer timer; write_parquet_multithreaded(output_path, table_views, thread_count, stream_pool); default_stream.synchronize(); @@ -394,21 +439,24 @@ int32_t main(int argc, char const** argv) // Verify the output fmt::print("Verifying output..\n"); - // CONCATENATE_ALL returns a vector of 1 table + // Simply concatenate the previously read tables from input sources auto const input_table = cudf::concatenate(table_views, default_stream); - auto const transcoded_input_sources = extract_input_sources_async(output_path, default_stream); + // Sources from written parquet files + auto const written_pq_sources = extract_input_sources_async( + output_path, input_multiplier, thread_count, io_source_type, default_stream); default_stream.synchronize(); + // read_mode::CONCATENATE_ALL returns a concatenated vector of 1 table only auto const transcoded_table = std::move(read_parquet_multithreaded( - transcoded_input_sources, thread_count, stream_pool) + written_pq_sources, thread_count, stream_pool) .back()); default_stream.synchronize(); // Check if the tables are 
identical check_identical_tables(input_table->view(), transcoded_table->view()); - // Remove the created temp directory and parquet data. + // Remove the created temp directory and parquet data std::filesystem::remove_all(output_path); } From 2ade064859edaf9529175703f5ce6805f6e434a7 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 1 Oct 2024 22:43:35 +0000 Subject: [PATCH 21/37] Style fix. --- cpp/examples/parquet_io/CMakeLists.txt | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt index 9d81a726217..7c963e5192b 100644 --- a/cpp/examples/parquet_io/CMakeLists.txt +++ b/cpp/examples/parquet_io/CMakeLists.txt @@ -21,13 +21,17 @@ target_link_libraries(parquet_io_common_utils PRIVATE cudf::cudf) # Build and install parquet_io add_executable(parquet_io parquet_io.cpp) -target_link_libraries(parquet_io PRIVATE cudf::cudf nvToolsExt $) +target_link_libraries( + parquet_io PRIVATE cudf::cudf nvToolsExt $ +) target_compile_features(parquet_io PRIVATE cxx_std_17) install(TARGETS parquet_io DESTINATION bin/examples/libcudf) # Build and install parquet_io_multithreaded add_executable(parquet_io_multithreaded parquet_io_multithreaded.cpp) -target_link_libraries(parquet_io_multithreaded PRIVATE cudf::cudf nvToolsExt $) +target_link_libraries( + parquet_io_multithreaded PRIVATE cudf::cudf nvToolsExt $ +) target_compile_features(parquet_io_multithreaded PRIVATE cxx_std_17) install(TARGETS parquet_io_multithreaded DESTINATION bin/examples/libcudf) From b559eafeb1990d9591a1914b640282bf2f6b4326 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 2 Oct 2024 00:36:56 +0000 Subject: [PATCH 22/37] Print message when skipping a subdirectory --- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 95ade08c791..3a46552e863 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -287,6 +287,8 @@ std::vector extract_input_sources_async(std::string const& paths, for (auto const& file : std::filesystem::directory_iterator(path)) { if (std::filesystem::is_regular_file(file.path())) { parquet_files.push_back(file.path().string()); + } else { + fmt::print("Skipping sub-directory: {}\n", file.path().string()); } } } else { From 73de5bcc028c22d9fc924c572d078f65a5843942 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 1 Oct 2024 17:39:17 -0700 Subject: [PATCH 23/37] Update cpp/examples/parquet_io/io_source.hpp --- cpp/examples/parquet_io/io_source.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/parquet_io/io_source.hpp b/cpp/examples/parquet_io/io_source.hpp index 3900877a4f7..d52470ab5c7 100644 --- a/cpp/examples/parquet_io/io_source.hpp +++ b/cpp/examples/parquet_io/io_source.hpp @@ -32,7 +32,7 @@ /** * @file io_source.hpp - * @brief Utilities for construction IO sources from the input parquet files. + * @brief Utilities for constructing the specified IO sources from the input parquet files. 
* */ From 52e6953f97498bd06f679be9aa0018d69fcc6148 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 1 Oct 2024 17:40:00 -0700 Subject: [PATCH 24/37] Update cpp/examples/parquet_io/common_utils.cpp --- cpp/examples/parquet_io/common_utils.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/examples/parquet_io/common_utils.cpp b/cpp/examples/parquet_io/common_utils.cpp index aa3d4c922e4..0c78a2d2877 100644 --- a/cpp/examples/parquet_io/common_utils.cpp +++ b/cpp/examples/parquet_io/common_utils.cpp @@ -1,4 +1,3 @@ - /* * Copyright (c) 2024, NVIDIA CORPORATION. * From 6194a50e75612e3c997063d845472963380ef136 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 2 Oct 2024 04:33:02 +0000 Subject: [PATCH 25/37] Do not use `fmtlib` --- cpp/examples/parquet_io/common_utils.cpp | 78 ++++++------------- cpp/examples/parquet_io/common_utils.hpp | 6 ++ cpp/examples/parquet_io/io_source.hpp | 14 +--- cpp/examples/parquet_io/parquet_io.cpp | 12 +-- .../parquet_io/parquet_io_multithreaded.cpp | 58 ++++++-------- 5 files changed, 63 insertions(+), 105 deletions(-) diff --git a/cpp/examples/parquet_io/common_utils.cpp b/cpp/examples/parquet_io/common_utils.cpp index 0c78a2d2877..3b89a66c902 100644 --- a/cpp/examples/parquet_io/common_utils.cpp +++ b/cpp/examples/parquet_io/common_utils.cpp @@ -25,8 +25,8 @@ #include #include -#include - +#include +#include #include /** @@ -35,12 +35,6 @@ * */ -/** - * @brief Create memory resource for libcudf functions - * - * @param pool Whether to use a pool memory resource. 
- * @return Memory resource instance - */ std::shared_ptr create_memory_resource(bool is_pool_used) { auto cuda_mr = std::make_shared(); @@ -51,12 +45,6 @@ std::shared_ptr create_memory_resource(bool is_ return cuda_mr; } -/** - * @brief Get encoding type from the keyword - * - * @param name encoding keyword name - * @return corresponding column encoding type - */ cudf::io::column_encoding get_encoding_type(std::string name) { using encoding_type = cudf::io::column_encoding; @@ -72,20 +60,13 @@ cudf::io::column_encoding get_encoding_type(std::string name) std::transform(name.begin(), name.end(), name.begin(), ::toupper); if (map.find(name) != map.end()) { return map.at(name); } - throw std::invalid_argument(fmt::format(fmt::emphasis::bold | fg(fmt::color::red), - "{} is not a valid encoding type.\n\n" - "Available encoding types: DEFAULT, DICTIONARY, PLAIN,\n" - "DELTA_BINARY_PACKED, DELTA_LENGTH_BYTE_ARRAY,\n" - "DELTA_BYTE_ARRAY\n\n", - name)); + throw std::invalid_argument(name + + " is not a valid encoding type.\n\n" + "Available encoding types: DEFAULT, DICTIONARY, PLAIN,\n" + "DELTA_BINARY_PACKED, DELTA_LENGTH_BYTE_ARRAY,\n" + "DELTA_BYTE_ARRAY\n\n"); } -/** - * @brief Get compression type from the keyword - * - * @param name compression keyword name - * @return corresponding compression type - */ cudf::io::compression_type get_compression_type(std::string name) { using compression_type = cudf::io::compression_type; @@ -99,19 +80,12 @@ cudf::io::compression_type get_compression_type(std::string name) std::transform(name.begin(), name.end(), name.begin(), ::toupper); if (map.find(name) != map.end()) { return map.at(name); } - throw std::invalid_argument(fmt::format(fmt::emphasis::bold | fg(fmt::color::red), - "{} is not a valid compression type.\n\n" - "Available compression types: NONE, AUTO, SNAPPY,\n" - "LZ4, ZSTD\n\n", - name)); + throw std::invalid_argument(name + + " is not a valid compression type.\n\n" + "Available compression types: NONE, AUTO, 
SNAPPY,\n" + "LZ4, ZSTD\n\n"); } -/** - * @brief Get boolean from they keyword - * - * @param input keyword affirmation string such as: Y, T, YES, TRUE, ON - * @return true or false - */ bool get_boolean(std::string input) { std::transform(input.begin(), input.end(), input.begin(), ::toupper); @@ -125,12 +99,6 @@ bool get_boolean(std::string input) } } -/** - * @brief Check if two tables are identical, throw an error otherwise - * - * @param lhs_table View to lhs table - * @param rhs_table View to rhs table - */ void check_identical_tables(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table) { try { @@ -141,23 +109,13 @@ void check_identical_tables(cudf::table_view const& lhs_table, cudf::table_view // No exception thrown, check indices auto const valid = indices->size() == 0; - fmt::print( - fmt::emphasis::bold | fg(fmt::color::green_yellow), "Tables identical: {}\n\n", valid); + std::cout << "Tables identical: " << valid << "\n\n"; } catch (std::exception& e) { std::cerr << e.what() << std::endl << std::endl; - throw std::runtime_error( - fmt::format(fmt::emphasis::bold | fg(fmt::color::red), "Tables identical: false\n\n")); + throw std::runtime_error("Tables identical: false\n\n"); } } -/** - * @brief Concatenate a vector of tables and return the resultant table - * - * @param tables Vector of tables to concatenate - * @param stream CUDA stream to use - * - * @return Unique pointer to the resultant concatenated table. 
- */ std::unique_ptr concatenate_tables(std::vector> tables, rmm::cuda_stream_view stream) { @@ -172,3 +130,13 @@ std::unique_ptr concatenate_tables(std::vector concatenate_tables(std::vector> tables, rmm::cuda_stream_view stream); + +/** + * @brief Returns a string containing current date and time + * + */ +std::string current_time_and_date(); \ No newline at end of file diff --git a/cpp/examples/parquet_io/io_source.hpp b/cpp/examples/parquet_io/io_source.hpp index d52470ab5c7..6ccc031d382 100644 --- a/cpp/examples/parquet_io/io_source.hpp +++ b/cpp/examples/parquet_io/io_source.hpp @@ -24,9 +24,6 @@ #include -#include -#include - #include #include @@ -97,11 +94,9 @@ struct pinned_allocator : public std::allocator { if (map.find(name) != map.end()) { return map.at(name); } else { - throw std::invalid_argument( - fmt::format(fmt::emphasis::bold | fg(fmt::color::red), - "{} is not a valid io source type. Available: FILEPATH,\n" - "HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER.\n\n", - name)); + throw std::invalid_argument(name + + " is not a valid io source type. 
Available: FILEPATH,\n" + "HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER.\n\n"); } } @@ -151,8 +146,7 @@ class io_source { break; } default: { - throw std::runtime_error(fmt::format(fmt::emphasis::bold | fg(fmt::color::red), - "Encountered unexpected source type\n\n")); + throw std::runtime_error("Encountered unexpected source type\n\n"); } } } diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 08dbaa0bdd6..513bd9c0518 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -136,16 +136,16 @@ int main(int argc, char const** argv) // Read input parquet file // We do not want to time the initial read time as it may include // time for nvcomp, cufile loading and RMM growth - fmt::print("\nReading {}...", input_filepath); - fmt::print( - "Note: Not timing the initial parquet read as it may include\n" - "times for nvcomp, cufile loading and RMM growth.\n\n"); + std::cout << "\nReading " << input_filepath << "..."; + std::cout << "Note: Not timing the initial parquet read as it may include\n" + "times for nvcomp, cufile loading and RMM growth.\n\n"; auto [input, metadata] = read_parquet(input_filepath); // Status string to indicate if page stats are set to be written or not auto page_stat_string = (page_stats.has_value()) ? 
"page stats" : "no page stats"; // Write parquet file with the specified encoding and compression - fmt::print("Writing {} with encoding, compression and {}..\n", output_filepath, page_stat_string); + std::cout << "Writing " << output_filepath << " with encoding, compression and " + << page_stat_string << "..\n"; // `timer` is automatically started here cudf::examples::timer timer; @@ -153,7 +153,7 @@ int main(int argc, char const** argv) timer.print_elapsed_millis(); // Read the parquet file written with encoding and compression - fmt::print("Reading {}...\n", output_filepath); + std::cout << "Reading " << output_filepath << "...\n"; // Reset the timer timer.reset(); diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 3a46552e863..4e3f61866db 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -27,9 +27,6 @@ #include #include -#include -#include - #include #include #include @@ -224,20 +221,17 @@ void write_parquet_multithreaded(std::string const& output_path, */ void print_usage() { - fmt::print( - fg(fmt::color::yellow), - "\nUsage: parquet_io_multithreaded \n" - " \n" - " \n\n"); - fmt::print( - "Available IO source types: FILEPATH, HOST_BUFFER, {}, DEVICE_BUFFER\n\n", - fmt::format(fmt::emphasis::bold | fg(fmt::color::green_yellow), "PINNED_BUFFER (Default)")); - fmt::print(fg(fmt::color::light_sky_blue), - "Note: Provide as many arguments as you like in the above order. Default values\n" - " for the unprovided arguments will be used. All input parquet files will\n" - " be converted to the specified IO source type before reading\n\n"); + std::cout + << "\nUsage: parquet_io_multithreaded \n" + " \n" + " \n\n" + "Available IO source types: FILEPATH, HOST_BUFFER, PINNED_BUFFER (Default), " + "DEVICE_BUFFER\n\n" + "Note: Provide as many arguments as you like in the above order. 
Default values\n" + " for the unprovided arguments will be used. All input parquet files will\n" + " be converted to the specified IO source type before reading\n\n"; } /** @@ -288,7 +282,7 @@ std::vector extract_input_sources_async(std::string const& paths, if (std::filesystem::is_regular_file(file.path())) { parquet_files.push_back(file.path().string()); } else { - fmt::print("Skipping sub-directory: {}\n", file.path().string()); + std::cout << "Skipping sub-directory: " << file.path().string() << "\n"; } } } else { @@ -385,17 +379,14 @@ int32_t main(int argc, char const** argv) // Read the same parquet files specified times with multiple threads and discard the read tables { // Print status - fmt::print( - "\nReading {} input sources {} time(s) using {} threads and discarding output " - "tables..\n", - input_sources.size(), - num_reads, - thread_count); + std::cout << "\nReading " << input_sources.size() << " input sources " << num_reads + << " time(s) using " << thread_count + << " threads and discarding output " + "tables..\n"; if (io_source_type == io_source_type::FILEPATH) { - fmt::print(fg(fmt::color::yellow), - "Note that the first read may include times for nvcomp, cufile loading and RMM " - "growth.\n\n"); + std::cout << "Note that the first read may include times for nvcomp, cufile loading and RMM " + "growth.\n\n"; } cudf::examples::timer timer; @@ -428,10 +419,11 @@ int32_t main(int argc, char const** argv) }(); // Write tables to parquet - fmt::print("Writing parquet output files..\n"); + std::cout << "Writing parquet output files..\n"; + // Create a directory at the tmpdir path. 
- std::string output_path = std::filesystem::temp_directory_path().string() + "/output_" + - fmt::format("{:%Y-%m-%d-%H-%M-%S}", std::chrono::system_clock::now()); + std::string output_path = + std::filesystem::temp_directory_path().string() + "/output_" + current_time_and_date(); std::filesystem::create_directory({output_path}); cudf::examples::timer timer; write_parquet_multithreaded(output_path, table_views, thread_count, stream_pool); @@ -439,7 +431,7 @@ int32_t main(int argc, char const** argv) timer.print_elapsed_millis(); // Verify the output - fmt::print("Verifying output..\n"); + std::cout << "Verifying output..\n"; // Simply concatenate the previously read tables from input sources auto const input_table = cudf::concatenate(table_views, default_stream); @@ -463,9 +455,7 @@ int32_t main(int argc, char const** argv) } // Print peak memory - fmt::print(fmt::emphasis::bold | fg(fmt::color::medium_purple), - "Peak memory: {} MB\n\n", - (stats_mr.get_bytes_counter().peak / 1048576.0)); + std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n\n"; return 0; } From 3420c3f0f20a7a6d61e5e1b96072a52e7ed27e52 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 2 Oct 2024 16:53:03 +0000 Subject: [PATCH 26/37] Minor style fix --- cpp/examples/parquet_io/common_utils.cpp | 2 +- cpp/examples/parquet_io/common_utils.hpp | 2 +- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/examples/parquet_io/common_utils.cpp b/cpp/examples/parquet_io/common_utils.cpp index 3b89a66c902..abb03f33c15 100644 --- a/cpp/examples/parquet_io/common_utils.cpp +++ b/cpp/examples/parquet_io/common_utils.cpp @@ -131,7 +131,7 @@ std::unique_ptr concatenate_tables(std::vector concatenate_tables(std::vector Date: Wed, 2 Oct 2024 10:25:35 -0700 Subject: [PATCH 27/37] Minor change --- cpp/examples/parquet_io/parquet_io.cpp | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 513bd9c0518..12b9e5525a3 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -136,7 +136,7 @@ int main(int argc, char const** argv) // Read input parquet file // We do not want to time the initial read time as it may include // time for nvcomp, cufile loading and RMM growth - std::cout << "\nReading " << input_filepath << "..."; + std::cout << "\nReading " << input_filepath << "...\n"; std::cout << "Note: Not timing the initial parquet read as it may include\n" "times for nvcomp, cufile loading and RMM growth.\n\n"; auto [input, metadata] = read_parquet(input_filepath); From 70ec6fd328ca4093510df752ab53b04e05d40739 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 3 Oct 2024 03:35:58 +0000 Subject: [PATCH 28/37] Address minor nits from reviews --- cpp/examples/parquet_io/common_utils.cpp | 2 +- cpp/examples/parquet_io/common_utils.hpp | 6 ++++-- cpp/examples/parquet_io/parquet_io.cpp | 2 +- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/examples/parquet_io/common_utils.cpp b/cpp/examples/parquet_io/common_utils.cpp index abb03f33c15..62a5f4bdeae 100644 --- a/cpp/examples/parquet_io/common_utils.cpp +++ b/cpp/examples/parquet_io/common_utils.cpp @@ -30,7 +30,7 @@ #include /** - * @file commons.cpp + * @file common_utils.cpp * @brief Definitions for common utilities for `parquet_io` examples * */ diff --git a/cpp/examples/parquet_io/common_utils.hpp b/cpp/examples/parquet_io/common_utils.hpp index 5aa62a4fb68..12896e61a0d 100644 --- a/cpp/examples/parquet_io/common_utils.hpp +++ b/cpp/examples/parquet_io/common_utils.hpp @@ -19,12 +19,14 @@ #include #include +#include #include +#include #include /** - * @file commons.hpp + * @file common_utils.hpp * 
@brief Common utilities for `parquet_io` examples * */ @@ -67,7 +69,7 @@ std::shared_ptr create_memory_resource(bool is_ * @param lhs_table View to lhs table * @param rhs_table View to rhs table */ -void check_identical_tables(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table); +void check_tables_equal(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table); /** * @brief Concatenate a vector of tables and return the resultant table diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 12b9e5525a3..aeb47225cac 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -161,7 +161,7 @@ int main(int argc, char const** argv) timer.print_elapsed_millis(); // Check for validity - check_identical_tables(input->view(), transcoded_input->view()); + check_tables_equal(input->view(), transcoded_input->view()); return 0; } diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index d5dee85c7e0..19af739032d 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -448,7 +448,7 @@ int32_t main(int argc, char const** argv) default_stream.synchronize(); // Check if the tables are identical - check_identical_tables(input_table->view(), transcoded_table->view()); + check_tables_equal(input_table->view(), transcoded_table->view()); // Remove the created temp directory and parquet data std::filesystem::remove_all(output_path); From 00390cd342a8e910fef07a6e298769e8cce3f6c2 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 3 Oct 2024 12:16:37 -0700 Subject: [PATCH 29/37] Update cpp/examples/parquet_io/parquet_io_multithreaded.cpp Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com> --- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 19af739032d..e7966357cbd 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -248,7 +248,7 @@ void print_usage() * @param thread_count Number of threads being used in the example * @param io_source_type Specified IO source type to convert input files to * @param stream CUDA stream to use - * + * @return Vector of input sources for the given paths */ std::vector extract_input_sources_async(std::string const& paths, int32_t input_multiplier, From 5ad8ecdb08aca5831ba24af9e9e09e92b78d3cf6 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 3 Oct 2024 19:27:11 +0000 Subject: [PATCH 30/37] Move code to cpp files and minor refactoring --- ci/run_cudf_examples.sh | 4 +- cpp/examples/parquet_io/CMakeLists.txt | 8 +- cpp/examples/parquet_io/common_utils.cpp | 2 +- cpp/examples/parquet_io/io_source.cpp | 99 +++++++++++++++++++ cpp/examples/parquet_io/io_source.hpp | 88 +++-------------- .../parquet_io/parquet_io_multithreaded.cpp | 4 +- 6 files changed, 121 insertions(+), 84 deletions(-) create mode 100644 cpp/examples/parquet_io/io_source.cpp diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh index 830bb610cc8..f8c3ed20b03 100755 --- a/ci/run_cudf_examples.sh +++ b/ci/run_cudf_examples.sh @@ -26,7 +26,7 @@ compute-sanitizer --tool memcheck custom_with_malloc names.csv compute-sanitizer --tool memcheck parquet_io compute-sanitizer --tool memcheck parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD TRUE -compute-sanitizer --tool memcheck parquet_io_multithreaded -compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet 4 PINNED_BUFFER 2 2 +compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet +compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet 4 
DEVICE_BUFFER 2 2 exit ${EXITCODE} diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt index 7c963e5192b..e2e9919fc49 100644 --- a/cpp/examples/parquet_io/CMakeLists.txt +++ b/cpp/examples/parquet_io/CMakeLists.txt @@ -16,13 +16,13 @@ project( include(../fetch_dependencies.cmake) -add_library(parquet_io_common_utils OBJECT common_utils.cpp) -target_link_libraries(parquet_io_common_utils PRIVATE cudf::cudf) +add_library(parquet_io_utils OBJECT common_utils.cpp io_source.cpp) +target_link_libraries(parquet_io_utils PRIVATE cudf::cudf) # Build and install parquet_io add_executable(parquet_io parquet_io.cpp) target_link_libraries( - parquet_io PRIVATE cudf::cudf nvToolsExt $ + parquet_io PRIVATE cudf::cudf nvToolsExt $ ) target_compile_features(parquet_io PRIVATE cxx_std_17) install(TARGETS parquet_io DESTINATION bin/examples/libcudf) @@ -30,7 +30,7 @@ install(TARGETS parquet_io DESTINATION bin/examples/libcudf) # Build and install parquet_io_multithreaded add_executable(parquet_io_multithreaded parquet_io_multithreaded.cpp) target_link_libraries( - parquet_io_multithreaded PRIVATE cudf::cudf nvToolsExt $ + parquet_io_multithreaded PRIVATE cudf::cudf nvToolsExt $ ) target_compile_features(parquet_io_multithreaded PRIVATE cxx_std_17) install(TARGETS parquet_io_multithreaded DESTINATION bin/examples/libcudf) diff --git a/cpp/examples/parquet_io/common_utils.cpp b/cpp/examples/parquet_io/common_utils.cpp index 62a5f4bdeae..13a8293e64c 100644 --- a/cpp/examples/parquet_io/common_utils.cpp +++ b/cpp/examples/parquet_io/common_utils.cpp @@ -99,7 +99,7 @@ bool get_boolean(std::string input) } } -void check_identical_tables(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table) +void check_tables_equal(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table) { try { // Left anti-join the original and transcoded tables diff --git a/cpp/examples/parquet_io/io_source.cpp b/cpp/examples/parquet_io/io_source.cpp 
new file mode 100644 index 00000000000..342c2749d7a --- /dev/null +++ b/cpp/examples/parquet_io/io_source.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "io_source.hpp" + +#include +#include + +#include +#include + +#include + +#include +#include + +rmm::host_async_resource_ref pinned_memory_resource() +{ + static auto mr = rmm::mr::pinned_host_memory_resource{}; + return mr; +} + +io_source_type get_io_source_type(std::string name) +{ + static std::unordered_map const map = { + {"FILEPATH", io_source_type::FILEPATH}, + {"HOST_BUFFER", io_source_type::HOST_BUFFER}, + {"PINNED_BUFFER", io_source_type::PINNED_BUFFER}, + {"DEVICE_BUFFER", io_source_type::DEVICE_BUFFER}}; + + std::transform(name.begin(), name.end(), name.begin(), ::toupper); + if (map.find(name) != map.end()) { + return map.at(name); + } else { + throw std::invalid_argument(name + + " is not a valid io source type. 
Available: FILEPATH,\n" + "HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER.\n\n"); + } +} + +io_source::io_source(std::string_view file_path, + io_source_type io_type, + rmm::cuda_stream_view stream) + : type{io_type}, + file_name{file_path}, + file_size{std::filesystem::file_size(file_name)}, + pinned_buffer({pinned_memory_resource(), stream}), + d_buffer{0, stream} +{ + // For filepath make a quick source_info and return early + if (type == io_source_type::FILEPATH) { + source_info = cudf::io::source_info(file_name); + return; + } + + std::ifstream file{file_name, std::ifstream::binary}; + + // Copy file contents to the specified io source buffer + switch (type) { + case io_source_type::HOST_BUFFER: { + h_buffer.resize(file_size); + file.read(h_buffer.data(), file_size); + source_info = cudf::io::source_info(h_buffer.data(), file_size); + break; + } + case io_source_type::PINNED_BUFFER: { + pinned_buffer.resize(file_size); + file.read(pinned_buffer.data(), file_size); + source_info = cudf::io::source_info(pinned_buffer.data(), file_size); + break; + } + case io_source_type::DEVICE_BUFFER: { + h_buffer.resize(file_size); + file.read(h_buffer.data(), file_size); + d_buffer.resize(file_size, stream); + CUDF_CUDA_TRY(cudaMemcpyAsync( + d_buffer.data(), h_buffer.data(), file_size, cudaMemcpyDefault, stream.value())); + + source_info = cudf::io::source_info(d_buffer); + break; + } + default: { + throw std::runtime_error("Encountered unexpected source type\n\n"); + } + } +} diff --git a/cpp/examples/parquet_io/io_source.hpp b/cpp/examples/parquet_io/io_source.hpp index 6ccc031d382..a296938adaa 100644 --- a/cpp/examples/parquet_io/io_source.hpp +++ b/cpp/examples/parquet_io/io_source.hpp @@ -17,14 +17,13 @@ #pragma once #include -#include #include -#include +#include +#include #include -#include #include /** @@ -38,16 +37,20 @@ */ enum class io_source_type { FILEPATH, HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER }; +/** + * @brief Get io source type from the string keyword 
argument + * + * @param name io source type keyword name + * @return io source type + */ +[[nodiscard]] io_source_type get_io_source_type(std::string name); + /** * @brief Create and return a reference to a static pinned memory pool * * @return Reference to a static pinned memory pool */ -rmm::host_async_resource_ref pinned_memory_resource() -{ - static auto mr = rmm::mr::pinned_host_memory_resource{}; - return mr; -} +rmm::host_async_resource_ref pinned_memory_resource(); /** * @brief Custom allocator for pinned_buffer via RMM. @@ -76,80 +79,13 @@ struct pinned_allocator : public std::allocator { rmm::cuda_stream_view stream; }; -/** - * @brief Get io source type from the string keyword argument - * - * @param name io source type keyword name - * @return io source type - */ -[[nodiscard]] io_source_type get_io_source_type(std::string name) -{ - static std::unordered_map const map = { - {"FILEPATH", io_source_type::FILEPATH}, - {"HOST_BUFFER", io_source_type::HOST_BUFFER}, - {"PINNED_BUFFER", io_source_type::PINNED_BUFFER}, - {"DEVICE_BUFFER", io_source_type::DEVICE_BUFFER}}; - - std::transform(name.begin(), name.end(), name.begin(), ::toupper); - if (map.find(name) != map.end()) { - return map.at(name); - } else { - throw std::invalid_argument(name + - " is not a valid io source type. 
Available: FILEPATH,\n" - "HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER.\n\n"); - } -} - /** * @brief Class to create a cudf::io::source_info of given type from the input parquet file * */ class io_source { public: - io_source(std::string_view file_path, io_source_type io_type, rmm::cuda_stream_view stream) - : type{io_type}, - file_name{file_path}, - file_size{std::filesystem::file_size(file_name)}, - pinned_buffer({pinned_memory_resource(), stream}), - d_buffer{0, stream} - { - // For filepath make a quick source_info and return early - if (type == io_source_type::FILEPATH) { - source_info = cudf::io::source_info(file_name); - return; - } - - std::ifstream file{file_name, std::ifstream::binary}; - - // Copy file contents to the specified io source buffer - switch (type) { - case io_source_type::HOST_BUFFER: { - h_buffer.resize(file_size); - file.read(h_buffer.data(), file_size); - source_info = cudf::io::source_info(h_buffer.data(), file_size); - break; - } - case io_source_type::PINNED_BUFFER: { - pinned_buffer.resize(file_size); - file.read(pinned_buffer.data(), file_size); - source_info = cudf::io::source_info(pinned_buffer.data(), file_size); - break; - } - case io_source_type::DEVICE_BUFFER: { - h_buffer.resize(file_size); - file.read(h_buffer.data(), file_size); - d_buffer.resize(file_size, stream); - CUDF_CUDA_TRY(cudaMemcpyAsync( - d_buffer.data(), h_buffer.data(), file_size, cudaMemcpyDefault, stream.value())); - - source_info = cudf::io::source_info(d_buffer); - break; - } - default: { - throw std::runtime_error("Encountered unexpected source type\n\n"); - } - } - } + io_source(std::string_view file_path, io_source_type io_type, rmm::cuda_stream_view stream); // Get the internal source info [[nodiscard]] cudf::io::source_info get_source_info() const { return source_info; } diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index e7966357cbd..b621dcde1c9 100644 --- 
a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -248,6 +248,7 @@ void print_usage() * @param thread_count Number of threads being used in the example * @param io_source_type Specified IO source type to convert input files to * @param stream CUDA stream to use + * * @return Vector of input sources for the given paths */ std::vector extract_input_sources_async(std::string const& paths, @@ -286,6 +287,7 @@ std::vector extract_input_sources_async(std::string const& paths, } } } else { + print_usage(); throw std::runtime_error("Encountered an invalid input path\n"); } }); @@ -353,7 +355,6 @@ int32_t main(int argc, char const** argv) } else input_paths = std::string{argv[1]}; [[fallthrough]]; - case 1: break; default: print_usage(); throw std::runtime_error(""); } @@ -373,6 +374,7 @@ int32_t main(int argc, char const** argv) // Check if there is nothing to do if (input_sources.empty()) { + print_usage(); throw std::runtime_error("No input files to read. 
Exiting early.\n"); } From 74763b05de8795af40db3d27c7708c8c9e262ab7 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 3 Oct 2024 19:51:13 +0000 Subject: [PATCH 31/37] Minor style fix --- cpp/examples/parquet_io/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt index e2e9919fc49..3b87efbac7e 100644 --- a/cpp/examples/parquet_io/CMakeLists.txt +++ b/cpp/examples/parquet_io/CMakeLists.txt @@ -21,9 +21,7 @@ target_link_libraries(parquet_io_utils PRIVATE cudf::cudf) # Build and install parquet_io add_executable(parquet_io parquet_io.cpp) -target_link_libraries( - parquet_io PRIVATE cudf::cudf nvToolsExt $ -) +target_link_libraries(parquet_io PRIVATE cudf::cudf nvToolsExt $) target_compile_features(parquet_io PRIVATE cxx_std_17) install(TARGETS parquet_io DESTINATION bin/examples/libcudf) From 06afb49d5879a3a53fb14555dae400ada2821eda Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 4 Oct 2024 17:52:13 +0000 Subject: [PATCH 32/37] Minor updates --- cpp/examples/parquet_io/CMakeLists.txt | 1 + cpp/examples/parquet_io/io_source.cpp | 1 + cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 11 ++++++----- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt index 3b87efbac7e..a7d0146b170 100644 --- a/cpp/examples/parquet_io/CMakeLists.txt +++ b/cpp/examples/parquet_io/CMakeLists.txt @@ -17,6 +17,7 @@ project( include(../fetch_dependencies.cmake) add_library(parquet_io_utils OBJECT common_utils.cpp io_source.cpp) +target_compile_features(parquet_io_utils PRIVATE cxx_std_17) target_link_libraries(parquet_io_utils PRIVATE cudf::cudf) # Build and install parquet_io diff --git a/cpp/examples/parquet_io/io_source.cpp b/cpp/examples/parquet_io/io_source.cpp index 
342c2749d7a..d3cdf6bb276 100644 --- a/cpp/examples/parquet_io/io_source.cpp +++ b/cpp/examples/parquet_io/io_source.cpp @@ -25,6 +25,7 @@ #include #include +#include #include rmm::host_async_resource_ref pinned_memory_resource() diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index b621dcde1c9..6b6a147f3a4 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -30,6 +30,7 @@ #include #include #include +#include /** * @file parquet_io_multithreaded.cpp @@ -349,11 +350,11 @@ int32_t main(int argc, char const** argv) input_multiplier = std::max(input_multiplier, std::stoi(std::string{argv[2]})); [[fallthrough]]; case 2: - if (auto arg = std::string{argv[1]}; arg == "-h" or arg == "--help") { - print_usage(); - return 0; - } else - input_paths = std::string{argv[1]}; + // Check if instead of input_paths, the first argument is `-h` or `--help` + if (auto arg = std::string{argv[1]}; arg != "-h" and arg != "--help") { + input_paths = std::move(arg); + break; + } [[fallthrough]]; default: print_usage(); throw std::runtime_error(""); } From 1a044099c9a0bb400ea0c4a57c20cdcd4bcfe66f Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 4 Oct 2024 23:47:52 +0000 Subject: [PATCH 33/37] Nits from code reviews --- cpp/examples/parquet_io/common_utils.cpp | 7 +--- cpp/examples/parquet_io/io_source.cpp | 13 +++--- cpp/examples/parquet_io/io_source.hpp | 4 -- .../parquet_io/parquet_io_multithreaded.cpp | 40 ++++++++++--------- 4 files changed, 27 insertions(+), 37 deletions(-) diff --git a/cpp/examples/parquet_io/common_utils.cpp b/cpp/examples/parquet_io/common_utils.cpp index 13a8293e64c..a79ca48af86 100644 --- a/cpp/examples/parquet_io/common_utils.cpp +++ b/cpp/examples/parquet_io/common_utils.cpp @@ -91,12 +91,7 @@ bool get_boolean(std::string input) std::transform(input.begin(), 
input.end(), input.begin(), ::toupper); // Check if the input string matches to any of the following - if (not input.compare("ON") or not input.compare("TRUE") or not input.compare("YES") or - not input.compare("Y") or not input.compare("T")) { - return true; - } else { - return false; - } + return input == "ON" or input == "TRUE" or input == "YES" or input == "Y" or input == "T"; } void check_tables_equal(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table) diff --git a/cpp/examples/parquet_io/io_source.cpp b/cpp/examples/parquet_io/io_source.cpp index d3cdf6bb276..019b3f96474 100644 --- a/cpp/examples/parquet_io/io_source.cpp +++ b/cpp/examples/parquet_io/io_source.cpp @@ -52,15 +52,12 @@ io_source_type get_io_source_type(std::string name) } } -io_source::io_source(std::string_view file_path, - io_source_type io_type, - rmm::cuda_stream_view stream) - : type{io_type}, - file_name{file_path}, - file_size{std::filesystem::file_size(file_name)}, - pinned_buffer({pinned_memory_resource(), stream}), - d_buffer{0, stream} +io_source::io_source(std::string_view file_path, io_source_type type, rmm::cuda_stream_view stream) + : pinned_buffer({pinned_memory_resource(), stream}), d_buffer{0, stream} { + std::string const file_name{file_path}; + auto const file_size = std::filesystem::file_size(file_name); + // For filepath make a quick source_info and return early if (type == io_source_type::FILEPATH) { source_info = cudf::io::source_info(file_name); diff --git a/cpp/examples/parquet_io/io_source.hpp b/cpp/examples/parquet_io/io_source.hpp index a296938adaa..a614d348fae 100644 --- a/cpp/examples/parquet_io/io_source.hpp +++ b/cpp/examples/parquet_io/io_source.hpp @@ -94,10 +94,6 @@ class io_source { // alias for pinned vector template using pinned_vector = thrust::host_vector>; - - io_source_type const type; - std::string const file_name; - size_t const file_size; cudf::io::source_info source_info; std::vector h_buffer; pinned_vector pinned_buffer; diff 
--git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 6b6a147f3a4..32d5aaa0e41 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -55,7 +55,7 @@ using table_t = std::unique_ptr; * @brief Behavior when handling the read tables by multiple threads */ enum class read_mode { - NOWORK, ///< Only read and discard tables + NO_CONCATENATE, ///< Only read and discard tables CONCATENATE_THREAD, ///< Read and concatenate tables from each thread CONCATENATE_ALL, ///< Read and concatenate everything to a single table }; @@ -82,15 +82,15 @@ struct read_fn { auto builder = cudf::io::parquet_reader_options::builder(input_sources[curr_file_idx].get_source_info()); auto const options = builder.build(); - if constexpr (READ_FN != read_mode::NOWORK) { + if constexpr (READ_FN != read_mode::NO_CONCATENATE) { tables_this_thread.push_back(cudf::io::read_parquet(options, stream).tbl); } else { cudf::io::read_parquet(options, stream); } } - // Concatenate the tables read by this thread if not NOWORK read_mode. - if constexpr (READ_FN != read_mode::NOWORK) { + // Concatenate the tables read by this thread if not NO_CONCATENATE read_mode. 
+ if constexpr (READ_FN != read_mode::NO_CONCATENATE) { auto table = concatenate_tables(std::move(tables_this_thread), stream); stream.synchronize_no_throw(); tables[thread_id] = std::move(table); @@ -107,7 +107,7 @@ struct read_fn { * @tparam read_mode Specifies if to concatenate and return the actual * tables or discard them and return an empty vector * - * @param files List of files to read + * @param input_sources List of input sources to read * @param thread_count Number of threads * @param stream_pool CUDA stream pool to use for threads * @@ -136,7 +136,7 @@ std::vector read_parquet_multithreaded(std::vector const& in std::vector threads; threads.reserve(thread_count); for (auto& c : read_tasks) { - threads.emplace_back(std::thread{c}); + threads.emplace_back(c); } for (auto& t : threads) { t.join(); @@ -210,7 +210,7 @@ void write_parquet_multithreaded(std::string const& output_path, std::vector threads; threads.reserve(thread_count); for (auto& c : write_tasks) { - threads.emplace_back(std::thread{c}); + threads.emplace_back(c); } for (auto& t : threads) { t.join(); @@ -237,12 +237,12 @@ void print_usage() /** * @brief Function to process comma delimited input paths string to parquet files and/or dirs - * and asynchronously convert them to specified io sources. + * and convert them to specified io sources. * * Process the input path string containing directories (of parquet files) and/or individual * parquet files into a list of input parquet files, multiple the list by `input_multiplier`, * make sure to have at least `thread_count` files to satisfy at least file per parallel thread, - * and asynchronously convert the final list of files to a list of `io_source` and return. + * and convert the final list of files to a list of `io_source` and return. 
* * @param paths Comma delimited input paths string * @param input_multiplier Multiplier for the input files list @@ -252,11 +252,11 @@ void print_usage() * * @return Vector of input sources for the given paths */ -std::vector extract_input_sources_async(std::string const& paths, - int32_t input_multiplier, - int32_t thread_count, - io_source_type io_source_type, - rmm::cuda_stream_view stream) +std::vector extract_input_sources(std::string const& paths, + int32_t input_multiplier, + int32_t thread_count, + io_source_type io_source_type, + rmm::cuda_stream_view stream) { // Get the delimited paths to directory and/or files. std::vector const delimited_paths = [&]() { @@ -310,6 +310,9 @@ std::vector extract_input_sources_async(std::string const& paths, }); // Cycle append parquet files from the existing ones if less than the thread_count + std::cout << "Warning: Number of input sources < thread count. Cycling from\n" + "and appending to current input sources such that the number of\n" + "input source == thread count\n"; for (size_t idx = 0; thread_count > static_cast(parquet_files.size()); idx++) { parquet_files.emplace_back(parquet_files[idx % initial_size]); } @@ -324,6 +327,7 @@ std::vector extract_input_sources_async(std::string const& paths, [&](auto const& file_name) { return io_source{file_name, io_source_type, stream}; }); + stream.synchronize(); return input_sources; } @@ -369,9 +373,8 @@ int32_t main(int argc, char const** argv) rmm::mr::set_current_device_resource(&stats_mr); // List of input sources from the input_paths string. 
- auto const input_sources = extract_input_sources_async( + auto const input_sources = extract_input_sources( input_paths, input_multiplier, thread_count, io_source_type, default_stream); - default_stream.synchronize(); // Check if there is nothing to do if (input_sources.empty()) { @@ -396,7 +399,7 @@ int32_t main(int argc, char const** argv) std::for_each(thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_reads), [&](auto i) { // Read parquet files and discard the tables - std::ignore = read_parquet_multithreaded( + std::ignore = read_parquet_multithreaded( input_sources, thread_count, stream_pool); }); default_stream.synchronize(); @@ -440,9 +443,8 @@ int32_t main(int argc, char const** argv) auto const input_table = cudf::concatenate(table_views, default_stream); // Sources from written parquet files - auto const written_pq_sources = extract_input_sources_async( + auto const written_pq_sources = extract_input_sources( output_path, input_multiplier, thread_count, io_source_type, default_stream); - default_stream.synchronize(); // read_mode::CONCATENATE_ALL returns a concatenated vector of 1 table only auto const transcoded_table = std::move(read_parquet_multithreaded( From 3a590275242658600b78ee7eb7ccac01d88a7f5a Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 10 Oct 2024 20:20:18 +0000 Subject: [PATCH 34/37] Minor arg setting --- ci/run_cudf_examples.sh | 2 +- cpp/examples/parquet_io/parquet_io.cpp | 51 +++++++++++++++----------- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh index f8c3ed20b03..2439af5b644 100755 --- a/ci/run_cudf_examples.sh +++ b/ci/run_cudf_examples.sh @@ -23,7 +23,7 @@ compute-sanitizer --tool memcheck custom_optimized names.csv compute-sanitizer --tool memcheck custom_prealloc names.csv compute-sanitizer --tool memcheck custom_with_malloc names.csv -compute-sanitizer --tool memcheck parquet_io 
+compute-sanitizer --tool memcheck parquet_io example.parquet compute-sanitizer --tool memcheck parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD TRUE compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index aeb47225cac..9c34b342b62 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -85,6 +85,18 @@ void write_parquet(cudf::table_view input, cudf::io::write_parquet(options); } +/** + * @brief Function to print example usage and argument information. + */ +void print_usage() +{ + std::cout << "\nUsage: parquet_io \n" + " \n\n" + "Available encoding types: DEFAULT, DICTIONARY, PLAIN, DELTA_BINARY_PACKED,\n" + " DELTA_LENGTH_BYTE_ARRAY, DELTA_BYTE_ARRAY\n\n" + "Available compression types: NONE, AUTO, SNAPPY, LZ4, ZSTD\n\n"; +} + /** * @brief Main for nested_types examples * @@ -101,31 +113,28 @@ void write_parquet(cudf::table_view input, */ int main(int argc, char const** argv) { - std::string input_filepath; - std::string output_filepath; - cudf::io::column_encoding encoding; - cudf::io::compression_type compression; - std::optional page_stats; + std::string input_filepath = "example.parquet"; + std::string output_filepath = "output.parquet"; + cudf::io::column_encoding encoding = get_encoding_type("DELTA_BINARY_PACKED"); + cudf::io::compression_type compression = get_compression_type("ZSTD"); + std::optional page_stats = std::nullopt; switch (argc) { - case 1: - input_filepath = "example.parquet"; - output_filepath = "output.parquet"; - encoding = get_encoding_type("DELTA_BINARY_PACKED"); - compression = get_compression_type("ZSTD"); - break; case 6: - if (get_boolean(argv[5])) { page_stats = cudf::io::statistics_freq::STATISTICS_COLUMN; }; + page_stats = get_boolean(argv[5]) + ? 
std::make_optional(cudf::io::statistics_freq::STATISTICS_COLUMN) + : std::nullopt; + [[fallthrough]]; + case 5: compression = get_compression_type(argv[4]); [[fallthrough]]; + case 4: encoding = get_encoding_type(argv[3]); [[fallthrough]]; + case 3: output_filepath = argv[2]; [[fallthrough]]; + case 2: // Check if instead of input_paths, the first argument is `-h` or `--help` + if (auto arg = std::string{argv[1]}; arg != "-h" and arg != "--help") { + input_filepath = std::move(arg); + break; + } [[fallthrough]]; - case 5: - input_filepath = argv[1]; - output_filepath = argv[2]; - encoding = get_encoding_type(argv[3]); - compression = get_compression_type(argv[4]); - break; - default: - throw std::runtime_error( - "Either provide all command-line arguments, or none to use defaults\n"); + default: print_usage(); throw std::runtime_error(""); } // Create and use a memory pool From 7cfd7ae4b0edc7547ed4e354f332dfa253aa4c21 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 11 Oct 2024 00:05:34 +0000 Subject: [PATCH 35/37] Adjust spacing --- cpp/examples/parquet_io/parquet_io.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 9c34b342b62..c11b8de82b5 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -91,7 +91,7 @@ void write_parquet(cudf::table_view input, void print_usage() { std::cout << "\nUsage: parquet_io \n" - " \n\n" + " \n\n" "Available encoding types: DEFAULT, DICTIONARY, PLAIN, DELTA_BINARY_PACKED,\n" " DELTA_LENGTH_BYTE_ARRAY, DELTA_BYTE_ARRAY\n\n" "Available compression types: NONE, AUTO, SNAPPY, LZ4, ZSTD\n\n"; From d9102f00f9b7589fdbf8d456c019d42bca1f75ad Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 11 Oct 2024 00:07:28 +0000 Subject: [PATCH 36/37] Apply suggestion --- 
cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 32d5aaa0e41..82f55473e7e 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -406,7 +406,7 @@ int32_t main(int argc, char const** argv) timer.print_elapsed_millis(); } - // Do we need to write parquet files and validate? + // Write parquet files and validate if needed if (write_and_validate) { // read_mode::CONCATENATE_THREADS returns a vector of `thread_count` tables auto const tables = read_parquet_multithreaded( From b61f18ee1769c2b67c969c8ccdec9fc150292997 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 11 Oct 2024 18:48:28 +0000 Subject: [PATCH 37/37] Minor --- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 82f55473e7e..6ad4b862240 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -63,7 +63,7 @@ enum class read_mode { /** * @brief Functor for multithreaded parquet reading based on the provided read_mode */ -template +template struct read_fn { std::vector const& input_sources; std::vector& tables; @@ -82,7 +82,7 @@ struct read_fn { auto builder = cudf::io::parquet_reader_options::builder(input_sources[curr_file_idx].get_source_info()); auto const options = builder.build(); - if constexpr (READ_FN != read_mode::NO_CONCATENATE) { + if constexpr (read_mode != read_mode::NO_CONCATENATE) { tables_this_thread.push_back(cudf::io::read_parquet(options, stream).tbl); } else { cudf::io::read_parquet(options, stream); @@ -90,7 +90,7 @@ struct 
read_fn { } // Concatenate the tables read by this thread if not NO_CONCATENATE read_mode. - if constexpr (READ_FN != read_mode::NO_CONCATENATE) { + if constexpr (read_mode != read_mode::NO_CONCATENATE) { auto table = concatenate_tables(std::move(tables_this_thread), stream); stream.synchronize_no_throw(); tables[thread_id] = std::move(table);