From ff2480b6c2e8acb1c6bccfb197b11a53d229699c Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 18 Sep 2024 02:48:09 +0000 Subject: [PATCH 01/37] Add the new multithreaded parquet example --- cpp/examples/parquet_io/CMakeLists.txt | 13 +- .../parquet_io/{parquet_io.hpp => common.hpp} | 6 +- cpp/examples/parquet_io/parquet_io.cpp | 4 +- .../parquet_io/parquet_io_multithreaded.cpp | 290 ++++++++++++++++++ 4 files changed, 306 insertions(+), 7 deletions(-) rename cpp/examples/parquet_io/{parquet_io.hpp => common.hpp} (97%) create mode 100644 cpp/examples/parquet_io/parquet_io_multithreaded.cpp diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt index d8e9205ffd4..1e1d2c3516f 100644 --- a/cpp/examples/parquet_io/CMakeLists.txt +++ b/cpp/examples/parquet_io/CMakeLists.txt @@ -16,10 +16,17 @@ project( include(../fetch_dependencies.cmake) -# Configure your project here +# Build and install parquet_io add_executable(parquet_io parquet_io.cpp) target_link_libraries(parquet_io PRIVATE cudf::cudf) target_compile_features(parquet_io PRIVATE cxx_std_17) - install(TARGETS parquet_io DESTINATION bin/examples/libcudf) -install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.parquet DESTINATION bin/examples/libcudf) + +# Build and install parquet_io_multithreaded +add_executable(parquet_io_multithreaded parquet_io_multithreaded.cpp) +target_link_libraries(parquet_io_multithreaded PRIVATE cudf::cudf) +target_compile_features(parquet_io_multithreaded PRIVATE cxx_std_17) +install(TARGETS parquet_io_multithreaded DESTINATION bin/examples/libcudf) + +# Install the example.parquet file +install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.parquet DESTINATION bin/examples/libcudf) \ No newline at end of file diff --git a/cpp/examples/parquet_io/parquet_io.hpp b/cpp/examples/parquet_io/common.hpp similarity index 97% rename from cpp/examples/parquet_io/parquet_io.hpp rename to cpp/examples/parquet_io/common.hpp index e27cbec4fce..2095a0b237c 100644 --- 
a/cpp/examples/parquet_io/parquet_io.hpp +++ b/cpp/examples/parquet_io/common.hpp @@ -16,12 +16,16 @@ #pragma once +#include "../utilities/timer.hpp" + +#include #include #include #include #include #include +#include #include #include #include @@ -123,4 +127,4 @@ std::shared_ptr create_memory_resource(bool is_ } return std::nullopt; -} +} \ No newline at end of file diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 442731694fa..cfd230d3751 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -14,9 +14,7 @@ * limitations under the License. */ -#include "parquet_io.hpp" - -#include "../utilities/timer.hpp" +#include "common.hpp" /** * @file parquet_io.cpp diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp new file mode 100644 index 00000000000..8f1b08754a9 --- /dev/null +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "common.hpp" + +#include + +#include + +/** + * @file parquet_io_multithreaded.cpp + * @brief Demonstrates usage of the libcudf APIs to read and write + * parquet file format with different encodings and compression types + * using multiple threads. 
+ * + * The following encoding and compression types are demonstrated: + * Encoding Types: DEFAULT, DICTIONARY, PLAIN, DELTA_BINARY_PACKED, + * DELTA_LENGTH_BYTE_ARRAY, DELTA_BYTE_ARRAY + * + * Compression Types: NONE, AUTO, SNAPPY, LZ4, ZSTD + * + */ + +using table_t = std::unique_ptr; + +struct read_fn { + std::vector const& input_files; + std::vector& tables; + int const thread_id; + int const thread_count; + rmm::cuda_stream_view stream; + + void operator()() + { + std::vector tables_this_thread; + for (auto curr_file_idx = thread_id; curr_file_idx < input_files.size(); + curr_file_idx += thread_count) { + auto const source_info = cudf::io::source_info(input_files[curr_file_idx]); + auto builder = cudf::io::parquet_reader_options::builder(source_info); + auto const options = builder.build(); + tables_this_thread.push_back(cudf::io::read_parquet(options, stream).tbl); + } + + // Concatenate all tables read by this thread. + auto table = std::move(tables_this_thread[0]); + std::for_each(tables_this_thread.begin() + 1, tables_this_thread.end(), [&](auto& tbl) { + std::vector const table_views{table->view(), tbl->view()}; + table = cudf::concatenate(table_views, stream); + }); + + // Done with this stream + stream.synchronize_no_throw(); + + tables[thread_id] = std::move(table); + } +}; + +struct write_fn { + std::string const& output_path; + std::vector const& tables; + cudf::io::column_encoding const encoding; + cudf::io::compression_type const compression; + std::optional const stats_level; + int const thread_id; + + void operator()() + { + // write the data for inspection + auto sink_info = + cudf::io::sink_info(output_path + "/table_" + std::to_string(thread_id) + ".parquet"); + auto builder = cudf::io::parquet_writer_options::builder(sink_info, tables[thread_id]->view()) + .compression(compression) + .stats_level(stats_level.value_or(cudf::io::statistics_freq::STATISTICS_NONE)); + auto table_metadata = 
cudf::io::table_input_metadata{tables[thread_id]->view()}; + + std::for_each(table_metadata.column_metadata.begin(), + table_metadata.column_metadata.end(), + [=](auto& col_meta) { col_meta.set_encoding(encoding); }); + + builder.metadata(table_metadata); + auto options = builder.build(); + // Write parquet data + cudf::io::write_parquet(options); + } +}; + +int main(int argc, char const** argv) +{ + std::string input_paths; + std::string output_path; + cudf::io::column_encoding encoding; + cudf::io::compression_type compression; + std::optional page_stats; + int thread_count; + + switch (argc) { + case 1: + input_paths = "example.parquet"; + output_path = "output.parquet"; + encoding = get_encoding_type("DELTA_BINARY_PACKED"); + compression = get_compression_type("ZSTD"); + thread_count = 2; + break; + case 7: page_stats = get_page_size_stats(argv[6]); [[fallthrough]]; + case 6: + input_paths = std::string{argv[1]}; + output_path = std::string{argv[2]}; + encoding = get_encoding_type(argv[3]); + compression = get_compression_type(argv[4]); + thread_count = std::stoi(std::string(argv[5])); + break; + default: + throw std::runtime_error( + "Either provide all command-line arguments, or none to use defaults\n" + "Use: parquet_io_multithreaded " + " " + "\n"); + } + + // Process and extract all input files + auto const input_files = [&]() { + std::vector parquet_files; + std::vector delimited_paths = [&]() { + std::vector paths_list; + std::stringstream stream{input_paths}; + std::string path; + // extract the delimited paths. + while (std::getline(stream, path, char{','})) { + paths_list.push_back(path); // Add each token to the vector + } + return paths_list; + }(); + + std::for_each(delimited_paths.cbegin(), delimited_paths.cend(), [&](auto const& path_string) { + std::filesystem::path path{path_string}; + // If this is a parquet file, add it. 
+ if (std::filesystem::is_regular_file(path)) { + parquet_files.push_back(path_string); + } + // If this is a directory, add all files at this path + else if (std::filesystem::is_directory(path)) { + for (auto const& file : std::filesystem::directory_iterator(path)) { + if (std::filesystem::is_regular_file(file.path())) { + parquet_files.push_back(file.path().string()); + } + } + } else { + throw std::runtime_error("Encountered an invalid input path\n"); + } + }); + + // Add parquet files from existing ones if less than thread_count + for (size_t idx = 0, initial_size = parquet_files.size(); + thread_count > static_cast(parquet_files.size()); + idx++) { + parquet_files.push_back(parquet_files[idx % initial_size]); + } + + return parquet_files; + }(); + + // Exit early if nothing to do. + if (not input_files.size()) { return 0; } + + // Check if output path is a directory. + if (not std::filesystem::is_directory(std::filesystem::path{output_path})) { + throw std::runtime_error("The provided output path is not a directory\n"); + } + + auto const is_pool_used = true; + auto resource = create_memory_resource(is_pool_used); + auto default_stream = cudf::get_default_stream(); + auto stream_pool = rmm::cuda_stream_pool(thread_count); + auto stats_mr = + rmm::mr::statistics_resource_adaptor(resource.get()); + rmm::mr::set_current_device_resource(&stats_mr); + + // Lambda function to setup and launch multithread parquet read + auto const read_parquet_multithreaded = [&]() { + // Tables read by each thread + std::vector tables(thread_count); + + // Tasks to read each parquet file + std::vector read_tasks; + read_tasks.reserve(thread_count); + std::for_each(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(thread_count), + [&](auto tid) { + read_tasks.emplace_back( + read_fn{input_files, tables, tid, thread_count, stream_pool.get_stream()}); + }); + + std::vector threads; + threads.reserve(thread_count); + for (auto& c : read_tasks) { + 
threads.emplace_back(std::thread{c}); + } + for (auto& t : threads) { + t.join(); + } + return tables; + }; + + // Lambda function to setup and launch multithread parquet write + auto const write_parquet_multithreaded = [&](std::vector const& tables) { + // Tasks to read each parquet file + std::vector write_tasks; + write_tasks.reserve(thread_count); + std::for_each(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(thread_count), + [&](auto tid) { + write_tasks.emplace_back( + write_fn{output_path, tables, encoding, compression, page_stats, tid}); + }); + + std::vector threads; + threads.reserve(thread_count); + for (auto& c : write_tasks) { + threads.emplace_back(std::thread{c}); + } + for (auto& t : threads) { + t.join(); + } + }; + + // Read the parquet files with multiple threads + { + std::cout << "Note: Not timing the initial parquet read as it may include\n" + "times for nvcomp, cufile loading and RMM growth." + << std::endl + << std::endl; + + // tables read by each thread + auto const tables = read_parquet_multithreaded(); + + // In case some kernels are still running on the default stre + default_stream.synchronize(); + + // Write parquet file with the specified encoding and compression + auto const page_stat_string = (page_stats.has_value()) ? "page stats" : "no page stats"; + std::cout << "Writing at: " << output_path << " with encoding, compression and " + << page_stat_string << ".." << std::endl; + + // Write tables using multiple threads + cudf::examples::timer timer; + write_parquet_multithreaded(tables); + + // In case some kernels are still running on the default stream + default_stream.synchronize(); + + // Print elapsed time + timer.print_elapsed_millis(); + } + + // Re-read the parquet files with multiple threads + { + std::cout << "Reading for the second time using " << thread_count << " threads..." 
<< std::endl; + cudf::examples::timer timer; + auto tables = read_parquet_multithreaded(); + + // Construct the final table + auto table = std::move(tables[0]); + std::for_each(tables.begin() + 1, tables.end(), [&](auto& tbl) { + std::vector const table_views{table->view(), tbl->view()}; + table = cudf::concatenate(table_views, default_stream); + }); + + // In case some kernels are still running on the default stream + default_stream.synchronize(); + + // Print elapsed time and peak memory + timer.print_elapsed_millis(); + std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n"; + } + + return 0; +} From d06f7f2b584b53f0a24c8fbd1ff5c78875a8f4be Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 18 Sep 2024 02:55:07 +0000 Subject: [PATCH 02/37] Set the default output path to the current path --- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 8f1b08754a9..8b90bce68c1 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -109,7 +109,7 @@ int main(int argc, char const** argv) switch (argc) { case 1: input_paths = "example.parquet"; - output_path = "output.parquet"; + output_path = std::filesystem::current_path().string(); encoding = get_encoding_type("DELTA_BINARY_PACKED"); compression = get_compression_type("ZSTD"); thread_count = 2; From c13a4087407bcb6e7f85bce60450f6e84eca7245 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 18 Sep 2024 18:48:28 +0000 Subject: [PATCH 03/37] Style fix --- cpp/examples/parquet_io/CMakeLists.txt | 2 +- cpp/examples/parquet_io/common.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt index 1e1d2c3516f..28ade3666bf 100644 --- 
a/cpp/examples/parquet_io/CMakeLists.txt +++ b/cpp/examples/parquet_io/CMakeLists.txt @@ -29,4 +29,4 @@ target_compile_features(parquet_io_multithreaded PRIVATE cxx_std_17) install(TARGETS parquet_io_multithreaded DESTINATION bin/examples/libcudf) # Install the example.parquet file -install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.parquet DESTINATION bin/examples/libcudf) \ No newline at end of file +install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.parquet DESTINATION bin/examples/libcudf) diff --git a/cpp/examples/parquet_io/common.hpp b/cpp/examples/parquet_io/common.hpp index 2095a0b237c..57c6a8b4f0f 100644 --- a/cpp/examples/parquet_io/common.hpp +++ b/cpp/examples/parquet_io/common.hpp @@ -127,4 +127,4 @@ std::shared_ptr create_memory_resource(bool is_ } return std::nullopt; -} \ No newline at end of file +} From 12adeebd6900e7bf203108a2b5a4f0a1d0c1ed11 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 18 Sep 2024 19:03:51 +0000 Subject: [PATCH 04/37] Use stream pool for parquet write as well --- .../parquet_io/parquet_io_multithreaded.cpp | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 8b90bce68c1..f39f86b7e08 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -75,6 +75,7 @@ struct write_fn { cudf::io::compression_type const compression; std::optional const stats_level; int const thread_id; + rmm::cuda_stream_view stream; void operator()() { @@ -92,8 +93,12 @@ struct write_fn { builder.metadata(table_metadata); auto options = builder.build(); + // Write parquet data - cudf::io::write_parquet(options); + cudf::io::write_parquet(options, stream); + + // Done with this stream + stream.synchronize_no_throw(); } }; @@ -189,7 +194,7 @@ int main(int argc, char const** argv) rmm::mr::set_current_device_resource(&stats_mr); // Lambda 
function to setup and launch multithread parquet read - auto const read_parquet_multithreaded = [&]() { + auto const read_parquet_multithreaded = [&](std::vector const& files) { // Tables read by each thread std::vector tables(thread_count); @@ -200,7 +205,7 @@ int main(int argc, char const** argv) thrust::make_counting_iterator(thread_count), [&](auto tid) { read_tasks.emplace_back( - read_fn{input_files, tables, tid, thread_count, stream_pool.get_stream()}); + read_fn{files, tables, tid, thread_count, stream_pool.get_stream()}); }); std::vector threads; @@ -219,12 +224,13 @@ int main(int argc, char const** argv) // Tasks to read each parquet file std::vector write_tasks; write_tasks.reserve(thread_count); - std::for_each(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(thread_count), - [&](auto tid) { - write_tasks.emplace_back( - write_fn{output_path, tables, encoding, compression, page_stats, tid}); - }); + std::for_each( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(thread_count), + [&](auto tid) { + write_tasks.emplace_back(write_fn{ + output_path, tables, encoding, compression, page_stats, tid, stream_pool.get_stream()}); + }); std::vector threads; threads.reserve(thread_count); @@ -244,7 +250,7 @@ int main(int argc, char const** argv) << std::endl; // tables read by each thread - auto const tables = read_parquet_multithreaded(); + auto const tables = read_parquet_multithreaded(input_files); // In case some kernels are still running on the default stre default_stream.synchronize(); @@ -265,12 +271,11 @@ int main(int argc, char const** argv) timer.print_elapsed_millis(); } - // Re-read the parquet files with multiple threads + // Re-read the same parquet files with multiple threads { std::cout << "Reading for the second time using " << thread_count << " threads..." 
<< std::endl; cudf::examples::timer timer; - auto tables = read_parquet_multithreaded(); - + auto tables = read_parquet_multithreaded(input_files); // Construct the final table auto table = std::move(tables[0]); std::for_each(tables.begin() + 1, tables.end(), [&](auto& tbl) { From a8ae50a42060f79904b7217d0a9eb4daabd046fd Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 18 Sep 2024 20:10:46 +0000 Subject: [PATCH 05/37] Add more details to the example --- .../parquet_io/parquet_io_multithreaded.cpp | 81 +++++++++++++++---- 1 file changed, 64 insertions(+), 17 deletions(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index f39f86b7e08..f46f02966f1 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -16,8 +16,11 @@ #include "common.hpp" +#include #include +#include + #include /** @@ -136,15 +139,15 @@ int main(int argc, char const** argv) } // Process and extract all input files - auto const input_files = [&]() { + auto const extract_input_files = [thread_count = thread_count](std::string const& paths) { std::vector parquet_files; std::vector delimited_paths = [&]() { std::vector paths_list; - std::stringstream stream{input_paths}; + std::stringstream stream{paths}; std::string path; - // extract the delimited paths. + // Extract the delimited paths. 
while (std::getline(stream, path, char{','})) { - paths_list.push_back(path); // Add each token to the vector + paths_list.push_back(path); } return paths_list; }(); @@ -175,13 +178,37 @@ int main(int argc, char const** argv) } return parquet_files; - }(); + }; + + // Concatenate a vector of tables and return + auto const concatenate_tables = [](std::vector& tables, rmm::cuda_stream_view stream) { + // Construct the final table + auto table = std::move(tables[0]); + std::for_each(tables.begin() + 1, tables.end(), [&](auto& tbl) { + std::vector const table_views{table->view(), tbl->view()}; + table = cudf::concatenate(table_views, stream); + }); + return table; + }; + + // make input files from the input_paths string. + auto const input_files = extract_input_files(input_paths); // Exit early if nothing to do. - if (not input_files.size()) { return 0; } + if (not input_files.size()) { + std::cerr << "No input files to read. Exiting early.\n"; + return 0; + } - // Check if output path is a directory. - if (not std::filesystem::is_directory(std::filesystem::path{output_path})) { + // Check if output path is a valid + if (std::filesystem::is_directory({output_path})) { + // Create a new directory in output path if not empty. + if (not std::filesystem::is_empty({output_path})) { + output_path += + "/output_" + fmt::format("{:%Y-%m-%d-%H-%M-%S}", std::chrono::system_clock::now()); + std::filesystem::create_directory({output_path}); + } + } else { throw std::runtime_error("The provided output path is not a directory\n"); } @@ -275,20 +302,40 @@ int main(int argc, char const** argv) { std::cout << "Reading for the second time using " << thread_count << " threads..." 
<< std::endl; cudf::examples::timer timer; - auto tables = read_parquet_multithreaded(input_files); - // Construct the final table - auto table = std::move(tables[0]); - std::for_each(tables.begin() + 1, tables.end(), [&](auto& tbl) { - std::vector const table_views{table->view(), tbl->view()}; - table = cudf::concatenate(table_views, default_stream); - }); - + auto tables = read_parquet_multithreaded(input_files); + auto const table = concatenate_tables(tables, default_stream); // In case some kernels are still running on the default stream default_stream.synchronize(); + // Print elapsed time and peak memory + timer.print_elapsed_millis(); + std::cout << "Reading transcoded files using " << thread_count << " threads..." << std::endl; + timer.reset(); + auto transcoded_tables = read_parquet_multithreaded(extract_input_files(output_path)); + auto const transcoded_table = concatenate_tables(transcoded_tables, default_stream); // Print elapsed time and peak memory timer.print_elapsed_millis(); - std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n"; + + // In case some kernels are still running on the default stream + default_stream.synchronize(); + + std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n\n"; + + // Check for validity + try { + // Left anti-join the original and transcoded tables + // identical tables should not throw an exception and + // return an empty indices vector + auto const indices = cudf::left_anti_join( + table->view(), transcoded_table->view(), cudf::null_equality::EQUAL, resource.get()); + + // No exception thrown, check indices + auto const valid = indices->size() == 0; + std::cout << "Transcoding valid: " << std::boolalpha << valid << std::endl; + } catch (std::exception& e) { + std::cerr << e.what() << std::endl << std::endl; + std::cout << "Transcoding valid: false" << std::endl; + } } return 0; From 6679f89196615361763357058dc1293d78ed88b3 Mon Sep 17 00:00:00 
2001 From: Muhammad Haseeb Date: Wed, 18 Sep 2024 20:26:19 +0000 Subject: [PATCH 06/37] Minor improvements --- cpp/examples/parquet_io/common.hpp | 24 +++++++++++++ cpp/examples/parquet_io/parquet_io.cpp | 15 +------- .../parquet_io/parquet_io_multithreaded.cpp | 36 +++++-------------- 3 files changed, 34 insertions(+), 41 deletions(-) diff --git a/cpp/examples/parquet_io/common.hpp b/cpp/examples/parquet_io/common.hpp index 57c6a8b4f0f..f4e5757412a 100644 --- a/cpp/examples/parquet_io/common.hpp +++ b/cpp/examples/parquet_io/common.hpp @@ -128,3 +128,27 @@ std::shared_ptr create_memory_resource(bool is_ return std::nullopt; } + +/** + * @brief Check if two tables are identical, throw an error otherwise + * + * @param lhs_table View to lhs table + * @param rhs_table View to rhs table + */ +inline void check_identical_tables(cudf::table_view const& lhs_table, + cudf::table_view const& rhs_table) +{ + try { + // Left anti-join the original and transcoded tables + // identical tables should not throw an exception and + // return an empty indices vector + auto const indices = cudf::left_anti_join(lhs_table, rhs_table, cudf::null_equality::EQUAL); + + // No exception thrown, check indices + auto const valid = indices->size() == 0; + std::cout << "Transcoding valid: " << std::boolalpha << valid << std::endl; + } catch (std::exception& e) { + std::cerr << e.what() << std::endl << std::endl; + throw std::runtime_error("Transcoding valid: false\n"); + } +} diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index cfd230d3751..c981928e8f2 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -153,20 +153,7 @@ int main(int argc, char const** argv) timer.print_elapsed_millis(); // Check for validity - try { - // Left anti-join the original and transcoded tables - // identical tables should not throw an exception and - // return an empty indices vector - auto const indices = 
cudf::left_anti_join( - input->view(), transcoded_input->view(), cudf::null_equality::EQUAL, resource.get()); - - // No exception thrown, check indices - auto const valid = indices->size() == 0; - std::cout << "Transcoding valid: " << std::boolalpha << valid << std::endl; - } catch (std::exception& e) { - std::cerr << e.what() << std::endl << std::endl; - std::cout << "Transcoding valid: false" << std::endl; - } + check_identical_tables(input->view(), transcoded_input->view()); return 0; } diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index f46f02966f1..6664eccb496 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -181,7 +181,7 @@ int main(int argc, char const** argv) }; // Concatenate a vector of tables and return - auto const concatenate_tables = [](std::vector& tables, rmm::cuda_stream_view stream) { + auto const concatenate_tables = [](std::vector tables, rmm::cuda_stream_view stream) { // Construct the final table auto table = std::move(tables[0]); std::for_each(tables.begin() + 1, tables.end(), [&](auto& tbl) { @@ -275,10 +275,8 @@ int main(int argc, char const** argv) "times for nvcomp, cufile loading and RMM growth." 
<< std::endl << std::endl; - - // tables read by each thread + // Tables read by each thread auto const tables = read_parquet_multithreaded(input_files); - // In case some kernels are still running on the default stre default_stream.synchronize(); @@ -290,10 +288,8 @@ int main(int argc, char const** argv) // Write tables using multiple threads cudf::examples::timer timer; write_parquet_multithreaded(tables); - // In case some kernels are still running on the default stream default_stream.synchronize(); - // Print elapsed time timer.print_elapsed_millis(); } @@ -302,8 +298,8 @@ int main(int argc, char const** argv) { std::cout << "Reading for the second time using " << thread_count << " threads..." << std::endl; cudf::examples::timer timer; - auto tables = read_parquet_multithreaded(input_files); - auto const table = concatenate_tables(tables, default_stream); + auto const input_table = + concatenate_tables(read_parquet_multithreaded(input_files), default_stream); // In case some kernels are still running on the default stream default_stream.synchronize(); // Print elapsed time and peak memory @@ -311,31 +307,17 @@ int main(int argc, char const** argv) std::cout << "Reading transcoded files using " << thread_count << " threads..." 
<< std::endl; timer.reset(); - auto transcoded_tables = read_parquet_multithreaded(extract_input_files(output_path)); - auto const transcoded_table = concatenate_tables(transcoded_tables, default_stream); - // Print elapsed time and peak memory - timer.print_elapsed_millis(); - + auto const transcoded_table = concatenate_tables( + read_parquet_multithreaded(extract_input_files(output_path)), default_stream); // In case some kernels are still running on the default stream default_stream.synchronize(); + // Print elapsed time and peak memory + timer.print_elapsed_millis(); std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n\n"; // Check for validity - try { - // Left anti-join the original and transcoded tables - // identical tables should not throw an exception and - // return an empty indices vector - auto const indices = cudf::left_anti_join( - table->view(), transcoded_table->view(), cudf::null_equality::EQUAL, resource.get()); - - // No exception thrown, check indices - auto const valid = indices->size() == 0; - std::cout << "Transcoding valid: " << std::boolalpha << valid << std::endl; - } catch (std::exception& e) { - std::cerr << e.what() << std::endl << std::endl; - std::cout << "Transcoding valid: false" << std::endl; - } + check_identical_tables(input_table->view(), transcoded_table->view()); } return 0; From e04602c34c42982efaffc78a5c3b7bcfb02d74b1 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 18 Sep 2024 20:58:18 +0000 Subject: [PATCH 07/37] Minor improvement --- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 6664eccb496..5c9be0892cb 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -128,7 +128,7 @@ int main(int argc, char const** argv) 
output_path = std::string{argv[2]}; encoding = get_encoding_type(argv[3]); compression = get_compression_type(argv[4]); - thread_count = std::stoi(std::string(argv[5])); + thread_count = std::max(thread_count, std::stoi(std::string{argv[5]})); break; default: throw std::runtime_error( From 21ce7c7cc42596880ca82de4cd56d24323275122 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 18 Sep 2024 23:09:32 +0000 Subject: [PATCH 08/37] Minor improvements --- cpp/examples/parquet_io/common.hpp | 8 +++- cpp/examples/parquet_io/parquet_io.cpp | 14 +++--- .../parquet_io/parquet_io_multithreaded.cpp | 44 ++++++++++++------- 3 files changed, 39 insertions(+), 27 deletions(-) diff --git a/cpp/examples/parquet_io/common.hpp b/cpp/examples/parquet_io/common.hpp index f4e5757412a..25f81022d07 100644 --- a/cpp/examples/parquet_io/common.hpp +++ b/cpp/examples/parquet_io/common.hpp @@ -31,6 +31,8 @@ #include #include +#include + #include #include #include @@ -146,9 +148,11 @@ inline void check_identical_tables(cudf::table_view const& lhs_table, // No exception thrown, check indices auto const valid = indices->size() == 0; - std::cout << "Transcoding valid: " << std::boolalpha << valid << std::endl; + fmt::print( + fmt::emphasis::bold | fg(fmt::color::green_yellow), "Transcoding valid: {}\n", valid); } catch (std::exception& e) { std::cerr << e.what() << std::endl << std::endl; - throw std::runtime_error("Transcoding valid: false\n"); + throw std::runtime_error( + fmt::format(fmt::emphasis::bold | fg(fmt::color::red), "Transcoding valid: false\n")); } } diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index c981928e8f2..06505016ab9 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -126,18 +126,16 @@ int main(int argc, char const** argv) // Read input parquet file // We do not want to time the initial read time as it may include // time for nvcomp, cufile loading and RMM growth - 
std::cout << std::endl << "Reading " << input_filepath << "..." << std::endl; - std::cout << "Note: Not timing the initial parquet read as it may include\n" - "times for nvcomp, cufile loading and RMM growth." - << std::endl - << std::endl; + fmt::print("\nReading {}...", input_filepath); + fmt::print( + "Note: Not timing the initial parquet read as it may include\n" + "times for nvcomp, cufile loading and RMM growth.\n\n"); auto [input, metadata] = read_parquet(input_filepath); // Status string to indicate if page stats are set to be written or not auto page_stat_string = (page_stats.has_value()) ? "page stats" : "no page stats"; // Write parquet file with the specified encoding and compression - std::cout << "Writing " << output_filepath << " with encoding, compression and " - << page_stat_string << ".." << std::endl; + fmt::print("Writing {} with encoding, compression and {}..\n", output_filepath, page_stat_string); // `timer` is automatically started here cudf::examples::timer timer; @@ -145,7 +143,7 @@ int main(int argc, char const** argv) timer.print_elapsed_millis(); // Read the parquet file written with encoding and compression - std::cout << "Reading " << output_filepath << "..." 
<< std::endl; + fmt::print("Reading {}...\n", output_filepath); // Reset the timer timer.reset(); diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 5c9be0892cb..361683c0e9e 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -73,7 +73,7 @@ struct read_fn { struct write_fn { std::string const& output_path; - std::vector const& tables; + std::vector const& table_views; cudf::io::column_encoding const encoding; cudf::io::compression_type const compression; std::optional const stats_level; @@ -85,10 +85,10 @@ struct write_fn { // write the data for inspection auto sink_info = cudf::io::sink_info(output_path + "/table_" + std::to_string(thread_id) + ".parquet"); - auto builder = cudf::io::parquet_writer_options::builder(sink_info, tables[thread_id]->view()) + auto builder = cudf::io::parquet_writer_options::builder(sink_info, table_views[thread_id]) .compression(compression) .stats_level(stats_level.value_or(cudf::io::statistics_freq::STATISTICS_NONE)); - auto table_metadata = cudf::io::table_input_metadata{tables[thread_id]->view()}; + auto table_metadata = cudf::io::table_input_metadata{table_views[thread_id]}; std::for_each(table_metadata.column_metadata.begin(), table_metadata.column_metadata.end(), @@ -246,8 +246,8 @@ int main(int argc, char const** argv) return tables; }; - // Lambda function to setup and launch multithread parquet write - auto const write_parquet_multithreaded = [&](std::vector const& tables) { + // Lambda function to setup and launch multithreaded parquet writes + auto const write_parquet_multithreaded = [&](std::vector const& tables) { // Tasks to read each parquet file std::vector write_tasks; write_tasks.reserve(thread_count); @@ -271,23 +271,33 @@ int main(int argc, char const** argv) // Read the parquet files with multiple threads { - std::cout << "Note: Not timing the initial parquet read 
as it may include\n" - "times for nvcomp, cufile loading and RMM growth." - << std::endl - << std::endl; + fmt::print( + "Note: Not timing the initial parquet read as it may include\n" + "times for nvcomp, cufile loading and RMM growth.\n\n"); // Tables read by each thread auto const tables = read_parquet_multithreaded(input_files); // In case some kernels are still running on the default stre default_stream.synchronize(); - // Write parquet file with the specified encoding and compression + // Construct a vector of table views for write_parquet_multithreaded + auto const table_views = [&tables]() { + std::vector table_views; + table_views.reserve(tables.size()); + + std::transform( + tables.cbegin(), tables.cend(), std::back_inserter(table_views), [](auto const& tbl) { + return tbl->view(); + }); + return table_views; + }(); + + // Write tables to parquet with the specified encoding and compression auto const page_stat_string = (page_stats.has_value()) ? "page stats" : "no page stats"; - std::cout << "Writing at: " << output_path << " with encoding, compression and " - << page_stat_string << ".." << std::endl; + fmt::print( + "Writing at: {} with encoding, compression and {}..\n", output_path, page_stat_string); - // Write tables using multiple threads cudf::examples::timer timer; - write_parquet_multithreaded(tables); + write_parquet_multithreaded(table_views); // In case some kernels are still running on the default stream default_stream.synchronize(); // Print elapsed time @@ -296,7 +306,7 @@ int main(int argc, char const** argv) // Re-read the same parquet files with multiple threads { - std::cout << "Reading for the second time using " << thread_count << " threads..." 
<< std::endl; + fmt::print("Reading for the second time using {} threads...\n", thread_count); cudf::examples::timer timer; auto const input_table = concatenate_tables(read_parquet_multithreaded(input_files), default_stream); @@ -305,7 +315,7 @@ int main(int argc, char const** argv) // Print elapsed time and peak memory timer.print_elapsed_millis(); - std::cout << "Reading transcoded files using " << thread_count << " threads..." << std::endl; + fmt::print("Reading transcoded files using {} threads...\n", thread_count); timer.reset(); auto const transcoded_table = concatenate_tables( read_parquet_multithreaded(extract_input_files(output_path)), default_stream); @@ -314,7 +324,7 @@ int main(int argc, char const** argv) // Print elapsed time and peak memory timer.print_elapsed_millis(); - std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n\n"; + fmt::print("Peak memory: {} MB\n\n", (stats_mr.get_bytes_counter().peak / 1048576.0)); // Check for validity check_identical_tables(input_table->view(), transcoded_table->view()); From b8b8bb954ab3d529ffab2e31fbfe2a58b1194965 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 19 Sep 2024 00:36:59 +0000 Subject: [PATCH 09/37] Move the vector to concatenate tables --- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 361683c0e9e..30fc4ec9354 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -309,7 +309,7 @@ int main(int argc, char const** argv) fmt::print("Reading for the second time using {} threads...\n", thread_count); cudf::examples::timer timer; auto const input_table = - concatenate_tables(read_parquet_multithreaded(input_files), default_stream); + concatenate_tables(std::move(read_parquet_multithreaded(input_files)), 
default_stream); // In case some kernels are still running on the default stream default_stream.synchronize(); // Print elapsed time and peak memory @@ -318,7 +318,7 @@ int main(int argc, char const** argv) fmt::print("Reading transcoded files using {} threads...\n", thread_count); timer.reset(); auto const transcoded_table = concatenate_tables( - read_parquet_multithreaded(extract_input_files(output_path)), default_stream); + std::move(read_parquet_multithreaded(extract_input_files(output_path))), default_stream); // In case some kernels are still running on the default stream default_stream.synchronize(); // Print elapsed time and peak memory From 188ce11900072121145ce1ba554ec15be9eedd82 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Mon, 23 Sep 2024 18:09:25 +0000 Subject: [PATCH 10/37] Minor improvement --- .../parquet_io/parquet_io_multithreaded.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 30fc4ec9354..0a05e22f8d7 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -180,15 +180,16 @@ int main(int argc, char const** argv) return parquet_files; }; - // Concatenate a vector of tables and return + // Lambda to concatenate a vector of tables auto const concatenate_tables = [](std::vector tables, rmm::cuda_stream_view stream) { + std::vector table_views; + table_views.reserve(tables.size()); + std::transform( + tables.begin(), tables.end(), std::back_inserter(table_views), [&](auto const& tbl) { + return tbl->view(); + }); // Construct the final table - auto table = std::move(tables[0]); - std::for_each(tables.begin() + 1, tables.end(), [&](auto& tbl) { - std::vector const table_views{table->view(), tbl->view()}; - table = cudf::concatenate(table_views, stream); - }); - return table; + return cudf::concatenate(table_views, stream); 
}; // make input files from the input_paths string. From 990f2bbacdfbb96a283000b7612587271244464d Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 27 Sep 2024 00:30:05 +0000 Subject: [PATCH 11/37] Make multithreaded parquet io example more sophisticated --- cpp/examples/parquet_io/common.hpp | 111 ++++- cpp/examples/parquet_io/parquet_io.cpp | 4 +- .../parquet_io/parquet_io_multithreaded.cpp | 440 ++++++++++-------- 3 files changed, 334 insertions(+), 221 deletions(-) diff --git a/cpp/examples/parquet_io/common.hpp b/cpp/examples/parquet_io/common.hpp index 25f81022d07..eaff77708e6 100644 --- a/cpp/examples/parquet_io/common.hpp +++ b/cpp/examples/parquet_io/common.hpp @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -64,7 +65,7 @@ std::shared_ptr create_memory_resource(bool is_ { using encoding_type = cudf::io::column_encoding; - static const std::unordered_map map = { + static const std::unordered_map map = { {"DEFAULT", encoding_type::USE_DEFAULT}, {"DICTIONARY", encoding_type::DICTIONARY}, {"PLAIN", encoding_type::PLAIN}, @@ -79,9 +80,7 @@ std::shared_ptr create_memory_resource(bool is_ " is not a valid encoding type.\n\n" "Available encoding types: DEFAULT, DICTIONARY, PLAIN,\n" "DELTA_BINARY_PACKED, DELTA_LENGTH_BYTE_ARRAY,\n" - "DELTA_BYTE_ARRAY\n" - "\n" - "Exiting...\n"); + "DELTA_BYTE_ARRAY\n\n"); } /** @@ -94,7 +93,7 @@ std::shared_ptr create_memory_resource(bool is_ { using compression_type = cudf::io::compression_type; - static const std::unordered_map map = { + static const std::unordered_map map = { {"NONE", compression_type::NONE}, {"AUTO", compression_type::AUTO}, {"SNAPPY", compression_type::SNAPPY}, @@ -106,29 +105,26 @@ std::shared_ptr create_memory_resource(bool is_ throw std::invalid_argument("FATAL: " + std::string(name) + " is not a valid compression type.\n\n" "Available compression_type types: NONE, AUTO, SNAPPY,\n" - "LZ4, ZSTD\n" - "\n" - "Exiting...\n"); 
+ "LZ4, ZSTD\n\n"); } /** - * @brief Get the optional page size stat frequency from they keyword + * @brief Get boolean from they keyword * - * @param use_stats keyword affirmation string such as: Y, T, YES, TRUE, ON - * @return optional page statistics frequency set to full (STATISTICS_COLUMN) + * @param input keyword affirmation string such as: Y, T, YES, TRUE, ON + * @return true or false */ -[[nodiscard]] std::optional get_page_size_stats(std::string use_stats) +[[nodiscard]] bool get_boolean(std::string input) { - std::transform(use_stats.begin(), use_stats.end(), use_stats.begin(), ::toupper); + std::transform(input.begin(), input.end(), input.begin(), ::toupper); // Check if the input string matches to any of the following - if (not use_stats.compare("ON") or not use_stats.compare("TRUE") or - not use_stats.compare("YES") or not use_stats.compare("Y") or not use_stats.compare("T")) { - // Full column and offset indices - STATISTICS_COLUMN - return std::make_optional(cudf::io::statistics_freq::STATISTICS_COLUMN); + if (not input.compare("ON") or not input.compare("TRUE") or not input.compare("YES") or + not input.compare("Y") or not input.compare("T")) { + return true; + } else { + return false; } - - return std::nullopt; } /** @@ -149,10 +145,83 @@ inline void check_identical_tables(cudf::table_view const& lhs_table, // No exception thrown, check indices auto const valid = indices->size() == 0; fmt::print( - fmt::emphasis::bold | fg(fmt::color::green_yellow), "Transcoding valid: {}\n", valid); + fmt::emphasis::bold | fg(fmt::color::green_yellow), "Transcoding valid: {}\n\n", valid); } catch (std::exception& e) { std::cerr << e.what() << std::endl << std::endl; throw std::runtime_error( - fmt::format(fmt::emphasis::bold | fg(fmt::color::red), "Transcoding valid: false\n")); + fmt::format(fmt::emphasis::bold | fg(fmt::color::red), "Transcoding valid: false\n\n")); + } +} + +/** + * @brief Get io sink type from the string keyword argumnet + * + * @param name io 
sink type keyword name + * @return corresponding io sink type type + */ +[[nodiscard]] std::optional get_io_sink_type(std::string name) +{ + using io_type = cudf::io::io_type; + + static const std::unordered_map map = { + {"FILEPATH", io_type::FILEPATH}, + {"HOST_BUFFER", io_type::HOST_BUFFER}, + {"PINNED_BUFFER", io_type::HOST_BUFFER}, + {"DEVICE_BUFFER", io_type::DEVICE_BUFFER}}; + + std::transform(name.begin(), name.end(), name.begin(), ::toupper); + if (map.find(name) != map.end()) { + return {map.at(name)}; + } else { + fmt::print( + "{} is not a valid io sink type. Available: FILEPATH,\n" + "HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER. Ignoring\n\n", + name); + return std::nullopt; + } +} + +/** + * @brief Concatenate a vector of tables and return the resultant table + * + * @param tables Vector of tables to concatenate + * @param stream CUDA stream to use + * + * @return Unique pointer to the resultant concatenated table. + */ +std::unique_ptr concatenate_tables(std::vector> tables, + rmm::cuda_stream_view stream) +{ + if (tables.size() == 1) { return std::move(tables[0]); } + + std::vector table_views; + table_views.reserve(tables.size()); + std::transform( + tables.begin(), tables.end(), std::back_inserter(table_views), [&](auto const& tbl) { + return tbl->view(); + }); + // Construct the final table + return cudf::concatenate(table_views, stream); +} + +/** + * @brief Thread unsafe function to create a directory for FILEPATH io sink type and return its path + * + * @return File path of the created directory + */ +[[nodiscard]] std::string get_default_output_path() +{ + static std::string output_path = std::filesystem::current_path().string(); + if (output_path == std::filesystem::current_path().string()) { + // Check if output path is a valid directory + if (std::filesystem::is_directory({output_path})) { + // Create a new directory in output path if not empty. 
+ if (not std::filesystem::is_empty({output_path})) { + output_path += + "/output_" + fmt::format("{:%Y-%m-%d-%H-%M-%S}", std::chrono::system_clock::now()); + std::filesystem::create_directory({output_path}); + } + } } + return output_path; } diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 5ea41a8fc67..a4ee550b0e4 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -108,7 +108,9 @@ int main(int argc, char const** argv) encoding = get_encoding_type("DELTA_BINARY_PACKED"); compression = get_compression_type("ZSTD"); break; - case 6: page_stats = get_page_size_stats(argv[5]); [[fallthrough]]; + case 6: + if (get_boolean(argv[5])) { page_stats = cudf::io::statistics_freq::STATISTICS_COLUMN; }; + [[fallthrough]]; case 5: input_filepath = argv[1]; output_filepath = argv[2]; diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 0a05e22f8d7..419cef23d33 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -21,24 +21,34 @@ #include -#include - /** * @file parquet_io_multithreaded.cpp - * @brief Demonstrates usage of the libcudf APIs to read and write - * parquet file format with different encodings and compression types - * using multiple threads. + * @brief Demonstrates multithreaded read of parquet files and optionally + * multithreaded writing the read tables to the specified io sink source type. * - * The following encoding and compression ztypes are demonstrated: - * Encoding Types: DEFAULT, DICTIONARY, PLAIN, DELTA_BINARY_PACKED, - * DELTA_LENGTH_BYTE_ARRAY, DELTA_BYTE_ARRAY + * Run: ``parquet_io_multithreaded -h`` to see help with input args and more. 
* - * Compression Types: NONE, AUTO, SNAPPY, LZ4, ZSTD + * The following io sink types are supported: + * IO sink types: FILEPATH, HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER * */ +// Type alias for unique ptr to cudf table using table_t = std::unique_ptr; +/** + * @brief Behavior when handling the read tables by multiple threads + */ +enum class read_mode { + NOWORK, ///< Only read and discard tables + CONCATENATE_THREAD, ///< Read and concatenate tables from each thread + CONCATENATE_ALL, ///< Read and concatenate everything to a single table +}; + +/** + * @brief Functor for multithreaded parquet reading based on the provided read_mode + */ +template struct read_fn { std::vector const& input_files; std::vector& tables; @@ -48,52 +58,99 @@ struct read_fn { void operator()() { + // Tables read by this thread std::vector tables_this_thread; + + // Sweep the available input files for (auto curr_file_idx = thread_id; curr_file_idx < input_files.size(); curr_file_idx += thread_count) { auto const source_info = cudf::io::source_info(input_files[curr_file_idx]); auto builder = cudf::io::parquet_reader_options::builder(source_info); auto const options = builder.build(); - tables_this_thread.push_back(cudf::io::read_parquet(options, stream).tbl); + if constexpr (READ_FN != read_mode::NOWORK) { + tables_this_thread.push_back(cudf::io::read_parquet(options, stream).tbl); + } else { + cudf::io::read_parquet(options, stream); + } } - // Concatenate all tables read by this thread. - auto table = std::move(tables_this_thread[0]); - std::for_each(tables_this_thread.begin() + 1, tables_this_thread.end(), [&](auto& tbl) { - std::vector const table_views{table->view(), tbl->view()}; - table = cudf::concatenate(table_views, stream); + // Concatenate the tables read by this thread if not NOWORK read_mode. 
+ if constexpr (READ_FN != read_mode::NOWORK) { + auto table = concatenate_tables(std::move(tables_this_thread), stream); + stream.synchronize_no_throw(); + tables[thread_id] = std::move(table); + } else { + // Just synchronize this stream and exit + stream.synchronize_no_throw(); + } + } +}; + +/** + * @brief Function to setup and launch multithreaded parquet reading. + */ +template +std::vector read_parquet_multithreaded(std::vector const& files, + int32_t thread_count, + rmm::cuda_stream_pool& stream_pool) +{ + // Tables read by each thread + std::vector tables(thread_count); + + // Table reading tasks + std::vector> read_tasks; + read_tasks.reserve(thread_count); + + // Create the read tasks + std::for_each( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(thread_count), [&](auto tid) { + read_tasks.emplace_back( + read_fn{files, tables, tid, thread_count, stream_pool.get_stream()}); }); - // Done with this stream - stream.synchronize_no_throw(); + // Create threads with tasks + std::vector threads; + threads.reserve(thread_count); + for (auto& c : read_tasks) { + threads.emplace_back(std::thread{c}); + } + for (auto& t : threads) { + t.join(); + } - tables[thread_id] = std::move(table); + // If CONCATENATE_ALL mode, then concatenate to a vector of one final table. 
+ if (read_mode == read_mode::CONCATENATE_ALL) { + auto stream = stream_pool.get_stream(); + auto final_tbl = concatenate_tables(std::move(tables), stream); + stream.synchronize(); + tables.clear(); + tables.emplace_back(std::move(final_tbl)); } -}; + return tables; +} + +/** + * @brief Functor for multithreaded parquet writing + */ struct write_fn { - std::string const& output_path; + cudf::io::io_type io_sink_type; std::vector const& table_views; - cudf::io::column_encoding const encoding; - cudf::io::compression_type const compression; - std::optional const stats_level; int const thread_id; rmm::cuda_stream_view stream; void operator()() { - // write the data for inspection - auto sink_info = - cudf::io::sink_info(output_path + "/table_" + std::to_string(thread_id) + ".parquet"); - auto builder = cudf::io::parquet_writer_options::builder(sink_info, table_views[thread_id]) - .compression(compression) - .stats_level(stats_level.value_or(cudf::io::statistics_freq::STATISTICS_NONE)); + // Create a sink + auto const sink_info = [io_sink_type = io_sink_type, thread_id = thread_id]() { + return cudf::io::sink_info(get_default_output_path() + "/table_" + std::to_string(thread_id) + + ".parquet"); + }(); + // Writer options builder + auto builder = cudf::io::parquet_writer_options::builder(sink_info, table_views[thread_id]); + // Create a new metadata for the table auto table_metadata = cudf::io::table_input_metadata{table_views[thread_id]}; - std::for_each(table_metadata.column_metadata.begin(), - table_metadata.column_metadata.end(), - [=](auto& col_meta) { col_meta.set_encoding(encoding); }); - builder.metadata(table_metadata); auto options = builder.build(); @@ -105,43 +162,53 @@ struct write_fn { } }; -int main(int argc, char const** argv) +/** + * @brief The main function + */ +int32_t main(int argc, char const** argv) { - std::string input_paths; - std::string output_path; - cudf::io::column_encoding encoding; - cudf::io::compression_type compression; - 
std::optional page_stats; - int thread_count; + // Set arguments to defaults + std::string input_paths = "example.parquet"; + int32_t input_multiplier = 1; + int32_t thread_count = 2; + std::optional io_type = std::nullopt; + bool validate_output = false; + + // Function to print example usage + auto const print_usage = [] { + fmt::print(fg(fmt::color::yellow), + "\nUsage: parquet_io_multithreaded \n" + " \n" + " \n\n"); + fmt::print( + fg(fmt::color::light_sky_blue), + "Note: Provide as many arguments as you like in the above order. Default values\n" + " for the unprovided arguments will be used. No output parquet will be written\n" + " if isn't provided.\n\n"); + }; + // Set to the provided args switch (argc) { - case 1: - input_paths = "example.parquet"; - output_path = std::filesystem::current_path().string(); - encoding = get_encoding_type("DELTA_BINARY_PACKED"); - compression = get_compression_type("ZSTD"); - thread_count = 2; - break; - case 7: page_stats = get_page_size_stats(argv[6]); [[fallthrough]]; - case 6: - input_paths = std::string{argv[1]}; - output_path = std::string{argv[2]}; - encoding = get_encoding_type(argv[3]); - compression = get_compression_type(argv[4]); - thread_count = std::max(thread_count, std::stoi(std::string{argv[5]})); - break; - default: - throw std::runtime_error( - "Either provide all command-line arguments, or none to use defaults\n" - "Use: parquet_io_multithreaded " - " " - "\n"); + case 6: validate_output = get_boolean(argv[5]); [[fallthrough]]; + case 5: io_type = get_io_sink_type(argv[4]); [[fallthrough]]; + case 4: thread_count = std::max(thread_count, std::stoi(std::string{argv[3]})); [[fallthrough]]; + case 3: + input_multiplier = std::max(input_multiplier, std::stoi(std::string{argv[2]})); + [[fallthrough]]; + case 2: + if (auto arg = std::string{argv[1]}; arg == "-h" or arg == "--help") { + print_usage(); + return 0; + } else + input_paths = std::string{argv[1]}; + [[fallthrough]]; + case 1: break; + default: 
print_usage(); throw std::runtime_error(""); } - // Process and extract all input files - auto const extract_input_files = [thread_count = thread_count](std::string const& paths) { - std::vector parquet_files; - std::vector delimited_paths = [&]() { + // Lambda function to process and extract all input files + auto const extract_input_files = [thread_count, input_multiplier](std::string const& paths) { + std::vector const delimited_paths = [&]() { std::vector paths_list; std::stringstream stream{paths}; std::string path; @@ -152,114 +219,60 @@ int main(int argc, char const** argv) return paths_list; }(); - std::for_each(delimited_paths.cbegin(), delimited_paths.cend(), [&](auto const& path_string) { - std::filesystem::path path{path_string}; - // If this is a parquet file, add it. - if (std::filesystem::is_regular_file(path)) { - parquet_files.push_back(path_string); - } - // If this is a directory, add all files at this path - else if (std::filesystem::is_directory(path)) { - for (auto const& file : std::filesystem::directory_iterator(path)) { - if (std::filesystem::is_regular_file(file.path())) { - parquet_files.push_back(file.path().string()); - } - } - } else { - throw std::runtime_error("Encountered an invalid input path\n"); - } - }); + // The final list of parquet files to be read. + std::vector parquet_files; + parquet_files.reserve( + std::max(thread_count, input_multiplier * delimited_paths.size())); + // Append the input files by input_multiplier times + std::for_each( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input_multiplier), + [&](auto i) { + std::for_each( + delimited_paths.cbegin(), delimited_paths.cend(), [&](auto const& path_string) { + std::filesystem::path path{path_string}; + // If this is a parquet file, add it. 
+ if (std::filesystem::is_regular_file(path)) { + parquet_files.emplace_back(path_string); + } + // If this is a directory, add all files at this path + else if (std::filesystem::is_directory(path)) { + for (auto const& file : std::filesystem::directory_iterator(path)) { + if (std::filesystem::is_regular_file(file.path())) { + parquet_files.emplace_back(file.path().string()); + } + } + } else { + throw std::runtime_error("Encountered an invalid input path\n"); + } + }); + }); - // Add parquet files from existing ones if less than thread_count + // Cycle append parquet files from the existing ones if less than the thread_count for (size_t idx = 0, initial_size = parquet_files.size(); thread_count > static_cast(parquet_files.size()); idx++) { - parquet_files.push_back(parquet_files[idx % initial_size]); + parquet_files.emplace_back(parquet_files[idx % initial_size]); } return parquet_files; }; - // Lambda to concatenate a vector of tables - auto const concatenate_tables = [](std::vector tables, rmm::cuda_stream_view stream) { - std::vector table_views; - table_views.reserve(tables.size()); - std::transform( - tables.begin(), tables.end(), std::back_inserter(table_views), [&](auto const& tbl) { - return tbl->view(); - }); - // Construct the final table - return cudf::concatenate(table_views, stream); - }; - - // make input files from the input_paths string. - auto const input_files = extract_input_files(input_paths); - - // Exit early if nothing to do. - if (not input_files.size()) { - std::cerr << "No input files to read. Exiting early.\n"; - return 0; - } - - // Check if output path is a valid - if (std::filesystem::is_directory({output_path})) { - // Create a new directory in output path if not empty. 
- if (not std::filesystem::is_empty({output_path})) { - output_path += - "/output_" + fmt::format("{:%Y-%m-%d-%H-%M-%S}", std::chrono::system_clock::now()); - std::filesystem::create_directory({output_path}); - } - } else { - throw std::runtime_error("The provided output path is not a directory\n"); - } - - auto const is_pool_used = true; - auto resource = create_memory_resource(is_pool_used); - auto default_stream = cudf::get_default_stream(); - auto stream_pool = rmm::cuda_stream_pool(thread_count); - auto stats_mr = - rmm::mr::statistics_resource_adaptor(resource.get()); - rmm::mr::set_current_device_resource(&stats_mr); - - // Lambda function to setup and launch multithread parquet read - auto const read_parquet_multithreaded = [&](std::vector const& files) { - // Tables read by each thread - std::vector tables(thread_count); - - // Tasks to read each parquet file - std::vector read_tasks; - read_tasks.reserve(thread_count); - std::for_each(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(thread_count), - [&](auto tid) { - read_tasks.emplace_back( - read_fn{files, tables, tid, thread_count, stream_pool.get_stream()}); - }); - - std::vector threads; - threads.reserve(thread_count); - for (auto& c : read_tasks) { - threads.emplace_back(std::thread{c}); - } - for (auto& t : threads) { - t.join(); - } - return tables; - }; - // Lambda function to setup and launch multithreaded parquet writes - auto const write_parquet_multithreaded = [&](std::vector const& tables) { - // Tasks to read each parquet file + auto const write_parquet_multithreaded = [&](std::vector const& tables, + int32_t thread_count, + rmm::cuda_stream_pool& stream_pool) { + // Table writing tasks std::vector write_tasks; write_tasks.reserve(thread_count); std::for_each( thrust::make_counting_iterator(0), thrust::make_counting_iterator(thread_count), [&](auto tid) { - write_tasks.emplace_back(write_fn{ - output_path, tables, encoding, compression, page_stats, tid, 
stream_pool.get_stream()}); + write_tasks.emplace_back(write_fn{io_type.value(), tables, tid, stream_pool.get_stream()}); }); + // Writer threads std::vector threads; threads.reserve(thread_count); for (auto& c : write_tasks) { @@ -270,66 +283,95 @@ int main(int argc, char const** argv) } }; + // Make a list of input files from the input_paths string. + auto const input_files = extract_input_files(input_paths); + auto const is_pool_used = true; + auto resource = create_memory_resource(is_pool_used); + auto default_stream = cudf::get_default_stream(); + auto stream_pool = rmm::cuda_stream_pool(thread_count); + auto stats_mr = + rmm::mr::statistics_resource_adaptor(resource.get()); + rmm::mr::set_current_device_resource(&stats_mr); + + // Exit early if nothing to do. + if (not input_files.size()) { + std::cerr << "No input files to read. Exiting early.\n"; + return 0; + } + // Read the parquet files with multiple threads { - fmt::print( - "Note: Not timing the initial parquet read as it may include\n" - "times for nvcomp, cufile loading and RMM growth.\n\n"); - // Tables read by each thread - auto const tables = read_parquet_multithreaded(input_files); - // In case some kernels are still running on the default stre + fmt::print(fg(fmt::color::yellow), + "\nNote: Not timing the initial parquet read as it may include\n" + "times for nvcomp, cufile loading and RMM growth.\n\n"); + // Tasks to read each parquet file + auto const tables = read_parquet_multithreaded( + input_files, thread_count, stream_pool); default_stream.synchronize(); - // Construct a vector of table views for write_parquet_multithreaded - auto const table_views = [&tables]() { - std::vector table_views; - table_views.reserve(tables.size()); - - std::transform( - tables.cbegin(), tables.cend(), std::back_inserter(table_views), [](auto const& tbl) { - return tbl->view(); - }); - return table_views; - }(); + if (io_type.has_value()) { + // Initialize the default output path to avoid race condition with 
multiple writer threads. + std::ignore = get_default_output_path(); + + // Construct a vector of table views for write_parquet_multithreaded + auto const table_views = [&tables]() { + std::vector table_views; + table_views.reserve(tables.size()); + + std::transform( + tables.cbegin(), tables.cend(), std::back_inserter(table_views), [](auto const& tbl) { + return tbl->view(); + }); + return table_views; + }(); + + // Write tables to parquet + fmt::print("Writing parquet output to sink type: {}\n", std::string{argv[4]}); + cudf::examples::timer timer; + write_parquet_multithreaded(table_views, thread_count, stream_pool); + default_stream.synchronize(); + timer.print_elapsed_millis(); + } + } - // Write tables to parquet with the specified encoding and compression - auto const page_stat_string = (page_stats.has_value()) ? "page stats" : "no page stats"; + // Re-read the same parquet files with multiple threads and discard the read tables + { fmt::print( - "Writing at: {} with encoding, compression and {}..\n", output_path, page_stat_string); - + "Reading {} input files for the second time using {} threads and discarding output " + "tables...\n", + input_files.size(), + thread_count); cudf::examples::timer timer; - write_parquet_multithreaded(table_views); - // In case some kernels are still running on the default stream + // Read parquet files and discard the tables + std::ignore = + read_parquet_multithreaded(input_files, thread_count, stream_pool); default_stream.synchronize(); - // Print elapsed time timer.print_elapsed_millis(); } - // Re-read the same parquet files with multiple threads - { - fmt::print("Reading for the second time using {} threads...\n", thread_count); - cudf::examples::timer timer; - auto const input_table = - concatenate_tables(std::move(read_parquet_multithreaded(input_files)), default_stream); - // In case some kernels are still running on the default stream - default_stream.synchronize(); - // Print elapsed time and peak memory - 
timer.print_elapsed_millis(); + // Verify the output files if requested + if (validate_output and io_type.has_value()) { + fmt::print("Verifying transcoding...\n"); - fmt::print("Reading transcoded files using {} threads...\n", thread_count); - timer.reset(); - auto const transcoded_table = concatenate_tables( - std::move(read_parquet_multithreaded(extract_input_files(output_path))), default_stream); - // In case some kernels are still running on the default stream - default_stream.synchronize(); - // Print elapsed time and peak memory - timer.print_elapsed_millis(); + // CONCATENATE_ALL returns a vector of 1 table + auto const input_table = std::move( + read_parquet_multithreaded(input_files, thread_count, stream_pool) + .back()); - fmt::print("Peak memory: {} MB\n\n", (stats_mr.get_bytes_counter().peak / 1048576.0)); + auto const transcoded_table = + std::move(read_parquet_multithreaded( + extract_input_files(get_default_output_path()), thread_count, stream_pool) + .back()); + default_stream.synchronize(); // Check for validity check_identical_tables(input_table->view(), transcoded_table->view()); } + // Print peak memory + fmt::print(fmt::emphasis::bold | fg(fmt::color::medium_purple), + "Peak memory: {} MB\n\n", + (stats_mr.get_bytes_counter().peak / 1048576.0)); + return 0; -} +} \ No newline at end of file From 06817d05b644464582e173decefb67d2bdd0eba6 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 27 Sep 2024 01:27:47 +0000 Subject: [PATCH 12/37] Minor updates --- cpp/examples/parquet_io/common.hpp | 2 +- .../parquet_io/parquet_io_multithreaded.cpp | 40 ++++++++++++++----- 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/cpp/examples/parquet_io/common.hpp b/cpp/examples/parquet_io/common.hpp index eaff77708e6..c4cbd6a589a 100644 --- a/cpp/examples/parquet_io/common.hpp +++ b/cpp/examples/parquet_io/common.hpp @@ -154,7 +154,7 @@ inline void check_identical_tables(cudf::table_view const& 
lhs_table, } /** - * @brief Get io sink type from the string keyword argumnet + * @brief Get io sink type from the string keyword argument * * @param name io sink type keyword name * @return corresponding io sink type type diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 419cef23d33..02e4c772e5b 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -88,6 +88,15 @@ struct read_fn { /** * @brief Function to setup and launch multithreaded parquet reading. + * + * @tparam read_mode Specifies if to concatenate and return the actual + * tables or discard them and return an empty vector + * + * @param files List of files to read + * @param thread_count Number of threads + * @param stream_pool CUDA stream pool to use for threads + * + * @return Vector of read tables. */ template std::vector read_parquet_multithreaded(std::vector const& files, @@ -302,14 +311,17 @@ int32_t main(int argc, char const** argv) // Read the parquet files with multiple threads { fmt::print(fg(fmt::color::yellow), - "\nNote: Not timing the initial parquet read as it may include\n" - "times for nvcomp, cufile loading and RMM growth.\n\n"); - // Tasks to read each parquet file - auto const tables = read_parquet_multithreaded( - input_files, thread_count, stream_pool); - default_stream.synchronize(); + "\nReading {} input files using {} threads without timing it as \n" + "it may include times for nvcomp, cufile loading and RMM growth.\n\n", + input_files.size(), + thread_count); + // If we are writing output then read with CONCATENATE_THREAD if (io_type.has_value()) { + // Launch + auto const tables = read_parquet_multithreaded( + input_files, thread_count, stream_pool); + default_stream.synchronize(); // Initialize the default output path to avoid race condition with multiple writer threads. 
std::ignore = get_default_output_path(); @@ -326,19 +338,25 @@ int32_t main(int argc, char const** argv) }(); // Write tables to parquet - fmt::print("Writing parquet output to sink type: {}\n", std::string{argv[4]}); + fmt::print("Writing parquet output to sink type: {}..\n", std::string{argv[4]}); cudf::examples::timer timer; write_parquet_multithreaded(table_views, thread_count, stream_pool); default_stream.synchronize(); timer.print_elapsed_millis(); } + // Else simply read with NOWORK mode + else { + std::ignore = + read_parquet_multithreaded(input_files, thread_count, stream_pool); + default_stream.synchronize(); + } } // Re-read the same parquet files with multiple threads and discard the read tables { fmt::print( - "Reading {} input files for the second time using {} threads and discarding output " - "tables...\n", + "Re-reading {} input files using {} threads and discarding output " + "tables..\n", input_files.size(), thread_count); cudf::examples::timer timer; @@ -351,7 +369,7 @@ int32_t main(int argc, char const** argv) // Verify the output files if requested if (validate_output and io_type.has_value()) { - fmt::print("Verifying transcoding...\n"); + fmt::print("Verifying output..\n"); // CONCATENATE_ALL returns a vector of 1 table auto const input_table = std::move( @@ -374,4 +392,4 @@ int32_t main(int argc, char const** argv) (stats_mr.get_bytes_counter().peak / 1048576.0)); return 0; -} \ No newline at end of file +} From af8ec6a9476654de9c7aece60cee2e9cb304ddc1 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 27 Sep 2024 19:13:39 +0000 Subject: [PATCH 13/37] Minor improvements --- cpp/examples/parquet_io/common.hpp | 4 +- .../parquet_io/parquet_io_multithreaded.cpp | 134 +++++++++--------- 2 files changed, 66 insertions(+), 72 deletions(-) diff --git a/cpp/examples/parquet_io/common.hpp b/cpp/examples/parquet_io/common.hpp index c4cbd6a589a..37eb138640a 100644 --- 
a/cpp/examples/parquet_io/common.hpp +++ b/cpp/examples/parquet_io/common.hpp @@ -145,11 +145,11 @@ inline void check_identical_tables(cudf::table_view const& lhs_table, // No exception thrown, check indices auto const valid = indices->size() == 0; fmt::print( - fmt::emphasis::bold | fg(fmt::color::green_yellow), "Transcoding valid: {}\n\n", valid); + fmt::emphasis::bold | fg(fmt::color::green_yellow), "Tables identical: {}\n\n", valid); } catch (std::exception& e) { std::cerr << e.what() << std::endl << std::endl; throw std::runtime_error( - fmt::format(fmt::emphasis::bold | fg(fmt::color::red), "Transcoding valid: false\n\n")); + fmt::format(fmt::emphasis::bold | fg(fmt::color::red), "Tables identical: false\n\n")); } } diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 02e4c772e5b..7728c91cbb7 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -179,16 +179,19 @@ int32_t main(int argc, char const** argv) // Set arguments to defaults std::string input_paths = "example.parquet"; int32_t input_multiplier = 1; + int32_t num_reads = 1; int32_t thread_count = 2; std::optional io_type = std::nullopt; bool validate_output = false; // Function to print example usage auto const print_usage = [] { - fmt::print(fg(fmt::color::yellow), - "\nUsage: parquet_io_multithreaded \n" - " \n" - " \n\n"); + fmt::print( + fg(fmt::color::yellow), + "\nUsage: parquet_io_multithreaded \n" + " \n" + " \n\n"); fmt::print( fg(fmt::color::light_sky_blue), "Note: Provide as many arguments as you like in the above order. 
Default values\n" @@ -198,9 +201,10 @@ int32_t main(int argc, char const** argv) // Set to the provided args switch (argc) { - case 6: validate_output = get_boolean(argv[5]); [[fallthrough]]; - case 5: io_type = get_io_sink_type(argv[4]); [[fallthrough]]; - case 4: thread_count = std::max(thread_count, std::stoi(std::string{argv[3]})); [[fallthrough]]; + case 7: validate_output = get_boolean(argv[6]); [[fallthrough]]; + case 6: io_type = get_io_sink_type(argv[5]); [[fallthrough]]; + case 5: thread_count = std::max(thread_count, std::stoi(std::string{argv[4]})); [[fallthrough]]; + case 4: num_reads = std::max(1, std::stoi(std::string{argv[3]})); [[fallthrough]]; case 3: input_multiplier = std::max(input_multiplier, std::stoi(std::string{argv[2]})); [[fallthrough]]; @@ -308,82 +312,72 @@ int32_t main(int argc, char const** argv) return 0; } - // Read the parquet files with multiple threads - { - fmt::print(fg(fmt::color::yellow), - "\nReading {} input files using {} threads without timing it as \n" - "it may include times for nvcomp, cufile loading and RMM growth.\n\n", - input_files.size(), - thread_count); - - // If we are writing output then read with CONCATENATE_THREAD - if (io_type.has_value()) { - // Launch - auto const tables = read_parquet_multithreaded( - input_files, thread_count, stream_pool); - default_stream.synchronize(); - // Initialize the default output path to avoid race condition with multiple writer threads. 
- std::ignore = get_default_output_path(); - - // Construct a vector of table views for write_parquet_multithreaded - auto const table_views = [&tables]() { - std::vector table_views; - table_views.reserve(tables.size()); - - std::transform( - tables.cbegin(), tables.cend(), std::back_inserter(table_views), [](auto const& tbl) { - return tbl->view(); - }); - return table_views; - }(); - - // Write tables to parquet - fmt::print("Writing parquet output to sink type: {}..\n", std::string{argv[4]}); - cudf::examples::timer timer; - write_parquet_multithreaded(table_views, thread_count, stream_pool); - default_stream.synchronize(); - timer.print_elapsed_millis(); - } - // Else simply read with NOWORK mode - else { - std::ignore = - read_parquet_multithreaded(input_files, thread_count, stream_pool); - default_stream.synchronize(); - } - } - - // Re-read the same parquet files with multiple threads and discard the read tables + // Read the same parquet files specified times with multiple threads and discard the read tables { fmt::print( - "Re-reading {} input files using {} threads and discarding output " + "\nReading {} input files {} times using {} threads and discarding output " "tables..\n", input_files.size(), + num_reads, thread_count); + fmt::print( + fg(fmt::color::yellow), + "Note that the first read may include times for nvcomp, cufile loading and RMM growth.\n\n"); cudf::examples::timer timer; - // Read parquet files and discard the tables - std::ignore = - read_parquet_multithreaded(input_files, thread_count, stream_pool); + std::for_each(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_reads), + [&](auto i) { // Read parquet files and discard the tables + std::ignore = read_parquet_multithreaded( + input_files, thread_count, stream_pool); + }); default_stream.synchronize(); timer.print_elapsed_millis(); } - // Verify the output files if requested - if (validate_output and io_type.has_value()) { - fmt::print("Verifying output..\n"); - - 
// CONCATENATE_ALL returns a vector of 1 table - auto const input_table = std::move( - read_parquet_multithreaded(input_files, thread_count, stream_pool) - .back()); + // Do we need to write parquet as well? + if (io_type.has_value()) { + // Read input files with CONCATENATE_THREADS mode + auto const tables = read_parquet_multithreaded( + input_files, thread_count, stream_pool); + default_stream.synchronize(); + // Initialize the default output path to avoid race condition with multiple writer threads. + std::ignore = get_default_output_path(); + + // Construct a vector of table views for write_parquet_multithreaded + auto const table_views = [&tables]() { + std::vector table_views; + table_views.reserve(tables.size()); + + std::transform( + tables.cbegin(), tables.cend(), std::back_inserter(table_views), [](auto const& tbl) { + return tbl->view(); + }); + return table_views; + }(); - auto const transcoded_table = - std::move(read_parquet_multithreaded( - extract_input_files(get_default_output_path()), thread_count, stream_pool) - .back()); + // Write tables to parquet + fmt::print("Writing parquet output to sink type: {}..\n", std::string{argv[5]}); + cudf::examples::timer timer; + write_parquet_multithreaded(table_views, thread_count, stream_pool); default_stream.synchronize(); + timer.print_elapsed_millis(); + + // Verify the output if requested + if (validate_output) { + fmt::print("Verifying output..\n"); + + // CONCATENATE_ALL returns a vector of 1 table + auto const input_table = cudf::concatenate(table_views, default_stream); - // Check for validity - check_identical_tables(input_table->view(), transcoded_table->view()); + auto const transcoded_table = + std::move(read_parquet_multithreaded( + extract_input_files(get_default_output_path()), thread_count, stream_pool) + .back()); + default_stream.synchronize(); + + // Check if the tables are identical + check_identical_tables(input_table->view(), transcoded_table->view()); + } } // Print peak memory From 
d3778cc33d9a6e00a881344e9c35c2429859c6b5 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 27 Sep 2024 19:14:44 +0000 Subject: [PATCH 14/37] Set default thread count = 1 instead of 2 --- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 7728c91cbb7..1b4b342d1f5 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -180,7 +180,7 @@ int32_t main(int argc, char const** argv) std::string input_paths = "example.parquet"; int32_t input_multiplier = 1; int32_t num_reads = 1; - int32_t thread_count = 2; + int32_t thread_count = 1; std::optional io_type = std::nullopt; bool validate_output = false; From c2b39ccb9ede6300056bd5717a81727d69d6bf13 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 27 Sep 2024 19:16:45 +0000 Subject: [PATCH 15/37] Minor improvement --- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 1b4b342d1f5..354b399c050 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -307,7 +307,7 @@ int32_t main(int argc, char const** argv) rmm::mr::set_current_device_resource(&stats_mr); // Exit early if nothing to do. - if (not input_files.size()) { + if (input_files.empty()) { std::cerr << "No input files to read. 
Exiting early.\n"; return 0; } From 8f39fb22897f4aaf21ffd9cc0283ae306473f5de Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 1 Oct 2024 01:37:43 +0000 Subject: [PATCH 16/37] Add io source types --- cpp/examples/parquet_io/common.hpp | 85 ++---- cpp/examples/parquet_io/io_source.hpp | 145 ++++++++++ .../parquet_io/parquet_io_multithreaded.cpp | 272 ++++++++++-------- 3 files changed, 316 insertions(+), 186 deletions(-) create mode 100644 cpp/examples/parquet_io/io_source.hpp diff --git a/cpp/examples/parquet_io/common.hpp b/cpp/examples/parquet_io/common.hpp index 37eb138640a..16fd16ee7c1 100644 --- a/cpp/examples/parquet_io/common.hpp +++ b/cpp/examples/parquet_io/common.hpp @@ -25,20 +25,21 @@ #include #include -#include -#include #include #include #include #include -#include #include -#include -#include #include +/** + * @file common.hpp + * @brief Common utilities for `parquet_io` examples + * + */ + /** * @brief Create memory resource for libcudf functions * @@ -65,7 +66,7 @@ std::shared_ptr create_memory_resource(bool is_ { using encoding_type = cudf::io::column_encoding; - static const std::unordered_map map = { + static std::unordered_map const map = { {"DEFAULT", encoding_type::USE_DEFAULT}, {"DICTIONARY", encoding_type::DICTIONARY}, {"PLAIN", encoding_type::PLAIN}, @@ -76,11 +77,12 @@ std::shared_ptr create_memory_resource(bool is_ std::transform(name.begin(), name.end(), name.begin(), ::toupper); if (map.find(name) != map.end()) { return map.at(name); } - throw std::invalid_argument("FATAL: " + std::string(name) + - " is not a valid encoding type.\n\n" - "Available encoding types: DEFAULT, DICTIONARY, PLAIN,\n" - "DELTA_BINARY_PACKED, DELTA_LENGTH_BYTE_ARRAY,\n" - "DELTA_BYTE_ARRAY\n\n"); + throw std::invalid_argument(fmt::format(fmt::emphasis::bold | fg(fmt::color::red), + "{} is not a valid encoding type.\n\n" + "Available encoding types: DEFAULT, DICTIONARY, PLAIN,\n" + "DELTA_BINARY_PACKED, 
DELTA_LENGTH_BYTE_ARRAY,\n" + "DELTA_BYTE_ARRAY\n\n", + name)); } /** @@ -93,7 +95,7 @@ std::shared_ptr create_memory_resource(bool is_ { using compression_type = cudf::io::compression_type; - static const std::unordered_map map = { + static std::unordered_map const map = { {"NONE", compression_type::NONE}, {"AUTO", compression_type::AUTO}, {"SNAPPY", compression_type::SNAPPY}, @@ -102,10 +104,11 @@ std::shared_ptr create_memory_resource(bool is_ std::transform(name.begin(), name.end(), name.begin(), ::toupper); if (map.find(name) != map.end()) { return map.at(name); } - throw std::invalid_argument("FATAL: " + std::string(name) + - " is not a valid compression type.\n\n" - "Available compression_type types: NONE, AUTO, SNAPPY,\n" - "LZ4, ZSTD\n\n"); + throw std::invalid_argument(fmt::format(fmt::emphasis::bold | fg(fmt::color::red), + "{} is not a valid compression type.\n\n" + "Available compression types: NONE, AUTO, SNAPPY,\n" + "LZ4, ZSTD\n\n", + name)); } /** @@ -153,34 +156,6 @@ inline void check_identical_tables(cudf::table_view const& lhs_table, } } -/** - * @brief Get io sink type from the string keyword argument - * - * @param name io sink type keyword name - * @return corresponding io sink type type - */ -[[nodiscard]] std::optional get_io_sink_type(std::string name) -{ - using io_type = cudf::io::io_type; - - static const std::unordered_map map = { - {"FILEPATH", io_type::FILEPATH}, - {"HOST_BUFFER", io_type::HOST_BUFFER}, - {"PINNED_BUFFER", io_type::HOST_BUFFER}, - {"DEVICE_BUFFER", io_type::DEVICE_BUFFER}}; - - std::transform(name.begin(), name.end(), name.begin(), ::toupper); - if (map.find(name) != map.end()) { - return {map.at(name)}; - } else { - fmt::print( - "{} is not a valid io sink type. Available: FILEPATH,\n" - "HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER. 
Ignoring\n\n", - name); - return std::nullopt; - } -} - /** * @brief Concatenate a vector of tables and return the resultant table * @@ -203,25 +178,3 @@ std::unique_ptr concatenate_tables(std::vector +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +/** + * @file io_source.hpp + * @brief Utilities for construction IO sources from the input parquet files. + * + */ + +/** + * @brief Available IO source types + */ +enum class io_source_type { FILEPATH, HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER }; + +/** + * @brief Create and return a reference to a static pinned memory pool + * + * @return Reference to a static pinned memory pool + */ +rmm::host_async_resource_ref pinned_memory_resource() +{ + static auto mr = rmm::mr::pinned_host_memory_resource{}; + return mr; +} + +/** + * @brief Get io source type from the string keyword argument + * + * @param name io source type keyword name + * @return io source type + */ +[[nodiscard]] io_source_type get_io_source_type(std::string name) +{ + static std::unordered_map const map = { + {"FILEPATH", io_source_type::FILEPATH}, + {"HOST_BUFFER", io_source_type::HOST_BUFFER}, + {"PINNED_BUFFER", io_source_type::PINNED_BUFFER}, + {"DEVICE_BUFFER", io_source_type::DEVICE_BUFFER}}; + + std::transform(name.begin(), name.end(), name.begin(), ::toupper); + if (map.find(name) != map.end()) { + return map.at(name); + } else { + throw std::invalid_argument( + fmt::format(fmt::emphasis::bold | fg(fmt::color::red), + "{} is not a valid io source type. 
Available: FILEPATH,\n" + "HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER.\n\n", + name)); + } +} + +/** + * @brief Class to create a cudf::io::source_info of given type from the input parquet file + * + */ +class io_source { + public: + io_source(std::string_view file_path, io_source_type io_type, rmm::cuda_stream_view stream) + : type{io_type}, + file_name{file_path}, + file_size{std::filesystem::file_size(file_name)}, + pinned_buffer({pinned_memory_resource(), stream}), + d_buffer{0, stream} + { + // For filepath make a quick source_info and return early + if (type == io_source_type::FILEPATH) { + source_info = cudf::io::source_info(file_name); + return; + } + + std::ifstream file{file_name, std::ifstream::binary}; + + // Copy file contents to the specified io source buffer + switch (type) { + case io_source_type::HOST_BUFFER: { + h_buffer.resize(file_size); + file.read(h_buffer.data(), file_size); + source_info = cudf::io::source_info(h_buffer.data(), h_buffer.size()); + break; + } + case io_source_type::PINNED_BUFFER: { + pinned_buffer.resize(file_size); + file.read(pinned_buffer.data(), file_size); + source_info = cudf::io::source_info(pinned_buffer.data(), pinned_buffer.size()); + break; + } + case io_source_type::DEVICE_BUFFER: { + h_buffer.resize(file_size); + file.read(h_buffer.data(), file_size); + d_buffer.resize(file_size, stream); + CUDF_CUDA_TRY(cudaMemcpyAsync( + d_buffer.data(), h_buffer.data(), h_buffer.size(), cudaMemcpyDefault, stream.value())); + + source_info = cudf::io::source_info(d_buffer); + break; + } + default: { + throw std::runtime_error(fmt::format(fmt::emphasis::bold | fg(fmt::color::red), + "Encountered unexpected source type\n\n")); + } + } + } + + // Get the internal source info + [[nodiscard]] cudf::io::source_info get_source_info() const { return source_info; } + + private: + io_source_type const type; + std::string const file_name; + size_t const file_size; + cudf::io::source_info source_info; + std::vector h_buffer; + 
cudf::detail::host_vector pinned_buffer; + rmm::device_uvector d_buffer; +}; \ No newline at end of file diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 354b399c050..4204e50c271 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -15,21 +15,26 @@ */ #include "common.hpp" +#include "io_source.hpp" -#include +#include #include #include /** * @file parquet_io_multithreaded.cpp - * @brief Demonstrates multithreaded read of parquet files and optionally - * multithreaded writing the read tables to the specified io sink source type. + * @brief Demonstrates reading parquet data from the specified io source using multiple threads. * - * Run: ``parquet_io_multithreaded -h`` to see help with input args and more. + * The input parquet data is provided via files which are converted to the specified io source type + * to be read using multiple threads. Optionally, the parquet data read by each thread can be + * written to corresponding files and checked for validity of the output files against the input + * data. + * + * Run: ``parquet_io_multithreaded -h`` to see help with input args and more information. 
+ * + * The following io source types are supported: + * IO source types: FILEPATH, HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER * */ @@ -50,7 +55,7 @@ enum class read_mode { */ template struct read_fn { - std::vector const& input_files; + std::vector const& input_sources; std::vector& tables; int const thread_id; int const thread_count; @@ -62,11 +67,11 @@ struct read_fn { std::vector tables_this_thread; // Sweep the available input files - for (auto curr_file_idx = thread_id; curr_file_idx < input_files.size(); + for (auto curr_file_idx = thread_id; curr_file_idx < input_sources.size(); curr_file_idx += thread_count) { - auto const source_info = cudf::io::source_info(input_files[curr_file_idx]); - auto builder = cudf::io::parquet_reader_options::builder(source_info); - auto const options = builder.build(); + auto builder = + cudf::io::parquet_reader_options::builder(input_sources[curr_file_idx].get_source_info()); + auto const options = builder.build(); if constexpr (READ_FN != read_mode::NOWORK) { tables_this_thread.push_back(cudf::io::read_parquet(options, stream).tbl); } else { @@ -99,7 +104,7 @@ struct read_fn { * @return Vector of read tables. 
*/ template -std::vector read_parquet_multithreaded(std::vector const& files, +std::vector read_parquet_multithreaded(std::vector const& input_sources, int32_t thread_count, rmm::cuda_stream_pool& stream_pool) { @@ -114,7 +119,7 @@ std::vector read_parquet_multithreaded(std::vector const& std::for_each( thrust::make_counting_iterator(0), thrust::make_counting_iterator(thread_count), [&](auto tid) { read_tasks.emplace_back( - read_fn{files, tables, tid, thread_count, stream_pool.get_stream()}); + read_fn{input_sources, tables, tid, thread_count, stream_pool.get_stream()}); }); // Create threads with tasks @@ -143,7 +148,7 @@ std::vector read_parquet_multithreaded(std::vector const& * @brief Functor for multithreaded parquet writing */ struct write_fn { - cudf::io::io_type io_sink_type; + std::string const& output_path; std::vector const& table_views; int const thread_id; rmm::cuda_stream_view stream; @@ -151,10 +156,8 @@ struct write_fn { void operator()() { // Create a sink - auto const sink_info = [io_sink_type = io_sink_type, thread_id = thread_id]() { - return cudf::io::sink_info(get_default_output_path() + "/table_" + std::to_string(thread_id) + - ".parquet"); - }(); + cudf::io::sink_info const sink_info{output_path + "/table_" + std::to_string(thread_id) + + ".parquet"}; // Writer options builder auto builder = cudf::io::parquet_writer_options::builder(sink_info, table_views[thread_id]); // Create a new metadata for the table @@ -171,40 +174,46 @@ struct write_fn { } }; +/** + * @brief Function to print example usage + */ +void print_usage() +{ + fmt::print( + fg(fmt::color::yellow), + "\nUsage: parquet_io_multithreaded \n" + " \n" + " \n\n"); + fmt::print( + "Available IO source types: FILEPATH, HOST_BUFFER, {}, DEVICE_BUFFER\n\n", + fmt::format(fmt::emphasis::bold | fg(fmt::color::green_yellow), "PINNED_BUFFER (Default)")); + fmt::print(fg(fmt::color::light_sky_blue), + "Note: Provide as many arguments as you like in the above order. 
Default values\n" + " for the unprovided arguments will be used. All input parquet files will\n" + " be converted to the specified before reading\n\n"); +} + /** * @brief The main function */ int32_t main(int argc, char const** argv) { // Set arguments to defaults - std::string input_paths = "example.parquet"; - int32_t input_multiplier = 1; - int32_t num_reads = 1; - int32_t thread_count = 1; - std::optional io_type = std::nullopt; - bool validate_output = false; - - // Function to print example usage - auto const print_usage = [] { - fmt::print( - fg(fmt::color::yellow), - "\nUsage: parquet_io_multithreaded \n" - " \n" - " \n\n"); - fmt::print( - fg(fmt::color::light_sky_blue), - "Note: Provide as many arguments as you like in the above order. Default values\n" - " for the unprovided arguments will be used. No output parquet will be written\n" - " if isn't provided.\n\n"); - }; + std::string input_paths = "example.parquet"; + int32_t input_multiplier = 1; + int32_t num_reads = 1; + int32_t thread_count = 1; + io_source_type io_source_type = io_source_type::PINNED_BUFFER; + bool write_and_validate = false; // Set to the provided args switch (argc) { - case 7: validate_output = get_boolean(argv[6]); [[fallthrough]]; - case 6: io_type = get_io_sink_type(argv[5]); [[fallthrough]]; - case 5: thread_count = std::max(thread_count, std::stoi(std::string{argv[4]})); [[fallthrough]]; - case 4: num_reads = std::max(1, std::stoi(std::string{argv[3]})); [[fallthrough]]; + case 7: write_and_validate = get_boolean(argv[6]); [[fallthrough]]; + case 6: thread_count = std::max(thread_count, std::stoi(std::string{argv[5]})); [[fallthrough]]; + case 5: num_reads = std::max(1, std::stoi(argv[4])); [[fallthrough]]; + case 4: io_source_type = get_io_source_type(argv[3]); [[fallthrough]]; case 3: input_multiplier = std::max(input_multiplier, std::stoi(std::string{argv[2]})); [[fallthrough]]; @@ -220,59 +229,68 @@ int32_t main(int argc, char const** argv) } // Lambda function to process 
and extract all input files - auto const extract_input_files = [thread_count, input_multiplier](std::string const& paths) { - std::vector const delimited_paths = [&]() { - std::vector paths_list; - std::stringstream stream{paths}; - std::string path; - // Extract the delimited paths. - while (std::getline(stream, path, char{','})) { - paths_list.push_back(path); - } - return paths_list; - }(); - - // The final list of parquet files to be read. - std::vector parquet_files; - parquet_files.reserve( - std::max(thread_count, input_multiplier * delimited_paths.size())); - // Append the input files by input_multiplier times - std::for_each( - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input_multiplier), - [&](auto i) { - std::for_each( - delimited_paths.cbegin(), delimited_paths.cend(), [&](auto const& path_string) { - std::filesystem::path path{path_string}; - // If this is a parquet file, add it. - if (std::filesystem::is_regular_file(path)) { - parquet_files.emplace_back(path_string); - } - // If this is a directory, add all files at this path - else if (std::filesystem::is_directory(path)) { - for (auto const& file : std::filesystem::directory_iterator(path)) { - if (std::filesystem::is_regular_file(file.path())) { - parquet_files.emplace_back(file.path().string()); + auto const extract_input_sources_async = + [thread_count, input_multiplier, io_source_type = io_source_type]( + std::string const& paths, rmm::cuda_stream_view stream) { + std::vector const delimited_paths = [&]() { + std::vector paths_list; + std::stringstream strstream{paths}; + std::string path; + // Extract the delimited paths. + while (std::getline(strstream, path, char{','})) { + paths_list.push_back(path); + } + return paths_list; + }(); + + // The final list of parquet files to be read. 
+ std::vector parquet_files; + parquet_files.reserve( + std::max(thread_count, input_multiplier * delimited_paths.size())); + // Append the input files by input_multiplier times + std::for_each( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input_multiplier), + [&](auto i) { + std::for_each( + delimited_paths.cbegin(), delimited_paths.cend(), [&](auto const& path_string) { + std::filesystem::path path{path_string}; + // If this is a parquet file, add it. + if (std::filesystem::is_regular_file(path)) { + parquet_files.emplace_back(path_string); + } + // If this is a directory, add all files at this path + else if (std::filesystem::is_directory(path)) { + for (auto const& file : std::filesystem::directory_iterator(path)) { + if (std::filesystem::is_regular_file(file.path())) { + parquet_files.emplace_back(file.path().string()); + } } + } else { + throw std::runtime_error("Encountered an invalid input path\n"); } - } else { - throw std::runtime_error("Encountered an invalid input path\n"); - } - }); - }); + }); + }); - // Cycle append parquet files from the existing ones if less than the thread_count - for (size_t idx = 0, initial_size = parquet_files.size(); - thread_count > static_cast(parquet_files.size()); - idx++) { - parquet_files.emplace_back(parquet_files[idx % initial_size]); - } + // Cycle append parquet files from the existing ones if less than the thread_count + for (size_t idx = 0, initial_size = parquet_files.size(); + thread_count > static_cast(parquet_files.size()); + idx++) { + parquet_files.emplace_back(parquet_files[idx % initial_size]); + } - return parquet_files; - }; + std::vector input_sources; + input_sources.reserve(parquet_files.size()); + std::transform(parquet_files.begin(), + parquet_files.end(), + std::back_inserter(input_sources), + [&](auto& file_name) { return io_source(file_name, io_source_type, stream); }); + return input_sources; + }; // Lambda function to setup and launch multithreaded parquet writes - auto 
const write_parquet_multithreaded = [&](std::vector const& tables, + auto const write_parquet_multithreaded = [&](std::string const& output_path, + std::vector const& tables, int32_t thread_count, rmm::cuda_stream_pool& stream_pool) { // Table writing tasks @@ -282,7 +300,7 @@ int32_t main(int argc, char const** argv) thrust::make_counting_iterator(0), thrust::make_counting_iterator(thread_count), [&](auto tid) { - write_tasks.emplace_back(write_fn{io_type.value(), tables, tid, stream_pool.get_stream()}); + write_tasks.emplace_back(write_fn{output_path, tables, tid, stream_pool.get_stream()}); }); // Writer threads @@ -296,8 +314,7 @@ int32_t main(int argc, char const** argv) } }; - // Make a list of input files from the input_paths string. - auto const input_files = extract_input_files(input_paths); + // Initialize mr, default stream and stream pool auto const is_pool_used = true; auto resource = create_memory_resource(is_pool_used); auto default_stream = cudf::get_default_stream(); @@ -306,42 +323,54 @@ int32_t main(int argc, char const** argv) rmm::mr::statistics_resource_adaptor(resource.get()); rmm::mr::set_current_device_resource(&stats_mr); + // Make a list of input sources from the input_paths string. + auto const input_sources = extract_input_sources_async(input_paths, default_stream); + default_stream.synchronize(); + // Exit early if nothing to do. - if (input_files.empty()) { + if (input_sources.empty()) { std::cerr << "No input files to read. 
Exiting early.\n"; return 0; } // Read the same parquet files specified times with multiple threads and discard the read tables { + // Print status fmt::print( - "\nReading {} input files {} times using {} threads and discarding output " + "\nReading {} input sources {} time(s) using {} threads and discarding output " "tables..\n", - input_files.size(), + input_sources.size(), num_reads, thread_count); - fmt::print( - fg(fmt::color::yellow), - "Note that the first read may include times for nvcomp, cufile loading and RMM growth.\n\n"); + + if (io_source_type == io_source_type::FILEPATH) { + fmt::print(fg(fmt::color::yellow), + "Note that the first read may include times for nvcomp, cufile loading and RMM " + "growth.\n\n"); + } + cudf::examples::timer timer; std::for_each(thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_reads), [&](auto i) { // Read parquet files and discard the tables std::ignore = read_parquet_multithreaded( - input_files, thread_count, stream_pool); + input_sources, thread_count, stream_pool); }); default_stream.synchronize(); timer.print_elapsed_millis(); } - // Do we need to write parquet as well? - if (io_type.has_value()) { + // Do we need to write parquet files and validate? + if (write_and_validate) { // Read input files with CONCATENATE_THREADS mode auto const tables = read_parquet_multithreaded( - input_files, thread_count, stream_pool); + input_sources, thread_count, stream_pool); default_stream.synchronize(); - // Initialize the default output path to avoid race condition with multiple writer threads. - std::ignore = get_default_output_path(); + + // Create a directory at the tmpdir path. 
+ std::string output_path = std::filesystem::temp_directory_path().string() + "/output_" + + fmt::format("{:%Y-%m-%d-%H-%M-%S}", std::chrono::system_clock::now()); + std::filesystem::create_directory({output_path}); // Construct a vector of table views for write_parquet_multithreaded auto const table_views = [&tables]() { @@ -356,28 +385,31 @@ int32_t main(int argc, char const** argv) }(); // Write tables to parquet - fmt::print("Writing parquet output to sink type: {}..\n", std::string{argv[5]}); + fmt::print("Writing parquet output files..\n"); cudf::examples::timer timer; - write_parquet_multithreaded(table_views, thread_count, stream_pool); + write_parquet_multithreaded(output_path, table_views, thread_count, stream_pool); default_stream.synchronize(); timer.print_elapsed_millis(); - // Verify the output if requested - if (validate_output) { - fmt::print("Verifying output..\n"); + // Verify the output + fmt::print("Verifying output..\n"); + + // CONCATENATE_ALL returns a vector of 1 table + auto const input_table = cudf::concatenate(table_views, default_stream); - // CONCATENATE_ALL returns a vector of 1 table - auto const input_table = cudf::concatenate(table_views, default_stream); + auto const transcoded_input_sources = extract_input_sources_async(output_path, default_stream); + default_stream.synchronize(); + + auto const transcoded_table = std::move(read_parquet_multithreaded( + transcoded_input_sources, thread_count, stream_pool) + .back()); + default_stream.synchronize(); - auto const transcoded_table = - std::move(read_parquet_multithreaded( - extract_input_files(get_default_output_path()), thread_count, stream_pool) - .back()); - default_stream.synchronize(); + // Check if the tables are identical + check_identical_tables(input_table->view(), transcoded_table->view()); - // Check if the tables are identical - check_identical_tables(input_table->view(), transcoded_table->view()); - } + // Remove the created temp directory and parquet data. 
+ std::filesystem::remove_all(output_path); } // Print peak memory From d0c2a62cfc230e463a73495a70e4d6a962e34cb2 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 1 Oct 2024 01:38:28 +0000 Subject: [PATCH 17/37] Minor comment updates --- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 4204e50c271..868195eb256 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -192,7 +192,7 @@ void print_usage() fmt::print(fg(fmt::color::light_sky_blue), "Note: Provide as many arguments as you like in the above order. Default values\n" " for the unprovided arguments will be used. All input parquet files will\n" - " be converted to the specified before reading\n\n"); + " be converted to the specified IO source type before reading\n\n"); } /** From 945c0c008226f5410052e90741568fb42474b70c Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 1 Oct 2024 01:52:55 +0000 Subject: [PATCH 18/37] Style fix and add to CI. 
--- ci/run_cudf_examples.sh | 3 +++ cpp/examples/parquet_io/io_source.hpp | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh index 0819eacf636..830bb610cc8 100755 --- a/ci/run_cudf_examples.sh +++ b/ci/run_cudf_examples.sh @@ -26,4 +26,7 @@ compute-sanitizer --tool memcheck custom_with_malloc names.csv compute-sanitizer --tool memcheck parquet_io compute-sanitizer --tool memcheck parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD TRUE +compute-sanitizer --tool memcheck parquet_io_multithreaded +compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet 4 PINNED_BUFFER 2 2 + exit ${EXITCODE} diff --git a/cpp/examples/parquet_io/io_source.hpp b/cpp/examples/parquet_io/io_source.hpp index 677cc99385c..d9a6e0ee608 100644 --- a/cpp/examples/parquet_io/io_source.hpp +++ b/cpp/examples/parquet_io/io_source.hpp @@ -142,4 +142,4 @@ class io_source { std::vector h_buffer; cudf::detail::host_vector pinned_buffer; rmm::device_uvector d_buffer; -}; \ No newline at end of file +}; From f30c80168d0479e2d4ee72dc47b4b2e199ab0ab0 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 1 Oct 2024 02:01:19 +0000 Subject: [PATCH 19/37] Minor improvement --- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 868195eb256..732609d1ad2 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -184,7 +184,7 @@ void print_usage() "\nUsage: parquet_io_multithreaded \n" " \n" - " \n\n"); fmt::print( "Available IO source types: FILEPATH, HOST_BUFFER, {}, DEVICE_BUFFER\n\n", From 719bfb6e25ce9af60b5fb21b6b6533fb9cd6f7a1 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb 
<14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 1 Oct 2024 22:26:10 +0000 Subject: [PATCH 20/37] Updates --- cpp/examples/parquet_io/CMakeLists.txt | 7 +- .../{common.hpp => common_utils.cpp} | 21 +- cpp/examples/parquet_io/common_utils.hpp | 81 ++++++ cpp/examples/parquet_io/io_source.hpp | 44 ++- cpp/examples/parquet_io/parquet_io.cpp | 10 +- .../parquet_io/parquet_io_multithreaded.cpp | 256 +++++++++++------- 6 files changed, 291 insertions(+), 128 deletions(-) rename cpp/examples/parquet_io/{common.hpp => common_utils.cpp} (91%) create mode 100644 cpp/examples/parquet_io/common_utils.hpp diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt index 28ade3666bf..9d81a726217 100644 --- a/cpp/examples/parquet_io/CMakeLists.txt +++ b/cpp/examples/parquet_io/CMakeLists.txt @@ -16,15 +16,18 @@ project( include(../fetch_dependencies.cmake) +add_library(parquet_io_common_utils OBJECT common_utils.cpp) +target_link_libraries(parquet_io_common_utils PRIVATE cudf::cudf) + # Build and install parquet_io add_executable(parquet_io parquet_io.cpp) -target_link_libraries(parquet_io PRIVATE cudf::cudf) +target_link_libraries(parquet_io PRIVATE cudf::cudf nvToolsExt $) target_compile_features(parquet_io PRIVATE cxx_std_17) install(TARGETS parquet_io DESTINATION bin/examples/libcudf) # Build and install parquet_io_multithreaded add_executable(parquet_io_multithreaded parquet_io_multithreaded.cpp) -target_link_libraries(parquet_io_multithreaded PRIVATE cudf::cudf) +target_link_libraries(parquet_io_multithreaded PRIVATE cudf::cudf nvToolsExt $) target_compile_features(parquet_io_multithreaded PRIVATE cxx_std_17) install(TARGETS parquet_io_multithreaded DESTINATION bin/examples/libcudf) diff --git a/cpp/examples/parquet_io/common.hpp b/cpp/examples/parquet_io/common_utils.cpp similarity index 91% rename from cpp/examples/parquet_io/common.hpp rename to cpp/examples/parquet_io/common_utils.cpp index 16fd16ee7c1..aa3d4c922e4 100644 --- 
a/cpp/examples/parquet_io/common.hpp +++ b/cpp/examples/parquet_io/common_utils.cpp @@ -1,3 +1,4 @@ + /* * Copyright (c) 2024, NVIDIA CORPORATION. * @@ -14,29 +15,24 @@ * limitations under the License. */ -#pragma once - -#include "../utilities/timer.hpp" +#include "common_utils.hpp" #include -#include #include #include #include -#include #include #include #include #include -#include #include /** - * @file commons.hpp - * @brief Common utilities for `parquet_io` examples + * @file commons.cpp + * @brief Definitions for common utilities for `parquet_io` examples * */ @@ -62,7 +58,7 @@ std::shared_ptr create_memory_resource(bool is_ * @param name encoding keyword name * @return corresponding column encoding type */ -[[nodiscard]] cudf::io::column_encoding get_encoding_type(std::string name) +cudf::io::column_encoding get_encoding_type(std::string name) { using encoding_type = cudf::io::column_encoding; @@ -91,7 +87,7 @@ std::shared_ptr create_memory_resource(bool is_ * @param name compression keyword name * @return corresponding compression type */ -[[nodiscard]] cudf::io::compression_type get_compression_type(std::string name) +cudf::io::compression_type get_compression_type(std::string name) { using compression_type = cudf::io::compression_type; @@ -117,7 +113,7 @@ std::shared_ptr create_memory_resource(bool is_ * @param input keyword affirmation string such as: Y, T, YES, TRUE, ON * @return true or false */ -[[nodiscard]] bool get_boolean(std::string input) +bool get_boolean(std::string input) { std::transform(input.begin(), input.end(), input.begin(), ::toupper); @@ -136,8 +132,7 @@ std::shared_ptr create_memory_resource(bool is_ * @param lhs_table View to lhs table * @param rhs_table View to rhs table */ -inline void check_identical_tables(cudf::table_view const& lhs_table, - cudf::table_view const& rhs_table) +void check_identical_tables(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table) { try { // Left anti-join the original and transcoded 
tables diff --git a/cpp/examples/parquet_io/common_utils.hpp b/cpp/examples/parquet_io/common_utils.hpp new file mode 100644 index 00000000000..135b40a09a3 --- /dev/null +++ b/cpp/examples/parquet_io/common_utils.hpp @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +#include + +/** + * @file commons.hpp + * @brief Common utilities for `parquet_io` examples + * + */ + +/** + * @brief Create memory resource for libcudf functions + * + * @param pool Whether to use a pool memory resource. 
+ * @return Memory resource instance + */ +std::shared_ptr create_memory_resource(bool is_pool_used); + +/** + * @brief Get encoding type from the keyword + * + * @param name encoding keyword name + * @return corresponding column encoding type + */ +[[nodiscard]] cudf::io::column_encoding get_encoding_type(std::string name); + +/** + * @brief Get compression type from the keyword + * + * @param name compression keyword name + * @return corresponding compression type + */ +[[nodiscard]] cudf::io::compression_type get_compression_type(std::string name); + +/** + * @brief Get boolean from they keyword + * + * @param input keyword affirmation string such as: Y, T, YES, TRUE, ON + * @return true or false + */ +[[nodiscard]] bool get_boolean(std::string input); + +/** + * @brief Check if two tables are identical, throw an error otherwise + * + * @param lhs_table View to lhs table + * @param rhs_table View to rhs table + */ +void check_identical_tables(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table); + +/** + * @brief Concatenate a vector of tables and return the resultant table + * + * @param tables Vector of tables to concatenate + * @param stream CUDA stream to use + * + * @return Unique pointer to the resultant concatenated table. + */ +std::unique_ptr concatenate_tables(std::vector> tables, + rmm::cuda_stream_view stream); diff --git a/cpp/examples/parquet_io/io_source.hpp b/cpp/examples/parquet_io/io_source.hpp index d9a6e0ee608..3900877a4f7 100644 --- a/cpp/examples/parquet_io/io_source.hpp +++ b/cpp/examples/parquet_io/io_source.hpp @@ -16,15 +16,14 @@ #pragma once -#include #include #include #include -#include -#include #include +#include + #include #include @@ -53,6 +52,33 @@ rmm::host_async_resource_ref pinned_memory_resource() return mr; } +/** + * @brief Custom allocator for pinned_buffer via RMM. 
+ */ +template +struct pinned_allocator : public std::allocator { + pinned_allocator(rmm::host_async_resource_ref _mr, rmm::cuda_stream_view _stream) + : mr{_mr}, stream{_stream} + { + } + + T* allocate(std::size_t n) + { + auto ptr = mr.allocate_async(n * sizeof(T), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); + stream.synchronize(); + return static_cast(ptr); + } + + void deallocate(T* ptr, std::size_t n) + { + mr.deallocate_async(ptr, n * sizeof(T), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); + } + + private: + rmm::host_async_resource_ref mr; + rmm::cuda_stream_view stream; +}; + /** * @brief Get io source type from the string keyword argument * @@ -105,13 +131,13 @@ class io_source { case io_source_type::HOST_BUFFER: { h_buffer.resize(file_size); file.read(h_buffer.data(), file_size); - source_info = cudf::io::source_info(h_buffer.data(), h_buffer.size()); + source_info = cudf::io::source_info(h_buffer.data(), file_size); break; } case io_source_type::PINNED_BUFFER: { pinned_buffer.resize(file_size); file.read(pinned_buffer.data(), file_size); - source_info = cudf::io::source_info(pinned_buffer.data(), pinned_buffer.size()); + source_info = cudf::io::source_info(pinned_buffer.data(), file_size); break; } case io_source_type::DEVICE_BUFFER: { @@ -119,7 +145,7 @@ class io_source { file.read(h_buffer.data(), file_size); d_buffer.resize(file_size, stream); CUDF_CUDA_TRY(cudaMemcpyAsync( - d_buffer.data(), h_buffer.data(), h_buffer.size(), cudaMemcpyDefault, stream.value())); + d_buffer.data(), h_buffer.data(), file_size, cudaMemcpyDefault, stream.value())); source_info = cudf::io::source_info(d_buffer); break; @@ -135,11 +161,15 @@ class io_source { [[nodiscard]] cudf::io::source_info get_source_info() const { return source_info; } private: + // alias for pinned vector + template + using pinned_vector = thrust::host_vector>; + io_source_type const type; std::string const file_name; size_t const file_size; cudf::io::source_info source_info; std::vector h_buffer; - 
cudf::detail::host_vector pinned_buffer; + pinned_vector pinned_buffer; rmm::device_uvector d_buffer; }; diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index a4ee550b0e4..08dbaa0bdd6 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -14,9 +14,15 @@ * limitations under the License. */ -#include "common.hpp" +#include "../utilities/timer.hpp" +#include "common_utils.hpp" +#include "io_source.hpp" -#include +#include +#include +#include + +#include /** * @file parquet_io.cpp diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 732609d1ad2..95ade08c791 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -14,13 +14,25 @@ * limitations under the License. */ -#include "common.hpp" +#include "../utilities/timer.hpp" +#include "common_utils.hpp" #include "io_source.hpp" +#include +#include +#include +#include + #include +#include #include #include +#include + +#include +#include +#include /** * @file parquet_io_multithreaded.cpp @@ -175,7 +187,40 @@ struct write_fn { }; /** - * @brief Function to print example usage + * @brief Function to setup and launch multithreaded writing parquet files. + * + * @param output_path Path to output directory + * @param tables List of at least table views to be written + * @param thread_count Number of threads to use for writing tables. 
+ * @param stream_pool CUDA stream pool to use for threads + * + */ +void write_parquet_multithreaded(std::string const& output_path, + std::vector const& tables, + int32_t thread_count, + rmm::cuda_stream_pool& stream_pool) +{ + // Table writing tasks + std::vector write_tasks; + write_tasks.reserve(thread_count); + std::for_each( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(thread_count), [&](auto tid) { + write_tasks.emplace_back(write_fn{output_path, tables, tid, stream_pool.get_stream()}); + }); + + // Writer threads + std::vector threads; + threads.reserve(thread_count); + for (auto& c : write_tasks) { + threads.emplace_back(std::thread{c}); + } + for (auto& t : threads) { + t.join(); + } +} + +/** + * @brief Function to print example usage and argument information. */ void print_usage() { @@ -195,6 +240,94 @@ void print_usage() " be converted to the specified IO source type before reading\n\n"); } +/** + * @brief Function to process comma delimited input paths string to parquet files and/or dirs + * and asynchronously convert them to specified io sources. + * + * Process the input path string containing directories (of parquet files) and/or individual + * parquet files into a list of input parquet files, multiple the list by `input_multiplier`, + * make sure to have at least `thread_count` files to satisfy at least file per parallel thread, + * and asynchronously convert the final list of files to a list of `io_source` and return. 
+ * + * @param paths Comma delimited input paths string + * @param input_multiplier Multiplier for the input files list + * @param thread_count Number of threads being used in the example + * @param io_source_type Specified IO source type to convert input files to + * @param stream CUDA stream to use + * + */ +std::vector extract_input_sources_async(std::string const& paths, + int32_t input_multiplier, + int32_t thread_count, + io_source_type io_source_type, + rmm::cuda_stream_view stream) +{ + // Get the delimited paths to directory and/or files. + std::vector const delimited_paths = [&]() { + std::vector paths_list; + std::stringstream strstream{paths}; + std::string path; + // Extract the delimited paths. + while (std::getline(strstream, path, char{','})) { + paths_list.push_back(path); + } + return paths_list; + }(); + + // List of parquet files + std::vector parquet_files; + std::for_each(delimited_paths.cbegin(), delimited_paths.cend(), [&](auto const& path_string) { + std::filesystem::path path{path_string}; + // If this is a parquet file, add it. + if (std::filesystem::is_regular_file(path)) { + parquet_files.push_back(path_string); + } + // If this is a directory, add all files in the directory. 
+ else if (std::filesystem::is_directory(path)) { + for (auto const& file : std::filesystem::directory_iterator(path)) { + if (std::filesystem::is_regular_file(file.path())) { + parquet_files.push_back(file.path().string()); + } + } + } else { + throw std::runtime_error("Encountered an invalid input path\n"); + } + }); + + // Current size of list of parquet files + auto const initial_size = parquet_files.size(); + if (initial_size == 0) { return {}; } + + // Reserve space + parquet_files.reserve(std::max(thread_count, input_multiplier * parquet_files.size())); + + // Append the input files by input_multiplier times + std::for_each(thrust::make_counting_iterator(1), + thrust::make_counting_iterator(input_multiplier), + [&](auto i) { + parquet_files.insert(parquet_files.end(), + parquet_files.begin(), + parquet_files.begin() + initial_size); + }); + + // Cycle append parquet files from the existing ones if less than the thread_count + for (size_t idx = 0; thread_count > static_cast(parquet_files.size()); idx++) { + parquet_files.emplace_back(parquet_files[idx % initial_size]); + } + + // Vector of io sources + std::vector input_sources; + input_sources.reserve(parquet_files.size()); + // Transform input files to the specified io sources + std::transform(parquet_files.begin(), + parquet_files.end(), + std::back_inserter(input_sources), + [&](auto const& file_name) { + return io_source{file_name, io_source_type, stream}; + }); + return input_sources; +} + /** * @brief The main function */ @@ -228,92 +361,6 @@ int32_t main(int argc, char const** argv) default: print_usage(); throw std::runtime_error(""); } - // Lambda function to process and extract all input files - auto const extract_input_sources_async = - [thread_count, input_multiplier, io_source_type = io_source_type]( - std::string const& paths, rmm::cuda_stream_view stream) { - std::vector const delimited_paths = [&]() { - std::vector paths_list; - std::stringstream strstream{paths}; - std::string path; - // 
Extract the delimited paths. - while (std::getline(strstream, path, char{','})) { - paths_list.push_back(path); - } - return paths_list; - }(); - - // The final list of parquet files to be read. - std::vector parquet_files; - parquet_files.reserve( - std::max(thread_count, input_multiplier * delimited_paths.size())); - // Append the input files by input_multiplier times - std::for_each( - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input_multiplier), - [&](auto i) { - std::for_each( - delimited_paths.cbegin(), delimited_paths.cend(), [&](auto const& path_string) { - std::filesystem::path path{path_string}; - // If this is a parquet file, add it. - if (std::filesystem::is_regular_file(path)) { - parquet_files.emplace_back(path_string); - } - // If this is a directory, add all files at this path - else if (std::filesystem::is_directory(path)) { - for (auto const& file : std::filesystem::directory_iterator(path)) { - if (std::filesystem::is_regular_file(file.path())) { - parquet_files.emplace_back(file.path().string()); - } - } - } else { - throw std::runtime_error("Encountered an invalid input path\n"); - } - }); - }); - - // Cycle append parquet files from the existing ones if less than the thread_count - for (size_t idx = 0, initial_size = parquet_files.size(); - thread_count > static_cast(parquet_files.size()); - idx++) { - parquet_files.emplace_back(parquet_files[idx % initial_size]); - } - - std::vector input_sources; - input_sources.reserve(parquet_files.size()); - std::transform(parquet_files.begin(), - parquet_files.end(), - std::back_inserter(input_sources), - [&](auto& file_name) { return io_source(file_name, io_source_type, stream); }); - return input_sources; - }; - - // Lambda function to setup and launch multithreaded parquet writes - auto const write_parquet_multithreaded = [&](std::string const& output_path, - std::vector const& tables, - int32_t thread_count, - rmm::cuda_stream_pool& stream_pool) { - // Table writing tasks - 
std::vector write_tasks; - write_tasks.reserve(thread_count); - std::for_each( - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(thread_count), - [&](auto tid) { - write_tasks.emplace_back(write_fn{output_path, tables, tid, stream_pool.get_stream()}); - }); - - // Writer threads - std::vector threads; - threads.reserve(thread_count); - for (auto& c : write_tasks) { - threads.emplace_back(std::thread{c}); - } - for (auto& t : threads) { - t.join(); - } - }; - // Initialize mr, default stream and stream pool auto const is_pool_used = true; auto resource = create_memory_resource(is_pool_used); @@ -323,14 +370,14 @@ int32_t main(int argc, char const** argv) rmm::mr::statistics_resource_adaptor(resource.get()); rmm::mr::set_current_device_resource(&stats_mr); - // Make a list of input sources from the input_paths string. - auto const input_sources = extract_input_sources_async(input_paths, default_stream); + // List of input sources from the input_paths string. + auto const input_sources = extract_input_sources_async( + input_paths, input_multiplier, thread_count, io_source_type, default_stream); default_stream.synchronize(); - // Exit early if nothing to do. + // Check if there is nothing to do if (input_sources.empty()) { - std::cerr << "No input files to read. Exiting early.\n"; - return 0; + throw std::runtime_error("No input files to read. Exiting early.\n"); } // Read the same parquet files specified times with multiple threads and discard the read tables @@ -362,21 +409,15 @@ int32_t main(int argc, char const** argv) // Do we need to write parquet files and validate? if (write_and_validate) { - // Read input files with CONCATENATE_THREADS mode + // read_mode::CONCATENATE_THREADS returns a vector of `thread_count` tables auto const tables = read_parquet_multithreaded( input_sources, thread_count, stream_pool); default_stream.synchronize(); - // Create a directory at the tmpdir path. 
- std::string output_path = std::filesystem::temp_directory_path().string() + "/output_" + - fmt::format("{:%Y-%m-%d-%H-%M-%S}", std::chrono::system_clock::now()); - std::filesystem::create_directory({output_path}); - // Construct a vector of table views for write_parquet_multithreaded auto const table_views = [&tables]() { std::vector table_views; table_views.reserve(tables.size()); - std::transform( tables.cbegin(), tables.cend(), std::back_inserter(table_views), [](auto const& tbl) { return tbl->view(); @@ -386,6 +427,10 @@ int32_t main(int argc, char const** argv) // Write tables to parquet fmt::print("Writing parquet output files..\n"); + // Create a directory at the tmpdir path. + std::string output_path = std::filesystem::temp_directory_path().string() + "/output_" + + fmt::format("{:%Y-%m-%d-%H-%M-%S}", std::chrono::system_clock::now()); + std::filesystem::create_directory({output_path}); cudf::examples::timer timer; write_parquet_multithreaded(output_path, table_views, thread_count, stream_pool); default_stream.synchronize(); @@ -394,21 +439,24 @@ int32_t main(int argc, char const** argv) // Verify the output fmt::print("Verifying output..\n"); - // CONCATENATE_ALL returns a vector of 1 table + // Simply concatenate the previously read tables from input sources auto const input_table = cudf::concatenate(table_views, default_stream); - auto const transcoded_input_sources = extract_input_sources_async(output_path, default_stream); + // Sources from written parquet files + auto const written_pq_sources = extract_input_sources_async( + output_path, input_multiplier, thread_count, io_source_type, default_stream); default_stream.synchronize(); + // read_mode::CONCATENATE_ALL returns a concatenated vector of 1 table only auto const transcoded_table = std::move(read_parquet_multithreaded( - transcoded_input_sources, thread_count, stream_pool) + written_pq_sources, thread_count, stream_pool) .back()); default_stream.synchronize(); // Check if the tables are 
identical check_identical_tables(input_table->view(), transcoded_table->view()); - // Remove the created temp directory and parquet data. + // Remove the created temp directory and parquet data std::filesystem::remove_all(output_path); } From 2ade064859edaf9529175703f5ce6805f6e434a7 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 1 Oct 2024 22:43:35 +0000 Subject: [PATCH 21/37] Style fix. --- cpp/examples/parquet_io/CMakeLists.txt | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt index 9d81a726217..7c963e5192b 100644 --- a/cpp/examples/parquet_io/CMakeLists.txt +++ b/cpp/examples/parquet_io/CMakeLists.txt @@ -21,13 +21,17 @@ target_link_libraries(parquet_io_common_utils PRIVATE cudf::cudf) # Build and install parquet_io add_executable(parquet_io parquet_io.cpp) -target_link_libraries(parquet_io PRIVATE cudf::cudf nvToolsExt $) +target_link_libraries( + parquet_io PRIVATE cudf::cudf nvToolsExt $ +) target_compile_features(parquet_io PRIVATE cxx_std_17) install(TARGETS parquet_io DESTINATION bin/examples/libcudf) # Build and install parquet_io_multithreaded add_executable(parquet_io_multithreaded parquet_io_multithreaded.cpp) -target_link_libraries(parquet_io_multithreaded PRIVATE cudf::cudf nvToolsExt $) +target_link_libraries( + parquet_io_multithreaded PRIVATE cudf::cudf nvToolsExt $ +) target_compile_features(parquet_io_multithreaded PRIVATE cxx_std_17) install(TARGETS parquet_io_multithreaded DESTINATION bin/examples/libcudf) From b559eafeb1990d9591a1914b640282bf2f6b4326 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 2 Oct 2024 00:36:56 +0000 Subject: [PATCH 22/37] Print message when skipping a subdirectory --- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 95ade08c791..3a46552e863 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -287,6 +287,8 @@ std::vector extract_input_sources_async(std::string const& paths, for (auto const& file : std::filesystem::directory_iterator(path)) { if (std::filesystem::is_regular_file(file.path())) { parquet_files.push_back(file.path().string()); + } else { + fmt::print("Skipping sub-directory: {}\n", file.path().string()); } } } else { From 73de5bcc028c22d9fc924c572d078f65a5843942 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 1 Oct 2024 17:39:17 -0700 Subject: [PATCH 23/37] Update cpp/examples/parquet_io/io_source.hpp --- cpp/examples/parquet_io/io_source.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/parquet_io/io_source.hpp b/cpp/examples/parquet_io/io_source.hpp index 3900877a4f7..d52470ab5c7 100644 --- a/cpp/examples/parquet_io/io_source.hpp +++ b/cpp/examples/parquet_io/io_source.hpp @@ -32,7 +32,7 @@ /** * @file io_source.hpp - * @brief Utilities for construction IO sources from the input parquet files. + * @brief Utilities for constructing the specified IO sources from the input parquet files. 
* */ From 52e6953f97498bd06f679be9aa0018d69fcc6148 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 1 Oct 2024 17:40:00 -0700 Subject: [PATCH 24/37] Update cpp/examples/parquet_io/common_utils.cpp --- cpp/examples/parquet_io/common_utils.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/examples/parquet_io/common_utils.cpp b/cpp/examples/parquet_io/common_utils.cpp index aa3d4c922e4..0c78a2d2877 100644 --- a/cpp/examples/parquet_io/common_utils.cpp +++ b/cpp/examples/parquet_io/common_utils.cpp @@ -1,4 +1,3 @@ - /* * Copyright (c) 2024, NVIDIA CORPORATION. * From 6194a50e75612e3c997063d845472963380ef136 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 2 Oct 2024 04:33:02 +0000 Subject: [PATCH 25/37] Do not use `fmtlib` --- cpp/examples/parquet_io/common_utils.cpp | 78 ++++++------------- cpp/examples/parquet_io/common_utils.hpp | 6 ++ cpp/examples/parquet_io/io_source.hpp | 14 +--- cpp/examples/parquet_io/parquet_io.cpp | 12 +-- .../parquet_io/parquet_io_multithreaded.cpp | 58 ++++++-------- 5 files changed, 63 insertions(+), 105 deletions(-) diff --git a/cpp/examples/parquet_io/common_utils.cpp b/cpp/examples/parquet_io/common_utils.cpp index 0c78a2d2877..3b89a66c902 100644 --- a/cpp/examples/parquet_io/common_utils.cpp +++ b/cpp/examples/parquet_io/common_utils.cpp @@ -25,8 +25,8 @@ #include #include -#include - +#include +#include #include /** @@ -35,12 +35,6 @@ * */ -/** - * @brief Create memory resource for libcudf functions - * - * @param pool Whether to use a pool memory resource. 
- * @return Memory resource instance - */ std::shared_ptr create_memory_resource(bool is_pool_used) { auto cuda_mr = std::make_shared(); @@ -51,12 +45,6 @@ std::shared_ptr create_memory_resource(bool is_ return cuda_mr; } -/** - * @brief Get encoding type from the keyword - * - * @param name encoding keyword name - * @return corresponding column encoding type - */ cudf::io::column_encoding get_encoding_type(std::string name) { using encoding_type = cudf::io::column_encoding; @@ -72,20 +60,13 @@ cudf::io::column_encoding get_encoding_type(std::string name) std::transform(name.begin(), name.end(), name.begin(), ::toupper); if (map.find(name) != map.end()) { return map.at(name); } - throw std::invalid_argument(fmt::format(fmt::emphasis::bold | fg(fmt::color::red), - "{} is not a valid encoding type.\n\n" - "Available encoding types: DEFAULT, DICTIONARY, PLAIN,\n" - "DELTA_BINARY_PACKED, DELTA_LENGTH_BYTE_ARRAY,\n" - "DELTA_BYTE_ARRAY\n\n", - name)); + throw std::invalid_argument(name + + " is not a valid encoding type.\n\n" + "Available encoding types: DEFAULT, DICTIONARY, PLAIN,\n" + "DELTA_BINARY_PACKED, DELTA_LENGTH_BYTE_ARRAY,\n" + "DELTA_BYTE_ARRAY\n\n"); } -/** - * @brief Get compression type from the keyword - * - * @param name compression keyword name - * @return corresponding compression type - */ cudf::io::compression_type get_compression_type(std::string name) { using compression_type = cudf::io::compression_type; @@ -99,19 +80,12 @@ cudf::io::compression_type get_compression_type(std::string name) std::transform(name.begin(), name.end(), name.begin(), ::toupper); if (map.find(name) != map.end()) { return map.at(name); } - throw std::invalid_argument(fmt::format(fmt::emphasis::bold | fg(fmt::color::red), - "{} is not a valid compression type.\n\n" - "Available compression types: NONE, AUTO, SNAPPY,\n" - "LZ4, ZSTD\n\n", - name)); + throw std::invalid_argument(name + + " is not a valid compression type.\n\n" + "Available compression types: NONE, AUTO, 
SNAPPY,\n" + "LZ4, ZSTD\n\n"); } -/** - * @brief Get boolean from they keyword - * - * @param input keyword affirmation string such as: Y, T, YES, TRUE, ON - * @return true or false - */ bool get_boolean(std::string input) { std::transform(input.begin(), input.end(), input.begin(), ::toupper); @@ -125,12 +99,6 @@ bool get_boolean(std::string input) } } -/** - * @brief Check if two tables are identical, throw an error otherwise - * - * @param lhs_table View to lhs table - * @param rhs_table View to rhs table - */ void check_identical_tables(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table) { try { @@ -141,23 +109,13 @@ void check_identical_tables(cudf::table_view const& lhs_table, cudf::table_view // No exception thrown, check indices auto const valid = indices->size() == 0; - fmt::print( - fmt::emphasis::bold | fg(fmt::color::green_yellow), "Tables identical: {}\n\n", valid); + std::cout << "Tables identical: " << valid << "\n\n"; } catch (std::exception& e) { std::cerr << e.what() << std::endl << std::endl; - throw std::runtime_error( - fmt::format(fmt::emphasis::bold | fg(fmt::color::red), "Tables identical: false\n\n")); + throw std::runtime_error("Tables identical: false\n\n"); } } -/** - * @brief Concatenate a vector of tables and return the resultant table - * - * @param tables Vector of tables to concatenate - * @param stream CUDA stream to use - * - * @return Unique pointer to the resultant concatenated table. 
- */ std::unique_ptr concatenate_tables(std::vector> tables, rmm::cuda_stream_view stream) { @@ -172,3 +130,13 @@ std::unique_ptr concatenate_tables(std::vector concatenate_tables(std::vector> tables, rmm::cuda_stream_view stream); + +/** + * @brief Returns a string containing current date and time + * + */ +std::string current_time_and_date(); \ No newline at end of file diff --git a/cpp/examples/parquet_io/io_source.hpp b/cpp/examples/parquet_io/io_source.hpp index d52470ab5c7..6ccc031d382 100644 --- a/cpp/examples/parquet_io/io_source.hpp +++ b/cpp/examples/parquet_io/io_source.hpp @@ -24,9 +24,6 @@ #include -#include -#include - #include #include @@ -97,11 +94,9 @@ struct pinned_allocator : public std::allocator { if (map.find(name) != map.end()) { return map.at(name); } else { - throw std::invalid_argument( - fmt::format(fmt::emphasis::bold | fg(fmt::color::red), - "{} is not a valid io source type. Available: FILEPATH,\n" - "HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER.\n\n", - name)); + throw std::invalid_argument(name + + " is not a valid io source type. 
Available: FILEPATH,\n" + "HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER.\n\n"); } } @@ -151,8 +146,7 @@ class io_source { break; } default: { - throw std::runtime_error(fmt::format(fmt::emphasis::bold | fg(fmt::color::red), - "Encountered unexpected source type\n\n")); + throw std::runtime_error("Encountered unexpected source type\n\n"); } } } diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 08dbaa0bdd6..513bd9c0518 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -136,16 +136,16 @@ int main(int argc, char const** argv) // Read input parquet file // We do not want to time the initial read time as it may include // time for nvcomp, cufile loading and RMM growth - fmt::print("\nReading {}...", input_filepath); - fmt::print( - "Note: Not timing the initial parquet read as it may include\n" - "times for nvcomp, cufile loading and RMM growth.\n\n"); + std::cout << "\nReading " << input_filepath << "..."; + std::cout << "Note: Not timing the initial parquet read as it may include\n" + "times for nvcomp, cufile loading and RMM growth.\n\n"; auto [input, metadata] = read_parquet(input_filepath); // Status string to indicate if page stats are set to be written or not auto page_stat_string = (page_stats.has_value()) ? 
"page stats" : "no page stats"; // Write parquet file with the specified encoding and compression - fmt::print("Writing {} with encoding, compression and {}..\n", output_filepath, page_stat_string); + std::cout << "Writing " << output_filepath << " with encoding, compression and " + << page_stat_string << "..\n"; // `timer` is automatically started here cudf::examples::timer timer; @@ -153,7 +153,7 @@ int main(int argc, char const** argv) timer.print_elapsed_millis(); // Read the parquet file written with encoding and compression - fmt::print("Reading {}...\n", output_filepath); + std::cout << "Reading " << output_filepath << "...\n"; // Reset the timer timer.reset(); diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 3a46552e863..4e3f61866db 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -27,9 +27,6 @@ #include #include -#include -#include - #include #include #include @@ -224,20 +221,17 @@ void write_parquet_multithreaded(std::string const& output_path, */ void print_usage() { - fmt::print( - fg(fmt::color::yellow), - "\nUsage: parquet_io_multithreaded \n" - " \n" - " \n\n"); - fmt::print( - "Available IO source types: FILEPATH, HOST_BUFFER, {}, DEVICE_BUFFER\n\n", - fmt::format(fmt::emphasis::bold | fg(fmt::color::green_yellow), "PINNED_BUFFER (Default)")); - fmt::print(fg(fmt::color::light_sky_blue), - "Note: Provide as many arguments as you like in the above order. Default values\n" - " for the unprovided arguments will be used. All input parquet files will\n" - " be converted to the specified IO source type before reading\n\n"); + std::cout + << "\nUsage: parquet_io_multithreaded \n" + " \n" + " \n\n" + "Available IO source types: FILEPATH, HOST_BUFFER, PINNED_BUFFER (Default), " + "DEVICE_BUFFER\n\n" + "Note: Provide as many arguments as you like in the above order. 
Default values\n" + " for the unprovided arguments will be used. All input parquet files will\n" + " be converted to the specified IO source type before reading\n\n"; } /** @@ -288,7 +282,7 @@ std::vector extract_input_sources_async(std::string const& paths, if (std::filesystem::is_regular_file(file.path())) { parquet_files.push_back(file.path().string()); } else { - fmt::print("Skipping sub-directory: {}\n", file.path().string()); + std::cout << "Skipping sub-directory: " << file.path().string() << "\n"; } } } else { @@ -385,17 +379,14 @@ int32_t main(int argc, char const** argv) // Read the same parquet files specified times with multiple threads and discard the read tables { // Print status - fmt::print( - "\nReading {} input sources {} time(s) using {} threads and discarding output " - "tables..\n", - input_sources.size(), - num_reads, - thread_count); + std::cout << "\nReading " << input_sources.size() << " input sources " << num_reads + << " time(s) using " << thread_count + << " threads and discarding output " + "tables..\n"; if (io_source_type == io_source_type::FILEPATH) { - fmt::print(fg(fmt::color::yellow), - "Note that the first read may include times for nvcomp, cufile loading and RMM " - "growth.\n\n"); + std::cout << "Note that the first read may include times for nvcomp, cufile loading and RMM " + "growth.\n\n"; } cudf::examples::timer timer; @@ -428,10 +419,11 @@ int32_t main(int argc, char const** argv) }(); // Write tables to parquet - fmt::print("Writing parquet output files..\n"); + std::cout << "Writing parquet output files..\n"; + // Create a directory at the tmpdir path. 
- std::string output_path = std::filesystem::temp_directory_path().string() + "/output_" + - fmt::format("{:%Y-%m-%d-%H-%M-%S}", std::chrono::system_clock::now()); + std::string output_path = + std::filesystem::temp_directory_path().string() + "/output_" + current_time_and_date(); std::filesystem::create_directory({output_path}); cudf::examples::timer timer; write_parquet_multithreaded(output_path, table_views, thread_count, stream_pool); @@ -439,7 +431,7 @@ int32_t main(int argc, char const** argv) timer.print_elapsed_millis(); // Verify the output - fmt::print("Verifying output..\n"); + std::cout << "Verifying output..\n"; // Simply concatenate the previously read tables from input sources auto const input_table = cudf::concatenate(table_views, default_stream); @@ -463,9 +455,7 @@ int32_t main(int argc, char const** argv) } // Print peak memory - fmt::print(fmt::emphasis::bold | fg(fmt::color::medium_purple), - "Peak memory: {} MB\n\n", - (stats_mr.get_bytes_counter().peak / 1048576.0)); + std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n\n"; return 0; } From 3420c3f0f20a7a6d61e5e1b96072a52e7ed27e52 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 2 Oct 2024 16:53:03 +0000 Subject: [PATCH 26/37] Minor style fix --- cpp/examples/parquet_io/common_utils.cpp | 2 +- cpp/examples/parquet_io/common_utils.hpp | 2 +- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/examples/parquet_io/common_utils.cpp b/cpp/examples/parquet_io/common_utils.cpp index 3b89a66c902..abb03f33c15 100644 --- a/cpp/examples/parquet_io/common_utils.cpp +++ b/cpp/examples/parquet_io/common_utils.cpp @@ -131,7 +131,7 @@ std::unique_ptr concatenate_tables(std::vector concatenate_tables(std::vector Date: Wed, 2 Oct 2024 10:25:35 -0700 Subject: [PATCH 27/37] Minor change --- cpp/examples/parquet_io/parquet_io.cpp | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 513bd9c0518..12b9e5525a3 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -136,7 +136,7 @@ int main(int argc, char const** argv) // Read input parquet file // We do not want to time the initial read time as it may include // time for nvcomp, cufile loading and RMM growth - std::cout << "\nReading " << input_filepath << "..."; + std::cout << "\nReading " << input_filepath << "...\n"; std::cout << "Note: Not timing the initial parquet read as it may include\n" "times for nvcomp, cufile loading and RMM growth.\n\n"; auto [input, metadata] = read_parquet(input_filepath); From 70ec6fd328ca4093510df752ab53b04e05d40739 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 3 Oct 2024 03:35:58 +0000 Subject: [PATCH 28/37] Address minor nits from reviews --- cpp/examples/parquet_io/common_utils.cpp | 2 +- cpp/examples/parquet_io/common_utils.hpp | 6 ++++-- cpp/examples/parquet_io/parquet_io.cpp | 2 +- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/examples/parquet_io/common_utils.cpp b/cpp/examples/parquet_io/common_utils.cpp index abb03f33c15..62a5f4bdeae 100644 --- a/cpp/examples/parquet_io/common_utils.cpp +++ b/cpp/examples/parquet_io/common_utils.cpp @@ -30,7 +30,7 @@ #include /** - * @file commons.cpp + * @file common_utils.cpp * @brief Definitions for common utilities for `parquet_io` examples * */ diff --git a/cpp/examples/parquet_io/common_utils.hpp b/cpp/examples/parquet_io/common_utils.hpp index 5aa62a4fb68..12896e61a0d 100644 --- a/cpp/examples/parquet_io/common_utils.hpp +++ b/cpp/examples/parquet_io/common_utils.hpp @@ -19,12 +19,14 @@ #include #include +#include #include +#include #include /** - * @file commons.hpp + * @file common_utils.hpp * 
@brief Common utilities for `parquet_io` examples * */ @@ -67,7 +69,7 @@ std::shared_ptr create_memory_resource(bool is_ * @param lhs_table View to lhs table * @param rhs_table View to rhs table */ -void check_identical_tables(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table); +void check_tables_equal(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table); /** * @brief Concatenate a vector of tables and return the resultant table diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 12b9e5525a3..aeb47225cac 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -161,7 +161,7 @@ int main(int argc, char const** argv) timer.print_elapsed_millis(); // Check for validity - check_identical_tables(input->view(), transcoded_input->view()); + check_tables_equal(input->view(), transcoded_input->view()); return 0; } diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index d5dee85c7e0..19af739032d 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -448,7 +448,7 @@ int32_t main(int argc, char const** argv) default_stream.synchronize(); // Check if the tables are identical - check_identical_tables(input_table->view(), transcoded_table->view()); + check_tables_equal(input_table->view(), transcoded_table->view()); // Remove the created temp directory and parquet data std::filesystem::remove_all(output_path); From 00390cd342a8e910fef07a6e298769e8cce3f6c2 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 3 Oct 2024 12:16:37 -0700 Subject: [PATCH 29/37] Update cpp/examples/parquet_io/parquet_io_multithreaded.cpp Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com> --- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 19af739032d..e7966357cbd 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -248,7 +248,7 @@ void print_usage() * @param thread_count Number of threads being used in the example * @param io_source_type Specified IO source type to convert input files to * @param stream CUDA stream to use - * + * @return Vector of input sources for the given paths */ std::vector extract_input_sources_async(std::string const& paths, int32_t input_multiplier, From 5ad8ecdb08aca5831ba24af9e9e09e92b78d3cf6 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 3 Oct 2024 19:27:11 +0000 Subject: [PATCH 30/37] Move code to cpp files and minor refactoring --- ci/run_cudf_examples.sh | 4 +- cpp/examples/parquet_io/CMakeLists.txt | 8 +- cpp/examples/parquet_io/common_utils.cpp | 2 +- cpp/examples/parquet_io/io_source.cpp | 99 +++++++++++++++++++ cpp/examples/parquet_io/io_source.hpp | 88 +++-------------- .../parquet_io/parquet_io_multithreaded.cpp | 4 +- 6 files changed, 121 insertions(+), 84 deletions(-) create mode 100644 cpp/examples/parquet_io/io_source.cpp diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh index 830bb610cc8..f8c3ed20b03 100755 --- a/ci/run_cudf_examples.sh +++ b/ci/run_cudf_examples.sh @@ -26,7 +26,7 @@ compute-sanitizer --tool memcheck custom_with_malloc names.csv compute-sanitizer --tool memcheck parquet_io compute-sanitizer --tool memcheck parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD TRUE -compute-sanitizer --tool memcheck parquet_io_multithreaded -compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet 4 PINNED_BUFFER 2 2 +compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet +compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet 4 
DEVICE_BUFFER 2 2 exit ${EXITCODE} diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt index 7c963e5192b..e2e9919fc49 100644 --- a/cpp/examples/parquet_io/CMakeLists.txt +++ b/cpp/examples/parquet_io/CMakeLists.txt @@ -16,13 +16,13 @@ project( include(../fetch_dependencies.cmake) -add_library(parquet_io_common_utils OBJECT common_utils.cpp) -target_link_libraries(parquet_io_common_utils PRIVATE cudf::cudf) +add_library(parquet_io_utils OBJECT common_utils.cpp io_source.cpp) +target_link_libraries(parquet_io_utils PRIVATE cudf::cudf) # Build and install parquet_io add_executable(parquet_io parquet_io.cpp) target_link_libraries( - parquet_io PRIVATE cudf::cudf nvToolsExt $ + parquet_io PRIVATE cudf::cudf nvToolsExt $ ) target_compile_features(parquet_io PRIVATE cxx_std_17) install(TARGETS parquet_io DESTINATION bin/examples/libcudf) @@ -30,7 +30,7 @@ install(TARGETS parquet_io DESTINATION bin/examples/libcudf) # Build and install parquet_io_multithreaded add_executable(parquet_io_multithreaded parquet_io_multithreaded.cpp) target_link_libraries( - parquet_io_multithreaded PRIVATE cudf::cudf nvToolsExt $ + parquet_io_multithreaded PRIVATE cudf::cudf nvToolsExt $ ) target_compile_features(parquet_io_multithreaded PRIVATE cxx_std_17) install(TARGETS parquet_io_multithreaded DESTINATION bin/examples/libcudf) diff --git a/cpp/examples/parquet_io/common_utils.cpp b/cpp/examples/parquet_io/common_utils.cpp index 62a5f4bdeae..13a8293e64c 100644 --- a/cpp/examples/parquet_io/common_utils.cpp +++ b/cpp/examples/parquet_io/common_utils.cpp @@ -99,7 +99,7 @@ bool get_boolean(std::string input) } } -void check_identical_tables(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table) +void check_tables_equal(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table) { try { // Left anti-join the original and transcoded tables diff --git a/cpp/examples/parquet_io/io_source.cpp b/cpp/examples/parquet_io/io_source.cpp 
new file mode 100644 index 00000000000..342c2749d7a --- /dev/null +++ b/cpp/examples/parquet_io/io_source.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "io_source.hpp" + +#include +#include + +#include +#include + +#include + +#include +#include + +rmm::host_async_resource_ref pinned_memory_resource() +{ + static auto mr = rmm::mr::pinned_host_memory_resource{}; + return mr; +} + +io_source_type get_io_source_type(std::string name) +{ + static std::unordered_map const map = { + {"FILEPATH", io_source_type::FILEPATH}, + {"HOST_BUFFER", io_source_type::HOST_BUFFER}, + {"PINNED_BUFFER", io_source_type::PINNED_BUFFER}, + {"DEVICE_BUFFER", io_source_type::DEVICE_BUFFER}}; + + std::transform(name.begin(), name.end(), name.begin(), ::toupper); + if (map.find(name) != map.end()) { + return map.at(name); + } else { + throw std::invalid_argument(name + + " is not a valid io source type. 
Available: FILEPATH,\n" + "HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER.\n\n"); + } +} + +io_source::io_source(std::string_view file_path, + io_source_type io_type, + rmm::cuda_stream_view stream) + : type{io_type}, + file_name{file_path}, + file_size{std::filesystem::file_size(file_name)}, + pinned_buffer({pinned_memory_resource(), stream}), + d_buffer{0, stream} +{ + // For filepath make a quick source_info and return early + if (type == io_source_type::FILEPATH) { + source_info = cudf::io::source_info(file_name); + return; + } + + std::ifstream file{file_name, std::ifstream::binary}; + + // Copy file contents to the specified io source buffer + switch (type) { + case io_source_type::HOST_BUFFER: { + h_buffer.resize(file_size); + file.read(h_buffer.data(), file_size); + source_info = cudf::io::source_info(h_buffer.data(), file_size); + break; + } + case io_source_type::PINNED_BUFFER: { + pinned_buffer.resize(file_size); + file.read(pinned_buffer.data(), file_size); + source_info = cudf::io::source_info(pinned_buffer.data(), file_size); + break; + } + case io_source_type::DEVICE_BUFFER: { + h_buffer.resize(file_size); + file.read(h_buffer.data(), file_size); + d_buffer.resize(file_size, stream); + CUDF_CUDA_TRY(cudaMemcpyAsync( + d_buffer.data(), h_buffer.data(), file_size, cudaMemcpyDefault, stream.value())); + + source_info = cudf::io::source_info(d_buffer); + break; + } + default: { + throw std::runtime_error("Encountered unexpected source type\n\n"); + } + } +} diff --git a/cpp/examples/parquet_io/io_source.hpp b/cpp/examples/parquet_io/io_source.hpp index 6ccc031d382..a296938adaa 100644 --- a/cpp/examples/parquet_io/io_source.hpp +++ b/cpp/examples/parquet_io/io_source.hpp @@ -17,14 +17,13 @@ #pragma once #include -#include #include -#include +#include +#include #include -#include #include /** @@ -38,16 +37,20 @@ */ enum class io_source_type { FILEPATH, HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER }; +/** + * @brief Get io source type from the string keyword 
argument + * + * @param name io source type keyword name + * @return io source type + */ +[[nodiscard]] io_source_type get_io_source_type(std::string name); + /** * @brief Create and return a reference to a static pinned memory pool * * @return Reference to a static pinned memory pool */ -rmm::host_async_resource_ref pinned_memory_resource() -{ - static auto mr = rmm::mr::pinned_host_memory_resource{}; - return mr; -} +rmm::host_async_resource_ref pinned_memory_resource(); /** * @brief Custom allocator for pinned_buffer via RMM. @@ -76,80 +79,13 @@ struct pinned_allocator : public std::allocator { rmm::cuda_stream_view stream; }; -/** - * @brief Get io source type from the string keyword argument - * - * @param name io source type keyword name - * @return io source type - */ -[[nodiscard]] io_source_type get_io_source_type(std::string name) -{ - static std::unordered_map const map = { - {"FILEPATH", io_source_type::FILEPATH}, - {"HOST_BUFFER", io_source_type::HOST_BUFFER}, - {"PINNED_BUFFER", io_source_type::PINNED_BUFFER}, - {"DEVICE_BUFFER", io_source_type::DEVICE_BUFFER}}; - - std::transform(name.begin(), name.end(), name.begin(), ::toupper); - if (map.find(name) != map.end()) { - return map.at(name); - } else { - throw std::invalid_argument(name + - " is not a valid io source type. 
Available: FILEPATH,\n" - "HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER.\n\n"); - } -} - /** * @brief Class to create a cudf::io::source_info of given type from the input parquet file * */ class io_source { public: - io_source(std::string_view file_path, io_source_type io_type, rmm::cuda_stream_view stream) - : type{io_type}, - file_name{file_path}, - file_size{std::filesystem::file_size(file_name)}, - pinned_buffer({pinned_memory_resource(), stream}), - d_buffer{0, stream} - { - // For filepath make a quick source_info and return early - if (type == io_source_type::FILEPATH) { - source_info = cudf::io::source_info(file_name); - return; - } - - std::ifstream file{file_name, std::ifstream::binary}; - - // Copy file contents to the specified io source buffer - switch (type) { - case io_source_type::HOST_BUFFER: { - h_buffer.resize(file_size); - file.read(h_buffer.data(), file_size); - source_info = cudf::io::source_info(h_buffer.data(), file_size); - break; - } - case io_source_type::PINNED_BUFFER: { - pinned_buffer.resize(file_size); - file.read(pinned_buffer.data(), file_size); - source_info = cudf::io::source_info(pinned_buffer.data(), file_size); - break; - } - case io_source_type::DEVICE_BUFFER: { - h_buffer.resize(file_size); - file.read(h_buffer.data(), file_size); - d_buffer.resize(file_size, stream); - CUDF_CUDA_TRY(cudaMemcpyAsync( - d_buffer.data(), h_buffer.data(), file_size, cudaMemcpyDefault, stream.value())); - - source_info = cudf::io::source_info(d_buffer); - break; - } - default: { - throw std::runtime_error("Encountered unexpected source type\n\n"); - } - } - } + io_source(std::string_view file_path, io_source_type io_type, rmm::cuda_stream_view stream); // Get the internal source info [[nodiscard]] cudf::io::source_info get_source_info() const { return source_info; } diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index e7966357cbd..b621dcde1c9 100644 --- 
a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -248,6 +248,7 @@ void print_usage() * @param thread_count Number of threads being used in the example * @param io_source_type Specified IO source type to convert input files to * @param stream CUDA stream to use + * * @return Vector of input sources for the given paths */ std::vector extract_input_sources_async(std::string const& paths, @@ -286,6 +287,7 @@ std::vector extract_input_sources_async(std::string const& paths, } } } else { + print_usage(); throw std::runtime_error("Encountered an invalid input path\n"); } }); @@ -353,7 +355,6 @@ int32_t main(int argc, char const** argv) } else input_paths = std::string{argv[1]}; [[fallthrough]]; - case 1: break; default: print_usage(); throw std::runtime_error(""); } @@ -373,6 +374,7 @@ int32_t main(int argc, char const** argv) // Check if there is nothing to do if (input_sources.empty()) { + print_usage(); throw std::runtime_error("No input files to read. 
Exiting early.\n"); } From 74763b05de8795af40db3d27c7708c8c9e262ab7 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 3 Oct 2024 19:51:13 +0000 Subject: [PATCH 31/37] Minor style fix --- cpp/examples/parquet_io/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt index e2e9919fc49..3b87efbac7e 100644 --- a/cpp/examples/parquet_io/CMakeLists.txt +++ b/cpp/examples/parquet_io/CMakeLists.txt @@ -21,9 +21,7 @@ target_link_libraries(parquet_io_utils PRIVATE cudf::cudf) # Build and install parquet_io add_executable(parquet_io parquet_io.cpp) -target_link_libraries( - parquet_io PRIVATE cudf::cudf nvToolsExt $ -) +target_link_libraries(parquet_io PRIVATE cudf::cudf nvToolsExt $) target_compile_features(parquet_io PRIVATE cxx_std_17) install(TARGETS parquet_io DESTINATION bin/examples/libcudf) From 06afb49d5879a3a53fb14555dae400ada2821eda Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 4 Oct 2024 17:52:13 +0000 Subject: [PATCH 32/37] Minor updates --- cpp/examples/parquet_io/CMakeLists.txt | 1 + cpp/examples/parquet_io/io_source.cpp | 1 + cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 11 ++++++----- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt index 3b87efbac7e..a7d0146b170 100644 --- a/cpp/examples/parquet_io/CMakeLists.txt +++ b/cpp/examples/parquet_io/CMakeLists.txt @@ -17,6 +17,7 @@ project( include(../fetch_dependencies.cmake) add_library(parquet_io_utils OBJECT common_utils.cpp io_source.cpp) +target_compile_features(parquet_io_utils PRIVATE cxx_std_17) target_link_libraries(parquet_io_utils PRIVATE cudf::cudf) # Build and install parquet_io diff --git a/cpp/examples/parquet_io/io_source.cpp b/cpp/examples/parquet_io/io_source.cpp index 
342c2749d7a..d3cdf6bb276 100644 --- a/cpp/examples/parquet_io/io_source.cpp +++ b/cpp/examples/parquet_io/io_source.cpp @@ -25,6 +25,7 @@ #include #include +#include #include rmm::host_async_resource_ref pinned_memory_resource() diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index b621dcde1c9..6b6a147f3a4 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -30,6 +30,7 @@ #include #include #include +#include /** * @file parquet_io_multithreaded.cpp @@ -349,11 +350,11 @@ int32_t main(int argc, char const** argv) input_multiplier = std::max(input_multiplier, std::stoi(std::string{argv[2]})); [[fallthrough]]; case 2: - if (auto arg = std::string{argv[1]}; arg == "-h" or arg == "--help") { - print_usage(); - return 0; - } else - input_paths = std::string{argv[1]}; + // Check if instead of input_paths, the first argument is `-h` or `--help` + if (auto arg = std::string{argv[1]}; arg != "-h" and arg != "--help") { + input_paths = std::move(arg); + break; + } [[fallthrough]]; default: print_usage(); throw std::runtime_error(""); } From 1a044099c9a0bb400ea0c4a57c20cdcd4bcfe66f Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 4 Oct 2024 23:47:52 +0000 Subject: [PATCH 33/37] Nits from code reviews --- cpp/examples/parquet_io/common_utils.cpp | 7 +--- cpp/examples/parquet_io/io_source.cpp | 13 +++--- cpp/examples/parquet_io/io_source.hpp | 4 -- .../parquet_io/parquet_io_multithreaded.cpp | 40 ++++++++++--------- 4 files changed, 27 insertions(+), 37 deletions(-) diff --git a/cpp/examples/parquet_io/common_utils.cpp b/cpp/examples/parquet_io/common_utils.cpp index 13a8293e64c..a79ca48af86 100644 --- a/cpp/examples/parquet_io/common_utils.cpp +++ b/cpp/examples/parquet_io/common_utils.cpp @@ -91,12 +91,7 @@ bool get_boolean(std::string input) std::transform(input.begin(), 
input.end(), input.begin(), ::toupper); // Check if the input string matches to any of the following - if (not input.compare("ON") or not input.compare("TRUE") or not input.compare("YES") or - not input.compare("Y") or not input.compare("T")) { - return true; - } else { - return false; - } + return input == "ON" or input == "TRUE" or input == "YES" or input == "Y" or input == "T"; } void check_tables_equal(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table) diff --git a/cpp/examples/parquet_io/io_source.cpp b/cpp/examples/parquet_io/io_source.cpp index d3cdf6bb276..019b3f96474 100644 --- a/cpp/examples/parquet_io/io_source.cpp +++ b/cpp/examples/parquet_io/io_source.cpp @@ -52,15 +52,12 @@ io_source_type get_io_source_type(std::string name) } } -io_source::io_source(std::string_view file_path, - io_source_type io_type, - rmm::cuda_stream_view stream) - : type{io_type}, - file_name{file_path}, - file_size{std::filesystem::file_size(file_name)}, - pinned_buffer({pinned_memory_resource(), stream}), - d_buffer{0, stream} +io_source::io_source(std::string_view file_path, io_source_type type, rmm::cuda_stream_view stream) + : pinned_buffer({pinned_memory_resource(), stream}), d_buffer{0, stream} { + std::string const file_name{file_path}; + auto const file_size = std::filesystem::file_size(file_name); + // For filepath make a quick source_info and return early if (type == io_source_type::FILEPATH) { source_info = cudf::io::source_info(file_name); diff --git a/cpp/examples/parquet_io/io_source.hpp b/cpp/examples/parquet_io/io_source.hpp index a296938adaa..a614d348fae 100644 --- a/cpp/examples/parquet_io/io_source.hpp +++ b/cpp/examples/parquet_io/io_source.hpp @@ -94,10 +94,6 @@ class io_source { // alias for pinned vector template using pinned_vector = thrust::host_vector>; - - io_source_type const type; - std::string const file_name; - size_t const file_size; cudf::io::source_info source_info; std::vector h_buffer; pinned_vector pinned_buffer; diff 
--git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 6b6a147f3a4..32d5aaa0e41 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -55,7 +55,7 @@ using table_t = std::unique_ptr; * @brief Behavior when handling the read tables by multiple threads */ enum class read_mode { - NOWORK, ///< Only read and discard tables + NO_CONCATENATE, ///< Only read and discard tables CONCATENATE_THREAD, ///< Read and concatenate tables from each thread CONCATENATE_ALL, ///< Read and concatenate everything to a single table }; @@ -82,15 +82,15 @@ struct read_fn { auto builder = cudf::io::parquet_reader_options::builder(input_sources[curr_file_idx].get_source_info()); auto const options = builder.build(); - if constexpr (READ_FN != read_mode::NOWORK) { + if constexpr (READ_FN != read_mode::NO_CONCATENATE) { tables_this_thread.push_back(cudf::io::read_parquet(options, stream).tbl); } else { cudf::io::read_parquet(options, stream); } } - // Concatenate the tables read by this thread if not NOWORK read_mode. - if constexpr (READ_FN != read_mode::NOWORK) { + // Concatenate the tables read by this thread if not NO_CONCATENATE read_mode. 
+ if constexpr (READ_FN != read_mode::NO_CONCATENATE) { auto table = concatenate_tables(std::move(tables_this_thread), stream); stream.synchronize_no_throw(); tables[thread_id] = std::move(table); @@ -107,7 +107,7 @@ struct read_fn { * @tparam read_mode Specifies if to concatenate and return the actual * tables or discard them and return an empty vector * - * @param files List of files to read + * @param input_sources List of input sources to read * @param thread_count Number of threads * @param stream_pool CUDA stream pool to use for threads * @@ -136,7 +136,7 @@ std::vector read_parquet_multithreaded(std::vector const& in std::vector threads; threads.reserve(thread_count); for (auto& c : read_tasks) { - threads.emplace_back(std::thread{c}); + threads.emplace_back(c); } for (auto& t : threads) { t.join(); @@ -210,7 +210,7 @@ void write_parquet_multithreaded(std::string const& output_path, std::vector threads; threads.reserve(thread_count); for (auto& c : write_tasks) { - threads.emplace_back(std::thread{c}); + threads.emplace_back(c); } for (auto& t : threads) { t.join(); @@ -237,12 +237,12 @@ void print_usage() /** * @brief Function to process comma delimited input paths string to parquet files and/or dirs - * and asynchronously convert them to specified io sources. + * and convert them to specified io sources. * * Process the input path string containing directories (of parquet files) and/or individual * parquet files into a list of input parquet files, multiple the list by `input_multiplier`, * make sure to have at least `thread_count` files to satisfy at least file per parallel thread, - * and asynchronously convert the final list of files to a list of `io_source` and return. + * and convert the final list of files to a list of `io_source` and return. 
* * @param paths Comma delimited input paths string * @param input_multiplier Multiplier for the input files list @@ -252,11 +252,11 @@ void print_usage() * * @return Vector of input sources for the given paths */ -std::vector extract_input_sources_async(std::string const& paths, - int32_t input_multiplier, - int32_t thread_count, - io_source_type io_source_type, - rmm::cuda_stream_view stream) +std::vector extract_input_sources(std::string const& paths, + int32_t input_multiplier, + int32_t thread_count, + io_source_type io_source_type, + rmm::cuda_stream_view stream) { // Get the delimited paths to directory and/or files. std::vector const delimited_paths = [&]() { @@ -310,6 +310,9 @@ std::vector extract_input_sources_async(std::string const& paths, }); // Cycle append parquet files from the existing ones if less than the thread_count + std::cout << "Warning: Number of input sources < thread count. Cycling from\n" + "and appending to current input sources such that the number of\n" + "input source == thread count\n"; for (size_t idx = 0; thread_count > static_cast(parquet_files.size()); idx++) { parquet_files.emplace_back(parquet_files[idx % initial_size]); } @@ -324,6 +327,7 @@ std::vector extract_input_sources_async(std::string const& paths, [&](auto const& file_name) { return io_source{file_name, io_source_type, stream}; }); + stream.synchronize(); return input_sources; } @@ -369,9 +373,8 @@ int32_t main(int argc, char const** argv) rmm::mr::set_current_device_resource(&stats_mr); // List of input sources from the input_paths string. 
- auto const input_sources = extract_input_sources_async( + auto const input_sources = extract_input_sources( input_paths, input_multiplier, thread_count, io_source_type, default_stream); - default_stream.synchronize(); // Check if there is nothing to do if (input_sources.empty()) { @@ -396,7 +399,7 @@ int32_t main(int argc, char const** argv) std::for_each(thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_reads), [&](auto i) { // Read parquet files and discard the tables - std::ignore = read_parquet_multithreaded( + std::ignore = read_parquet_multithreaded( input_sources, thread_count, stream_pool); }); default_stream.synchronize(); @@ -440,9 +443,8 @@ int32_t main(int argc, char const** argv) auto const input_table = cudf::concatenate(table_views, default_stream); // Sources from written parquet files - auto const written_pq_sources = extract_input_sources_async( + auto const written_pq_sources = extract_input_sources( output_path, input_multiplier, thread_count, io_source_type, default_stream); - default_stream.synchronize(); // read_mode::CONCATENATE_ALL returns a concatenated vector of 1 table only auto const transcoded_table = std::move(read_parquet_multithreaded( From 3a590275242658600b78ee7eb7ccac01d88a7f5a Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 10 Oct 2024 20:20:18 +0000 Subject: [PATCH 34/37] Minor arg setting --- ci/run_cudf_examples.sh | 2 +- cpp/examples/parquet_io/parquet_io.cpp | 51 +++++++++++++++----------- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh index f8c3ed20b03..2439af5b644 100755 --- a/ci/run_cudf_examples.sh +++ b/ci/run_cudf_examples.sh @@ -23,7 +23,7 @@ compute-sanitizer --tool memcheck custom_optimized names.csv compute-sanitizer --tool memcheck custom_prealloc names.csv compute-sanitizer --tool memcheck custom_with_malloc names.csv -compute-sanitizer --tool memcheck parquet_io 
+compute-sanitizer --tool memcheck parquet_io example.parquet compute-sanitizer --tool memcheck parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD TRUE compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index aeb47225cac..9c34b342b62 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -85,6 +85,18 @@ void write_parquet(cudf::table_view input, cudf::io::write_parquet(options); } +/** + * @brief Function to print example usage and argument information. + */ +void print_usage() +{ + std::cout << "\nUsage: parquet_io \n" + " \n\n" + "Available encoding types: DEFAULT, DICTIONARY, PLAIN, DELTA_BINARY_PACKED,\n" + " DELTA_LENGTH_BYTE_ARRAY, DELTA_BYTE_ARRAY\n\n" + "Available compression types: NONE, AUTO, SNAPPY, LZ4, ZSTD\n\n"; +} + /** * @brief Main for nested_types examples * @@ -101,31 +113,28 @@ void write_parquet(cudf::table_view input, */ int main(int argc, char const** argv) { - std::string input_filepath; - std::string output_filepath; - cudf::io::column_encoding encoding; - cudf::io::compression_type compression; - std::optional page_stats; + std::string input_filepath = "example.parquet"; + std::string output_filepath = "output.parquet"; + cudf::io::column_encoding encoding = get_encoding_type("DELTA_BINARY_PACKED"); + cudf::io::compression_type compression = get_compression_type("ZSTD"); + std::optional page_stats = std::nullopt; switch (argc) { - case 1: - input_filepath = "example.parquet"; - output_filepath = "output.parquet"; - encoding = get_encoding_type("DELTA_BINARY_PACKED"); - compression = get_compression_type("ZSTD"); - break; case 6: - if (get_boolean(argv[5])) { page_stats = cudf::io::statistics_freq::STATISTICS_COLUMN; }; + page_stats = get_boolean(argv[5]) + ? 
std::make_optional(cudf::io::statistics_freq::STATISTICS_COLUMN) + : std::nullopt; + [[fallthrough]]; + case 5: compression = get_compression_type(argv[4]); [[fallthrough]]; + case 4: encoding = get_encoding_type(argv[3]); [[fallthrough]]; + case 3: output_filepath = argv[2]; [[fallthrough]]; + case 2: // Check if instead of input_paths, the first argument is `-h` or `--help` + if (auto arg = std::string{argv[1]}; arg != "-h" and arg != "--help") { + input_filepath = std::move(arg); + break; + } [[fallthrough]]; - case 5: - input_filepath = argv[1]; - output_filepath = argv[2]; - encoding = get_encoding_type(argv[3]); - compression = get_compression_type(argv[4]); - break; - default: - throw std::runtime_error( - "Either provide all command-line arguments, or none to use defaults\n"); + default: print_usage(); throw std::runtime_error(""); } // Create and use a memory pool From 7cfd7ae4b0edc7547ed4e354f332dfa253aa4c21 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 11 Oct 2024 00:05:34 +0000 Subject: [PATCH 35/37] Adjust spacing --- cpp/examples/parquet_io/parquet_io.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 9c34b342b62..c11b8de82b5 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -91,7 +91,7 @@ void write_parquet(cudf::table_view input, void print_usage() { std::cout << "\nUsage: parquet_io \n" - " \n\n" + " \n\n" "Available encoding types: DEFAULT, DICTIONARY, PLAIN, DELTA_BINARY_PACKED,\n" " DELTA_LENGTH_BYTE_ARRAY, DELTA_BYTE_ARRAY\n\n" "Available compression types: NONE, AUTO, SNAPPY, LZ4, ZSTD\n\n"; From d9102f00f9b7589fdbf8d456c019d42bca1f75ad Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 11 Oct 2024 00:07:28 +0000 Subject: [PATCH 36/37] Apply suggestion --- 
cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 32d5aaa0e41..82f55473e7e 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -406,7 +406,7 @@ int32_t main(int argc, char const** argv) timer.print_elapsed_millis(); } - // Do we need to write parquet files and validate? + // Write parquet files and validate if needed if (write_and_validate) { // read_mode::CONCATENATE_THREADS returns a vector of `thread_count` tables auto const tables = read_parquet_multithreaded( From b61f18ee1769c2b67c969c8ccdec9fc150292997 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 11 Oct 2024 18:48:28 +0000 Subject: [PATCH 37/37] Minor --- cpp/examples/parquet_io/parquet_io_multithreaded.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp index 82f55473e7e..6ad4b862240 100644 --- a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -63,7 +63,7 @@ enum class read_mode { /** * @brief Functor for multithreaded parquet reading based on the provided read_mode */ -template +template struct read_fn { std::vector const& input_sources; std::vector& tables; @@ -82,7 +82,7 @@ struct read_fn { auto builder = cudf::io::parquet_reader_options::builder(input_sources[curr_file_idx].get_source_info()); auto const options = builder.build(); - if constexpr (READ_FN != read_mode::NO_CONCATENATE) { + if constexpr (read_mode != read_mode::NO_CONCATENATE) { tables_this_thread.push_back(cudf::io::read_parquet(options, stream).tbl); } else { cudf::io::read_parquet(options, stream); @@ -90,7 +90,7 @@ struct 
read_fn { } // Concatenate the tables read by this thread if not NO_CONCATENATE read_mode. - if constexpr (READ_FN != read_mode::NO_CONCATENATE) { + if constexpr (read_mode != read_mode::NO_CONCATENATE) { auto table = concatenate_tables(std::move(tables_this_thread), stream); stream.synchronize_no_throw(); tables[thread_id] = std::move(table);