From 60f30d831325d5816e6968e8037796b8ce1dc579 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Tue, 27 Aug 2024 17:45:33 -0700
Subject: [PATCH] Use `make_host_vector` instead of `make_std_vector` to
 facilitate pinned memory optimizations (#16386)

Replaced most of `make_std_vector` calls with `make_host_vector` to allow
pinned memory and kernel copies, when enabled.
Skipped places where the change would impact the public API.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Shruti Shivakumar (https://github.com/shrshi)

URL: https://github.com/rapidsai/cudf/pull/16386
---
 cpp/include/cudf/detail/gather.cuh         |  2 +-
 cpp/src/io/csv/csv_gpu.cu                  |  4 +--
 cpp/src/io/csv/csv_gpu.hpp                 |  2 +-
 cpp/src/io/csv/reader_impl.cu              |  2 +-
 cpp/src/io/json/json_column.cu             | 42 +++++++++++-----------
 cpp/src/io/orc/writer_impl.cu              |  8 ++---
 cpp/src/io/orc/writer_impl.hpp             |  5 +--
 cpp/src/io/parquet/predicate_pushdown.cpp  |  2 +-
 cpp/src/io/parquet/reader_impl_chunking.cu | 20 +++++------
 cpp/src/io/parquet/writer_impl.cu          | 22 ++++++------
 cpp/src/io/utilities/datasource.cpp        |  4 +--
 cpp/src/text/jaccard.cu                    |  2 +-
 12 files changed, 57 insertions(+), 58 deletions(-)

diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh
index 41f5494f78f..df6fe6e6ccb 100644
--- a/cpp/include/cudf/detail/gather.cuh
+++ b/cpp/include/cudf/detail/gather.cuh
@@ -609,7 +609,7 @@ void gather_bitmask(table_view const& source,
     stream);
 
   // Copy the valid counts into each column
-  auto const valid_counts = make_std_vector_sync(d_valid_counts, stream);
+  auto const valid_counts = make_host_vector_sync(d_valid_counts, stream);
   for (size_t i = 0; i < target.size(); ++i) {
     if (target[i]->nullable()) {
       auto const null_count = target_rows - valid_counts[i];
diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu
index 7a05d0aebaf..5a0c6decfda 100644
--- a/cpp/src/io/csv/csv_gpu.cu
+++ b/cpp/src/io/csv/csv_gpu.cu
@@ -794,7 +794,7 @@ device_span __host__ remove_blank_rows(cudf::io::parse_options_view co
   return row_offsets.subspan(0, new_end - row_offsets.begin());
 }
 
-std::vector detect_column_types(
+cudf::detail::host_vector detect_column_types(
   cudf::io::parse_options_view const& options,
   device_span const data,
   device_span const column_flags,
@@ -812,7 +812,7 @@ std::vector detect_column_types(
   data_type_detection<<>>(
     options, data, column_flags, row_starts, d_stats);
 
-  return detail::make_std_vector_sync(d_stats, stream);
+  return detail::make_host_vector_sync(d_stats, stream);
 }
 
 void decode_row_column_data(cudf::io::parse_options_view const& options,
diff --git a/cpp/src/io/csv/csv_gpu.hpp b/cpp/src/io/csv/csv_gpu.hpp
index 06c60319371..aa3d9f6c7b7 100644
--- a/cpp/src/io/csv/csv_gpu.hpp
+++ b/cpp/src/io/csv/csv_gpu.hpp
@@ -199,7 +199,7 @@ device_span remove_blank_rows(cudf::io::parse_options_view const& opti
  *
  * @return stats Histogram of each dtypes' occurrence for each column
  */
-std::vector detect_column_types(
+cudf::detail::host_vector detect_column_types(
   cudf::io::parse_options_view const& options,
   device_span data,
   device_span column_flags,
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 40d4372ae9d..e27b06682bb 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -614,7 +614,7 @@ std::vector decode_data(parse_options const& parse_opts,
     d_valid_counts,
     stream);
 
-  auto const h_valid_counts = cudf::detail::make_std_vector_sync(d_valid_counts, stream);
+  auto const h_valid_counts = cudf::detail::make_host_vector_sync(d_valid_counts, stream);
   for (int i = 0; i < num_active_columns; ++i) {
     out_buffers[i].null_count() = num_records - h_valid_counts[i];
   }
diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index e5e21e054a6..8d6890045be 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -77,16 +77,16 @@ void print_tree(host_span input,
                 tree_meta_t const& d_gpu_tree,
                 rmm::cuda_stream_view stream)
 {
-  print_vec(cudf::detail::make_std_vector_sync(d_gpu_tree.node_categories, stream),
+  print_vec(cudf::detail::make_host_vector_sync(d_gpu_tree.node_categories, stream),
             "node_categories",
             to_cat);
-  print_vec(cudf::detail::make_std_vector_sync(d_gpu_tree.parent_node_ids, stream),
+  print_vec(cudf::detail::make_host_vector_sync(d_gpu_tree.parent_node_ids, stream),
             "parent_node_ids",
             to_int);
   print_vec(
-    cudf::detail::make_std_vector_sync(d_gpu_tree.node_levels, stream), "node_levels", to_int);
-  auto node_range_begin = cudf::detail::make_std_vector_sync(d_gpu_tree.node_range_begin, stream);
-  auto node_range_end = cudf::detail::make_std_vector_sync(d_gpu_tree.node_range_end, stream);
+    cudf::detail::make_host_vector_sync(d_gpu_tree.node_levels, stream), "node_levels", to_int);
+  auto node_range_begin = cudf::detail::make_host_vector_sync(d_gpu_tree.node_range_begin, stream);
+  auto node_range_end = cudf::detail::make_host_vector_sync(d_gpu_tree.node_range_end, stream);
   print_vec(node_range_begin, "node_range_begin", to_int);
   print_vec(node_range_end, "node_range_end", to_int);
   for (int i = 0; i < int(node_range_begin.size()); i++) {
@@ -373,9 +373,9 @@ std::vector copy_strings_to_host_sync(
   auto to_host = [stream](auto const& col) {
     if (col.is_empty()) return std::vector{};
     auto const scv = cudf::strings_column_view(col);
-    auto const h_chars = cudf::detail::make_std_vector_async(
+    auto const h_chars = cudf::detail::make_host_vector_async(
       cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream);
-    auto const h_offsets = cudf::detail::make_std_vector_async(
+    auto const h_offsets = cudf::detail::make_host_vector_async(
       cudf::device_span(scv.offsets().data() + scv.offset(), scv.size() + 1),
       stream);
@@ -523,25 +523,23 @@ void make_device_json_column(device_span input,
     row_array_parent_col_id,
     stream);
   auto num_columns = d_unique_col_ids.size();
-  auto unique_col_ids = cudf::detail::make_std_vector_async(d_unique_col_ids, stream);
+  auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream);
   auto column_categories =
-    cudf::detail::make_std_vector_async(d_column_tree.node_categories, stream);
-  auto column_parent_ids =
-    cudf::detail::make_std_vector_async(d_column_tree.parent_node_ids, stream);
+    cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream);
+  auto const column_parent_ids =
+    cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream);
   auto column_range_beg =
-    cudf::detail::make_std_vector_async(d_column_tree.node_range_begin, stream);
-  auto max_row_offsets = cudf::detail::make_std_vector_async(d_max_row_offsets, stream);
+    cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream);
+  auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream);
   std::vector column_names = copy_strings_to_host_sync(
     input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream);
-  stream.synchronize();
 
   // array of arrays column names
   if (is_array_of_arrays) {
     TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2;
     auto values_column_indices =
       get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream);
     auto h_values_column_indices =
-      cudf::detail::make_std_vector_async(values_column_indices, stream);
-    stream.synchronize();
+      cudf::detail::make_host_vector_sync(values_column_indices, stream);
     std::transform(unique_col_ids.begin(),
                    unique_col_ids.end(),
                    column_names.begin(),
@@ -611,11 +609,13 @@ void make_device_json_column(device_span input,
                 return thrust::get<0>(a) < thrust::get<0>(b);
               });
 
-  std::vector is_str_column_all_nulls{};
-  if (is_enabled_mixed_types_as_string) {
-    is_str_column_all_nulls = cudf::detail::make_std_vector_sync(
-      is_all_nulls_each_column(input, d_column_tree, tree, col_ids, options, stream), stream);
-  }
+  auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() {
+    if (is_enabled_mixed_types_as_string) {
+      return cudf::detail::make_host_vector_sync(
+        is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream);
+    }
+    return cudf::detail::make_empty_host_vector(0, stream);
+  }();
 
   // use hash map because we may skip field name's col_ids
   std::unordered_map> columns;
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 04eee68e757..ede9fd060b8 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -1978,7 +1978,7 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table,
 
   // Gather the row group sizes and copy to host
   auto d_tmp_rowgroup_sizes = rmm::device_uvector(segmentation.num_rowgroups(), stream);
-  std::map> rg_sizes;
+  std::map> rg_sizes;
   for (auto const& [col_idx, esizes] : elem_sizes) {
     // Copy last elem in each row group - equal to row group size
     thrust::tabulate(rmm::exec_policy(stream),
@@ -1991,14 +1991,14 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table,
                        return src[rg_bounds[idx][col_idx].end - 1];
                      });
 
-    rg_sizes[col_idx] = cudf::detail::make_std_vector_async(d_tmp_rowgroup_sizes, stream);
+    rg_sizes.emplace(col_idx, cudf::detail::make_host_vector_async(d_tmp_rowgroup_sizes, stream));
   }
 
   return {std::move(elem_sizes), std::move(rg_sizes)};
 }
 
 std::map decimal_column_sizes(
-  std::map> const& chunk_sizes)
+  std::map> const& chunk_sizes)
 {
   std::map column_sizes;
   std::transform(chunk_sizes.cbegin(),
@@ -2056,7 +2056,7 @@ auto set_rowgroup_char_counts(orc_table_view& orc_table,
                               orc_table.d_string_column_indices,
                               stream);
 
-  auto const h_counts = cudf::detail::make_std_vector_sync(counts, stream);
+  auto const h_counts = cudf::detail::make_host_vector_sync(counts, stream);
   for (auto col_idx : orc_table.string_column_indices) {
     auto& str_column = orc_table.column(col_idx);
diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp
index f5f8b3cfed9..cae849ee315 100644
--- a/cpp/src/io/orc/writer_impl.hpp
+++ b/cpp/src/io/orc/writer_impl.hpp
@@ -90,8 +90,9 @@ struct stripe_rowgroups {
  */
 struct encoder_decimal_info {
   std::map>
-    elem_sizes;  ///< Column index -> per-element size map
-  std::map> rg_sizes;  ///< Column index -> per-rowgroup size map
+    elem_sizes;            ///< Column index -> per-element size map
+  std::map>
+    rg_sizes;  ///< Column index -> per-rowgroup size map
 };
 
 /**
diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp
index 5ca090b05b3..c8b8b7a1193 100644
--- a/cpp/src/io/parquet/predicate_pushdown.cpp
+++ b/cpp/src/io/parquet/predicate_pushdown.cpp
@@ -468,7 +468,7 @@ std::optional>> aggregate_reader_metadata::fi
   auto validity_it = cudf::detail::make_counting_transform_iterator(
     0, [bitmask = host_bitmask.data()](auto bit_index) { return bit_is_set(bitmask, bit_index); });
 
-  auto is_row_group_required = cudf::detail::make_std_vector_sync(
+  auto const is_row_group_required = cudf::detail::make_host_vector_sync(
     device_span(predicate.data(), predicate.size()), stream);
 
   // Return only filtered row groups based on predicate
diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index 54ba898b058..00d62c45962 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -77,9 +77,9 @@ void print_cumulative_page_info(device_span d_pages,
                                 device_span d_c_info,
                                 rmm::cuda_stream_view stream)
 {
-  std::vector pages = cudf::detail::make_std_vector_sync(d_pages, stream);
-  std::vector chunks = cudf::detail::make_std_vector_sync(d_chunks, stream);
-  std::vector c_info = cudf::detail::make_std_vector_sync(d_c_info, stream);
+  auto const pages = cudf::detail::make_host_vector_sync(d_pages, stream);
+  auto const chunks = cudf::detail::make_host_vector_sync(d_chunks, stream);
+  auto const c_info = cudf::detail::make_host_vector_sync(d_c_info, stream);
 
   printf("------------\nCumulative sizes by page\n");
@@ -647,7 +647,7 @@ std::tuple, size_t, size_t> compute_next_subpass(
   auto [aggregated_info, page_keys_by_split] = adjust_cumulative_sizes(c_info, pages, stream);
 
   // bring back to the cpu
-  auto const h_aggregated_info = cudf::detail::make_std_vector_sync(aggregated_info, stream);
+  auto const h_aggregated_info = cudf::detail::make_host_vector_sync(aggregated_info, stream);
   // print_cumulative_row_info(h_aggregated_info, "adjusted");
 
   // TODO: if the user has explicitly specified skip_rows/num_rows we could be more intelligent
@@ -694,8 +694,7 @@ std::vector compute_page_splits_by_row(device_span h_aggregated_info =
-    cudf::detail::make_std_vector_sync(aggregated_info, stream);
+  auto const h_aggregated_info = cudf::detail::make_host_vector_sync(aggregated_info, stream);
   // print_cumulative_row_info(h_aggregated_info, "adjusted");
 
   std::vector splits;
@@ -1304,9 +1303,8 @@ void reader::impl::setup_next_pass(read_mode mode)
     printf("\tskip_rows: %'lu\n", pass.skip_rows);
     printf("\tnum_rows: %'lu\n", pass.num_rows);
     printf("\tbase mem usage: %'lu\n", pass.base_mem_size);
-    auto const num_columns = _input_columns.size();
-    std::vector h_page_offsets =
-      cudf::detail::make_std_vector_sync(pass.page_offsets, _stream);
+    auto const num_columns = _input_columns.size();
+    auto const h_page_offsets = cudf::detail::make_host_vector_sync(pass.page_offsets, _stream);
     for (size_t c_idx = 0; c_idx < num_columns; c_idx++) {
       printf("\t\tColumn %'lu: num_pages(%'d)\n",
              c_idx,
@@ -1426,7 +1424,7 @@ void reader::impl::setup_next_subpass(read_mode mode)
     subpass.pages = subpass.page_buf;
   }
 
-  std::vector h_spans = cudf::detail::make_std_vector_async(page_indices, _stream);
+  auto const h_spans = cudf::detail::make_host_vector_async(page_indices, _stream);
   subpass.pages.device_to_host_async(_stream);
   _stream.synchronize();
@@ -1464,7 +1462,7 @@ void reader::impl::setup_next_subpass(read_mode mode)
     printf("\t\tTotal expected usage: %'lu\n",
            total_expected_size == 0 ? subpass.decomp_page_data.size() + pass.base_mem_size
                                     : total_expected_size + pass.base_mem_size);
-    std::vector h_page_indices = cudf::detail::make_std_vector_sync(page_indices, _stream);
+    auto const h_page_indices = cudf::detail::make_host_vector_sync(page_indices, _stream);
     for (size_t c_idx = 0; c_idx < num_columns; c_idx++) {
       printf("\t\tColumn %'lu: pages(%'lu - %'lu)\n",
              c_idx,
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index c2c5dbb4a56..74992aa733f 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -2230,20 +2230,20 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
   bool need_sync{false};
 
   // need to fetch the histogram data from the device
-  std::vector h_def_histogram;
-  std::vector h_rep_histogram;
-  if (stats_granularity == statistics_freq::STATISTICS_COLUMN) {
-    if (def_histogram_bfr_size > 0) {
-      h_def_histogram =
-        std::move(cudf::detail::make_std_vector_async(def_level_histogram, stream));
+  auto const h_def_histogram = [&]() {
+    if (stats_granularity == statistics_freq::STATISTICS_COLUMN && def_histogram_bfr_size > 0) {
       need_sync = true;
+      return cudf::detail::make_host_vector_async(def_level_histogram, stream);
     }
-    if (rep_histogram_bfr_size > 0) {
-      h_rep_histogram =
-        std::move(cudf::detail::make_std_vector_async(rep_level_histogram, stream));
+    return cudf::detail::make_host_vector(0, stream);
+  }();
+  auto const h_rep_histogram = [&]() {
+    if (stats_granularity == statistics_freq::STATISTICS_COLUMN && rep_histogram_bfr_size > 0) {
       need_sync = true;
+      return cudf::detail::make_host_vector_async(rep_level_histogram, stream);
     }
-  }
+    return cudf::detail::make_host_vector(0, stream);
+  }();
 
   for (int r = 0; r < num_rowgroups; r++) {
     int p = rg_to_part[r];
@@ -2265,7 +2265,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
       update_chunk_encoding_stats(column_chunk_meta, ck, write_v2_headers);
 
       if (ck.ck_stat_size != 0) {
-        std::vector const stats_blob = cudf::detail::make_std_vector_sync(
+        auto const stats_blob = cudf::detail::make_host_vector_sync(
           device_span(dev_bfr, ck.ck_stat_size), stream);
         CompactProtocolReader cp(stats_blob.data(), stats_blob.size());
         cp.read(&column_chunk_meta.statistics);
diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp
index 91be154e09d..e4313eba454 100644
--- a/cpp/src/io/utilities/datasource.cpp
+++ b/cpp/src/io/utilities/datasource.cpp
@@ -297,10 +297,10 @@ class device_buffer_source final : public datasource {
   {
     auto const count = std::min(size, this->size() - offset);
     auto const stream = cudf::get_default_stream();
-    auto h_data = cudf::detail::make_std_vector_async(
+    auto h_data = cudf::detail::make_host_vector_async(
       cudf::device_span{_d_buffer.data() + offset, count}, stream);
     stream.synchronize();
-    return std::make_unique>>(std::move(h_data));
+    return std::make_unique>>(std::move(h_data));
   }
 
   [[nodiscard]] bool supports_device_read() const override { return true; }
diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu
index e465fb79c89..e856b89b836 100644
--- a/cpp/src/text/jaccard.cu
+++ b/cpp/src/text/jaccard.cu
@@ -376,7 +376,7 @@ std::pair, rmm::device_uvector> hash_subs
                       sub_offsets.begin(),
                       sub_offsets.end(),
                       indices.begin());
-    return cudf::detail::make_std_vector_sync(indices, stream);
+    return cudf::detail::make_host_vector_sync(indices, stream);
   }();
 
   // Call segmented sort with the sort sections
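
For reference, the pattern this patch applies throughout is to bring device results back to the host with `cudf::detail::make_host_vector_sync`/`_async`, which return a `cudf::detail::host_vector` that can be backed by pinned memory and copied with a kernel when those optimizations are enabled, instead of `make_std_vector_sync`/`_async`, which always fill a pageable `std::vector`. A minimal sketch of that call pattern follows; it is not taken from the patch, `copy_to_host` is a hypothetical helper, and the header paths and exact factory signatures are assumed from cudf's internal detail utilities and may differ between versions.

// Illustrative sketch only, not part of this patch. Assumes cudf's internal
// vector factories; header paths and signatures may differ between versions.
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/utilities/span.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

// Hypothetical helper: copy a device buffer back to the host the way this
// patch does it. make_host_vector_sync returns a cudf::detail::host_vector
// (pinned-memory capable) instead of the pageable std::vector produced by
// make_std_vector_sync.
cudf::detail::host_vector<int> copy_to_host(rmm::device_uvector<int> const& d_data,
                                            rmm::cuda_stream_view stream)
{
  return cudf::detail::make_host_vector_sync(
    cudf::device_span<int const>{d_data.data(), d_data.size()}, stream);
}

As the `need_sync` handling in the parquet writer_impl.cu hunk shows, the `_async` variants defer completion of the copy, so the caller must synchronize the stream before reading the host data.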