Commit

Use make_host_vector instead of make_std_vector to facilitate pinned memory optimizations (#16386)

Replaced most of the `make_std_vector` calls with `make_host_vector` to allow pinned memory and kernel copies when enabled.
Skipped places where the change would impact the public API.
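
For context on the API being adopted, the sketch below is a minimal illustration of the before/after call pattern, not code from this PR. The enclosing `copy_counts_to_host` function and the `d_counts` buffer are hypothetical; the `make_std_vector_sync`/`make_host_vector_sync` calls and the `vector_factories.hpp` header mirror how they are used in the diff.

#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <cstddef>

// Hypothetical helper illustrating the device-to-host copy pattern touched by this commit.
void copy_counts_to_host(rmm::device_uvector<cudf::size_type> const& d_counts,
                         rmm::cuda_stream_view stream)
{
  // Before: make_std_vector_sync returns a std::vector, so the device-to-host
  // copy always lands in pageable host memory.
  auto const counts_std = cudf::detail::make_std_vector_sync(d_counts, stream);

  // After: make_host_vector_sync returns a cudf::detail::host_vector, which can be
  // backed by pinned memory (and copied by a kernel) when those optimizations are enabled.
  auto const counts_host = cudf::detail::make_host_vector_sync(d_counts, stream);

  // Both results are indexable host containers, so downstream loops are unchanged.
  for (std::size_t i = 0; i < counts_host.size(); ++i) {
    // ... use counts_host[i] ...
  }
}

Because `make_host_vector_*` returns `cudf::detail::host_vector<T>` rather than `std::vector<T>`, signatures and container types that carry the result (as in `csv_gpu.hpp` and `writer_impl.hpp` below) change along with the call sites.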

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Shruti Shivakumar (https://github.com/shrshi)

URL: #16386
vuule authored Aug 28, 2024
1 parent d1412e0 commit 60f30d8
Showing 12 changed files with 57 additions and 58 deletions.
2 changes: 1 addition & 1 deletion cpp/include/cudf/detail/gather.cuh
@@ -609,7 +609,7 @@ void gather_bitmask(table_view const& source,
stream);

// Copy the valid counts into each column
- auto const valid_counts = make_std_vector_sync(d_valid_counts, stream);
+ auto const valid_counts = make_host_vector_sync(d_valid_counts, stream);
for (size_t i = 0; i < target.size(); ++i) {
if (target[i]->nullable()) {
auto const null_count = target_rows - valid_counts[i];
4 changes: 2 additions & 2 deletions cpp/src/io/csv/csv_gpu.cu
@@ -794,7 +794,7 @@ device_span<uint64_t> __host__ remove_blank_rows(cudf::io::parse_options_view co
return row_offsets.subspan(0, new_end - row_offsets.begin());
}

- std::vector<column_type_histogram> detect_column_types(
+ cudf::detail::host_vector<column_type_histogram> detect_column_types(
cudf::io::parse_options_view const& options,
device_span<char const> const data,
device_span<column_parse::flags const> const column_flags,
@@ -812,7 +812,7 @@ std::vector<column_type_histogram> detect_column_types(
data_type_detection<<<grid_size, block_size, 0, stream.value()>>>(
options, data, column_flags, row_starts, d_stats);

- return detail::make_std_vector_sync(d_stats, stream);
+ return detail::make_host_vector_sync(d_stats, stream);
}

void decode_row_column_data(cudf::io::parse_options_view const& options,
2 changes: 1 addition & 1 deletion cpp/src/io/csv/csv_gpu.hpp
@@ -199,7 +199,7 @@ device_span<uint64_t> remove_blank_rows(cudf::io::parse_options_view const& opti
*
* @return stats Histogram of each dtypes' occurrence for each column
*/
- std::vector<column_type_histogram> detect_column_types(
+ cudf::detail::host_vector<column_type_histogram> detect_column_types(
cudf::io::parse_options_view const& options,
device_span<char const> data,
device_span<column_parse::flags const> column_flags,
2 changes: 1 addition & 1 deletion cpp/src/io/csv/reader_impl.cu
@@ -614,7 +614,7 @@ std::vector<column_buffer> decode_data(parse_options const& parse_opts,
d_valid_counts,
stream);

- auto const h_valid_counts = cudf::detail::make_std_vector_sync(d_valid_counts, stream);
+ auto const h_valid_counts = cudf::detail::make_host_vector_sync(d_valid_counts, stream);
for (int i = 0; i < num_active_columns; ++i) {
out_buffers[i].null_count() = num_records - h_valid_counts[i];
}
42 changes: 21 additions & 21 deletions cpp/src/io/json/json_column.cu
@@ -77,16 +77,16 @@ void print_tree(host_span<SymbolT const> input,
tree_meta_t const& d_gpu_tree,
rmm::cuda_stream_view stream)
{
- print_vec(cudf::detail::make_std_vector_sync(d_gpu_tree.node_categories, stream),
+ print_vec(cudf::detail::make_host_vector_sync(d_gpu_tree.node_categories, stream),
"node_categories",
to_cat);
- print_vec(cudf::detail::make_std_vector_sync(d_gpu_tree.parent_node_ids, stream),
+ print_vec(cudf::detail::make_host_vector_sync(d_gpu_tree.parent_node_ids, stream),
"parent_node_ids",
to_int);
print_vec(
- cudf::detail::make_std_vector_sync(d_gpu_tree.node_levels, stream), "node_levels", to_int);
- auto node_range_begin = cudf::detail::make_std_vector_sync(d_gpu_tree.node_range_begin, stream);
- auto node_range_end = cudf::detail::make_std_vector_sync(d_gpu_tree.node_range_end, stream);
+ cudf::detail::make_host_vector_sync(d_gpu_tree.node_levels, stream), "node_levels", to_int);
+ auto node_range_begin = cudf::detail::make_host_vector_sync(d_gpu_tree.node_range_begin, stream);
+ auto node_range_end = cudf::detail::make_host_vector_sync(d_gpu_tree.node_range_end, stream);
print_vec(node_range_begin, "node_range_begin", to_int);
print_vec(node_range_end, "node_range_end", to_int);
for (int i = 0; i < int(node_range_begin.size()); i++) {
@@ -373,9 +373,9 @@ std::vector<std::string> copy_strings_to_host_sync(
auto to_host = [stream](auto const& col) {
if (col.is_empty()) return std::vector<std::string>{};
auto const scv = cudf::strings_column_view(col);
- auto const h_chars = cudf::detail::make_std_vector_async<char>(
+ auto const h_chars = cudf::detail::make_host_vector_async<char>(
cudf::device_span<char const>(scv.chars_begin(stream), scv.chars_size(stream)), stream);
- auto const h_offsets = cudf::detail::make_std_vector_async(
+ auto const h_offsets = cudf::detail::make_host_vector_async(
cudf::device_span<cudf::size_type const>(scv.offsets().data<cudf::size_type>() + scv.offset(),
scv.size() + 1),
stream);
@@ -523,25 +523,23 @@ void make_device_json_column(device_span<SymbolT const> input,
row_array_parent_col_id,
stream);
auto num_columns = d_unique_col_ids.size();
- auto unique_col_ids = cudf::detail::make_std_vector_async(d_unique_col_ids, stream);
+ auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream);
auto column_categories =
- cudf::detail::make_std_vector_async(d_column_tree.node_categories, stream);
- auto column_parent_ids =
- cudf::detail::make_std_vector_async(d_column_tree.parent_node_ids, stream);
+ cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream);
+ auto const column_parent_ids =
+ cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream);
auto column_range_beg =
- cudf::detail::make_std_vector_async(d_column_tree.node_range_begin, stream);
- auto max_row_offsets = cudf::detail::make_std_vector_async(d_max_row_offsets, stream);
+ cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream);
+ auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream);
std::vector<std::string> column_names = copy_strings_to_host_sync(
input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream);
stream.synchronize();
// array of arrays column names
if (is_array_of_arrays) {
TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2;
auto values_column_indices =
get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream);
auto h_values_column_indices =
- cudf::detail::make_std_vector_async(values_column_indices, stream);
- stream.synchronize();
+ cudf::detail::make_host_vector_sync(values_column_indices, stream);
std::transform(unique_col_ids.begin(),
unique_col_ids.end(),
column_names.begin(),
@@ -611,11 +609,13 @@ void make_device_json_column(device_span<SymbolT const> input,
return thrust::get<0>(a) < thrust::get<0>(b);
});

- std::vector<uint8_t> is_str_column_all_nulls{};
- if (is_enabled_mixed_types_as_string) {
- is_str_column_all_nulls = cudf::detail::make_std_vector_sync(
- is_all_nulls_each_column(input, d_column_tree, tree, col_ids, options, stream), stream);
- }
+ auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() {
+ if (is_enabled_mixed_types_as_string) {
+ return cudf::detail::make_host_vector_sync(
+ is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream);
+ }
+ return cudf::detail::make_empty_host_vector<uint8_t>(0, stream);
+ }();

// use hash map because we may skip field name's col_ids
std::unordered_map<NodeIndexT, std::reference_wrapper<device_json_column>> columns;
8 changes: 4 additions & 4 deletions cpp/src/io/orc/writer_impl.cu
@@ -1978,7 +1978,7 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table,

// Gather the row group sizes and copy to host
auto d_tmp_rowgroup_sizes = rmm::device_uvector<uint32_t>(segmentation.num_rowgroups(), stream);
- std::map<uint32_t, std::vector<uint32_t>> rg_sizes;
+ std::map<uint32_t, cudf::detail::host_vector<uint32_t>> rg_sizes;
for (auto const& [col_idx, esizes] : elem_sizes) {
// Copy last elem in each row group - equal to row group size
thrust::tabulate(rmm::exec_policy(stream),
@@ -1991,14 +1991,14 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table,
return src[rg_bounds[idx][col_idx].end - 1];
});

- rg_sizes[col_idx] = cudf::detail::make_std_vector_async(d_tmp_rowgroup_sizes, stream);
+ rg_sizes.emplace(col_idx, cudf::detail::make_host_vector_async(d_tmp_rowgroup_sizes, stream));
}

return {std::move(elem_sizes), std::move(rg_sizes)};
}

std::map<uint32_t, size_t> decimal_column_sizes(
- std::map<uint32_t, std::vector<uint32_t>> const& chunk_sizes)
+ std::map<uint32_t, cudf::detail::host_vector<uint32_t>> const& chunk_sizes)
{
std::map<uint32_t, size_t> column_sizes;
std::transform(chunk_sizes.cbegin(),
@@ -2056,7 +2056,7 @@ auto set_rowgroup_char_counts(orc_table_view& orc_table,
orc_table.d_string_column_indices,
stream);

- auto const h_counts = cudf::detail::make_std_vector_sync(counts, stream);
+ auto const h_counts = cudf::detail::make_host_vector_sync(counts, stream);

for (auto col_idx : orc_table.string_column_indices) {
auto& str_column = orc_table.column(col_idx);
5 changes: 3 additions & 2 deletions cpp/src/io/orc/writer_impl.hpp
@@ -90,8 +90,9 @@ struct stripe_rowgroups {
*/
struct encoder_decimal_info {
std::map<uint32_t, rmm::device_uvector<uint32_t>>
- elem_sizes; ///< Column index -> per-element size map
- std::map<uint32_t, std::vector<uint32_t>> rg_sizes; ///< Column index -> per-rowgroup size map
+ elem_sizes; ///< Column index -> per-element size map
+ std::map<uint32_t, cudf::detail::host_vector<uint32_t>>
+ rg_sizes; ///< Column index -> per-rowgroup size map
};

/**
2 changes: 1 addition & 1 deletion cpp/src/io/parquet/predicate_pushdown.cpp
@@ -468,7 +468,7 @@ std::optional<std::vector<std::vector<size_type>>> aggregate_reader_metadata::fi
auto validity_it = cudf::detail::make_counting_transform_iterator(
0, [bitmask = host_bitmask.data()](auto bit_index) { return bit_is_set(bitmask, bit_index); });

- auto is_row_group_required = cudf::detail::make_std_vector_sync(
+ auto const is_row_group_required = cudf::detail::make_host_vector_sync(
device_span<uint8_t const>(predicate.data<uint8_t>(), predicate.size()), stream);

// Return only filtered row groups based on predicate
20 changes: 9 additions & 11 deletions cpp/src/io/parquet/reader_impl_chunking.cu
@@ -77,9 +77,9 @@ void print_cumulative_page_info(device_span<PageInfo const> d_pages,
device_span<cumulative_page_info const> d_c_info,
rmm::cuda_stream_view stream)
{
- std::vector<PageInfo> pages = cudf::detail::make_std_vector_sync(d_pages, stream);
- std::vector<ColumnChunkDesc> chunks = cudf::detail::make_std_vector_sync(d_chunks, stream);
- std::vector<cumulative_page_info> c_info = cudf::detail::make_std_vector_sync(d_c_info, stream);
+ auto const pages = cudf::detail::make_host_vector_sync(d_pages, stream);
+ auto const chunks = cudf::detail::make_host_vector_sync(d_chunks, stream);
+ auto const c_info = cudf::detail::make_host_vector_sync(d_c_info, stream);

printf("------------\nCumulative sizes by page\n");

@@ -647,7 +647,7 @@ std::tuple<rmm::device_uvector<page_span>, size_t, size_t> compute_next_subpass(
auto [aggregated_info, page_keys_by_split] = adjust_cumulative_sizes(c_info, pages, stream);

// bring back to the cpu
- auto const h_aggregated_info = cudf::detail::make_std_vector_sync(aggregated_info, stream);
+ auto const h_aggregated_info = cudf::detail::make_host_vector_sync(aggregated_info, stream);
// print_cumulative_row_info(h_aggregated_info, "adjusted");

// TODO: if the user has explicitly specified skip_rows/num_rows we could be more intelligent
@@ -694,8 +694,7 @@ std::vector<row_range> compute_page_splits_by_row(device_span<cumulative_page_in
auto [aggregated_info, page_keys_by_split] = adjust_cumulative_sizes(c_info, pages, stream);

// bring back to the cpu
- std::vector<cumulative_page_info> h_aggregated_info =
- cudf::detail::make_std_vector_sync(aggregated_info, stream);
+ auto const h_aggregated_info = cudf::detail::make_host_vector_sync(aggregated_info, stream);
// print_cumulative_row_info(h_aggregated_info, "adjusted");

std::vector<row_range> splits;
@@ -1304,9 +1303,8 @@ void reader::impl::setup_next_pass(read_mode mode)
printf("\tskip_rows: %'lu\n", pass.skip_rows);
printf("\tnum_rows: %'lu\n", pass.num_rows);
printf("\tbase mem usage: %'lu\n", pass.base_mem_size);
- auto const num_columns = _input_columns.size();
- std::vector<size_type> h_page_offsets =
- cudf::detail::make_std_vector_sync(pass.page_offsets, _stream);
+ auto const num_columns = _input_columns.size();
+ auto const h_page_offsets = cudf::detail::make_host_vector_sync(pass.page_offsets, _stream);
for (size_t c_idx = 0; c_idx < num_columns; c_idx++) {
printf("\t\tColumn %'lu: num_pages(%'d)\n",
c_idx,
@@ -1426,7 +1424,7 @@ void reader::impl::setup_next_subpass(read_mode mode)
subpass.pages = subpass.page_buf;
}

- std::vector<page_span> h_spans = cudf::detail::make_std_vector_async(page_indices, _stream);
+ auto const h_spans = cudf::detail::make_host_vector_async(page_indices, _stream);
subpass.pages.device_to_host_async(_stream);

_stream.synchronize();
@@ -1464,7 +1462,7 @@ void reader::impl::setup_next_subpass(read_mode mode)
printf("\t\tTotal expected usage: %'lu\n",
total_expected_size == 0 ? subpass.decomp_page_data.size() + pass.base_mem_size
: total_expected_size + pass.base_mem_size);
- std::vector<page_span> h_page_indices = cudf::detail::make_std_vector_sync(page_indices, _stream);
+ auto const h_page_indices = cudf::detail::make_host_vector_sync(page_indices, _stream);
for (size_t c_idx = 0; c_idx < num_columns; c_idx++) {
printf("\t\tColumn %'lu: pages(%'lu - %'lu)\n",
c_idx,
22 changes: 11 additions & 11 deletions cpp/src/io/parquet/writer_impl.cu
@@ -2230,20 +2230,20 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
bool need_sync{false};

// need to fetch the histogram data from the device
- std::vector<uint32_t> h_def_histogram;
- std::vector<uint32_t> h_rep_histogram;
- if (stats_granularity == statistics_freq::STATISTICS_COLUMN) {
- if (def_histogram_bfr_size > 0) {
- h_def_histogram =
- std::move(cudf::detail::make_std_vector_async(def_level_histogram, stream));
+ auto const h_def_histogram = [&]() {
+ if (stats_granularity == statistics_freq::STATISTICS_COLUMN && def_histogram_bfr_size > 0) {
need_sync = true;
+ return cudf::detail::make_host_vector_async(def_level_histogram, stream);
}
- if (rep_histogram_bfr_size > 0) {
- h_rep_histogram =
- std::move(cudf::detail::make_std_vector_async(rep_level_histogram, stream));
+ return cudf::detail::make_host_vector<uint32_t>(0, stream);
+ }();
+ auto const h_rep_histogram = [&]() {
+ if (stats_granularity == statistics_freq::STATISTICS_COLUMN && rep_histogram_bfr_size > 0) {
need_sync = true;
+ return cudf::detail::make_host_vector_async(rep_level_histogram, stream);
}
- }
+ return cudf::detail::make_host_vector<uint32_t>(0, stream);
+ }();

for (int r = 0; r < num_rowgroups; r++) {
int p = rg_to_part[r];
@@ -2265,7 +2265,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
update_chunk_encoding_stats(column_chunk_meta, ck, write_v2_headers);

if (ck.ck_stat_size != 0) {
- std::vector<uint8_t> const stats_blob = cudf::detail::make_std_vector_sync(
+ auto const stats_blob = cudf::detail::make_host_vector_sync(
device_span<uint8_t const>(dev_bfr, ck.ck_stat_size), stream);
CompactProtocolReader cp(stats_blob.data(), stats_blob.size());
cp.read(&column_chunk_meta.statistics);
4 changes: 2 additions & 2 deletions cpp/src/io/utilities/datasource.cpp
@@ -297,10 +297,10 @@ class device_buffer_source final : public datasource {
{
auto const count = std::min(size, this->size() - offset);
auto const stream = cudf::get_default_stream();
- auto h_data = cudf::detail::make_std_vector_async(
+ auto h_data = cudf::detail::make_host_vector_async(
cudf::device_span<std::byte const>{_d_buffer.data() + offset, count}, stream);
stream.synchronize();
- return std::make_unique<owning_buffer<std::vector<std::byte>>>(std::move(h_data));
+ return std::make_unique<owning_buffer<cudf::detail::host_vector<std::byte>>>(std::move(h_data));
}

[[nodiscard]] bool supports_device_read() const override { return true; }
2 changes: 1 addition & 1 deletion cpp/src/text/jaccard.cu
@@ -376,7 +376,7 @@ std::pair<rmm::device_uvector<uint32_t>, rmm::device_uvector<int64_t>> hash_subs
sub_offsets.begin(),
sub_offsets.end(),
indices.begin());
- return cudf::detail::make_std_vector_sync(indices, stream);
+ return cudf::detail::make_host_vector_sync(indices, stream);
}();

// Call segmented sort with the sort sections
