Skip to content

Commit

Permalink
Rework strings::slice benchmark to use nvbench (#16563)
Browse files Browse the repository at this point in the history
Moves the google-benchmark for `cudf::strings::slice_strings` to nvbench.
This helps measure performance improvements in follow-on work for strings-slice.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: #16563
  • Loading branch information
davidwendt authored Aug 27, 2024
1 parent e2a15cb commit d1412e0
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 47 deletions.
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,6 @@ ConfigureBench(
string/filter.cpp
string/repeat_strings.cpp
string/replace.cpp
string/slice.cpp
string/translate.cpp
string/url_decode.cu
)
Expand All @@ -346,6 +345,7 @@ ConfigureNVBench(
string/like.cpp
string/replace_re.cpp
string/reverse.cpp
string/slice.cpp
string/split.cpp
string/split_re.cpp
)
Expand Down
89 changes: 43 additions & 46 deletions cpp/benchmarks/string/slice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,8 @@
* limitations under the License.
*/

#include "string_bench_args.hpp"

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>
#include <benchmarks/common/nvbench_utilities.hpp>

#include <cudf_test/column_wrapper.hpp>

Expand All @@ -29,56 +26,56 @@

#include <thrust/iterator/constant_iterator.h>

#include <nvbench/nvbench.cuh>

#include <limits>

// Empty fixture type for the google-benchmark slice benchmarks: it adds nothing
// on top of cudf::benchmark and is the fixture named in the
// BENCHMARK_DEFINE_F / BENCHMARK_REGISTER_F calls of STRINGS_BENCHMARK_DEFINE.
class StringSlice : public cudf::benchmark {};
static void bench_slice(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
auto const stype = state.get_string("type");

enum slice_type { position, multi_position };
if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}

static void BM_slice(benchmark::State& state, slice_type rt)
{
cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
data_profile const profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
cudf::strings_column_view input(column->view());
auto starts_itr = thrust::constant_iterator<cudf::size_type>(max_str_length / 3);
auto stops_itr = thrust::constant_iterator<cudf::size_type>(max_str_length / 2);
cudf::test::fixed_width_column_wrapper<int32_t> starts(starts_itr, starts_itr + n_rows);
cudf::test::fixed_width_column_wrapper<int32_t> stops(stops_itr, stops_itr + n_rows);
auto starts_itr = thrust::constant_iterator<cudf::size_type>(row_width / 4);
auto starts =
cudf::test::fixed_width_column_wrapper<cudf::size_type>(starts_itr, starts_itr + num_rows);
auto stops_itr = thrust::constant_iterator<cudf::size_type>(row_width / 3);
auto stops =
cudf::test::fixed_width_column_wrapper<cudf::size_type>(stops_itr, stops_itr + num_rows);

for (auto _ : state) {
cuda_event_timer raii(state, true, cudf::get_default_stream());
switch (rt) {
case position:
cudf::strings::slice_strings(input, max_str_length / 3, max_str_length / 2);
break;
case multi_position: cudf::strings::slice_strings(input, starts, stops); break;
}
auto stream = cudf::get_default_stream();
state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
// gather some throughput statistics as well
auto chars_size = input.chars_size(stream);
state.add_element_count(chars_size, "chars_size"); // number of bytes
state.add_global_memory_reads<nvbench::int8_t>(chars_size); // all bytes are read
auto output_size = (row_width / 3 - row_width / 4) * num_rows;
state.add_global_memory_writes<nvbench::int8_t>(output_size);

if (stype == "multi") {
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
cudf::strings::slice_strings(input, starts, stops, stream);
});
} else {
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
cudf::strings::slice_strings(input, row_width / 4, row_width / 3, 1, stream);
});
}

state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream()));
set_throughputs(state);
}

// Supplies the argument ranges for the slice benchmarks registered on `b`:
//   - row counts:  4096 (1 << 12) up to 16777216 (1 << 24), multiplier 8
//   - row lengths: 32 (1 << 5) up to 8192 (1 << 13), multiplier 2
// The expansion of these ranges into individual benchmark arguments is done by
// generate_string_bench_args (presumably declared in string_bench_args.hpp;
// its definition is not visible here).
static void generate_bench_args(benchmark::internal::Benchmark* b)
{
  // Row-count range and its multiplier.
  constexpr int rows_lo   = 1 << 12;
  constexpr int rows_hi   = 1 << 24;
  constexpr int rows_step = 8;

  // Row-length range and its multiplier.
  constexpr int len_lo   = 1 << 5;
  constexpr int len_hi   = 1 << 13;
  constexpr int len_step = 2;

  generate_string_bench_args(b, rows_lo, rows_hi, rows_step, len_lo, len_hi, len_step);
}

// Defines a StringSlice fixture benchmark called `name` whose body forwards the
// benchmark state to BM_slice together with slice_type::name, then registers it
// with the argument ranges from generate_bench_args, manual timing, and
// millisecond reporting. `name` must therefore be a slice_type enumerator.
#define STRINGS_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(StringSlice, name) \
(::benchmark::State & st) { BM_slice(st, slice_type::name); } \
BENCHMARK_REGISTER_F(StringSlice, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

// One registered benchmark per slice_type enumerator.
STRINGS_BENCHMARK_DEFINE(position)
STRINGS_BENCHMARK_DEFINE(multi_position)
// Registers bench_slice with nvbench under the name "slice" and declares its
// three axes: the integer axes "row_width" and "num_rows" and the string axis
// "type". bench_slice reads these by the same names; it calls the
// column-based start/stop overload of slice_strings when "type" is "multi" and
// the scalar start/stop overload for any other value (here "position").
NVBENCH_BENCH(bench_slice)
.set_name("slice")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048})
.add_int64_axis("num_rows", {262144, 2097152, 16777216})
.add_string_axis("type", {"position", "multi"});

0 comments on commit d1412e0

Please sign in to comment.