From b5ec22e3cc7f3570f9ef7eeb9b55b0ffde350eb7 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Mon, 12 Aug 2024 16:31:00 -0400 Subject: [PATCH 01/36] work in progress --- cpp/src/io/parquet/decode_fixed.cu | 581 +++++++++++++++++++++++------ cpp/src/io/parquet/page_hdr.cu | 11 + cpp/src/io/parquet/parquet_gpu.hpp | 31 ++ cpp/src/io/parquet/reader_impl.cpp | 48 +++ 4 files changed, 566 insertions(+), 105 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index ea80ae73c2f..8157198e116 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -195,13 +195,13 @@ struct decode_fixed_width_split_values_func { }; template -static __device__ int gpuUpdateValidityAndRowIndicesNested( +static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) { constexpr int num_warps = decode_block_size / cudf::detail::warp_size; constexpr int max_batch_size = num_warps * cudf::detail::warp_size; - // how many (input) values we've processed in the page so far + // how many (input) values we've processed in the page so far, prior to this loop iteration int value_count = s->input_value_count; // cap by last row so that we don't process any rows past what we want to output. @@ -217,69 +217,99 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - // definition level. only need to process for nullable columns - int d = 0; + // get definition level. only need to process for nullable columns + int def_level; if constexpr (nullable) { if (def) { - d = t < batch_size + def_level = t < batch_size ? static_cast(def[rolling_index(value_count + t)]) : -1; } else { - d = t < batch_size ? 1 : -1; + def_level = t < batch_size ? 1 : -1; } + } else { + def_level = 0; } - int const thread_value_count = t + 1; - int const block_value_count = batch_size; + //Determine value count & row index + int const thread_value_count = t + 1; //# of output values from the view of this thread + int const block_value_count = batch_size; + int const row_index = t + value_count; //thread_row_index in old + int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); - // compute our row index, whether we're in row bounds, and validity - int const row_index = (thread_value_count + value_count) - 1; - int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); - int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); - int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store + //per-warp variables used below for writing validity + int const in_write_row_bounds = (row_index >= first_row) && (row_index < last_row); - // iterate by depth - for (int d_idx = 0; d_idx <= max_depth; d_idx++) { - auto& ni = s->nesting_info[d_idx]; + //bit mask of all threads that passed true + int const in_write_row_bounds_mask = ballot(in_write_row_bounds); - int is_valid; - if constexpr (nullable) { - is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 
1 : 0; - } else { - is_valid = in_row_bounds; + // index of first set bit (in the warp to store) + int write_start = __ffs(in_write_row_bounds_mask) - 1; + + // remaining code is trivial for non-nullable, non-list columns: no need to iterate over depth + if constexpr (!nullable) { + + // if this is valid and we're at the leaf, output dst_pos + int const is_valid = in_row_bounds; + if (is_valid) { + auto& ni = s->nesting_info[max_depth]; + int const thread_valid_count = thread_value_count; + + // for non-list types, the value count is always the same across + int const dst_pos = (value_count + thread_value_count) - 1; + int const src_pos = (ni.valid_count + thread_valid_count) - 1; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; } - // thread and block validity count - int thread_valid_count, block_valid_count; - if constexpr (nullable) { + // update valid_count + if (t == 0) { + int const block_valid_count = block_value_count; + s->nesting_info[max_depth].valid_count += block_valid_count; + } + + __syncthreads(); // publish modification of nesting_info value_count + } else { + + // column is a nullable non-list: iterate by depth + for (int d_idx = 0; d_idx <= max_depth; d_idx++) { + + auto& ni = s->nesting_info[d_idx]; + + // everything up to the max_def_level is a non-null value + int is_valid = ((def_level >= ni.max_def_level) && in_row_bounds) ? 1 : 0; + + // thread and block validity count + // queries is_valid of all threads, stores prior total and total total using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; + int thread_valid_count, block_valid_count; block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); - __syncthreads(); - // validity is processed per-warp + // validity is processed per-warp (lane 0 writes), because writes are atomic // - // nested schemas always read and write to the same bounds (that is, read and write - // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading - // at the first value, even if that is before first_row, because we cannot trivially jump to - // the correct position to start reading. since we are about to write the validity vector + // nested schemas always read and write to the same bounds + // (that is, read and write positions are already pre-bounded by first_row/num_rows). + // since we are about to write the validity vector // here we need to adjust our computed mask to take into account the write row bounds. 
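+          // Illustrative arithmetic (hypothetical lane pattern, not taken from any real page):
+          // if only lanes 3..31 of a warp fall inside the write bounds, then
+          //   in_write_row_bounds_mask == 0xFFFFFFF8
+          //   write_start = __ffs(0xFFFFFFF8) - 1                       == 3
+          //   write_end   = cudf::detail::warp_size - __clz(0xFFFFFFF8) == 32
+          // so lane 0 stores bit_count == 29 validity bits starting at bit write_start.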
int warp_null_count = 0; - if (write_start >= 0 && ni.valid_map != nullptr) { - int const valid_map_offset = ni.valid_map_offset; + if ((write_start >= 0) && (ni.valid_map != nullptr)) { uint32_t const warp_validity_mask = ballot(is_valid); - // lane 0 from each warp writes out validity if ((t % cudf::detail::warp_size) == 0) { - int const vindex = - (value_count + thread_value_count) - 1; // absolute input value index - int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map - int const write_end = cudf::detail::warp_size - - __clz(in_write_row_bounds); // last bit in the warp to store - int const bit_count = write_end - write_start; - warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); - - store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); + // absolute input value index + int const vindex = (value_count + thread_value_count) - 1; + + // absolute bit offset into the output validity map + int const bit_offset = (ni.valid_map_offset + vindex + write_start) - first_row; + + // last bit in the warp to store + int const write_end = cudf::detail::warp_size - __clz(in_write_row_bounds_mask); + int const bit_count = write_end - write_start; //in old is warp_valid_mask_bit_count + + uint32_t const warp_output_valid_mask = warp_validity_mask >> write_start; + + store_validity(bit_offset, ni.valid_map, warp_output_valid_mask, bit_count); + + warp_null_count = bit_count - __popc(warp_output_valid_mask); } } @@ -290,25 +320,20 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( size_type const block_null_count = cudf::detail::single_lane_block_sum_reduce(warp_null_count); if (t == 0) { ni.null_count += block_null_count; } - } - // trivial for non-nullable columns - else { - thread_valid_count = thread_value_count; - block_valid_count = block_value_count; - } - // if this is valid and we're at the leaf, output dst_pos - __syncthreads(); // handle modification of ni.value_count from below - if (is_valid && d_idx == max_depth) { - // for non-list types, the value count is always the same across - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (ni.valid_count + thread_valid_count) - 1; - sb->nz_idx[rolling_index(src_pos)] = dst_pos; - } - __syncthreads(); // handle modification of ni.value_count from below + // if this is valid and we're at the leaf, output dst_pos + if (is_valid && d_idx == max_depth) { + // for non-list types, the value count is always the same across + __syncthreads(); // handle modification of ni.valid_count from below + int const dst_pos = (value_count + thread_value_count) - 1; + int const src_pos = (ni.valid_count + thread_valid_count) - 1; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + __syncthreads(); // handle modification of ni.valid_count from below - // update stuff - if (t == 0) { ni.valid_count += block_valid_count; } + // update stuff + if (t == 0) { ni.valid_count += block_valid_count; } + } //END OF DEPTH LOOP } value_count += block_value_count; @@ -351,27 +376,26 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - // definition level. only need to process for nullable columns - int d = 0; - if constexpr (nullable) { - if (def) { - d = t < batch_size - ? static_cast(def[rolling_index(value_count + t)]) - : -1; - } else { - d = t < batch_size ? 
1 : -1; - } - } - int const thread_value_count = t + 1; int const block_value_count = batch_size; // compute our row index, whether we're in row bounds, and validity int const row_index = (thread_value_count + value_count) - 1; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + + // determine if is valid + // everything up to the max_def_level is a non-null value int is_valid; if constexpr (nullable) { - is_valid = ((d > 0) && in_row_bounds) ? 1 : 0; + // get definition level. only need to process for nullable columns + if (t >= batch_size) { + is_valid = 0; + } else if (def) { + int const def_level = static_cast(def[rolling_index(value_count + t)]); + is_valid = ((def_level > 0) && in_row_bounds) ? 1 : 0; + } else { + is_valid = in_row_bounds; + } } else { is_valid = in_row_bounds; } @@ -379,32 +403,37 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( // thread and block validity count int thread_valid_count, block_valid_count; if constexpr (nullable) { + // use a scan to compute the total number of valid values, as well as the total number of valid + // values for each individual thread (how many valids there are including me, but no one after me) using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); __syncthreads(); - // validity is processed per-warp + // validity is processed per-warp, because storing is an atomic operation // // nested schemas always read and write to the same bounds (that is, read and write // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading // at the first value, even if that is before first_row, because we cannot trivially jump to // the correct position to start reading. since we are about to write the validity vector // here we need to adjust our computed mask to take into account the write row bounds. 
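+    // Worked example for the scan above (hypothetical lane values): if the first four
+    // threads have is_valid = {1, 0, 1, 1}, the inclusive sum yields
+    // thread_valid_count = {1, 1, 2, 3} and block_valid_count counts every valid value
+    // in the batch; a valid thread later writes its input value index into the nz_idx
+    // slot (valid_count + thread_valid_count) - 1.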
- int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); - int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store - int warp_null_count = 0; - if (write_start >= 0) { - uint32_t const warp_validity_mask = ballot(is_valid); + int const in_write_row_bounds = row_index >= first_row && row_index < last_row; + int const in_write_row_bounds_mask = ballot(in_write_row_bounds); + //is first_thread_in_write_range in old + int const write_start = __ffs(in_write_row_bounds_mask) - 1; // first bit in the warp to store + + int warp_null_count = 0; + if ((write_start >= 0) && (ni.valid_map != nullptr)) { + uint32_t const warp_validity_mask = ballot(is_valid); // is warp_valid_mask in old // lane 0 from each warp writes out validity if ((t % cudf::detail::warp_size) == 0) { int const vindex = (value_count + thread_value_count) - 1; // absolute input value index int const bit_offset = (valid_map_offset + vindex + write_start) - first_row; // absolute bit offset into the output validity map int const write_end = - cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store + cudf::detail::warp_size - __clz(in_write_row_bounds_mask); // last bit in the warp to store int const bit_count = write_end - write_start; - warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); //#set bits store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); } @@ -439,7 +468,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( if (t == 0) { // update valid value count for decoding and total # of values we've processed ni.valid_count = valid_count; - ni.value_count = value_count; // TODO: remove? this is unused in the non-list path + ni.value_count = value_count; s->nz_count = valid_count; s->input_value_count = value_count; s->input_row_count = value_count; @@ -448,6 +477,259 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( return valid_count; } +template +static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( + int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, + level_t const* const rep, int t) +{ + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + constexpr int max_batch_size = num_warps * cudf::detail::warp_size; + + // how many (input) values we've processed in the page so far, prior to this loop iteration + int value_count = s->input_value_count; + + // how many rows we've processed in the page so far + int input_row_count = s->input_row_count; +if (t == 0) { printf("value_count %d, input_row_count %d\n", value_count, input_row_count); } + + // cap by last row so that we don't process any rows past what we want to output. + int const first_row = s->first_row; + int const last_row = first_row + s->num_rows; +if (t == 0) { printf("first_row %d, last_row %d, target_value_count %d\n", first_row, last_row, target_value_count); } + + int const row_index_lower_bound = s->row_index_lower_bound; + int const max_depth = s->col.max_nesting_depth - 1; + + __syncthreads(); + + while (value_count < target_value_count) { + bool const within_batch = value_count + t < target_value_count; + + // get definition level. only need to process for nullable columns + int def_level; + if constexpr (nullable) { + if (def) { + def_level = within_batch + ? 
static_cast(def[rolling_index(value_count + t)]) + : -1; + } else { + def_level = within_batch ? 1 : -1; + } + } else { + def_level = 0; + } + + // use repitition level to get start/end depth + // different for each thread, as each thread has a different r/d + int start_depth = -1, end_depth = -1; + if (within_batch) { + int const index = rolling_index(value_count + t); + int const rep_level = rep[index]; + //computed by generate_depth_remappings() + start_depth = s->nesting_info[rep_level].start_depth; + end_depth = s->nesting_info[def_level].end_depth; +if (t == 0) { printf("def_level %d, rep_level %d, start_depth %d, end_depth %d\n", \ + def_level, rep_level, start_depth, end_depth); } + } + + //Determine value count & row index + // track (page-relative) row index for the thread so we can compare against input bounds + // keep track of overall # of rows we've read. + int const is_new_row = start_depth == 0 ? 1 : 0; //TODO: UNCOMMENT + int thread_num_new_rows, total_num_new_rows; + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; + block_scan(scan_storage).InclusiveSum(is_new_row, thread_num_new_rows, total_num_new_rows); + __syncthreads(); //Needed because scan_storage will be reused + +if (t == 0) { printf("thread_num_new_rows %d, total_num_new_rows %d\n", thread_num_new_rows, total_num_new_rows); } + + int const row_index = input_row_count + (thread_num_new_rows - 1); + input_row_count += total_num_new_rows; + int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + + // thread and block value count + + // if we are within the range of nesting levels we should be adding value indices for + // is from/in current rep level to/in the rep level AT the depth with the def value + int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 1 : 0; + +if (t == 0) { printf("row_index %d, in_row_bounds %d, in_nesting_bounds %d\n", \ + row_index, in_row_bounds, in_nesting_bounds); } + + // queries is_valid from all threads, stores prior total and total total + int thread_value_count = 0, block_value_count = 0; +/* int thread_value_count, block_value_count; + block_scan(scan_storage).ExclusiveSum(in_nesting_bounds, thread_value_count, block_value_count); +*/ + //bit mask of all threads that passed true + int const in_write_row_bounds_mask = ballot(in_row_bounds); + +if (t == 0) { printf("thread_value_count %d, block_value_count %d\n", thread_value_count, block_value_count); } + + // column is either nullable or is a list (or both): iterate by depth + for (int d_idx = 0; d_idx <= max_depth; d_idx++) { + + auto& ni = s->nesting_info[d_idx]; + + // everything up to the max_def_level is a non-null value + int is_valid; + if constexpr (nullable) { + is_valid = ((def_level >= ni.max_def_level) && in_nesting_bounds) ? 1 : 0; + } else { + is_valid = in_nesting_bounds; + } + +if (t == 0) { printf("nullable %d, depth %d, max_depth %d, is_valid %d\n", int(nullable), d_idx, max_depth, is_valid); } +if (t < 10) { printf("t %d, is_valid %d\n", t, is_valid); } + + // thread and block validity count + // queries is_valid of all threads, stores prior total and total total + + // for nested lists, it's more complicated. This block will visit 128 incoming values, + // however not all of them will necessarily represent a value at this nesting level. so + // the validity bit for thread t might actually represent output value t-6. the correct + // position for thread t's bit is thread_value_count. 
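+      // Sketch of that packing (hypothetical lane values): if the first four threads have
+      // in_nesting_bounds = {1, 1, 0, 1}, an exclusive sum gives thread_value_count =
+      // {0, 1, 2, 2}; thread 2 contributes no bit, so OR-ing each thread's
+      // (is_valid << thread_value_count) lands the three in-bounds validity bits at
+      // positions 0, 1 and 2 of the block-wide mask.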
+ static_assert(decode_block_size <= 8*sizeof(__uint128_t), + "This code relies on bits for block threads fitting within a uint128!"); + +if (t < 10) { printf("t %d, thread_value_count %d\n", t, thread_value_count); } + +/* using block_reduce = cub::BlockReduce<__uint128_t, decode_block_size>; + __shared__ typename block_reduce::TempStorage reduce_storage; + auto shifted_validity = static_cast<__uint128_t>(is_valid) << thread_value_count; + auto or_reducer = [](const __uint128_t& lhs, const __uint128_t& rhs){ + return lhs | rhs; + }; + __uint128_t block_valid_mask = block_reduce(reduce_storage).Reduce(shifted_validity, or_reducer); +*/ +__uint128_t block_valid_mask = 0; + + //Reduction result is only visible to thread zero, must share with other threads: +/* __shared__ __uint128_t block_valid_mask_storage; + if(t == 0) { block_valid_mask_storage = block_valid_mask; } + __syncthreads(); + block_valid_mask = block_valid_mask_storage; +*/ + auto count_set_bits = [](__uint128_t bits){ + return __popcll((uint64_t)bits) + __popcll((uint64_t)(bits >> 64)); + }; + auto thread_mask = (__uint128_t(1) << thread_value_count) - 1; + int const thread_valid_count = count_set_bits(block_valid_mask & thread_mask); + +if (t == 0) { printf("block_valid_mask %d, thread_valid_count %d\n", int(block_valid_mask), thread_valid_count); } +if (t < 10) { printf("t %d, thread_valid_count %d\n", t, thread_valid_count); } + + // compute warp and thread value counts for the -next- nesting level. we need to + // do this for nested schemas so that we can emit an offset for the -current- nesting + // level. more concretely : the offset for the current nesting level == current length of the + // next nesting level + int32_t next_thread_value_count = 0, next_block_value_count = 0; + int next_in_nesting_bounds = 0; + if (d_idx < max_depth) { + //mask is different between depths + next_in_nesting_bounds = + (d_idx + 1 >= start_depth && d_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0; +/* + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; + block_scan(scan_storage).ExclusiveSum(next_in_nesting_bounds, next_thread_value_count, next_block_value_count); +*/ +if (t == 0) { printf("next_thread_value_count %d, next_block_value_count %d\n", next_thread_value_count, next_block_value_count); } + + // if we're -not- at a leaf column and we're within nesting/row bounds + // and we have a valid data_out pointer, it implies this is a list column, so + // emit an offset. + if (in_nesting_bounds && ni.data_out != nullptr) { + int const idx = ni.value_count + thread_value_count; + cudf::size_type const ofs = s->nesting_info[d_idx + 1].value_count + + next_thread_value_count + + s->nesting_info[d_idx + 1].page_start_value; + //STORE THE OFFSET FOR THE NEW LIST LOCATION + (reinterpret_cast(ni.data_out))[idx] = ofs; + } + } + + // validity is processed per-warp (on lane 0's), because writes are atomic + // + // nested schemas always read and write to the same bounds + // (that is, read and write positions are already pre-bounded by first_row/num_rows). + // since we are about to write the validity vector + // here we need to adjust our computed mask to take into account the write row bounds. 
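+        // Worked example for the offset emission above (hypothetical page data): for two
+        // incoming list rows [a, b] and [c], the list level stores, for each row, the
+        // count of leaf values that precede it (plus page_start_value), so the offsets
+        // buffer receives 0 and 2 while the leaf level decodes a, b and c.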
+ int warp_null_count = 0; + if constexpr (nullable) { + if (ni.valid_map != nullptr) { + uint32_t const warp_validity_mask = ballot(is_valid); + if ((t % cudf::detail::warp_size) == 0) { + // absolute input value index + int const vindex = (value_count + thread_value_count) - 1; + + // absolute bit offset into the output validity map +//TODO: first_row?? + int const bit_offset = (ni.valid_map_offset + vindex) - first_row; + + // last bit in the warp to store //in old is warp_valid_mask_bit_count + int const bit_count = cudf::detail::warp_size - __clz(in_write_row_bounds_mask); + + store_validity(bit_offset, ni.valid_map, warp_validity_mask, bit_count); + warp_null_count = bit_count - __popc(warp_validity_mask); + } + } + + // sum null counts. we have to do it this way instead of just incrementing by (value_count - + // valid_count) because valid_count also includes rows that potentially start before our row + // bounds. if we could come up with a way to clean that up, we could remove this and just + // compute it directly at the end of the kernel. + size_type const block_null_count = + cudf::detail::single_lane_block_sum_reduce(warp_null_count); + if (t == 0) { ni.null_count += block_null_count; } + } + + // if this is valid and we're at the leaf, output dst_pos + __syncthreads(); // handle modification of ni.valid_count from below + if (is_valid && d_idx == max_depth) { + // for non-list types, the value count is always the same across + int const dst_pos = (ni.value_count + thread_value_count) - 1; + int const src_pos = (ni.valid_count + thread_valid_count) - 1; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + __syncthreads(); // handle modification of ni.value_count from below + + // update stuff + if (t == 0) { + int const block_valid_count = count_set_bits(block_valid_mask); + ni.valid_count += block_valid_count; + ni.value_count += block_value_count; + } + + // propagate value counts for the next depth level + block_value_count = next_block_value_count; + thread_value_count = next_thread_value_count; + in_nesting_bounds = next_in_nesting_bounds; + } //END OF DEPTH LOOP + +if (t == 0) { printf("END DEPTH LOOP\n"); } + +//TODO: Shouldn't we guard against threads going beyond the last row? Old algo didn't? + int const batch_size = min(max_batch_size, target_value_count - value_count); + value_count += batch_size; + } + +if (t == 0) { printf("END LOOP\n"); } + + if (t == 0) { + // update valid value count for decoding and total # of values we've processed + s->nz_count = s->nesting_info[max_depth].valid_count; + s->input_value_count = value_count; + + // If we have lists # rows != # values + s->input_row_count = input_row_count; + } + + __syncthreads(); + return s->nesting_info[max_depth].valid_count; +} + // is the page marked nullable or not __device__ inline bool is_nullable(page_state_s* s) { @@ -494,6 +776,7 @@ template typename DecodeValuesFunc> CUDF_KERNEL void __launch_bounds__(decode_block_size_t) @@ -542,25 +825,22 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) bool const should_process_nulls = nullable && maybe_has_nulls(s); // shared buffer. all shared memory is suballocated out of here - // constexpr int shared_rep_size = has_lists_t ? cudf::util::round_up_unsafe(rle_run_buffer_size * - // sizeof(rle_run), size_t{16}) : 0; + constexpr int shared_rep_size = has_lists_t ? cudf::util::round_up_unsafe(rle_run_buffer_size * + sizeof(rle_run), size_t{16}) : 0; constexpr int shared_dict_size = has_dict_t ? 
cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}) : 0; constexpr int shared_def_size = cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}); - constexpr int shared_buf_size = /*shared_rep_size +*/ shared_dict_size + shared_def_size; + constexpr int shared_buf_size = shared_rep_size + shared_dict_size + shared_def_size; __shared__ __align__(16) uint8_t shared_buf[shared_buf_size]; // setup all shared memory buffers int shared_offset = 0; - /* - rle_run *rep_runs = reinterpret_cast*>(shared_buf + shared_offset); - if constexpr (has_lists_t){ - shared_offset += shared_rep_size; - } - */ + rle_run* rep_runs = reinterpret_cast*>(shared_buf + shared_offset); + if constexpr (has_lists_t){ shared_offset += shared_rep_size; } + rle_run* dict_runs = reinterpret_cast*>(shared_buf + shared_offset); if constexpr (has_dict_t) { shared_offset += shared_dict_size; } rle_run* def_runs = reinterpret_cast*>(shared_buf + shared_offset); @@ -575,17 +855,16 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) def, s->page.num_input_values); } - /* + rle_stream rep_decoder{rep_runs}; level_t* const rep = reinterpret_cast(pp->lvl_decode_buf[level_type::REPETITION]); - if constexpr(has_lists_t){ + if constexpr (has_lists_t){ rep_decoder.init(s->col.level_bits[level_type::REPETITION], s->abs_lvl_start[level_type::REPETITION], s->abs_lvl_end[level_type::REPETITION], rep, s->page.num_input_values); } - */ rle_stream dict_stream{dict_runs}; if constexpr (has_dict_t) { @@ -605,17 +884,28 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) int valid_count = 0; // the core loop. decode batches of level stream data using rle_stream objects // and pass the results to gpuDecodeValues + +if (t == 0) { printf("DECODE LOOP: has-list %d, has-nesting %d \n", int(has_lists_t), int(has_nesting_t)); } + while (s->error == 0 && processed_count < s->page.num_input_values) { int next_valid_count; // only need to process definition levels if this is a nullable column if (should_process_nulls) { + if constexpr (has_lists_t){ + rep_decoder.decode_next(t); + } processed_count += def_decoder.decode_next(t); __syncthreads(); if constexpr (has_nesting_t) { - next_valid_count = gpuUpdateValidityAndRowIndicesNested( - processed_count, s, sb, def, t); + if constexpr (has_lists_t) { + next_valid_count = gpuUpdateValidityAndRowIndicesNestedLists( + processed_count, s, sb, def, rep, t); + } else { + next_valid_count = gpuUpdateValidityAndRowIndicesNestedNonLists( + processed_count, s, sb, def, t); + } } else { next_valid_count = gpuUpdateValidityAndRowIndicesFlat( processed_count, s, sb, def, t); @@ -628,9 +918,15 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); if constexpr (has_nesting_t) { - next_valid_count = - gpuUpdateValidityAndRowIndicesNested( - processed_count, s, sb, nullptr, t); + if constexpr (has_lists_t) { + next_valid_count = + gpuUpdateValidityAndRowIndicesNestedLists( + processed_count, s, sb, nullptr, rep, t); + } else { + next_valid_count = + gpuUpdateValidityAndRowIndicesNestedNonLists( + processed_count, s, sb, nullptr, t); + } } else { next_valid_count = gpuUpdateValidityAndRowIndicesFlat( processed_count, s, sb, nullptr, t); @@ -664,6 +960,7 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, size_t min_row, int level_type_size, bool has_nesting, + bool is_list, kernel_error::pointer error_code, rmm::cuda_stream_view stream) { @@ -673,12 
+970,23 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, dim3 dim_grid(pages.size(), 1); // 1 threadblock per page if (level_type_size == 1) { - if (has_nesting) { + if (is_list) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else if (has_nesting) { gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -688,17 +996,29 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, decode_kernel_mask::FIXED_WIDTH_NO_DICT, false, false, + false, decode_fixed_width_values_func> <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); } } else { - if (has_nesting) { + if (is_list) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else if (has_nesting) { gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -708,6 +1028,7 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, decode_kernel_mask::FIXED_WIDTH_NO_DICT, false, false, + false, decode_fixed_width_values_func> <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -721,6 +1042,7 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span pa size_t min_row, int level_type_size, bool has_nesting, + bool is_list, kernel_error::pointer error_code, rmm::cuda_stream_view stream) { @@ -730,12 +1052,23 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span pa dim3 dim_grid(pages.size(), 1); // 1 thread block per page => # blocks if (level_type_size == 1) { - if (has_nesting) { + if (is_list) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else if (has_nesting) { gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -745,17 +1078,29 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span pa decode_kernel_mask::FIXED_WIDTH_DICT, true, false, + false, decode_fixed_width_values_func> <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); } } else { - if (has_nesting) { + if (is_list) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else if (has_nesting) { gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -765,6 +1110,7 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span pa decode_kernel_mask::FIXED_WIDTH_DICT, true, false, + true, decode_fixed_width_values_func> <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -779,6 +1125,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, size_t min_row, int level_type_size, bool has_nesting, + bool is_list, kernel_error::pointer error_code, rmm::cuda_stream_view stream) { @@ -788,12 +1135,23 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, dim3 dim_grid(pages.size(), 1); // 1 thread block per page => # blocks if (level_type_size == 1) { - if (has_nesting) { + if (is_list) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else if (has_nesting) { gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -803,17 +1161,29 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT, true, false, + false, decode_fixed_width_split_values_func> <<>>( pages.device_ptr(), chunks, min_row, 
num_rows, error_code); } } else { - if (has_nesting) { + if (is_list) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else if (has_nesting) { gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); @@ -823,6 +1193,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT, true, false, + false, decode_fixed_width_split_values_func> <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index d604642be54..ac39e2ac291 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -181,6 +181,17 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page, } else if (is_string_col(chunk)) { // check for string before byte_stream_split so FLBA will go to the right kernel return decode_kernel_mask::STRING; + } + + if (is_list(chunk)) { + if (page.encoding == Encoding::PLAIN) { + return decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST; + } else if (page.encoding == Encoding::BYTE_STREAM_SPLIT) { + return decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST; + } else if (page.encoding == Encoding::PLAIN_DICTIONARY || + page.encoding == Encoding::RLE_DICTIONARY) { + return decode_kernel_mask::FIXED_WIDTH_DICT_LIST; + } } if (!is_list(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) { diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index efc1f5ebab1..d666f129af8 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -221,6 +221,9 @@ enum class decode_kernel_mask { (1 << 9), // Same as above but for nested, fixed-width data FIXED_WIDTH_NO_DICT_NESTED = (1 << 10), // Run decode kernel for fixed width non-dictionary pages FIXED_WIDTH_DICT_NESTED = (1 << 11), // Run decode kernel for fixed width dictionary pages + FIXED_WIDTH_DICT_LIST = (1 << 12), // Run decode kernel for fixed width dictionary pages for lists + FIXED_WIDTH_NO_DICT_LIST = (1 << 13), // Run decode kernel for fixed width non-dictionary pages for lists + BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST = (1 << 14), // Run decode kernel for BYTE_STREAM_SPLIT encoded data for fixed width lists }; // mask representing all the ways in which a string can be encoded @@ -815,6 +818,28 @@ void DecodeStringPageData(cudf::detail::hostdevice_span pages, kernel_error::pointer error_code, rmm::cuda_stream_view stream); +/** + * @brief Launches kernel for reading the list column data stored in the pages + * + * The page data will be written to the output pointed to in the page's + * associated column chunk. 
+ * + * @param[in,out] pages All pages to be decoded + * @param[in] chunks All chunks to be decoded + * @param[in] num_rows Total number of rows to read + * @param[in] min_row Minimum number of rows to read + * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[out] error_code Error code for kernel failures + * @param[in] stream CUDA stream to use + */ +void DecodeListPageData(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + kernel_error::pointer error_code, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for reading the DELTA_BINARY_PACKED column data stored in the pages * @@ -893,6 +918,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span pages, * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding * @param[in] has_nesting Whether or not the data contains nested (but not list) data. + * @param[in] is_list Whether or not the data contains list data. * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ @@ -902,6 +928,7 @@ void DecodePageDataFixed(cudf::detail::hostdevice_span pages, size_t min_row, int level_type_size, bool has_nesting, + bool is_list, kernel_error::pointer error_code, rmm::cuda_stream_view stream); @@ -917,6 +944,7 @@ void DecodePageDataFixed(cudf::detail::hostdevice_span pages, * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding * @param[in] has_nesting Whether or not the data contains nested (but not list) data. + * @param[in] is_list Whether or not the data contains list data. * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ @@ -926,6 +954,7 @@ void DecodePageDataFixedDict(cudf::detail::hostdevice_span pages, size_t min_row, int level_type_size, bool has_nesting, + bool is_list, kernel_error::pointer error_code, rmm::cuda_stream_view stream); @@ -941,6 +970,7 @@ void DecodePageDataFixedDict(cudf::detail::hostdevice_span pages, * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding * @param[in] has_nesting Whether or not the data contains nested (but not list) data. + * @param[in] is_list Whether or not the data contains list data. * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ @@ -950,6 +980,7 @@ void DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages size_t min_row, int level_type_size, bool has_nesting, + bool is_list, kernel_error::pointer error_code, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index f705f6626e7..cc98e263664 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -71,6 +71,8 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // figure out which kernels to run auto const kernel_mask = GetAggregatedDecodeKernelMask(subpass.pages, _stream); +printf("DECODE DATA PAGE, mask %d\n", kernel_mask); + // Check to see if there are any string columns present. If so, then we need to get size info // for each string page. 
This size info will be used to pre-allocate memory for the column, // allowing the page decoder to write string data directly to the column buffer, rather than @@ -274,6 +276,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num skip_rows, level_type_size, false, + false, error_code.data(), streams[s_idx++]); } @@ -286,6 +289,20 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num skip_rows, level_type_size, true, + false, + error_code.data(), + streams[s_idx++]); + } + + // launch byte stream split decoder, for list columns + if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST) != 0) { + DecodeSplitPageFixedWidthData(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + true, + true, error_code.data(), streams[s_idx++]); } @@ -309,6 +326,21 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num skip_rows, level_type_size, false, + false, + error_code.data(), + streams[s_idx++]); + } + + // launch fixed width type decoder for lists + if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST) != 0) { +printf("LIST PAGE\n"); + DecodePageDataFixed(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + true, + true, error_code.data(), streams[s_idx++]); } @@ -321,6 +353,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num skip_rows, level_type_size, true, + false, error_code.data(), streams[s_idx++]); } @@ -333,6 +366,20 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num skip_rows, level_type_size, false, + false, + error_code.data(), + streams[s_idx++]); + } + + // launch fixed width type decoder with dictionaries for lists + if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT_LIST) != 0) { + DecodePageDataFixedDict(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + true, + true, error_code.data(), streams[s_idx++]); } @@ -345,6 +392,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num skip_rows, level_type_size, true, + false, error_code.data(), streams[s_idx++]); } From 2ca9618ef3c4f8a0973da9d680143c3776dbb3a7 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Fri, 16 Aug 2024 16:19:50 -0400 Subject: [PATCH 02/36] Further work in list code --- cpp/src/io/parquet/decode_fixed.cu | 220 ++++++++++++++++------------- 1 file changed, 119 insertions(+), 101 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 8157198e116..57eaaf1079e 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -205,7 +205,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( int value_count = s->input_value_count; // cap by last row so that we don't process any rows past what we want to output. 
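+  // Note: in the non-list paths one input value maps to exactly one output row, which is
+  // why capping by last_row below is sufficient; e.g. (hypothetically) a page holding
+  // 1000 input values read with first_row = 0 and num_rows = 600 caps
+  // capped_target_value_count at 600.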
- int const first_row = s->first_row; + int const first_row = s->first_row; //row index WITHIN THE PAGE int const last_row = first_row + s->num_rows; int const capped_target_value_count = min(target_value_count, last_row); @@ -232,7 +232,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( } //Determine value count & row index - int const thread_value_count = t + 1; //# of output values from the view of this thread + int const thread_value_count = t; //# of output values from the view of this thread int const block_value_count = batch_size; int const row_index = t + value_count; //thread_row_index in old int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); @@ -256,8 +256,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( int const thread_valid_count = thread_value_count; // for non-list types, the value count is always the same across - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (ni.valid_count + thread_valid_count) - 1; + int const dst_pos = value_count + thread_value_count; + int const src_pos = ni.valid_count + thread_valid_count; sb->nz_idx[rolling_index(src_pos)] = dst_pos; } @@ -283,7 +283,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; int thread_valid_count, block_valid_count; - block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); + block_scan(scan_storage).ExclusiveSum(is_valid, thread_valid_count, block_valid_count); // validity is processed per-warp (lane 0 writes), because writes are atomic // @@ -296,9 +296,11 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( uint32_t const warp_validity_mask = ballot(is_valid); if ((t % cudf::detail::warp_size) == 0) { // absolute input value index - int const vindex = (value_count + thread_value_count) - 1; + int const vindex = value_count + thread_value_count; // absolute bit offset into the output validity map + // subtract by first_row: we may skip first N rows, + // but still need to write bits at beginning of output vector int const bit_offset = (ni.valid_map_offset + vindex + write_start) - first_row; // last bit in the warp to store @@ -325,8 +327,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( if (is_valid && d_idx == max_depth) { // for non-list types, the value count is always the same across __syncthreads(); // handle modification of ni.valid_count from below - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (ni.valid_count + thread_valid_count) - 1; + int const dst_pos = value_count + thread_value_count; + int const src_pos = ni.valid_count + thread_valid_count; sb->nz_idx[rolling_index(src_pos)] = dst_pos; } __syncthreads(); // handle modification of ni.valid_count from below @@ -376,11 +378,11 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - int const thread_value_count = t + 1; + int const thread_value_count = t; int const block_value_count = batch_size; // compute our row index, whether we're in row bounds, and validity - int const row_index = (thread_value_count + value_count) - 1; + int const row_index = thread_value_count + value_count; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); // 
determine if is valid @@ -407,7 +409,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( // values for each individual thread (how many valids there are including me, but no one after me) using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; - block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); + block_scan(scan_storage).ExclusiveSum(is_valid, thread_valid_count, block_valid_count); __syncthreads(); // validity is processed per-warp, because storing is an atomic operation @@ -427,9 +429,11 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( uint32_t const warp_validity_mask = ballot(is_valid); // is warp_valid_mask in old // lane 0 from each warp writes out validity if ((t % cudf::detail::warp_size) == 0) { - int const vindex = (value_count + thread_value_count) - 1; // absolute input value index - int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map + int const vindex = value_count + thread_value_count; // absolute input value index + + // absolute bit offset into the output validity map + int const bit_offset = (valid_map_offset + vindex + write_start) - first_row; + int const write_end = cudf::detail::warp_size - __clz(in_write_row_bounds_mask); // last bit in the warp to store int const bit_count = write_end - write_start; @@ -455,8 +459,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( // output offset if (is_valid) { - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (valid_count + thread_valid_count) - 1; + int const dst_pos = value_count + thread_value_count; + int const src_pos = valid_count + thread_valid_count; sb->nz_idx[rolling_index(src_pos)] = dst_pos; } @@ -482,20 +486,23 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, level_t const* const rep, int t) { + //What is the output of this? Validity bits and offsets to list starts constexpr int num_warps = decode_block_size / cudf::detail::warp_size; constexpr int max_batch_size = num_warps * cudf::detail::warp_size; // how many (input) values we've processed in the page so far, prior to this loop iteration int value_count = s->input_value_count; + int printf_num_threads = 34; + // how many rows we've processed in the page so far int input_row_count = s->input_row_count; -if (t == 0) { printf("value_count %d, input_row_count %d\n", value_count, input_row_count); } + if (t == 0) { printf("value_count %d, input_row_count %d\n", value_count, input_row_count); } // cap by last row so that we don't process any rows past what we want to output. int const first_row = s->first_row; int const last_row = first_row + s->num_rows; -if (t == 0) { printf("first_row %d, last_row %d, target_value_count %d\n", first_row, last_row, target_value_count); } + if (t == 0) { printf("first_row %d, last_row %d, target_value_count %d\n", first_row, last_row, target_value_count); } int const row_index_lower_bound = s->row_index_lower_bound; int const max_depth = s->col.max_nesting_depth - 1; @@ -505,46 +512,34 @@ if (t == 0) { printf("first_row %d, last_row %d, target_value_count %d\n", first while (value_count < target_value_count) { bool const within_batch = value_count + t < target_value_count; - // get definition level. 
only need to process for nullable columns - int def_level; - if constexpr (nullable) { - if (def) { - def_level = within_batch - ? static_cast(def[rolling_index(value_count + t)]) - : -1; - } else { - def_level = within_batch ? 1 : -1; - } - } else { - def_level = 0; - } - - // use repitition level to get start/end depth + // get definition level, use repitition level to get start/end depth // different for each thread, as each thread has a different r/d - int start_depth = -1, end_depth = -1; + int def_level = -1, start_depth = -1, end_depth = -1; if (within_batch) { int const index = rolling_index(value_count + t); int const rep_level = rep[index]; + def_level = static_cast(def[rolling_index(value_count + t)]); + //computed by generate_depth_remappings() start_depth = s->nesting_info[rep_level].start_depth; end_depth = s->nesting_info[def_level].end_depth; -if (t == 0) { printf("def_level %d, rep_level %d, start_depth %d, end_depth %d\n", \ - def_level, rep_level, start_depth, end_depth); } + if (t < printf_num_threads) { printf("t %d, def_level %d, rep_level %d, start_depth %d, end_depth %d\n", \ + t, def_level, rep_level, start_depth, end_depth); } } //Determine value count & row index // track (page-relative) row index for the thread so we can compare against input bounds // keep track of overall # of rows we've read. - int const is_new_row = start_depth == 0 ? 1 : 0; //TODO: UNCOMMENT - int thread_num_new_rows, total_num_new_rows; + int const is_new_row = start_depth == 0 ? 1 : 0; + int num_prior_new_rows, total_num_new_rows; using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; - block_scan(scan_storage).InclusiveSum(is_new_row, thread_num_new_rows, total_num_new_rows); + block_scan(scan_storage).ExclusiveSum(is_new_row, num_prior_new_rows, total_num_new_rows); __syncthreads(); //Needed because scan_storage will be reused -if (t == 0) { printf("thread_num_new_rows %d, total_num_new_rows %d\n", thread_num_new_rows, total_num_new_rows); } + if (t == 0) { printf("num_prior_new_rows %d, total_num_new_rows %d\n", num_prior_new_rows, total_num_new_rows); } - int const row_index = input_row_count + (thread_num_new_rows - 1); + int const row_index = input_row_count + (num_prior_new_rows + is_new_row - 1); input_row_count += total_num_new_rows; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); @@ -554,18 +549,21 @@ if (t == 0) { printf("thread_num_new_rows %d, total_num_new_rows %d\n", thread_n // is from/in current rep level to/in the rep level AT the depth with the def value int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 
1 : 0; -if (t == 0) { printf("row_index %d, in_row_bounds %d, in_nesting_bounds %d\n", \ - row_index, in_row_bounds, in_nesting_bounds); } + if (t == 0) { printf("row_index %d, in_row_bounds %d, in_nesting_bounds %d, last_row %d\n", \ + row_index, in_row_bounds, in_nesting_bounds, last_row); } + if (t < printf_num_threads) { printf("t %d, is_new_row %d, num_prior_new_rows %d, row_index %d, in_row_bounds %d\n", + t, is_new_row, num_prior_new_rows, row_index, in_row_bounds); } // queries is_valid from all threads, stores prior total and total total int thread_value_count = 0, block_value_count = 0; -/* int thread_value_count, block_value_count; block_scan(scan_storage).ExclusiveSum(in_nesting_bounds, thread_value_count, block_value_count); -*/ - //bit mask of all threads that passed true - int const in_write_row_bounds_mask = ballot(in_row_bounds); -if (t == 0) { printf("thread_value_count %d, block_value_count %d\n", thread_value_count, block_value_count); } + if (t == 0) { printf("block_value_count %d\n", block_value_count); } + if (t < printf_num_threads) { printf("t %d, thread_value_count %d, in_nesting_bounds %d\n", + t, thread_value_count, in_nesting_bounds); } + + //bit mask of all threads that passed true //TODO DELETE ME + //uint32_t const in_write_row_bounds_mask = ballot(in_row_bounds); // column is either nullable or is a list (or both): iterate by depth for (int d_idx = 0; d_idx <= max_depth; d_idx++) { @@ -580,8 +578,9 @@ if (t == 0) { printf("thread_value_count %d, block_value_count %d\n", thread_val is_valid = in_nesting_bounds; } -if (t == 0) { printf("nullable %d, depth %d, max_depth %d, is_valid %d\n", int(nullable), d_idx, max_depth, is_valid); } -if (t < 10) { printf("t %d, is_valid %d\n", t, is_valid); } + if (t == 0) { printf("nullable %d, depth %d, max_depth %d, max_def_level %d, value_count %d\n", + int(nullable), d_idx, max_depth, ni.max_def_level, value_count); } + if (t < printf_num_threads) { printf("t %d, def_level %d, in_nesting_bounds %d, is_valid %d\n", t, def_level, in_nesting_bounds, is_valid); } // thread and block validity count // queries is_valid of all threads, stores prior total and total total @@ -593,32 +592,28 @@ if (t < 10) { printf("t %d, is_valid %d\n", t, is_valid); } static_assert(decode_block_size <= 8*sizeof(__uint128_t), "This code relies on bits for block threads fitting within a uint128!"); -if (t < 10) { printf("t %d, thread_value_count %d\n", t, thread_value_count); } - -/* using block_reduce = cub::BlockReduce<__uint128_t, decode_block_size>; + using block_reduce = cub::BlockReduce<__uint128_t, decode_block_size>; __shared__ typename block_reduce::TempStorage reduce_storage; auto shifted_validity = static_cast<__uint128_t>(is_valid) << thread_value_count; auto or_reducer = [](const __uint128_t& lhs, const __uint128_t& rhs){ return lhs | rhs; }; __uint128_t block_valid_mask = block_reduce(reduce_storage).Reduce(shifted_validity, or_reducer); -*/ -__uint128_t block_valid_mask = 0; //Reduction result is only visible to thread zero, must share with other threads: -/* __shared__ __uint128_t block_valid_mask_storage; + __shared__ __uint128_t block_valid_mask_storage; if(t == 0) { block_valid_mask_storage = block_valid_mask; } __syncthreads(); block_valid_mask = block_valid_mask_storage; -*/ + auto count_set_bits = [](__uint128_t bits){ return __popcll((uint64_t)bits) + __popcll((uint64_t)(bits >> 64)); }; auto thread_mask = (__uint128_t(1) << thread_value_count) - 1; int const thread_valid_count = count_set_bits(block_valid_mask & 
thread_mask); -if (t == 0) { printf("block_valid_mask %d, thread_valid_count %d\n", int(block_valid_mask), thread_valid_count); } -if (t < 10) { printf("t %d, thread_valid_count %d\n", t, thread_valid_count); } + if (t == 0) { printf("block_valid_mask %d\n", int(block_valid_mask)); } + if (t < printf_num_threads) { printf("t %d, thread_valid_count %d\n", t, thread_valid_count); } // compute warp and thread value counts for the -next- nesting level. we need to // do this for nested schemas so that we can emit an offset for the -current- nesting @@ -630,23 +625,28 @@ if (t < 10) { printf("t %d, thread_valid_count %d\n", t, thread_valid_count); } //mask is different between depths next_in_nesting_bounds = (d_idx + 1 >= start_depth && d_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0; -/* + using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; block_scan(scan_storage).ExclusiveSum(next_in_nesting_bounds, next_thread_value_count, next_block_value_count); -*/ -if (t == 0) { printf("next_thread_value_count %d, next_block_value_count %d\n", next_thread_value_count, next_block_value_count); } + + if (t == 0) { printf("next depth %d, next_block_value_count %d\n", d_idx + 1, next_block_value_count); } + if (t < printf_num_threads) { printf("t %d, start_depth %d, end_depth %d, in_row_bounds %d, next_in_nesting_bounds %d\n", + t, start_depth, end_depth, in_row_bounds, next_in_nesting_bounds); } + if (t < printf_num_threads) { printf("t %d, next_thread_value_count %d\n", t, next_thread_value_count); } // if we're -not- at a leaf column and we're within nesting/row bounds // and we have a valid data_out pointer, it implies this is a list column, so // emit an offset. if (in_nesting_bounds && ni.data_out != nullptr) { + const auto& next_ni = s->nesting_info[d_idx + 1]; int const idx = ni.value_count + thread_value_count; - cudf::size_type const ofs = s->nesting_info[d_idx + 1].value_count + - next_thread_value_count + - s->nesting_info[d_idx + 1].page_start_value; + cudf::size_type const ofs = next_ni.value_count + next_thread_value_count + next_ni.page_start_value; + //STORE THE OFFSET FOR THE NEW LIST LOCATION (reinterpret_cast(ni.data_out))[idx] = ofs; + if (t < printf_num_threads) { printf("OFFSETS: t %d, idx %d, next value count %d, next page_start_value %d, ofs %d\n", + t, idx, next_ni.value_count, next_ni.page_start_value, ofs); } } } @@ -659,20 +659,34 @@ if (t == 0) { printf("next_thread_value_count %d, next_block_value_count %d\n", int warp_null_count = 0; if constexpr (nullable) { if (ni.valid_map != nullptr) { - uint32_t const warp_validity_mask = ballot(is_valid); +//TODO: Consider OR'ING for next_thread_value_count and popc() for next_thread_value_count +//so that we don't have to take a ballot here. Is uint128 so may deconstruct to this anyway ... + uint32_t const warp_count_mask = ballot(in_nesting_bounds); if ((t % cudf::detail::warp_size) == 0) { - // absolute input value index - int const vindex = (value_count + thread_value_count) - 1; + // last bit in the warp to store //in old is warp_valid_mask_bit_count +//so it's a count of everything in nesting bounds, though bits can be zero if NULL at this level + int const bit_count = __popc(warp_count_mask); + if(bit_count > 0) { - // absolute bit offset into the output validity map -//TODO: first_row?? 
- int const bit_offset = (ni.valid_map_offset + vindex) - first_row; + // absolute input value index + int const vindex = value_count + thread_value_count; - // last bit in the warp to store //in old is warp_valid_mask_bit_count - int const bit_count = cudf::detail::warp_size - __clz(in_write_row_bounds_mask); + // absolute bit offset into the output validity map + //is cumulative sum of bit_count at the given nesting depth + // DON'T subtract by first_row: since it's lists it's not 1-row-per-value + // valid_map_offset was already set during list pre-processing for appropriate start index + int const bit_offset = ni.valid_map_offset + vindex; + + auto const shifted_valid_mask = static_cast(block_valid_mask >> thread_value_count); + auto const bit_range_mask = (1 << bit_count) - 1; //mainly needed for warp_null_count + auto const warp_validity_mask = shifted_valid_mask & bit_range_mask; - store_validity(bit_offset, ni.valid_map, warp_validity_mask, bit_count); - warp_null_count = bit_count - __popc(warp_validity_mask); + printf("t %d, thread_value_count %d, vindex %d, bit_offset %d, bit_count %d, warp_validity_mask %u\n", + t, thread_value_count, vindex, bit_offset, bit_count, warp_validity_mask); + + store_validity(bit_offset, ni.valid_map, warp_validity_mask, bit_count); + warp_null_count = bit_count - __popc(warp_validity_mask); + } } } @@ -689,9 +703,14 @@ if (t == 0) { printf("next_thread_value_count %d, next_block_value_count %d\n", __syncthreads(); // handle modification of ni.valid_count from below if (is_valid && d_idx == max_depth) { // for non-list types, the value count is always the same across - int const dst_pos = (ni.value_count + thread_value_count) - 1; - int const src_pos = (ni.valid_count + thread_valid_count) - 1; - sb->nz_idx[rolling_index(src_pos)] = dst_pos; + int const dst_pos = ni.value_count + thread_value_count; + int const src_pos = ni.valid_count + thread_valid_count; + int const output_index = rolling_index(src_pos); + + if (t == 0) { printf("ni.value_count %d, ni.valid_count %d\n", int(ni.value_count), int(ni.valid_count)); } + if (t < printf_num_threads) { printf("t %d, src_pos %d, output_index %d\n", t, src_pos, output_index); } + + sb->nz_idx[output_index] = dst_pos; } __syncthreads(); // handle modification of ni.value_count from below @@ -708,14 +727,13 @@ if (t == 0) { printf("next_thread_value_count %d, next_block_value_count %d\n", in_nesting_bounds = next_in_nesting_bounds; } //END OF DEPTH LOOP -if (t == 0) { printf("END DEPTH LOOP\n"); } + if (t == 0) { printf("END DEPTH LOOP\n"); } -//TODO: Shouldn't we guard against threads going beyond the last row? Old algo didn't? int const batch_size = min(max_batch_size, target_value_count - value_count); value_count += batch_size; } -if (t == 0) { printf("END LOOP\n"); } + if (t == 0) { printf("END LOOP\n"); } if (t == 0) { // update valid value count for decoding and total # of values we've processed @@ -823,6 +841,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) bool const nullable = is_nullable(s); bool const should_process_nulls = nullable && maybe_has_nulls(s); + bool const should_process_def_levels = should_process_nulls || has_lists_t; // shared buffer. all shared memory is suballocated out of here constexpr int shared_rep_size = has_lists_t ? 
cudf::util::round_up_unsafe(rle_run_buffer_size * @@ -848,7 +867,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // initialize the stream decoders (requires values computed in setupLocalPageInfo) rle_stream def_decoder{def_runs}; level_t* const def = reinterpret_cast(pp->lvl_decode_buf[level_type::DEFINITION]); - if (should_process_nulls) { + if (should_process_def_levels) { def_decoder.init(s->col.level_bits[level_type::DEFINITION], s->abs_lvl_start[level_type::DEFINITION], s->abs_lvl_end[level_type::DEFINITION], @@ -885,7 +904,8 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // the core loop. decode batches of level stream data using rle_stream objects // and pass the results to gpuDecodeValues -if (t == 0) { printf("DECODE LOOP: has-list %d, has-nesting %d \n", int(has_lists_t), int(has_nesting_t)); } +if (t == 0) { printf("DECODE LOOP: has-list %d, has-nesting %d, should nulls %d \n", + int(has_lists_t), int(has_nesting_t), int(should_process_nulls)); } while (s->error == 0 && processed_count < s->page.num_input_values) { int next_valid_count; @@ -898,14 +918,12 @@ if (t == 0) { printf("DECODE LOOP: has-list %d, has-nesting %d \n", int(has_list processed_count += def_decoder.decode_next(t); __syncthreads(); - if constexpr (has_nesting_t) { - if constexpr (has_lists_t) { - next_valid_count = gpuUpdateValidityAndRowIndicesNestedLists( - processed_count, s, sb, def, rep, t); - } else { - next_valid_count = gpuUpdateValidityAndRowIndicesNestedNonLists( - processed_count, s, sb, def, t); - } + if constexpr (has_lists_t) { + next_valid_count = gpuUpdateValidityAndRowIndicesNestedLists( + processed_count, s, sb, def, rep, t); + } else if constexpr (has_nesting_t) { + next_valid_count = gpuUpdateValidityAndRowIndicesNestedNonLists( + processed_count, s, sb, def, t); } else { next_valid_count = gpuUpdateValidityAndRowIndicesFlat( processed_count, s, sb, def, t); @@ -917,16 +935,16 @@ if (t == 0) { printf("DECODE LOOP: has-list %d, has-nesting %d \n", int(has_list else { processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); - if constexpr (has_nesting_t) { - if constexpr (has_lists_t) { - next_valid_count = - gpuUpdateValidityAndRowIndicesNestedLists( - processed_count, s, sb, nullptr, rep, t); - } else { - next_valid_count = - gpuUpdateValidityAndRowIndicesNestedNonLists( - processed_count, s, sb, nullptr, t); - } + if constexpr (has_lists_t) { + // no nulls, but if we have a list we still need the definition levels + def_decoder.decode_next(t); + next_valid_count = + gpuUpdateValidityAndRowIndicesNestedLists( + processed_count, s, sb, def, rep, t); + } else if constexpr (has_nesting_t) { + next_valid_count = + gpuUpdateValidityAndRowIndicesNestedNonLists( + processed_count, s, sb, nullptr, t); } else { next_valid_count = gpuUpdateValidityAndRowIndicesFlat( processed_count, s, sb, nullptr, t); From 4b5f91a9f6b7c70400a9ed08b05c9f82cc3be971 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Tue, 27 Aug 2024 14:59:14 -0400 Subject: [PATCH 03/36] Tests working --- .../cudf/table/experimental/row_operators.cuh | 14 +- cpp/src/io/parquet/decode_fixed.cu | 462 +++++++++++------- cpp/src/io/parquet/page_data.cuh | 18 + cpp/src/io/parquet/page_decode.cuh | 101 ++-- cpp/src/io/parquet/page_hdr.cu | 3 +- cpp/src/io/parquet/reader_impl.cpp | 8 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 12 + 7 files changed, 411 insertions(+), 207 deletions(-) diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh 
b/cpp/include/cudf/table/experimental/row_operators.cuh index e9b81a525fc..75de3a75197 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -1429,18 +1429,30 @@ class device_row_comparator { __device__ bool operator()(size_type const lhs_element_index, size_type const rhs_element_index) const noexcept { + static constexpr bool enable_print = false; if (check_nulls) { bool const lhs_is_null{lhs.is_null(lhs_element_index)}; bool const rhs_is_null{rhs.is_null(rhs_element_index)}; if (lhs_is_null and rhs_is_null) { return nulls_are_equal == null_equality::EQUAL; } else if (lhs_is_null != rhs_is_null) { + if constexpr (enable_print) { + printf("NULLS UNEQUAL AT %d, %d; values: %d %d\n", + lhs_element_index, rhs_element_index, int(lhs_is_null), int(rhs_is_null)); + } return false; } } - return comparator(lhs.element(lhs_element_index), + bool result = comparator(lhs.element(lhs_element_index), rhs.element(rhs_element_index)); + if constexpr (enable_print && cuda::std::is_integral_v) { + if(!result) { + printf("VALUES UNEQUAL: AT %d, %d, VALUES %d, %d\n", lhs_element_index, rhs_element_index, + (int)lhs.element(lhs_element_index), (int)rhs.element(rhs_element_index)); + } + } + return result; } template +template __device__ inline void gpuDecodeFixedWidthValues( page_state_s* s, state_buf* const sb, int start, int end, int t) { @@ -34,26 +34,66 @@ __device__ inline void gpuDecodeFixedWidthValues( PageNestingDecodeInfo* nesting_info_base = s->nesting_info; int const dtype = s->col.physical_type; + int const leaf_level_index = s->col.max_nesting_depth - 1; + uint32_t dtype_len = s->dtype_len; + auto const data_out = nesting_info_base[leaf_level_index].data_out; + uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; + + static constexpr bool enable_print = false; + if constexpr (enable_print) { + if(t == 0) { printf("DECODE VALUES: start %d, end %d, first_row %d, leaf_level_index %d, dtype_len %u, " + "data_out %p, dict_base %p, dict_size %d, dict_bits %d, dict_val %d, data_start %p, skipped_leaf_values %u, input_row_count %d\n", + start, end, s->first_row, leaf_level_index, dtype_len, data_out, s->dict_base, s->dict_bits, s->dict_val, + s->dict_size, s->data_start, skipped_leaf_values, s->input_row_count); + } + } + // decode values int pos = start; while (pos < end) { int const batch_size = min(max_batch_size, end - pos); int const target_pos = pos + batch_size; - int const src_pos = pos + t; + int src_pos = pos + t; // the position in the output column/buffer - int dst_pos = sb->nz_idx[rolling_index(src_pos)] - s->first_row; +//Index from rolling buffer of values (which doesn't include nulls) to final array (which includes gaps for nulls) + auto offset = sb->nz_idx[rolling_index(src_pos)]; + int dst_pos = offset; + if constexpr (!has_lists_t) { + dst_pos -= s->first_row; + } + + int dict_idx = rolling_index(src_pos + skipped_leaf_values); + int dict_pos = sb->dict_idx[dict_idx]; + if constexpr (enable_print) { + if(t == 0) { + printf("DECODE OFFSETS: pos %d, src_pos %d, offset %d, dst_pos %d, target_pos %d, dict_idx %d, dict_pos %d\n", + pos, src_pos, offset, dst_pos, target_pos, dict_idx, dict_pos); + } + } // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. 
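// A minimal sketch (not the exact cudf helper) of what the rolling_index() calls above do:
// the nz_idx / dict_idx scratch buffers only hold a fixed-size window of recently decoded
// positions, so an absolute value position is wrapped into that window. The real helper is
// templated on the buffer size; buffer_size here is an illustrative parameter.
template <int buffer_size>
__device__ constexpr int rolling_index_sketch(int absolute_pos)
{
  // with a power-of-two buffer_size the compiler lowers this to a simple bit mask
  return absolute_pos % buffer_size;
}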
if (src_pos < target_pos && dst_pos >= 0) { // nesting level that is storing actual leaf values - int const leaf_level_index = s->col.max_nesting_depth - 1; - uint32_t dtype_len = s->dtype_len; - void* dst = - nesting_info_base[leaf_level_index].data_out + static_cast(dst_pos) * dtype_len; + // src_pos represents the logical row position we want to read from. But in the case of + // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So our true read position + // has to take into account the # of values we have to skip in the page to get to the + // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. + if constexpr (has_lists_t) { + src_pos += skipped_leaf_values; + } + + void* dst = data_out + static_cast(dst_pos) * dtype_len; + if constexpr (enable_print) { + if(dst_pos == 0) { + printf("WRITTEN TO dst_pos ZERO: t %d, data_out %p, dst %p, src_pos %d, dict_idx %d, dict_pos %d, dict_base %p\n", + t, data_out, dst, src_pos, dict_idx, dict_pos, s->dict_base); + } + } + if (s->col.logical_type.has_value() && s->col.logical_type->type == LogicalType::DECIMAL) { switch (dtype) { case INT32: gpuOutputFast(s, sb, src_pos, static_cast(dst)); break; @@ -92,15 +132,15 @@ __device__ inline void gpuDecodeFixedWidthValues( } } -template +template struct decode_fixed_width_values_func { __device__ inline void operator()(page_state_s* s, state_buf* const sb, int start, int end, int t) { - gpuDecodeFixedWidthValues(s, sb, start, end, t); + gpuDecodeFixedWidthValues(s, sb, start, end, t); } }; -template +template __device__ inline void gpuDecodeFixedWidthSplitValues( page_state_s* s, state_buf* const sb, int start, int end, int t) { @@ -112,6 +152,7 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( int const dtype = s->col.physical_type; auto const data_len = thrust::distance(s->data_start, s->data_end); auto const num_values = data_len / s->dtype_len_in; + uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; // decode values int pos = start; @@ -119,10 +160,13 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( int const batch_size = min(max_batch_size, end - pos); int const target_pos = pos + batch_size; - int const src_pos = pos + t; + int src_pos = pos + t; // the position in the output column/buffer - int dst_pos = sb->nz_idx[rolling_index(src_pos)] - s->first_row; + int dst_pos = sb->nz_idx[rolling_index(src_pos)]; + if constexpr (!has_lists_t) { + dst_pos -= s->first_row; + } // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. @@ -130,6 +174,14 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( // nesting level that is storing actual leaf values int const leaf_level_index = s->col.max_nesting_depth - 1; + // src_pos represents the logical row position we want to read from. But in the case of + // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So our true read position + // has to take into account the # of values we have to skip in the page to get to the + // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. 
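// One way to picture the skipped_leaf_values adjustment applied just below (a hedged,
// host-side sketch, not the preprocessing cudf actually runs): for a list column,
// repetition level 0 marks the first leaf slot of a new top-level row, so the number of
// leaf slots preceding row `skip_rows` is the number of level entries seen before the
// (skip_rows + 1)-th zero. All names here are illustrative.
#include <cstdint>
#include <vector>

inline int count_skipped_leaf_values(std::vector<uint8_t> const& rep_levels, int skip_rows)
{
  int rows_started = 0;
  int skipped      = 0;
  for (auto const r : rep_levels) {
    if (r == 0) {                 // a new top-level row starts at this slot
      if (rows_started == skip_rows) { return skipped; }
      ++rows_started;
    }
    ++skipped;                    // every repetition-level entry is one leaf slot
  }
  return skipped;                 // the page ended before reaching skip_rows
}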
+ if constexpr (has_lists_t) { + src_pos += skipped_leaf_values; + } + uint32_t dtype_len = s->dtype_len; uint8_t const* src = s->data_start + src_pos; uint8_t* dst = @@ -186,11 +238,11 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( } } -template +template struct decode_fixed_width_split_values_func { __device__ inline void operator()(page_state_s* s, state_buf* const sb, int start, int end, int t) { - gpuDecodeFixedWidthSplitValues(s, sb, start, end, t); + gpuDecodeFixedWidthSplitValues(s, sb, start, end, t); } }; @@ -201,11 +253,11 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( constexpr int num_warps = decode_block_size / cudf::detail::warp_size; constexpr int max_batch_size = num_warps * cudf::detail::warp_size; - // how many (input) values we've processed in the page so far, prior to this loop iteration + // how many (input) values we've processed in the page so far int value_count = s->input_value_count; // cap by last row so that we don't process any rows past what we want to output. - int const first_row = s->first_row; //row index WITHIN THE PAGE + int const first_row = s->first_row; int const last_row = first_row + s->num_rows; int const capped_target_value_count = min(target_value_count, last_row); @@ -217,101 +269,69 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - // get definition level. only need to process for nullable columns - int def_level; + // definition level. only need to process for nullable columns + int d = 0; if constexpr (nullable) { if (def) { - def_level = t < batch_size + d = t < batch_size ? static_cast(def[rolling_index(value_count + t)]) : -1; } else { - def_level = t < batch_size ? 1 : -1; + d = t < batch_size ? 
1 : -1; } - } else { - def_level = 0; } - //Determine value count & row index - int const thread_value_count = t; //# of output values from the view of this thread - int const block_value_count = batch_size; - int const row_index = t + value_count; //thread_row_index in old - int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); - - //per-warp variables used below for writing validity - int const in_write_row_bounds = (row_index >= first_row) && (row_index < last_row); - - //bit mask of all threads that passed true - int const in_write_row_bounds_mask = ballot(in_write_row_bounds); - - // index of first set bit (in the warp to store) - int write_start = __ffs(in_write_row_bounds_mask) - 1; - - // remaining code is trivial for non-nullable, non-list columns: no need to iterate over depth - if constexpr (!nullable) { + int const thread_value_count = t + 1; + int const block_value_count = batch_size; - // if this is valid and we're at the leaf, output dst_pos - int const is_valid = in_row_bounds; - if (is_valid) { - auto& ni = s->nesting_info[max_depth]; - int const thread_valid_count = thread_value_count; + // compute our row index, whether we're in row bounds, and validity + int const row_index = (thread_value_count + value_count) - 1; + int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); + int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store - // for non-list types, the value count is always the same across - int const dst_pos = value_count + thread_value_count; - int const src_pos = ni.valid_count + thread_valid_count; - sb->nz_idx[rolling_index(src_pos)] = dst_pos; - } + // iterate by depth + for (int d_idx = 0; d_idx <= max_depth; d_idx++) { + auto& ni = s->nesting_info[d_idx]; - // update valid_count - if (t == 0) { - int const block_valid_count = block_value_count; - s->nesting_info[max_depth].valid_count += block_valid_count; + int is_valid; + if constexpr (nullable) { + is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 1 : 0; + } else { + is_valid = in_row_bounds; } - __syncthreads(); // publish modification of nesting_info value_count - } else { - - // column is a nullable non-list: iterate by depth - for (int d_idx = 0; d_idx <= max_depth; d_idx++) { - - auto& ni = s->nesting_info[d_idx]; - - // everything up to the max_def_level is a non-null value - int is_valid = ((def_level >= ni.max_def_level) && in_row_bounds) ? 1 : 0; - - // thread and block validity count - // queries is_valid of all threads, stores prior total and total total + // thread and block validity count + int thread_valid_count, block_valid_count; + if constexpr (nullable) { using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; - int thread_valid_count, block_valid_count; - block_scan(scan_storage).ExclusiveSum(is_valid, thread_valid_count, block_valid_count); + block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); + __syncthreads(); - // validity is processed per-warp (lane 0 writes), because writes are atomic + // validity is processed per-warp // - // nested schemas always read and write to the same bounds - // (that is, read and write positions are already pre-bounded by first_row/num_rows). 
- // since we are about to write the validity vector + // nested schemas always read and write to the same bounds (that is, read and write + // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading + // at the first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector // here we need to adjust our computed mask to take into account the write row bounds. int warp_null_count = 0; - if ((write_start >= 0) && (ni.valid_map != nullptr)) { + if (write_start >= 0 && ni.valid_map != nullptr) { + int const valid_map_offset = ni.valid_map_offset; uint32_t const warp_validity_mask = ballot(is_valid); + // lane 0 from each warp writes out validity if ((t % cudf::detail::warp_size) == 0) { - // absolute input value index - int const vindex = value_count + thread_value_count; - - // absolute bit offset into the output validity map - // subtract by first_row: we may skip first N rows, - // but still need to write bits at beginning of output vector - int const bit_offset = (ni.valid_map_offset + vindex + write_start) - first_row; - - // last bit in the warp to store - int const write_end = cudf::detail::warp_size - __clz(in_write_row_bounds_mask); - int const bit_count = write_end - write_start; //in old is warp_valid_mask_bit_count - - uint32_t const warp_output_valid_mask = warp_validity_mask >> write_start; - - store_validity(bit_offset, ni.valid_map, warp_output_valid_mask, bit_count); - - warp_null_count = bit_count - __popc(warp_output_valid_mask); + int const vindex = + (value_count + thread_value_count) - 1; // absolute input value index + int const bit_offset = (valid_map_offset + vindex + write_start) - + first_row; // absolute bit offset into the output validity map + int const write_end = cudf::detail::warp_size - + __clz(in_write_row_bounds); // last bit in the warp to store + int const bit_count = write_end - write_start; + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); + + store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); } } @@ -322,20 +342,25 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( size_type const block_null_count = cudf::detail::single_lane_block_sum_reduce(warp_null_count); if (t == 0) { ni.null_count += block_null_count; } + } + // trivial for non-nullable columns + else { + thread_valid_count = thread_value_count; + block_valid_count = block_value_count; + } - // if this is valid and we're at the leaf, output dst_pos - if (is_valid && d_idx == max_depth) { - // for non-list types, the value count is always the same across - __syncthreads(); // handle modification of ni.valid_count from below - int const dst_pos = value_count + thread_value_count; - int const src_pos = ni.valid_count + thread_valid_count; - sb->nz_idx[rolling_index(src_pos)] = dst_pos; - } - __syncthreads(); // handle modification of ni.valid_count from below + // if this is valid and we're at the leaf, output dst_pos + __syncthreads(); // handle modification of ni.value_count from below + if (is_valid && d_idx == max_depth) { + // for non-list types, the value count is always the same across + int const dst_pos = (value_count + thread_value_count) - 1; + int const src_pos = (ni.valid_count + thread_valid_count) - 1; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + __syncthreads(); // handle modification of ni.value_count from below - // update stuff - if (t == 0) { 
ni.valid_count += block_valid_count; } - } //END OF DEPTH LOOP + // update stuff + if (t == 0) { ni.valid_count += block_valid_count; } } value_count += block_value_count; @@ -482,7 +507,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( } template -static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( +static __device__ int gpuUpdateValidityAndRowIndicesLists( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, level_t const* const rep, int t) { @@ -493,16 +518,21 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( // how many (input) values we've processed in the page so far, prior to this loop iteration int value_count = s->input_value_count; - int printf_num_threads = 34; + static constexpr bool enable_print = false; + int const printf_num_threads = 32; // how many rows we've processed in the page so far int input_row_count = s->input_row_count; - if (t == 0) { printf("value_count %d, input_row_count %d\n", value_count, input_row_count); } + if constexpr (enable_print) { + if (t == 0) { printf("value_count %d, input_row_count %d\n", value_count, input_row_count); } + } // cap by last row so that we don't process any rows past what we want to output. int const first_row = s->first_row; int const last_row = first_row + s->num_rows; - if (t == 0) { printf("first_row %d, last_row %d, target_value_count %d\n", first_row, last_row, target_value_count); } + if constexpr (enable_print) { + if (t == 0) { printf("first_row %d, last_row %d, target_value_count %d\n", first_row, last_row, target_value_count); } + } int const row_index_lower_bound = s->row_index_lower_bound; int const max_depth = s->col.max_nesting_depth - 1; @@ -510,6 +540,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( __syncthreads(); while (value_count < target_value_count) { + if constexpr (enable_print) { + if(t == 0) { printf("VALUE COUNT: %d\n", value_count); } + } bool const within_batch = value_count + t < target_value_count; // get definition level, use repitition level to get start/end depth @@ -517,13 +550,23 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( int def_level = -1, start_depth = -1, end_depth = -1; if (within_batch) { int const index = rolling_index(value_count + t); - int const rep_level = rep[index]; - def_level = static_cast(def[rolling_index(value_count + t)]); + auto const rep_level = static_cast(rep[index]); + def_level = static_cast(def[index]); //computed by generate_depth_remappings() + if constexpr (enable_print) { + if((rep_level < 0) || (rep_level > max_depth)) { + printf("WHOA: rep level %d out of bounds %d!\n", rep_level, max_depth); + } + } start_depth = s->nesting_info[rep_level].start_depth; end_depth = s->nesting_info[def_level].end_depth; - if (t < printf_num_threads) { printf("t %d, def_level %d, rep_level %d, start_depth %d, end_depth %d\n", \ + if constexpr (enable_print) { + if((def_level < 0) || (def_level > (max_depth + 1))) { + printf("WHOA: def level %d out of bounds (max_depth %d) (index %d) (end_depth %d)!\n", def_level, max_depth, index, end_depth); + } + } + if (enable_print && (t < printf_num_threads)) { printf("t %d, def_level %d, rep_level %d, start_depth %d, end_depth %d\n", \ t, def_level, rep_level, start_depth, end_depth); } } @@ -537,7 +580,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( block_scan(scan_storage).ExclusiveSum(is_new_row, num_prior_new_rows, total_num_new_rows); __syncthreads(); //Needed because scan_storage will be 
reused - if (t == 0) { printf("num_prior_new_rows %d, total_num_new_rows %d\n", num_prior_new_rows, total_num_new_rows); } + if constexpr (enable_print) { + if (t == 0) { printf("num_prior_new_rows %d, total_num_new_rows %d\n", num_prior_new_rows, total_num_new_rows); } + } int const row_index = input_row_count + (num_prior_new_rows + is_new_row - 1); input_row_count += total_num_new_rows; @@ -549,21 +594,22 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( // is from/in current rep level to/in the rep level AT the depth with the def value int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 1 : 0; - if (t == 0) { printf("row_index %d, in_row_bounds %d, in_nesting_bounds %d, last_row %d\n", \ - row_index, in_row_bounds, in_nesting_bounds, last_row); } - if (t < printf_num_threads) { printf("t %d, is_new_row %d, num_prior_new_rows %d, row_index %d, in_row_bounds %d\n", - t, is_new_row, num_prior_new_rows, row_index, in_row_bounds); } + if constexpr (enable_print) { + if (t == 0) { printf("row_index %d, in_row_bounds %d, in_nesting_bounds %d, last_row %d\n", \ + row_index, in_row_bounds, in_nesting_bounds, last_row); } + if (t < printf_num_threads) { printf("t %d, is_new_row %d, num_prior_new_rows %d, row_index %d, in_row_bounds %d\n", + t, is_new_row, num_prior_new_rows, row_index, in_row_bounds); } + } // queries is_valid from all threads, stores prior total and total total int thread_value_count = 0, block_value_count = 0; block_scan(scan_storage).ExclusiveSum(in_nesting_bounds, thread_value_count, block_value_count); - if (t == 0) { printf("block_value_count %d\n", block_value_count); } - if (t < printf_num_threads) { printf("t %d, thread_value_count %d, in_nesting_bounds %d\n", - t, thread_value_count, in_nesting_bounds); } - - //bit mask of all threads that passed true //TODO DELETE ME - //uint32_t const in_write_row_bounds_mask = ballot(in_row_bounds); + if constexpr (enable_print) { + if (t == 0) { printf("block_value_count %d\n", block_value_count); } + if (t < printf_num_threads) { printf("t %d, thread_value_count %d, in_nesting_bounds %d\n", + t, thread_value_count, in_nesting_bounds); } + } // column is either nullable or is a list (or both): iterate by depth for (int d_idx = 0; d_idx <= max_depth; d_idx++) { @@ -578,9 +624,12 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( is_valid = in_nesting_bounds; } - if (t == 0) { printf("nullable %d, depth %d, max_depth %d, max_def_level %d, value_count %d\n", - int(nullable), d_idx, max_depth, ni.max_def_level, value_count); } - if (t < printf_num_threads) { printf("t %d, def_level %d, in_nesting_bounds %d, is_valid %d\n", t, def_level, in_nesting_bounds, is_valid); } + if constexpr (enable_print) { + if (t == 0) { printf("nullable %d, depth %d, max_depth %d, max_def_level %d, value_count %d\n", + int(nullable), d_idx, max_depth, ni.max_def_level, value_count); } + if (t < printf_num_threads) { printf("t %d, def_level %d, in_nesting_bounds %d, is_valid %d\n", + t, def_level, in_nesting_bounds, is_valid); } + } // thread and block validity count // queries is_valid of all threads, stores prior total and total total @@ -612,8 +661,16 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( auto thread_mask = (__uint128_t(1) << thread_value_count) - 1; int const thread_valid_count = count_set_bits(block_valid_mask & thread_mask); - if (t == 0) { printf("block_valid_mask %d\n", int(block_valid_mask)); } - if (t < printf_num_threads) { printf("t %d, 
thread_valid_count %d\n", t, thread_valid_count); } + if constexpr (enable_print) { + if((block_valid_mask == 0) && (t == 0) && (d_idx == max_depth)) { + printf("EMPTY VALID MASK: def_level %d, max_def_level %d, in_nesting_bounds %d, start_depth %d, " + "end_depth %d, in_row_bounds %d, row_index %d, row_index_lower_bound %d, last_row %d, input_row_count %d\n", + def_level, ni.max_def_level, in_nesting_bounds, start_depth, end_depth, in_row_bounds, row_index, + row_index_lower_bound, last_row, input_row_count); } + + if (t == 0) { printf("block_valid_mask %u\n", int(block_valid_mask)); } + if (t < printf_num_threads) { printf("t %d, thread_valid_count %d\n", t, thread_valid_count); } + } // compute warp and thread value counts for the -next- nesting level. we need to // do this for nested schemas so that we can emit an offset for the -current- nesting @@ -630,10 +687,12 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( __shared__ typename block_scan::TempStorage scan_storage; block_scan(scan_storage).ExclusiveSum(next_in_nesting_bounds, next_thread_value_count, next_block_value_count); - if (t == 0) { printf("next depth %d, next_block_value_count %d\n", d_idx + 1, next_block_value_count); } - if (t < printf_num_threads) { printf("t %d, start_depth %d, end_depth %d, in_row_bounds %d, next_in_nesting_bounds %d\n", - t, start_depth, end_depth, in_row_bounds, next_in_nesting_bounds); } - if (t < printf_num_threads) { printf("t %d, next_thread_value_count %d\n", t, next_thread_value_count); } + if constexpr (enable_print) { + if (t == 0) { printf("next depth %d, next_block_value_count %d\n", d_idx + 1, next_block_value_count); } + if (t < printf_num_threads) { printf("t %d, start_depth %d, end_depth %d, in_row_bounds %d, next_in_nesting_bounds %d\n", + t, start_depth, end_depth, in_row_bounds, next_in_nesting_bounds); } + if (t < printf_num_threads) { printf("t %d, next_thread_value_count %d\n", t, next_thread_value_count); } + } // if we're -not- at a leaf column and we're within nesting/row bounds // and we have a valid data_out pointer, it implies this is a list column, so @@ -645,8 +704,12 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( //STORE THE OFFSET FOR THE NEW LIST LOCATION (reinterpret_cast(ni.data_out))[idx] = ofs; - if (t < printf_num_threads) { printf("OFFSETS: t %d, idx %d, next value count %d, next page_start_value %d, ofs %d\n", - t, idx, next_ni.value_count, next_ni.page_start_value, ofs); } + + if constexpr (enable_print) { + if(idx < 0) { printf("WHOA: offset index out of bounds!\n"); } + if (t < printf_num_threads) { printf("OFFSETS: t %d, idx %d, next value count %d, next page_start_value %d, ofs %d\n", + t, idx, next_ni.value_count, next_ni.page_start_value, ofs); } + } } } @@ -668,24 +731,22 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( int const bit_count = __popc(warp_count_mask); if(bit_count > 0) { - // absolute input value index - int const vindex = value_count + thread_value_count; - // absolute bit offset into the output validity map //is cumulative sum of bit_count at the given nesting depth // DON'T subtract by first_row: since it's lists it's not 1-row-per-value - // valid_map_offset was already set during list pre-processing for appropriate start index - int const bit_offset = ni.valid_map_offset + vindex; - + int const bit_offset = ni.valid_map_offset + thread_value_count; auto const shifted_valid_mask = static_cast(block_valid_mask >> thread_value_count); auto const bit_range_mask = (1 << bit_count) 
- 1; //mainly needed for warp_null_count auto const warp_validity_mask = shifted_valid_mask & bit_range_mask; - printf("t %d, thread_value_count %d, vindex %d, bit_offset %d, bit_count %d, warp_validity_mask %u\n", - t, thread_value_count, vindex, bit_offset, bit_count, warp_validity_mask); - store_validity(bit_offset, ni.valid_map, warp_validity_mask, bit_count); warp_null_count = bit_count - __popc(warp_validity_mask); + + if constexpr (enable_print) { + printf("STORE VALIDITY: t %d, depth %d, thread_value_count %d, valid_map_offset %d, bit_offset %d, bit_count %d, warp_validity_mask %u\n", + t, d_idx, thread_value_count, ni.valid_map_offset, bit_offset, bit_count, warp_validity_mask); + printf("NUM NULLS: t %d, depth %d, warp_null_count %d\n", t, d_idx, warp_null_count); + } } } } @@ -696,6 +757,10 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( // compute it directly at the end of the kernel. size_type const block_null_count = cudf::detail::single_lane_block_sum_reduce(warp_null_count); + if constexpr (enable_print) { + if (t == 0) { printf("BLOCK NULLS: depth %d, prior %d, block_null_count %u\n", + d_idx, ni.null_count, block_null_count); } + } if (t == 0) { ni.null_count += block_null_count; } } @@ -707,9 +772,21 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( int const src_pos = ni.valid_count + thread_valid_count; int const output_index = rolling_index(src_pos); - if (t == 0) { printf("ni.value_count %d, ni.valid_count %d\n", int(ni.value_count), int(ni.valid_count)); } - if (t < printf_num_threads) { printf("t %d, src_pos %d, output_index %d\n", t, src_pos, output_index); } + if constexpr (enable_print) { + if (t == 0) { printf("ni.value_count %d, ni.valid_count %d\n", int(ni.value_count), int(ni.valid_count)); } + if (t < printf_num_threads) { printf("t %d, src_pos %d, output_index %d\n", t, src_pos, output_index); } + + if((output_index < 0) || (output_index >= state_buf::nz_buf_size)) { + printf("WHOA: output index out of bounds!\n"); + } + + if((t == 0) && (src_pos == 0)) {printf("SPECIAL: output_index %d, dst_pos %d, ni.value_count %d, ni.valid_count %d, thread_value_count %d, thread_valid_count %d\n", + output_index, dst_pos, ni.value_count, ni.valid_count, thread_value_count, thread_valid_count);} + printf("OUTPUT_INDICES: output_index %d, dst_pos %d\n", output_index, dst_pos); + } + + //Index from rolling buffer of values (which doesn't include nulls) to final array (which includes gaps for nulls) sb->nz_idx[output_index] = dst_pos; } __syncthreads(); // handle modification of ni.value_count from below @@ -719,6 +796,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( int const block_valid_count = count_set_bits(block_valid_mask); ni.valid_count += block_valid_count; ni.value_count += block_value_count; + ni.valid_map_offset += block_value_count; } // propagate value counts for the next depth level @@ -727,13 +805,17 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedLists( in_nesting_bounds = next_in_nesting_bounds; } //END OF DEPTH LOOP - if (t == 0) { printf("END DEPTH LOOP\n"); } + if constexpr (enable_print) { + if (t == 0) { printf("END DEPTH LOOP\n"); } + } int const batch_size = min(max_batch_size, target_value_count - value_count); value_count += batch_size; } - if (t == 0) { printf("END LOOP\n"); } + if constexpr (enable_print) { + if (t == 0) { printf("END LOOP\n"); } + } if (t == 0) { // update valid value count for decoding and total # of values we've processed @@ -795,7 +877,7 @@ template + 
template typename DecodeValuesFunc> CUDF_KERNEL void __launch_bounds__(decode_block_size_t) gpuDecodePageDataGeneric(PageInfo* pages, @@ -837,7 +919,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. if (s->num_rows == 0) { return; } - DecodeValuesFunc decode_values; + DecodeValuesFunc decode_values; bool const nullable = is_nullable(s); bool const should_process_nulls = nullable && maybe_has_nulls(s); @@ -885,18 +967,27 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) s->page.num_input_values); } + static constexpr bool enable_print = false; + rle_stream dict_stream{dict_runs}; if constexpr (has_dict_t) { + //auto const skipped_leaf_values = s->page.skipped_leaf_values; + //int const dict_offset = skipped_leaf_values * sizeof(uint32_t); dict_stream.init( s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values); + if constexpr (enable_print) { + if(t == 0) { printf("INIT DICT: dict_bits %d, data_start %p, data_end %p, dict_idx %p, page.num_input_values %d, s->dict_pos %d \n", + s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values, s->dict_pos); } + } + dict_stream.decode_next(t, s->page.skipped_leaf_values); } __syncthreads(); // We use two counters in the loop below: processed_count and valid_count. - // - processed_count: number of rows out of num_input_values that we have decoded so far. + // - processed_count: number of values out of num_input_values that we have decoded so far. // the definition stream returns the number of total rows it has processed in each call // to decode_next and we accumulate in process_count. - // - valid_count: number of non-null rows we have decoded so far. In each iteration of the + // - valid_count: number of non-null values we have decoded so far. In each iteration of the // loop below, we look at the number of valid items (which could be all for non-nullable), // and valid_count is that running count. int processed_count = 0; @@ -904,23 +995,55 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // the core loop. 
decode batches of level stream data using rle_stream objects // and pass the results to gpuDecodeValues -if (t == 0) { printf("DECODE LOOP: has-list %d, has-nesting %d, should nulls %d \n", - int(has_lists_t), int(has_nesting_t), int(should_process_nulls)); } + if constexpr (enable_print) { + if(t == 0) { printf("page_idx %d, nullable %d, should_process_nulls %d, has_lists_t %d, has_dict_t %d, num_rows %lu, page.num_input_values %d\n", + page_idx, int(nullable), int(should_process_nulls), int(has_lists_t), int(has_dict_t), num_rows, s->page.num_input_values); } + } + + + auto print_nestings = [&](bool is_post){ + if constexpr (enable_print) { + auto print_nesting_level = [&](const PageNestingDecodeInfo& ni) { + printf("page_idx %d, max_def_level %d, start_depth %d, end_depth %d, page_start_value %d, null_count %d, " + "valid_map_offset %d, valid_count %d, value_count %d\n", + page_idx, ni.max_def_level, ni.start_depth, ni.end_depth, ni.page_start_value, ni.null_count, + ni.valid_map_offset, ni.valid_count, ni.value_count); + }; + + if(t == 0) { + printf("POST %d NESTING 0: ", int(is_post)); + print_nesting_level(s->nesting_info[0]); + printf("POST %d NESTING 1: ", int(is_post)); + print_nesting_level(s->nesting_info[1]); + printf("POST %d NESTING 2: ", int(is_post)); + print_nesting_level(s->nesting_info[2]); + } + } + }; + + print_nestings(false); while (s->error == 0 && processed_count < s->page.num_input_values) { int next_valid_count; + if constexpr (has_lists_t){ + rep_decoder.decode_next(t); + } + // only need to process definition levels if this is a nullable column if (should_process_nulls) { - if constexpr (has_lists_t){ - rep_decoder.decode_next(t); - } processed_count += def_decoder.decode_next(t); __syncthreads(); if constexpr (has_lists_t) { - next_valid_count = gpuUpdateValidityAndRowIndicesNestedLists( + int value_count = s->input_value_count; + next_valid_count = gpuUpdateValidityAndRowIndicesLists( processed_count, s, sb, def, rep, t); + if constexpr (enable_print) { + if(t == 0) { printf("PROCESSING: page total values %d, num_input_values %d, pre value_count %d, post value_count %d, " + "processed_count %d, valid_count %d, next_valid_count %d\n", + s->page.num_input_values, s->input_value_count, value_count, s->input_value_count, processed_count, valid_count, next_valid_count); } + } } else if constexpr (has_nesting_t) { next_valid_count = gpuUpdateValidityAndRowIndicesNestedNonLists( processed_count, s, sb, def, t); @@ -933,21 +1056,25 @@ if (t == 0) { printf("DECODE LOOP: has-list %d, has-nesting %d, should nulls %d // this function call entirely since all it will ever generate is a mapping of (i -> i) for // nz_idx. gpuDecodeFixedWidthValues would be the only work that happens. 
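// To make the shortcut described above concrete: with no nulls and no lists, every input
// value is its own row, so the row-index pass degenerates to an identity mapping into the
// nz_idx ring buffer. A hedged sketch of that degenerate case; the buffer and parameter
// names are illustrative, not cudf's.
template <int rolling_buf_size>
__device__ void fill_identity_nz_idx(int* nz_idx, int value_count, int batch_size, int t)
{
  if (t < batch_size) {
    int const pos = value_count + t;   // absolute input position == output position
    nz_idx[pos % rolling_buf_size] = pos;
  }
}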
else { - processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); - if constexpr (has_lists_t) { // no nulls, but if we have a list we still need the definition levels - def_decoder.decode_next(t); + processed_count += def_decoder.decode_next(t); + __syncthreads(); + next_valid_count = - gpuUpdateValidityAndRowIndicesNestedLists( + gpuUpdateValidityAndRowIndicesLists( processed_count, s, sb, def, rep, t); - } else if constexpr (has_nesting_t) { - next_valid_count = - gpuUpdateValidityAndRowIndicesNestedNonLists( - processed_count, s, sb, nullptr, t); } else { - next_valid_count = gpuUpdateValidityAndRowIndicesFlat( - processed_count, s, sb, nullptr, t); + processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); + + if constexpr (has_nesting_t) { + next_valid_count = + gpuUpdateValidityAndRowIndicesNestedNonLists( + processed_count, s, sb, nullptr, t); + } else { + next_valid_count = gpuUpdateValidityAndRowIndicesFlat( + processed_count, s, sb, nullptr, t); + } } } __syncthreads(); @@ -966,8 +1093,15 @@ if (t == 0) { printf("DECODE LOOP: has-list %d, has-nesting %d, should nulls %d __syncthreads(); valid_count = next_valid_count; + + if constexpr (enable_print) { + if(t == 0) { printf("LOOP: processed_count %d, #page values %d, error %d\n", + processed_count, s->page.num_input_values, s->error); } + } } if (t == 0 and s->error != 0) { set_error(s->error, error_code); } + + print_nestings(true); } } // anonymous namespace diff --git a/cpp/src/io/parquet/page_data.cuh b/cpp/src/io/parquet/page_data.cuh index f182747650e..e82f927e34f 100644 --- a/cpp/src/io/parquet/page_data.cuh +++ b/cpp/src/io/parquet/page_data.cuh @@ -89,6 +89,14 @@ inline __device__ void gpuStoreOutput(uint32_t* dst, bytebuf = 0; } *dst = bytebuf; + + static constexpr bool enable_print = false; + if constexpr (enable_print) { + if (threadIdx.x == 0) { + printf("STORE VALUE %u at %p, src8 %p, dict_pos %u, dict_size %u, ofs %u\n", + bytebuf, dst, src8, dict_pos, dict_size, ofs); + } + } } /** @@ -328,6 +336,7 @@ inline __device__ void gpuOutputFast(page_state_s* s, state_buf* sb, int src_pos uint8_t const* dict; uint32_t dict_pos, dict_size = s->dict_size; +auto dict_lookup_idx = rolling_index(src_pos); if (s->dict_base) { // Dictionary dict_pos = @@ -339,6 +348,15 @@ inline __device__ void gpuOutputFast(page_state_s* s, state_buf* sb, int src_pos dict = s->data_start; } dict_pos *= (uint32_t)s->dtype_len_in; + + static constexpr bool enable_print = false; + if constexpr (enable_print) { + if (threadIdx.x == 0) { + printf("PREP OUTPUT VALUE at dst %p, dict %p, dict_pos %u, dict_size %u, dict_base %p, dict_bits %d, dict_lookup_idx %d, dtype_len_in %d\n", + dst, dict, dict_pos, dict_size, s->dict_base, s->dict_bits, dict_lookup_idx, s->dtype_len_in); + } + } + gpuStoreOutput(dst, dict, dict_pos, dict_size); } diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index b1f8e6dd5fe..cb682112195 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -588,8 +588,8 @@ inline __device__ void store_validity(int valid_map_offset, if (relevant_mask == ~0) { valid_map[word_offset] = valid_mask; } else { - atomicAnd(valid_map + word_offset, ~(relevant_mask << bit_offset)); - atomicOr(valid_map + word_offset, (valid_mask & relevant_mask) << bit_offset); + atomicAnd(valid_map + word_offset, ~(relevant_mask << bit_offset)); //clears old bits + atomicOr(valid_map + word_offset, (valid_mask & relevant_mask) << 
bit_offset); //sets valid mask } } // we're going to spill over into the next word. @@ -719,9 +719,16 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // track (page-relative) row index for the thread so we can compare against input bounds // keep track of overall # of rows we've read. int const is_new_row = start_depth == 0 ? 1 : 0; - uint32_t const warp_row_count_mask = ballot(is_new_row); + uint32_t const warp_row_count_mask = ballot(is_new_row); //how many threads are starting a new row + //t is zero through 31. the shifted bit is the 1st through the 32nd bit. then we -1: mask + //the mask we and with is querying PRIOR threads + uint32_t const prior_thread_mask = ((1 << t) - 1); //query "for all threads before me" + uint32_t const prior_new_rows_bits = warp_row_count_mask & prior_thread_mask; + int32_t const num_prior_new_rows = __popc(prior_new_rows_bits); + int32_t const thread_row_index = - input_row_count + ((__popc(warp_row_count_mask & ((1 << t) - 1)) + is_new_row) - 1); + input_row_count + ((num_prior_new_rows + is_new_row) - 1); + input_row_count += __popc(warp_row_count_mask); // is this thread within read row bounds? int const in_row_bounds = thread_row_index >= s->row_index_lower_bound && @@ -729,30 +736,34 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value ? 1 : 0; + // if we are within the range of nesting levels we should be adding value indices for +//if list: is from/in current rep level to/in the rep level AT the depth with the def value + int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 1 : 0; + // compute warp and thread value counts - uint32_t const warp_count_mask = - ballot((0 >= start_depth && 0 <= end_depth) && in_row_bounds ? 1 : 0); + uint32_t const warp_count_mask = ballot(in_nesting_bounds); warp_value_count = __popc(warp_count_mask); - // Note : ((1 << t) - 1) implies "for all threads before me" - thread_value_count = __popc(warp_count_mask & ((1 << t) - 1)); + // thread_value_count : # of output values from the view of this thread + // is all threads before me that start from rep level zero (new row) + thread_value_count = __popc(warp_count_mask & prior_thread_mask); // walk from 0 to max_depth - uint32_t next_thread_value_count, next_warp_value_count; for (int s_idx = 0; s_idx < max_depth; s_idx++) { PageNestingDecodeInfo* nesting_info = &nesting_info_base[s_idx]; - // if we are within the range of nesting levels we should be adding value indices for - int const in_nesting_bounds = - ((s_idx >= start_depth && s_idx <= end_depth) && in_row_bounds) ? 1 : 0; - // everything up to the max_def_level is a non-null value +//if is NOT list, then means is-not-null, OR is-null in a CHILD node +//if IS list, also: is from/in current rep level to/in the rep level AT the depth with the def value uint32_t const is_valid = d >= nesting_info->max_def_level && in_nesting_bounds ? 1 : 0; // compute warp and thread valid counts + // bit is set for each thread in the warp that is_valid +//OR of all is_valid's shifted by thread_value_count uint32_t const warp_valid_mask = // for flat schemas, a simple ballot_sync gives us the correct count and bit positions // because every value in the input matches to a value in the output +//If no lists: every entry is a new row, which may be null !has_repetition ? 
ballot(is_valid) : @@ -763,8 +774,10 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // __reduce_or_sync(), but until then we have to do a warp reduce. WarpReduceOr32(is_valid << thread_value_count); +//For this value, we save an offset at every depth (in the loop) + //# bits prior to this thread that are valid (set) thread_valid_count = __popc(warp_valid_mask & ((1 << thread_value_count) - 1)); - warp_valid_count = __popc(warp_valid_mask); + warp_valid_count = __popc(warp_valid_mask); //#set bits of all threads in warp // if this is the value column emit an index for value decoding if (is_valid && s_idx == max_depth - 1) { @@ -778,10 +791,15 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // do this for nested schemas so that we can emit an offset for the -current- nesting // level. more concretely : the offset for the current nesting level == current length of the // next nesting level + uint32_t next_thread_value_count = 0, next_warp_value_count = 0; + int next_in_nesting_bounds = 0; if (s_idx < max_depth - 1) { - uint32_t const next_warp_count_mask = - ballot((s_idx + 1 >= start_depth && s_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0); - next_warp_value_count = __popc(next_warp_count_mask); + //mask is different between depths + next_in_nesting_bounds = + (s_idx + 1 >= start_depth && s_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0; + uint32_t const next_warp_count_mask = ballot(next_in_nesting_bounds); + + next_warp_value_count = __popc(next_warp_count_mask); //same for all threads, but not all depths next_thread_value_count = __popc(next_warp_count_mask & ((1 << t) - 1)); // if we're -not- at a leaf column and we're within nesting/row bounds @@ -792,34 +810,36 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value cudf::size_type const ofs = nesting_info_base[s_idx + 1].value_count + next_thread_value_count + nesting_info_base[s_idx + 1].page_start_value; +//STORE THE OFFSET FOR THE NEW LIST LOCATION (reinterpret_cast(nesting_info->data_out))[idx] = ofs; } } - // nested schemas always read and write to the same bounds (that is, read and write positions - // are already pre-bounded by first_row/num_rows). flat schemas will start reading at the - // first value, even if that is before first_row, because we cannot trivially jump to - // the correct position to start reading. since we are about to write the validity vector here + // lists always read and write to the same bounds (that is, read and write positions + // are already pre-bounded by first_row/num_rows) how? we have pre-processed them. + // flat schemas will start reading at the first value, even if that is before first_row, + // because we cannot trivially jump to the correct position to start reading. + // why not? because we don't know how many nulls were before it (haven't preprocessed them) + // since we are about to write the validity vector here // we need to adjust our computed mask to take into account the write row bounds. int const in_write_row_bounds = !has_repetition ? thread_row_index >= s->first_row && thread_row_index < (s->first_row + s->num_rows) : in_row_bounds; + //is write_start in new int const first_thread_in_write_range = - !has_repetition ? __ffs(ballot(in_write_row_bounds)) - 1 : 0; - - // # of bits to of the validity mask to write out - int const warp_valid_mask_bit_count = - first_thread_in_write_range < 0 ? 0 : warp_value_count - first_thread_in_write_range; + !has_repetition ? 
__ffs(ballot(in_write_row_bounds)) - 1 : 0; //index of lowest bit set to // increment count of valid values, count of total values, and update validity mask if (!t) { + // # of bits to of the validity mask to write out //becomes bit_count + int const warp_valid_mask_bit_count = + first_thread_in_write_range < 0 ? 0 : warp_value_count - first_thread_in_write_range; + if (nesting_info->valid_map != nullptr && warp_valid_mask_bit_count > 0) { uint32_t const warp_output_valid_mask = warp_valid_mask >> first_thread_in_write_range; - store_validity(nesting_info->valid_map_offset, - nesting_info->valid_map, - warp_output_valid_mask, - warp_valid_mask_bit_count); + store_validity(nesting_info->valid_map_offset, nesting_info->valid_map, + warp_output_valid_mask, warp_valid_mask_bit_count); nesting_info->valid_map_offset += warp_valid_mask_bit_count; nesting_info->null_count += warp_valid_mask_bit_count - __popc(warp_output_valid_mask); } @@ -830,7 +850,8 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // propagate value counts for the next level warp_value_count = next_warp_value_count; thread_value_count = next_thread_value_count; - } + in_nesting_bounds = next_in_nesting_bounds; + } //END OF DEPTH LOOP input_value_count += min(32, (target_input_value_count - input_value_count)); __syncwarp(); @@ -1096,6 +1117,12 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->num_rows = (page_start_row + s->first_row) + max_page_rows <= max_row ? max_page_rows : max_row - (page_start_row + s->first_row); + + static constexpr bool enable_print = false; + if constexpr (enable_print) { + printf("NUM_ROWS: col.start_row %lu, page.chunk_row %d, page_start_row %lu, s->first_row %d, s->page.num_rows %d, max_row %lu, min_row %lu, num_rows %lu, s->num_rows %d\n", + s->col.start_row, s->page.chunk_row, page_start_row, s->first_row, s->page.num_rows, max_row, min_row, num_rows, s->num_rows); + } } } @@ -1256,13 +1283,11 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, } if (s->col.column_data_base != nullptr) { - nesting_info->data_out = static_cast(s->col.column_data_base[idx]); if (s->col.column_string_base != nullptr) { nesting_info->string_out = static_cast(s->col.column_string_base[idx]); } nesting_info->data_out = static_cast(s->col.column_data_base[idx]); - if (nesting_info->data_out != nullptr) { // anything below max depth with a valid data pointer must be a list, so the // element size is the size of the offset type. @@ -1277,8 +1302,8 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, } nesting_info->valid_map = s->col.valid_map_base[idx]; if (nesting_info->valid_map != nullptr) { - nesting_info->valid_map += output_offset >> 5; - nesting_info->valid_map_offset = (int32_t)(output_offset & 0x1f); + nesting_info->valid_map += output_offset >> 5; //is pointer to warp start + nesting_info->valid_map_offset = (int32_t)(output_offset & 0x1f); //is index within warp } } } @@ -1357,7 +1382,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->dict_pos = 0; s->src_pos = 0; - // for flat hierarchies, we can't know how many leaf values to skip unless we do a full + // for non-lists, we can't know how many leaf values to skip unless we do a full // preprocess of the definition levels (since nulls will have no actual decodable value, there // is no direct correlation between # of rows and # of decodable values). 
so we will start // processing at the beginning of the value stream and disregard any indices that start @@ -1371,7 +1396,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->row_index_lower_bound = -1; } - // for nested hierarchies, we have run a preprocess that lets us skip directly to the values + // for lists, we have run a preprocess that lets us skip directly to the values // we need to start decoding at else { // input_row_count translates to "how many rows we have processed so far", so since we are @@ -1379,7 +1404,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->input_row_count = s->first_row; // return the lower bound to compare (page-relative) thread row index against. Explanation: - // In the case of nested schemas, rows can span page boundaries. That is to say, + // In the case of lists, rows can span page boundaries. That is to say, // we can encounter the first value for row X on page M, but the last value for page M // might not be the last value for row X. page M+1 (or further) may contain the last value. // diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index ac39e2ac291..53a55a43300 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -183,7 +183,8 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page, return decode_kernel_mask::STRING; } - if (is_list(chunk)) { + if (is_list(chunk) && !is_string_col(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) { + //if (is_list(chunk)) { if (page.encoding == Encoding::PLAIN) { return decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST; } else if (page.encoding == Encoding::BYTE_STREAM_SPLIT) { diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index cc98e263664..b72359f0d73 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -71,8 +71,6 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // figure out which kernels to run auto const kernel_mask = GetAggregatedDecodeKernelMask(subpass.pages, _stream); -printf("DECODE DATA PAGE, mask %d\n", kernel_mask); - // Check to see if there are any string columns present. If so, then we need to get size info // for each string page. This size info will be used to pre-allocate memory for the column, // allowing the page decoder to write string data directly to the column buffer, rather than @@ -223,6 +221,11 @@ printf("DECODE DATA PAGE, mask %d\n", kernel_mask); int const nkernels = std::bitset<32>(kernel_mask).count(); auto streams = cudf::detail::fork_streams(_stream, nkernels); + static constexpr bool enable_print = false; + if constexpr (enable_print) { + printf("PAGE DATA DECODE MASK: %d\n", kernel_mask); + } + // launch string decoder int s_idx = 0; if (BitAnd(kernel_mask, decode_kernel_mask::STRING) != 0) { @@ -333,7 +336,6 @@ printf("DECODE DATA PAGE, mask %d\n", kernel_mask); // launch fixed width type decoder for lists if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST) != 0) { -printf("LIST PAGE\n"); DecodePageDataFixed(subpass.pages, pass.chunks, num_rows, diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index f28a7311ccb..9405f658429 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -129,7 +129,12 @@ void generate_depth_remappings(std::map, std::ve // depth. 
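// A minimal sketch (not cudf's code) of how the two remap tables computed below are consumed
// per value at decode time: the repetition level picks the shallowest nesting depth the value
// "opens" (start_depth), and the definition level picks the deepest depth the value actually
// reaches (end_depth); a definition level below the leaf's maximum means a null at that depth.
// Array sizes and names here are illustrative assumptions.
struct level_depth_remaps {
  int rep_depth_remap[10];  // indexed by repetition level
  int def_depth_remap[10];  // indexed by definition level
};

__host__ __device__ inline void value_depth_bounds(level_depth_remaps const& remaps,
                                                   int rep_level,
                                                   int def_level,
                                                   int& start_depth,
                                                   int& end_depth)
{
  start_depth = remaps.rep_depth_remap[rep_level];  // where this value begins a new list element
  end_depth   = remaps.def_depth_remap[def_level];  // deepest depth this value is defined at
}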
// + static constexpr bool enable_print = false; + // compute "X" from above + if constexpr (enable_print) { + printf("REMAPPING: max def %d, max rep %d\n", schema.max_definition_level, schema.max_repetition_level); + } for (int s_idx = schema.max_repetition_level; s_idx >= 0; s_idx--) { auto find_shallowest = [&](int r) { int shallowest = -1; @@ -148,6 +153,9 @@ void generate_depth_remappings(std::map, std::ve if (!cur_schema.is_stub()) { cur_depth--; } schema_idx = cur_schema.parent_idx; } + if constexpr (enable_print) { + printf("REMAPPING: s_idx / r %d, shallowest %d\n", r, shallowest); + } return shallowest; }; rep_depth_remap[s_idx] = find_shallowest(s_idx); @@ -186,6 +194,10 @@ void generate_depth_remappings(std::map, std::ve prev_schema = cur_schema; schema_idx = cur_schema.parent_idx; } + + if constexpr (enable_print) { + printf("REMAPPING: s_idx %d, r1 %d, end_depth %d\n", s_idx, r1, depth); + } return depth; }; def_depth_remap[s_idx] = find_deepest(s_idx); From ead17b8bc2f12f77e122ca7de7fbce52ef77c945 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Wed, 28 Aug 2024 13:56:28 -0400 Subject: [PATCH 04/36] Revert page_decode changes --- cpp/src/io/parquet/decode_fixed.cu | 47 +++++++++++-- cpp/src/io/parquet/page_data.cuh | 2 +- cpp/src/io/parquet/page_decode.cuh | 103 +++++++++++------------------ cpp/src/io/parquet/page_hdr.cu | 1 - cpp/src/io/parquet/parquet_gpu.hpp | 22 ------ 5 files changed, 80 insertions(+), 95 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 312aa31c67b..8ba251aec0b 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -261,12 +261,21 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( int const last_row = first_row + s->num_rows; int const capped_target_value_count = min(target_value_count, last_row); + static constexpr bool enable_print = false; + if constexpr (enable_print) { + if (t == 0) { printf("NESTED: s->input_value_count %d, first_row %d, last_row %d, target_value_count %d, capped_target_value_count %d\n", + s->input_value_count, first_row, last_row, target_value_count, capped_target_value_count); } + } + int const row_index_lower_bound = s->row_index_lower_bound; int const max_depth = s->col.max_nesting_depth - 1; __syncthreads(); while (value_count < capped_target_value_count) { + if constexpr (enable_print) { + if(t == 0) { printf("NESTED VALUE COUNT: %d\n", value_count); } + } int const batch_size = min(max_batch_size, capped_target_value_count - value_count); // definition level. 
only need to process for nullable columns @@ -290,6 +299,11 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store + if constexpr (enable_print) { + if(t == 0) { printf("NESTED ROWS: row_index %d, row_index_lower_bound %d, last_row %d, in_row_bounds %d\n", + row_index, row_index_lower_bound, last_row, in_row_bounds); } + } + // iterate by depth for (int d_idx = 0; d_idx <= max_depth; d_idx++) { auto& ni = s->nesting_info[d_idx]; @@ -356,6 +370,10 @@ static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( int const dst_pos = (value_count + thread_value_count) - 1; int const src_pos = (ni.valid_count + thread_valid_count) - 1; sb->nz_idx[rolling_index(src_pos)] = dst_pos; + if constexpr (enable_print) { + if(t == 0) {printf("NESTED STORE: first_row %d, row_index %d dst_pos %d, src_pos %d\n", + first_row, row_index, dst_pos, src_pos);} + } } __syncthreads(); // handle modification of ni.value_count from below @@ -395,12 +413,22 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( int const last_row = first_row + s->num_rows; int const capped_target_value_count = min(target_value_count, last_row); + static constexpr bool enable_print = false; + if constexpr (enable_print) { + if (t == 0) { printf("FLAT: s->input_value_count %d, first_row %d, last_row %d, target_value_count %d, capped_target_value_count %d\n", + s->input_value_count, first_row, last_row, target_value_count, capped_target_value_count); } + } + int const valid_map_offset = ni.valid_map_offset; int const row_index_lower_bound = s->row_index_lower_bound; __syncthreads(); while (value_count < capped_target_value_count) { + if constexpr (enable_print) { + if(t == 0) { printf("FLAT VALUE COUNT: %d\n", value_count); } + } + int const batch_size = min(max_batch_size, capped_target_value_count - value_count); int const thread_value_count = t; @@ -519,7 +547,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int value_count = s->input_value_count; static constexpr bool enable_print = false; - int const printf_num_threads = 32; + int const printf_num_threads = 0; // how many rows we've processed in the page so far int input_row_count = s->input_row_count; @@ -531,7 +559,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int const first_row = s->first_row; int const last_row = first_row + s->num_rows; if constexpr (enable_print) { - if (t == 0) { printf("first_row %d, last_row %d, target_value_count %d\n", first_row, last_row, target_value_count); } + if (t == 0) { printf("LIST s->input_value_count %d, first_row %d, last_row %d, target_value_count %d\n", + s->input_value_count, first_row, last_row, target_value_count); } } int const row_index_lower_bound = s->row_index_lower_bound; @@ -541,7 +570,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( while (value_count < target_value_count) { if constexpr (enable_print) { - if(t == 0) { printf("VALUE COUNT: %d\n", value_count); } + if(t == 0) { printf("LIST VALUE COUNT: %d\n", value_count); } } bool const within_batch = value_count + t < target_value_count; @@ -565,9 +594,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( if((def_level < 0) || (def_level > (max_depth + 1))) { printf("WHOA: def level %d out of bounds (max_depth %d) (index %d) (end_depth %d)!\n", def_level, max_depth, index, end_depth); } + if (t < printf_num_threads) 
{ printf("t %d, def_level %d, rep_level %d, start_depth %d, end_depth %d\n", \ + t, def_level, rep_level, start_depth, end_depth); } } - if (enable_print && (t < printf_num_threads)) { printf("t %d, def_level %d, rep_level %d, start_depth %d, end_depth %d\n", \ - t, def_level, rep_level, start_depth, end_depth); } } //Determine value count & row index @@ -595,8 +624,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 1 : 0; if constexpr (enable_print) { - if (t == 0) { printf("row_index %d, in_row_bounds %d, in_nesting_bounds %d, last_row %d\n", \ - row_index, in_row_bounds, in_nesting_bounds, last_row); } + if(t == 0) { printf("LIST ROWS: row_index %d, row_index_lower_bound %d, last_row %d, in_row_bounds %d, in_nesting_bounds %d\n", + row_index, row_index_lower_bound, last_row, in_row_bounds, in_nesting_bounds); } if (t < printf_num_threads) { printf("t %d, is_new_row %d, num_prior_new_rows %d, row_index %d, in_row_bounds %d\n", t, is_new_row, num_prior_new_rows, row_index, in_row_bounds); } } @@ -1040,6 +1069,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) next_valid_count = gpuUpdateValidityAndRowIndicesLists( processed_count, s, sb, def, rep, t); if constexpr (enable_print) { + if(t == 0) { printf("LISTS NEXT: next_valid_count %d\n", next_valid_count); } if(t == 0) { printf("PROCESSING: page total values %d, num_input_values %d, pre value_count %d, post value_count %d, " "processed_count %d, valid_count %d, next_valid_count %d\n", s->page.num_input_values, s->input_value_count, value_count, s->input_value_count, processed_count, valid_count, next_valid_count); } @@ -1047,6 +1077,9 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) } else if constexpr (has_nesting_t) { next_valid_count = gpuUpdateValidityAndRowIndicesNestedNonLists( processed_count, s, sb, def, t); + if constexpr (enable_print) { + if(t == 0) { printf("NESTED NEXT: next_valid_count %d\n", next_valid_count); } + } } else { next_valid_count = gpuUpdateValidityAndRowIndicesFlat( processed_count, s, sb, def, t); diff --git a/cpp/src/io/parquet/page_data.cuh b/cpp/src/io/parquet/page_data.cuh index e82f927e34f..1e13302c467 100644 --- a/cpp/src/io/parquet/page_data.cuh +++ b/cpp/src/io/parquet/page_data.cuh @@ -336,7 +336,6 @@ inline __device__ void gpuOutputFast(page_state_s* s, state_buf* sb, int src_pos uint8_t const* dict; uint32_t dict_pos, dict_size = s->dict_size; -auto dict_lookup_idx = rolling_index(src_pos); if (s->dict_base) { // Dictionary dict_pos = @@ -352,6 +351,7 @@ auto dict_lookup_idx = rolling_index(src_pos); static constexpr bool enable_print = false; if constexpr (enable_print) { if (threadIdx.x == 0) { + auto dict_lookup_idx = rolling_index(src_pos); printf("PREP OUTPUT VALUE at dst %p, dict %p, dict_pos %u, dict_size %u, dict_base %p, dict_bits %d, dict_lookup_idx %d, dtype_len_in %d\n", dst, dict, dict_pos, dict_size, s->dict_base, s->dict_bits, dict_lookup_idx, s->dtype_len_in); } diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index cb682112195..7e4fb0271d5 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -588,8 +588,8 @@ inline __device__ void store_validity(int valid_map_offset, if (relevant_mask == ~0) { valid_map[word_offset] = valid_mask; } else { - atomicAnd(valid_map + word_offset, ~(relevant_mask << bit_offset)); //clears old bits - atomicOr(valid_map + word_offset, (valid_mask & relevant_mask) << 
bit_offset); //sets valid mask + atomicAnd(valid_map + word_offset, ~(relevant_mask << bit_offset)); + atomicOr(valid_map + word_offset, (valid_mask & relevant_mask) << bit_offset); } } // we're going to spill over into the next word. @@ -719,16 +719,9 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // track (page-relative) row index for the thread so we can compare against input bounds // keep track of overall # of rows we've read. int const is_new_row = start_depth == 0 ? 1 : 0; - uint32_t const warp_row_count_mask = ballot(is_new_row); //how many threads are starting a new row - //t is zero through 31. the shifted bit is the 1st through the 32nd bit. then we -1: mask - //the mask we and with is querying PRIOR threads - uint32_t const prior_thread_mask = ((1 << t) - 1); //query "for all threads before me" - uint32_t const prior_new_rows_bits = warp_row_count_mask & prior_thread_mask; - int32_t const num_prior_new_rows = __popc(prior_new_rows_bits); - + uint32_t const warp_row_count_mask = ballot(is_new_row); int32_t const thread_row_index = - input_row_count + ((num_prior_new_rows + is_new_row) - 1); - + input_row_count + ((__popc(warp_row_count_mask & ((1 << t) - 1)) + is_new_row) - 1); input_row_count += __popc(warp_row_count_mask); // is this thread within read row bounds? int const in_row_bounds = thread_row_index >= s->row_index_lower_bound && @@ -736,34 +729,30 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value ? 1 : 0; - // if we are within the range of nesting levels we should be adding value indices for -//if list: is from/in current rep level to/in the rep level AT the depth with the def value - int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 1 : 0; - // compute warp and thread value counts - uint32_t const warp_count_mask = ballot(in_nesting_bounds); + uint32_t const warp_count_mask = + ballot((0 >= start_depth && 0 <= end_depth) && in_row_bounds ? 1 : 0); warp_value_count = __popc(warp_count_mask); - // thread_value_count : # of output values from the view of this thread - // is all threads before me that start from rep level zero (new row) - thread_value_count = __popc(warp_count_mask & prior_thread_mask); + // Note : ((1 << t) - 1) implies "for all threads before me" + thread_value_count = __popc(warp_count_mask & ((1 << t) - 1)); // walk from 0 to max_depth + uint32_t next_thread_value_count, next_warp_value_count; for (int s_idx = 0; s_idx < max_depth; s_idx++) { PageNestingDecodeInfo* nesting_info = &nesting_info_base[s_idx]; + // if we are within the range of nesting levels we should be adding value indices for + int const in_nesting_bounds = + ((s_idx >= start_depth && s_idx <= end_depth) && in_row_bounds) ? 1 : 0; + // everything up to the max_def_level is a non-null value -//if is NOT list, then means is-not-null, OR is-null in a CHILD node -//if IS list, also: is from/in current rep level to/in the rep level AT the depth with the def value uint32_t const is_valid = d >= nesting_info->max_def_level && in_nesting_bounds ? 1 : 0; // compute warp and thread valid counts - // bit is set for each thread in the warp that is_valid -//OR of all is_valid's shifted by thread_value_count uint32_t const warp_valid_mask = // for flat schemas, a simple ballot_sync gives us the correct count and bit positions // because every value in the input matches to a value in the output -//If no lists: every entry is a new row, which may be null !has_repetition ? 
ballot(is_valid) : @@ -774,10 +763,8 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // __reduce_or_sync(), but until then we have to do a warp reduce. WarpReduceOr32(is_valid << thread_value_count); -//For this value, we save an offset at every depth (in the loop) - //# bits prior to this thread that are valid (set) thread_valid_count = __popc(warp_valid_mask & ((1 << thread_value_count) - 1)); - warp_valid_count = __popc(warp_valid_mask); //#set bits of all threads in warp + warp_valid_count = __popc(warp_valid_mask); // if this is the value column emit an index for value decoding if (is_valid && s_idx == max_depth - 1) { @@ -791,15 +778,10 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // do this for nested schemas so that we can emit an offset for the -current- nesting // level. more concretely : the offset for the current nesting level == current length of the // next nesting level - uint32_t next_thread_value_count = 0, next_warp_value_count = 0; - int next_in_nesting_bounds = 0; if (s_idx < max_depth - 1) { - //mask is different between depths - next_in_nesting_bounds = - (s_idx + 1 >= start_depth && s_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0; - uint32_t const next_warp_count_mask = ballot(next_in_nesting_bounds); - - next_warp_value_count = __popc(next_warp_count_mask); //same for all threads, but not all depths + uint32_t const next_warp_count_mask = + ballot((s_idx + 1 >= start_depth && s_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0); + next_warp_value_count = __popc(next_warp_count_mask); next_thread_value_count = __popc(next_warp_count_mask & ((1 << t) - 1)); // if we're -not- at a leaf column and we're within nesting/row bounds @@ -810,36 +792,34 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value cudf::size_type const ofs = nesting_info_base[s_idx + 1].value_count + next_thread_value_count + nesting_info_base[s_idx + 1].page_start_value; -//STORE THE OFFSET FOR THE NEW LIST LOCATION (reinterpret_cast(nesting_info->data_out))[idx] = ofs; } } - // lists always read and write to the same bounds (that is, read and write positions - // are already pre-bounded by first_row/num_rows) how? we have pre-processed them. - // flat schemas will start reading at the first value, even if that is before first_row, - // because we cannot trivially jump to the correct position to start reading. - // why not? because we don't know how many nulls were before it (haven't preprocessed them) - // since we are about to write the validity vector here + // nested schemas always read and write to the same bounds (that is, read and write positions + // are already pre-bounded by first_row/num_rows). flat schemas will start reading at the + // first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector here // we need to adjust our computed mask to take into account the write row bounds. int const in_write_row_bounds = !has_repetition ? thread_row_index >= s->first_row && thread_row_index < (s->first_row + s->num_rows) : in_row_bounds; - //is write_start in new int const first_thread_in_write_range = - !has_repetition ? __ffs(ballot(in_write_row_bounds)) - 1 : 0; //index of lowest bit set to + !has_repetition ? 
__ffs(ballot(in_write_row_bounds)) - 1 : 0; + + // # of bits to of the validity mask to write out + int const warp_valid_mask_bit_count = + first_thread_in_write_range < 0 ? 0 : warp_value_count - first_thread_in_write_range; // increment count of valid values, count of total values, and update validity mask if (!t) { - // # of bits to of the validity mask to write out //becomes bit_count - int const warp_valid_mask_bit_count = - first_thread_in_write_range < 0 ? 0 : warp_value_count - first_thread_in_write_range; - if (nesting_info->valid_map != nullptr && warp_valid_mask_bit_count > 0) { uint32_t const warp_output_valid_mask = warp_valid_mask >> first_thread_in_write_range; - store_validity(nesting_info->valid_map_offset, nesting_info->valid_map, - warp_output_valid_mask, warp_valid_mask_bit_count); + store_validity(nesting_info->valid_map_offset, + nesting_info->valid_map, + warp_output_valid_mask, + warp_valid_mask_bit_count); nesting_info->valid_map_offset += warp_valid_mask_bit_count; nesting_info->null_count += warp_valid_mask_bit_count - __popc(warp_output_valid_mask); } @@ -850,8 +830,7 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // propagate value counts for the next level warp_value_count = next_warp_value_count; thread_value_count = next_thread_value_count; - in_nesting_bounds = next_in_nesting_bounds; - } //END OF DEPTH LOOP + } input_value_count += min(32, (target_input_value_count - input_value_count)); __syncwarp(); @@ -1117,12 +1096,6 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->num_rows = (page_start_row + s->first_row) + max_page_rows <= max_row ? max_page_rows : max_row - (page_start_row + s->first_row); - - static constexpr bool enable_print = false; - if constexpr (enable_print) { - printf("NUM_ROWS: col.start_row %lu, page.chunk_row %d, page_start_row %lu, s->first_row %d, s->page.num_rows %d, max_row %lu, min_row %lu, num_rows %lu, s->num_rows %d\n", - s->col.start_row, s->page.chunk_row, page_start_row, s->first_row, s->page.num_rows, max_row, min_row, num_rows, s->num_rows); - } } } @@ -1283,11 +1256,13 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, } if (s->col.column_data_base != nullptr) { + nesting_info->data_out = static_cast(s->col.column_data_base[idx]); if (s->col.column_string_base != nullptr) { nesting_info->string_out = static_cast(s->col.column_string_base[idx]); } nesting_info->data_out = static_cast(s->col.column_data_base[idx]); + if (nesting_info->data_out != nullptr) { // anything below max depth with a valid data pointer must be a list, so the // element size is the size of the offset type. 
@@ -1302,8 +1277,8 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, } nesting_info->valid_map = s->col.valid_map_base[idx]; if (nesting_info->valid_map != nullptr) { - nesting_info->valid_map += output_offset >> 5; //is pointer to warp start - nesting_info->valid_map_offset = (int32_t)(output_offset & 0x1f); //is index within warp + nesting_info->valid_map += output_offset >> 5; + nesting_info->valid_map_offset = (int32_t)(output_offset & 0x1f); } } } @@ -1382,7 +1357,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->dict_pos = 0; s->src_pos = 0; - // for non-lists, we can't know how many leaf values to skip unless we do a full + // for flat hierarchies, we can't know how many leaf values to skip unless we do a full // preprocess of the definition levels (since nulls will have no actual decodable value, there // is no direct correlation between # of rows and # of decodable values). so we will start // processing at the beginning of the value stream and disregard any indices that start @@ -1396,7 +1371,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->row_index_lower_bound = -1; } - // for lists, we have run a preprocess that lets us skip directly to the values + // for nested hierarchies, we have run a preprocess that lets us skip directly to the values // we need to start decoding at else { // input_row_count translates to "how many rows we have processed so far", so since we are @@ -1404,7 +1379,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->input_row_count = s->first_row; // return the lower bound to compare (page-relative) thread row index against. Explanation: - // In the case of lists, rows can span page boundaries. That is to say, + // In the case of nested schemas, rows can span page boundaries. That is to say, // we can encounter the first value for row X on page M, but the last value for page M // might not be the last value for row X. page M+1 (or further) may contain the last value. // @@ -1439,4 +1414,4 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, return true; } -} // namespace cudf::io::parquet::detail +} // namespace cudf::io::parquet::detail \ No newline at end of file diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 53a55a43300..3fad8e344ea 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -184,7 +184,6 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page, } if (is_list(chunk) && !is_string_col(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) { - //if (is_list(chunk)) { if (page.encoding == Encoding::PLAIN) { return decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST; } else if (page.encoding == Encoding::BYTE_STREAM_SPLIT) { diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index d666f129af8..b8093cb3195 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -818,28 +818,6 @@ void DecodeStringPageData(cudf::detail::hostdevice_span pages, kernel_error::pointer error_code, rmm::cuda_stream_view stream); -/** - * @brief Launches kernel for reading the list column data stored in the pages - * - * The page data will be written to the output pointed to in the page's - * associated column chunk. 
- * - * @param[in,out] pages All pages to be decoded - * @param[in] chunks All chunks to be decoded - * @param[in] num_rows Total number of rows to read - * @param[in] min_row Minimum number of rows to read - * @param[in] level_type_size Size in bytes of the type for level decoding - * @param[out] error_code Error code for kernel failures - * @param[in] stream CUDA stream to use - */ -void DecodeListPageData(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - size_t num_rows, - size_t min_row, - int level_type_size, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream); - /** * @brief Launches kernel for reading the DELTA_BINARY_PACKED column data stored in the pages * From 0dccec54a6a3f40d7096d565d23755d48be68cad Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Thu, 5 Sep 2024 18:20:30 -0400 Subject: [PATCH 05/36] Add debugging --- cpp/src/io/parquet/decode_fixed.cu | 192 ++++++++++++++++++++++++++--- 1 file changed, 173 insertions(+), 19 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 685249c607e..33f11aef9b2 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -40,6 +40,8 @@ __device__ inline void gpuDecodeFixedWidthValues( uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; static constexpr bool enable_print = false; + static constexpr bool enable_print_range_error = false; + if constexpr (enable_print) { if(t == 0) { printf("DECODE VALUES: start %d, end %d, first_row %d, leaf_level_index %d, dtype_len %u, " "data_out %p, dict_base %p, dict_size %d, dict_bits %d, dict_val %d, data_start %p, skipped_leaf_values %u, input_row_count %d\n", @@ -57,13 +59,17 @@ __device__ inline void gpuDecodeFixedWidthValues( int src_pos = pos + t; // the position in the output column/buffer -//Index from rolling buffer of values (which doesn't include nulls) to final array (which includes gaps for nulls) +//Index from rolling buffer of values (which doesn't include nulls) to final array (which includes gaps for nulls) auto offset = sb->nz_idx[rolling_index(src_pos)]; int dst_pos = offset; if constexpr (!has_lists_t) { dst_pos -= s->first_row; } + if constexpr (has_lists_t && enable_print_range_error) { + if((dst_pos < 0) && (src_pos < target_pos)) { printf("WHOA: decode dst_pos %d out of bounds, src_pos %d, start %d\n", dst_pos, src_pos, start); } + } + int dict_idx = rolling_index(src_pos + skipped_leaf_values); int dict_pos = sb->dict_idx[dict_idx]; if constexpr (enable_print) { @@ -126,6 +132,14 @@ __device__ inline void gpuDecodeFixedWidthValues( } else { gpuOutputGeneric(s, sb, src_pos, static_cast(dst), dtype_len); } + + if (dtype == INT32) { + int value_stored = *static_cast(dst); + int overall_index = blockIdx.x * 20000 * 4 + src_pos; + if((overall_index % 1024) != value_stored) { + printf("WHOA BAD VALUE: WROTE %d to %d!\n", value_stored, overall_index); + } + } } pos += batch_size; @@ -547,6 +561,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int value_count = s->input_value_count; static constexpr bool enable_print = false; + static constexpr bool enable_print_range_error = false; + static constexpr bool enable_print_large_list = true; int const printf_num_threads = 0; // how many rows we've processed in the page so far @@ -568,6 +584,14 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( __syncthreads(); +if constexpr (enable_print_large_list) { + auto first_ni_value_count = s->nesting_info[0].value_count; + 
if((value_count != (4*input_row_count)) || (input_row_count != first_ni_value_count)){ + printf("ALGO GARBAGE GET: blockIdx.x %d, value_count %d, target_value_count %d, t %d, value_count %d, input_row_count %d, first_ni_value_count %d\n", + blockIdx.x, value_count, target_value_count, t, value_count, input_row_count, first_ni_value_count); + } +} + while (value_count < target_value_count) { if constexpr (enable_print) { if(t == 0) { printf("LIST VALUE COUNT: %d\n", value_count); } @@ -576,24 +600,33 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( // get definition level, use repitition level to get start/end depth // different for each thread, as each thread has a different r/d - int def_level = -1, start_depth = -1, end_depth = -1; + int rep_level = -1, def_level = -1, start_depth = -1, end_depth = -1; if (within_batch) { int const index = rolling_index(value_count + t); - auto const rep_level = static_cast(rep[index]); + rep_level = static_cast(rep[index]); def_level = static_cast(def[index]); //computed by generate_depth_remappings() - if constexpr (enable_print) { + if constexpr (enable_print || enable_print_range_error) { if((rep_level < 0) || (rep_level > max_depth)) { printf("WHOA: rep level %d out of bounds %d!\n", rep_level, max_depth); } + if((def_level < 0)/* || (def_level > (max_depth + 1)) */ ) { + printf("WHOA: def level %d out of bounds (max_depth %d) (index %d)!\n", def_level, max_depth, index); + } } + start_depth = s->nesting_info[rep_level].start_depth; end_depth = s->nesting_info[def_level].end_depth; - if constexpr (enable_print) { - if((def_level < 0) || (def_level > (max_depth + 1))) { - printf("WHOA: def level %d out of bounds (max_depth %d) (index %d) (end_depth %d)!\n", def_level, max_depth, index, end_depth); + if constexpr (enable_print || enable_print_range_error) { + if((start_depth < 0) || (start_depth > (max_depth + 1))) { + printf("WHOA: start_depth %d out of bounds (max_depth %d) (index %d)!\n", start_depth, max_depth, index); } + if((end_depth < 0) || (end_depth > (max_depth + 1))) { + printf("WHOA: end_depth %d out of bounds (max_depth %d) (index %d)!\n", end_depth, max_depth, index); + } + } + if constexpr (enable_print) { if (t < printf_num_threads) { printf("t %d, def_level %d, rep_level %d, start_depth %d, end_depth %d\n", \ t, def_level, rep_level, start_depth, end_depth); } } @@ -609,6 +642,21 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( block_scan(scan_storage).ExclusiveSum(is_new_row, num_prior_new_rows, total_num_new_rows); __syncthreads(); //Needed because scan_storage will be reused +if constexpr (enable_print_large_list) { + if(bool(is_new_row) != (t % 4 == 0)) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, is_new_row %d\n", + blockIdx.x, value_count, target_value_count, t, is_new_row); + } + if(num_prior_new_rows != ((t + 3) / 4)) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, num_prior_new_rows %d\n", + blockIdx.x, value_count, target_value_count, t, num_prior_new_rows); + } + if(total_num_new_rows != 32) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, total_num_new_rows %d\n", + blockIdx.x, value_count, target_value_count, t, total_num_new_rows); + } +} + if constexpr (enable_print) { if (t == 0) { printf("num_prior_new_rows %d, total_num_new_rows %d\n", num_prior_new_rows, total_num_new_rows); } } @@ -633,6 +681,22 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( // queries is_valid 
// from all threads; stores the prior (exclusive) total and the grand total
    int thread_value_count = 0, block_value_count = 0;
    block_scan(scan_storage).ExclusiveSum(in_nesting_bounds, thread_value_count, block_value_count);
+    __syncthreads();
+
+if constexpr (enable_print_large_list) {
+  if(in_nesting_bounds != (t % 4 == 0)) {
+    printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, in_nesting_bounds %d, start_depth %d, end_depth %d, in_row_bounds %d, row_index %d, input_row_count %d\n",
+      blockIdx.x, value_count, target_value_count, t, in_nesting_bounds, start_depth, end_depth, in_row_bounds, row_index, input_row_count);
+  }
+  if(thread_value_count != ((t + 3) / 4)) {
+    printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, thread_value_count %d\n",
+      blockIdx.x, value_count, target_value_count, t, thread_value_count);
+  }
+  if(block_value_count != 32) {
+    printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, block_value_count %d\n",
+      blockIdx.x, value_count, target_value_count, t, block_value_count);
+  }
+}

    if constexpr (enable_print) {
      if (t == 0) { printf("block_value_count %d\n", block_value_count); }
@@ -670,13 +734,15 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(
      static_assert(decode_block_size <= 8*sizeof(__uint128_t),
       "This code relies on bits for block threads fitting within a uint128!");

-      using block_reduce = cub::BlockReduce<__uint128_t, decode_block_size>;
-      __shared__ typename block_reduce::TempStorage reduce_storage;
      auto shifted_validity = static_cast<__uint128_t>(is_valid) << thread_value_count;
      auto or_reducer = [](const __uint128_t& lhs, const __uint128_t& rhs){ return lhs | rhs; };
+
+      using block_reduce = cub::BlockReduce<__uint128_t, decode_block_size>;
+      __shared__ typename block_reduce::TempStorage reduce_storage;
      __uint128_t block_valid_mask = block_reduce(reduce_storage).Reduce(shifted_validity, or_reducer);
+      __syncthreads(); // TODO: WHY IS THIS NEEDED?
//Reduction result is only visible to thread zero, must share with other threads: __shared__ __uint128_t block_valid_mask_storage; @@ -689,6 +755,24 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( }; auto thread_mask = (__uint128_t(1) << thread_value_count) - 1; int const thread_valid_count = count_set_bits(block_valid_mask & thread_mask); +int const block_valid_count = count_set_bits(block_valid_mask); + +if constexpr (enable_print_large_list) { + if(((d_idx == 0) && (is_valid != (t % 4 == 0))) || ((d_idx == 1) && !is_valid)) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, is_valid %d, in_nesting_bounds %d\n", + blockIdx.x, value_count, target_value_count, t, d_idx, is_valid, in_nesting_bounds); + } + if (((d_idx == 0) && (thread_valid_count != ((t + 3)/ 4))) || ((d_idx == 1) && (thread_valid_count != t))) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, thread_valid_count %d\n", + blockIdx.x, value_count, target_value_count, t, d_idx, thread_valid_count); + } + if(((d_idx == 0) && (block_valid_count != 32)) || ((d_idx == 1) && (block_valid_count != 128))) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, block_valid_count %d\n", + blockIdx.x, value_count, target_value_count, t, d_idx, block_valid_count); + } +} + + if constexpr (enable_print) { if((block_valid_mask == 0) && (t == 0) && (d_idx == max_depth)) { @@ -715,6 +799,24 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; block_scan(scan_storage).ExclusiveSum(next_in_nesting_bounds, next_thread_value_count, next_block_value_count); + __syncthreads(); // TODO: WHY IS THIS NEEDED? + + +if constexpr (enable_print_large_list) { + if(next_in_nesting_bounds != 1) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, next_in_nesting_bounds %d, start_depth %d, end_depth %d, in_row_bounds %d, row_index %d, input_row_count %d\n", + blockIdx.x, value_count, target_value_count, t, next_in_nesting_bounds, start_depth, end_depth, in_row_bounds, row_index, input_row_count); + } + if(next_thread_value_count != t) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, next_thread_value_count %d\n", + blockIdx.x, value_count, target_value_count, t, next_thread_value_count); + } + if(next_block_value_count != 128) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, next_block_value_count %d\n", + blockIdx.x, value_count, target_value_count, t, next_block_value_count); + } +} + if constexpr (enable_print) { if (t == 0) { printf("next depth %d, next_block_value_count %d\n", d_idx + 1, next_block_value_count); } @@ -734,6 +836,24 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( //STORE THE OFFSET FOR THE NEW LIST LOCATION (reinterpret_cast(ni.data_out))[idx] = ofs; +int overall_index = 4*(blockIdx.x * 20000 + idx); +if(overall_index != ofs) { + printf("WHOA BAD OFFSET: WROTE %d to %d! 
t %d, blockIdx.x %d, idx %d, d_idx %d, start_depth %d, end_depth %d, max_depth %d, " + "in_row_bounds %d, in_nesting_bounds %d, next_in_nesting_bounds %d, row_index %d, row_index_lower_bound %d, last_row %d, " + "input_row_count %d, num_prior_new_rows %d, is_new_row %d, total_num_new_rows %d, rep_level %d, def_level %d, ni.value_count %d, " + "thread_value_count %d, next_ni.value_count %d, next_thread_value_count %d, next_ni.page_start_value %d, value_count %d, " + "target_value_count %d, block_value_count %d, next_block_value_count %d\n", + ofs, overall_index, t, blockIdx.x, idx, d_idx, start_depth, end_depth, max_depth, in_row_bounds, in_nesting_bounds, + next_in_nesting_bounds, row_index, row_index_lower_bound, last_row, input_row_count, num_prior_new_rows, is_new_row, + total_num_new_rows, rep_level, def_level, ni.value_count, thread_value_count, next_ni.value_count, + next_thread_value_count, next_ni.page_start_value, value_count, target_value_count, block_value_count, next_block_value_count); +} + + if constexpr (enable_print || enable_print_range_error) { + if((idx < 0) || (idx > 50000)){ printf("WHOA: offset index %d out of bounds!\n", idx); } + if(ofs < 0){ printf("WHOA: offset value %d out of bounds!\n", ofs); } + } + if constexpr (enable_print) { if(idx < 0) { printf("WHOA: offset index out of bounds!\n"); } if (t < printf_num_threads) { printf("OFFSETS: t %d, idx %d, next value count %d, next page_start_value %d, ofs %d\n", @@ -801,18 +921,21 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int const src_pos = ni.valid_count + thread_valid_count; int const output_index = rolling_index(src_pos); + if constexpr (enable_print || enable_print_range_error) { + if((output_index < 0) || (output_index >= state_buf::nz_buf_size)) { + printf("WHOA: output index STORE %d out of bounds!\n", output_index); + } + if(dst_pos < 0) { printf("WHOA: dst_pos STORE %d out of bounds!\n", dst_pos); } + } + if constexpr (enable_print) { if (t == 0) { printf("ni.value_count %d, ni.valid_count %d\n", int(ni.value_count), int(ni.valid_count)); } if (t < printf_num_threads) { printf("t %d, src_pos %d, output_index %d\n", t, src_pos, output_index); } - if((output_index < 0) || (output_index >= state_buf::nz_buf_size)) { - printf("WHOA: output index out of bounds!\n"); - } - if((t == 0) && (src_pos == 0)) {printf("SPECIAL: output_index %d, dst_pos %d, ni.value_count %d, ni.valid_count %d, thread_value_count %d, thread_valid_count %d\n", output_index, dst_pos, ni.value_count, ni.valid_count, thread_value_count, thread_valid_count);} - printf("OUTPUT_INDICES: output_index %d, dst_pos %d\n", output_index, dst_pos); + if (t == 0) { printf("OUTPUT_INDICES: output_index %d, dst_pos %d\n", output_index, dst_pos); } } //Index from rolling buffer of values (which doesn't include nulls) to final array (which includes gaps for nulls) @@ -822,11 +945,12 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( // update stuff if (t == 0) { - int const block_valid_count = count_set_bits(block_valid_mask); +// int const block_valid_count = count_set_bits(block_valid_mask); ni.valid_count += block_valid_count; ni.value_count += block_value_count; ni.valid_map_offset += block_value_count; } + __syncthreads(); // handle modification of ni.value_count from below // propagate value counts for the next depth level block_value_count = next_block_value_count; @@ -853,6 +977,13 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( // If we have lists # rows != # values s->input_row_count = 
input_row_count; +if constexpr (enable_print_large_list) { + auto first_ni_value_count = s->nesting_info[0].value_count; + if((value_count != (4*input_row_count)) || (input_row_count != first_ni_value_count)){ + printf("ALGO GARBAGE SET: blockIdx.x %d, value_count %d, target_value_count %d, t %d, value_count %d, input_row_count %d, first_ni_value_count %d\n", + blockIdx.x, value_count, target_value_count, t, value_count, input_row_count, first_ni_value_count); + } +} } __syncthreads(); @@ -927,6 +1058,11 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) page_state_s* const s = &state_g; auto* const sb = &state_buffers; int const page_idx = blockIdx.x; +/* page_idx = (page_idx == -1) ? blockIdx.x : page_idx + blockIdx.x; + if(page_idx >= num_pages) { + printf("BAIL ON PAGE %d of %d\n", page_idx, num_pages); + return; + }*/ int const t = threadIdx.x; PageInfo* pp = &pages[page_idx]; @@ -1008,7 +1144,15 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) if(t == 0) { printf("INIT DICT: dict_bits %d, data_start %p, data_end %p, dict_idx %p, page.num_input_values %d, s->dict_pos %d \n", s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values, s->dict_pos); } } - dict_stream.decode_next(t, s->page.skipped_leaf_values); + if constexpr (has_lists_t){ + int init_decode = 0; + while (init_decode < s->page.skipped_leaf_values) { + auto const to_skip = min(decode_block_size_t, s->page.skipped_leaf_values - init_decode); + dict_stream.decode_next(t, to_skip); + init_decode += to_skip; + __syncthreads(); + } + } } __syncthreads(); @@ -1044,14 +1188,16 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) print_nesting_level(s->nesting_info[0]); printf("POST %d NESTING 1: ", int(is_post)); print_nesting_level(s->nesting_info[1]); - printf("POST %d NESTING 2: ", int(is_post)); - print_nesting_level(s->nesting_info[2]); + //printf("POST %d NESTING 2: ", int(is_post)); + //print_nesting_level(s->nesting_info[2]); } } }; print_nestings(false); - + if constexpr (enable_print) { + if(t == 0) {printf("LOOP START page_idx %d\n", page_idx);} + } while (s->error == 0 && processed_count < s->page.num_input_values) { int next_valid_count; @@ -1153,7 +1299,13 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, dim3 dim_block(decode_block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + /* + auto num_pages = pages.size(); + auto grid_dim = num_pages; //2, 10, 40, 100 no problem; all = problem + dim3 dim_grid(grid_dim, 1); // 1 threadblock per page +for(decltype(num_pages) idx = 0; idx < num_pages; idx += grid_dim) { + */ if (level_type_size == 1) { if (is_list) { gpuDecodePageDataGeneric pages, decode_fixed_width_values_func> <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); +// pages.device_ptr(), chunks, min_row, num_rows, error_code, idx, num_pages); } else if (has_nesting) { gpuDecodePageDataGeneric pages, decode_fixed_width_values_func> <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); +// pages.device_ptr(), chunks, min_row, num_rows, error_code, idx, num_pages); } else if (has_nesting) { gpuDecodePageDataGeneric Date: Sat, 7 Sep 2024 14:55:13 -0400 Subject: [PATCH 06/36] Tests working --- cpp/src/io/parquet/decode_fixed.cu | 147 +++++++++++++++++++---------- cpp/src/io/parquet/reader_impl.cpp | 4 + 2 files changed, 103 insertions(+), 48 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 33f11aef9b2..b1acb4d8a86 100644 --- 
a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -41,6 +41,8 @@ __device__ inline void gpuDecodeFixedWidthValues( static constexpr bool enable_print = false; static constexpr bool enable_print_range_error = false; + static constexpr bool enable_print_large_list = false; + static constexpr bool enable_print_loop_check = false; if constexpr (enable_print) { if(t == 0) { printf("DECODE VALUES: start %d, end %d, first_row %d, leaf_level_index %d, dtype_len %u, " @@ -50,11 +52,20 @@ __device__ inline void gpuDecodeFixedWidthValues( } } +int loop_count = 0; + // decode values int pos = start; while (pos < end) { int const batch_size = min(max_batch_size, end - pos); + if constexpr (enable_print_loop_check) { + ++loop_count; + if((loop_count > 100) && (t == 0)) { + printf("INFINITE LOOP IN gpuDecodeFixedWidthValues!\n"); + } + } + int const target_pos = pos + batch_size; int src_pos = pos + t; @@ -133,11 +144,13 @@ __device__ inline void gpuDecodeFixedWidthValues( gpuOutputGeneric(s, sb, src_pos, static_cast(dst), dtype_len); } - if (dtype == INT32) { - int value_stored = *static_cast(dst); - int overall_index = blockIdx.x * 20000 * 4 + src_pos; - if((overall_index % 1024) != value_stored) { - printf("WHOA BAD VALUE: WROTE %d to %d!\n", value_stored, overall_index); + if constexpr (enable_print_large_list) { + if (dtype == INT32) { + int value_stored = *static_cast(dst); + int overall_index = blockIdx.x * 20000 * 4 + src_pos; + if((overall_index % 1024) != value_stored) { + printf("WHOA BAD VALUE: WROTE %d to %d!\n", value_stored, overall_index); + } } } } @@ -562,7 +575,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( static constexpr bool enable_print = false; static constexpr bool enable_print_range_error = false; - static constexpr bool enable_print_large_list = true; + static constexpr bool enable_print_large_list = false; + static constexpr bool enable_print_loop_check = false; int const printf_num_threads = 0; // how many rows we've processed in the page so far @@ -592,7 +606,20 @@ if constexpr (enable_print_large_list) { } } + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; + +int loop_count = 0; + while (value_count < target_value_count) { + + if constexpr (enable_print_loop_check) { + ++loop_count; + if((loop_count > 100) && (t == 0)) { + printf("INFINITE LOOP IN LISTS!\n"); + } + } + if constexpr (enable_print) { if(t == 0) { printf("LIST VALUE COUNT: %d\n", value_count); } } @@ -627,20 +654,19 @@ if constexpr (enable_print_large_list) { } } if constexpr (enable_print) { - if (t < printf_num_threads) { printf("t %d, def_level %d, rep_level %d, start_depth %d, end_depth %d\n", \ - t, def_level, rep_level, start_depth, end_depth); } + if (t == 0) { printf("t %d, def_level %d, rep_level %d, start_depth %d, end_depth %d, max_depth %d\n", \ + t, def_level, rep_level, start_depth, end_depth, max_depth); } } } //Determine value count & row index // track (page-relative) row index for the thread so we can compare against input bounds // keep track of overall # of rows we've read. + //THIS IS THE UNDO POINT int const is_new_row = start_depth == 0 ? 
1 : 0; int num_prior_new_rows, total_num_new_rows; - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; block_scan(scan_storage).ExclusiveSum(is_new_row, num_prior_new_rows, total_num_new_rows); - __syncthreads(); //Needed because scan_storage will be reused + __syncthreads(); if constexpr (enable_print_large_list) { if(bool(is_new_row) != (t % 4 == 0)) { @@ -704,9 +730,17 @@ if constexpr (enable_print_large_list) { t, thread_value_count, in_nesting_bounds); } } +int depth_loop_count = 0; // column is either nullable or is a list (or both): iterate by depth for (int d_idx = 0; d_idx <= max_depth; d_idx++) { + if constexpr (enable_print_loop_check) { + ++depth_loop_count; + if((depth_loop_count > 100) && (t == 0)) { + printf("INFINITE LOOP IN LISTS DEPTH!\n"); + } + } + auto& ni = s->nesting_info[d_idx]; // everything up to the max_def_level is a non-null value @@ -742,7 +776,6 @@ if constexpr (enable_print_large_list) { using block_reduce = cub::BlockReduce<__uint128_t, decode_block_size>; __shared__ typename block_reduce::TempStorage reduce_storage; __uint128_t block_valid_mask = block_reduce(reduce_storage).Reduce(shifted_validity, or_reducer); - __syncthreads(); // TODO: WHY IS THIS NEEDED? //Reduction result is only visible to thread zero, must share with other threads: __shared__ __uint128_t block_valid_mask_storage; @@ -755,7 +788,7 @@ if constexpr (enable_print_large_list) { }; auto thread_mask = (__uint128_t(1) << thread_value_count) - 1; int const thread_valid_count = count_set_bits(block_valid_mask & thread_mask); -int const block_valid_count = count_set_bits(block_valid_mask); +//int const block_valid_count = count_set_bits(block_valid_mask); if constexpr (enable_print_large_list) { if(((d_idx == 0) && (is_valid != (t % 4 == 0))) || ((d_idx == 1) && !is_valid)) { @@ -766,14 +799,12 @@ if constexpr (enable_print_large_list) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, thread_valid_count %d\n", blockIdx.x, value_count, target_value_count, t, d_idx, thread_valid_count); } - if(((d_idx == 0) && (block_valid_count != 32)) || ((d_idx == 1) && (block_valid_count != 128))) { +/* if(((d_idx == 0) && (block_valid_count != 32)) || ((d_idx == 1) && (block_valid_count != 128))) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, block_valid_count %d\n", blockIdx.x, value_count, target_value_count, t, d_idx, block_valid_count); - } + }*/ } - - if constexpr (enable_print) { if((block_valid_mask == 0) && (t == 0) && (d_idx == max_depth)) { printf("EMPTY VALID MASK: def_level %d, max_def_level %d, in_nesting_bounds %d, start_depth %d, " @@ -796,11 +827,8 @@ if constexpr (enable_print_large_list) { next_in_nesting_bounds = (d_idx + 1 >= start_depth && d_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0; - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; block_scan(scan_storage).ExclusiveSum(next_in_nesting_bounds, next_thread_value_count, next_block_value_count); - __syncthreads(); // TODO: WHY IS THIS NEEDED? 
- + __syncthreads(); if constexpr (enable_print_large_list) { if(next_in_nesting_bounds != 1) { @@ -817,7 +845,6 @@ if constexpr (enable_print_large_list) { } } - if constexpr (enable_print) { if (t == 0) { printf("next depth %d, next_block_value_count %d\n", d_idx + 1, next_block_value_count); } if (t < printf_num_threads) { printf("t %d, start_depth %d, end_depth %d, in_row_bounds %d, next_in_nesting_bounds %d\n", @@ -836,17 +863,19 @@ if constexpr (enable_print_large_list) { //STORE THE OFFSET FOR THE NEW LIST LOCATION (reinterpret_cast(ni.data_out))[idx] = ofs; -int overall_index = 4*(blockIdx.x * 20000 + idx); -if(overall_index != ofs) { - printf("WHOA BAD OFFSET: WROTE %d to %d! t %d, blockIdx.x %d, idx %d, d_idx %d, start_depth %d, end_depth %d, max_depth %d, " - "in_row_bounds %d, in_nesting_bounds %d, next_in_nesting_bounds %d, row_index %d, row_index_lower_bound %d, last_row %d, " - "input_row_count %d, num_prior_new_rows %d, is_new_row %d, total_num_new_rows %d, rep_level %d, def_level %d, ni.value_count %d, " - "thread_value_count %d, next_ni.value_count %d, next_thread_value_count %d, next_ni.page_start_value %d, value_count %d, " - "target_value_count %d, block_value_count %d, next_block_value_count %d\n", - ofs, overall_index, t, blockIdx.x, idx, d_idx, start_depth, end_depth, max_depth, in_row_bounds, in_nesting_bounds, - next_in_nesting_bounds, row_index, row_index_lower_bound, last_row, input_row_count, num_prior_new_rows, is_new_row, - total_num_new_rows, rep_level, def_level, ni.value_count, thread_value_count, next_ni.value_count, - next_thread_value_count, next_ni.page_start_value, value_count, target_value_count, block_value_count, next_block_value_count); +if constexpr (enable_print_large_list) { + int overall_index = 4*(blockIdx.x * 20000 + idx); + if(overall_index != ofs) { + printf("WHOA BAD OFFSET: WROTE %d to %d! 
t %d, blockIdx.x %d, idx %d, d_idx %d, start_depth %d, end_depth %d, max_depth %d, " + "in_row_bounds %d, in_nesting_bounds %d, next_in_nesting_bounds %d, row_index %d, row_index_lower_bound %d, last_row %d, " + "input_row_count %d, num_prior_new_rows %d, is_new_row %d, total_num_new_rows %d, rep_level %d, def_level %d, ni.value_count %d, " + "thread_value_count %d, next_ni.value_count %d, next_thread_value_count %d, next_ni.page_start_value %d, value_count %d, " + "target_value_count %d, block_value_count %d, next_block_value_count %d\n", + ofs, overall_index, t, blockIdx.x, idx, d_idx, start_depth, end_depth, max_depth, in_row_bounds, in_nesting_bounds, + next_in_nesting_bounds, row_index, row_index_lower_bound, last_row, input_row_count, num_prior_new_rows, is_new_row, + total_num_new_rows, rep_level, def_level, ni.value_count, thread_value_count, next_ni.value_count, + next_thread_value_count, next_ni.page_start_value, value_count, target_value_count, block_value_count, next_block_value_count); + } } if constexpr (enable_print || enable_print_range_error) { @@ -914,11 +943,14 @@ if(overall_index != ofs) { } // if this is valid and we're at the leaf, output dst_pos + // Read these before the sync, so that when thread 0 modifies them we've already read their values + int current_value_count = ni.value_count; + int current_valid_count = ni.valid_count; __syncthreads(); // handle modification of ni.valid_count from below if (is_valid && d_idx == max_depth) { // for non-list types, the value count is always the same across - int const dst_pos = ni.value_count + thread_value_count; - int const src_pos = ni.valid_count + thread_valid_count; + int const dst_pos = current_value_count + thread_value_count; + int const src_pos = current_valid_count + thread_valid_count; int const output_index = rolling_index(src_pos); if constexpr (enable_print || enable_print_range_error) { @@ -941,11 +973,11 @@ if(overall_index != ofs) { //Index from rolling buffer of values (which doesn't include nulls) to final array (which includes gaps for nulls) sb->nz_idx[output_index] = dst_pos; } - __syncthreads(); // handle modification of ni.value_count from below +// __syncthreads(); // handle modification of ni.value_count from below TODO: TRY REMOVE // update stuff if (t == 0) { -// int const block_valid_count = count_set_bits(block_valid_mask); + int const block_valid_count = count_set_bits(block_valid_mask); ni.valid_count += block_valid_count; ni.value_count += block_value_count; ni.valid_map_offset += block_value_count; @@ -1044,7 +1076,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) device_span chunks, size_t min_row, size_t num_rows, - kernel_error::pointer error_code) + kernel_error::pointer error_code /*, int page_idx = -1, int num_pages = -1*/) { constexpr int rolling_buf_size = decode_block_size_t * 2; constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(); @@ -1059,7 +1091,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) auto* const sb = &state_buffers; int const page_idx = blockIdx.x; /* page_idx = (page_idx == -1) ? blockIdx.x : page_idx + blockIdx.x; - if(page_idx >= num_pages) { + if((page_idx >= num_pages) && (num_pages != -1)) { printf("BAIL ON PAGE %d of %d\n", page_idx, num_pages); return; }*/ @@ -1091,16 +1123,18 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) bool const should_process_def_levels = should_process_nulls || has_lists_t; // shared buffer. 
all shared memory is suballocated out of here + static constexpr auto align_test = false; + static constexpr size_t buffer_alignment = align_test ? 128 : 16; constexpr int shared_rep_size = has_lists_t ? cudf::util::round_up_unsafe(rle_run_buffer_size * - sizeof(rle_run), size_t{16}) : 0; + sizeof(rle_run), buffer_alignment) : 0; constexpr int shared_dict_size = has_dict_t - ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}) + ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), buffer_alignment) : 0; constexpr int shared_def_size = - cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}); + cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), buffer_alignment); constexpr int shared_buf_size = shared_rep_size + shared_dict_size + shared_def_size; - __shared__ __align__(16) uint8_t shared_buf[shared_buf_size]; + __shared__ __align__(buffer_alignment) uint8_t shared_buf[shared_buf_size]; // setup all shared memory buffers int shared_offset = 0; @@ -1133,11 +1167,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) } static constexpr bool enable_print = false; + static constexpr bool enable_print_loop_check = false; rle_stream dict_stream{dict_runs}; if constexpr (has_dict_t) { - //auto const skipped_leaf_values = s->page.skipped_leaf_values; - //int const dict_offset = skipped_leaf_values * sizeof(uint32_t); dict_stream.init( s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values); if constexpr (enable_print) { @@ -1156,6 +1189,12 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) } __syncthreads(); + if constexpr (enable_print) { + if((t == 0) && (page_idx == 0)){ + printf("SIZES: shared_rep_size %d, shared_dict_size %d, shared_def_size %d\n", shared_rep_size, shared_dict_size, shared_def_size); + } + } + // We use two counters in the loop below: processed_count and valid_count. // - processed_count: number of values out of num_input_values that we have decoded so far. 
// the definition stream returns the number of total rows it has processed in each call @@ -1198,11 +1237,22 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) if constexpr (enable_print) { if(t == 0) {printf("LOOP START page_idx %d\n", page_idx);} } +int loop_count = 0; while (s->error == 0 && processed_count < s->page.num_input_values) { int next_valid_count; + if constexpr (enable_print_loop_check) { + ++loop_count; + if((loop_count > 10000) && (t == 0)) { + printf("INFINITE LOOP IN MAIN!\n"); + } + } + if constexpr (has_lists_t){ rep_decoder.decode_next(t); + if constexpr (!align_test) { + __syncthreads(); + } } // only need to process definition levels if this is a nullable column @@ -1299,13 +1349,13 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, dim3 dim_block(decode_block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page - /* +/* auto num_pages = pages.size(); - auto grid_dim = num_pages; //2, 10, 40, 100 no problem; all = problem + auto grid_dim = 1; //2, 10, 40, 100 no problem; all = problem dim3 dim_grid(grid_dim, 1); // 1 threadblock per page for(decltype(num_pages) idx = 0; idx < num_pages; idx += grid_dim) { - */ +*/ if (level_type_size == 1) { if (is_list) { gpuDecodePageDataGeneric pages, diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 8f33f318f54..b305a7348e1 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -51,6 +51,8 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num { auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; +//printf("PREP LAUNCH: decode_page_data: mode %d, skip_rows %lu, num_rows %lu, #pages %lu\n", +// (int)mode, skip_rows, num_rows, subpass.pages.size()); auto& page_nesting = subpass.page_nesting_info; auto& page_nesting_decode = subpass.page_nesting_decode_info; @@ -418,9 +420,11 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num page_nesting.device_to_host_async(_stream); page_nesting_decode.device_to_host_async(_stream); +//printf("SYNC ERROR CODE\n"); if (auto const error = error_code.value_sync(_stream); error != 0) { CUDF_FAIL("Parquet data decode failed with code(s) " + kernel_error::to_string(error)); } +//printf("ERROR CODE SUNK\n"); // for list columns, add the final offset to every offset buffer. // TODO : make this happen in more efficiently. 
Maybe use thrust::for_each From 24c9ab1012d69d0a5e98134ce8c3c88082f34c24 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Mon, 9 Sep 2024 17:27:16 -0400 Subject: [PATCH 07/36] compile fixes --- cpp/src/io/parquet/decode_fixed.cu | 60 +++++++----------------------- 1 file changed, 14 insertions(+), 46 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index b1acb4d8a86..62bdf01c533 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -42,7 +42,6 @@ __device__ inline void gpuDecodeFixedWidthValues( static constexpr bool enable_print = false; static constexpr bool enable_print_range_error = false; static constexpr bool enable_print_large_list = false; - static constexpr bool enable_print_loop_check = false; if constexpr (enable_print) { if(t == 0) { printf("DECODE VALUES: start %d, end %d, first_row %d, leaf_level_index %d, dtype_len %u, " @@ -52,20 +51,11 @@ __device__ inline void gpuDecodeFixedWidthValues( } } -int loop_count = 0; - // decode values int pos = start; while (pos < end) { int const batch_size = min(max_batch_size, end - pos); - if constexpr (enable_print_loop_check) { - ++loop_count; - if((loop_count > 100) && (t == 0)) { - printf("INFINITE LOOP IN gpuDecodeFixedWidthValues!\n"); - } - } - int const target_pos = pos + batch_size; int src_pos = pos + t; @@ -576,8 +566,6 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( static constexpr bool enable_print = false; static constexpr bool enable_print_range_error = false; static constexpr bool enable_print_large_list = false; - static constexpr bool enable_print_loop_check = false; - int const printf_num_threads = 0; // how many rows we've processed in the page so far int input_row_count = s->input_row_count; @@ -609,17 +597,8 @@ if constexpr (enable_print_large_list) { using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; -int loop_count = 0; - while (value_count < target_value_count) { - if constexpr (enable_print_loop_check) { - ++loop_count; - if((loop_count > 100) && (t == 0)) { - printf("INFINITE LOOP IN LISTS!\n"); - } - } - if constexpr (enable_print) { if(t == 0) { printf("LIST VALUE COUNT: %d\n", value_count); } } @@ -700,7 +679,7 @@ if constexpr (enable_print_large_list) { if constexpr (enable_print) { if(t == 0) { printf("LIST ROWS: row_index %d, row_index_lower_bound %d, last_row %d, in_row_bounds %d, in_nesting_bounds %d\n", row_index, row_index_lower_bound, last_row, in_row_bounds, in_nesting_bounds); } - if (t < printf_num_threads) { printf("t %d, is_new_row %d, num_prior_new_rows %d, row_index %d, in_row_bounds %d\n", + if (t < 32) { printf("t %d, is_new_row %d, num_prior_new_rows %d, row_index %d, in_row_bounds %d\n", t, is_new_row, num_prior_new_rows, row_index, in_row_bounds); } } @@ -726,21 +705,13 @@ if constexpr (enable_print_large_list) { if constexpr (enable_print) { if (t == 0) { printf("block_value_count %d\n", block_value_count); } - if (t < printf_num_threads) { printf("t %d, thread_value_count %d, in_nesting_bounds %d\n", + if (t < 32) { printf("t %d, thread_value_count %d, in_nesting_bounds %d\n", t, thread_value_count, in_nesting_bounds); } } -int depth_loop_count = 0; // column is either nullable or is a list (or both): iterate by depth for (int d_idx = 0; d_idx <= max_depth; d_idx++) { - if constexpr (enable_print_loop_check) { - ++depth_loop_count; - if((depth_loop_count > 100) && (t == 0)) { - printf("INFINITE LOOP IN LISTS DEPTH!\n"); - } - } - auto& ni = 
s->nesting_info[d_idx]; // everything up to the max_def_level is a non-null value @@ -754,7 +725,7 @@ int depth_loop_count = 0; if constexpr (enable_print) { if (t == 0) { printf("nullable %d, depth %d, max_depth %d, max_def_level %d, value_count %d\n", int(nullable), d_idx, max_depth, ni.max_def_level, value_count); } - if (t < printf_num_threads) { printf("t %d, def_level %d, in_nesting_bounds %d, is_valid %d\n", + if (t < 32) { printf("t %d, def_level %d, in_nesting_bounds %d, is_valid %d\n", t, def_level, in_nesting_bounds, is_valid); } } @@ -813,7 +784,7 @@ if constexpr (enable_print_large_list) { row_index_lower_bound, last_row, input_row_count); } if (t == 0) { printf("block_valid_mask %u\n", int(block_valid_mask)); } - if (t < printf_num_threads) { printf("t %d, thread_valid_count %d\n", t, thread_valid_count); } + if (t < 32) { printf("t %d, thread_valid_count %d\n", t, thread_valid_count); } } // compute warp and thread value counts for the -next- nesting level. we need to @@ -847,9 +818,9 @@ if constexpr (enable_print_large_list) { if constexpr (enable_print) { if (t == 0) { printf("next depth %d, next_block_value_count %d\n", d_idx + 1, next_block_value_count); } - if (t < printf_num_threads) { printf("t %d, start_depth %d, end_depth %d, in_row_bounds %d, next_in_nesting_bounds %d\n", + if (t < 32) { printf("t %d, start_depth %d, end_depth %d, in_row_bounds %d, next_in_nesting_bounds %d\n", t, start_depth, end_depth, in_row_bounds, next_in_nesting_bounds); } - if (t < printf_num_threads) { printf("t %d, next_thread_value_count %d\n", t, next_thread_value_count); } + if (t < 32) { printf("t %d, next_thread_value_count %d\n", t, next_thread_value_count); } } // if we're -not- at a leaf column and we're within nesting/row bounds @@ -885,7 +856,7 @@ if constexpr (enable_print_large_list) { if constexpr (enable_print) { if(idx < 0) { printf("WHOA: offset index out of bounds!\n"); } - if (t < printf_num_threads) { printf("OFFSETS: t %d, idx %d, next value count %d, next page_start_value %d, ofs %d\n", + if (t < 32) { printf("OFFSETS: t %d, idx %d, next value count %d, next page_start_value %d, ofs %d\n", t, idx, next_ni.value_count, next_ni.page_start_value, ofs); } } } @@ -962,7 +933,7 @@ if constexpr (enable_print_large_list) { if constexpr (enable_print) { if (t == 0) { printf("ni.value_count %d, ni.valid_count %d\n", int(ni.value_count), int(ni.valid_count)); } - if (t < printf_num_threads) { printf("t %d, src_pos %d, output_index %d\n", t, src_pos, output_index); } + if (t < 32) { printf("t %d, src_pos %d, output_index %d\n", t, src_pos, output_index); } if((t == 0) && (src_pos == 0)) {printf("SPECIAL: output_index %d, dst_pos %d, ni.value_count %d, ni.valid_count %d, thread_value_count %d, thread_valid_count %d\n", output_index, dst_pos, ni.value_count, ni.valid_count, thread_value_count, thread_valid_count);} @@ -1167,7 +1138,6 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) } static constexpr bool enable_print = false; - static constexpr bool enable_print_loop_check = false; rle_stream dict_stream{dict_runs}; if constexpr (has_dict_t) { @@ -1193,6 +1163,11 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) if((t == 0) && (page_idx == 0)){ printf("SIZES: shared_rep_size %d, shared_dict_size %d, shared_def_size %d\n", shared_rep_size, shared_dict_size, shared_def_size); } + if constexpr (has_lists_t){ + printf("Is fixed list page\n"); + } else { + printf("Is fixed non-list page\n"); + } } // We use two counters in the loop below: processed_count and 
valid_count. @@ -1237,17 +1212,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) if constexpr (enable_print) { if(t == 0) {printf("LOOP START page_idx %d\n", page_idx);} } -int loop_count = 0; + while (s->error == 0 && processed_count < s->page.num_input_values) { int next_valid_count; - if constexpr (enable_print_loop_check) { - ++loop_count; - if((loop_count > 10000) && (t == 0)) { - printf("INFINITE LOOP IN MAIN!\n"); - } - } - if constexpr (has_lists_t){ rep_decoder.decode_next(t); if constexpr (!align_test) { From 342c2f43633ca10c0a547d2f6ace65836fde6b94 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Tue, 10 Sep 2024 14:25:33 -0400 Subject: [PATCH 08/36] No need to decode def levels if not nullable --- cpp/src/io/parquet/decode_fixed.cu | 31 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 62bdf01c533..cd4345266d2 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -610,20 +610,24 @@ if constexpr (enable_print_large_list) { if (within_batch) { int const index = rolling_index(value_count + t); rep_level = static_cast(rep[index]); - def_level = static_cast(def[index]); + if constexpr (nullable) { + def_level = static_cast(def[index]); + end_depth = s->nesting_info[def_level].end_depth; + } else { + end_depth = max_depth; + } //computed by generate_depth_remappings() if constexpr (enable_print || enable_print_range_error) { if((rep_level < 0) || (rep_level > max_depth)) { printf("WHOA: rep level %d out of bounds %d!\n", rep_level, max_depth); } - if((def_level < 0)/* || (def_level > (max_depth + 1)) */ ) { + if(nullable && ((def_level < 0)/* || (def_level > (max_depth + 1)) */ )) { printf("WHOA: def level %d out of bounds (max_depth %d) (index %d)!\n", def_level, max_depth, index); } } start_depth = s->nesting_info[rep_level].start_depth; - end_depth = s->nesting_info[def_level].end_depth; if constexpr (enable_print || enable_print_range_error) { if((start_depth < 0) || (start_depth > (max_depth + 1))) { printf("WHOA: start_depth %d out of bounds (max_depth %d) (index %d)!\n", start_depth, max_depth, index); @@ -736,6 +740,7 @@ if constexpr (enable_print_large_list) { // however not all of them will necessarily represent a value at this nesting level. so // the validity bit for thread t might actually represent output value t-6. the correct // position for thread t's bit is thread_value_count. + static_assert(decode_block_size <= 8*sizeof(__uint128_t), "This code relies on bits for block threads fitting within a uint128!"); @@ -944,7 +949,6 @@ if constexpr (enable_print_large_list) { //Index from rolling buffer of values (which doesn't include nulls) to final array (which includes gaps for nulls) sb->nz_idx[output_index] = dst_pos; } -// __syncthreads(); // handle modification of ni.value_count from below TODO: TRY REMOVE // update stuff if (t == 0) { @@ -1091,7 +1095,6 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) bool const nullable = is_nullable(s); bool const should_process_nulls = nullable && maybe_has_nulls(s); - bool const should_process_def_levels = should_process_nulls || has_lists_t; // shared buffer. 
all shared memory is suballocated out of here static constexpr auto align_test = false; @@ -1119,7 +1122,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // initialize the stream decoders (requires values computed in setupLocalPageInfo) rle_stream def_decoder{def_runs}; level_t* const def = reinterpret_cast(pp->lvl_decode_buf[level_type::DEFINITION]); - if (should_process_def_levels) { + if (should_process_nulls) { def_decoder.init(s->col.level_bits[level_type::DEFINITION], s->abs_lvl_start[level_type::DEFINITION], s->abs_lvl_end[level_type::DEFINITION], @@ -1187,7 +1190,6 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) page_idx, int(nullable), int(should_process_nulls), int(has_lists_t), int(has_dict_t), num_rows, s->page.num_input_values); } } - auto print_nestings = [&](bool is_post){ if constexpr (enable_print) { auto print_nesting_level = [&](const PageNestingDecodeInfo& ni) { @@ -1216,19 +1218,15 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) while (s->error == 0 && processed_count < s->page.num_input_values) { int next_valid_count; - if constexpr (has_lists_t){ - rep_decoder.decode_next(t); - if constexpr (!align_test) { - __syncthreads(); - } - } - // only need to process definition levels if this is a nullable column if (should_process_nulls) { processed_count += def_decoder.decode_next(t); __syncthreads(); if constexpr (has_lists_t) { + rep_decoder.decode_next(t); + __syncthreads(); + int value_count = s->input_value_count; next_valid_count = gpuUpdateValidityAndRowIndicesLists( processed_count, s, sb, def, rep, t); @@ -1254,13 +1252,12 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // nz_idx. gpuDecodeFixedWidthValues would be the only work that happens. else { if constexpr (has_lists_t) { - // no nulls, but if we have a list we still need the definition levels - processed_count += def_decoder.decode_next(t); + processed_count += rep_decoder.decode_next(t); __syncthreads(); next_valid_count = gpuUpdateValidityAndRowIndicesLists( - processed_count, s, sb, def, rep, t); + processed_count, s, sb, nullptr, rep, t); } else { processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); From 50bbc94a571ea1163acebbb23453d01c0ba8793d Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Tue, 10 Sep 2024 16:03:25 -0400 Subject: [PATCH 09/36] Manual block scan --- cpp/src/io/parquet/decode_fixed.cu | 177 ++++++++++++++++++----------- 1 file changed, 110 insertions(+), 67 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index cd4345266d2..b47b96b91a2 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -551,6 +551,48 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( return valid_count; } +struct scan_results +{ + uint32_t warp_bits; + int thread_count_within_warp; + int warp_count; + + int thread_count_within_block; + int block_count; +}; + +template +static __device__ void scan_block(uint32_t warp_bits, int warp_lane, int warp_index, uint32_t thread_mask, scan_results& results) +{ + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + + results.warp_bits = warp_bits; + results.warp_count = __popc(results.warp_bits); + results.thread_count_within_warp = __popc(results.warp_bits & thread_mask); + + __shared__ uint32_t warp_counts[num_warps]; + if(warp_lane == 0) { + warp_counts[warp_index] = results.warp_count; + } + __syncthreads(); + + results.block_count = 0; + results.thread_count_within_block = 
results.thread_count_within_warp; + for(int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { + results.block_count += warp_counts[warp_idx]; + if(warp_idx < warp_index) { + results.thread_count_within_block += warp_counts[warp_idx]; + } + } +} + +template +static __device__ void scan_block(int thread_bit, int warp_lane, int warp_index, uint32_t thread_mask, scan_results& results) +{ + uint32_t warp_bits = ballot(thread_bit); + scan_block(warp_bits, warp_lane, warp_index, thread_mask, results); +} + template static __device__ int gpuUpdateValidityAndRowIndicesLists( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, @@ -597,6 +639,11 @@ if constexpr (enable_print_large_list) { using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; + int const warp_lane = t % cudf::detail::warp_size; + bool const is_first_lane = warp_lane == 0; + int const warp_index = t / cudf::detail::warp_size; + uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1; + while (value_count < target_value_count) { if constexpr (enable_print) { @@ -688,9 +735,15 @@ if constexpr (enable_print_large_list) { } // queries is_valid from all threads, stores prior total and total total - int thread_value_count = 0, block_value_count = 0; - block_scan(scan_storage).ExclusiveSum(in_nesting_bounds, thread_value_count, block_value_count); - __syncthreads(); + + //WARP VALUE COUNT: + scan_results value_count_scan_results; + scan_block(in_nesting_bounds, warp_lane, warp_index, lane_mask, value_count_scan_results); + + int thread_value_count_within_warp = value_count_scan_results.thread_count_within_warp; + int warp_value_count = value_count_scan_results.warp_count; + int thread_value_count = value_count_scan_results.thread_count_within_block; + int block_value_count = value_count_scan_results.block_count; if constexpr (enable_print_large_list) { if(in_nesting_bounds != (t % 4 == 0)) { @@ -741,30 +794,22 @@ if constexpr (enable_print_large_list) { // the validity bit for thread t might actually represent output value t-6. the correct // position for thread t's bit is thread_value_count. - static_assert(decode_block_size <= 8*sizeof(__uint128_t), - "This code relies on bits for block threads fitting within a uint128!"); - - auto shifted_validity = static_cast<__uint128_t>(is_valid) << thread_value_count; - auto or_reducer = [](const __uint128_t& lhs, const __uint128_t& rhs){ - return lhs | rhs; - }; - using block_reduce = cub::BlockReduce<__uint128_t, decode_block_size>; - __shared__ typename block_reduce::TempStorage reduce_storage; - __uint128_t block_valid_mask = block_reduce(reduce_storage).Reduce(shifted_validity, or_reducer); +//WARP VALID COUNT: + // for nested schemas, it's more complicated. This warp will visit 32 incoming values, + // however not all of them will necessarily represent a value at this nesting level. so + // the validity bit for thread t might actually represent output value t-6. the correct + // position for thread t's bit is thread_value_count. for cuda 11 we could use + // __reduce_or_sync(), but until then we have to do a warp reduce. 
+ uint32_t const warp_valid_mask = WarpReduceOr32((uint32_t)is_valid << thread_value_count_within_warp); + auto thread_mask = (uint32_t(1) << thread_value_count_within_warp) - 1; - //Reduction result is only visible to thread zero, must share with other threads: - __shared__ __uint128_t block_valid_mask_storage; - if(t == 0) { block_valid_mask_storage = block_valid_mask; } - __syncthreads(); - block_valid_mask = block_valid_mask_storage; + scan_results valid_count_scan_results; + scan_block(warp_valid_mask, warp_lane, warp_index, thread_mask, valid_count_scan_results); - auto count_set_bits = [](__uint128_t bits){ - return __popcll((uint64_t)bits) + __popcll((uint64_t)(bits >> 64)); - }; - auto thread_mask = (__uint128_t(1) << thread_value_count) - 1; - int const thread_valid_count = count_set_bits(block_valid_mask & thread_mask); -//int const block_valid_count = count_set_bits(block_valid_mask); + int warp_valid_count = valid_count_scan_results.warp_count; + int thread_valid_count = valid_count_scan_results.thread_count_within_block; + int block_valid_count = valid_count_scan_results.block_count; if constexpr (enable_print_large_list) { if(((d_idx == 0) && (is_valid != (t % 4 == 0))) || ((d_idx == 1) && !is_valid)) { @@ -775,20 +820,20 @@ if constexpr (enable_print_large_list) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, thread_valid_count %d\n", blockIdx.x, value_count, target_value_count, t, d_idx, thread_valid_count); } -/* if(((d_idx == 0) && (block_valid_count != 32)) || ((d_idx == 1) && (block_valid_count != 128))) { + if(((d_idx == 0) && (block_valid_count != 32)) || ((d_idx == 1) && (block_valid_count != 128))) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, block_valid_count %d\n", blockIdx.x, value_count, target_value_count, t, d_idx, block_valid_count); - }*/ + } } if constexpr (enable_print) { - if((block_valid_mask == 0) && (t == 0) && (d_idx == max_depth)) { + if((block_valid_count == 0) && (t == 0) && (d_idx == max_depth)) { printf("EMPTY VALID MASK: def_level %d, max_def_level %d, in_nesting_bounds %d, start_depth %d, " "end_depth %d, in_row_bounds %d, row_index %d, row_index_lower_bound %d, last_row %d, input_row_count %d\n", def_level, ni.max_def_level, in_nesting_bounds, start_depth, end_depth, in_row_bounds, row_index, row_index_lower_bound, last_row, input_row_count); } - if (t == 0) { printf("block_valid_mask %u\n", int(block_valid_mask)); } + if (t == 0) { printf("block_valid_count %u\n", int(block_valid_count)); } if (t < 32) { printf("t %d, thread_valid_count %d\n", t, thread_valid_count); } } @@ -796,15 +841,23 @@ if constexpr (enable_print_large_list) { // do this for nested schemas so that we can emit an offset for the -current- nesting // level. more concretely : the offset for the current nesting level == current length of the // next nesting level - int32_t next_thread_value_count = 0, next_block_value_count = 0; + int next_thread_value_count_within_warp = 0, next_warp_value_count = 0; + int next_thread_value_count = 0, next_block_value_count = 0; int next_in_nesting_bounds = 0; if (d_idx < max_depth) { //mask is different between depths next_in_nesting_bounds = (d_idx + 1 >= start_depth && d_idx + 1 <= end_depth && in_row_bounds) ? 
1 : 0; - block_scan(scan_storage).ExclusiveSum(next_in_nesting_bounds, next_thread_value_count, next_block_value_count); - __syncthreads(); +//NEXT WARP VALUE COUNT: + scan_results next_value_count_scan_results; + scan_block(next_in_nesting_bounds, warp_lane, warp_index, lane_mask, next_value_count_scan_results); + + next_thread_value_count_within_warp = next_value_count_scan_results.thread_count_within_warp; + next_warp_value_count = next_value_count_scan_results.warp_count; + next_thread_value_count = next_value_count_scan_results.thread_count_within_block; + next_block_value_count = next_value_count_scan_results.block_count; + if constexpr (enable_print_large_list) { if(next_in_nesting_bounds != 1) { @@ -873,49 +926,38 @@ if constexpr (enable_print_large_list) { // (that is, read and write positions are already pre-bounded by first_row/num_rows). // since we are about to write the validity vector // here we need to adjust our computed mask to take into account the write row bounds. - int warp_null_count = 0; if constexpr (nullable) { - if (ni.valid_map != nullptr) { //TODO: Consider OR'ING for next_thread_value_count and popc() for next_thread_value_count //so that we don't have to take a ballot here. Is uint128 so may deconstruct to this anyway ... - uint32_t const warp_count_mask = ballot(in_nesting_bounds); - if ((t % cudf::detail::warp_size) == 0) { - // last bit in the warp to store //in old is warp_valid_mask_bit_count + + int warp_null_count = 0; + if(is_first_lane && (ni.valid_map != nullptr) && (warp_value_count > 0)) { + // last bit in the warp to store //in old is warp_valid_mask_bit_count //so it's a count of everything in nesting bounds, though bits can be zero if NULL at this level - int const bit_count = __popc(warp_count_mask); - if(bit_count > 0) { - - // absolute bit offset into the output validity map - //is cumulative sum of bit_count at the given nesting depth - // DON'T subtract by first_row: since it's lists it's not 1-row-per-value - int const bit_offset = ni.valid_map_offset + thread_value_count; - auto const shifted_valid_mask = static_cast(block_valid_mask >> thread_value_count); - auto const bit_range_mask = (1 << bit_count) - 1; //mainly needed for warp_null_count - auto const warp_validity_mask = shifted_valid_mask & bit_range_mask; - - store_validity(bit_offset, ni.valid_map, warp_validity_mask, bit_count); - warp_null_count = bit_count - __popc(warp_validity_mask); - - if constexpr (enable_print) { - printf("STORE VALIDITY: t %d, depth %d, thread_value_count %d, valid_map_offset %d, bit_offset %d, bit_count %d, warp_validity_mask %u\n", - t, d_idx, thread_value_count, ni.valid_map_offset, bit_offset, bit_count, warp_validity_mask); - printf("NUM NULLS: t %d, depth %d, warp_null_count %d\n", t, d_idx, warp_null_count); - } + + // absolute bit offset into the output validity map + //is cumulative sum of warp_value_count at the given nesting depth + // DON'T subtract by first_row: since it's lists it's not 1-row-per-value + int const bit_offset = ni.valid_map_offset + thread_value_count; + + store_validity(bit_offset, ni.valid_map, warp_valid_mask, warp_value_count); + warp_null_count = warp_value_count - warp_valid_count; + + if constexpr (enable_print) { + printf("STORE VALIDITY: t %d, depth %d, thread_value_count %d, valid_map_offset %d, bit_offset %d, warp_value_count %d, warp_valid_mask %u\n", + t, d_idx, thread_value_count, ni.valid_map_offset, bit_offset, warp_value_count, warp_valid_mask); + printf("NUM NULLS: t %d, depth %d, warp_null_count %d\n", 
t, d_idx, warp_null_count); } - } } - // sum null counts. we have to do it this way instead of just incrementing by (value_count - - // valid_count) because valid_count also includes rows that potentially start before our row - // bounds. if we could come up with a way to clean that up, we could remove this and just - // compute it directly at the end of the kernel. - size_type const block_null_count = - cudf::detail::single_lane_block_sum_reduce(warp_null_count); - if constexpr (enable_print) { - if (t == 0) { printf("BLOCK NULLS: depth %d, prior %d, block_null_count %u\n", - d_idx, ni.null_count, block_null_count); } + if (t == 0) { + size_type const block_null_count = block_value_count - block_valid_count; + if constexpr (enable_print) { + if (t == 0) { printf("BLOCK NULLS: depth %d, prior %d, block_null_count %u\n", + d_idx, ni.null_count, block_null_count); } + } + ni.null_count += block_null_count; } - if (t == 0) { ni.null_count += block_null_count; } } // if this is valid and we're at the leaf, output dst_pos @@ -952,7 +994,6 @@ if constexpr (enable_print_large_list) { // update stuff if (t == 0) { - int const block_valid_count = count_set_bits(block_valid_mask); ni.valid_count += block_valid_count; ni.value_count += block_value_count; ni.valid_map_offset += block_value_count; @@ -963,6 +1004,8 @@ if constexpr (enable_print_large_list) { block_value_count = next_block_value_count; thread_value_count = next_thread_value_count; in_nesting_bounds = next_in_nesting_bounds; + warp_value_count = next_warp_value_count; + thread_value_count_within_warp = next_thread_value_count_within_warp; } //END OF DEPTH LOOP if constexpr (enable_print) { From 539066114f909f13b329468f570bdaad1bf3470e Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Wed, 18 Sep 2024 11:47:43 -0400 Subject: [PATCH 10/36] Optimize parquet reader block scans, simplify and consolidate non-nullable column code --- cpp/src/io/parquet/decode_fixed.cu | 369 +++++++++++++++++------------ 1 file changed, 212 insertions(+), 157 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 8a866141c4b..73eb9e87c61 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -24,6 +24,39 @@ namespace cudf::io::parquet::detail { namespace { +struct block_scan_results { + uint32_t warp_bits; + int thread_count_within_warp; + int warp_count; + + int thread_count_within_block; + int block_count; +}; + +template +static __device__ void scan_block_exclusive_sum(int t, int thread_bit, block_scan_results& results) +{ + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + int const warp_index = t / cudf::detail::warp_size; + int const warp_lane = t % cudf::detail::warp_size; + uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1; + + results.warp_bits = ballot(thread_bit); + results.warp_count = __popc(results.warp_bits); + results.thread_count_within_warp = __popc(results.warp_bits & lane_mask); + + __shared__ int warp_counts[num_warps]; + if (warp_lane == 0) { warp_counts[warp_index] = results.warp_count; } + __syncthreads(); + + results.block_count = 0; + results.thread_count_within_block = results.thread_count_within_warp; + for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { + results.block_count += warp_counts[warp_idx]; + if (warp_idx < warp_index) { results.thread_count_within_block += warp_counts[warp_idx]; } + } +} + template __device__ inline void gpuDecodeFixedWidthValues( page_state_s* s, state_buf* const sb, int start, int 
end, int t) @@ -194,7 +227,7 @@ struct decode_fixed_width_split_values_func { } }; -template +template static __device__ int gpuUpdateValidityAndRowIndicesNested( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) { @@ -212,28 +245,27 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const row_index_lower_bound = s->row_index_lower_bound; int const max_depth = s->col.max_nesting_depth - 1; - __syncthreads(); + auto& max_depth_ni = s->nesting_info[max_depth]; + int valid_count = max_depth_ni.valid_count; while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - // definition level. only need to process for nullable columns - int d = 0; - if constexpr (nullable) { - if (def) { - d = t < batch_size - ? static_cast(def[rolling_index(value_count + t)]) - : -1; - } else { - d = t < batch_size ? 1 : -1; - } + // definition level + int d; + if (t >= batch_size) { + d = -1; + } else if (def) { + d = static_cast(def[rolling_index(value_count + t)]); + } else { + d = 1; } - int const thread_value_count = t + 1; + int const thread_value_count = t; int const block_value_count = batch_size; // compute our row index, whether we're in row bounds, and validity - int const row_index = (thread_value_count + value_count) - 1; + int const row_index = thread_value_count + value_count; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store @@ -242,90 +274,74 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( for (int d_idx = 0; d_idx <= max_depth; d_idx++) { auto& ni = s->nesting_info[d_idx]; - int is_valid; - if constexpr (nullable) { - is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 1 : 0; - } else { - is_valid = in_row_bounds; - } + int is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 1 : 0; // thread and block validity count - int thread_valid_count, block_valid_count; - if constexpr (nullable) { - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; - block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); - __syncthreads(); - - // validity is processed per-warp - // - // nested schemas always read and write to the same bounds (that is, read and write - // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading - // at the first value, even if that is before first_row, because we cannot trivially jump to - // the correct position to start reading. since we are about to write the validity vector - // here we need to adjust our computed mask to take into account the write row bounds. 
- int warp_null_count = 0; - if (write_start >= 0 && ni.valid_map != nullptr) { - int const valid_map_offset = ni.valid_map_offset; - uint32_t const warp_validity_mask = ballot(is_valid); - // lane 0 from each warp writes out validity - if ((t % cudf::detail::warp_size) == 0) { - int const vindex = - (value_count + thread_value_count) - 1; // absolute input value index - int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map - int const write_end = cudf::detail::warp_size - - __clz(in_write_row_bounds); // last bit in the warp to store - int const bit_count = write_end - write_start; - warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); - - store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); - } - } + block_scan_results valid_count_results; + scan_block_exclusive_sum(t, is_valid, valid_count_results); + uint32_t const warp_validity_mask = valid_count_results.warp_bits; + int thread_valid_count = valid_count_results.thread_count_within_block; + int block_valid_count = valid_count_results.block_count; - // sum null counts. we have to do it this way instead of just incrementing by (value_count - - // valid_count) because valid_count also includes rows that potentially start before our row - // bounds. if we could come up with a way to clean that up, we could remove this and just - // compute it directly at the end of the kernel. - size_type const block_null_count = - cudf::detail::single_lane_block_sum_reduce(warp_null_count); - if (t == 0) { ni.null_count += block_null_count; } - } - // trivial for non-nullable columns - else { - thread_valid_count = thread_value_count; - block_valid_count = block_value_count; + // validity is processed per-warp + // + // nested schemas always read and write to the same bounds (that is, read and write + // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading + // at the first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector + // here we need to adjust our computed mask to take into account the write row bounds. + int warp_null_count = 0; + // lane 0 from each warp writes out validity + if ((write_start >= 0) && (ni.valid_map != nullptr) && ((t % cudf::detail::warp_size) == 0)) { + int const valid_map_offset = ni.valid_map_offset; + int const vindex = value_count + thread_value_count; // absolute input value index + int const bit_offset = (valid_map_offset + vindex + write_start) - + first_row; // absolute bit offset into the output validity map + int const write_end = + cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store + int const bit_count = write_end - write_start; + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); + + store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); } + // sum null counts. we have to do it this way instead of just incrementing by (value_count - + // valid_count) because valid_count also includes rows that potentially start before our row + // bounds. if we could come up with a way to clean that up, we could remove this and just + // compute it directly at the end of the kernel. 
+ size_type const block_null_count = + cudf::detail::single_lane_block_sum_reduce(warp_null_count); + if (t == 0) { ni.null_count += block_null_count; } + // if this is valid and we're at the leaf, output dst_pos - __syncthreads(); // handle modification of ni.value_count from below - if (is_valid && d_idx == max_depth) { - // for non-list types, the value count is always the same across - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (ni.valid_count + thread_valid_count) - 1; - sb->nz_idx[rolling_index(src_pos)] = dst_pos; + if (d_idx == max_depth) { + if (is_valid) { + // for non-list types, the value count is always the same across + int const dst_pos = value_count + thread_value_count; + int const src_pos = valid_count + thread_valid_count; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + // update stuff + valid_count += block_valid_count; } - __syncthreads(); // handle modification of ni.value_count from below - // update stuff - if (t == 0) { ni.valid_count += block_valid_count; } - } + } // end depth loop value_count += block_value_count; - } + } // end loop if (t == 0) { // update valid value count for decoding and total # of values we've processed - s->nz_count = s->nesting_info[max_depth].valid_count; - s->input_value_count = value_count; - s->input_row_count = value_count; + max_depth_ni.valid_count = valid_count; + s->nz_count = valid_count; + s->input_value_count = value_count; + s->input_row_count = value_count; } - __syncthreads(); - return s->nesting_info[max_depth].valid_count; + return valid_count; } -template +template static __device__ int gpuUpdateValidityAndRowIndicesFlat( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) { @@ -346,88 +362,70 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( int const valid_map_offset = ni.valid_map_offset; int const row_index_lower_bound = s->row_index_lower_bound; - __syncthreads(); - while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - // definition level. only need to process for nullable columns - int d = 0; - if constexpr (nullable) { - if (def) { - d = t < batch_size - ? static_cast(def[rolling_index(value_count + t)]) - : -1; - } else { - d = t < batch_size ? 1 : -1; - } - } - - int const thread_value_count = t + 1; + int const thread_value_count = t; int const block_value_count = batch_size; // compute our row index, whether we're in row bounds, and validity - int const row_index = (thread_value_count + value_count) - 1; + int const row_index = thread_value_count + value_count; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + + // use definition level & row bounds to determine if is valid int is_valid; - if constexpr (nullable) { - is_valid = ((d > 0) && in_row_bounds) ? 1 : 0; + if (t >= batch_size) { + is_valid = 0; + } else if (def) { + int const def_level = + static_cast(def[rolling_index(value_count + t)]); + is_valid = ((def_level > 0) && in_row_bounds) ? 
1 : 0; } else { is_valid = in_row_bounds; } // thread and block validity count - int thread_valid_count, block_valid_count; - if constexpr (nullable) { - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; - block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); - __syncthreads(); - - // validity is processed per-warp - // - // nested schemas always read and write to the same bounds (that is, read and write - // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading - // at the first value, even if that is before first_row, because we cannot trivially jump to - // the correct position to start reading. since we are about to write the validity vector - // here we need to adjust our computed mask to take into account the write row bounds. - int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); - int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store - int warp_null_count = 0; - if (write_start >= 0) { - uint32_t const warp_validity_mask = ballot(is_valid); - // lane 0 from each warp writes out validity - if ((t % cudf::detail::warp_size) == 0) { - int const vindex = (value_count + thread_value_count) - 1; // absolute input value index - int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map - int const write_end = - cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store - int const bit_count = write_end - write_start; - warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); - - store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); - } - } - - // sum null counts. we have to do it this way instead of just incrementing by (value_count - - // valid_count) because valid_count also includes rows that potentially start before our row - // bounds. if we could come up with a way to clean that up, we could remove this and just - // compute it directly at the end of the kernel. - size_type const block_null_count = - cudf::detail::single_lane_block_sum_reduce(warp_null_count); - if (t == 0) { ni.null_count += block_null_count; } - } - // trivial for non-nullable columns - else { - thread_valid_count = thread_value_count; - block_valid_count = block_value_count; + block_scan_results valid_count_results; + scan_block_exclusive_sum(t, is_valid, valid_count_results); + uint32_t const warp_validity_mask = valid_count_results.warp_bits; + int thread_valid_count = valid_count_results.thread_count_within_block; + int block_valid_count = valid_count_results.block_count; + + // validity is processed per-warp + // + // nested schemas always read and write to the same bounds (that is, read and write + // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading + // at the first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector + // here we need to adjust our computed mask to take into account the write row bounds. 
+ int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); + int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store + int warp_null_count = 0; + // lane 0 from each warp writes out validity + if ((write_start >= 0) && ((t % cudf::detail::warp_size) == 0)) { + int const vindex = value_count + thread_value_count; // absolute input value index + int const bit_offset = (valid_map_offset + vindex + write_start) - + first_row; // absolute bit offset into the output validity map + int const write_end = + cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store + int const bit_count = write_end - write_start; + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); + + store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); } + // sum null counts. we have to do it this way instead of just incrementing by (value_count - + // valid_count) because valid_count also includes rows that potentially start before our row + // bounds. if we could come up with a way to clean that up, we could remove this and just + // compute it directly at the end of the kernel. + size_type const block_null_count = + cudf::detail::single_lane_block_sum_reduce(warp_null_count); + if (t == 0) { ni.null_count += block_null_count; } + // output offset if (is_valid) { - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (valid_count + thread_valid_count) - 1; + int const dst_pos = value_count + thread_value_count; + int const src_pos = valid_count + thread_valid_count; sb->nz_idx[rolling_index(src_pos)] = dst_pos; } @@ -448,6 +446,70 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( return valid_count; } +template +static __device__ int gpuUpdateValidityAndRowIndicesNonNullable(int32_t target_value_count, + page_state_s* s, + state_buf* sb, + int t) +{ + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + constexpr int max_batch_size = num_warps * cudf::detail::warp_size; + + // cap by last row so that we don't process any rows past what we want to output. 
+ int const first_row = s->first_row; + int const last_row = first_row + s->num_rows; + int const capped_target_value_count = min(target_value_count, last_row); + int const row_index_lower_bound = s->row_index_lower_bound; + + // how many (input) values we've processed in the page so far + int value_count = s->input_value_count; + + int const max_depth = s->col.max_nesting_depth - 1; + auto& ni = s->nesting_info[max_depth]; + int valid_count = ni.valid_count; + + __syncthreads(); + + while (value_count < capped_target_value_count) { + int const batch_size = min(max_batch_size, capped_target_value_count - value_count); + + int const thread_value_count = t; + int const block_value_count = batch_size; + + // compute our row index, whether we're in row bounds, and validity + int const row_index = thread_value_count + value_count; + int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + + int is_valid = in_row_bounds; + + int thread_valid_count = thread_value_count; + int block_valid_count = block_value_count; + + // if this is valid and we're at the leaf, output dst_pos + if (is_valid) { + // for non-list types, the value count is always the same across + int const dst_pos = value_count + thread_value_count; + int const src_pos = valid_count + thread_valid_count; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + + // update stuff + value_count += block_value_count; + valid_count += block_valid_count; + } // end loop + + if (t == 0) { + // update valid value count for decoding and total # of values we've processed + ni.valid_count = valid_count; + ni.value_count = value_count; + s->nz_count = valid_count; + s->input_value_count = value_count; + s->input_row_count = value_count; + } + + return valid_count; +} + // is the page marked nullable or not __device__ inline bool is_nullable(page_state_s* s) { @@ -614,10 +676,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) __syncthreads(); if constexpr (has_nesting_t) { - next_valid_count = gpuUpdateValidityAndRowIndicesNested( + next_valid_count = gpuUpdateValidityAndRowIndicesNested( processed_count, s, sb, def, t); } else { - next_valid_count = gpuUpdateValidityAndRowIndicesFlat( + next_valid_count = gpuUpdateValidityAndRowIndicesFlat( processed_count, s, sb, def, t); } } @@ -626,15 +688,8 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // nz_idx. gpuDecodeFixedWidthValues would be the only work that happens. 
else { processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); - - if constexpr (has_nesting_t) { - next_valid_count = - gpuUpdateValidityAndRowIndicesNested( - processed_count, s, sb, nullptr, t); - } else { - next_valid_count = gpuUpdateValidityAndRowIndicesFlat( - processed_count, s, sb, nullptr, t); - } + next_valid_count = + gpuUpdateValidityAndRowIndicesNonNullable(processed_count, s, sb, t); } __syncthreads(); From 3ef7b0d8c6109d618c2f114d1970342a0442d9a9 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Wed, 18 Sep 2024 12:07:40 -0400 Subject: [PATCH 11/36] tweak syncing --- cpp/src/io/parquet/decode_fixed.cu | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 73eb9e87c61..0638b3e5d5a 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -248,6 +248,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( auto& max_depth_ni = s->nesting_info[max_depth]; int valid_count = max_depth_ni.valid_count; + __syncthreads(); + while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); @@ -362,6 +364,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( int const valid_map_offset = ni.valid_map_offset; int const row_index_lower_bound = s->row_index_lower_bound; + __syncthreads(); + while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); @@ -480,16 +484,16 @@ static __device__ int gpuUpdateValidityAndRowIndicesNonNullable(int32_t target_v int const row_index = thread_value_count + value_count; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); - int is_valid = in_row_bounds; - + int is_valid = in_row_bounds; int thread_valid_count = thread_value_count; int block_valid_count = block_value_count; // if this is valid and we're at the leaf, output dst_pos if (is_valid) { // for non-list types, the value count is always the same across - int const dst_pos = value_count + thread_value_count; - int const src_pos = valid_count + thread_valid_count; + int const dst_pos = value_count + thread_value_count; + int const src_pos = valid_count + thread_valid_count; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; } From 788287936f4f21abc34a6bf9fc04f310e6b2824c Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Wed, 18 Sep 2024 18:06:33 -0400 Subject: [PATCH 12/36] small tweaks --- cpp/src/io/parquet/decode_fixed.cu | 77 +++++++++++++----------------- 1 file changed, 32 insertions(+), 45 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index b47b96b91a2..ac7a628bc19 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -562,13 +562,13 @@ struct scan_results }; template -static __device__ void scan_block(uint32_t warp_bits, int warp_lane, int warp_index, uint32_t thread_mask, scan_results& results) +static __device__ void scan_block(uint32_t warp_bits, int warp_lane, int warp_index, uint32_t lane_mask, scan_results& results) { constexpr int num_warps = decode_block_size / cudf::detail::warp_size; results.warp_bits = warp_bits; results.warp_count = __popc(results.warp_bits); - results.thread_count_within_warp = __popc(results.warp_bits & thread_mask); + results.thread_count_within_warp = __popc(results.warp_bits & lane_mask); __shared__ uint32_t 
warp_counts[num_warps]; if(warp_lane == 0) { @@ -587,10 +587,10 @@ static __device__ void scan_block(uint32_t warp_bits, int warp_lane, int warp_in } template -static __device__ void scan_block(int thread_bit, int warp_lane, int warp_index, uint32_t thread_mask, scan_results& results) +static __device__ void scan_block(int thread_bit, int warp_lane, int warp_index, uint32_t lane_mask, scan_results& results) { uint32_t warp_bits = ballot(thread_bit); - scan_block(warp_bits, warp_lane, warp_index, thread_mask, results); + scan_block(warp_bits, warp_lane, warp_index, lane_mask, results); } template @@ -625,6 +625,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int const row_index_lower_bound = s->row_index_lower_bound; int const max_depth = s->col.max_nesting_depth - 1; + int max_depth_valid_count = s->nesting_info[max_depth].valid_count; __syncthreads(); @@ -963,38 +964,39 @@ if constexpr (enable_print_large_list) { // if this is valid and we're at the leaf, output dst_pos // Read these before the sync, so that when thread 0 modifies them we've already read their values int current_value_count = ni.value_count; - int current_valid_count = ni.valid_count; - __syncthreads(); // handle modification of ni.valid_count from below - if (is_valid && d_idx == max_depth) { - // for non-list types, the value count is always the same across - int const dst_pos = current_value_count + thread_value_count; - int const src_pos = current_valid_count + thread_valid_count; - int const output_index = rolling_index(src_pos); + __syncthreads(); // handle modification of ni.value_count from below + if (d_idx == max_depth) { + if (is_valid) { + // for non-list types, the value count is always the same across + int const dst_pos = current_value_count + thread_value_count; + int const src_pos = max_depth_valid_count + thread_valid_count; + int const output_index = rolling_index(src_pos); - if constexpr (enable_print || enable_print_range_error) { - if((output_index < 0) || (output_index >= state_buf::nz_buf_size)) { - printf("WHOA: output index STORE %d out of bounds!\n", output_index); + if constexpr (enable_print || enable_print_range_error) { + if((output_index < 0) || (output_index >= state_buf::nz_buf_size)) { + printf("WHOA: output index STORE %d out of bounds!\n", output_index); + } + if(dst_pos < 0) { printf("WHOA: dst_pos STORE %d out of bounds!\n", dst_pos); } } - if(dst_pos < 0) { printf("WHOA: dst_pos STORE %d out of bounds!\n", dst_pos); } - } - if constexpr (enable_print) { - if (t == 0) { printf("ni.value_count %d, ni.valid_count %d\n", int(ni.value_count), int(ni.valid_count)); } - if (t < 32) { printf("t %d, src_pos %d, output_index %d\n", t, src_pos, output_index); } + if constexpr (enable_print) { + if (t == 0) { printf("ni.value_count %d, max_depth_valid_count %d\n", int(ni.value_count), max_depth_valid_count); } + if (t < 32) { printf("t %d, src_pos %d, output_index %d\n", t, src_pos, output_index); } - if((t == 0) && (src_pos == 0)) {printf("SPECIAL: output_index %d, dst_pos %d, ni.value_count %d, ni.valid_count %d, thread_value_count %d, thread_valid_count %d\n", - output_index, dst_pos, ni.value_count, ni.valid_count, thread_value_count, thread_valid_count);} + if((t == 0) && (src_pos == 0)) {printf("SPECIAL: output_index %d, dst_pos %d, ni.value_count %d, max_depth_valid_count %d, thread_value_count %d, thread_valid_count %d\n", + output_index, dst_pos, ni.value_count, max_depth_valid_count, thread_value_count, thread_valid_count);} - if (t == 0) { printf("OUTPUT_INDICES: 
output_index %d, dst_pos %d\n", output_index, dst_pos); } - } + if (t == 0) { printf("OUTPUT_INDICES: output_index %d, dst_pos %d\n", output_index, dst_pos); } + } - //Index from rolling buffer of values (which doesn't include nulls) to final array (which includes gaps for nulls) - sb->nz_idx[output_index] = dst_pos; + //Index from rolling buffer of values (which doesn't include nulls) to final array (which includes gaps for nulls) + sb->nz_idx[output_index] = dst_pos; + } + max_depth_valid_count += block_valid_count; } // update stuff if (t == 0) { - ni.valid_count += block_valid_count; ni.value_count += block_value_count; ni.valid_map_offset += block_value_count; } @@ -1022,7 +1024,8 @@ if constexpr (enable_print_large_list) { if (t == 0) { // update valid value count for decoding and total # of values we've processed - s->nz_count = s->nesting_info[max_depth].valid_count; + s->nesting_info[max_depth].valid_count = max_depth_valid_count; + s->nz_count = max_depth_valid_count; s->input_value_count = value_count; // If we have lists # rows != # values @@ -1036,8 +1039,7 @@ if constexpr (enable_print_large_list) { } } - __syncthreads(); - return s->nesting_info[max_depth].valid_count; + return max_depth_valid_count; } // is the page marked nullable or not @@ -1094,7 +1096,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) device_span chunks, size_t min_row, size_t num_rows, - kernel_error::pointer error_code /*, int page_idx = -1, int num_pages = -1*/) + kernel_error::pointer error_code) { constexpr int rolling_buf_size = decode_block_size_t * 2; constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(); @@ -1108,11 +1110,6 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) page_state_s* const s = &state_g; auto* const sb = &state_buffers; int const page_idx = blockIdx.x; -/* page_idx = (page_idx == -1) ? 
blockIdx.x : page_idx + blockIdx.x; - if((page_idx >= num_pages) && (num_pages != -1)) { - printf("BAIL ON PAGE %d of %d\n", page_idx, num_pages); - return; - }*/ int const t = threadIdx.x; PageInfo* pp = &pages[page_idx]; @@ -1357,13 +1354,6 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, dim3 dim_block(decode_block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page -/* - auto num_pages = pages.size(); - auto grid_dim = 1; //2, 10, 40, 100 no problem; all = problem - dim3 dim_grid(grid_dim, 1); // 1 threadblock per page - -for(decltype(num_pages) idx = 0; idx < num_pages; idx += grid_dim) { -*/ if (level_type_size == 1) { if (is_list) { gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); -// pages.device_ptr(), chunks, min_row, num_rows, error_code, idx, num_pages); } else if (has_nesting) { gpuDecodePageDataGeneric <<>>( pages.device_ptr(), chunks, min_row, num_rows, error_code); -// pages.device_ptr(), chunks, min_row, num_rows, error_code, idx, num_pages); } else if (has_nesting) { gpuDecodePageDataGeneric pages, From e285fbfd44bb3f8ef14e81a35fdc0301bd2a0a1a Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Mon, 23 Sep 2024 16:42:05 -0400 Subject: [PATCH 13/36] Add skipping to rle_stream, use for lists (chunked reads) --- cpp/src/io/parquet/decode_fixed.cu | 112 +++++++++++++++++------------ cpp/src/io/parquet/rle_stream.cuh | 69 ++++++++++++++++++ 2 files changed, 134 insertions(+), 47 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index ac7a628bc19..fce8f53700d 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -41,7 +41,7 @@ __device__ inline void gpuDecodeFixedWidthValues( static constexpr bool enable_print = false; static constexpr bool enable_print_range_error = false; - static constexpr bool enable_print_large_list = false; +// static constexpr bool enable_print_large_list = true; if constexpr (enable_print) { if(t == 0) { printf("DECODE VALUES: start %d, end %d, first_row %d, leaf_level_index %d, dtype_len %u, " @@ -133,7 +133,7 @@ __device__ inline void gpuDecodeFixedWidthValues( } else { gpuOutputGeneric(s, sb, src_pos, static_cast(dst), dtype_len); } - +/* if constexpr (enable_print_large_list) { if (dtype == INT32) { int value_stored = *static_cast(dst); @@ -143,6 +143,7 @@ __device__ inline void gpuDecodeFixedWidthValues( } } } + */ } pos += batch_size; @@ -628,15 +629,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int max_depth_valid_count = s->nesting_info[max_depth].valid_count; __syncthreads(); - -if constexpr (enable_print_large_list) { - auto first_ni_value_count = s->nesting_info[0].value_count; - if((value_count != (4*input_row_count)) || (input_row_count != first_ni_value_count)){ - printf("ALGO GARBAGE GET: blockIdx.x %d, value_count %d, target_value_count %d, t %d, value_count %d, input_row_count %d, first_ni_value_count %d\n", - blockIdx.x, value_count, target_value_count, t, value_count, input_row_count, first_ni_value_count); - } -} - + using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; @@ -700,15 +693,15 @@ if constexpr (enable_print_large_list) { __syncthreads(); if constexpr (enable_print_large_list) { - if(bool(is_new_row) != (t % 4 == 0)) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, is_new_row %d\n", - blockIdx.x, value_count, target_value_count, t, is_new_row); + if(within_batch && 
(bool(is_new_row) != (t % 4 == 0))) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, is_new_row %d, start_depth %d, rep_level %d\n", + blockIdx.x, value_count, target_value_count, t, is_new_row, start_depth, rep_level); } - if(num_prior_new_rows != ((t + 3) / 4)) { + if(within_batch && (num_prior_new_rows != ((t + 3) / 4))) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, num_prior_new_rows %d\n", blockIdx.x, value_count, target_value_count, t, num_prior_new_rows); } - if(total_num_new_rows != 32) { + if((value_count + 128 <= target_value_count) && (total_num_new_rows != 32)) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, total_num_new_rows %d\n", blockIdx.x, value_count, target_value_count, t, total_num_new_rows); } @@ -747,15 +740,17 @@ if constexpr (enable_print_large_list) { int block_value_count = value_count_scan_results.block_count; if constexpr (enable_print_large_list) { - if(in_nesting_bounds != (t % 4 == 0)) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, in_nesting_bounds %d, start_depth %d, end_depth %d, in_row_bounds %d, row_index %d, input_row_count %d\n", - blockIdx.x, value_count, target_value_count, t, in_nesting_bounds, start_depth, end_depth, in_row_bounds, row_index, input_row_count); + if(within_batch && in_row_bounds && (in_nesting_bounds != (t % 4 == 0))) { + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, in_nesting_bounds %d, start_depth %d, end_depth %d, " + "in_row_bounds %d, row_index %d, input_row_count %d, row_index_lower_bound %d, last_row %d, first_row %d, s->num_rows %d\n", + blockIdx.x, value_count, target_value_count, t, in_nesting_bounds, start_depth, end_depth, in_row_bounds, row_index, input_row_count, + row_index_lower_bound, last_row, first_row, s->num_rows); } - if(thread_value_count != ((t + 3) / 4)) { + if(within_batch && in_row_bounds && (thread_value_count != ((t + 3) / 4))) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, thread_value_count %d\n", blockIdx.x, value_count, target_value_count, t, thread_value_count); } - if(block_value_count != 32) { + if((value_count + 128 <= target_value_count) && (input_row_count + total_num_new_rows <= last_row) && (block_value_count != 32)) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, block_value_count %d\n", blockIdx.x, value_count, target_value_count, t, block_value_count); } @@ -813,15 +808,15 @@ if constexpr (enable_print_large_list) { int block_valid_count = valid_count_scan_results.block_count; if constexpr (enable_print_large_list) { - if(((d_idx == 0) && (is_valid != (t % 4 == 0))) || ((d_idx == 1) && !is_valid)) { + if(within_batch && in_row_bounds && (((d_idx == 0) && (is_valid != (t % 4 == 0))) || ((d_idx == 1) && !is_valid))) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, is_valid %d, in_nesting_bounds %d\n", blockIdx.x, value_count, target_value_count, t, d_idx, is_valid, in_nesting_bounds); } - if (((d_idx == 0) && (thread_valid_count != ((t + 3)/ 4))) || ((d_idx == 1) && (thread_valid_count != t))) { + if (within_batch && in_row_bounds && (((d_idx == 0) && (thread_valid_count != ((t + 3)/ 4))) || ((d_idx == 1) && (thread_valid_count != t)))) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, thread_valid_count %d\n", blockIdx.x, value_count, target_value_count, 
t, d_idx, thread_valid_count); } - if(((d_idx == 0) && (block_valid_count != 32)) || ((d_idx == 1) && (block_valid_count != 128))) { + if((value_count + 128 <= target_value_count) && (input_row_count + total_num_new_rows <= last_row) && (((d_idx == 0) && (block_valid_count != 32)) || ((d_idx == 1) && (block_valid_count != 128)))) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, block_valid_count %d\n", blockIdx.x, value_count, target_value_count, t, d_idx, block_valid_count); } @@ -859,17 +854,16 @@ if constexpr (enable_print_large_list) { next_thread_value_count = next_value_count_scan_results.thread_count_within_block; next_block_value_count = next_value_count_scan_results.block_count; - if constexpr (enable_print_large_list) { - if(next_in_nesting_bounds != 1) { + if(within_batch && in_row_bounds && (next_in_nesting_bounds != 1)) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, next_in_nesting_bounds %d, start_depth %d, end_depth %d, in_row_bounds %d, row_index %d, input_row_count %d\n", blockIdx.x, value_count, target_value_count, t, next_in_nesting_bounds, start_depth, end_depth, in_row_bounds, row_index, input_row_count); } - if(next_thread_value_count != t) { + if(within_batch && in_row_bounds && (next_thread_value_count != t)) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, next_thread_value_count %d\n", blockIdx.x, value_count, target_value_count, t, next_thread_value_count); } - if(next_block_value_count != 128) { + if((value_count + 128 <= target_value_count) && (input_row_count + total_num_new_rows <= last_row) && (next_block_value_count != 128)) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, next_block_value_count %d\n", blockIdx.x, value_count, target_value_count, t, next_block_value_count); } @@ -893,9 +887,11 @@ if constexpr (enable_print_large_list) { //STORE THE OFFSET FOR THE NEW LIST LOCATION (reinterpret_cast(ni.data_out))[idx] = ofs; +/* if constexpr (enable_print_large_list) { int overall_index = 4*(blockIdx.x * 20000 + idx); if(overall_index != ofs) { + printf("WHOA BAD OFFSET\n"); printf("WHOA BAD OFFSET: WROTE %d to %d! 
t %d, blockIdx.x %d, idx %d, d_idx %d, start_depth %d, end_depth %d, max_depth %d, " "in_row_bounds %d, in_nesting_bounds %d, next_in_nesting_bounds %d, row_index %d, row_index_lower_bound %d, last_row %d, " "input_row_count %d, num_prior_new_rows %d, is_new_row %d, total_num_new_rows %d, rep_level %d, def_level %d, ni.value_count %d, " @@ -907,7 +903,7 @@ if constexpr (enable_print_large_list) { next_thread_value_count, next_ni.page_start_value, value_count, target_value_count, block_value_count, next_block_value_count); } } - +*/ if constexpr (enable_print || enable_print_range_error) { if((idx < 0) || (idx > 50000)){ printf("WHOA: offset index %d out of bounds!\n", idx); } if(ofs < 0){ printf("WHOA: offset value %d out of bounds!\n", ofs); } @@ -1030,13 +1026,6 @@ if constexpr (enable_print_large_list) { // If we have lists # rows != # values s->input_row_count = input_row_count; -if constexpr (enable_print_large_list) { - auto first_ni_value_count = s->nesting_info[0].value_count; - if((value_count != (4*input_row_count)) || (input_row_count != first_ni_value_count)){ - printf("ALGO GARBAGE SET: blockIdx.x %d, value_count %d, target_value_count %d, t %d, value_count %d, input_row_count %d, first_ni_value_count %d\n", - blockIdx.x, value_count, target_value_count, t, value_count, input_row_count, first_ni_value_count); - } -} } return max_depth_valid_count; @@ -1069,6 +1058,32 @@ __device__ inline bool maybe_has_nulls(page_state_s* s) return run_val != s->col.max_level[lvl]; } +template +__device__ int skip_decode(stream_type& parquet_stream, int num_to_skip, int t) +{ + static constexpr bool enable_print = false; + + //Dictionary + int num_skipped = parquet_stream.skip_decode(t, num_to_skip); + if constexpr (enable_print) { + if (t == 0) { printf("SKIPPED: num_skipped %d, for %d\n", num_skipped, num_to_skip); } + } + //it could be that (e.g.) we skip 5000 but starting at row 4000 we have a run of length 2000: + //in that case skip_decode() only skips 4000, and we have to process the remaining 1000 up front + //modulo 2 * block_size of course, since that's as many as we process at once + while (num_skipped < num_to_skip) { + auto const to_skip = min(2*decode_block_size_t, num_to_skip - num_skipped); + parquet_stream.decode_next(t, to_skip); + num_skipped += to_skip; + if constexpr (enable_print) { + if (t == 0) { printf("EXTRA SKIPPED: to_skip %d, at %d, for %d\n", to_skip, num_skipped, num_to_skip); } + } + __syncthreads(); + } + + return num_skipped; +} + /** * @brief Kernel for computing fixed width non dictionary column data stored in the pages * @@ -1190,18 +1205,8 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) if(t == 0) { printf("INIT DICT: dict_bits %d, data_start %p, data_end %p, dict_idx %p, page.num_input_values %d, s->dict_pos %d \n", s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values, s->dict_pos); } } - if constexpr (has_lists_t){ - int init_decode = 0; - while (init_decode < s->page.skipped_leaf_values) { - auto const to_skip = min(decode_block_size_t, s->page.skipped_leaf_values - init_decode); - dict_stream.decode_next(t, to_skip); - init_decode += to_skip; - __syncthreads(); - } - } } - __syncthreads(); - + if constexpr (enable_print) { if((t == 0) && (page_idx == 0)){ printf("SIZES: shared_rep_size %d, shared_dict_size %d, shared_def_size %d\n", shared_rep_size, shared_dict_size, shared_def_size); @@ -1225,6 +1230,19 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // the core loop. 
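// A minimal host-side model of the skip_decode() helper added above: whole RLE runs are skipped
// outright, and any run that straddles the skip target is finished with ordinary decode_next()
// calls in chunks of at most 2 * decode_block_size (the rolling buffer size). `mock_stream` is a
// stand-in for rle_stream here, not the real API.
#include <algorithm>

struct mock_stream {
  int cur_values = 0;
  // pretend every run holds exactly 1000 values: only whole runs below the target get skipped
  int skip_decode(int /*t*/, int target) { cur_values = (target / 1000) * 1000; return cur_values; }
  int decode_next(int /*t*/, int count) { cur_values += count; return count; }
};

inline int skip_values(mock_stream& s, int num_to_skip, int decode_block_size)
{
  int num_skipped = s.skip_decode(0, num_to_skip);   // fast path: whole runs only
  while (num_skipped < num_to_skip) {                // finish the run the target landed in
    int const to_decode = std::min(2 * decode_block_size, num_to_skip - num_skipped);
    num_skipped += s.decode_next(0, to_decode);
  }
  return num_skipped;
}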
decode batches of level stream data using rle_stream objects // and pass the results to gpuDecodeValues + //For lists (which can have skipped values, skip ahead in the decoding so that we don't repeat work + if constexpr (has_lists_t){ + if(s->page.skipped_leaf_values > 0) { + if (should_process_nulls) { + skip_decode(def_decoder, s->page.skipped_leaf_values, t); + } + processed_count = skip_decode(rep_decoder, s->page.skipped_leaf_values, t); + if constexpr (has_dict_t) { + skip_decode(dict_stream, s->page.skipped_leaf_values, t); + } + } + } + if constexpr (enable_print) { if(t == 0) { printf("page_idx %d, nullable %d, should_process_nulls %d, has_lists_t %d, has_dict_t %d, num_rows %lu, page.num_input_values %d\n", page_idx, int(nullable), int(should_process_nulls), int(has_lists_t), int(has_dict_t), num_rows, s->page.num_input_values); } diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 4a0791d5c54..490cf1d43c3 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -252,6 +252,8 @@ struct rle_stream { run.level_run = level_run; run.remaining = run.size; cur += run_bytes; +//printf("STORE RUN: decode_index %d, fill_index %d, output_pos %d, run.size %d\n", + //decode_index, fill_index, output_pos, run.size); output_pos += run.size; fill_index++; } @@ -353,6 +355,8 @@ struct rle_stream { // this is the last batch we will process this iteration if: // - either this run still has remaining values // - or it is consumed fully and its last index corresponds to output_count +//printf("STATUS: run_index %d, batch_len %d, remaining %d, at_end %d, last_run_pos %d, cur_values %d\n", + //run_index, batch_len, remaining, at_end, last_run_pos, cur_values); if (remaining > 0 || at_end) { values_processed_shared = output_count; } if (remaining == 0 && (at_end || is_last_decode_warp(warp_id))) { decode_index_shared = run_index + 1; @@ -372,6 +376,71 @@ struct rle_stream { return values_processed_shared; } + __device__ inline int skip_runs(int target_count) + { + //we want to process all runs UP TO BUT NOT INCLUDING the run that overlaps with the skip amount + //so thread 0 spins like crazy on fill_run_batch(), skipping writing unnecessary run info + //then when it hits the one that matters, we don't process it at all and bail as if we never started + //basically we're setting up the global vars necessary to start fill_run_batch for the first time + while (cur < end) { + // bytes for the varint header + uint8_t const* _cur = cur; + int const level_run = get_vlq32(_cur, end); + + // run_bytes includes the header size + int run_bytes = _cur - cur; + int run_size; + if (is_literal_run(level_run)) { + // from the parquet spec: literal runs always come in multiples of 8 values. + run_size = (level_run >> 1) * 8; + run_bytes += ((run_size * level_bits) + 7) >> 3; + } else { + // repeated value run + run_size = (level_run >> 1); + run_bytes += ((level_bits) + 7) >> 3; + } + + if((output_pos + run_size) > target_count) { +//printf("SKIPPING: target_count %d, run_size %d, output_pos %d\n", target_count, run_size, output_pos); + return output_pos; //bail! we've reached the starting one + } + + output_pos += run_size; + cur += run_bytes; + } + +//printf("SKIPPING: target_count %d, output_pos %d\n", target_count, output_pos); + return output_pos; //we skipped everything + } + + + __device__ inline int skip_decode(int t, int count) + { + int const output_count = min(count, total_values - cur_values); + + // special case. 
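// For reference, the run-header arithmetic used by skip_runs() above follows the Parquet
// RLE / bit-packed hybrid encoding: the varint header's low bit selects bit-packed ("literal")
// vs. repeated runs, and the remaining bits give the run length (in groups of 8 values for
// literal runs). A small standalone sketch of just that bookkeeping (names are illustrative,
// not the cuDF API):
#include <utility>

// returns {run_size_in_values, payload_bytes_after_the_header}
inline std::pair<int, int> rle_run_extent(unsigned level_run, int level_bits)
{
  bool const is_literal = (level_run & 1) != 0;
  int const run_size = is_literal ? static_cast<int>(level_run >> 1) * 8   // multiples of 8 values
                                  : static_cast<int>(level_run >> 1);
  int const payload  = is_literal ? ((run_size * level_bits) + 7) >> 3     // packed bits, rounded up
                                  : (level_bits + 7) >> 3;                 // one repeated value
  return {run_size, payload};
}
// e.g. with level_bits == 1: header 7 -> literal run of 24 values in 3 payload bytes,
//      header 8 -> repeated run of 4 values in 1 payload byte.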
if level_bits == 0, just return all zeros. this should tremendously speed up + // a very common case: columns with no nulls, especially if they are non-nested + if (level_bits == 0) { + cur_values = output_count; + return output_count; + } + + __shared__ int values_processed_shared; + + __syncthreads(); + + // warp 0 reads ahead and fills `runs` array to be decoded by remaining warps. + if (t == 0) { + values_processed_shared = skip_runs(output_count); + } + __syncthreads(); + + cur_values = values_processed_shared; + + // valid for every thread + return values_processed_shared; + } + __device__ inline int decode_next(int t) { return decode_next(t, max_output_values); } }; From 254f3e9ea9ea1934b82f7996475362b2aa3f8e4c Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Tue, 24 Sep 2024 09:40:29 -0400 Subject: [PATCH 14/36] tweak scan interface for linked lists --- cpp/src/io/parquet/decode_fixed.cu | 46 +++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 0638b3e5d5a..5010e116aa6 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -24,6 +24,8 @@ namespace cudf::io::parquet::detail { namespace { +// Unlike cub's algorithm, this provides warp-wide and block-wide results simultaneously. +// Also, this provides the ability to compute warp_bits & lane_mask manually, which we need for lists. struct block_scan_results { uint32_t warp_bits; int thread_count_within_warp; @@ -34,21 +36,34 @@ struct block_scan_results { }; template -static __device__ void scan_block_exclusive_sum(int t, int thread_bit, block_scan_results& results) +static __device__ void scan_block_exclusive_sum(int thread_bit, block_scan_results& results) { - constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + int const t = threadIdx.x; int const warp_index = t / cudf::detail::warp_size; int const warp_lane = t % cudf::detail::warp_size; uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1; - results.warp_bits = ballot(thread_bit); + uint32_t warp_bits = ballot(thread_bit); + scan_block_exclusive_sum(warp_bits, warp_lane, warp_index, lane_mask, results); +} + +template +static __device__ void scan_block_exclusive_sum(uint32_t warp_bits, int warp_lane, int warp_index, uint32_t lane_mask, block_scan_results& results) +{ + //Compute # warps + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + + //Compute the warp-wide results + results.warp_bits = warp_bits; results.warp_count = __popc(results.warp_bits); results.thread_count_within_warp = __popc(results.warp_bits & lane_mask); + //Share the warp counts amongst the block threads __shared__ int warp_counts[num_warps]; if (warp_lane == 0) { warp_counts[warp_index] = results.warp_count; } __syncthreads(); + //Compute block-wide results results.block_count = 0; results.thread_count_within_block = results.thread_count_within_warp; for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { @@ -244,9 +259,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const row_index_lower_bound = s->row_index_lower_bound; - int const max_depth = s->col.max_nesting_depth - 1; - auto& max_depth_ni = s->nesting_info[max_depth]; - int valid_count = max_depth_ni.valid_count; + int const max_depth = s->col.max_nesting_depth - 1; + auto& max_depth_ni = s->nesting_info[max_depth]; + int max_depth_valid_count = max_depth_ni.valid_count; __syncthreads(); @@ -280,7 +295,7 @@ static __device__ 
int gpuUpdateValidityAndRowIndicesNested( // thread and block validity count block_scan_results valid_count_results; - scan_block_exclusive_sum(t, is_valid, valid_count_results); + scan_block_exclusive_sum(is_valid, valid_count_results); uint32_t const warp_validity_mask = valid_count_results.warp_bits; int thread_valid_count = valid_count_results.thread_count_within_block; int block_valid_count = valid_count_results.block_count; @@ -320,11 +335,11 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( if (is_valid) { // for non-list types, the value count is always the same across int const dst_pos = value_count + thread_value_count; - int const src_pos = valid_count + thread_valid_count; + int const src_pos = max_depth_valid_count + thread_valid_count; sb->nz_idx[rolling_index(src_pos)] = dst_pos; } // update stuff - valid_count += block_valid_count; + max_depth_valid_count += block_valid_count; } } // end depth loop @@ -334,13 +349,13 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( if (t == 0) { // update valid value count for decoding and total # of values we've processed - max_depth_ni.valid_count = valid_count; - s->nz_count = valid_count; + max_depth_ni.valid_count = max_depth_valid_count; + s->nz_count = max_depth_valid_count; s->input_value_count = value_count; s->input_row_count = value_count; } - return valid_count; + return max_depth_valid_count; } template @@ -390,7 +405,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( // thread and block validity count block_scan_results valid_count_results; - scan_block_exclusive_sum(t, is_valid, valid_count_results); + scan_block_exclusive_sum(is_valid, valid_count_results); uint32_t const warp_validity_mask = valid_count_results.warp_bits; int thread_valid_count = valid_count_results.thread_count_within_block; int block_valid_count = valid_count_results.block_count; @@ -671,7 +686,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) int valid_count = 0; // the core loop. decode batches of level stream data using rle_stream objects // and pass the results to gpuDecodeValues - while (s->error == 0 && processed_count < s->page.num_input_values) { + // For chunked reads we may not process all of the rows on the page; if not stop early + int last_row = s->first_row + s->num_rows; + while ((s->error == 0) && (processed_count < s->page.num_input_values) && + (s->input_row_count <= last_row)) { int next_valid_count; // only need to process definition levels if this is a nullable column From 8ea1e0e723a9558ff462143e46d9feaabe974f2e Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Wed, 25 Sep 2024 13:31:04 -0400 Subject: [PATCH 15/36] style fixes --- cpp/src/io/parquet/decode_fixed.cu | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 5010e116aa6..9214af3e9e4 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -24,8 +24,9 @@ namespace cudf::io::parquet::detail { namespace { -// Unlike cub's algorithm, this provides warp-wide and block-wide results simultaneously. -// Also, this provides the ability to compute warp_bits & lane_mask manually, which we need for lists. +// Unlike cub's algorithm, this provides warp-wide and block-wide results simultaneously. +// Also, this provides the ability to compute warp_bits & lane_mask manually, which we need for +// lists. 
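// The warp-level core of the scan described above: one ballot gives the warp's bit pattern, and
// popc over a lane mask turns it into an exclusive prefix count, so the same ballot result can be
// reused both as a validity bitmask and as a per-thread offset. A minimal sketch of that step,
// assuming all 32 lanes of the warp are active:
__device__ inline int warp_exclusive_count(bool flag, uint32_t& warp_bits)
{
  int const lane           = threadIdx.x % 32;
  uint32_t const lane_mask = (1u << lane) - 1;           // bits of all lower lanes
  warp_bits                = __ballot_sync(0xffffffffu, flag);
  return __popc(warp_bits & lane_mask);                  // # of set flags in lower lanes
}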
struct block_scan_results { uint32_t warp_bits; int thread_count_within_warp; @@ -38,7 +39,7 @@ struct block_scan_results { template static __device__ void scan_block_exclusive_sum(int thread_bit, block_scan_results& results) { - int const t = threadIdx.x; + int const t = threadIdx.x; int const warp_index = t / cudf::detail::warp_size; int const warp_lane = t % cudf::detail::warp_size; uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1; @@ -48,22 +49,26 @@ static __device__ void scan_block_exclusive_sum(int thread_bit, block_scan_resul } template -static __device__ void scan_block_exclusive_sum(uint32_t warp_bits, int warp_lane, int warp_index, uint32_t lane_mask, block_scan_results& results) +static __device__ void scan_block_exclusive_sum(uint32_t warp_bits, + int warp_lane, + int warp_index, + uint32_t lane_mask, + block_scan_results& results) { - //Compute # warps + // Compute # warps constexpr int num_warps = decode_block_size / cudf::detail::warp_size; - - //Compute the warp-wide results + + // Compute the warp-wide results results.warp_bits = warp_bits; results.warp_count = __popc(results.warp_bits); results.thread_count_within_warp = __popc(results.warp_bits & lane_mask); - //Share the warp counts amongst the block threads + // Share the warp counts amongst the block threads __shared__ int warp_counts[num_warps]; if (warp_lane == 0) { warp_counts[warp_index] = results.warp_count; } __syncthreads(); - //Compute block-wide results + // Compute block-wide results results.block_count = 0; results.thread_count_within_block = results.thread_count_within_warp; for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { From 41cb98206640c57293d7ea325a6df7d85d08a56b Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Thu, 26 Sep 2024 10:16:44 -0400 Subject: [PATCH 16/36] Update cpp/src/io/parquet/decode_fixed.cu Co-authored-by: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> --- cpp/src/io/parquet/decode_fixed.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 9214af3e9e4..6b8559d400f 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -296,7 +296,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( for (int d_idx = 0; d_idx <= max_depth; d_idx++) { auto& ni = s->nesting_info[d_idx]; - int is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 1 : 0; + int const is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 
1 : 0; // thread and block validity count block_scan_results valid_count_results; From 6e705549e708c02795cfd3da52ffd3fa9cdfd4d7 Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Thu, 26 Sep 2024 10:17:05 -0400 Subject: [PATCH 17/36] Update cpp/src/io/parquet/decode_fixed.cu Co-authored-by: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> --- cpp/src/io/parquet/decode_fixed.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 6b8559d400f..f84cd7e4944 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -302,8 +302,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( block_scan_results valid_count_results; scan_block_exclusive_sum(is_valid, valid_count_results); uint32_t const warp_validity_mask = valid_count_results.warp_bits; - int thread_valid_count = valid_count_results.thread_count_within_block; - int block_valid_count = valid_count_results.block_count; + int const thread_valid_count = valid_count_results.thread_count_within_block; + int const block_valid_count = valid_count_results.block_count; // validity is processed per-warp // From 9ad44155988e2702e4b4526c5b60d9532cc59cd7 Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Thu, 26 Sep 2024 10:18:00 -0400 Subject: [PATCH 18/36] Update cpp/src/io/parquet/decode_fixed.cu Co-authored-by: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> --- cpp/src/io/parquet/decode_fixed.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index f84cd7e4944..b18813551d9 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -504,9 +504,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesNonNullable(int32_t target_v int const row_index = thread_value_count + value_count; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); - int is_valid = in_row_bounds; - int thread_valid_count = thread_value_count; - int block_valid_count = block_value_count; + int const is_valid = in_row_bounds; + int const thread_valid_count = thread_value_count; + int const block_valid_count = block_value_count; // if this is valid and we're at the leaf, output dst_pos if (is_valid) { From 3a1fc951fb04bc844c3cea8d327c688d3b49487d Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Thu, 26 Sep 2024 11:01:27 -0400 Subject: [PATCH 19/36] Unroll block-count loop --- cpp/src/io/parquet/decode_fixed.cu | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 9214af3e9e4..98e64bf3475 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -71,9 +71,19 @@ static __device__ void scan_block_exclusive_sum(uint32_t warp_bits, // Compute block-wide results results.block_count = 0; results.thread_count_within_block = results.thread_count_within_warp; - for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { - results.block_count += warp_counts[warp_idx]; - if (warp_idx < warp_index) { results.thread_count_within_block += warp_counts[warp_idx]; } + if constexpr ((num_warps == 4) || (num_warps == 8)) { + results.block_count = warp_counts[0] + warp_counts[1] + warp_counts[2] + warp_counts[3]; + if constexpr (num_warps 
== 8) { + results.block_count += warp_counts[4] + warp_counts[5] + warp_counts[6] + warp_counts[7]; + } + for (int warp_idx = 0; warp_idx < warp_index; ++warp_idx) { + results.thread_count_within_block += warp_counts[warp_idx]; + } + } else { + for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { + results.block_count += warp_counts[warp_idx]; + if (warp_idx < warp_index) { results.thread_count_within_block += warp_counts[warp_idx]; } + } } } @@ -338,7 +348,6 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( // if this is valid and we're at the leaf, output dst_pos if (d_idx == max_depth) { if (is_valid) { - // for non-list types, the value count is always the same across int const dst_pos = value_count + thread_value_count; int const src_pos = max_depth_valid_count + thread_valid_count; sb->nz_idx[rolling_index(src_pos)] = dst_pos; From 5ab9829c59d63ff112680ec088054696b18e6069 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Thu, 26 Sep 2024 13:10:53 -0400 Subject: [PATCH 20/36] more style fixes --- cpp/src/io/parquet/decode_fixed.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index e6ea4dbbebe..993021fa5ef 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -312,8 +312,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( block_scan_results valid_count_results; scan_block_exclusive_sum(is_valid, valid_count_results); uint32_t const warp_validity_mask = valid_count_results.warp_bits; - int const thread_valid_count = valid_count_results.thread_count_within_block; - int const block_valid_count = valid_count_results.block_count; + int const thread_valid_count = valid_count_results.thread_count_within_block; + int const block_valid_count = valid_count_results.block_count; // validity is processed per-warp // From 447102230c3355b3a1cf61642e8f4b196fa1afb4 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Wed, 2 Oct 2024 15:43:47 -0400 Subject: [PATCH 21/36] Disable manual block scan for non-lists --- cpp/src/io/parquet/decode_fixed.cu | 48 ++++++++++++++++-------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 993021fa5ef..552cadcc509 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -49,7 +49,7 @@ static __device__ void scan_block_exclusive_sum(int thread_bit, block_scan_resul } template -static __device__ void scan_block_exclusive_sum(uint32_t warp_bits, +__device__ static void scan_block_exclusive_sum(uint32_t warp_bits, int warp_lane, int warp_index, uint32_t lane_mask, @@ -309,11 +309,10 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 
1 : 0; // thread and block validity count - block_scan_results valid_count_results; - scan_block_exclusive_sum(is_valid, valid_count_results); - uint32_t const warp_validity_mask = valid_count_results.warp_bits; - int const thread_valid_count = valid_count_results.thread_count_within_block; - int const block_valid_count = valid_count_results.block_count; + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; + int thread_valid_count, block_valid_count; + block_scan(scan_storage).ExclusiveSum(is_valid, thread_valid_count, block_valid_count); // validity is processed per-warp // @@ -323,18 +322,21 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( // the correct position to start reading. since we are about to write the validity vector // here we need to adjust our computed mask to take into account the write row bounds. int warp_null_count = 0; - // lane 0 from each warp writes out validity - if ((write_start >= 0) && (ni.valid_map != nullptr) && ((t % cudf::detail::warp_size) == 0)) { - int const valid_map_offset = ni.valid_map_offset; - int const vindex = value_count + thread_value_count; // absolute input value index - int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map - int const write_end = - cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store - int const bit_count = write_end - write_start; - warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); - - store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); + if (ni.valid_map != nullptr) { + uint32_t const warp_validity_mask = ballot(is_valid); + // lane 0 from each warp writes out validity + if ((write_start >= 0) && ((t % cudf::detail::warp_size) == 0)) { + int const valid_map_offset = ni.valid_map_offset; + int const vindex = value_count + thread_value_count; // absolute input value index + int const bit_offset = (valid_map_offset + vindex + write_start) - + first_row; // absolute bit offset into the output validity map + int const write_end = + cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store + int const bit_count = write_end - write_start; + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); + + store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); + } } // sum null counts. 
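// Worked example for the per-warp validity write above: if only lanes 5..31 of the warp fall
// inside the output row bounds, in_write_row_bounds ballots to a mask with bits 5..31 set, so
//   write_start = __ffs(mask) - 1         = 5    (first lane that stores)
//   write_end   = 32 - __clz(mask)        = 32   (one past the last lane that stores)
//   bit_count   = write_end - write_start = 27   (validity bits this warp writes)
// and if 20 of those 27 values are valid, the warp contributes 27 - 20 = 7 to the null count.
// The same arithmetic as a small helper (assumes, as in the code above, that the in-bounds lanes
// are contiguous and that out-of-bounds lanes never ballot a validity bit):
__device__ inline int warp_null_contribution(uint32_t bounds_mask, uint32_t validity_mask)
{
  int const write_start = __ffs(bounds_mask) - 1;
  int const write_end   = 32 - __clz(bounds_mask);
  int const bit_count   = write_end - write_start;
  return bit_count - __popc(validity_mask >> write_start);
}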
we have to do it this way instead of just incrementing by (value_count - @@ -418,11 +420,11 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( } // thread and block validity count - block_scan_results valid_count_results; - scan_block_exclusive_sum(is_valid, valid_count_results); - uint32_t const warp_validity_mask = valid_count_results.warp_bits; - int thread_valid_count = valid_count_results.thread_count_within_block; - int block_valid_count = valid_count_results.block_count; + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; + int thread_valid_count, block_valid_count; + block_scan(scan_storage).ExclusiveSum(is_valid, thread_valid_count, block_valid_count); + uint32_t const warp_validity_mask = ballot(is_valid); // validity is processed per-warp // From c0ed2cb3175183d85579c2197ac5f80bdc4e0a17 Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Fri, 4 Oct 2024 12:29:17 -0400 Subject: [PATCH 22/36] Update cpp/src/io/parquet/decode_fixed.cu Co-authored-by: Vukasin Milovanovic --- cpp/src/io/parquet/decode_fixed.cu | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 993021fa5ef..c2548fcd42a 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -284,13 +284,11 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const batch_size = min(max_batch_size, capped_target_value_count - value_count); // definition level - int d; + int d = 1; if (t >= batch_size) { d = -1; } else if (def) { d = static_cast(def[rolling_index(value_count + t)]); - } else { - d = 1; } int const thread_value_count = t; From b898cbabbf2821da8dcaba92e6c724a24069c8bc Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Fri, 4 Oct 2024 12:32:53 -0400 Subject: [PATCH 23/36] Style fixes --- cpp/src/io/parquet/decode_fixed.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 62709e0b27f..42f90880fe9 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -327,7 +327,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const valid_map_offset = ni.valid_map_offset; int const vindex = value_count + thread_value_count; // absolute input value index int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map + first_row; // absolute bit offset into the output validity map int const write_end = cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store int const bit_count = write_end - write_start; From b0ee9fc97873a36da2f0dd0c23fc9fcd787b9905 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Mon, 7 Oct 2024 10:34:58 -0400 Subject: [PATCH 24/36] renaming --- cpp/src/io/parquet/decode_fixed.cu | 188 +++++++++++++++-------------- 1 file changed, 97 insertions(+), 91 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index fce8f53700d..5fe14d09e9f 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -24,6 +24,54 @@ namespace cudf::io::parquet::detail { namespace { +// Unlike cub's algorithm, this provides warp-wide and block-wide results simultaneously. +// Also, this provides the ability to compute warp_bits & lane_mask manually, which we need for lists. 
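// For lists, the validity bit a thread contributes does not live at its lane index: threads that
// fall outside the nesting bounds contribute no value at this depth, so a thread's position
// within the warp is its count of in-bounds values below it rather than its lane id. Each thread
// therefore shifts its flag to that position and the warp OR-reduces the result; that composed
// mask, plus a mask of the bits below the thread's position, is what the manual
// scan_block_exclusive_sum overload consumes. A sketch of building those two inputs (a warp-wide
// OR reduction stands in for WarpReduceOr32; position is assumed to be in [0, 32)):
__device__ inline uint32_t list_warp_valid_mask(int is_valid, int value_pos_within_warp)
{
  uint32_t bit = static_cast<uint32_t>(is_valid) << value_pos_within_warp;
  // OR together the bits contributed by all lanes of the warp
  for (int offset = 16; offset > 0; offset /= 2) {
    bit |= __shfl_xor_sync(0xffffffffu, bit, offset);
  }
  return bit;
}
// the "lane mask" companion: bits of values positioned below this thread within the warp
__device__ inline uint32_t below_position_mask(int value_pos_within_warp)
{
  return (1u << value_pos_within_warp) - 1;
}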
+struct block_scan_results { + uint32_t warp_bits; + int thread_count_within_warp; + int warp_count; + + int thread_count_within_block; + int block_count; +}; + +template +__device__ inline static void scan_block_exclusive_sum(int thread_bit, block_scan_results& results) +{ + int const t = threadIdx.x; + int const warp_index = t / cudf::detail::warp_size; + int const warp_lane = t % cudf::detail::warp_size; + uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1; + + uint32_t warp_bits = ballot(thread_bit); + scan_block_exclusive_sum(warp_bits, warp_lane, warp_index, lane_mask, results); +} + +template +__device__ inline static void scan_block_exclusive_sum(uint32_t warp_bits, int warp_lane, int warp_index, uint32_t lane_mask, block_scan_results& results) +{ + //Compute # warps + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + + //Compute the warp-wide results + results.warp_bits = warp_bits; + results.warp_count = __popc(results.warp_bits); + results.thread_count_within_warp = __popc(results.warp_bits & lane_mask); + + //Share the warp counts amongst the block threads + __shared__ int warp_counts[num_warps]; + if (warp_lane == 0) { warp_counts[warp_index] = results.warp_count; } + __syncthreads(); + + //Compute block-wide results + results.block_count = 0; + results.thread_count_within_block = results.thread_count_within_warp; + for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { + results.block_count += warp_counts[warp_idx]; + if (warp_idx < warp_index) { results.thread_count_within_block += warp_counts[warp_idx]; } + } +} + template __device__ inline void gpuDecodeFixedWidthValues( page_state_s* s, state_buf* const sb, int start, int end, int t) @@ -265,7 +313,7 @@ struct decode_fixed_width_split_values_func { }; template -static __device__ int gpuUpdateValidityAndRowIndicesNestedNonLists( +static __device__ int gpuUpdateValidityAndRowIndicesNested( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) { constexpr int num_warps = decode_block_size / cudf::detail::warp_size; @@ -552,48 +600,6 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( return valid_count; } -struct scan_results -{ - uint32_t warp_bits; - int thread_count_within_warp; - int warp_count; - - int thread_count_within_block; - int block_count; -}; - -template -static __device__ void scan_block(uint32_t warp_bits, int warp_lane, int warp_index, uint32_t lane_mask, scan_results& results) -{ - constexpr int num_warps = decode_block_size / cudf::detail::warp_size; - - results.warp_bits = warp_bits; - results.warp_count = __popc(results.warp_bits); - results.thread_count_within_warp = __popc(results.warp_bits & lane_mask); - - __shared__ uint32_t warp_counts[num_warps]; - if(warp_lane == 0) { - warp_counts[warp_index] = results.warp_count; - } - __syncthreads(); - - results.block_count = 0; - results.thread_count_within_block = results.thread_count_within_warp; - for(int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { - results.block_count += warp_counts[warp_idx]; - if(warp_idx < warp_index) { - results.thread_count_within_block += warp_counts[warp_idx]; - } - } -} - -template -static __device__ void scan_block(int thread_bit, int warp_lane, int warp_index, uint32_t lane_mask, scan_results& results) -{ - uint32_t warp_bits = ballot(thread_bit); - scan_block(warp_bits, warp_lane, warp_index, lane_mask, results); -} - template static __device__ int gpuUpdateValidityAndRowIndicesLists( int32_t target_value_count, page_state_s* s, state_buf* 
sb, level_t const* const def, @@ -630,13 +636,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( __syncthreads(); - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; - - int const warp_lane = t % cudf::detail::warp_size; - bool const is_first_lane = warp_lane == 0; - int const warp_index = t / cudf::detail::warp_size; - uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1; + int const warp_index = t / cudf::detail::warp_size; + int const warp_lane = t % cudf::detail::warp_size; + bool const is_first_lane = (warp_lane == 0); while (value_count < target_value_count) { @@ -647,10 +649,10 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( // get definition level, use repitition level to get start/end depth // different for each thread, as each thread has a different r/d - int rep_level = -1, def_level = -1, start_depth = -1, end_depth = -1; + int def_level = -1, start_depth = -1, end_depth = -1; if (within_batch) { int const index = rolling_index(value_count + t); - rep_level = static_cast(rep[index]); + int rep_level = static_cast(rep[index]); if constexpr (nullable) { def_level = static_cast(def[index]); end_depth = s->nesting_info[def_level].end_depth; @@ -686,16 +688,19 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( //Determine value count & row index // track (page-relative) row index for the thread so we can compare against input bounds // keep track of overall # of rows we've read. - //THIS IS THE UNDO POINT int const is_new_row = start_depth == 0 ? 1 : 0; int num_prior_new_rows, total_num_new_rows; - block_scan(scan_storage).ExclusiveSum(is_new_row, num_prior_new_rows, total_num_new_rows); - __syncthreads(); + { + block_scan_results new_row_scan_results; + scan_block_exclusive_sum(is_new_row, new_row_scan_results); + num_prior_new_rows = new_row_scan_results.thread_count_within_block; + total_num_new_rows = new_row_scan_results.block_count; + } if constexpr (enable_print_large_list) { if(within_batch && (bool(is_new_row) != (t % 4 == 0))) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, is_new_row %d, start_depth %d, rep_level %d\n", - blockIdx.x, value_count, target_value_count, t, is_new_row, start_depth, rep_level); + printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, is_new_row %d, start_depth %d\n", + blockIdx.x, value_count, target_value_count, t, is_new_row, start_depth); } if(within_batch && (num_prior_new_rows != ((t + 3) / 4))) { printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, num_prior_new_rows %d\n", @@ -731,13 +736,16 @@ if constexpr (enable_print_large_list) { // queries is_valid from all threads, stores prior total and total total //WARP VALUE COUNT: - scan_results value_count_scan_results; - scan_block(in_nesting_bounds, warp_lane, warp_index, lane_mask, value_count_scan_results); - - int thread_value_count_within_warp = value_count_scan_results.thread_count_within_warp; - int warp_value_count = value_count_scan_results.warp_count; - int thread_value_count = value_count_scan_results.thread_count_within_block; - int block_value_count = value_count_scan_results.block_count; + int thread_value_count_within_warp, warp_value_count, thread_value_count, block_value_count; + { + block_scan_results value_count_scan_results; + scan_block_exclusive_sum(in_nesting_bounds, value_count_scan_results); + + thread_value_count_within_warp = value_count_scan_results.thread_count_within_warp; + 
warp_value_count = value_count_scan_results.warp_count; + thread_value_count = value_count_scan_results.thread_count_within_block; + block_value_count = value_count_scan_results.block_count; + } if constexpr (enable_print_large_list) { if(within_batch && in_row_bounds && (in_nesting_bounds != (t % 4 == 0))) { @@ -798,14 +806,15 @@ if constexpr (enable_print_large_list) { // position for thread t's bit is thread_value_count. for cuda 11 we could use // __reduce_or_sync(), but until then we have to do a warp reduce. uint32_t const warp_valid_mask = WarpReduceOr32((uint32_t)is_valid << thread_value_count_within_warp); - auto thread_mask = (uint32_t(1) << thread_value_count_within_warp) - 1; - - scan_results valid_count_scan_results; - scan_block(warp_valid_mask, warp_lane, warp_index, thread_mask, valid_count_scan_results); - - int warp_valid_count = valid_count_scan_results.warp_count; - int thread_valid_count = valid_count_scan_results.thread_count_within_block; - int block_valid_count = valid_count_scan_results.block_count; + int thread_valid_count, block_valid_count; + { + auto thread_mask = (uint32_t(1) << thread_value_count_within_warp) - 1; + + block_scan_results valid_count_scan_results; + scan_block_exclusive_sum(warp_valid_mask, warp_lane, warp_index, thread_mask, valid_count_scan_results); + thread_valid_count = valid_count_scan_results.thread_count_within_block; + block_valid_count = valid_count_scan_results.block_count; + } if constexpr (enable_print_large_list) { if(within_batch && in_row_bounds && (((d_idx == 0) && (is_valid != (t % 4 == 0))) || ((d_idx == 1) && !is_valid))) { @@ -846,13 +855,15 @@ if constexpr (enable_print_large_list) { (d_idx + 1 >= start_depth && d_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0; //NEXT WARP VALUE COUNT: - scan_results next_value_count_scan_results; - scan_block(next_in_nesting_bounds, warp_lane, warp_index, lane_mask, next_value_count_scan_results); - - next_thread_value_count_within_warp = next_value_count_scan_results.thread_count_within_warp; - next_warp_value_count = next_value_count_scan_results.warp_count; - next_thread_value_count = next_value_count_scan_results.thread_count_within_block; - next_block_value_count = next_value_count_scan_results.block_count; + { + block_scan_results next_value_count_scan_results; + scan_block_exclusive_sum(next_in_nesting_bounds, next_value_count_scan_results); + + next_thread_value_count_within_warp = next_value_count_scan_results.thread_count_within_warp; + next_warp_value_count = next_value_count_scan_results.warp_count; + next_thread_value_count = next_value_count_scan_results.thread_count_within_block; + next_block_value_count = next_value_count_scan_results.block_count; + } if constexpr (enable_print_large_list) { if(within_batch && in_row_bounds && (next_in_nesting_bounds != 1)) { @@ -894,12 +905,12 @@ if constexpr (enable_print_large_list) { printf("WHOA BAD OFFSET\n"); printf("WHOA BAD OFFSET: WROTE %d to %d! 
t %d, blockIdx.x %d, idx %d, d_idx %d, start_depth %d, end_depth %d, max_depth %d, " "in_row_bounds %d, in_nesting_bounds %d, next_in_nesting_bounds %d, row_index %d, row_index_lower_bound %d, last_row %d, " - "input_row_count %d, num_prior_new_rows %d, is_new_row %d, total_num_new_rows %d, rep_level %d, def_level %d, ni.value_count %d, " + "input_row_count %d, num_prior_new_rows %d, is_new_row %d, total_num_new_rows %d, def_level %d, ni.value_count %d, " "thread_value_count %d, next_ni.value_count %d, next_thread_value_count %d, next_ni.page_start_value %d, value_count %d, " "target_value_count %d, block_value_count %d, next_block_value_count %d\n", ofs, overall_index, t, blockIdx.x, idx, d_idx, start_depth, end_depth, max_depth, in_row_bounds, in_nesting_bounds, next_in_nesting_bounds, row_index, row_index_lower_bound, last_row, input_row_count, num_prior_new_rows, is_new_row, - total_num_new_rows, rep_level, def_level, ni.value_count, thread_value_count, next_ni.value_count, + total_num_new_rows, def_level, ni.value_count, thread_value_count, next_ni.value_count, next_thread_value_count, next_ni.page_start_value, value_count, target_value_count, block_value_count, next_block_value_count); } } @@ -927,7 +938,6 @@ if constexpr (enable_print_large_list) { //TODO: Consider OR'ING for next_thread_value_count and popc() for next_thread_value_count //so that we don't have to take a ballot here. Is uint128 so may deconstruct to this anyway ... - int warp_null_count = 0; if(is_first_lane && (ni.valid_map != nullptr) && (warp_value_count > 0)) { // last bit in the warp to store //in old is warp_valid_mask_bit_count //so it's a count of everything in nesting bounds, though bits can be zero if NULL at this level @@ -936,14 +946,11 @@ if constexpr (enable_print_large_list) { //is cumulative sum of warp_value_count at the given nesting depth // DON'T subtract by first_row: since it's lists it's not 1-row-per-value int const bit_offset = ni.valid_map_offset + thread_value_count; - store_validity(bit_offset, ni.valid_map, warp_valid_mask, warp_value_count); - warp_null_count = warp_value_count - warp_valid_count; if constexpr (enable_print) { printf("STORE VALIDITY: t %d, depth %d, thread_value_count %d, valid_map_offset %d, bit_offset %d, warp_value_count %d, warp_valid_mask %u\n", t, d_idx, thread_value_count, ni.valid_map_offset, bit_offset, warp_value_count, warp_valid_mask); - printf("NUM NULLS: t %d, depth %d, warp_null_count %d\n", t, d_idx, warp_null_count); } } @@ -1148,8 +1155,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) DecodeValuesFunc decode_values; - bool const nullable = is_nullable(s); - bool const should_process_nulls = nullable && maybe_has_nulls(s); + bool const should_process_nulls = is_nullable(s) && maybe_has_nulls(s); // shared buffer. 
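// The decoder carves one statically sized __shared__ byte array into the rep / dict / def run
// buffers rather than declaring separate arrays per stream. A generic sketch of that
// suballocation pattern (sizes and alignment here are illustrative, not the actual values):
#include <cstdint>

__device__ inline uint8_t* suballocate(uint8_t*& cursor, int bytes, int align = 8)
{
  auto addr = reinterpret_cast<std::uintptr_t>(cursor);
  addr      = (addr + align - 1) & ~static_cast<std::uintptr_t>(align - 1);  // round up to alignment
  auto* p   = reinterpret_cast<uint8_t*>(addr);
  cursor    = p + bytes;                                                     // advance past this buffer
  return p;
}
// usage inside a kernel, with hypothetical sizes:
//   __shared__ __align__(8) uint8_t shared_buf[total_shared_bytes];
//   uint8_t* cursor = shared_buf;
//   auto* rep_runs  = suballocate(cursor, shared_rep_size);
//   auto* dict_runs = suballocate(cursor, shared_dict_size);
//   auto* def_runs  = suballocate(cursor, shared_def_size);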
all shared memory is suballocated out of here static constexpr auto align_test = false; @@ -1244,8 +1250,8 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) } if constexpr (enable_print) { - if(t == 0) { printf("page_idx %d, nullable %d, should_process_nulls %d, has_lists_t %d, has_dict_t %d, num_rows %lu, page.num_input_values %d\n", - page_idx, int(nullable), int(should_process_nulls), int(has_lists_t), int(has_dict_t), num_rows, s->page.num_input_values); } + if(t == 0) { printf("page_idx %d, should_process_nulls %d, has_lists_t %d, has_dict_t %d, num_rows %lu, page.num_input_values %d\n", + page_idx, int(should_process_nulls), int(has_lists_t), int(has_dict_t), num_rows, s->page.num_input_values); } } auto print_nestings = [&](bool is_post){ @@ -1295,7 +1301,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) s->page.num_input_values, s->input_value_count, value_count, s->input_value_count, processed_count, valid_count, next_valid_count); } } } else if constexpr (has_nesting_t) { - next_valid_count = gpuUpdateValidityAndRowIndicesNestedNonLists( + next_valid_count = gpuUpdateValidityAndRowIndicesNested( processed_count, s, sb, def, t); if constexpr (enable_print) { if(t == 0) { printf("NESTED NEXT: next_valid_count %d\n", next_valid_count); } @@ -1321,7 +1327,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) if constexpr (has_nesting_t) { next_valid_count = - gpuUpdateValidityAndRowIndicesNestedNonLists( + gpuUpdateValidityAndRowIndicesNested( processed_count, s, sb, nullptr, t); } else { next_valid_count = gpuUpdateValidityAndRowIndicesFlat( From 4b7d1df38570663ffcfc25cc4eb5223331ce7c71 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Mon, 7 Oct 2024 17:05:53 -0400 Subject: [PATCH 25/36] minor tweaks --- cpp/src/io/parquet/decode_fixed.cu | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index f058ac310db..05d9aeb1b5b 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -1098,20 +1098,18 @@ __device__ int skip_decode(stream_type& parquet_stream, int num_to_skip, int t) { static constexpr bool enable_print = false; - //Dictionary + // it could be that (e.g.) we skip 5000 but starting at row 4000 we have a run of length 2000: + // in that case skip_decode() only skips 4000, and we have to process the remaining 1000 up front + // modulo 2 * block_size of course, since that's as many as we process at once int num_skipped = parquet_stream.skip_decode(t, num_to_skip); if constexpr (enable_print) { if (t == 0) { printf("SKIPPED: num_skipped %d, for %d\n", num_skipped, num_to_skip); } } - //it could be that (e.g.) 
we skip 5000 but starting at row 4000 we have a run of length 2000: - //in that case skip_decode() only skips 4000, and we have to process the remaining 1000 up front - //modulo 2 * block_size of course, since that's as many as we process at once while (num_skipped < num_to_skip) { - auto const to_skip = min(2*decode_block_size_t, num_to_skip - num_skipped); - parquet_stream.decode_next(t, to_skip); - num_skipped += to_skip; + auto const to_decode = min(2 * decode_block_size_t, num_to_skip - num_skipped); + num_skipped += parquet_stream.decode_next(t, to_decode); if constexpr (enable_print) { - if (t == 0) { printf("EXTRA SKIPPED: to_skip %d, at %d, for %d\n", to_skip, num_skipped, num_to_skip); } + if (t == 0) { printf("EXTRA SKIPPED: to_decode %d, at %d, for %d\n", to_decode, num_skipped, num_to_skip); } } __syncthreads(); } @@ -1240,7 +1238,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values, s->dict_pos); } } } - + if constexpr (enable_print) { if((t == 0) && (page_idx == 0)){ printf("SIZES: shared_rep_size %d, shared_dict_size %d, shared_def_size %d\n", shared_rep_size, shared_dict_size, shared_def_size); From b36b3b29769f7c6a088c44d673684c6cb187afc8 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Mon, 7 Oct 2024 17:47:45 -0400 Subject: [PATCH 26/36] delete some debug printing --- .../cudf/table/experimental/row_operators.cuh | 14 +------------- cpp/src/io/parquet/page_data.cuh | 18 ------------------ cpp/src/io/parquet/reader_impl.cpp | 9 --------- cpp/src/io/parquet/reader_impl_preprocess.cu | 12 ------------ cpp/src/io/parquet/rle_stream.cuh | 6 ------ 5 files changed, 1 insertion(+), 58 deletions(-) diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index e4aca2f142a..3f33c70c29a 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -1429,30 +1429,18 @@ class device_row_comparator { __device__ bool operator()(size_type const lhs_element_index, size_type const rhs_element_index) const noexcept { - static constexpr bool enable_print = false; if (check_nulls) { bool const lhs_is_null{lhs.is_null(lhs_element_index)}; bool const rhs_is_null{rhs.is_null(rhs_element_index)}; if (lhs_is_null and rhs_is_null) { return nulls_are_equal == null_equality::EQUAL; } else if (lhs_is_null != rhs_is_null) { - if constexpr (enable_print) { - printf("NULLS UNEQUAL AT %d, %d; values: %d %d\n", - lhs_element_index, rhs_element_index, int(lhs_is_null), int(rhs_is_null)); - } return false; } } - bool result = comparator(lhs.element(lhs_element_index), + return comparator(lhs.element(lhs_element_index), rhs.element(rhs_element_index)); - if constexpr (enable_print && cuda::std::is_integral_v) { - if(!result) { - printf("VALUES UNEQUAL: AT %d, %d, VALUES %d, %d\n", lhs_element_index, rhs_element_index, - (int)lhs.element(lhs_element_index), (int)rhs.element(rhs_element_index)); - } - } - return result; } template data_start; } dict_pos *= (uint32_t)s->dtype_len_in; - - static constexpr bool enable_print = false; - if constexpr (enable_print) { - if (threadIdx.x == 0) { - auto dict_lookup_idx = rolling_index(src_pos); - printf("PREP OUTPUT VALUE at dst %p, dict %p, dict_pos %u, dict_size %u, dict_base %p, dict_bits %d, dict_lookup_idx %d, dtype_len_in %d\n", - dst, dict, dict_pos, dict_size, s->dict_base, s->dict_bits, dict_lookup_idx, s->dtype_len_in); - } - } - 
gpuStoreOutput(dst, dict, dict_pos, dict_size); } diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 43c11f917ab..9f66160f73c 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -50,8 +50,6 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num { auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; -//printf("PREP LAUNCH: decode_page_data: mode %d, skip_rows %lu, num_rows %lu, #pages %lu\n", -// (int)mode, skip_rows, num_rows, subpass.pages.size()); auto& page_nesting = subpass.page_nesting_info; auto& page_nesting_decode = subpass.page_nesting_decode_info; @@ -223,11 +221,6 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num int const nkernels = std::bitset<32>(kernel_mask).count(); auto streams = cudf::detail::fork_streams(_stream, nkernels); - static constexpr bool enable_print = false; - if constexpr (enable_print) { - printf("PAGE DATA DECODE MASK: %d\n", kernel_mask); - } - // launch string decoder int s_idx = 0; if (BitAnd(kernel_mask, decode_kernel_mask::STRING) != 0) { @@ -419,11 +412,9 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num page_nesting.device_to_host_async(_stream); page_nesting_decode.device_to_host_async(_stream); -//printf("SYNC ERROR CODE\n"); if (auto const error = error_code.value_sync(_stream); error != 0) { CUDF_FAIL("Parquet data decode failed with code(s) " + kernel_error::to_string(error)); } -//printf("ERROR CODE SUNK\n"); // for list columns, add the final offset to every offset buffer. // TODO : make this happen in more efficiently. Maybe use thrust::for_each diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 2bb96f0087d..8e67f233213 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -138,12 +138,7 @@ void generate_depth_remappings( // depth. 
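// As a concrete (hypothetical) illustration of what these remap tables encode, consider a column
// declared as:  optional group col (LIST) { repeated group list { optional int32 element } }
// so max_repetition_level == 1 and max_definition_level == 3. Per the Parquet spec:
//   def 0 -> the list itself is null          def 2 -> non-empty list whose element is null
//   def 1 -> the list is present but empty    def 3 -> element present and non-null
//   rep 0 -> value starts a new row           rep 1 -> value continues the current list
// The tables built here translate those level numbers into output nesting depths, roughly:
constexpr int example_rep_depth_remap[] = {0, 1};        // start depth for rep levels 0..1
constexpr int example_def_depth_remap[] = {0, 0, 1, 1};  // end depth for def levels 0..3
// (illustrative values only; the real tables come from find_shallowest()/find_deepest() below)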
// - static constexpr bool enable_print = false; - // compute "X" from above - if constexpr (enable_print) { - printf("REMAPPING: max def %d, max rep %d\n", schema.max_definition_level, schema.max_repetition_level); - } for (int s_idx = schema.max_repetition_level; s_idx >= 0; s_idx--) { auto find_shallowest = [&](int r) { int shallowest = -1; @@ -162,9 +157,6 @@ void generate_depth_remappings( if (!cur_schema.is_stub()) { cur_depth--; } schema_idx = cur_schema.parent_idx; } - if constexpr (enable_print) { - printf("REMAPPING: s_idx / r %d, shallowest %d\n", r, shallowest); - } return shallowest; }; rep_depth_remap[s_idx] = find_shallowest(s_idx); @@ -203,10 +195,6 @@ void generate_depth_remappings( prev_schema = cur_schema; schema_idx = cur_schema.parent_idx; } - - if constexpr (enable_print) { - printf("REMAPPING: s_idx %d, r1 %d, end_depth %d\n", s_idx, r1, depth); - } return depth; }; def_depth_remap[s_idx] = find_deepest(s_idx); diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 490cf1d43c3..24db60d11b6 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -252,8 +252,6 @@ struct rle_stream { run.level_run = level_run; run.remaining = run.size; cur += run_bytes; -//printf("STORE RUN: decode_index %d, fill_index %d, output_pos %d, run.size %d\n", - //decode_index, fill_index, output_pos, run.size); output_pos += run.size; fill_index++; } @@ -355,8 +353,6 @@ struct rle_stream { // this is the last batch we will process this iteration if: // - either this run still has remaining values // - or it is consumed fully and its last index corresponds to output_count -//printf("STATUS: run_index %d, batch_len %d, remaining %d, at_end %d, last_run_pos %d, cur_values %d\n", - //run_index, batch_len, remaining, at_end, last_run_pos, cur_values); if (remaining > 0 || at_end) { values_processed_shared = output_count; } if (remaining == 0 && (at_end || is_last_decode_warp(warp_id))) { decode_index_shared = run_index + 1; @@ -401,7 +397,6 @@ struct rle_stream { } if((output_pos + run_size) > target_count) { -//printf("SKIPPING: target_count %d, run_size %d, output_pos %d\n", target_count, run_size, output_pos); return output_pos; //bail! 
we've reached the starting one } @@ -409,7 +404,6 @@ struct rle_stream { cur += run_bytes; } -//printf("SKIPPING: target_count %d, output_pos %d\n", target_count, output_pos); return output_pos; //we skipped everything } From 5b157042d5a2e2cd9de65e141be3c0e2e8528a47 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Mon, 7 Oct 2024 17:54:41 -0400 Subject: [PATCH 27/36] Remove more prints --- cpp/src/io/parquet/decode_fixed.cu | 343 ----------------------------- 1 file changed, 343 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 05d9aeb1b5b..e7d7582cd2c 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -88,18 +88,6 @@ __device__ inline void gpuDecodeFixedWidthValues( auto const data_out = nesting_info_base[leaf_level_index].data_out; uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; - static constexpr bool enable_print = false; - static constexpr bool enable_print_range_error = false; -// static constexpr bool enable_print_large_list = true; - - if constexpr (enable_print) { - if(t == 0) { printf("DECODE VALUES: start %d, end %d, first_row %d, leaf_level_index %d, dtype_len %u, " - "data_out %p, dict_base %p, dict_size %d, dict_bits %d, dict_val %d, data_start %p, skipped_leaf_values %u, input_row_count %d\n", - start, end, s->first_row, leaf_level_index, dtype_len, data_out, s->dict_base, s->dict_bits, s->dict_val, - s->dict_size, s->data_start, skipped_leaf_values, s->input_row_count); - } - } - // decode values int pos = start; while (pos < end) { @@ -116,18 +104,8 @@ __device__ inline void gpuDecodeFixedWidthValues( dst_pos -= s->first_row; } - if constexpr (has_lists_t && enable_print_range_error) { - if((dst_pos < 0) && (src_pos < target_pos)) { printf("WHOA: decode dst_pos %d out of bounds, src_pos %d, start %d\n", dst_pos, src_pos, start); } - } - int dict_idx = rolling_index(src_pos + skipped_leaf_values); int dict_pos = sb->dict_idx[dict_idx]; - if constexpr (enable_print) { - if(t == 0) { - printf("DECODE OFFSETS: pos %d, src_pos %d, offset %d, dst_pos %d, target_pos %d, dict_idx %d, dict_pos %d\n", - pos, src_pos, offset, dst_pos, target_pos, dict_idx, dict_pos); - } - } // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. 
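// A minimal stand-in for the rolling_index<> helper used for sb->nz_idx and sb->dict_idx above,
// assuming the ring-buffer size is a compile-time constant (e.g. state_buf::nz_buf_size, which
// bounds the nz_idx array). Positions such as src_pos grow monotonically across the page while
// the backing arrays stay bounded, so every access goes through this modulo mapping.
template <int buf_size>
__device__ constexpr int rolling_index_sketch(int pos)
{
  // reduces to a simple mask when buf_size is a power of two, the usual configuration
  return pos % buf_size;
}
// e.g. sb->dict_idx[rolling_index_sketch<buf_size>(src_pos + skipped_leaf_values)]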
@@ -143,12 +121,6 @@ __device__ inline void gpuDecodeFixedWidthValues( } void* dst = data_out + static_cast(dst_pos) * dtype_len; - if constexpr (enable_print) { - if(dst_pos == 0) { - printf("WRITTEN TO dst_pos ZERO: t %d, data_out %p, dst %p, src_pos %d, dict_idx %d, dict_pos %d, dict_base %p\n", - t, data_out, dst, src_pos, dict_idx, dict_pos, s->dict_base); - } - } if (s->col.logical_type.has_value() && s->col.logical_type->type == LogicalType::DECIMAL) { switch (dtype) { @@ -182,17 +154,6 @@ __device__ inline void gpuDecodeFixedWidthValues( } else { gpuOutputGeneric(s, sb, src_pos, static_cast(dst), dtype_len); } -/* - if constexpr (enable_print_large_list) { - if (dtype == INT32) { - int value_stored = *static_cast(dst); - int overall_index = blockIdx.x * 20000 * 4 + src_pos; - if((overall_index % 1024) != value_stored) { - printf("WHOA BAD VALUE: WROTE %d to %d!\n", value_stored, overall_index); - } - } - } - */ } pos += batch_size; @@ -328,12 +289,6 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const last_row = first_row + s->num_rows; int const capped_target_value_count = min(target_value_count, last_row); - static constexpr bool enable_print = false; - if constexpr (enable_print) { - if (t == 0) { printf("NESTED: s->input_value_count %d, first_row %d, last_row %d, target_value_count %d, capped_target_value_count %d\n", - s->input_value_count, first_row, last_row, target_value_count, capped_target_value_count); } - } - int const row_index_lower_bound = s->row_index_lower_bound; int const max_depth = s->col.max_nesting_depth - 1; @@ -343,9 +298,6 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( __syncthreads(); while (value_count < capped_target_value_count) { - if constexpr (enable_print) { - if(t == 0) { printf("NESTED VALUE COUNT: %d\n", value_count); } - } int const batch_size = min(max_batch_size, capped_target_value_count - value_count); // definition level @@ -365,11 +317,6 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store - if constexpr (enable_print) { - if(t == 0) { printf("NESTED ROWS: row_index %d, row_index_lower_bound %d, last_row %d, in_row_bounds %d\n", - row_index, row_index_lower_bound, last_row, in_row_bounds); } - } - // iterate by depth for (int d_idx = 0; d_idx <= max_depth; d_idx++) { auto& ni = s->nesting_info[d_idx]; @@ -421,10 +368,6 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const dst_pos = value_count + thread_value_count; int const src_pos = max_depth_valid_count + thread_valid_count; sb->nz_idx[rolling_index(src_pos)] = dst_pos; - if constexpr (enable_print) { - if(t == 0) {printf("NESTED STORE: first_row %d, row_index %d dst_pos %d, src_pos %d\n", - first_row, row_index, dst_pos, src_pos);} - } } // update stuff max_depth_valid_count += block_valid_count; @@ -464,22 +407,12 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( int const last_row = first_row + s->num_rows; int const capped_target_value_count = min(target_value_count, last_row); - static constexpr bool enable_print = false; - if constexpr (enable_print) { - if (t == 0) { printf("FLAT: s->input_value_count %d, first_row %d, last_row %d, target_value_count %d, capped_target_value_count %d\n", - s->input_value_count, first_row, last_row, target_value_count, capped_target_value_count); } - } - int const valid_map_offset = 
ni.valid_map_offset; int const row_index_lower_bound = s->row_index_lower_bound; __syncthreads(); while (value_count < capped_target_value_count) { - if constexpr (enable_print) { - if(t == 0) { printf("FLAT VALUE COUNT: %d\n", value_count); } - } - int const batch_size = min(max_batch_size, capped_target_value_count - value_count); int const thread_value_count = t; @@ -640,23 +573,12 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( // how many (input) values we've processed in the page so far, prior to this loop iteration int value_count = s->input_value_count; - static constexpr bool enable_print = false; - static constexpr bool enable_print_range_error = false; - static constexpr bool enable_print_large_list = false; - // how many rows we've processed in the page so far int input_row_count = s->input_row_count; - if constexpr (enable_print) { - if (t == 0) { printf("value_count %d, input_row_count %d\n", value_count, input_row_count); } - } // cap by last row so that we don't process any rows past what we want to output. int const first_row = s->first_row; int const last_row = first_row + s->num_rows; - if constexpr (enable_print) { - if (t == 0) { printf("LIST s->input_value_count %d, first_row %d, last_row %d, target_value_count %d\n", - s->input_value_count, first_row, last_row, target_value_count); } - } int const row_index_lower_bound = s->row_index_lower_bound; int const max_depth = s->col.max_nesting_depth - 1; @@ -669,10 +591,6 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( bool const is_first_lane = (warp_lane == 0); while (value_count < target_value_count) { - - if constexpr (enable_print) { - if(t == 0) { printf("LIST VALUE COUNT: %d\n", value_count); } - } bool const within_batch = value_count + t < target_value_count; // get definition level, use repitition level to get start/end depth @@ -689,28 +607,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( } //computed by generate_depth_remappings() - if constexpr (enable_print || enable_print_range_error) { - if((rep_level < 0) || (rep_level > max_depth)) { - printf("WHOA: rep level %d out of bounds %d!\n", rep_level, max_depth); - } - if(nullable && ((def_level < 0)/* || (def_level > (max_depth + 1)) */ )) { - printf("WHOA: def level %d out of bounds (max_depth %d) (index %d)!\n", def_level, max_depth, index); - } - } - start_depth = s->nesting_info[rep_level].start_depth; - if constexpr (enable_print || enable_print_range_error) { - if((start_depth < 0) || (start_depth > (max_depth + 1))) { - printf("WHOA: start_depth %d out of bounds (max_depth %d) (index %d)!\n", start_depth, max_depth, index); - } - if((end_depth < 0) || (end_depth > (max_depth + 1))) { - printf("WHOA: end_depth %d out of bounds (max_depth %d) (index %d)!\n", end_depth, max_depth, index); - } - } - if constexpr (enable_print) { - if (t == 0) { printf("t %d, def_level %d, rep_level %d, start_depth %d, end_depth %d, max_depth %d\n", \ - t, def_level, rep_level, start_depth, end_depth, max_depth); } - } } //Determine value count & row index @@ -725,25 +622,6 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( total_num_new_rows = new_row_scan_results.block_count; } -if constexpr (enable_print_large_list) { - if(within_batch && (bool(is_new_row) != (t % 4 == 0))) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, is_new_row %d, start_depth %d\n", - blockIdx.x, value_count, target_value_count, t, is_new_row, start_depth); - } - if(within_batch && (num_prior_new_rows != ((t + 3) 
/ 4))) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, num_prior_new_rows %d\n", - blockIdx.x, value_count, target_value_count, t, num_prior_new_rows); - } - if((value_count + 128 <= target_value_count) && (total_num_new_rows != 32)) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, total_num_new_rows %d\n", - blockIdx.x, value_count, target_value_count, t, total_num_new_rows); - } -} - - if constexpr (enable_print) { - if (t == 0) { printf("num_prior_new_rows %d, total_num_new_rows %d\n", num_prior_new_rows, total_num_new_rows); } - } - int const row_index = input_row_count + (num_prior_new_rows + is_new_row - 1); input_row_count += total_num_new_rows; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); @@ -754,13 +632,6 @@ if constexpr (enable_print_large_list) { // is from/in current rep level to/in the rep level AT the depth with the def value int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 1 : 0; - if constexpr (enable_print) { - if(t == 0) { printf("LIST ROWS: row_index %d, row_index_lower_bound %d, last_row %d, in_row_bounds %d, in_nesting_bounds %d\n", - row_index, row_index_lower_bound, last_row, in_row_bounds, in_nesting_bounds); } - if (t < 32) { printf("t %d, is_new_row %d, num_prior_new_rows %d, row_index %d, in_row_bounds %d\n", - t, is_new_row, num_prior_new_rows, row_index, in_row_bounds); } - } - // queries is_valid from all threads, stores prior total and total total //WARP VALUE COUNT: @@ -775,29 +646,6 @@ if constexpr (enable_print_large_list) { block_value_count = value_count_scan_results.block_count; } -if constexpr (enable_print_large_list) { - if(within_batch && in_row_bounds && (in_nesting_bounds != (t % 4 == 0))) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, in_nesting_bounds %d, start_depth %d, end_depth %d, " - "in_row_bounds %d, row_index %d, input_row_count %d, row_index_lower_bound %d, last_row %d, first_row %d, s->num_rows %d\n", - blockIdx.x, value_count, target_value_count, t, in_nesting_bounds, start_depth, end_depth, in_row_bounds, row_index, input_row_count, - row_index_lower_bound, last_row, first_row, s->num_rows); - } - if(within_batch && in_row_bounds && (thread_value_count != ((t + 3) / 4))) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, thread_value_count %d\n", - blockIdx.x, value_count, target_value_count, t, thread_value_count); - } - if((value_count + 128 <= target_value_count) && (input_row_count + total_num_new_rows <= last_row) && (block_value_count != 32)) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, block_value_count %d\n", - blockIdx.x, value_count, target_value_count, t, block_value_count); - } -} - - if constexpr (enable_print) { - if (t == 0) { printf("block_value_count %d\n", block_value_count); } - if (t < 32) { printf("t %d, thread_value_count %d, in_nesting_bounds %d\n", - t, thread_value_count, in_nesting_bounds); } - } - // column is either nullable or is a list (or both): iterate by depth for (int d_idx = 0; d_idx <= max_depth; d_idx++) { @@ -811,13 +659,6 @@ if constexpr (enable_print_large_list) { is_valid = in_nesting_bounds; } - if constexpr (enable_print) { - if (t == 0) { printf("nullable %d, depth %d, max_depth %d, max_def_level %d, value_count %d\n", - int(nullable), d_idx, max_depth, ni.max_def_level, value_count); } - if (t < 32) { printf("t %d, def_level %d, 
in_nesting_bounds %d, is_valid %d\n", - t, def_level, in_nesting_bounds, is_valid); } - } - // thread and block validity count // queries is_valid of all threads, stores prior total and total total @@ -844,32 +685,6 @@ if constexpr (enable_print_large_list) { block_valid_count = valid_count_scan_results.block_count; } -if constexpr (enable_print_large_list) { - if(within_batch && in_row_bounds && (((d_idx == 0) && (is_valid != (t % 4 == 0))) || ((d_idx == 1) && !is_valid))) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, is_valid %d, in_nesting_bounds %d\n", - blockIdx.x, value_count, target_value_count, t, d_idx, is_valid, in_nesting_bounds); - } - if (within_batch && in_row_bounds && (((d_idx == 0) && (thread_valid_count != ((t + 3)/ 4))) || ((d_idx == 1) && (thread_valid_count != t)))) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, thread_valid_count %d\n", - blockIdx.x, value_count, target_value_count, t, d_idx, thread_valid_count); - } - if((value_count + 128 <= target_value_count) && (input_row_count + total_num_new_rows <= last_row) && (((d_idx == 0) && (block_valid_count != 32)) || ((d_idx == 1) && (block_valid_count != 128)))) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, d_idx %d, block_valid_count %d\n", - blockIdx.x, value_count, target_value_count, t, d_idx, block_valid_count); - } -} - - if constexpr (enable_print) { - if((block_valid_count == 0) && (t == 0) && (d_idx == max_depth)) { - printf("EMPTY VALID MASK: def_level %d, max_def_level %d, in_nesting_bounds %d, start_depth %d, " - "end_depth %d, in_row_bounds %d, row_index %d, row_index_lower_bound %d, last_row %d, input_row_count %d\n", - def_level, ni.max_def_level, in_nesting_bounds, start_depth, end_depth, in_row_bounds, row_index, - row_index_lower_bound, last_row, input_row_count); } - - if (t == 0) { printf("block_valid_count %u\n", int(block_valid_count)); } - if (t < 32) { printf("t %d, thread_valid_count %d\n", t, thread_valid_count); } - } - // compute warp and thread value counts for the -next- nesting level. we need to // do this for nested schemas so that we can emit an offset for the -current- nesting // level. 
more concretely : the offset for the current nesting level == current length of the @@ -893,28 +708,6 @@ if constexpr (enable_print_large_list) { next_block_value_count = next_value_count_scan_results.block_count; } -if constexpr (enable_print_large_list) { - if(within_batch && in_row_bounds && (next_in_nesting_bounds != 1)) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, next_in_nesting_bounds %d, start_depth %d, end_depth %d, in_row_bounds %d, row_index %d, input_row_count %d\n", - blockIdx.x, value_count, target_value_count, t, next_in_nesting_bounds, start_depth, end_depth, in_row_bounds, row_index, input_row_count); - } - if(within_batch && in_row_bounds && (next_thread_value_count != t)) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, next_thread_value_count %d\n", - blockIdx.x, value_count, target_value_count, t, next_thread_value_count); - } - if((value_count + 128 <= target_value_count) && (input_row_count + total_num_new_rows <= last_row) && (next_block_value_count != 128)) { - printf("CUB GARBAGE: blockIdx.x %d, value_count %d, target_value_count %d, t %d, next_block_value_count %d\n", - blockIdx.x, value_count, target_value_count, t, next_block_value_count); - } -} - - if constexpr (enable_print) { - if (t == 0) { printf("next depth %d, next_block_value_count %d\n", d_idx + 1, next_block_value_count); } - if (t < 32) { printf("t %d, start_depth %d, end_depth %d, in_row_bounds %d, next_in_nesting_bounds %d\n", - t, start_depth, end_depth, in_row_bounds, next_in_nesting_bounds); } - if (t < 32) { printf("t %d, next_thread_value_count %d\n", t, next_thread_value_count); } - } - // if we're -not- at a leaf column and we're within nesting/row bounds // and we have a valid data_out pointer, it implies this is a list column, so // emit an offset. @@ -925,34 +718,6 @@ if constexpr (enable_print_large_list) { //STORE THE OFFSET FOR THE NEW LIST LOCATION (reinterpret_cast(ni.data_out))[idx] = ofs; - -/* -if constexpr (enable_print_large_list) { - int overall_index = 4*(blockIdx.x * 20000 + idx); - if(overall_index != ofs) { - printf("WHOA BAD OFFSET\n"); - printf("WHOA BAD OFFSET: WROTE %d to %d! 
t %d, blockIdx.x %d, idx %d, d_idx %d, start_depth %d, end_depth %d, max_depth %d, " - "in_row_bounds %d, in_nesting_bounds %d, next_in_nesting_bounds %d, row_index %d, row_index_lower_bound %d, last_row %d, " - "input_row_count %d, num_prior_new_rows %d, is_new_row %d, total_num_new_rows %d, def_level %d, ni.value_count %d, " - "thread_value_count %d, next_ni.value_count %d, next_thread_value_count %d, next_ni.page_start_value %d, value_count %d, " - "target_value_count %d, block_value_count %d, next_block_value_count %d\n", - ofs, overall_index, t, blockIdx.x, idx, d_idx, start_depth, end_depth, max_depth, in_row_bounds, in_nesting_bounds, - next_in_nesting_bounds, row_index, row_index_lower_bound, last_row, input_row_count, num_prior_new_rows, is_new_row, - total_num_new_rows, def_level, ni.value_count, thread_value_count, next_ni.value_count, - next_thread_value_count, next_ni.page_start_value, value_count, target_value_count, block_value_count, next_block_value_count); - } -} -*/ - if constexpr (enable_print || enable_print_range_error) { - if((idx < 0) || (idx > 50000)){ printf("WHOA: offset index %d out of bounds!\n", idx); } - if(ofs < 0){ printf("WHOA: offset value %d out of bounds!\n", ofs); } - } - - if constexpr (enable_print) { - if(idx < 0) { printf("WHOA: offset index out of bounds!\n"); } - if (t < 32) { printf("OFFSETS: t %d, idx %d, next value count %d, next page_start_value %d, ofs %d\n", - t, idx, next_ni.value_count, next_ni.page_start_value, ofs); } - } } } @@ -975,19 +740,10 @@ if constexpr (enable_print_large_list) { // DON'T subtract by first_row: since it's lists it's not 1-row-per-value int const bit_offset = ni.valid_map_offset + thread_value_count; store_validity(bit_offset, ni.valid_map, warp_valid_mask, warp_value_count); - - if constexpr (enable_print) { - printf("STORE VALIDITY: t %d, depth %d, thread_value_count %d, valid_map_offset %d, bit_offset %d, warp_value_count %d, warp_valid_mask %u\n", - t, d_idx, thread_value_count, ni.valid_map_offset, bit_offset, warp_value_count, warp_valid_mask); - } } if (t == 0) { size_type const block_null_count = block_value_count - block_valid_count; - if constexpr (enable_print) { - if (t == 0) { printf("BLOCK NULLS: depth %d, prior %d, block_null_count %u\n", - d_idx, ni.null_count, block_null_count); } - } ni.null_count += block_null_count; } } @@ -1003,23 +759,6 @@ if constexpr (enable_print_large_list) { int const src_pos = max_depth_valid_count + thread_valid_count; int const output_index = rolling_index(src_pos); - if constexpr (enable_print || enable_print_range_error) { - if((output_index < 0) || (output_index >= state_buf::nz_buf_size)) { - printf("WHOA: output index STORE %d out of bounds!\n", output_index); - } - if(dst_pos < 0) { printf("WHOA: dst_pos STORE %d out of bounds!\n", dst_pos); } - } - - if constexpr (enable_print) { - if (t == 0) { printf("ni.value_count %d, max_depth_valid_count %d\n", int(ni.value_count), max_depth_valid_count); } - if (t < 32) { printf("t %d, src_pos %d, output_index %d\n", t, src_pos, output_index); } - - if((t == 0) && (src_pos == 0)) {printf("SPECIAL: output_index %d, dst_pos %d, ni.value_count %d, max_depth_valid_count %d, thread_value_count %d, thread_valid_count %d\n", - output_index, dst_pos, ni.value_count, max_depth_valid_count, thread_value_count, thread_valid_count);} - - if (t == 0) { printf("OUTPUT_INDICES: output_index %d, dst_pos %d\n", output_index, dst_pos); } - } - //Index from rolling buffer of values (which doesn't include nulls) to final array (which 
includes gaps for nulls) sb->nz_idx[output_index] = dst_pos; } @@ -1041,18 +780,10 @@ if constexpr (enable_print_large_list) { thread_value_count_within_warp = next_thread_value_count_within_warp; } //END OF DEPTH LOOP - if constexpr (enable_print) { - if (t == 0) { printf("END DEPTH LOOP\n"); } - } - int const batch_size = min(max_batch_size, target_value_count - value_count); value_count += batch_size; } - if constexpr (enable_print) { - if (t == 0) { printf("END LOOP\n"); } - } - if (t == 0) { // update valid value count for decoding and total # of values we've processed s->nesting_info[max_depth].valid_count = max_depth_valid_count; @@ -1096,21 +827,13 @@ __device__ inline bool maybe_has_nulls(page_state_s* s) template __device__ int skip_decode(stream_type& parquet_stream, int num_to_skip, int t) { - static constexpr bool enable_print = false; - // it could be that (e.g.) we skip 5000 but starting at row 4000 we have a run of length 2000: // in that case skip_decode() only skips 4000, and we have to process the remaining 1000 up front // modulo 2 * block_size of course, since that's as many as we process at once int num_skipped = parquet_stream.skip_decode(t, num_to_skip); - if constexpr (enable_print) { - if (t == 0) { printf("SKIPPED: num_skipped %d, for %d\n", num_skipped, num_to_skip); } - } while (num_skipped < num_to_skip) { auto const to_decode = min(2 * decode_block_size_t, num_to_skip - num_skipped); num_skipped += parquet_stream.decode_next(t, to_decode); - if constexpr (enable_print) { - if (t == 0) { printf("EXTRA SKIPPED: to_decode %d, at %d, for %d\n", to_decode, num_skipped, num_to_skip); } - } __syncthreads(); } @@ -1227,27 +950,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) s->page.num_input_values); } - static constexpr bool enable_print = false; - rle_stream dict_stream{dict_runs}; if constexpr (has_dict_t) { dict_stream.init( s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values); - if constexpr (enable_print) { - if(t == 0) { printf("INIT DICT: dict_bits %d, data_start %p, data_end %p, dict_idx %p, page.num_input_values %d, s->dict_pos %d \n", - s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values, s->dict_pos); } - } - } - - if constexpr (enable_print) { - if((t == 0) && (page_idx == 0)){ - printf("SIZES: shared_rep_size %d, shared_dict_size %d, shared_def_size %d\n", shared_rep_size, shared_dict_size, shared_def_size); - } - if constexpr (has_lists_t){ - printf("Is fixed list page\n"); - } else { - printf("Is fixed non-list page\n"); - } } // We use two counters in the loop below: processed_count and valid_count. 
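// Before that loop starts, list columns may also need to fast-forward past leaf values that fall
// before the requested rows; the skip_decode() helper shown above handles the case where the run
// overlapping the skip point must be finished off in batches of at most 2 * decode_block_size.
// A small host-side model of that remainder loop, under an assumed layout where skip_runs() has
// already advanced 4000 of the 5000 values to skip and decode_next() is modelled as always
// processing the full batch it is given:
#include <algorithm>

int skipped_after_decode(int num_to_skip, int skipped_by_runs, int decode_block_size)
{
  int num_skipped = skipped_by_runs;  // e.g. 4000 from skip_runs()
  while (num_skipped < num_to_skip) {
    int const to_decode = std::min(2 * decode_block_size, num_to_skip - num_skipped);
    num_skipped += to_decode;  // decode_next() processes this batch
  }
  return num_skipped;
}
// skipped_after_decode(5000, 4000, 128) == 5000: three 256-value batches plus one of 232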
@@ -1275,36 +981,6 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) } } - if constexpr (enable_print) { - if(t == 0) { printf("page_idx %d, should_process_nulls %d, has_lists_t %d, has_dict_t %d, num_rows %lu, page.num_input_values %d\n", - page_idx, int(should_process_nulls), int(has_lists_t), int(has_dict_t), num_rows, s->page.num_input_values); } - } - - auto print_nestings = [&](bool is_post){ - if constexpr (enable_print) { - auto print_nesting_level = [&](const PageNestingDecodeInfo& ni) { - printf("page_idx %d, max_def_level %d, start_depth %d, end_depth %d, page_start_value %d, null_count %d, " - "valid_map_offset %d, valid_count %d, value_count %d\n", - page_idx, ni.max_def_level, ni.start_depth, ni.end_depth, ni.page_start_value, ni.null_count, - ni.valid_map_offset, ni.valid_count, ni.value_count); - }; - - if(t == 0) { - printf("POST %d NESTING 0: ", int(is_post)); - print_nesting_level(s->nesting_info[0]); - printf("POST %d NESTING 1: ", int(is_post)); - print_nesting_level(s->nesting_info[1]); - //printf("POST %d NESTING 2: ", int(is_post)); - //print_nesting_level(s->nesting_info[2]); - } - } - }; - - print_nestings(false); - if constexpr (enable_print) { - if(t == 0) {printf("LOOP START page_idx %d\n", page_idx);} - } - int last_row = s->first_row + s->num_rows; while ((s->error == 0) && (processed_count < s->page.num_input_values) && (s->input_row_count <= last_row)) { @@ -1318,22 +994,11 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) if constexpr (has_lists_t) { rep_decoder.decode_next(t); __syncthreads(); - - int value_count = s->input_value_count; next_valid_count = gpuUpdateValidityAndRowIndicesLists( processed_count, s, sb, def, rep, t); - if constexpr (enable_print) { - if(t == 0) { printf("LISTS NEXT: next_valid_count %d\n", next_valid_count); } - if(t == 0) { printf("PROCESSING: page total values %d, num_input_values %d, pre value_count %d, post value_count %d, " - "processed_count %d, valid_count %d, next_valid_count %d\n", - s->page.num_input_values, s->input_value_count, value_count, s->input_value_count, processed_count, valid_count, next_valid_count); } - } } else if constexpr (has_nesting_t) { next_valid_count = gpuUpdateValidityAndRowIndicesNested( processed_count, s, sb, def, t); - if constexpr (enable_print) { - if(t == 0) { printf("NESTED NEXT: next_valid_count %d\n", next_valid_count); } - } } else { next_valid_count = gpuUpdateValidityAndRowIndicesFlat( processed_count, s, sb, def, t); @@ -1346,7 +1011,6 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) if constexpr (has_lists_t) { processed_count += rep_decoder.decode_next(t); __syncthreads(); - next_valid_count = gpuUpdateValidityAndRowIndicesLists( processed_count, s, sb, nullptr, rep, t); @@ -1371,15 +1035,8 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) __syncthreads(); valid_count = next_valid_count; - - if constexpr (enable_print) { - if(t == 0) { printf("LOOP: processed_count %d, #page values %d, error %d\n", - processed_count, s->page.num_input_values, s->error); } - } } if (t == 0 and s->error != 0) { set_error(s->error, error_code); } - - print_nestings(true); } } // anonymous namespace From e84af82274bd241f39c686f1286eed02ebb4f2bc Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Tue, 8 Oct 2024 13:03:14 -0400 Subject: [PATCH 28/36] cleanup --- cpp/src/io/parquet/decode_fixed.cu | 46 +++++++++++++----------------- cpp/src/io/parquet/page_decode.cuh | 2 +- cpp/src/io/parquet/page_hdr.cu | 22 +++++--------- cpp/src/io/parquet/rle_stream.cuh | 30 
+++++-------------- 4 files changed, 35 insertions(+), 65 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index e7d7582cd2c..d60b4f79168 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -80,40 +80,35 @@ __device__ inline void gpuDecodeFixedWidthValues( constexpr int num_warps = block_size / cudf::detail::warp_size; constexpr int max_batch_size = num_warps * cudf::detail::warp_size; - PageNestingDecodeInfo* nesting_info_base = s->nesting_info; - int const dtype = s->col.physical_type; - + // nesting level that is storing actual leaf values int const leaf_level_index = s->col.max_nesting_depth - 1; + auto const data_out = s->nesting_info[leaf_level_index].data_out; + + int const dtype = s->col.physical_type; uint32_t dtype_len = s->dtype_len; - auto const data_out = nesting_info_base[leaf_level_index].data_out; + uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; // decode values int pos = start; while (pos < end) { int const batch_size = min(max_batch_size, end - pos); - int const target_pos = pos + batch_size; int src_pos = pos + t; - // the position in the output column/buffer -//Index from rolling buffer of values (which doesn't include nulls) to final array (which includes gaps for nulls) - auto offset = sb->nz_idx[rolling_index(src_pos)]; - int dst_pos = offset; + //Index from value buffer (doesn't include nulls) to final array (has gaps for nulls) + int dst_pos = sb->nz_idx[rolling_index(src_pos)]; if constexpr (!has_lists_t) { dst_pos -= s->first_row; } - int dict_idx = rolling_index(src_pos + skipped_leaf_values); - int dict_pos = sb->dict_idx[dict_idx]; - // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. if (src_pos < target_pos && dst_pos >= 0) { // nesting level that is storing actual leaf values // src_pos represents the logical row position we want to read from. But in the case of - // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So our true read position + // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So src_pos // has to take into account the # of values we have to skip in the page to get to the // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. if constexpr (has_lists_t) { @@ -176,10 +171,14 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( constexpr int num_warps = block_size / warp_size; constexpr int max_batch_size = num_warps * warp_size; - PageNestingDecodeInfo* nesting_info_base = s->nesting_info; + // nesting level that is storing actual leaf values + int const leaf_level_index = s->col.max_nesting_depth - 1; + auto const data_out = s->nesting_info[leaf_level_index].data_out; + int const dtype = s->col.physical_type; auto const data_len = thrust::distance(s->data_start, s->data_end); auto const num_values = data_len / s->dtype_len_in; + uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; // decode values @@ -199,11 +198,9 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. if (src_pos < target_pos && dst_pos >= 0) { - // nesting level that is storing actual leaf values - int const leaf_level_index = s->col.max_nesting_depth - 1; // src_pos represents the logical row position we want to read from. 
But in the case of - // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So our true read position + // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So src_pos // has to take into account the # of values we have to skip in the page to get to the // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. if constexpr (has_lists_t) { @@ -212,8 +209,7 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( uint32_t dtype_len = s->dtype_len; uint8_t const* src = s->data_start + src_pos; - uint8_t* dst = - nesting_info_base[leaf_level_index].data_out + static_cast(dst_pos) * dtype_len; + uint8_t* dst = data_out + static_cast(dst_pos) * dtype_len; auto const is_decimal = s->col.logical_type.has_value() and s->col.logical_type->type == LogicalType::DECIMAL; @@ -862,7 +858,7 @@ template typename DecodeValuesFunc> -CUDF_KERNEL void __launch_bounds__(decode_block_size_t) +CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) gpuDecodePageDataGeneric(PageInfo* pages, device_span chunks, size_t min_row, @@ -907,18 +903,16 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) bool const should_process_nulls = is_nullable(s) && maybe_has_nulls(s); // shared buffer. all shared memory is suballocated out of here - static constexpr auto align_test = false; - static constexpr size_t buffer_alignment = align_test ? 128 : 16; constexpr int shared_rep_size = has_lists_t ? cudf::util::round_up_unsafe(rle_run_buffer_size * - sizeof(rle_run), buffer_alignment) : 0; + sizeof(rle_run), size_t{16}) : 0; constexpr int shared_dict_size = has_dict_t - ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), buffer_alignment) + ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}) : 0; constexpr int shared_def_size = - cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), buffer_alignment); + cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}); constexpr int shared_buf_size = shared_rep_size + shared_dict_size + shared_def_size; - __shared__ __align__(buffer_alignment) uint8_t shared_buf[shared_buf_size]; + __shared__ __align__(16) uint8_t shared_buf[shared_buf_size]; // setup all shared memory buffers int shared_offset = 0; diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index 3adc02c9387..9ed2929a70e 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -1431,4 +1431,4 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, return true; } -} // namespace cudf::io::parquet::detail \ No newline at end of file +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 3fad8e344ea..85a55fa97c9 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -181,29 +181,21 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page, } else if (is_string_col(chunk)) { // check for string before byte_stream_split so FLBA will go to the right kernel return decode_kernel_mask::STRING; - } - - if (is_list(chunk) && !is_string_col(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) { - if (page.encoding == Encoding::PLAIN) { - return decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST; - } else if (page.encoding == Encoding::BYTE_STREAM_SPLIT) { - return decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST; - } else if (page.encoding == Encoding::PLAIN_DICTIONARY || - page.encoding == 
Encoding::RLE_DICTIONARY) { - return decode_kernel_mask::FIXED_WIDTH_DICT_LIST; - } } - if (!is_list(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) { + if (!is_byte_array(chunk) && !is_boolean(chunk)) { if (page.encoding == Encoding::PLAIN) { - return is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED + return is_list(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST : + is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED : decode_kernel_mask::FIXED_WIDTH_NO_DICT; } else if (page.encoding == Encoding::PLAIN_DICTIONARY || page.encoding == Encoding::RLE_DICTIONARY) { - return is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_NESTED + return is_list(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_LIST : + is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_NESTED : decode_kernel_mask::FIXED_WIDTH_DICT; } else if (page.encoding == Encoding::BYTE_STREAM_SPLIT) { - return is_nested(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED + return is_list(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST : + is_nested(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED : decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT; } } diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 24db60d11b6..caa7c45840e 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -375,9 +375,9 @@ struct rle_stream { __device__ inline int skip_runs(int target_count) { //we want to process all runs UP TO BUT NOT INCLUDING the run that overlaps with the skip amount - //so thread 0 spins like crazy on fill_run_batch(), skipping writing unnecessary run info + //so threads spin like crazy on fill_run_batch(), skipping writing unnecessary run info //then when it hits the one that matters, we don't process it at all and bail as if we never started - //basically we're setting up the global vars necessary to start fill_run_batch for the first time + //basically we're setting up the rle_stream vars necessary to start fill_run_batch for the first time while (cur < end) { // bytes for the varint header uint8_t const* _cur = cur; @@ -397,9 +397,10 @@ struct rle_stream { } if((output_pos + run_size) > target_count) { - return output_pos; //bail! we've reached the starting one + return output_pos; //bail! we've reached the starting run } + //skip this run output_pos += run_size; cur += run_bytes; } @@ -412,27 +413,10 @@ struct rle_stream { { int const output_count = min(count, total_values - cur_values); - // special case. if level_bits == 0, just return all zeros. this should tremendously speed up + // if level_bits == 0, there's nothing to do // a very common case: columns with no nulls, especially if they are non-nested - if (level_bits == 0) { - cur_values = output_count; - return output_count; - } - - __shared__ int values_processed_shared; - - __syncthreads(); - - // warp 0 reads ahead and fills `runs` array to be decoded by remaining warps. - if (t == 0) { - values_processed_shared = skip_runs(output_count); - } - __syncthreads(); - - cur_values = values_processed_shared; - - // valid for every thread - return values_processed_shared; + cur_values = (level_bits == 0) ? 
output_count : skip_runs(output_count); + return cur_values; } __device__ inline int decode_next(int t) { return decode_next(t, max_output_values); } From f2007484fe06ec2d18dfee18f6fd9c2d86d269eb Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Tue, 8 Oct 2024 15:47:55 -0400 Subject: [PATCH 29/36] cleanup comments --- cpp/src/io/parquet/decode_fixed.cu | 107 ++++++++++++----------------- 1 file changed, 43 insertions(+), 64 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index d60b4f79168..2f3923de8fe 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -562,7 +562,6 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, level_t const* const rep, int t) { - //What is the output of this? Validity bits and offsets to list starts constexpr int num_warps = decode_block_size / cudf::detail::warp_size; constexpr int max_batch_size = num_warps * cudf::detail::warp_size; @@ -579,25 +578,30 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int const row_index_lower_bound = s->row_index_lower_bound; int const max_depth = s->col.max_nesting_depth - 1; int max_depth_valid_count = s->nesting_info[max_depth].valid_count; - - __syncthreads(); int const warp_index = t / cudf::detail::warp_size; int const warp_lane = t % cudf::detail::warp_size; bool const is_first_lane = (warp_lane == 0); + __syncthreads(); + while (value_count < target_value_count) { bool const within_batch = value_count + t < target_value_count; - // get definition level, use repitition level to get start/end depth + // get definition level, use repetition level to get start/end depth // different for each thread, as each thread has a different r/d int def_level = -1, start_depth = -1, end_depth = -1; if (within_batch) { int const index = rolling_index(value_count + t); int rep_level = static_cast(rep[index]); if constexpr (nullable) { - def_level = static_cast(def[index]); - end_depth = s->nesting_info[def_level].end_depth; + if(def != nullptr) { + def_level = static_cast(def[index]); + end_depth = s->nesting_info[def_level].end_depth; + } else { + def_level = 1; + end_depth = max_depth; + } } else { end_depth = max_depth; } @@ -622,15 +626,10 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( input_row_count += total_num_new_rows; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); - // thread and block value count - + // VALUE COUNT: // if we are within the range of nesting levels we should be adding value indices for // is from/in current rep level to/in the rep level AT the depth with the def value int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 
1 : 0; - - // queries is_valid from all threads, stores prior total and total total - - //WARP VALUE COUNT: int thread_value_count_within_warp, warp_value_count, thread_value_count, block_value_count; { block_scan_results value_count_scan_results; @@ -642,9 +641,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( block_value_count = value_count_scan_results.block_count; } - // column is either nullable or is a list (or both): iterate by depth + // iterate by depth for (int d_idx = 0; d_idx <= max_depth; d_idx++) { - auto& ni = s->nesting_info[d_idx]; // everything up to the max_def_level is a non-null value @@ -655,45 +653,32 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( is_valid = in_nesting_bounds; } - // thread and block validity count - // queries is_valid of all threads, stores prior total and total total - - // for nested lists, it's more complicated. This block will visit 128 incoming values, - // however not all of them will necessarily represent a value at this nesting level. so - // the validity bit for thread t might actually represent output value t-6. the correct - // position for thread t's bit is thread_value_count. - - -//WARP VALID COUNT: - // for nested schemas, it's more complicated. This warp will visit 32 incoming values, - // however not all of them will necessarily represent a value at this nesting level. so - // the validity bit for thread t might actually represent output value t-6. the correct - // position for thread t's bit is thread_value_count. for cuda 11 we could use - // __reduce_or_sync(), but until then we have to do a warp reduce. - uint32_t const warp_valid_mask = WarpReduceOr32((uint32_t)is_valid << thread_value_count_within_warp); - int thread_valid_count, block_valid_count; - { - auto thread_mask = (uint32_t(1) << thread_value_count_within_warp) - 1; + // VALID COUNT: + // Not all values visited by this block will represent a value at this nesting level. + // the validity bit for thread t might actually represent output value t-6. + // the correct position for thread t's bit is thread_value_count. + uint32_t const warp_valid_mask = WarpReduceOr32((uint32_t)is_valid << thread_value_count_within_warp); + int thread_valid_count, block_valid_count; + { + auto thread_mask = (uint32_t(1) << thread_value_count_within_warp) - 1; - block_scan_results valid_count_scan_results; - scan_block_exclusive_sum(warp_valid_mask, warp_lane, warp_index, thread_mask, valid_count_scan_results); - thread_valid_count = valid_count_scan_results.thread_count_within_block; - block_valid_count = valid_count_scan_results.block_count; - } + block_scan_results valid_count_scan_results; + scan_block_exclusive_sum(warp_valid_mask, warp_lane, warp_index, thread_mask, valid_count_scan_results); + thread_valid_count = valid_count_scan_results.thread_count_within_block; + block_valid_count = valid_count_scan_results.block_count; + } // compute warp and thread value counts for the -next- nesting level. we need to - // do this for nested schemas so that we can emit an offset for the -current- nesting - // level. more concretely : the offset for the current nesting level == current length of the - // next nesting level + // do this for lists so that we can emit an offset for the -current- nesting level. 
+ // the offset for the current nesting level == current length of the next nesting level int next_thread_value_count_within_warp = 0, next_warp_value_count = 0; int next_thread_value_count = 0, next_block_value_count = 0; int next_in_nesting_bounds = 0; if (d_idx < max_depth) { - //mask is different between depths - next_in_nesting_bounds = - (d_idx + 1 >= start_depth && d_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0; -//NEXT WARP VALUE COUNT: + //NEXT DEPTH VALUE COUNT: + next_in_nesting_bounds = + ((d_idx + 1 >= start_depth) && (d_idx + 1 <= end_depth) && in_row_bounds) ? 1 : 0; { block_scan_results next_value_count_scan_results; scan_block_exclusive_sum(next_in_nesting_bounds, next_value_count_scan_results); @@ -704,6 +689,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( next_block_value_count = next_value_count_scan_results.block_count; } + // STORE OFFSET TO THE LIST LOCATION // if we're -not- at a leaf column and we're within nesting/row bounds // and we have a valid data_out pointer, it implies this is a list column, so // emit an offset. @@ -712,45 +698,37 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int const idx = ni.value_count + thread_value_count; cudf::size_type const ofs = next_ni.value_count + next_thread_value_count + next_ni.page_start_value; - //STORE THE OFFSET FOR THE NEW LIST LOCATION (reinterpret_cast(ni.data_out))[idx] = ofs; } } - // validity is processed per-warp (on lane 0's), because writes are atomic + // validity is processed per-warp (on lane 0's), because writes are 32-bit atomic ops // - // nested schemas always read and write to the same bounds + // lists always read and write to the same bounds // (that is, read and write positions are already pre-bounded by first_row/num_rows). // since we are about to write the validity vector // here we need to adjust our computed mask to take into account the write row bounds. if constexpr (nullable) { -//TODO: Consider OR'ING for next_thread_value_count and popc() for next_thread_value_count -//so that we don't have to take a ballot here. Is uint128 so may deconstruct to this anyway ... 
- if(is_first_lane && (ni.valid_map != nullptr) && (warp_value_count > 0)) { - // last bit in the warp to store //in old is warp_valid_mask_bit_count -//so it's a count of everything in nesting bounds, though bits can be zero if NULL at this level - // absolute bit offset into the output validity map - //is cumulative sum of warp_value_count at the given nesting depth + // is cumulative sum of warp_value_count at the given nesting depth // DON'T subtract by first_row: since it's lists it's not 1-row-per-value int const bit_offset = ni.valid_map_offset + thread_value_count; + store_validity(bit_offset, ni.valid_map, warp_valid_mask, warp_value_count); } if (t == 0) { - size_type const block_null_count = block_value_count - block_valid_count; - ni.null_count += block_null_count; + ni.null_count += block_value_count - block_valid_count; } } // if this is valid and we're at the leaf, output dst_pos - // Read these before the sync, so that when thread 0 modifies them we've already read their values + // Read value_count before the sync, so that when thread 0 modifies it we've already read its value int current_value_count = ni.value_count; - __syncthreads(); // handle modification of ni.value_count from below + __syncthreads(); // guard against modification of ni.value_count below if (d_idx == max_depth) { if (is_valid) { - // for non-list types, the value count is always the same across int const dst_pos = current_value_count + thread_value_count; int const src_pos = max_depth_valid_count + thread_valid_count; int const output_index = rolling_index(src_pos); @@ -766,7 +744,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( ni.value_count += block_value_count; ni.valid_map_offset += block_value_count; } - __syncthreads(); // handle modification of ni.value_count from below + __syncthreads(); // sync modification of ni.value_count // propagate value counts for the next depth level block_value_count = next_block_value_count; @@ -959,10 +937,8 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) // and valid_count is that running count. int processed_count = 0; int valid_count = 0; - // the core loop. decode batches of level stream data using rle_stream objects - // and pass the results to gpuDecodeValues - //For lists (which can have skipped values, skip ahead in the decoding so that we don't repeat work + // Skip ahead in the decoding so that we don't repeat work (skipped_leaf_values = 0 for non-lists) if constexpr (has_lists_t){ if(s->page.skipped_leaf_values > 0) { if (should_process_nulls) { @@ -975,6 +951,8 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) } } + // the core loop. 
decode batches of level stream data using rle_stream objects + // and pass the results to gpuDecodeValues int last_row = s->first_row + s->num_rows; while ((s->error == 0) && (processed_count < s->page.num_input_values) && (s->input_row_count <= last_row)) { @@ -1049,6 +1027,7 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, dim3 dim_block(decode_block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + if (level_type_size == 1) { if (is_list) { gpuDecodePageDataGeneric Date: Tue, 8 Oct 2024 15:54:28 -0400 Subject: [PATCH 30/36] style changes --- cpp/src/io/parquet/decode_fixed.cu | 189 +++++++++++++++-------------- cpp/src/io/parquet/page_hdr.cu | 18 +-- cpp/src/io/parquet/parquet_gpu.hpp | 7 +- cpp/src/io/parquet/rle_stream.cuh | 18 +-- 4 files changed, 119 insertions(+), 113 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 2f3923de8fe..159398a927e 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -39,7 +39,7 @@ struct block_scan_results { template __device__ inline static void scan_block_exclusive_sum(int thread_bit, block_scan_results& results) { - int const t = threadIdx.x; + int const t = threadIdx.x; int const warp_index = t / cudf::detail::warp_size; int const warp_lane = t % cudf::detail::warp_size; uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1; @@ -49,22 +49,26 @@ __device__ inline static void scan_block_exclusive_sum(int thread_bit, block_sca } template -__device__ inline static void scan_block_exclusive_sum(uint32_t warp_bits, int warp_lane, int warp_index, uint32_t lane_mask, block_scan_results& results) +__device__ inline static void scan_block_exclusive_sum(uint32_t warp_bits, + int warp_lane, + int warp_index, + uint32_t lane_mask, + block_scan_results& results) { - //Compute # warps + // Compute # warps constexpr int num_warps = decode_block_size / cudf::detail::warp_size; - - //Compute the warp-wide results + + // Compute the warp-wide results results.warp_bits = warp_bits; results.warp_count = __popc(results.warp_bits); results.thread_count_within_warp = __popc(results.warp_bits & lane_mask); - //Share the warp counts amongst the block threads + // Share the warp counts amongst the block threads __shared__ int warp_counts[num_warps]; if (warp_lane == 0) { warp_counts[warp_index] = results.warp_count; } __syncthreads(); - //Compute block-wide results + // Compute block-wide results results.block_count = 0; results.thread_count_within_block = results.thread_count_within_warp; for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { @@ -82,9 +86,9 @@ __device__ inline void gpuDecodeFixedWidthValues( // nesting level that is storing actual leaf values int const leaf_level_index = s->col.max_nesting_depth - 1; - auto const data_out = s->nesting_info[leaf_level_index].data_out; + auto const data_out = s->nesting_info[leaf_level_index].data_out; - int const dtype = s->col.physical_type; + int const dtype = s->col.physical_type; uint32_t dtype_len = s->dtype_len; uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; @@ -94,13 +98,11 @@ __device__ inline void gpuDecodeFixedWidthValues( while (pos < end) { int const batch_size = min(max_batch_size, end - pos); int const target_pos = pos + batch_size; - int src_pos = pos + t; + int src_pos = pos + t; - //Index from value buffer (doesn't include nulls) to final array (has gaps for nulls) + // Index from value buffer (doesn't include nulls) to final array (has gaps for 
nulls) int dst_pos = sb->nz_idx[rolling_index(src_pos)]; - if constexpr (!has_lists_t) { - dst_pos -= s->first_row; - } + if constexpr (!has_lists_t) { dst_pos -= s->first_row; } // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. @@ -111,9 +113,7 @@ __device__ inline void gpuDecodeFixedWidthValues( // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So src_pos // has to take into account the # of values we have to skip in the page to get to the // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. - if constexpr (has_lists_t) { - src_pos += skipped_leaf_values; - } + if constexpr (has_lists_t) { src_pos += skipped_leaf_values; } void* dst = data_out + static_cast(dst_pos) * dtype_len; @@ -173,11 +173,11 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( // nesting level that is storing actual leaf values int const leaf_level_index = s->col.max_nesting_depth - 1; - auto const data_out = s->nesting_info[leaf_level_index].data_out; + auto const data_out = s->nesting_info[leaf_level_index].data_out; - int const dtype = s->col.physical_type; - auto const data_len = thrust::distance(s->data_start, s->data_end); - auto const num_values = data_len / s->dtype_len_in; + int const dtype = s->col.physical_type; + auto const data_len = thrust::distance(s->data_start, s->data_end); + auto const num_values = data_len / s->dtype_len_in; uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; @@ -187,29 +187,24 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( int const batch_size = min(max_batch_size, end - pos); int const target_pos = pos + batch_size; - int src_pos = pos + t; + int src_pos = pos + t; // the position in the output column/buffer int dst_pos = sb->nz_idx[rolling_index(src_pos)]; - if constexpr (!has_lists_t) { - dst_pos -= s->first_row; - } + if constexpr (!has_lists_t) { dst_pos -= s->first_row; } // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. if (src_pos < target_pos && dst_pos >= 0) { - // src_pos represents the logical row position we want to read from. But in the case of // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So src_pos // has to take into account the # of values we have to skip in the page to get to the // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. 
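// The source/destination index mapping described above, pulled out into a standalone helper for
// illustration (map_positions is hypothetical; the kernels inline this logic with if constexpr).
// dst comes from the nz_idx ring buffer, which already accounts for gaps left by nulls; flat
// columns shift the write side by first_row, lists shift the read side by skipped_leaf_values.
#include <cstdint>

struct leaf_positions {
  int src;  // position to read from the value stream
  int dst;  // position to write in the output column
};

template <bool has_lists>
__device__ inline leaf_positions map_positions(int src_pos,
                                               int nz_dst,
                                               int first_row,
                                               uint32_t skipped_leaf_values)
{
  int src = src_pos;
  int dst = nz_dst;
  if constexpr (!has_lists) { dst -= first_row; }  // may be negative for values before first_row
  if constexpr (has_lists) { src += static_cast<int>(skipped_leaf_values); }
  return {src, dst};
}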
- if constexpr (has_lists_t) { - src_pos += skipped_leaf_values; - } + if constexpr (has_lists_t) { src_pos += skipped_leaf_values; } uint32_t dtype_len = s->dtype_len; uint8_t const* src = s->data_start + src_pos; - uint8_t* dst = data_out + static_cast(dst_pos) * dtype_len; + uint8_t* dst = data_out + static_cast(dst_pos) * dtype_len; auto const is_decimal = s->col.logical_type.has_value() and s->col.logical_type->type == LogicalType::DECIMAL; @@ -558,9 +553,12 @@ static __device__ int gpuUpdateValidityAndRowIndicesNonNullable(int32_t target_v } template -static __device__ int gpuUpdateValidityAndRowIndicesLists( - int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, - level_t const* const rep, int t) +static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_count, + page_state_s* s, + state_buf* sb, + level_t const* const def, + level_t const* const rep, + int t) { constexpr int num_warps = decode_block_size / cudf::detail::warp_size; constexpr int max_batch_size = num_warps * cudf::detail::warp_size; @@ -572,13 +570,13 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int input_row_count = s->input_row_count; // cap by last row so that we don't process any rows past what we want to output. - int const first_row = s->first_row; - int const last_row = first_row + s->num_rows; + int const first_row = s->first_row; + int const last_row = first_row + s->num_rows; int const row_index_lower_bound = s->row_index_lower_bound; - int const max_depth = s->col.max_nesting_depth - 1; - int max_depth_valid_count = s->nesting_info[max_depth].valid_count; - + int const max_depth = s->col.max_nesting_depth - 1; + int max_depth_valid_count = s->nesting_info[max_depth].valid_count; + int const warp_index = t / cudf::detail::warp_size; int const warp_lane = t % cudf::detail::warp_size; bool const is_first_lane = (warp_lane == 0); @@ -593,9 +591,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int def_level = -1, start_depth = -1, end_depth = -1; if (within_batch) { int const index = rolling_index(value_count + t); - int rep_level = static_cast(rep[index]); + int rep_level = static_cast(rep[index]); if constexpr (nullable) { - if(def != nullptr) { + if (def != nullptr) { def_level = static_cast(def[index]); end_depth = s->nesting_info[def_level].end_depth; } else { @@ -606,13 +604,13 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( end_depth = max_depth; } - //computed by generate_depth_remappings() + // computed by generate_depth_remappings() start_depth = s->nesting_info[rep_level].start_depth; } - //Determine value count & row index - // track (page-relative) row index for the thread so we can compare against input bounds - // keep track of overall # of rows we've read. + // Determine value count & row index + // track (page-relative) row index for the thread so we can compare against input bounds + // keep track of overall # of rows we've read. int const is_new_row = start_depth == 0 ? 
1 : 0; int num_prior_new_rows, total_num_new_rows; { @@ -636,9 +634,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( scan_block_exclusive_sum(in_nesting_bounds, value_count_scan_results); thread_value_count_within_warp = value_count_scan_results.thread_count_within_warp; - warp_value_count = value_count_scan_results.warp_count; - thread_value_count = value_count_scan_results.thread_count_within_block; - block_value_count = value_count_scan_results.block_count; + warp_value_count = value_count_scan_results.warp_count; + thread_value_count = value_count_scan_results.thread_count_within_block; + block_value_count = value_count_scan_results.block_count; } // iterate by depth @@ -654,18 +652,20 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( } // VALID COUNT: - // Not all values visited by this block will represent a value at this nesting level. - // the validity bit for thread t might actually represent output value t-6. - // the correct position for thread t's bit is thread_value_count. - uint32_t const warp_valid_mask = WarpReduceOr32((uint32_t)is_valid << thread_value_count_within_warp); + // Not all values visited by this block will represent a value at this nesting level. + // the validity bit for thread t might actually represent output value t-6. + // the correct position for thread t's bit is thread_value_count. + uint32_t const warp_valid_mask = + WarpReduceOr32((uint32_t)is_valid << thread_value_count_within_warp); int thread_valid_count, block_valid_count; { auto thread_mask = (uint32_t(1) << thread_value_count_within_warp) - 1; block_scan_results valid_count_scan_results; - scan_block_exclusive_sum(warp_valid_mask, warp_lane, warp_index, thread_mask, valid_count_scan_results); + scan_block_exclusive_sum( + warp_valid_mask, warp_lane, warp_index, thread_mask, valid_count_scan_results); thread_valid_count = valid_count_scan_results.thread_count_within_block; - block_valid_count = valid_count_scan_results.block_count; + block_valid_count = valid_count_scan_results.block_count; } // compute warp and thread value counts for the -next- nesting level. we need to @@ -675,18 +675,19 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( int next_thread_value_count = 0, next_block_value_count = 0; int next_in_nesting_bounds = 0; if (d_idx < max_depth) { - - //NEXT DEPTH VALUE COUNT: - next_in_nesting_bounds = + // NEXT DEPTH VALUE COUNT: + next_in_nesting_bounds = ((d_idx + 1 >= start_depth) && (d_idx + 1 <= end_depth) && in_row_bounds) ? 1 : 0; { block_scan_results next_value_count_scan_results; - scan_block_exclusive_sum(next_in_nesting_bounds, next_value_count_scan_results); + scan_block_exclusive_sum(next_in_nesting_bounds, + next_value_count_scan_results); - next_thread_value_count_within_warp = next_value_count_scan_results.thread_count_within_warp; - next_warp_value_count = next_value_count_scan_results.warp_count; + next_thread_value_count_within_warp = + next_value_count_scan_results.thread_count_within_warp; + next_warp_value_count = next_value_count_scan_results.warp_count; next_thread_value_count = next_value_count_scan_results.thread_count_within_block; - next_block_value_count = next_value_count_scan_results.block_count; + next_block_value_count = next_value_count_scan_results.block_count; } // STORE OFFSET TO THE LIST LOCATION @@ -695,8 +696,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( // emit an offset. 
if (in_nesting_bounds && ni.data_out != nullptr) { const auto& next_ni = s->nesting_info[d_idx + 1]; - int const idx = ni.value_count + thread_value_count; - cudf::size_type const ofs = next_ni.value_count + next_thread_value_count + next_ni.page_start_value; + int const idx = ni.value_count + thread_value_count; + cudf::size_type const ofs = + next_ni.value_count + next_thread_value_count + next_ni.page_start_value; (reinterpret_cast(ni.data_out))[idx] = ofs; } @@ -704,12 +706,12 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( // validity is processed per-warp (on lane 0's), because writes are 32-bit atomic ops // - // lists always read and write to the same bounds - // (that is, read and write positions are already pre-bounded by first_row/num_rows). + // lists always read and write to the same bounds + // (that is, read and write positions are already pre-bounded by first_row/num_rows). // since we are about to write the validity vector // here we need to adjust our computed mask to take into account the write row bounds. if constexpr (nullable) { - if(is_first_lane && (ni.valid_map != nullptr) && (warp_value_count > 0)) { + if (is_first_lane && (ni.valid_map != nullptr) && (warp_value_count > 0)) { // absolute bit offset into the output validity map // is cumulative sum of warp_value_count at the given nesting depth // DON'T subtract by first_row: since it's lists it's not 1-row-per-value @@ -718,22 +720,22 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( store_validity(bit_offset, ni.valid_map, warp_valid_mask, warp_value_count); } - if (t == 0) { - ni.null_count += block_value_count - block_valid_count; - } + if (t == 0) { ni.null_count += block_value_count - block_valid_count; } } // if this is valid and we're at the leaf, output dst_pos - // Read value_count before the sync, so that when thread 0 modifies it we've already read its value + // Read value_count before the sync, so that when thread 0 modifies it we've already read its + // value int current_value_count = ni.value_count; __syncthreads(); // guard against modification of ni.value_count below if (d_idx == max_depth) { if (is_valid) { - int const dst_pos = current_value_count + thread_value_count; - int const src_pos = max_depth_valid_count + thread_valid_count; + int const dst_pos = current_value_count + thread_value_count; + int const src_pos = max_depth_valid_count + thread_valid_count; int const output_index = rolling_index(src_pos); - //Index from rolling buffer of values (which doesn't include nulls) to final array (which includes gaps for nulls) + // Index from rolling buffer of values (which doesn't include nulls) to final array (which + // includes gaps for nulls) sb->nz_idx[output_index] = dst_pos; } max_depth_valid_count += block_valid_count; @@ -747,12 +749,12 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( __syncthreads(); // sync modification of ni.value_count // propagate value counts for the next depth level - block_value_count = next_block_value_count; - thread_value_count = next_thread_value_count; - in_nesting_bounds = next_in_nesting_bounds; - warp_value_count = next_warp_value_count; + block_value_count = next_block_value_count; + thread_value_count = next_thread_value_count; + in_nesting_bounds = next_in_nesting_bounds; + warp_value_count = next_warp_value_count; thread_value_count_within_warp = next_thread_value_count_within_warp; - } //END OF DEPTH LOOP + } // END OF DEPTH LOOP int const batch_size = min(max_batch_size, target_value_count - value_count); 
value_count += batch_size; @@ -761,8 +763,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists( if (t == 0) { // update valid value count for decoding and total # of values we've processed s->nesting_info[max_depth].valid_count = max_depth_valid_count; - s->nz_count = max_depth_valid_count; - s->input_value_count = value_count; + s->nz_count = max_depth_valid_count; + s->input_value_count = value_count; // If we have lists # rows != # values s->input_row_count = input_row_count; @@ -881,8 +883,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) bool const should_process_nulls = is_nullable(s) && maybe_has_nulls(s); // shared buffer. all shared memory is suballocated out of here - constexpr int shared_rep_size = has_lists_t ? cudf::util::round_up_unsafe(rle_run_buffer_size * - sizeof(rle_run), size_t{16}) : 0; + constexpr int shared_rep_size = + has_lists_t + ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}) + : 0; constexpr int shared_dict_size = has_dict_t ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}) @@ -893,9 +897,9 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) __shared__ __align__(16) uint8_t shared_buf[shared_buf_size]; // setup all shared memory buffers - int shared_offset = 0; + int shared_offset = 0; rle_run* rep_runs = reinterpret_cast*>(shared_buf + shared_offset); - if constexpr (has_lists_t){ shared_offset += shared_rep_size; } + if constexpr (has_lists_t) { shared_offset += shared_rep_size; } rle_run* dict_runs = reinterpret_cast*>(shared_buf + shared_offset); if constexpr (has_dict_t) { shared_offset += shared_dict_size; } @@ -911,10 +915,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) def, s->page.num_input_values); } - + rle_stream rep_decoder{rep_runs}; level_t* const rep = reinterpret_cast(pp->lvl_decode_buf[level_type::REPETITION]); - if constexpr (has_lists_t){ + if constexpr (has_lists_t) { rep_decoder.init(s->col.level_bits[level_type::REPETITION], s->abs_lvl_start[level_type::REPETITION], s->abs_lvl_end[level_type::REPETITION], @@ -939,12 +943,13 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) int valid_count = 0; // Skip ahead in the decoding so that we don't repeat work (skipped_leaf_values = 0 for non-lists) - if constexpr (has_lists_t){ - if(s->page.skipped_leaf_values > 0) { + if constexpr (has_lists_t) { + if (s->page.skipped_leaf_values > 0) { if (should_process_nulls) { skip_decode(def_decoder, s->page.skipped_leaf_values, t); } - processed_count = skip_decode(rep_decoder, s->page.skipped_leaf_values, t); + processed_count = + skip_decode(rep_decoder, s->page.skipped_leaf_values, t); if constexpr (has_dict_t) { skip_decode(dict_stream, s->page.skipped_leaf_values, t); } @@ -983,12 +988,12 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) if constexpr (has_lists_t) { processed_count += rep_decoder.decode_next(t); __syncthreads(); - next_valid_count = - gpuUpdateValidityAndRowIndicesLists( - processed_count, s, sb, nullptr, rep, t); + next_valid_count = gpuUpdateValidityAndRowIndicesLists( + processed_count, s, sb, nullptr, rep, t); } else { processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); - next_valid_count = gpuUpdateValidityAndRowIndicesNonNullable(processed_count, s, sb, t); + next_valid_count = + gpuUpdateValidityAndRowIndicesNonNullable(processed_count, s, sb, t); } } __syncthreads(); diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 
85a55fa97c9..52d53cb8225 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -185,18 +185,18 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page, if (!is_byte_array(chunk) && !is_boolean(chunk)) { if (page.encoding == Encoding::PLAIN) { - return is_list(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST : - is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED - : decode_kernel_mask::FIXED_WIDTH_NO_DICT; + return is_list(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST + : is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED + : decode_kernel_mask::FIXED_WIDTH_NO_DICT; } else if (page.encoding == Encoding::PLAIN_DICTIONARY || page.encoding == Encoding::RLE_DICTIONARY) { - return is_list(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_LIST : - is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_NESTED - : decode_kernel_mask::FIXED_WIDTH_DICT; + return is_list(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_LIST + : is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_NESTED + : decode_kernel_mask::FIXED_WIDTH_DICT; } else if (page.encoding == Encoding::BYTE_STREAM_SPLIT) { - return is_list(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST : - is_nested(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED - : decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT; + return is_list(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST + : is_nested(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED + : decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT; } } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index a77a5f5ad50..695cc40297d 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -221,9 +221,10 @@ enum class decode_kernel_mask { (1 << 9), // Same as above but for nested, fixed-width data FIXED_WIDTH_NO_DICT_NESTED = (1 << 10), // Run decode kernel for fixed width non-dictionary pages FIXED_WIDTH_DICT_NESTED = (1 << 11), // Run decode kernel for fixed width dictionary pages - FIXED_WIDTH_DICT_LIST = (1 << 12), // Run decode kernel for fixed width dictionary pages for lists - FIXED_WIDTH_NO_DICT_LIST = (1 << 13), // Run decode kernel for fixed width non-dictionary pages for lists - BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST = (1 << 14), // Run decode kernel for BYTE_STREAM_SPLIT encoded data for fixed width lists + FIXED_WIDTH_DICT_LIST = (1 << 12), // Run decode kernel for fixed width dictionary pages + FIXED_WIDTH_NO_DICT_LIST = (1 << 13), // Run decode kernel for fixed width non-dictionary pages + BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST = + (1 << 14), // Run decode kernel for BYTE_STREAM_SPLIT encoded data for fixed width lists }; // mask representing all the ways in which a string can be encoded diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index caa7c45840e..9270db16c08 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -374,10 +374,11 @@ struct rle_stream { __device__ inline int skip_runs(int target_count) { - //we want to process all runs UP TO BUT NOT INCLUDING the run that overlaps with the skip amount - //so threads spin like crazy on fill_run_batch(), skipping writing unnecessary run info - //then when it hits the one that matters, we don't process it at all and bail as if we never started - //basically we're setting up the rle_stream vars necessary to start fill_run_batch for the first time + // 
we want to process all runs UP TO BUT NOT INCLUDING the run that overlaps with the skip + // amount so threads spin like crazy on fill_run_batch(), skipping writing unnecessary run info + // then when it hits the one that matters, we don't process it at all and bail as if we never + // started basically we're setting up the rle_stream vars necessary to start fill_run_batch for + // the first time while (cur < end) { // bytes for the varint header uint8_t const* _cur = cur; @@ -396,19 +397,18 @@ struct rle_stream { run_bytes += ((level_bits) + 7) >> 3; } - if((output_pos + run_size) > target_count) { - return output_pos; //bail! we've reached the starting run + if ((output_pos + run_size) > target_count) { + return output_pos; // bail! we've reached the starting run } - //skip this run + // skip this run output_pos += run_size; cur += run_bytes; } - return output_pos; //we skipped everything + return output_pos; // we skipped everything } - __device__ inline int skip_decode(int t, int count) { int const output_count = min(count, total_values - cur_values); From edc56bdd0e22b756f26c3f5ec26bcf9db3ea06d3 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Fri, 11 Oct 2024 12:44:49 -0400 Subject: [PATCH 31/36] constify variables --- cpp/benchmarks/CMakeLists.txt | 9 +- cpp/src/io/parquet/decode_fixed.cu | 149 +++++++++++++++++------------ 2 files changed, 94 insertions(+), 64 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index b8a53cd8bd9..4113e38dcf4 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -330,11 +330,11 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp) # ################################################################################################## # * ast benchmark --------------------------------------------------------------------------------- -ConfigureNVBench(AST_NVBENCH ast/transform.cpp) +ConfigureBench(AST_BENCH ast/transform.cpp) # ################################################################################################## # * binaryop benchmark ---------------------------------------------------------------------------- -ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) +ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- @@ -392,6 +392,11 @@ ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader ConfigureNVBench(JSON_READER_OPTION_NVBENCH io/json/json_reader_option.cpp) ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp) +# ################################################################################################## +# * multi buffer memset benchmark +# ---------------------------------------------------------------------- +ConfigureNVBench(BATCHED_MEMSET_BENCH io/utilities/batched_memset_bench.cpp) + # ################################################################################################## # * io benchmark --------------------------------------------------------------------- ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 159398a927e..897c0c04be1 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -88,34 +88,43 @@ 
__device__ inline void gpuDecodeFixedWidthValues( int const leaf_level_index = s->col.max_nesting_depth - 1; auto const data_out = s->nesting_info[leaf_level_index].data_out; - int const dtype = s->col.physical_type; - uint32_t dtype_len = s->dtype_len; + int const dtype = s->col.physical_type; + uint32_t const dtype_len = s->dtype_len; - uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; + int const skipped_leaf_values = s->page.skipped_leaf_values; // decode values int pos = start; while (pos < end) { int const batch_size = min(max_batch_size, end - pos); int const target_pos = pos + batch_size; - int src_pos = pos + t; + int const thread_pos = pos + t; // Index from value buffer (doesn't include nulls) to final array (has gaps for nulls) - int dst_pos = sb->nz_idx[rolling_index(src_pos)]; - if constexpr (!has_lists_t) { dst_pos -= s->first_row; } + int const dst_pos = [&]() { + int dst_pos = sb->nz_idx[rolling_index(thread_pos)]; + if constexpr (!has_lists_t) { dst_pos -= s->first_row; } + return dst_pos; + }(); // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. - if (src_pos < target_pos && dst_pos >= 0) { + if (thread_pos < target_pos && dst_pos >= 0) { // nesting level that is storing actual leaf values // src_pos represents the logical row position we want to read from. But in the case of // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So src_pos // has to take into account the # of values we have to skip in the page to get to the // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. - if constexpr (has_lists_t) { src_pos += skipped_leaf_values; } + int const src_pos = [&]() { + if constexpr (has_lists_t) { + return thread_pos + skipped_leaf_values; + } else { + return thread_pos; + } + }(); - void* dst = data_out + static_cast(dst_pos) * dtype_len; + void* const dst = data_out + static_cast(dst_pos) * dtype_len; if (s->col.logical_type.has_value() && s->col.logical_type->type == LogicalType::DECIMAL) { switch (dtype) { @@ -179,7 +188,7 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( auto const data_len = thrust::distance(s->data_start, s->data_end); auto const num_values = data_len / s->dtype_len_in; - uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; + int const skipped_leaf_values = s->page.skipped_leaf_values; // decode values int pos = start; @@ -187,24 +196,34 @@ __device__ inline void gpuDecodeFixedWidthSplitValues( int const batch_size = min(max_batch_size, end - pos); int const target_pos = pos + batch_size; - int src_pos = pos + t; + int const thread_pos = pos + t; // the position in the output column/buffer - int dst_pos = sb->nz_idx[rolling_index(src_pos)]; - if constexpr (!has_lists_t) { dst_pos -= s->first_row; } + // Index from value buffer (doesn't include nulls) to final array (has gaps for nulls) + int const dst_pos = [&]() { + int dst_pos = sb->nz_idx[rolling_index(thread_pos)]; + if constexpr (!has_lists_t) { dst_pos -= s->first_row; } + return dst_pos; + }(); // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. - if (src_pos < target_pos && dst_pos >= 0) { + if (thread_pos < target_pos && dst_pos >= 0) { // src_pos represents the logical row position we want to read from. But in the case of // nested hierarchies (lists), there is no 1:1 mapping of rows to values. 
So src_pos // has to take into account the # of values we have to skip in the page to get to the // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. - if constexpr (has_lists_t) { src_pos += skipped_leaf_values; } + int const src_pos = [&]() { + if constexpr (has_lists_t) { + return thread_pos + skipped_leaf_values; + } else { + return thread_pos; + } + }(); - uint32_t dtype_len = s->dtype_len; - uint8_t const* src = s->data_start + src_pos; - uint8_t* dst = data_out + static_cast(dst_pos) * dtype_len; + uint32_t const dtype_len = s->dtype_len; + uint8_t const* const src = s->data_start + src_pos; + uint8_t* const dst = data_out + static_cast(dst_pos) * dtype_len; auto const is_decimal = s->col.logical_type.has_value() and s->col.logical_type->type == LogicalType::DECIMAL; @@ -292,12 +311,15 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const batch_size = min(max_batch_size, capped_target_value_count - value_count); // definition level - int d = 1; - if (t >= batch_size) { - d = -1; - } else if (def) { - d = static_cast(def[rolling_index(value_count + t)]); - } + int const d = [&]() { + if (t >= batch_size) { + return -1; + } else if (def) { + return static_cast(def[rolling_index(value_count + t)]); + } else { + return 1; + } + }(); int const thread_value_count = t; int const block_value_count = batch_size; @@ -358,6 +380,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( if (is_valid) { int const dst_pos = value_count + thread_value_count; int const src_pos = max_depth_valid_count + thread_valid_count; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; } // update stuff @@ -414,16 +437,17 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); // use definition level & row bounds to determine if is valid - int is_valid; - if (t >= batch_size) { - is_valid = 0; - } else if (def) { - int const def_level = - static_cast(def[rolling_index(value_count + t)]); - is_valid = ((def_level > 0) && in_row_bounds) ? 1 : 0; - } else { - is_valid = in_row_bounds; - } + int const is_valid = [&]() { + if (t >= batch_size) { + return 0; + } else if (def) { + int const def_level = + static_cast(def[rolling_index(value_count + t)]); + return ((def_level > 0) && in_row_bounds) ? 
1 : 0; + } else { + return in_row_bounds; + } + }(); // thread and block validity count using block_scan = cub::BlockScan; @@ -588,25 +612,25 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c // get definition level, use repetition level to get start/end depth // different for each thread, as each thread has a different r/d - int def_level = -1, start_depth = -1, end_depth = -1; - if (within_batch) { - int const index = rolling_index(value_count + t); - int rep_level = static_cast(rep[index]); - if constexpr (nullable) { + auto const [def_level, start_depth, end_depth] = [&]() { + if (!within_batch) { return cuda::std::make_tuple(-1, -1, -1); } + + int const index = rolling_index(value_count + t); + int const rep_level = static_cast(rep[index]); + int const start_depth = s->nesting_info[rep_level].start_depth; + + if constexpr (!nullable) { + return cuda::std::make_tuple(-1, start_depth, max_depth); + } else { if (def != nullptr) { - def_level = static_cast(def[index]); - end_depth = s->nesting_info[def_level].end_depth; + int const def_level = static_cast(def[index]); + return cuda::std::make_tuple( + def_level, start_depth, s->nesting_info[def_level].end_depth); } else { - def_level = 1; - end_depth = max_depth; + return cuda::std::make_tuple(1, start_depth, max_depth); } - } else { - end_depth = max_depth; } - - // computed by generate_depth_remappings() - start_depth = s->nesting_info[rep_level].start_depth; - } + }(); // Determine value count & row index // track (page-relative) row index for the thread so we can compare against input bounds @@ -644,12 +668,13 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c auto& ni = s->nesting_info[d_idx]; // everything up to the max_def_level is a non-null value - int is_valid; - if constexpr (nullable) { - is_valid = ((def_level >= ni.max_def_level) && in_nesting_bounds) ? 1 : 0; - } else { - is_valid = in_nesting_bounds; - } + int const is_valid = [&](int input_def_level) { + if constexpr (nullable) { + return ((input_def_level >= ni.max_def_level) && in_nesting_bounds) ? 1 : 0; + } else { + return in_nesting_bounds; + } + }(def_level); // VALID COUNT: // Not all values visited by this block will represent a value at this nesting level. 
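// ---------------------------------------------------------------------------
// Editorial aside, not part of the patch: a minimal host-side sketch of the
// per-warp validity-mask idea described in the hunks above. Each lane places
// its is_valid bit at its value position within the warp (an exclusive scan of
// in_nesting_bounds), not at its lane index, which is why the bit for lane t
// can represent an earlier output value. The 32-lane arrays, the null pattern,
// and the sequential loop standing in for WarpReduceOr32 are assumptions made
// for illustration only.
// ---------------------------------------------------------------------------
#include <cstdint>
#include <cstdio>

int main()
{
  constexpr int warp_size = 32;
  int in_nesting_bounds[warp_size];
  int is_valid[warp_size];

  // fabricate a batch: lanes 0..19 carry values at this nesting depth,
  // and every fifth of those values is null
  for (int lane = 0; lane < warp_size; ++lane) {
    in_nesting_bounds[lane] = (lane < 20) ? 1 : 0;
    is_valid[lane]          = (in_nesting_bounds[lane] && (lane % 5 != 0)) ? 1 : 0;
  }

  // an exclusive scan of in_nesting_bounds gives each lane's value position in
  // the warp (what thread_value_count_within_warp models); OR-ing the shifted
  // bits together is what WarpReduceOr32 does across the warp
  uint32_t warp_valid_mask = 0;
  int value_pos            = 0;
  for (int lane = 0; lane < warp_size; ++lane) {
    warp_valid_mask |= static_cast<uint32_t>(is_valid[lane]) << value_pos;
    value_pos += in_nesting_bounds[lane];  // only in-bounds lanes advance the position
  }

  // lane 0 of the warp would then hand this mask to store_validity()
  printf("warp_valid_mask = 0x%08x for %d values\n", warp_valid_mask, value_pos);
  return 0;
}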
@@ -726,7 +751,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c // if this is valid and we're at the leaf, output dst_pos // Read value_count before the sync, so that when thread 0 modifies it we've already read its // value - int current_value_count = ni.value_count; + int const current_value_count = ni.value_count; __syncthreads(); // guard against modification of ni.value_count below if (d_idx == max_depth) { if (is_valid) { @@ -944,21 +969,21 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) // Skip ahead in the decoding so that we don't repeat work (skipped_leaf_values = 0 for non-lists) if constexpr (has_lists_t) { - if (s->page.skipped_leaf_values > 0) { + auto const skipped_leaf_values = s->page.skipped_leaf_values; + if (skipped_leaf_values > 0) { if (should_process_nulls) { - skip_decode(def_decoder, s->page.skipped_leaf_values, t); + skip_decode(def_decoder, skipped_leaf_values, t); } - processed_count = - skip_decode(rep_decoder, s->page.skipped_leaf_values, t); + processed_count = skip_decode(rep_decoder, skipped_leaf_values, t); if constexpr (has_dict_t) { - skip_decode(dict_stream, s->page.skipped_leaf_values, t); + skip_decode(dict_stream, skipped_leaf_values, t); } } } // the core loop. decode batches of level stream data using rle_stream objects // and pass the results to gpuDecodeValues - int last_row = s->first_row + s->num_rows; + int const last_row = s->first_row + s->num_rows; while ((s->error == 0) && (processed_count < s->page.num_input_values) && (s->input_row_count <= last_row)) { int next_valid_count; From e51406ce1992919bf567fc65125e2106015af239 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Fri, 11 Oct 2024 12:49:28 -0400 Subject: [PATCH 32/36] revert cmakelists change --- cpp/benchmarks/CMakeLists.txt | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 4113e38dcf4..b8a53cd8bd9 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -330,11 +330,11 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp) # ################################################################################################## # * ast benchmark --------------------------------------------------------------------------------- -ConfigureBench(AST_BENCH ast/transform.cpp) +ConfigureNVBench(AST_NVBENCH ast/transform.cpp) # ################################################################################################## # * binaryop benchmark ---------------------------------------------------------------------------- -ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) +ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- @@ -392,11 +392,6 @@ ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader ConfigureNVBench(JSON_READER_OPTION_NVBENCH io/json/json_reader_option.cpp) ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp) -# ################################################################################################## -# * multi buffer memset benchmark -# ---------------------------------------------------------------------- -ConfigureNVBench(BATCHED_MEMSET_BENCH io/utilities/batched_memset_bench.cpp) - # 
################################################################################################## # * io benchmark --------------------------------------------------------------------- ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp) From 07ffbf26046ae5fd72a112f030dbf233b1f7b677 Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Fri, 18 Oct 2024 11:40:46 -0400 Subject: [PATCH 33/36] Update cpp/src/io/parquet/rle_stream.cuh Co-authored-by: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> --- cpp/src/io/parquet/rle_stream.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 9270db16c08..a84067743df 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -375,7 +375,7 @@ struct rle_stream { __device__ inline int skip_runs(int target_count) { // we want to process all runs UP TO BUT NOT INCLUDING the run that overlaps with the skip - // amount so threads spin like crazy on fill_run_batch(), skipping writing unnecessary run info + // amount so threads spin like crazy on fill_run_batch(), skipping writing unnecessary run info. // then when it hits the one that matters, we don't process it at all and bail as if we never // started basically we're setting up the rle_stream vars necessary to start fill_run_batch for // the first time From 32fe8b97dc46999dcfbf8d9abf4e74d6bbd88379 Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Fri, 18 Oct 2024 12:48:46 -0400 Subject: [PATCH 34/36] refactor rle_stream --- cpp/src/io/parquet/decode_fixed.cu | 15 +++---- cpp/src/io/parquet/rle_stream.cuh | 69 ++++++++++++------------------ 2 files changed, 33 insertions(+), 51 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 4e83a788747..e806e54a522 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -117,14 +117,11 @@ __device__ void gpuDecodeFixedWidthValues( // has to take into account the # of values we have to skip in the page to get to the // desired logical row. For flat hierarchies, skipped_leaf_values will always be 0. int const src_pos = [&]() { - if constexpr (has_lists_t) { - return thread_pos + skipped_leaf_values; - } else { - return thread_pos; - } + if constexpr (has_lists_t) { return thread_pos + skipped_leaf_values; } + return thread_pos; }(); - void* const dst = data_out + static_cast(dst_pos) * dtype_len; + void* const dst = data_out + (static_cast(dst_pos) * dtype_len); if (s->col.logical_type.has_value() && s->col.logical_type->type == LogicalType::DECIMAL) { switch (dtype) { @@ -316,9 +313,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( return -1; } else if (def) { return static_cast(def[rolling_index(value_count + t)]); - } else { - return 1; } + return 1; }(); int const thread_value_count = t; @@ -444,9 +440,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( int const def_level = static_cast(def[rolling_index(value_count + t)]); return ((def_level > 0) && in_row_bounds) ? 
1 : 0; - } else { - return in_row_bounds; } + return in_row_bounds; }(); // thread and block validity count diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index a84067743df..55339dbc289 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -216,6 +216,26 @@ struct rle_stream { decode_index = -1; // signals the first iteration. Nothing to decode. } + __device__ inline int get_rle_run_info(rle_run& run) + { + run.start = cur; + run.level_run = get_vlq32(run.start, end); + + // run_bytes includes the header size + int run_bytes = run.start - cur; + if (is_literal_run(run.level_run)) { + // from the parquet spec: literal runs always come in multiples of 8 values. + run.size = (run.level_run >> 1) * 8; + run_bytes += ((run.size * level_bits) + 7) >> 3; + } else { + // repeated value run + run.size = (run.level_run >> 1); + run_bytes += ((level_bits) + 7) >> 3; + } + + return run_bytes; + } + __device__ inline void fill_run_batch() { // decode_index == -1 means we are on the very first decode iteration for this stream. @@ -226,31 +246,12 @@ struct rle_stream { while (((decode_index == -1 && fill_index < num_rle_stream_decode_warps) || fill_index < decode_index + run_buffer_size) && cur < end) { - auto& run = runs[rolling_index(fill_index)]; - // Encoding::RLE + auto& run = runs[rolling_index(fill_index)]; + int const run_bytes = get_rle_run_info(run); + run.remaining = run.size; + run.output_pos = output_pos; - // bytes for the varint header - uint8_t const* _cur = cur; - int const level_run = get_vlq32(_cur, end); - // run_bytes includes the header size - int run_bytes = _cur - cur; - - // literal run - if (is_literal_run(level_run)) { - // from the parquet spec: literal runs always come in multiples of 8 values. - run.size = (level_run >> 1) * 8; - run_bytes += ((run.size * level_bits) + 7) >> 3; - } - // repeated value run - else { - run.size = (level_run >> 1); - run_bytes += ((level_bits) + 7) >> 3; - } - run.output_pos = output_pos; - run.start = _cur; - run.level_run = level_run; - run.remaining = run.size; cur += run_bytes; output_pos += run.size; fill_index++; @@ -380,29 +381,15 @@ struct rle_stream { // started basically we're setting up the rle_stream vars necessary to start fill_run_batch for // the first time while (cur < end) { - // bytes for the varint header - uint8_t const* _cur = cur; - int const level_run = get_vlq32(_cur, end); - - // run_bytes includes the header size - int run_bytes = _cur - cur; - int run_size; - if (is_literal_run(level_run)) { - // from the parquet spec: literal runs always come in multiples of 8 values. - run_size = (level_run >> 1) * 8; - run_bytes += ((run_size * level_bits) + 7) >> 3; - } else { - // repeated value run - run_size = (level_run >> 1); - run_bytes += ((level_bits) + 7) >> 3; - } + rle_run run; + int run_bytes = get_rle_run_info(run); - if ((output_pos + run_size) > target_count) { + if ((output_pos + run.size) > target_count) { return output_pos; // bail! 
we've reached the starting run } // skip this run - output_pos += run_size; + output_pos += run.size; cur += run_bytes; } From 031ac6b414abc9045040e27ce4c5c1d5fa6f1c3f Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Wed, 23 Oct 2024 11:58:41 -0400 Subject: [PATCH 35/36] Use divide function --- cpp/src/io/parquet/rle_stream.cuh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 55339dbc289..78f6bcaa7b4 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -19,6 +19,7 @@ #include "parquet_gpu.hpp" #include +#include namespace cudf::io::parquet::detail { @@ -226,11 +227,11 @@ struct rle_stream { if (is_literal_run(run.level_run)) { // from the parquet spec: literal runs always come in multiples of 8 values. run.size = (run.level_run >> 1) * 8; - run_bytes += ((run.size * level_bits) + 7) >> 3; + run_bytes += util::div_rounding_up_unsafe(run.size * level_bits, 8); } else { // repeated value run run.size = (run.level_run >> 1); - run_bytes += ((level_bits) + 7) >> 3; + run_bytes += util::div_rounding_up_unsafe(level_bits, 8); } return run_bytes; From 534e67de716dd880b638da68b0af7e2e0f4bf8fa Mon Sep 17 00:00:00 2001 From: Paul Mattione Date: Fri, 25 Oct 2024 16:06:44 -0400 Subject: [PATCH 36/36] address comments --- cpp/src/io/parquet/decode_fixed.cu | 26 ++++++++++++++------------ cpp/src/io/parquet/rle_stream.cuh | 6 ++++-- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index e806e54a522..cedced55d51 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -610,15 +610,15 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c auto const [def_level, start_depth, end_depth] = [&]() { if (!within_batch) { return cuda::std::make_tuple(-1, -1, -1); } - int const index = rolling_index(value_count + t); - int const rep_level = static_cast(rep[index]); + int const level_index = rolling_index(value_count + t); + int const rep_level = static_cast(rep[level_index]); int const start_depth = s->nesting_info[rep_level].start_depth; if constexpr (!nullable) { return cuda::std::make_tuple(-1, start_depth, max_depth); } else { if (def != nullptr) { - int const def_level = static_cast(def[index]); + int const def_level = static_cast(def[level_index]); return cuda::std::make_tuple( def_level, start_depth, s->nesting_info[def_level].end_depth); } else { @@ -639,13 +639,13 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c total_num_new_rows = new_row_scan_results.block_count; } - int const row_index = input_row_count + (num_prior_new_rows + is_new_row - 1); + int const row_index = input_row_count + ((num_prior_new_rows + is_new_row) - 1); input_row_count += total_num_new_rows; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); // VALUE COUNT: - // if we are within the range of nesting levels we should be adding value indices for - // is from/in current rep level to/in the rep level AT the depth with the def value + // in_nesting_bounds: if at a nesting level where we need to add value indices + // the bounds: from current rep to the rep AT the def depth int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 
1 : 0;
     int thread_value_count_within_warp, warp_value_count, thread_value_count, block_value_count;
     {
@@ -724,7 +724,8 @@ static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_c
       }
     }
 
-    // validity is processed per-warp (on lane 0's), because writes are 32-bit atomic ops
+    // validity is processed per-warp (on lane 0's)
+    // this is because when atomic writes are needed, they are 32-bit operations
     //
     // lists always read and write to the same bounds
     // (that is, read and write positions are already pre-bounded by first_row/num_rows).
@@ -820,7 +821,7 @@ __device__ inline bool maybe_has_nulls(page_state_s* s)
   return run_val != s->col.max_level[lvl];
 }
 
-template <int decode_block_size_t, typename stream_type>
+template <int rolling_buf_size, typename stream_type>
 __device__ int skip_decode(stream_type& parquet_stream, int num_to_skip, int t)
 {
   // it could be that (e.g.) we skip 5000 but starting at row 4000 we have a run of length 2000:
@@ -828,7 +829,8 @@ __device__ int skip_decode(stream_type& parquet_stream, int num_to_skip, int t)
   // modulo 2 * block_size of course, since that's as many as we process at once
   int num_skipped = parquet_stream.skip_decode(t, num_to_skip);
   while (num_skipped < num_to_skip) {
-    auto const to_decode = min(2 * decode_block_size_t, num_to_skip - num_skipped);
+    // TODO: Instead of decoding, skip within the run to the appropriate location
+    auto const to_decode = min(rolling_buf_size, num_to_skip - num_skipped);
     num_skipped += parquet_stream.decode_next(t, to_decode);
     __syncthreads();
   }
@@ -967,11 +969,11 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8)
     auto const skipped_leaf_values = s->page.skipped_leaf_values;
     if (skipped_leaf_values > 0) {
       if (should_process_nulls) {
-        skip_decode<decode_block_size_t>(def_decoder, skipped_leaf_values, t);
+        skip_decode<rolling_buf_size>(def_decoder, skipped_leaf_values, t);
       }
-      processed_count = skip_decode<decode_block_size_t>(rep_decoder, skipped_leaf_values, t);
+      processed_count = skip_decode<rolling_buf_size>(rep_decoder, skipped_leaf_values, t);
       if constexpr (has_dict_t) {
-        skip_decode<decode_block_size_t>(dict_stream, skipped_leaf_values, t);
+        skip_decode<rolling_buf_size>(dict_stream, skipped_leaf_values, t);
       }
     }
   }
diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh
index 78f6bcaa7b4..69e783a89d0 100644
--- a/cpp/src/io/parquet/rle_stream.cuh
+++ b/cpp/src/io/parquet/rle_stream.cuh
@@ -248,10 +248,12 @@ struct rle_stream {
            fill_index < decode_index + run_buffer_size) &&
           cur < end) {
       // Encoding::RLE
+      // Pass by reference to fill the runs shared memory with the run data
       auto& run           = runs[rolling_index<run_buffer_size>(fill_index)];
       int const run_bytes = get_rle_run_info(run);
+
       run.remaining  = run.size;
       run.output_pos = output_pos;
 
       cur += run_bytes;
       output_pos += run.size;
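// ---------------------------------------------------------------------------
// Editorial aside, not part of the patch: a minimal host-side sketch of the
// Parquet RLE/bit-packed hybrid run header that get_rle_run_info() above
// parses. The ULEB128 reader, the sample buffer, and the level_bits value are
// assumptions for illustration; the device code uses get_vlq32() and
// util::div_rounding_up_unsafe() instead.
// ---------------------------------------------------------------------------
#include <cstdint>
#include <cstdio>

// read an unsigned LEB128 varint (the run header)
static uint32_t read_uleb128(uint8_t const*& cur, uint8_t const* end)
{
  uint32_t v = 0;
  int shift  = 0;
  while (cur < end) {
    uint8_t const b = *cur++;
    v |= static_cast<uint32_t>(b & 0x7f) << shift;
    if ((b & 0x80) == 0) { break; }
    shift += 7;
  }
  return v;
}

int main()
{
  int const level_bits = 1;             // e.g. definition levels with max level 1
  uint8_t const buf[]  = {0x03, 0xb6};  // header 0x03 -> literal (bit-packed) run
  uint8_t const* cur   = buf;
  uint8_t const* const end = buf + sizeof(buf);

  uint32_t const header = read_uleb128(cur, end);
  if (header & 1) {
    // literal run: (header >> 1) groups of 8 values, bit-packed at level_bits each
    int const size      = (header >> 1) * 8;
    int const run_bytes = (size * level_bits + 7) / 8;
    printf("literal run: %d values in %d data byte(s)\n", size, run_bytes);
  } else {
    // repeated-value run: (header >> 1) copies of one value stored in
    // ceil(level_bits / 8) bytes
    int const size      = static_cast<int>(header >> 1);
    int const run_bytes = (level_bits + 7) / 8;
    printf("repeated run: %d copies of a %d-byte value\n", size, run_bytes);
  }
  return 0;
}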