Support reading and writing bitpacked activations in C++ kernels. #305

Merged · 19 commits · Apr 7, 2020
45 changes: 35 additions & 10 deletions larq_compute_engine/core/bconv2d_impl_ref.h
@@ -30,21 +30,24 @@ namespace compute_engine {
 namespace ce = compute_engine;
 namespace ref {
 
-template <class T, class TBitpacked>
+template <typename SrcScalar, typename TBitpacked, typename AccumScalar,
+          typename DstScalar>
 inline void BConv2D(const ConvParams& params,
                     const RuntimeShape& packed_input_shape,
                     const TBitpacked* packed_input_data,
                     const RuntimeShape& packed_filter_shape,
                     const TBitpacked* packed_filter_data,
                     const float* post_activation_multiplier_data,
                     const float* post_activation_bias_data,
-                    const RuntimeShape& output_shape, T* output_data,
-                    const RuntimeShape& im2col_shape, T* im2col_data,
-                    bool bitpack_before_im2col, T* padding_buffer,
+                    const RuntimeShape& output_shape, DstScalar* output_data,
+                    const RuntimeShape& im2col_shape, SrcScalar* im2col_data,
+                    bool bitpack_before_im2col, SrcScalar* padding_buffer,
                     const int pad_value, void* cpu_backend_context,
                     const std::int32_t backtransform_add) {
-  using AccumScalar = std::int32_t;
-  using DstScalar = T;
+  static_assert(std::is_same<DstScalar, float>::value ||
+                    std::is_same<DstScalar, std::int32_t>::value,
+                "The reference implementation supports either float "
+                "output or 32-bit bitpacked output.");
 
   const int stride_width = params.stride_width;
   const int stride_height = params.stride_height;
@@ -67,7 +70,7 @@ inline void BConv2D(const ConvParams& params,
   const int batches = MatchingDim(packed_input_shape, 0, output_shape, 0);
   const int input_depth =
       MatchingDim(packed_input_shape, 3, packed_filter_shape, 3);
-  const int output_depth = MatchingDim(packed_filter_shape, 0, output_shape, 3);
+  const int output_depth = packed_filter_shape.Dims(0);
   const int input_height = packed_input_shape.Dims(1);
   const int input_width = packed_input_shape.Dims(2);
   const int filter_height = packed_filter_shape.Dims(1);
@@ -77,6 +80,8 @@ inline void BConv2D(const ConvParams& params,
   for (int batch = 0; batch < batches; ++batch) {
     for (int out_y = 0; out_y < output_height; ++out_y) {
       for (int out_x = 0; out_x < output_width; ++out_x) {
+        // This variable is only used if we are writing bitpacked output.
+        std::uint32_t bitpacked_column = 0;
         for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
           const int in_x_origin = (out_x * stride_width) - pad_width;
           const int in_y_origin = (out_y * stride_height) - pad_height;
@@ -109,15 +114,35 @@ inline void BConv2D(const ConvParams& params,
           accum = std::min<AccumScalar>(accum, clamp_max);
           accum = std::max<AccumScalar>(accum, clamp_min);
           // Post multiply and add are done in float
-          DstScalar dst_val = static_cast<DstScalar>(accum);
+          float dst_val = static_cast<float>(accum);
           if (post_activation_multiplier) {
             dst_val *= post_activation_multiplier[out_channel];
           }
           if (post_activation_bias) {
             dst_val += post_activation_bias[out_channel];
           }
-          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
-              dst_val;
+
+          // If the destination scalar is int32, we're writing bitpacked output.
+          if (std::is_same<DstScalar, std::int32_t>::value) {
+            // In our bitpacking we map strictly negative values to 1, and
+            // non-negative values to 0.
+            if (dst_val < 0) bitpacked_column += 1 << (out_channel % 32);
+
+            // After we've 'filled' the `bitpacked_column` with 32 values, or
+            // reached the end of the channels, we write it to memory.
+            if ((out_channel + 1) % 32 == 0 ||
+                (out_channel + 1 == output_depth)) {
+              output_data[Offset(output_shape, batch, out_y, out_x,
+                                 out_channel / 32)] = bitpacked_column;
+              bitpacked_column = 0;
+            }
+          }
+
+          // Otherwise, we're not writing bitpacked output; it must be float.
+          else {
+            output_data[Offset(output_shape, batch, out_y, out_x,
+                               out_channel)] = dst_val;
+          }
         }
       }
     }
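A minimal standalone sketch of the output-bitpacking scheme used in this reference kernel (the helper `PackChannels` is hypothetical, not part of this PR): strictly negative post-activation values map to a 1 bit and non-negative values to a 0 bit, packed 32 channels per `std::int32_t` word along the channel axis.

#include <cstdint>
#include <vector>

// Hypothetical helper: bit (c % 32) of output word (c / 32) is set iff the
// post-activation value of channel c is strictly negative.
std::vector<std::int32_t> PackChannels(const std::vector<float>& values) {
  std::vector<std::int32_t> packed((values.size() + 31) / 32, 0);
  std::uint32_t word = 0;
  for (std::size_t c = 0; c < values.size(); ++c) {
    if (values[c] < 0.0f) word |= std::uint32_t{1} << (c % 32);
    // Flush after every 32 channels, or at the end of the channel axis.
    if ((c + 1) % 32 == 0 || c + 1 == values.size()) {
      packed[c / 32] = static_cast<std::int32_t>(word);
      word = 0;
    }
  }
  return packed;
}

For example, `PackChannels({-1.0f, 2.0f, -3.0f})` yields a single word with value 0b101: bits 0 and 2 are set because channels 0 and 2 are strictly negative.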
1 change: 0 additions & 1 deletion larq_compute_engine/core/bgemm_kernels_arm.h
@@ -87,7 +87,6 @@ struct BgemmKernel<ruy::Path::kNeon, LhsScalar, RhsScalar, DstScalar, Spec> {
         "Input to binary kernel should be of type unsigned integral.");
     static_assert(std::is_signed<DstScalar>::value,
                   "Output of binary kernel should be of a signed type.");
-
     // TODO: not implemented -> fallback to standard cpp
   }
 };
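Both the reference kernel above and the Ruy kernel below compute the binary inner product with XOR and popcount, then map it back to a signed dot product via `accum = backtransform_add - 2 * xor_popcount(...)`. A minimal sketch of that identity, assuming ±1 values with -1 encoded as a 1 bit (the helper is illustrative, not this PR's code; `__builtin_popcount` is the GCC/Clang intrinsic):

#include <cstdint>

// Dot product of two length-n vectors of ±1 values, each packed into a
// 32-bit word with -1 encoded as 1: positions where the signs differ
// contribute -1, equal signs contribute +1, hence n - 2 * popcount(a ^ b).
inline std::int32_t BinaryDotProduct(std::uint32_t a, std::uint32_t b, int n) {
  return n - 2 * __builtin_popcount(a ^ b);
}

// Example: a = (+1, -1, +1) -> 0b010, b = (-1, -1, +1) -> 0b011.
// a ^ b = 0b001, popcount = 1, so 3 - 2 * 1 = 1, matching
// (+1)(-1) + (-1)(-1) + (+1)(+1) = 1.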
93 changes: 93 additions & 0 deletions larq_compute_engine/core/bgemm_kernels_ruy.h
@@ -88,6 +88,99 @@ struct BgemmKernel<ruy::Path::kStandardCpp, LhsScalar, RhsScalar, DstScalar,
   }
 };
 
+// A template specialisation for writing 8-bit bitpacked output.
+template <typename LhsScalar, typename RhsScalar, typename Spec>
+struct BgemmKernel<ruy::Path::kStandardCpp, LhsScalar, RhsScalar, std::int8_t,
+                   Spec> {
+  using AccumScalar = typename Spec::AccumScalar;
+  using LhsLayout = typename Spec::StandardCppKernelLhsLayout;
+  using RhsLayout = typename Spec::StandardCppKernelRhsLayout;
+  explicit BgemmKernel(ruy::Tuning) {}
+  void Run(const ruy::PackedMatrix<LhsScalar>& lhs,
+           const ruy::PackedMatrix<RhsScalar>& rhs, const Spec& spec,
+           int start_row, int start_col, int end_row, int end_col,
+           ruy::Matrix<std::int8_t>* dst) const {
+    static_assert(std::is_same<LhsScalar, RhsScalar>::value,
+                  "Inputs to binary kernel should have the same type.");
+    static_assert(
+        std::is_unsigned<LhsScalar>::value &&
+            std::is_integral<LhsScalar>::value,
+        "Input to binary kernel should be of type unsigned integral.");
+
+    using TBitpacked = LhsScalar;
+
+    // We are writing 8-bit bitpacked output (where we bitpack along the channel
+    // axis) and so we need to operate on blocks of 8 channels at a time. As the
+    // destination is column major, this means blocks of 8 rows at a time. The
+    // blocks Ruy uses are always a power of two and are almost always >> 8.
+    // However, when running with multiple threads and a very large input size,
+    // Ruy may use blocks of 4 rows.
+    // In this scenario, we round the start and end row down and up to the
+    // nearest multiple of 8 respectively. This is a thread-safe way to ensure
+    // that the result is correct, at the cost of some rare repeated
+    // computation, which is acceptable for this non-optimised kernel.
+    start_row = 8 * (start_row / 8);
+    end_row = 8 * ((end_row + 7) / 8);
+
+    int clamped_end_row = std::min(end_row, dst->layout.rows);
+    int clamped_end_col = std::min(end_col, dst->layout.cols);
+    RUY_DCHECK_LE(0, start_row);
+    RUY_DCHECK_LE(start_row, clamped_end_row);
+    RUY_DCHECK_LE(clamped_end_row, dst->layout.rows);
+    RUY_DCHECK_LE(clamped_end_row, end_row);
+    RUY_DCHECK_LE(0, start_col);
+    RUY_DCHECK_LE(start_col, clamped_end_col);
+    RUY_DCHECK_LE(clamped_end_col, dst->layout.cols);
+    RUY_DCHECK_LE(clamped_end_col, end_col);
+    RUY_DCHECK_LE(end_col - clamped_end_col, RhsLayout::kCols);
+
+    RUY_DCHECK_EQ(dst->layout.order, Order::kColMajor);
+
+    gemmlowp::ScopedProfilingLabel label(
+        "Binary Kernel (Standard Cpp) Bitpacked Output.");
+
+    const int depth = lhs.layout.rows;
+    const int dst_stride_bitpacked = (dst->layout.stride + 7) / 8;
+
+    // The destination is column major and we need to bitpack along the row
+    // (channels) axis so we need to loop over column index then row index.
+    for (int j = start_col; j < clamped_end_col; j++) {
+      std::uint8_t bitpacked_column = 0;
+      for (int i = start_row; i < clamped_end_row; i++) {
+        using AccumScalar = typename Spec::AccumScalar;
+        AccumScalar accum = 0;
+        for (int k = 0; k < depth; k++) {
+          TBitpacked lhs_val = Element(lhs, k, i);
+          TBitpacked rhs_val = Element(rhs, k, j);
+          accum +=
+              ce::core::xor_popcount<TBitpacked, AccumScalar>(lhs_val, rhs_val);
+        }
+        // Backtransform can still be done in int32
+        accum = spec.backtransform_add - 2 * accum;
+        // Activation function can also be done in int32
+        accum = std::min<AccumScalar>(accum, spec.clamp_max);
+        accum = std::max<AccumScalar>(accum, spec.clamp_min);
+        // Post multiply and add are done in float
+        auto dst_val = static_cast<float>(accum);
+        if (spec.post_activation_multiplier) {
+          dst_val *= spec.post_activation_multiplier[i];
+        }
+        if (spec.post_activation_bias) {
+          dst_val += spec.post_activation_bias[i];
+        }
+        if (dst_val < 0) {
+          bitpacked_column += 1 << ((i - start_row) % 8);
+        }
+        if (((i - start_row + 1) % 8 == 0) || (i + 1 == clamped_end_row)) {
+          *(dst->data.get() + i / 8 + j * dst_stride_bitpacked) =
+              bitpacked_column;
+          bitpacked_column = 0;
+        }
+      }
+    }
+  }
+};
+
 template <ruy::Path ThePath, typename LhsScalar, typename RhsScalar,
           typename DstScalar, typename Spec>
 void RunBgemmKernelTyped(ruy::Tuning tuning,
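A worked sketch of the row-rounding and addressing arithmetic in the 8-bit kernel above (the concrete numbers are hypothetical, chosen only for illustration):

#include <cassert>

int main() {
  // Suppose Ruy hands this thread the row block [4, 12) of a 35-row,
  // column-major destination. Widen it to whole groups of 8 rows:
  int start_row = 4, end_row = 12;
  start_row = 8 * (start_row / 8);    // rounds down -> 0
  end_row = 8 * ((end_row + 7) / 8);  // rounds up   -> 16
  assert(start_row == 0 && end_row == 16);
  // Widened blocks from neighbouring threads may overlap and recompute the
  // same rows, but every block writes each byte it touches in full with the
  // same value, so the final result is unchanged.

  // Bitpacking 8 rows per byte turns the element stride into a byte stride
  // via ceiling division: 35 rows -> 5 bytes per column.
  const int dst_stride_bitpacked = (35 + 7) / 8;
  assert(dst_stride_bitpacked == 5);

  // Row i, column j lands in byte i / 8 of column j:
  const int i = 18, j = 2;
  assert(i / 8 + j * dst_stride_bitpacked == 12);
  return 0;
}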
2 changes: 1 addition & 1 deletion larq_compute_engine/tflite/build_make/Makefile
@@ -1,7 +1,7 @@
 #
 # This is based on
 # tensorflow/tensorflow/lite/tools/make/Makefile
-# 
+#
 # The makefile will always be run from the root of the compute engine repository
 
 # Make uses /bin/sh by default, which is incompatible with the bashisms seen