Support reading and writing bitpacked activations in C++ kernels. #305

Merged · 19 commits · Apr 7, 2020
45 changes: 35 additions & 10 deletions larq_compute_engine/core/bconv2d_impl_ref.h
@@ -30,21 +30,24 @@ namespace compute_engine {
 namespace ce = compute_engine;
 namespace ref {
 
-template <class T, class TBitpacked>
+template <typename SrcScalar, typename TBitpacked, typename AccumScalar,
+          typename DstScalar>
 inline void BConv2D(const ConvParams& params,
                     const RuntimeShape& packed_input_shape,
                     const TBitpacked* packed_input_data,
                     const RuntimeShape& packed_filter_shape,
                     const TBitpacked* packed_filter_data,
                     const float* post_activation_multiplier_data,
                     const float* post_activation_bias_data,
-                    const RuntimeShape& output_shape, T* output_data,
-                    const RuntimeShape& im2col_shape, T* im2col_data,
-                    bool bitpack_before_im2col, T* padding_buffer,
+                    const RuntimeShape& output_shape, DstScalar* output_data,
+                    const RuntimeShape& im2col_shape, SrcScalar* im2col_data,
+                    bool bitpack_before_im2col, SrcScalar* padding_buffer,
                     const int pad_value, void* cpu_backend_context,
                     const std::int32_t backtransform_add) {
-  using AccumScalar = std::int32_t;
-  using DstScalar = T;
+  static_assert(std::is_same<DstScalar, float>::value ||
+                    std::is_same<DstScalar, std::int32_t>::value,
+                "The reference implementation supports either float "
+                "output or 32-bit bitpacked output.");
 
   const int stride_width = params.stride_width;
   const int stride_height = params.stride_height;
@@ -67,7 +70,7 @@ inline void BConv2D(const ConvParams& params,
   const int batches = MatchingDim(packed_input_shape, 0, output_shape, 0);
   const int input_depth =
       MatchingDim(packed_input_shape, 3, packed_filter_shape, 3);
-  const int output_depth = MatchingDim(packed_filter_shape, 0, output_shape, 3);
+  const int output_depth = packed_filter_shape.Dims(0);
   const int input_height = packed_input_shape.Dims(1);
   const int input_width = packed_input_shape.Dims(2);
   const int filter_height = packed_filter_shape.Dims(1);
@@ -77,6 +80,8 @@ inline void BConv2D(const ConvParams& params,
   for (int batch = 0; batch < batches; ++batch) {
     for (int out_y = 0; out_y < output_height; ++out_y) {
       for (int out_x = 0; out_x < output_width; ++out_x) {
+        // This variable is only used if we are writing bitpacked output.
+        std::uint32_t bitpacked_column = 0;
         for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
           const int in_x_origin = (out_x * stride_width) - pad_width;
           const int in_y_origin = (out_y * stride_height) - pad_height;
@@ -109,15 +114,35 @@ inline void BConv2D(const ConvParams& params,
           accum = std::min<AccumScalar>(accum, clamp_max);
           accum = std::max<AccumScalar>(accum, clamp_min);
           // Post multiply and add are done in float
-          DstScalar dst_val = static_cast<DstScalar>(accum);
+          float dst_val = static_cast<float>(accum);
           if (post_activation_multiplier) {
             dst_val *= post_activation_multiplier[out_channel];
           }
           if (post_activation_bias) {
             dst_val += post_activation_bias[out_channel];
           }
-          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
-              dst_val;
+
+          // If the destination scalar is int32, we're writing bitpacked output.
+          if (std::is_same<DstScalar, std::int32_t>::value) {
+            // In our bitpacking we map strictly negative values to 1, and
+            // non-negative values to 0.
+            if (dst_val < 0) bitpacked_column += 1 << (out_channel % 32);
+
+            // After we've 'filled' the `bitpacked_column` with 32 values, or
+            // reached the end of the channels, we write it to memory.
+            if ((out_channel + 1) % 32 == 0 ||
+                (out_channel + 1 == output_depth)) {
+              output_data[Offset(output_shape, batch, out_y, out_x,
+                                 out_channel / 32)] = bitpacked_column;
+              bitpacked_column = 0;
+            }
+          }
+
+          // Otherwise, we're not writing bitpacked output; it must be float.
+          else {
+            output_data[Offset(output_shape, batch, out_y, out_x,
+                               out_channel)] = dst_val;
+          }
         }
       }
     }
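A minimal standalone sketch of the output-bitpacking scheme used in this reference kernel (the helper `PackChannels` is hypothetical, not part of this PR): strictly negative post-activation values map to a 1 bit and non-negative values to a 0 bit, packed 32 channels per `std::int32_t` word along the channel axis.

#include <cstdint>
#include <vector>

// Hypothetical helper: bit (c % 32) of output word (c / 32) is set iff the
// post-activation value of channel c is strictly negative.
std::vector<std::int32_t> PackChannels(const std::vector<float>& values) {
  std::vector<std::int32_t> packed((values.size() + 31) / 32, 0);
  std::uint32_t word = 0;
  for (std::size_t c = 0; c < values.size(); ++c) {
    if (values[c] < 0.0f) word |= std::uint32_t{1} << (c % 32);
    // Flush after every 32 channels, or at the end of the channel axis.
    if ((c + 1) % 32 == 0 || c + 1 == values.size()) {
      packed[c / 32] = static_cast<std::int32_t>(word);
      word = 0;
    }
  }
  return packed;
}

For example, `PackChannels({-1.0f, 2.0f, -3.0f})` yields a single word with value 0b101: bits 0 and 2 are set because channels 0 and 2 are strictly negative.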
1 change: 0 additions & 1 deletion larq_compute_engine/core/bgemm_kernels_arm.h
@@ -87,7 +87,6 @@ struct BgemmKernel<ruy::Path::kNeon, LhsScalar, RhsScalar, DstScalar, Spec> {
         "Input to binary kernel should be of type unsigned integral.");
     static_assert(std::is_signed<DstScalar>::value,
                   "Output of binary kernel should be of a signed type.");
-
     // TODO: not implemented -> fallback to standard cpp
   }
 };
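Both the reference kernel above and the Ruy kernel below compute the binary inner product with XOR and popcount, then map it back to a signed dot product via `accum = backtransform_add - 2 * xor_popcount(...)`. A minimal sketch of that identity, assuming ±1 values with -1 encoded as a 1 bit (the helper is illustrative, not this PR's code; `__builtin_popcount` is the GCC/Clang intrinsic):

#include <cstdint>

// Dot product of two length-n vectors of ±1 values, each packed into a
// 32-bit word with -1 encoded as 1: positions where the signs differ
// contribute -1, equal signs contribute +1, hence n - 2 * popcount(a ^ b).
inline std::int32_t BinaryDotProduct(std::uint32_t a, std::uint32_t b, int n) {
  return n - 2 * __builtin_popcount(a ^ b);
}

// Example: a = (+1, -1, +1) -> 0b010, b = (-1, -1, +1) -> 0b011.
// a ^ b = 0b001, popcount = 1, so 3 - 2 * 1 = 1, matching
// (+1)(-1) + (-1)(-1) + (+1)(+1) = 1.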
93 changes: 93 additions & 0 deletions larq_compute_engine/core/bgemm_kernels_ruy.h
@@ -88,6 +88,99 @@ struct BgemmKernel<ruy::Path::kStandardCpp, LhsScalar, RhsScalar, DstScalar,
   }
 };
 
+// A template specialisation for writing 8-bit bitpacked output.
+template <typename LhsScalar, typename RhsScalar, typename Spec>
+struct BgemmKernel<ruy::Path::kStandardCpp, LhsScalar, RhsScalar, std::int8_t,
+                   Spec> {
+  using AccumScalar = typename Spec::AccumScalar;
+  using LhsLayout = typename Spec::StandardCppKernelLhsLayout;
+  using RhsLayout = typename Spec::StandardCppKernelRhsLayout;
+  explicit BgemmKernel(ruy::Tuning) {}
+  void Run(const ruy::PackedMatrix<LhsScalar>& lhs,
+           const ruy::PackedMatrix<RhsScalar>& rhs, const Spec& spec,
+           int start_row, int start_col, int end_row, int end_col,
+           ruy::Matrix<std::int8_t>* dst) const {
+    static_assert(std::is_same<LhsScalar, RhsScalar>::value,
+                  "Inputs to binary kernel should have the same type.");
+    static_assert(
+        std::is_unsigned<LhsScalar>::value &&
+            std::is_integral<LhsScalar>::value,
+        "Input to binary kernel should be of type unsigned integral.");
+
+    using TBitpacked = LhsScalar;
+
+    // We are writing 8-bit bitpacked output (where we bitpack along the channel
+    // axis) and so we need to operate on blocks of 8 channels at a time. As the
+    // destination is column major, this means blocks of 8 rows at a time. The
+    // blocks Ruy uses are always a power of two and are almost always >> 8.
+    // However, when running with multiple threads and a very large input size,
+    // Ruy may use blocks of 4 rows.
+    // In this scenario, we round the start and end row down and up to the
+    // nearest multiple of 8 respectively. This is a thread-safe way to ensure
+    // that the result is correct, at the cost of some rare repeated
+    // computation, which is acceptable for this non-optimised kernel.
+    start_row = 8 * (start_row / 8);
+    end_row = 8 * ((end_row + 7) / 8);
+
+    int clamped_end_row = std::min(end_row, dst->layout.rows);
+    int clamped_end_col = std::min(end_col, dst->layout.cols);
+    RUY_DCHECK_LE(0, start_row);
+    RUY_DCHECK_LE(start_row, clamped_end_row);
+    RUY_DCHECK_LE(clamped_end_row, dst->layout.rows);
+    RUY_DCHECK_LE(clamped_end_row, end_row);
+    RUY_DCHECK_LE(0, start_col);
+    RUY_DCHECK_LE(start_col, clamped_end_col);
+    RUY_DCHECK_LE(clamped_end_col, dst->layout.cols);
+    RUY_DCHECK_LE(clamped_end_col, end_col);
+    RUY_DCHECK_LE(end_col - clamped_end_col, RhsLayout::kCols);
+
+    RUY_DCHECK_EQ(dst->layout.order, Order::kColMajor);
+
+    gemmlowp::ScopedProfilingLabel label(
+        "Binary Kernel (Standard Cpp) Bitpacked Output.");
+
+    const int depth = lhs.layout.rows;
+    const int dst_stride_bitpacked = (dst->layout.stride + 7) / 8;
+
+    // The destination is column major and we need to bitpack along the row
+    // (channels) axis so we need to loop over column index then row index.
+    for (int j = start_col; j < clamped_end_col; j++) {
+      std::uint8_t bitpacked_column = 0;
+      for (int i = start_row; i < clamped_end_row; i++) {
+        using AccumScalar = typename Spec::AccumScalar;
+        AccumScalar accum = 0;
+        for (int k = 0; k < depth; k++) {
+          TBitpacked lhs_val = Element(lhs, k, i);
+          TBitpacked rhs_val = Element(rhs, k, j);
+          accum +=
+              ce::core::xor_popcount<TBitpacked, AccumScalar>(lhs_val, rhs_val);
+        }
+        // Backtransform can still be done in int32
+        accum = spec.backtransform_add - 2 * accum;
+        // Activation function can also be done in int32
+        accum = std::min<AccumScalar>(accum, spec.clamp_max);
+        accum = std::max<AccumScalar>(accum, spec.clamp_min);
+        // Post multiply and add are done in float
+        auto dst_val = static_cast<float>(accum);
+        if (spec.post_activation_multiplier) {
+          dst_val *= spec.post_activation_multiplier[i];
+        }
+        if (spec.post_activation_bias) {
+          dst_val += spec.post_activation_bias[i];
+        }
+        if (dst_val < 0) {
+          bitpacked_column += 1 << ((i - start_row) % 8);
+        }
+        if (((i - start_row + 1) % 8 == 0) || (i + 1 == clamped_end_row)) {
+          *(dst->data.get() + i / 8 + j * dst_stride_bitpacked) =
+              bitpacked_column;
+          bitpacked_column = 0;
+        }
+      }
+    }
+  }
+};
+
 template <ruy::Path ThePath, typename LhsScalar, typename RhsScalar,
           typename DstScalar, typename Spec>
 void RunBgemmKernelTyped(ruy::Tuning tuning,
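A worked sketch of the row-rounding and addressing arithmetic in the 8-bit kernel above (the concrete numbers are hypothetical, chosen only for illustration):

#include <cassert>

int main() {
  // Suppose Ruy hands this thread the row block [4, 12) of a 35-row,
  // column-major destination. Widen it to whole groups of 8 rows:
  int start_row = 4, end_row = 12;
  start_row = 8 * (start_row / 8);    // rounds down -> 0
  end_row = 8 * ((end_row + 7) / 8);  // rounds up   -> 16
  assert(start_row == 0 && end_row == 16);
  // Widened blocks from neighbouring threads may overlap and recompute the
  // same rows, but every block writes each byte it touches in full with the
  // same value, so the final result is unchanged.

  // Bitpacking 8 rows per byte turns the element stride into a byte stride
  // via ceiling division: 35 rows -> 5 bytes per column.
  const int dst_stride_bitpacked = (35 + 7) / 8;
  assert(dst_stride_bitpacked == 5);

  // Row i, column j lands in byte i / 8 of column j:
  const int i = 18, j = 2;
  assert(i / 8 + j * dst_stride_bitpacked == 12);
  return 0;
}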
2 changes: 1 addition & 1 deletion larq_compute_engine/tflite/build_make/Makefile
@@ -1,7 +1,7 @@
 #
 # This is based on
 # tensorflow/tensorflow/lite/tools/make/Makefile
-# 
+#
 # The makefile will always be run from the root of the compute engine repository
 
 # Make uses /bin/sh by default, which is incompatible with the bashisms seen