diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.cpp
index 4afc02acaa8..5799a63ba9c 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.cpp
@@ -2,21 +2,13 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
-#include 
-
-
 #include "test_golden_impls.hpp"
-#include "common/test_tiles.hpp"
-#include "common/bfloat16.hpp"
-#include "tt_metal/host_api.hpp"
-#include "tt_metal/detail/tt_metal.hpp"
-#include "tests/tt_metal/test_utils/packing.hpp"
 
 using std::vector;
 
 namespace unit_tests::compute {
 
-std::vector<uint32_t> gold_standard_untilize(const std::vector<uint32_t> &src_vec, const GoldenConfig &config) {
+vector<uint32_t> gold_standard_untilize(const vector<uint32_t> &src_vec, const GoldenConfig &config) {
     vector<uint32_t> dst_vec;
 
     int num_rows = config.num_tiles_r_dim * config.face_r_dim * (config.num_faces > 2 ? 2 : 1);
@@ -74,7 +66,7 @@ std::vector<uint32_t> gold_standard_untilize(const std::vector<uint32_t> &src_ve
     return dst_vec;
 }
 
-std::vector<uint32_t> gold_standard_tilize(const std::vector<uint32_t> &src_vec, const GoldenConfig &config) {
+vector<uint32_t> gold_standard_tilize(const vector<uint32_t> &src_vec, const GoldenConfig &config) {
     vector<uint32_t> dst_vec;
 
     //TODO: RT update this one to use variable tile sizes
@@ -116,7 +108,7 @@ std::vector<uint32_t> gold_standard_tilize(const std::vector<uint32_t> &src_vec,
 // input shape.x is assumed to have the full number of elements in bfloat16
 // src_vec is expected to be untilized
 // result is also untilized
-std::vector<uint16_t> gold_transpose_wh(const std::vector<uint16_t> &src_vec, const std::vector<uint32_t> &shape) {
+vector<uint16_t> gold_transpose_wh(const vector<uint16_t> &src_vec, const vector<uint32_t> &shape) {
     vector<uint32_t> shapeT{shape[0], shape[1], shape[3], shape[2]};
     TensAddr addr(shape);
     TensAddr addrt(shapeT);
@@ -138,7 +130,7 @@ std::vector<uint16_t> gold_transpose_wh(const std::vector<uint16_t> &src_vec, co
 // input shape.x is assumed to have the full number of elements in bfloat16
 // src_vec is expected to be untilized
 // result is also untilized
-std::vector<uint16_t> gold_reduce_h(const std::vector<uint16_t> &src_vec, const std::vector<uint32_t> &shape, float scaler, uint8_t red_type, bool zeropad) {
+vector<uint16_t> gold_reduce_h(const vector<uint16_t> &src_vec, const vector<uint32_t> &shape, float scaler, uint8_t red_type, bool zeropad) {
     vector<uint32_t> shape_dst{shape[0], shape[1], 1, shape[3]};
     TT_FATAL(shape[2] > 0, "Error");
     if (zeropad)
@@ -167,7 +159,7 @@ std::vector<uint16_t> gold_reduce_h(const std::vector<uint16_t> &src_vec, const
     return reduced;
 };
 
-std::vector<uint16_t> gold_reduce_w(const vector<uint16_t> &src_vec, const std::vector<uint32_t> &shape, float scaler, uint8_t red_type, bool zeropad) {
+vector<uint16_t> gold_reduce_w(const vector<uint16_t> &src_vec, const vector<uint32_t> &shape, float scaler, uint8_t red_type, bool zeropad) {
     vector<uint32_t> shape_dst{shape[0], shape[1], shape[2], 1};
     if (zeropad)
         shape_dst[3] = 32;
@@ -194,7 +186,7 @@ std::vector<uint16_t> gold_reduce_w(const vector<uint16_t> &src_vec, const std::
     return reduced;
 }
 
-std::vector<uint16_t> gold_reduce_hw(const std::vector<uint16_t> &src_vec, const std::vector<uint32_t> &shape, float scaler, uint8_t red_type, bool zeropad) {
+vector<uint16_t> gold_reduce_hw(const vector<uint16_t> &src_vec, const vector<uint32_t> &shape, float scaler, uint8_t red_type, bool zeropad) {
     vector<uint32_t> shape_dst{shape[0], shape[1], 1, 1};
     if (zeropad) {
        shape_dst[2] = 32;
@@ -225,12 +217,12 @@ std::vector<uint16_t> gold_reduce_hw(const std::vector<uint16_t> &src_vec, const
 
     return reduced;
 }
 
-std::vector<uint32_t> gold_standard_tilize_w_elwadd(const std::vector<uint32_t> &src0_vec, const std::vector<uint32_t> &src1_vec, const GoldenConfig &config) {
+vector<uint32_t> gold_standard_tilize_w_elwadd(const vector<uint32_t> &src0_vec, const vector<uint32_t> &src1_vec, const GoldenConfig &config) {
 
-    std::vector<bfloat16> unpacked_tilize_src0_vec = tt::test_utils::unpack_vector<bfloat16, uint32_t>(gold_standard_tilize(src0_vec, config));
-    std::vector<bfloat16> unpacked_src1_vec = tt::test_utils::unpack_vector<bfloat16, uint32_t>(src1_vec);
+    vector<bfloat16> unpacked_tilize_src0_vec = tt::test_utils::unpack_vector<bfloat16, uint32_t>(gold_standard_tilize(src0_vec, config));
+    vector<bfloat16> unpacked_src1_vec = tt::test_utils::unpack_vector<bfloat16, uint32_t>(src1_vec);
 
-    std::vector<bfloat16> result_vec(unpacked_tilize_src0_vec.size());
+    vector<bfloat16> result_vec(unpacked_tilize_src0_vec.size());
 
     std::transform(
         unpacked_tilize_src0_vec.begin(),
@@ -244,5 +236,127 @@ std::vector<uint32_t> gold_standard_tilize_w_elwadd(const std::vector<uint32_t>
     return tt::test_utils::pack_vector<uint32_t, bfloat16>(result_vec);
 }
 
+// A pointer to the appropriate function for generating a random packed vector depending on the data format
+using RandomVectorGenerator = std::function<vector<uint32_t>(uint32_t num_bytes, bool is_exp_a, int max_float, int seed, float offset)>;
+// A pointer to the appropriate function for unpacking a vector in the given data format into a float vector
+using VectorUnpacker = std::function<vector<float>(const vector<uint32_t> &packed_input, bool row_major_output, bool is_exp_a)>;
+
+
+vector<uint32_t> generate_random_vector_generalized(
+    const float lower,
+    const float upper,
+    const size_t num_bytes,
+    const tt::DataFormat data_format,
+    const int seed,
+    bool exclude_zeroes,
+    float golden_neg_epsilon,
+    float golden_pos_epsilon) {
+
+    RandomVectorGenerator vector_generator;
+
+    // Select the appropriate vector generator based on the data format
+    switch (data_format) {
+        case tt::DataFormat::Float16_b:
+            vector_generator = [&](uint32_t num_bytes, bool is_exp_a, int max_float, int seed, float offset) {
+                return create_random_vector_of_bfloat16(num_bytes, max_float, seed, offset);
+            };
+            break;
+        case tt::DataFormat::Float32:
+            vector_generator = [&](uint32_t num_bytes, bool is_exp_a, int max_float, int seed, float offset) {
+                auto rand_float = std::bind(std::uniform_real_distribution<float>(0, max_float), std::mt19937(seed));
+                vector<uint32_t> vec(num_bytes / sizeof(uint32_t), 0);
+                for (int i = 0; i < vec.size(); i++) {
+                    float num_float = rand_float() + offset;
+                    std::memcpy(&vec[i], &num_float, sizeof(float));
+                }
+                return vec;
+            };
+            break;
+        case tt::DataFormat::Bfp8_b:
+            vector_generator = [&](uint32_t num_bytes, bool is_exp_a, int max_float, int seed, float offset) {
+                return create_random_vector_of_bfp8(num_bytes, is_exp_a, max_float, seed, offset);
+            };
+            break;
+        case tt::DataFormat::Bfp4_b:
+            vector_generator = [&](uint32_t num_bytes, bool is_exp_a, int max_float, int seed, float offset) {
+                return create_random_vector_of_bfp4(num_bytes, is_exp_a, max_float, seed, offset);
+            };
+            break;
+        default:
+            TT_THROW("Unsupported DataFormat!");
+            return {};
+    }
+
+    if (exclude_zeroes) {
+        if (lower < 0 && upper > 0) {
+            vector<uint32_t> vec;
+
+            // Split into negative and positive parts, avoiding zero
+            auto negative_part = vector_generator(
+                num_bytes / 2,
+                false,
+                std::abs(lower - golden_neg_epsilon),
+                seed,
+                lower);
+            auto positive_part = vector_generator(
+                num_bytes - num_bytes / 2,
+                false,
+                upper - golden_pos_epsilon,
+                seed + 1, // Use a different seed for the positive part
+                golden_pos_epsilon);
+
+            // Combine both parts
+            vec.insert(vec.end(), negative_part.begin(), negative_part.end());
+            vec.insert(vec.end(), positive_part.begin(), positive_part.end());
+            return vec;
+        } else {
+            TT_THROW("Cannot create a vector without zeroes with selected input value range!");
+        }
+    } else {
+        // Use the generic generator for the entire range
+        return vector_generator(num_bytes, false, upper - lower, seed, lower);
+    }
+}
+
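+// Illustrative usage only (not called by the tests): a sketch of requesting
+// one tile's worth of Bfp8_b data in (-5.0f, 5.0f) with zeroes excluded; the
+// range and seed are hypothetical example values.
+//
+//     vector<uint32_t> packed = generate_random_vector_generalized(
+//         -5.0f, 5.0f, tile_size(tt::DataFormat::Bfp8_b),
+//         tt::DataFormat::Bfp8_b, 0 /* seed */, true /* exclude_zeroes */,
+//         -0.0001f, 0.0001f);
+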
+vector<float> unpack_generalized(const tt::DataFormat data_format, const vector<uint32_t>& packed_input) {
+    VectorUnpacker unpacker_function;
+
+    // Select the appropriate unpacker function based on the data format
+    switch (data_format) {
+        case tt::DataFormat::Float16_b:
+            unpacker_function = [&](const vector<uint32_t> &packed_input, bool row_major_output, bool is_exp_a) {
+                vector<bfloat16> vec = unpack_uint32_vec_into_bfloat16_vec(packed_input);
+                vector<float> vec_float(vec.size());
+                for (int i = 0; i < vec.size(); i++) {
+                    vec_float[i] = vec[i].to_float();
+                }
+                return vec_float;
+            };
+            break;
+        case tt::DataFormat::Float32:
+            unpacker_function = [&](const vector<uint32_t> &packed_input, bool row_major_output, bool is_exp_a) {
+                vector<float> vec(packed_input.size(), 0);
+                for (int i = 0; i < packed_input.size(); i++) {
+                    std::memcpy(&vec[i], &packed_input[i], sizeof(uint32_t));
+                }
+                return vec;
+            };
+            break;
+        case tt::DataFormat::Bfp8_b:
+            unpacker_function = [&](const vector<uint32_t> &packed_input, bool row_major_output, bool is_exp_a) {
+                return unpack_bfp8_tiles_into_float_vec(packed_input, row_major_output, is_exp_a);
+            };
+            break;
+        case tt::DataFormat::Bfp4_b:
+            unpacker_function = [&](const vector<uint32_t> &packed_input, bool row_major_output, bool is_exp_a) {
+                return unpack_bfp4_tiles_into_float_vec(packed_input, row_major_output, is_exp_a);
+            };
+            break;
+        default:
+            TT_THROW("Unsupported DataFormat!");
+            return {};
+    }
+    return unpacker_function(packed_input, true, false);
+}
+
 } // unit_tests::compute
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.hpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.hpp
index 550ced2cf6b..f44c8cc405c 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.hpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.hpp
@@ -9,6 +9,15 @@
 #include 
 #include 
 #include 
+#include 
+
+#include "common/test_tiles.hpp"
+#include "tt_metal/common/bfloat16.hpp"
+#include "tt_metal/common/bfloat8.hpp"
+#include "tt_metal/common/bfloat4.hpp"
+#include "tt_metal/host_api.hpp"
+#include "tt_metal/detail/tt_metal.hpp"
+#include "tests/tt_metal/test_utils/packing.hpp"
 
 //TODO: RT these functions should be templated for different data formats
 namespace unit_tests::compute {
@@ -54,4 +63,31 @@ std::vector<uint16_t> gold_reduce_hw(const std::vector<uint16_t> &src_vec, const
 // Assumes all elements in bfloat16
 std::vector<uint32_t> gold_standard_tilize_w_elwadd(const std::vector<uint32_t> &src0_vec, const std::vector<uint32_t> &src1_vec, const GoldenConfig &config);
 
+// Random packed uint32_t vector generator which is data-format agnostic.
+// Takes the following parameters:
+//
+// lower - the lower limit of the input range
+// upper - the upper limit of the input range
+// num_bytes - number of bytes that the vector will occupy
+// data_format - data format of each element, packed into uint32_t; currently supports Float16_b, Float32, Bfp8_b and Bfp4_b
+// seed - randomization seed
+// exclude_zeroes - if true, excludes values around zero, within the limits given by the next two parameters
+// golden_neg_epsilon - small negative value; when zeroes are excluded, no element falls between it and zero
+// golden_pos_epsilon - small positive value; when zeroes are excluded, no element falls between zero and it
+//
+// Returns:
+//
+// a uint32_t vector of values packed according to the data format and the given limits
+std::vector<uint32_t> generate_random_vector_generalized(const float lower, const float upper, const size_t num_bytes, const tt::DataFormat data_format, const int seed, bool exclude_zeroes = false, float golden_neg_epsilon = -0.0001f, float golden_pos_epsilon = 0.0001f);
+
+// Unpacking function which is data-format agnostic.
+// Takes the following parameters:
+//
+// data_format - data format in which the vector was packed; currently supports Float16_b, Float32, Bfp8_b and Bfp4_b
+// packed_input - a uint32_t packed vector
+//
+// Returns:
+//
+// a float vector of unpacked values depending on the data format
+std::vector<float> unpack_generalized(const tt::DataFormat data_format, const std::vector<uint32_t>& packed_input);
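+
+// Illustrative round trip (hypothetical values, not part of the API itself):
+// generate one tile of packed Float16_b data, then unpack it back to floats
+// for golden comparison:
+//
+//     auto packed = generate_random_vector_generalized(
+//         -1.0f, 1.0f, tile_size(tt::DataFormat::Float16_b), tt::DataFormat::Float16_b, 0);
+//     auto unpacked = unpack_generalized(tt::DataFormat::Float16_b, packed);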
+
 } // unit_tests::compute
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp
index 35ffb316d01..419dc486fd9 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp
@@ -5,30 +5,37 @@
 #include 
 #include 
-#include 
-#include 
-#include 
-
 #include "device_fixture.hpp"
+#include "test_golden_impls.hpp"
 #include "tt_metal/detail/tt_metal.hpp"
-#include "tt_metal/host_api.hpp"
 #include "tt_metal/test_utils/comparison.hpp"
-#include "tt_metal/test_utils/df/df.hpp"
-#include "tt_metal/test_utils/print_helpers.hpp"
 #include "tt_metal/test_utils/stimulus.hpp"
 
+// Limits of the SFPU input value range. These values are chosen because they
+// cover the domains of all SFPU functions used in the test
+#define GOLDEN_BOT_LIMIT (-7.0f)
+#define GOLDEN_TOP_LIMIT (7.0f)
+// Small values around zero for domains which have to exclude zeroes, like
+// log or reciprocal
+#define GOLDEN_NEG_EPSILON (-0.0001f)
+#define GOLDEN_POS_EPSILON (0.0001f)
+// Min/max values of the randomly-generated input block height/width
+#define MIN_BLOCK_DIM (1)
+#define MAX_BLOCK_DIM (10)
+// Number of randomly generated dimensions; can be expanded up to 4,
+// making the input a full tensor rather than a matrix
+#define NUM_DIMS (2)
+
 using std::map;
 using std::vector;
 using namespace tt;
 using namespace tt::test_utils;
-using namespace tt::test_utils::df;
 using namespace tt::tt_metal;
 
 namespace unit_tests::sfpu_util {
-
-const map<string, std::map<string, string>> sfpu_op_to_op_name = {
-    // FIXME: #1157
+// Internal sfpu_op is mapped to the proper SFPU function calls
+const map<string, map<string, string>> sfpu_op_to_op_name = {
     {"relu", {{"SFPU_OP_CHAIN_0", "relu_tile_init(); relu_tile(0);"}}},
     {"exponential", {{"SFPU_OP_CHAIN_0", "exp_tile_init(); exp_tile(0);"}}},
     {"reciprocal", {{"SFPU_OP_CHAIN_0", "recip_tile_init(); recip_tile(0);"}}},
@@ -39,62 +46,65 @@ const map<string, map<string, string>> sfpu_op_to_op_name = {
     {"tanh", {{"SFPU_OP_CHAIN_0", "tanh_tile_init(); tanh_tile(0);"}}},
 };
 
-bfloat16 sfpu_function(const string& op_name, const bfloat16& input) {
+// Function that generates different input ranges depending on the SFPU op specified
+vector<uint32_t> generate_random_sfpu_vector(
+    const float lower,
+    const float upper,
+    const size_t num_bytes,
+    const string& op_name,
+    const tt::DataFormat data_format,
+    const int seed) {
+    if ((op_name == "sqrt") || (op_name == "log")) {
+        // sqrt and log take values in (0, upper]
+        return unit_tests::compute::generate_random_vector_generalized(GOLDEN_POS_EPSILON, upper, num_bytes, data_format, seed);
+    } else if (op_name == "reciprocal") {
+        // For reciprocal, exclude zeroes and use the (lower, upper) range
+        return unit_tests::compute::generate_random_vector_generalized(lower, upper, num_bytes, data_format, seed, true, GOLDEN_NEG_EPSILON, GOLDEN_POS_EPSILON);
+    } else {
+        // For all other operations, allow zeroes and use the (lower, upper) range
+        return unit_tests::compute::generate_random_vector_generalized(lower, upper, num_bytes, data_format, seed);
+    }
+}
+
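+// For example (hypothetical call, mirroring how run_sfpu_test uses this
+// helper below): op_name "log" with lower = GOLDEN_BOT_LIMIT and
+// upper = GOLDEN_TOP_LIMIT yields values in (GOLDEN_POS_EPSILON, GOLDEN_TOP_LIMIT],
+// keeping the generated inputs inside log's domain.
+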
+// Function that performs SFPU ops on float values.
+// It is used to generate the golden output
+float sfpu_function(const string& op_name, const float& input) {
     if (op_name == "relu") {
-        return bfloat16(fmaxf(input.to_float(), 0.0f));
+        return fmaxf(input, 0.0f);
     } else if (op_name == "exponential") {
-        return bfloat16(std::exp(input.to_float()));
+        return std::exp(input);
     } else if (op_name == "reciprocal") {
-        return bfloat16(1 / input.to_float());
+        return 1 / input;
     } else if (op_name == "gelu") {
         static constexpr float alpha = M_2_SQRTPI * M_SQRT1_2;
-        auto x = input.to_float();
-        auto x3 = x * x * x;
-        float result = x * 0.5 * (1.0 + tanhf(alpha * (x + 0.044715 * x3)));
-        return bfloat16(result);
+        auto x3 = input * input * input;
+        return input * 0.5 * (1.0 + tanhf(alpha * (input + 0.044715 * x3)));
     } else if (op_name == "sqrt") {
-        return bfloat16(sqrtf(input.to_float()));
+        return sqrtf(input);
     } else if (op_name == "sigmoid") {
-        auto x = input.to_float();
-        float result = 1 / (1 + std::exp(-x));
-        return bfloat16(result);
+        return 1 / (1 + std::exp(-input));
     } else if (op_name == "log") {
-        return bfloat16(logf(input.to_float()));
+        return input ? logf(input) : 0.0f;
     } else if (op_name == "tanh") {
-        return bfloat16(std::tanh(input.to_float()));
-    } else {
-        TT_THROW("Unsupported op_name in test");
-        return bfloat16(0.0f);
-    }
-}
-vector<uint32_t> generate_packed_sfpu_input(const unsigned int numel, const string& op_name, const int seed) {
-    if ((op_name == "sqrt") or (op_name == "log")) {
-        return generate_packed_uniform_random_vector<float, uint32_t>(0.0001f, 4.0f, numel, seed);
-    } else if ((op_name == "exponential") or (op_name == "gelu") or (op_name == "reciprocal")) {
-        auto possible_values = vector<float>({-1.0f, -0.5f, 0.5f, 1.0f});
-        return generate_packed_random_vector_from_vector<float, uint32_t>(possible_values, numel, seed);
+        return std::tanh(input);
     } else {
-        return generate_packed_uniform_random_vector<float, uint32_t>(-1.0f, 1.0f, numel, seed);
+        TT_THROW("Unsupported op_name!");
+        return 0.0f;
     }
 }
 
-bool is_close_packed_sfpu_output(const std::vector<uint32_t>& vec_a, const std::vector<uint32_t>& vec_b, const string& op_name) {
-    if (op_name == "tanh") {
-        return is_close_packed_vectors<bfloat16, uint32_t>(
-            vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.175f, 0.1f); });
-    } else if ((op_name == "gelu") or (op_name == "relu")) {
-        return is_close_packed_vectors<bfloat16, uint32_t>(
-            vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.15f); });
-    } else if ((op_name == "exponential")) {
-        return is_close_packed_vectors<bfloat16, uint32_t>(
-            vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.1f, 0.1f); });
-    } else if ((op_name == "log")) {
-        return is_close_packed_vectors<bfloat16, uint32_t>(
-            vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.03f, 0.02f); });
-    } else {
-        return is_close_packed_vectors<bfloat16, uint32_t>(
-            vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.06f, 0.006f); });
+// Function that compares the SFPU output against the golden. Different tolerances are needed for different ops
+bool is_close_packed_sfpu_output(const vector<float>& vec_a, const vector<float>& vec_b, const string& op_name) {
+    for (int i = 0; i < vec_a.size(); i++) {
+        bool close;
+        if (op_name == "tanh") {
+            close = is_close(vec_a[i], vec_b[i], 0.175f, 0.1f);
+        } else if ((op_name == "sqrt") or (op_name == "reciprocal") or (op_name == "exponential")) {
+            close = is_close(vec_a[i], vec_b[i], 0.06f, 0.002f);
+        } else {
+            close = is_close(vec_a[i], vec_b[i], 0.01f, 0.05f);
+        }
+        // Fail on the first out-of-tolerance element rather than returning
+        // after checking only the first one
+        if (!close) {
+            return false;
+        }
     }
+    return true;
 }
 
 }  // namespace unit_tests::sfpu_util
@@ -102,13 +112,14 @@ bool is_close_packed_sfpu_output(const std::vector<uint32_t>& vec_a, const std::
 namespace unit_tests::compute::sfpu {
 
 struct SfpuConfig {
-    size_t num_tiles = 0;
-    size_t tile_byte_size = 0;
+    size_t r_tile_dim = 0;
+    size_t c_tile_dim = 0;
     tt::DataFormat l1_input_data_format = tt::DataFormat::Invalid;
     tt::DataFormat l1_output_data_format = tt::DataFormat::Invalid;
     CoreRangeSet cores = CoreRangeSet();
     std::string sfpu_op = "";
     bool approx_mode = true;
+    bool fp32_dest_acc_en = true;
 };
 
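+// Example (illustrative values only): a single-tile Float16_b "gelu" run on
+// one core could be configured as
+//     SfpuConfig{.r_tile_dim = 1, .c_tile_dim = 1,
+//                .l1_input_data_format = tt::DataFormat::Float16_b,
+//                .l1_output_data_format = tt::DataFormat::Float16_b,
+//                .cores = core_range_set, .sfpu_op = "gelu",
+//                .approx_mode = false, .fp32_dest_acc_en = false};
+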
 /// @brief Does Dram --> Reader --> CB --> Sfpu Compute --> CB --> Writer --> Dram. So far, enqueue APIs only added to
 /// @param device
 /// @param test_config - Configuration of the test -- see struct
 /// @return
-bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_config) {
-    const size_t byte_size = test_config.num_tiles * test_config.tile_byte_size;
+bool run_sfpu_test(tt_metal::Device* device, const SfpuConfig& test_config) {
+    size_t num_tiles = test_config.r_tile_dim * test_config.c_tile_dim;
+    const size_t input_byte_size = num_tiles * tile_size(test_config.l1_input_data_format);
+    const size_t output_byte_size = num_tiles * tile_size(test_config.l1_output_data_format);
+
     tt_metal::Program program = tt_metal::CreateProgram();
-    tt::tt_metal::InterleavedBufferConfig dram_config{
-                            .device=device,
-                            .size = byte_size,
-                            .page_size = byte_size,
+
+    // Create input/output buffers
+    tt::tt_metal::InterleavedBufferConfig input_dram_config{
+        .device = device,
+        .size = input_byte_size,
+        .page_size = input_byte_size,
         .buffer_type = tt::tt_metal::BufferType::DRAM
-                };
+    };
 
-    auto input_dram_buffer = CreateBuffer(dram_config);
+    tt::tt_metal::InterleavedBufferConfig output_dram_config{
+        .device = device,
+        .size = output_byte_size,
+        .page_size = output_byte_size,
+        .buffer_type = tt::tt_metal::BufferType::DRAM
+    };
+
+    auto input_dram_buffer = CreateBuffer(input_dram_config);
     uint32_t input_dram_byte_address = input_dram_buffer->address();
     auto input_dram_noc_xy = input_dram_buffer->noc_coordinates();
 
-    auto output_dram_buffer = CreateBuffer(dram_config);
+    auto output_dram_buffer = CreateBuffer(output_dram_config);
     uint32_t output_dram_byte_address = output_dram_buffer->address();
     auto output_dram_noc_xy = output_dram_buffer->noc_coordinates();
 
     vector<uint32_t> compute_kernel_args = {
-        uint32_t(test_config.num_tiles), // per_core_block_cnt
-        1 // per_core_block_cnt
+        uint32_t(test_config.c_tile_dim), // per_core_block_cnt
+        uint32_t(test_config.r_tile_dim)  // per_core_block_dim
     };
 
-    // Input
-    std::vector<uint32_t> packed_input = sfpu_util::generate_packed_sfpu_input(
-        byte_size / bfloat16::SIZEOF, test_config.sfpu_op, std::chrono::system_clock::now().time_since_epoch().count());
-
-    // Golden output
-    auto input = unpack_vector<bfloat16, uint32_t>(packed_input);
-    std::vector<bfloat16> golden(input.size());
-    std::transform(input.begin(), input.end(), golden.begin(), [&](const bfloat16& val) {
+    // Create packed input
+    vector<uint32_t> packed_input = sfpu_util::generate_random_sfpu_vector(
+        GOLDEN_BOT_LIMIT,
+        GOLDEN_TOP_LIMIT,
+        input_byte_size,
+        test_config.sfpu_op,
+        test_config.l1_input_data_format,
+        std::chrono::system_clock::now().time_since_epoch().count()
+    );
+
+    // Unpack the input to prepare for the golden
+    vector<float> unpacked_input = unit_tests::compute::unpack_generalized(test_config.l1_input_data_format, packed_input);
+
+    // Golden output, a float vector
+    vector<float> golden(unpacked_input.size());
+    std::transform(unpacked_input.begin(), unpacked_input.end(), golden.begin(), [&](const float& val) {
         return sfpu_util::sfpu_function(test_config.sfpu_op, val);
     });
-    std::vector<uint32_t> packed_golden = pack_vector<uint32_t, bfloat16>(golden);
 
     // Same runtime args for every core
     vector<uint32_t> reader_rt_args = {
         (uint32_t)input_dram_byte_address,
         (uint32_t)input_dram_noc_xy.x,
         (uint32_t)input_dram_noc_xy.y,
-        (uint32_t)test_config.num_tiles,
+        (uint32_t)num_tiles,
     };
 
     vector<uint32_t> writer_rt_args = {
         (uint32_t)output_dram_byte_address,
         (uint32_t)output_dram_noc_xy.x,
         (uint32_t)output_dram_noc_xy.y,
-        (uint32_t)test_config.num_tiles,
+        (uint32_t)num_tiles,
     };
 
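+    // Note: in the per-core setup below, CB index 0 is the reader->compute
+    // circular buffer and CB index 16 the compute->writer one; each page is
+    // sized to one tile of the respective format, so num_tiles pages fit in
+    // each buffer.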
     for (const CoreRange& core_range : test_config.cores.ranges()) {
-        tt_metal::CircularBufferConfig l1_input_cb_config = tt_metal::CircularBufferConfig(byte_size, {{0, test_config.l1_input_data_format}})
-            .set_page_size(0, test_config.tile_byte_size);
+        tt_metal::CircularBufferConfig l1_input_cb_config = tt_metal::CircularBufferConfig(input_byte_size, {{0, test_config.l1_input_data_format}})
+            .set_page_size(0, tile_size(test_config.l1_input_data_format));
         auto l1_input_cb = tt_metal::CreateCircularBuffer(program, core_range, l1_input_cb_config);
 
-        tt_metal::CircularBufferConfig l1_output_cb_config = tt_metal::CircularBufferConfig(byte_size, {{16, test_config.l1_output_data_format}})
-            .set_page_size(16, test_config.tile_byte_size);
+        tt_metal::CircularBufferConfig l1_output_cb_config = tt_metal::CircularBufferConfig(output_byte_size, {{16, test_config.l1_output_data_format}})
+            .set_page_size(16, tile_size(test_config.l1_output_data_format));
         auto l1_output_cb = tt_metal::CreateCircularBuffer(program, core_range, l1_output_cb_config);
 
         auto reader_kernel = tt_metal::CreateKernel(
@@ -188,7 +218,7 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c
             tt_metal::DataMovementConfig{
                 .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
 
-        std::map<string, string> sfpu_defines = sfpu_util::sfpu_op_to_op_name.at(test_config.sfpu_op);
+        map<string, string> sfpu_defines = sfpu_util::sfpu_op_to_op_name.at(test_config.sfpu_op);
 
         sfpu_defines["SFPU_OP_EXP_INCLUDE"] = "1";
         sfpu_defines["SFPU_OP_GELU_INCLUDE"] = "1";
@@ -200,11 +230,16 @@
         sfpu_defines["SFPU_OP_RELU_FAMILY_INCLUDE"] = "1";
         sfpu_defines["SFPU_OP_COMPUTE_KERNEL_API_INCLUDE"]="1";
 
+        if (test_config.fp32_dest_acc_en) {
+            sfpu_defines["DEST_ACCUM_EN"] = "1";
+        }
+
         auto sfpu_kernel = tt_metal::CreateKernel(
             program,
             "tt_metal/kernels/compute/eltwise_sfpu.cpp",
             test_config.cores,
             tt_metal::ComputeConfig{
+                .fp32_dest_acc_en = test_config.fp32_dest_acc_en,
                 .math_approx_mode = test_config.approx_mode,
                 .compile_args = compute_kernel_args,
                 .defines = sfpu_defines});
@@ -216,258 +251,93 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c
         }
     }
 
-    std::vector<uint32_t> dest_buffer_data;
+    vector<uint32_t> packed_output;
     tt_metal::detail::WriteToBuffer(input_dram_buffer, packed_input);
     tt_metal::detail::LaunchProgram(device, program);
-    tt_metal::detail::ReadFromBuffer(output_dram_buffer, dest_buffer_data);
+    tt_metal::detail::ReadFromBuffer(output_dram_buffer, packed_output);
+
+    // Unpack the SFPU output to a float vector
+    vector<float> unpacked_output = unit_tests::compute::unpack_generalized(test_config.l1_output_data_format, packed_output);
 
-    return sfpu_util::is_close_packed_sfpu_output(dest_buffer_data, packed_golden, test_config.sfpu_op);
+    return sfpu_util::is_close_packed_sfpu_output(golden, unpacked_output, test_config.sfpu_op);
 }
 
 }  // namespace unit_tests::compute::sfpu
 
+
 class SingleCoreSingleDeviceSfpuParameterizedFixture : public DeviceFixture,
-                                                       public testing::WithParamInterface<std::tuple<size_t, std::string>> {
+                                                       public testing::WithParamInterface<std::tuple<std::tuple<tt::DataFormat, tt::DataFormat>, std::string>> {
 };
+
 TEST_P(SingleCoreSingleDeviceSfpuParameterizedFixture, SfpuCompute) {
-    size_t num_tiles = std::get<0>(GetParam());
+    // Generate a random width and height of the input block.
+    // Can be easily expanded to all dimensions of a tensor
+    vector<uint32_t> random_shape = generate_uniform_random_vector<uint32_t>(
+        MIN_BLOCK_DIM,
+        MAX_BLOCK_DIM,
+        NUM_DIMS,
+        std::chrono::system_clock::now().time_since_epoch().count() // Seed
+    );
+    size_t r_tile_dim = random_shape[0];
+    size_t c_tile_dim = random_shape[1];
+
+    // Extract the tuple of input/output formats and the sfpu_op
+    auto formats = std::get<0>(GetParam());
     string sfpu_op = std::get<1>(GetParam());
 
+    // Extract the input and output formats from the tuple
+    tt::DataFormat input_format = std::get<0>(formats);
+    tt::DataFormat output_format = std::get<1>(formats);
+
     CoreRange core_range({0, 0}, {0, 0});
     CoreRangeSet core_range_set({core_range});
-    unit_tests::compute::sfpu::SfpuConfig test_config = {
-        .num_tiles = num_tiles,
-        .tile_byte_size = 2 * 32 * 32,
-        .l1_input_data_format = tt::DataFormat::Float16_b,
-        .l1_output_data_format = tt::DataFormat::Float16_b,
-        .cores = core_range_set,
-        .sfpu_op = sfpu_op,
-        .approx_mode = false};
-    log_info("Testing SFPU_OP={} num_tiles={}", sfpu_op, num_tiles);
-    for (unsigned int id = 0; id < num_devices_; id++) {
-        EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(id), test_config));
+    for (bool approx_mode : {true, false}) {
+        for (bool fp32_dest_acc_en : {true, false}) {
+            // FP32 dest accumulation is not supported on Grayskull
+            if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
+            unit_tests::compute::sfpu::SfpuConfig test_config = {
+                .r_tile_dim = r_tile_dim,
+                .c_tile_dim = c_tile_dim,
+                .l1_input_data_format = input_format,
+                .l1_output_data_format = output_format,
+                .cores = core_range_set,
+                .sfpu_op = sfpu_op,
+                .approx_mode = approx_mode,
+                .fp32_dest_acc_en = fp32_dest_acc_en
+            };
+            log_info("SFPU_OP={}, r_tile_dim={}, c_tile_dim={}, approx_mode={}, fp32_dest_acc_en={}, input_format={}, output_format={}",
+                sfpu_op,
+                r_tile_dim,
+                c_tile_dim,
+                approx_mode,
+                fp32_dest_acc_en,
+                input_format,
+                output_format);
+            for (unsigned int id = 0; id < num_devices_; id++) {
+                EXPECT_TRUE(run_sfpu_test(devices_.at(id), test_config));
+            }
+        }
     }
 }
 
 INSTANTIATE_TEST_SUITE_P(
     SingleCoreSfpuCompute,
     SingleCoreSingleDeviceSfpuParameterizedFixture,
-    ::testing::Values(
-        std::make_tuple(1, "relu"),
-        std::make_tuple(1, "exponential"),
-        std::make_tuple(1, "reciprocal"),
-        std::make_tuple(1, "gelu"),
-        std::make_tuple(1, "sqrt"),
-        std::make_tuple(1, "sigmoid"),
-        std::make_tuple(1, "log"),
-        std::make_tuple(1, "tanh"),
-        std::make_tuple(4, "relu"),
-        std::make_tuple(4, "exponential"),
-        std::make_tuple(4, "reciprocal"),
-        std::make_tuple(4, "gelu"),
-        std::make_tuple(4, "sqrt"),
-        std::make_tuple(4, "sigmoid"),
-        std::make_tuple(4, "log"),
-        std::make_tuple(4, "tanh")));
-class SingleCoreSingleDeviceSfpuParameterizedApproxFixture
-    : public DeviceFixture,
-      public testing::WithParamInterface<std::tuple<size_t, std::string>> {};
-
-TEST_P(SingleCoreSingleDeviceSfpuParameterizedApproxFixture, SfpuCompute) {
-    size_t num_tiles = std::get<0>(GetParam());
-    string sfpu_op = std::get<1>(GetParam());
-
-    if (((arch_ == tt::ARCH::WORMHOLE_B0) and (sfpu_op == "relu")) or
-        ((arch_ == tt::ARCH::WORMHOLE_B0) and (sfpu_op == "exponential")) or
-        ((arch_ == tt::ARCH::WORMHOLE_B0) and (sfpu_op == "log"))) {
-        GTEST_SKIP();
-    } else {
-        CoreRange core_range({0, 0}, {0, 0});
-        CoreRangeSet core_range_set({core_range});
-        unit_tests::compute::sfpu::SfpuConfig test_config = {
-            .num_tiles = num_tiles,
-            .tile_byte_size = 2 * 32 * 32,
-            .l1_input_data_format = tt::DataFormat::Float16_b,
-            .l1_output_data_format = tt::DataFormat::Float16_b,
-            .cores = core_range_set,
-            .sfpu_op = sfpu_op,
-            .approx_mode = true};
-        log_info("Testing SFPU_OP={} num_tiles={}", sfpu_op, num_tiles);
-        for (unsigned int id = 0; id < num_devices_; id++) {
-            EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(id), test_config));
-        }
-    }
-}
-INSTANTIATE_TEST_SUITE_P(
-    SingleCoreSfpuCompute,
-    SingleCoreSingleDeviceSfpuParameterizedApproxFixture,
-    ::testing::Values(
-        std::make_tuple(1, "relu"),
-        std::make_tuple(1, "exponential"),
-        std::make_tuple(1, "reciprocal"),
-        std::make_tuple(1, "gelu"),
-        std::make_tuple(1, "sqrt"),
-        std::make_tuple(1, "sigmoid"),
-        std::make_tuple(1, "log"),
-        std::make_tuple(1, "tanh"),
-        std::make_tuple(4, "relu"),
-        std::make_tuple(4, "exponential"),
-        std::make_tuple(4, "reciprocal"),
-        std::make_tuple(4, "gelu"),
-        std::make_tuple(4, "sqrt"),
-        std::make_tuple(4, "sigmoid"),
-        std::make_tuple(4, "log"),
-        std::make_tuple(4, "tanh")));
-
-TEST_F(DeviceFixture, DISABLED_MultiContinguousCoreSingleTileSfpuApproxCompute) {
-    CoreRange core_range({0, 0}, {1, 0});
-    CoreRangeSet core_range_set({core_range});
-    unit_tests::compute::sfpu::SfpuConfig test_config = {
-        .tile_byte_size = 2 * 32 * 32,
-        .l1_input_data_format = tt::DataFormat::Float16_b,
-        .l1_output_data_format = tt::DataFormat::Float16_b,
-        .cores = core_range_set,
-        .approx_mode = true};
-
-    auto arch = this->arch_;
-
-    if (arch != tt::ARCH::GRAYSKULL) {
-        GTEST_SKIP();
-    }
-
-    CoreRangeSet core_set({core_range});
-    test_config.cores = core_set;
-
-    test_config.num_tiles = 1;
-    test_config.sfpu_op = "relu";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "exponential";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "reciprocal";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "gelu";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "sqrt";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "sigmoid";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "log";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "tanh";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-}
-
-TEST_F(DeviceFixture, DISABLED_MultiContinguousCoreMultiTileSfpuApproxCompute) {
-    CoreRange core_range({0, 0}, {1, 0});
-    CoreRangeSet core_range_set({core_range});
-    unit_tests::compute::sfpu::SfpuConfig test_config = {
-        .tile_byte_size = 2 * 32 * 32,
-        .l1_input_data_format = tt::DataFormat::Float16_b,
-        .l1_output_data_format = tt::DataFormat::Float16_b,
-        .cores = core_range_set,
-        .approx_mode = true};
-
-    auto arch = this->arch_;
-
-    if (arch != tt::ARCH::GRAYSKULL) {
-        GTEST_SKIP();
-    }
-
-    CoreRangeSet core_set({core_range});
-    test_config.cores = core_set;
-
-    test_config.num_tiles = 4;
-
-    test_config.sfpu_op = "relu";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "exponential";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "reciprocal";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "gelu";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "sqrt";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "sigmoid";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "log";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "tanh";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-}
-TEST_F(DeviceFixture, DISABLED_AllCoreSingleTileSfpuApproxCompute) {
-    unit_tests::compute::sfpu::SfpuConfig test_config = {
-        .tile_byte_size = 2 * 32 * 32,
-        .l1_input_data_format = tt::DataFormat::Float16_b,
-        .l1_output_data_format = tt::DataFormat::Float16_b,
-        .cores = CoreRangeSet(),
-        .approx_mode = true};
-
-    auto arch = this->arch_;
-
-    if (arch != tt::ARCH::GRAYSKULL) {
-        GTEST_SKIP();
-    }
-
-    int chip_id = 0;
-    CoreCoord worker_grid_size = this->devices_.at(0)->logical_grid_size();
-    CoreRange core_range({0, 0}, {worker_grid_size.x - 2, worker_grid_size.y - 2});
-
-    CoreRangeSet core_set({core_range});
-    test_config.cores = core_set;
-
-    test_config.num_tiles = 1;
-    test_config.sfpu_op = "relu";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "exponential";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "reciprocal";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "gelu";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "sqrt";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "sigmoid";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "log";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "tanh";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-}
-TEST_F(DeviceFixture, DISABLED_AllCoreMultiTileSfpuApproxCompute) {
-    unit_tests::compute::sfpu::SfpuConfig test_config = {
-        .tile_byte_size = 2 * 32 * 32,
-        .l1_input_data_format = tt::DataFormat::Float16_b,
-        .l1_output_data_format = tt::DataFormat::Float16_b,
-        .cores = CoreRangeSet(),
-        .approx_mode = true};
-
-    auto arch = this->arch_;
-
-    if (arch != tt::ARCH::GRAYSKULL) {
-        GTEST_SKIP();
-    }
-
-    int chip_id = 0;
-    CoreCoord worker_grid_size = this->devices_.at(0)->logical_grid_size();
-    CoreRange core_range({0, 0}, {worker_grid_size.x - 2, worker_grid_size.y - 2});
-
-    CoreRangeSet core_set({core_range});
-    test_config.cores = core_set;
-    test_config.num_tiles = 4;
-    test_config.sfpu_op = "relu";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "exponential";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "reciprocal";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "gelu";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "sqrt";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "sigmoid";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "log";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "tanh";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-}
+    ::testing::Combine(
+        ::testing::Values(
+            std::make_tuple(tt::DataFormat::Float16_b, tt::DataFormat::Float16_b),
+            std::make_tuple(tt::DataFormat::Float16_b, tt::DataFormat::Float32),
+            std::make_tuple(tt::DataFormat::Float32, tt::DataFormat::Float16_b),
+            std::make_tuple(tt::DataFormat::Bfp4_b, tt::DataFormat::Float16_b),
+            std::make_tuple(tt::DataFormat::Bfp8_b, tt::DataFormat::Float32)
+        ),
+        ::testing::Values(
+            "gelu",
+            "relu",
"sqrt", + "exponential", + "log", + "reciprocal", + "tanh", + "sigmoid" + ) + ));