diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.cpp
index 4afc02acaa8..5799a63ba9c 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.cpp
@@ -2,21 +2,13 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
-#include 
-
-
 #include "test_golden_impls.hpp"
-#include "common/test_tiles.hpp"
-#include "common/bfloat16.hpp"
-#include "tt_metal/host_api.hpp"
-#include "tt_metal/detail/tt_metal.hpp"
-#include "tests/tt_metal/test_utils/packing.hpp"
 
 using std::vector;
 
 namespace unit_tests::compute {
 
-std::vector<uint32_t> gold_standard_untilize(const std::vector<uint32_t> &src_vec, const GoldenConfig &config) {
+vector<uint32_t> gold_standard_untilize(const vector<uint32_t> &src_vec, const GoldenConfig &config) {
     vector<uint32_t> dst_vec;
 
     int num_rows = config.num_tiles_r_dim * config.face_r_dim * (config.num_faces > 2 ? 2 : 1);
@@ -74,7 +66,7 @@ std::vector<uint32_t> gold_standard_untilize(const std::vector<uint32_t> &src_ve
     return dst_vec;
 }
 
-std::vector<uint32_t> gold_standard_tilize(const std::vector<uint32_t> &src_vec, const GoldenConfig &config) {
+vector<uint32_t> gold_standard_tilize(const vector<uint32_t> &src_vec, const GoldenConfig &config) {
     vector<uint32_t> dst_vec;
 
     //TODO: RT update this one to use variable tile sizes
@@ -116,7 +108,7 @@ std::vector<uint32_t> gold_standard_tilize(const std::vector<uint32_t> &src_vec,
 // input shape.x is assumed to have the full number of elements in bfloat16
 // src_vec is expected to be untilized
 // result is also untilized
-std::vector<uint16_t> gold_transpose_wh(const std::vector<uint16_t> &src_vec, const std::vector<uint32_t> &shape) {
+vector<uint16_t> gold_transpose_wh(const vector<uint16_t> &src_vec, const vector<uint32_t> &shape) {
     vector<uint32_t> shapeT{shape[0], shape[1], shape[3], shape[2]};
     TensAddr addr(shape);
     TensAddr addrt(shapeT);
@@ -138,7 +130,7 @@ std::vector<uint16_t> gold_transpose_wh(const std::vector<uint16_t> &src_vec, co
 // input shape.x is assumed to have the full number of elements in bfloat16
 // src_vec is expected to be untilized
 // result is also untilized
-std::vector<uint16_t> gold_reduce_h(const std::vector<uint16_t> &src_vec, const std::vector<uint32_t> &shape, float scaler, uint8_t red_type, bool zeropad) {
+vector<uint16_t> gold_reduce_h(const vector<uint16_t> &src_vec, const vector<uint32_t> &shape, float scaler, uint8_t red_type, bool zeropad) {
     vector<uint32_t> shape_dst{shape[0], shape[1], 1, shape[3]};
     TT_FATAL(shape[2] > 0, "Error");
     if (zeropad)
@@ -167,7 +159,7 @@ std::vector<uint16_t> gold_reduce_h(const std::vector<uint16_t> &src_vec, const
     return reduced;
 };
 
-std::vector<uint16_t> gold_reduce_w(const vector<uint16_t> &src_vec, const std::vector<uint32_t> &shape, float scaler, uint8_t red_type, bool zeropad) {
+vector<uint16_t> gold_reduce_w(const vector<uint16_t> &src_vec, const vector<uint32_t> &shape, float scaler, uint8_t red_type, bool zeropad) {
     vector<uint32_t> shape_dst{shape[0], shape[1], shape[2], 1};
     if (zeropad)
         shape_dst[3] = 32;
@@ -194,7 +186,7 @@ std::vector<uint16_t> gold_reduce_w(const vector<uint16_t> &src_vec, const std::
     return reduced;
 }
 
-std::vector<uint16_t> gold_reduce_hw(const std::vector<uint16_t> &src_vec, const std::vector<uint32_t> &shape, float scaler, uint8_t red_type, bool zeropad) {
+vector<uint16_t> gold_reduce_hw(const vector<uint16_t> &src_vec, const vector<uint32_t> &shape, float scaler, uint8_t red_type, bool zeropad) {
     vector<uint32_t> shape_dst{shape[0], shape[1], 1, 1};
     if (zeropad) {
        shape_dst[2] = 32;
@@ -225,12 +217,12 @@ std::vector<uint16_t> gold_reduce_hw(const std::vector<uint16_t> &src_vec, const
 
     return reduced;
 }
 
-std::vector<uint32_t> gold_standard_tilize_w_elwadd(const std::vector<uint32_t> &src0_vec, const std::vector<uint32_t> &src1_vec, const GoldenConfig &config) {
+vector<uint32_t> gold_standard_tilize_w_elwadd(const vector<uint32_t> &src0_vec, const vector<uint32_t> &src1_vec, const GoldenConfig &config) {
 
-    std::vector<bfloat16> unpacked_tilize_src0_vec = tt::test_utils::unpack_vector<bfloat16, uint32_t>(gold_standard_tilize(src0_vec, config));
-    std::vector<bfloat16> unpacked_src1_vec = tt::test_utils::unpack_vector<bfloat16, uint32_t>(src1_vec);
+    vector<bfloat16> unpacked_tilize_src0_vec = tt::test_utils::unpack_vector<bfloat16, uint32_t>(gold_standard_tilize(src0_vec, config));
+    vector<bfloat16> unpacked_src1_vec = tt::test_utils::unpack_vector<bfloat16, uint32_t>(src1_vec);
 
-    std::vector<bfloat16> result_vec(unpacked_tilize_src0_vec.size());
+    vector<bfloat16> result_vec(unpacked_tilize_src0_vec.size());
 
     std::transform(
         unpacked_tilize_src0_vec.begin(),
@@ -244,5 +236,127 @@ std::vector<uint32_t> gold_standard_tilize_w_elwadd(const std::vector<uint32_t>
     return tt::test_utils::pack_vector<uint32_t, bfloat16>(result_vec);
 }
 
+// A pointer to the appropriate function for generating a random packed vector depending on the data format
+using RandomVectorGenerator = std::function<vector<uint32_t>(uint32_t num_bytes, bool is_exp_a, int max_float, int seed, float offset)>;
+// A pointer to the appropriate function for unpacking a vector in the given data format into a float vector
+using VectorUnpacker = std::function<vector<float>(const vector<uint32_t> &packed_input, bool row_major_output, bool is_exp_a)>;
+
+
+vector<uint32_t> generate_random_vector_generalized(
+    const float lower,
+    const float upper,
+    const size_t num_bytes,
+    const tt::DataFormat data_format,
+    const int seed,
+    bool exclude_zeroes,
+    float golden_neg_epsilon,
+    float golden_pos_epsilon) {
+
+    RandomVectorGenerator vector_generator;
+
+    // Select the appropriate vector generator based on the data format
+    switch (data_format) {
+        case tt::DataFormat::Float16_b:
+            vector_generator = [&](uint32_t num_bytes, bool is_exp_a, int max_float, int seed, float offset) {
+                return create_random_vector_of_bfloat16(num_bytes, max_float, seed, offset);
+            };
+            break;
+        case tt::DataFormat::Float32:
+            vector_generator = [&](uint32_t num_bytes, bool is_exp_a, int max_float, int seed, float offset) {
+                auto rand_float = std::bind(std::uniform_real_distribution<float>(0, max_float), std::mt19937(seed));
+                vector<uint32_t> vec(num_bytes / sizeof(uint32_t), 0);
+                for (int i = 0; i < vec.size(); i++) {
+                    float num_float = rand_float() + offset;
+                    std::memcpy(&vec[i], &num_float, sizeof(float));
+                }
+                return vec;
+            };
+            break;
+        case tt::DataFormat::Bfp8_b:
+            vector_generator = [&](uint32_t num_bytes, bool is_exp_a, int max_float, int seed, float offset) {
+                return create_random_vector_of_bfp8(num_bytes, is_exp_a, max_float, seed, offset);
+            };
+            break;
+        case tt::DataFormat::Bfp4_b:
+            vector_generator = [&](uint32_t num_bytes, bool is_exp_a, int max_float, int seed, float offset) {
+                return create_random_vector_of_bfp4(num_bytes, is_exp_a, max_float, seed, offset);
+            };
+            break;
+        default:
+            TT_THROW("Unsupported DataFormat!");
+            return {};
+    }
+
+    if (exclude_zeroes) {
+        if (lower < 0 && upper > 0) {
+            vector<uint32_t> vec;
+
+            // Split into negative and positive parts, avoiding zero
+            auto negative_part = vector_generator(
+                num_bytes / 2,
+                false,
+                std::abs(lower - golden_neg_epsilon),
+                seed,
+                lower);
+            auto positive_part = vector_generator(
+                num_bytes - num_bytes / 2,
+                false,
+                upper - golden_pos_epsilon,
+                seed + 1, // Use a different seed for the positive part
+                golden_pos_epsilon);
+
+            // Combine both parts
+            vec.insert(vec.end(), negative_part.begin(), negative_part.end());
+            vec.insert(vec.end(), positive_part.begin(), positive_part.end());
+            return vec;
+        } else {
+            TT_THROW("Cannot create a vector without zeroes with selected input value range!");
+        }
+    } else {
+        // Use the generic generator for the entire range
+        return vector_generator(num_bytes, false, upper - lower, seed, lower);
+    }
+}
+
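+// Illustrative usage only (not called by the tests): a sketch of requesting
+// one tile's worth of Bfp8_b data in (-5.0f, 5.0f) with zeroes excluded; the
+// range and seed are hypothetical example values.
+//
+//     vector<uint32_t> packed = generate_random_vector_generalized(
+//         -5.0f, 5.0f, tile_size(tt::DataFormat::Bfp8_b),
+//         tt::DataFormat::Bfp8_b, 0 /* seed */, true /* exclude_zeroes */,
+//         -0.0001f, 0.0001f);
+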
+vector<float> unpack_generalized(const tt::DataFormat data_format, const vector<uint32_t>& packed_input) {
+    VectorUnpacker unpacker_function;
+
+    // Select the appropriate unpacker function based on the data format
+    switch (data_format) {
+        case tt::DataFormat::Float16_b:
+            unpacker_function = [&](const vector<uint32_t> &packed_input, bool row_major_output, bool is_exp_a) {
+                vector<bfloat16> vec = unpack_uint32_vec_into_bfloat16_vec(packed_input);
+                vector<float> vec_float(vec.size());
+                for (int i = 0; i < vec.size(); i++) {
+                    vec_float[i] = vec[i].to_float();
+                }
+                return vec_float;
+            };
+            break;
+        case tt::DataFormat::Float32:
+            unpacker_function = [&](const vector<uint32_t> &packed_input, bool row_major_output, bool is_exp_a) {
+                vector<float> vec(packed_input.size(), 0);
+                for (int i = 0; i < packed_input.size(); i++) {
+                    std::memcpy(&vec[i], &packed_input[i], sizeof(uint32_t));
+                }
+                return vec;
+            };
+            break;
+        case tt::DataFormat::Bfp8_b:
+            unpacker_function = [&](const vector<uint32_t> &packed_input, bool row_major_output, bool is_exp_a) {
+                return unpack_bfp8_tiles_into_float_vec(packed_input, row_major_output, is_exp_a);
+            };
+            break;
+        case tt::DataFormat::Bfp4_b:
+            unpacker_function = [&](const vector<uint32_t> &packed_input, bool row_major_output, bool is_exp_a) {
+                return unpack_bfp4_tiles_into_float_vec(packed_input, row_major_output, is_exp_a);
+            };
+            break;
+        default:
+            TT_THROW("Unsupported DataFormat!");
+            return {};
+    }
+    return unpacker_function(packed_input, true, false);
+}
+
 } // unit_tests::compute
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.hpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.hpp
index 550ced2cf6b..f44c8cc405c 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.hpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.hpp
@@ -9,6 +9,15 @@
 #include 
 #include 
 #include 
+#include 
+
+#include "common/test_tiles.hpp"
+#include "tt_metal/common/bfloat16.hpp"
+#include "tt_metal/common/bfloat8.hpp"
+#include "tt_metal/common/bfloat4.hpp"
+#include "tt_metal/host_api.hpp"
+#include "tt_metal/detail/tt_metal.hpp"
+#include "tests/tt_metal/test_utils/packing.hpp"
 
 //TODO: RT these functions should be templated for different data formats
 namespace unit_tests::compute {
@@ -54,4 +63,31 @@ std::vector<uint16_t> gold_reduce_hw(const std::vector<uint16_t> &src_vec, const
 // Assumes all elements in bfloat16
 std::vector<uint32_t> gold_standard_tilize_w_elwadd(const std::vector<uint32_t> &src0_vec, const std::vector<uint32_t> &src1_vec, const GoldenConfig &config);
 
+// Random packed uint32_t vector generator which is data-format agnostic.
+// Takes the following parameters:
+//
+// lower - the lower limit of the input range
+// upper - the upper limit of the input range
+// num_bytes - number of bytes that the vector will occupy
+// data_format - data format of each element, packed into uint32_t; currently supports Float16_b, Float32, Bfp8_b and Bfp4_b
+// seed - randomization seed
+// exclude_zeroes - if true, excludes values around zero, within the limits given by the next two parameters
+// golden_neg_epsilon - small negative value; when zeroes are excluded, no element falls between it and zero
+// golden_pos_epsilon - small positive value; when zeroes are excluded, no element falls between zero and it
+//
+// Returns:
+//
+// a uint32_t vector of values packed according to the data format and the given limits
+std::vector<uint32_t> generate_random_vector_generalized(const float lower, const float upper, const size_t num_bytes, const tt::DataFormat data_format, const int seed, bool exclude_zeroes = false, float golden_neg_epsilon = -0.0001f, float golden_pos_epsilon = 0.0001f);
+
+// Unpacking function which is data-format agnostic.
+// Takes the following parameters:
+//
+// data_format - data format in which the vector was packed; currently supports Float16_b, Float32, Bfp8_b and Bfp4_b
+// packed_input - a uint32_t packed vector
+//
+// Returns:
+//
+// a float vector of unpacked values depending on the data format
+std::vector<float> unpack_generalized(const tt::DataFormat data_format, const std::vector<uint32_t>& packed_input);
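+
+// Illustrative round trip (hypothetical values, not part of the API itself):
+// generate one tile of packed Float16_b data, then unpack it back to floats
+// for golden comparison:
+//
+//     auto packed = generate_random_vector_generalized(
+//         -1.0f, 1.0f, tile_size(tt::DataFormat::Float16_b), tt::DataFormat::Float16_b, 0);
+//     auto unpacked = unpack_generalized(tt::DataFormat::Float16_b, packed);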
+
 } // unit_tests::compute
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp
index 35ffb316d01..419dc486fd9 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp
@@ -5,30 +5,37 @@
 #include 
 #include 
-#include 
-#include 
-#include 
-
 #include "device_fixture.hpp"
+#include "test_golden_impls.hpp"
 #include "tt_metal/detail/tt_metal.hpp"
-#include "tt_metal/host_api.hpp"
 #include "tt_metal/test_utils/comparison.hpp"
-#include "tt_metal/test_utils/df/df.hpp"
-#include "tt_metal/test_utils/print_helpers.hpp"
 #include "tt_metal/test_utils/stimulus.hpp"
 
+// Limits of the SFPU input value range. These values are chosen because they
+// cover the domains of all SFPU functions used in the test
+#define GOLDEN_BOT_LIMIT (-7.0f)
+#define GOLDEN_TOP_LIMIT (7.0f)
+// Small values around zero for domains which have to exclude zeroes, like
+// log or reciprocal
+#define GOLDEN_NEG_EPSILON (-0.0001f)
+#define GOLDEN_POS_EPSILON (0.0001f)
+// Min/max values of the randomly-generated input block height/width
+#define MIN_BLOCK_DIM (1)
+#define MAX_BLOCK_DIM (10)
+// Number of randomly generated dimensions; can be expanded up to 4,
+// making the input a full tensor rather than a matrix
+#define NUM_DIMS (2)
+
 using std::map;
 using std::vector;
 using namespace tt;
 using namespace tt::test_utils;
-using namespace tt::test_utils::df;
 using namespace tt::tt_metal;
 
 namespace unit_tests::sfpu_util {
-
-const map<string, std::map<string, string>> sfpu_op_to_op_name = {
-    // FIXME: #1157
+// Internal sfpu_op is mapped to the proper SFPU function calls
+const map<string, map<string, string>> sfpu_op_to_op_name = {
     {"relu", {{"SFPU_OP_CHAIN_0", "relu_tile_init(); relu_tile(0);"}}},
     {"exponential", {{"SFPU_OP_CHAIN_0", "exp_tile_init(); exp_tile(0);"}}},
     {"reciprocal", {{"SFPU_OP_CHAIN_0", "recip_tile_init(); recip_tile(0);"}}},
@@ -39,62 +46,65 @@ const map<string, map<string, string>> sfpu_op_to_op_name = {
     {"tanh", {{"SFPU_OP_CHAIN_0", "tanh_tile_init(); tanh_tile(0);"}}},
 };
 
-bfloat16 sfpu_function(const string& op_name, const bfloat16& input) {
+// Function that generates different input ranges depending on the SFPU op specified
+vector<uint32_t> generate_random_sfpu_vector(
+    const float lower,
+    const float upper,
+    const size_t num_bytes,
+    const string& op_name,
+    const tt::DataFormat data_format,
+    const int seed) {
+    if ((op_name == "sqrt") || (op_name == "log")) {
+        // sqrt and log take values in (0, upper]
+        return unit_tests::compute::generate_random_vector_generalized(GOLDEN_POS_EPSILON, upper, num_bytes, data_format, seed);
+    } else if (op_name == "reciprocal") {
+        // For reciprocal, exclude zeroes and use the (lower, upper) range
+        return unit_tests::compute::generate_random_vector_generalized(lower, upper, num_bytes, data_format, seed, true, GOLDEN_NEG_EPSILON, GOLDEN_POS_EPSILON);
+    } else {
+        // For all other operations, allow zeroes and use the (lower, upper) range
+        return unit_tests::compute::generate_random_vector_generalized(lower, upper, num_bytes, data_format, seed);
+    }
+}
+
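+// For example (hypothetical call, mirroring how run_sfpu_test uses this
+// helper below): op_name "log" with lower = GOLDEN_BOT_LIMIT and
+// upper = GOLDEN_TOP_LIMIT yields values in (GOLDEN_POS_EPSILON, GOLDEN_TOP_LIMIT],
+// keeping the generated inputs inside log's domain.
+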
+// Function that performs SFPU ops on float values.
+// It is used to generate the golden output
+float sfpu_function(const string& op_name, const float& input) {
     if (op_name == "relu") {
-        return bfloat16(fmaxf(input.to_float(), 0.0f));
+        return fmaxf(input, 0.0f);
     } else if (op_name == "exponential") {
-        return bfloat16(std::exp(input.to_float()));
+        return std::exp(input);
     } else if (op_name == "reciprocal") {
-        return bfloat16(1 / input.to_float());
+        return 1 / input;
     } else if (op_name == "gelu") {
         static constexpr float alpha = M_2_SQRTPI * M_SQRT1_2;
-        auto x = input.to_float();
-        auto x3 = x * x * x;
-        float result = x * 0.5 * (1.0 + tanhf(alpha * (x + 0.044715 * x3)));
-        return bfloat16(result);
+        auto x3 = input * input * input;
+        return input * 0.5 * (1.0 + tanhf(alpha * (input + 0.044715 * x3)));
     } else if (op_name == "sqrt") {
-        return bfloat16(sqrtf(input.to_float()));
+        return sqrtf(input);
     } else if (op_name == "sigmoid") {
-        auto x = input.to_float();
-        float result = 1 / (1 + std::exp(-x));
-        return bfloat16(result);
+        return 1 / (1 + std::exp(-input));
     } else if (op_name == "log") {
-        return bfloat16(logf(input.to_float()));
+        return input ? logf(input) : 0.0f;
     } else if (op_name == "tanh") {
-        return bfloat16(std::tanh(input.to_float()));
-    } else {
-        TT_THROW("Unsupported op_name in test");
-        return bfloat16(0.0f);
-    }
-}
-vector<uint32_t> generate_packed_sfpu_input(const unsigned int numel, const string& op_name, const int seed) {
-    if ((op_name == "sqrt") or (op_name == "log")) {
-        return generate_packed_uniform_random_vector<float, uint32_t>(0.0001f, 4.0f, numel, seed);
-    } else if ((op_name == "exponential") or (op_name == "gelu") or (op_name == "reciprocal")) {
-        auto possible_values = vector<float>({-1.0f, -0.5f, 0.5f, 1.0f});
-        return generate_packed_random_vector_from_vector<float, uint32_t>(possible_values, numel, seed);
+        return std::tanh(input);
     } else {
-        return generate_packed_uniform_random_vector<float, uint32_t>(-1.0f, 1.0f, numel, seed);
+        TT_THROW("Unsupported op_name!");
+        return 0.0f;
     }
 }
 
-bool is_close_packed_sfpu_output(const std::vector<uint32_t>& vec_a, const std::vector<uint32_t>& vec_b, const string& op_name) {
-    if (op_name == "tanh") {
-        return is_close_packed_vectors<bfloat16, uint32_t>(
-            vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.175f, 0.1f); });
-    } else if ((op_name == "gelu") or (op_name == "relu")) {
-        return is_close_packed_vectors<bfloat16, uint32_t>(
-            vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.15f); });
-    } else if ((op_name == "exponential")) {
-        return is_close_packed_vectors<bfloat16, uint32_t>(
-            vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.1f, 0.1f); });
-    } else if ((op_name == "log")) {
-        return is_close_packed_vectors<bfloat16, uint32_t>(
-            vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.03f, 0.02f); });
-    } else {
-        return is_close_packed_vectors<bfloat16, uint32_t>(
-            vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.06f, 0.006f); });
+// Function that compares the SFPU output against the golden. Different tolerances are needed for different ops
+bool is_close_packed_sfpu_output(const vector<float>& vec_a, const vector<float>& vec_b, const string& op_name) {
+    for (int i = 0; i < vec_a.size(); i++) {
+        bool close;
+        if (op_name == "tanh") {
+            close = is_close(vec_a[i], vec_b[i], 0.175f, 0.1f);
+        } else if ((op_name == "sqrt") or (op_name == "reciprocal") or (op_name == "exponential")) {
+            close = is_close(vec_a[i], vec_b[i], 0.06f, 0.002f);
+        } else {
+            close = is_close(vec_a[i], vec_b[i], 0.01f, 0.05f);
+        }
+        // Fail on the first out-of-tolerance element rather than returning
+        // after checking only the first one
+        if (!close) {
+            return false;
+        }
     }
+    return true;
 }
 
 }  // namespace unit_tests::sfpu_util
@@ -102,13 +112,14 @@ bool is_close_packed_sfpu_output(const std::vector<uint32_t>& vec_a, const std::
 namespace unit_tests::compute::sfpu {
 
 struct SfpuConfig {
-    size_t num_tiles = 0;
-    size_t tile_byte_size = 0;
+    size_t r_tile_dim = 0;
+    size_t c_tile_dim = 0;
     tt::DataFormat l1_input_data_format = tt::DataFormat::Invalid;
     tt::DataFormat l1_output_data_format = tt::DataFormat::Invalid;
     CoreRangeSet cores = CoreRangeSet();
     std::string sfpu_op = "";
     bool approx_mode = true;
+    bool fp32_dest_acc_en = true;
 };
 
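+// Example (illustrative values only): a single-tile Float16_b "gelu" run on
+// one core could be configured as
+//     SfpuConfig{.r_tile_dim = 1, .c_tile_dim = 1,
+//                .l1_input_data_format = tt::DataFormat::Float16_b,
+//                .l1_output_data_format = tt::DataFormat::Float16_b,
+//                .cores = core_range_set, .sfpu_op = "gelu",
+//                .approx_mode = false, .fp32_dest_acc_en = false};
+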
 /// @brief Does Dram --> Reader --> CB --> Sfpu Compute --> CB --> Writer --> Dram. So far, enqueue APIs only added to
 /// @param device
 /// @param test_config - Configuration of the test -- see struct
 /// @return
-bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_config) {
-    const size_t byte_size = test_config.num_tiles * test_config.tile_byte_size;
+bool run_sfpu_test(tt_metal::Device* device, const SfpuConfig& test_config) {
+    size_t num_tiles = test_config.r_tile_dim * test_config.c_tile_dim;
+    const size_t input_byte_size = num_tiles * tile_size(test_config.l1_input_data_format);
+    const size_t output_byte_size = num_tiles * tile_size(test_config.l1_output_data_format);
+
     tt_metal::Program program = tt_metal::CreateProgram();
-    tt::tt_metal::InterleavedBufferConfig dram_config{
-                            .device=device,
-                            .size = byte_size,
-                            .page_size = byte_size,
+
+    // Create input/output buffers
+    tt::tt_metal::InterleavedBufferConfig input_dram_config{
+        .device = device,
+        .size = input_byte_size,
+        .page_size = input_byte_size,
         .buffer_type = tt::tt_metal::BufferType::DRAM
-                };
+    };
 
-    auto input_dram_buffer = CreateBuffer(dram_config);
+    tt::tt_metal::InterleavedBufferConfig output_dram_config{
+        .device = device,
+        .size = output_byte_size,
+        .page_size = output_byte_size,
+        .buffer_type = tt::tt_metal::BufferType::DRAM
+    };
+
+    auto input_dram_buffer = CreateBuffer(input_dram_config);
     uint32_t input_dram_byte_address = input_dram_buffer->address();
     auto input_dram_noc_xy = input_dram_buffer->noc_coordinates();
 
-    auto output_dram_buffer = CreateBuffer(dram_config);
+    auto output_dram_buffer = CreateBuffer(output_dram_config);
     uint32_t output_dram_byte_address = output_dram_buffer->address();
     auto output_dram_noc_xy = output_dram_buffer->noc_coordinates();
 
     vector<uint32_t> compute_kernel_args = {
-        uint32_t(test_config.num_tiles), // per_core_block_cnt
-        1 // per_core_block_cnt
+        uint32_t(test_config.c_tile_dim), // per_core_block_cnt
+        uint32_t(test_config.r_tile_dim)  // per_core_block_dim
     };
 
-    // Input
-    std::vector<uint32_t> packed_input = sfpu_util::generate_packed_sfpu_input(
-        byte_size / bfloat16::SIZEOF, test_config.sfpu_op, std::chrono::system_clock::now().time_since_epoch().count());
-
-    // Golden output
-    auto input = unpack_vector<bfloat16, uint32_t>(packed_input);
-    std::vector<bfloat16> golden(input.size());
-    std::transform(input.begin(), input.end(), golden.begin(), [&](const bfloat16& val) {
+    // Create packed input
+    vector<uint32_t> packed_input = sfpu_util::generate_random_sfpu_vector(
+        GOLDEN_BOT_LIMIT,
+        GOLDEN_TOP_LIMIT,
+        input_byte_size,
+        test_config.sfpu_op,
+        test_config.l1_input_data_format,
+        std::chrono::system_clock::now().time_since_epoch().count()
+    );
+
+    // Unpack the input to prepare for the golden
+    vector<float> unpacked_input = unit_tests::compute::unpack_generalized(test_config.l1_input_data_format, packed_input);
+
+    // Golden output, a float vector
+    vector<float> golden(unpacked_input.size());
+    std::transform(unpacked_input.begin(), unpacked_input.end(), golden.begin(), [&](const float& val) {
         return sfpu_util::sfpu_function(test_config.sfpu_op, val);
     });
-    std::vector<uint32_t> packed_golden = pack_vector<uint32_t, bfloat16>(golden);
 
     // Same runtime args for every core
     vector<uint32_t> reader_rt_args = {
         (uint32_t)input_dram_byte_address,
         (uint32_t)input_dram_noc_xy.x,
         (uint32_t)input_dram_noc_xy.y,
-        (uint32_t)test_config.num_tiles,
+        (uint32_t)num_tiles,
     };
 
     vector<uint32_t> writer_rt_args = {
         (uint32_t)output_dram_byte_address,
         (uint32_t)output_dram_noc_xy.x,
         (uint32_t)output_dram_noc_xy.y,
-        (uint32_t)test_config.num_tiles,
+        (uint32_t)num_tiles,
     };
 
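+    // Note: in the per-core setup below, CB index 0 is the reader->compute
+    // circular buffer and CB index 16 the compute->writer one; each page is
+    // sized to one tile of the respective format, so num_tiles pages fit in
+    // each buffer.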
     for (const CoreRange& core_range : test_config.cores.ranges()) {
-        tt_metal::CircularBufferConfig l1_input_cb_config = tt_metal::CircularBufferConfig(byte_size, {{0, test_config.l1_input_data_format}})
-            .set_page_size(0, test_config.tile_byte_size);
+        tt_metal::CircularBufferConfig l1_input_cb_config = tt_metal::CircularBufferConfig(input_byte_size, {{0, test_config.l1_input_data_format}})
+            .set_page_size(0, tile_size(test_config.l1_input_data_format));
         auto l1_input_cb = tt_metal::CreateCircularBuffer(program, core_range, l1_input_cb_config);
 
-        tt_metal::CircularBufferConfig l1_output_cb_config = tt_metal::CircularBufferConfig(byte_size, {{16, test_config.l1_output_data_format}})
-            .set_page_size(16, test_config.tile_byte_size);
+        tt_metal::CircularBufferConfig l1_output_cb_config = tt_metal::CircularBufferConfig(output_byte_size, {{16, test_config.l1_output_data_format}})
+            .set_page_size(16, tile_size(test_config.l1_output_data_format));
         auto l1_output_cb = tt_metal::CreateCircularBuffer(program, core_range, l1_output_cb_config);
 
         auto reader_kernel = tt_metal::CreateKernel(
@@ -188,7 +218,7 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c
             tt_metal::DataMovementConfig{
                 .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
 
-        std::map<string, string> sfpu_defines = sfpu_util::sfpu_op_to_op_name.at(test_config.sfpu_op);
+        map<string, string> sfpu_defines = sfpu_util::sfpu_op_to_op_name.at(test_config.sfpu_op);
 
         sfpu_defines["SFPU_OP_EXP_INCLUDE"] = "1";
         sfpu_defines["SFPU_OP_GELU_INCLUDE"] = "1";
@@ -200,11 +230,16 @@
         sfpu_defines["SFPU_OP_RELU_FAMILY_INCLUDE"] = "1";
         sfpu_defines["SFPU_OP_COMPUTE_KERNEL_API_INCLUDE"]="1";
 
+        if (test_config.fp32_dest_acc_en) {
+            sfpu_defines["DEST_ACCUM_EN"] = "1";
+        }
+
         auto sfpu_kernel = tt_metal::CreateKernel(
             program,
             "tt_metal/kernels/compute/eltwise_sfpu.cpp",
             test_config.cores,
             tt_metal::ComputeConfig{
+                .fp32_dest_acc_en = test_config.fp32_dest_acc_en,
                 .math_approx_mode = test_config.approx_mode,
                 .compile_args = compute_kernel_args,
                 .defines = sfpu_defines});
@@ -216,258 +251,93 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c
         }
     }
 
-    std::vector<uint32_t> dest_buffer_data;
+    vector<uint32_t> packed_output;
     tt_metal::detail::WriteToBuffer(input_dram_buffer, packed_input);
     tt_metal::detail::LaunchProgram(device, program);
-    tt_metal::detail::ReadFromBuffer(output_dram_buffer, dest_buffer_data);
+    tt_metal::detail::ReadFromBuffer(output_dram_buffer, packed_output);
+
+    // Unpack the SFPU output to a float vector
+    vector<float> unpacked_output = unit_tests::compute::unpack_generalized(test_config.l1_output_data_format, packed_output);
 
-    return sfpu_util::is_close_packed_sfpu_output(dest_buffer_data, packed_golden, test_config.sfpu_op);
+    return sfpu_util::is_close_packed_sfpu_output(golden, unpacked_output, test_config.sfpu_op);
 }
 
 }  // namespace unit_tests::compute::sfpu
 
+
 class SingleCoreSingleDeviceSfpuParameterizedFixture : public DeviceFixture,
-                                                       public testing::WithParamInterface<std::tuple<size_t, std::string>> {
+                                                       public testing::WithParamInterface<std::tuple<std::tuple<tt::DataFormat, tt::DataFormat>, std::string>> {
 };
+
 TEST_P(SingleCoreSingleDeviceSfpuParameterizedFixture, SfpuCompute) {
-    size_t num_tiles = std::get<0>(GetParam());
+    // Generate a random width and height of the input block.
+    // Can be easily expanded to all dimensions of a tensor
+    vector<uint32_t> random_shape = generate_uniform_random_vector<uint32_t>(
+        MIN_BLOCK_DIM,
+        MAX_BLOCK_DIM,
+        NUM_DIMS,
+        std::chrono::system_clock::now().time_since_epoch().count() // Seed
+    );
+    size_t r_tile_dim = random_shape[0];
+    size_t c_tile_dim = random_shape[1];
+
+    // Extract the tuple of input/output formats and the sfpu_op
+    auto formats = std::get<0>(GetParam());
     string sfpu_op = std::get<1>(GetParam());
 
+    // Extract the input and output formats from the tuple
+    tt::DataFormat input_format = std::get<0>(formats);
+    tt::DataFormat output_format = std::get<1>(formats);
+
     CoreRange core_range({0, 0}, {0, 0});
     CoreRangeSet core_range_set({core_range});
-    unit_tests::compute::sfpu::SfpuConfig test_config = {
-        .num_tiles = num_tiles,
-        .tile_byte_size = 2 * 32 * 32,
-        .l1_input_data_format = tt::DataFormat::Float16_b,
-        .l1_output_data_format = tt::DataFormat::Float16_b,
-        .cores = core_range_set,
-        .sfpu_op = sfpu_op,
-        .approx_mode = false};
-    log_info("Testing SFPU_OP={} num_tiles={}", sfpu_op, num_tiles);
-    for (unsigned int id = 0; id < num_devices_; id++) {
-        EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(id), test_config));
+    for (bool approx_mode : {true, false}) {
+        for (bool fp32_dest_acc_en : {true, false}) {
+            // FP32 dest accumulation is not supported on Grayskull
+            if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
+            unit_tests::compute::sfpu::SfpuConfig test_config = {
+                .r_tile_dim = r_tile_dim,
+                .c_tile_dim = c_tile_dim,
+                .l1_input_data_format = input_format,
+                .l1_output_data_format = output_format,
+                .cores = core_range_set,
+                .sfpu_op = sfpu_op,
+                .approx_mode = approx_mode,
+                .fp32_dest_acc_en = fp32_dest_acc_en
+            };
+            log_info("SFPU_OP={}, r_tile_dim={}, c_tile_dim={}, approx_mode={}, fp32_dest_acc_en={}, input_format={}, output_format={}",
+                sfpu_op,
+                r_tile_dim,
+                c_tile_dim,
+                approx_mode,
+                fp32_dest_acc_en,
+                input_format,
+                output_format);
+            for (unsigned int id = 0; id < num_devices_; id++) {
+                EXPECT_TRUE(run_sfpu_test(devices_.at(id), test_config));
+            }
+        }
     }
 }
 
 INSTANTIATE_TEST_SUITE_P(
     SingleCoreSfpuCompute,
     SingleCoreSingleDeviceSfpuParameterizedFixture,
-    ::testing::Values(
-        std::make_tuple(1, "relu"),
-        std::make_tuple(1, "exponential"),
-        std::make_tuple(1, "reciprocal"),
-        std::make_tuple(1, "gelu"),
-        std::make_tuple(1, "sqrt"),
-        std::make_tuple(1, "sigmoid"),
-        std::make_tuple(1, "log"),
-        std::make_tuple(1, "tanh"),
-        std::make_tuple(4, "relu"),
-        std::make_tuple(4, "exponential"),
-        std::make_tuple(4, "reciprocal"),
-        std::make_tuple(4, "gelu"),
-        std::make_tuple(4, "sqrt"),
-        std::make_tuple(4, "sigmoid"),
-        std::make_tuple(4, "log"),
-        std::make_tuple(4, "tanh")));
-class SingleCoreSingleDeviceSfpuParameterizedApproxFixture
-    : public DeviceFixture,
-      public testing::WithParamInterface<std::tuple<size_t, std::string>> {};
-
-TEST_P(SingleCoreSingleDeviceSfpuParameterizedApproxFixture, SfpuCompute) {
-    size_t num_tiles = std::get<0>(GetParam());
-    string sfpu_op = std::get<1>(GetParam());
-
-    if (((arch_ == tt::ARCH::WORMHOLE_B0) and (sfpu_op == "relu")) or
-        ((arch_ == tt::ARCH::WORMHOLE_B0) and (sfpu_op == "exponential")) or
-        ((arch_ == tt::ARCH::WORMHOLE_B0) and (sfpu_op == "log"))) {
-        GTEST_SKIP();
-    } else {
-        CoreRange core_range({0, 0}, {0, 0});
-        CoreRangeSet core_range_set({core_range});
-        unit_tests::compute::sfpu::SfpuConfig test_config = {
-            .num_tiles = num_tiles,
-            .tile_byte_size = 2 * 32 * 32,
-            .l1_input_data_format = tt::DataFormat::Float16_b,
-            .l1_output_data_format = tt::DataFormat::Float16_b,
-            .cores = core_range_set,
-            .sfpu_op = sfpu_op,
-            .approx_mode = true};
-        log_info("Testing SFPU_OP={} num_tiles={}", sfpu_op, num_tiles);
-        for (unsigned int id = 0; id < num_devices_; id++) {
-            EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(id), test_config));
-        }
-    }
-}
-INSTANTIATE_TEST_SUITE_P(
-    SingleCoreSfpuCompute,
-    SingleCoreSingleDeviceSfpuParameterizedApproxFixture,
-    ::testing::Values(
-        std::make_tuple(1, "relu"),
-        std::make_tuple(1, "exponential"),
-        std::make_tuple(1, "reciprocal"),
-        std::make_tuple(1, "gelu"),
-        std::make_tuple(1, "sqrt"),
-        std::make_tuple(1, "sigmoid"),
-        std::make_tuple(1, "log"),
-        std::make_tuple(1, "tanh"),
-        std::make_tuple(4, "relu"),
-        std::make_tuple(4, "exponential"),
-        std::make_tuple(4, "reciprocal"),
-        std::make_tuple(4, "gelu"),
-        std::make_tuple(4, "sqrt"),
-        std::make_tuple(4, "sigmoid"),
-        std::make_tuple(4, "log"),
-        std::make_tuple(4, "tanh")));
-
-TEST_F(DeviceFixture, DISABLED_MultiContinguousCoreSingleTileSfpuApproxCompute) {
-    CoreRange core_range({0, 0}, {1, 0});
-    CoreRangeSet core_range_set({core_range});
-    unit_tests::compute::sfpu::SfpuConfig test_config = {
-        .tile_byte_size = 2 * 32 * 32,
-        .l1_input_data_format = tt::DataFormat::Float16_b,
-        .l1_output_data_format = tt::DataFormat::Float16_b,
-        .cores = core_range_set,
-        .approx_mode = true};
-
-    auto arch = this->arch_;
-
-    if (arch != tt::ARCH::GRAYSKULL) {
-        GTEST_SKIP();
-    }
-
-    CoreRangeSet core_set({core_range});
-    test_config.cores = core_set;
-
-    test_config.num_tiles = 1;
-    test_config.sfpu_op = "relu";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "exponential";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "reciprocal";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "gelu";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "sqrt";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "sigmoid";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "log";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "tanh";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-}
-
-TEST_F(DeviceFixture, DISABLED_MultiContinguousCoreMultiTileSfpuApproxCompute) {
-    CoreRange core_range({0, 0}, {1, 0});
-    CoreRangeSet core_range_set({core_range});
-    unit_tests::compute::sfpu::SfpuConfig test_config = {
-        .tile_byte_size = 2 * 32 * 32,
-        .l1_input_data_format = tt::DataFormat::Float16_b,
-        .l1_output_data_format = tt::DataFormat::Float16_b,
-        .cores = core_range_set,
-        .approx_mode = true};
-
-    auto arch = this->arch_;
-
-    if (arch != tt::ARCH::GRAYSKULL) {
-        GTEST_SKIP();
-    }
-
-    CoreRangeSet core_set({core_range});
-    test_config.cores = core_set;
-
-    test_config.num_tiles = 4;
-
-    test_config.sfpu_op = "relu";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "exponential";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "reciprocal";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "gelu";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "sqrt";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "sigmoid";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "log";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "tanh";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-}
-TEST_F(DeviceFixture, DISABLED_AllCoreSingleTileSfpuApproxCompute) {
-    unit_tests::compute::sfpu::SfpuConfig test_config = {
-        .tile_byte_size = 2 * 32 * 32,
-        .l1_input_data_format = tt::DataFormat::Float16_b,
-        .l1_output_data_format = tt::DataFormat::Float16_b,
-        .cores = CoreRangeSet(),
-        .approx_mode = true};
-
-    auto arch = this->arch_;
-
-    if (arch != tt::ARCH::GRAYSKULL) {
-        GTEST_SKIP();
-    }
-
-    int chip_id = 0;
-    CoreCoord worker_grid_size = this->devices_.at(0)->logical_grid_size();
-    CoreRange core_range({0, 0}, {worker_grid_size.x - 2, worker_grid_size.y - 2});
-
-    CoreRangeSet core_set({core_range});
-    test_config.cores = core_set;
-
-    test_config.num_tiles = 1;
-    test_config.sfpu_op = "relu";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "exponential";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "reciprocal";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "gelu";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "sqrt";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "sigmoid";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "log";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "tanh";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-}
-TEST_F(DeviceFixture, DISABLED_AllCoreMultiTileSfpuApproxCompute) {
-    unit_tests::compute::sfpu::SfpuConfig test_config = {
-        .tile_byte_size = 2 * 32 * 32,
-        .l1_input_data_format = tt::DataFormat::Float16_b,
-        .l1_output_data_format = tt::DataFormat::Float16_b,
-        .cores = CoreRangeSet(),
-        .approx_mode = true};
-
-    auto arch = this->arch_;
-
-    if (arch != tt::ARCH::GRAYSKULL) {
-        GTEST_SKIP();
-    }
-
-    int chip_id = 0;
-    CoreCoord worker_grid_size = this->devices_.at(0)->logical_grid_size();
-    CoreRange core_range({0, 0}, {worker_grid_size.x - 2, worker_grid_size.y - 2});
-
-    CoreRangeSet core_set({core_range});
-    test_config.cores = core_set;
-    test_config.num_tiles = 4;
-    test_config.sfpu_op = "relu";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "exponential";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "reciprocal";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "gelu";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "sqrt";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "sigmoid";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "log";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-    test_config.sfpu_op = "tanh";
-    EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config));
-}
+    ::testing::Combine(
+        ::testing::Values(
+            std::make_tuple(tt::DataFormat::Float16_b, tt::DataFormat::Float16_b),
+            std::make_tuple(tt::DataFormat::Float16_b, tt::DataFormat::Float32),
+            std::make_tuple(tt::DataFormat::Float32, tt::DataFormat::Float16_b),
+            std::make_tuple(tt::DataFormat::Bfp4_b, tt::DataFormat::Float16_b),
+            std::make_tuple(tt::DataFormat::Bfp8_b, tt::DataFormat::Float32)
+        ),
+        ::testing::Values(
+            "gelu",
+            "relu",
"sqrt", + "exponential", + "log", + "reciprocal", + "tanh", + "sigmoid" + ) + ));