From 948fafb081ace2fd966295548128616b409903cf Mon Sep 17 00:00:00 2001 From: Stanislav Minakov Date: Wed, 30 Oct 2024 16:09:41 -0700 Subject: [PATCH] #0: Faster builds by enabling Unity build for TTNN and tests (#14461) * #0: Unity builds * #0: Unity build for tests * #0: More unity builds * #0: Cleanup * #0: Rename ANON_NAMESPACE to CMAKE_UNIQUE_NAMESPACE * #0: Allow to disable unity builds * #0: Disable unity builds if export commands is on * #0: Raise min cmake version to 3.20 * #0: CMake fixes * #0: Review fixes - cmake cleanup * #0: Disable unity builds on older cmake * #0: Build script fixup * #0: Build fix --- CMakeLists.txt | 2 + build_metal.sh | 17 +++++- cmake/helper_functions.cmake | 1 + cmake/project_options.cmake | 12 ++++ cmake/unity.cmake | 12 ++++ tests/tt_metal/test_utils/df/bfloat16.hpp | 47 --------------- tests/tt_metal/test_utils/df/df.hpp | 1 - .../tt_metal/unit_tests/CMakeLists.txt | 1 + .../unit_tests/buffer/test_banked.cpp | 4 +- .../unit_tests/common/core_coord_fixture.hpp | 2 + .../unit_tests/compute/test_broadcast.cpp | 34 +++++------ .../unit_tests/compute/test_cumsum.cpp | 20 +++---- .../unit_tests/compute/test_sfpu_compute.cpp | 58 +++++++++---------- .../test_single_core_binary_compute.cpp | 38 ++++++------ .../test_single_core_matmul_compute.cpp | 48 +++++++-------- .../tt_metal/unit_tests/dram/direct.cpp | 8 +-- .../multichip/basic_eth_kernels.cpp | 12 ++++ .../multichip/buffer_movement_kernels.cpp | 4 +- .../multichip/erisc_app_direct_send.cpp | 17 ++++-- .../multichip/ring_gather_kernels.cpp | 4 +- .../tt_metal/unit_tests_common/CMakeLists.txt | 3 +- .../common/common_fixture.hpp | 2 + .../common/dprint_fixture.hpp | 2 + .../common/watcher_fixture.hpp | 2 + .../compute/matmul/matmul_utils.hpp | 3 + .../dprint/test_eth_cores.cpp | 8 ++- .../dprint/test_mute_device.cpp | 6 +- .../dprint/test_mute_print_server.cpp | 6 +- .../dprint/test_print_all_harts.cpp | 6 +- .../dprint/test_print_hanging.cpp | 6 +- .../dprint/test_print_tensix_dest.cpp | 2 +- .../dprint/test_raise_wait.cpp | 6 +- .../unit_tests_common/watcher/test_pause.cpp | 6 +- .../watcher/test_waypoint.cpp | 6 +- .../unit_tests_fast_dispatch/CMakeLists.txt | 1 + .../command_queue_test_utils.hpp | 2 + .../command_queue/test_EnqueueTrace.cpp | 6 -- .../common/command_queue_fixture.hpp | 2 + .../compute/sfpu/sfpu_compute.cpp | 54 ++++++++--------- .../multichip/test_eth_EnqueueProgram.cpp | 10 +++- .../test_eth_ring_gather_EnqueueProgram.cpp | 14 ++++- .../CMakeLists.txt | 1 + .../command_queue/test_EnqueueTrace.cpp | 6 -- .../common/command_queue_fixture.hpp | 2 + .../common/command_queue_test_utils.hpp | 2 +- tests/ttnn/unit_tests/gtests/CMakeLists.txt | 1 + tests/ttnn/unit_tests/gtests/test_add.cpp | 4 +- .../ttnn/unit_tests/gtests/test_graph_add.cpp | 4 +- .../unit_tests/gtests/test_multi_device.cpp | 4 +- .../ttnn_multi_command_queue_fixture.hpp | 2 + .../unit_tests/gtests/ttnn_test_fixtures.hpp | 5 +- ttnn/CMakeLists.txt | 1 + ttnn/cpp/pybind11/operations/creation.hpp | 2 +- .../device/bernoulli_device_operation.hpp | 2 + ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp | 4 +- .../device/reduce_scatter_op.cpp | 8 ++- .../conv2d_op_sharded_program_factory.cpp | 8 +++ ...onv2d_op_width_sharded_program_factory.cpp | 27 ++++----- ttnn/cpp/ttnn/operations/creation.hpp | 16 ++--- .../clone/device/clone_device_operation.hpp | 2 + .../concat/device/concat_program_factory.hpp | 2 + .../untilize/device/untilize_op.cpp | 8 +-- .../device/untilize_program_factory.cpp | 2 +- 
.../device/untilize_with_halo_v2_op.cpp | 2 +- ...t_and_width_multi_core_program_factory.cpp | 5 ++ ...cast_height_multi_core_program_factory.cpp | 5 ++ ...core_sharded_optimized_program_factory.cpp | 5 ++ ...dcast_width_multi_core_program_factory.cpp | 5 ++ .../device/unary_sharded_program_factory.cpp | 4 +- .../device/embedding_program_factory.hpp | 10 ++-- .../embedding_backward_program_factory.cpp | 2 +- .../ccl/all_reduce/device/all_reduce_op.cpp | 7 ++- .../rotary_embedding_program_factory.cpp | 4 +- ...otary_embedding_llama_device_operation.cpp | 3 +- ...rotary_embedding_llama_program_factory.cpp | 3 +- .../full/device/full_device_operation.hpp | 2 + .../device/full_like_device_operation.hpp | 2 + .../device/index_fill_device_operation.hpp | 3 + .../operations/matmul/device/matmul_op.hpp | 2 +- ttnn/cpp/ttnn/operations/matmul/matmul.cpp | 4 +- ttnn/cpp/ttnn/operations/matmul/matmul.hpp | 4 +- .../device/moreh_adam_device_operation.hpp | 2 + .../device/moreh_arange_device_operation.hpp | 2 + .../device/moreh_cumsum_device_operation.hpp | 2 + .../device/moreh_dot_device_operation.hpp | 2 + .../moreh_dot_backward_device_operation.hpp | 2 + .../device/moreh_getitem_device_operation.hpp | 2 + .../device/moreh_getitem_rm_factory.cpp | 6 ++ .../device/moreh_getitem_tilized_factory.cpp | 6 ++ .../moreh_group_norm_device_operation.hpp | 2 + ...kward_gamma_beta_grad_device_operation.hpp | 2 + ...m_backward_input_grad_device_operation.hpp | 2 + .../moreh_layer_norm_device_operation.hpp | 2 + ...kward_gamma_beta_grad_device_operation.hpp | 2 + ...m_backward_input_grad_device_operation.hpp | 2 + .../device/moreh_matmul_device_operation.hpp | 2 + .../device/moreh_mean_device_operation.hpp | 2 + .../moreh_mean_backward_device_operation.hpp | 2 + .../device/moreh_norm_device_operation.hpp | 2 + .../moreh_norm_backward_device_operation.hpp | 2 + .../device/moreh_sgd_device_operation.hpp | 2 + .../device/moreh_softmax_device_operation.hpp | 2 + ...oreh_softmax_backward_device_operation.hpp | 2 + .../device/moreh_sum_device_operation.hpp | 2 + .../moreh_sum_backward_device_operation.hpp | 2 + .../multi_core/groupnorm_op_multi_core.cpp | 9 ++- .../multi_core/layernorm_op_multi_core.cpp | 14 +++-- ...ayernorm_post_all_gather_op_multi_core.cpp | 5 ++ ...layernorm_pre_all_gather_op_multi_core.cpp | 5 ++ .../multi_core/softmax_op_multi_core.cpp | 6 +- .../halo/device/halo_device_operation.cpp | 2 +- .../device/uniform_device_operation.hpp | 2 + ttnn/cpp/ttnn/tensor/types.cpp | 2 +- ttnn/cpp/ttnn/types.hpp | 9 --- 114 files changed, 495 insertions(+), 295 deletions(-) create mode 100644 cmake/unity.cmake delete mode 100644 tests/tt_metal/test_utils/df/bfloat16.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index a0e9d25c10a..77be383575f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,6 +43,7 @@ endif() list(PREPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) include(project_options) +include(unity) set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -121,6 +122,7 @@ message(STATUS "Build Python bindings: ${WITH_PYTHON_BINDINGS}") message(STATUS "Build Programming Examples: ${BUILD_PROGRAMMING_EXAMPLES}") message(STATUS "Build TT METAL Tests: ${TT_METAL_BUILD_TESTS}") message(STATUS "Build TTNN Tests: ${TTNN_BUILD_TESTS}") +message(STATUS "Build with Unity builds: ${TT_UNITY_BUILDS}") ############################################################################################################################ if(ENABLE_BUILD_TIME_TRACE) diff --git a/build_metal.sh 
b/build_metal.sh index 2b09410eb20..a3653d8c1ed 100755 --- a/build_metal.sh +++ b/build_metal.sh @@ -26,6 +26,7 @@ show_help() { echo " --debug Set the build type as Debug." echo " --clean Remove build workspaces." echo " --build-static-libs Build tt_metal (not ttnn) as a static lib (BUILD_SHARED_LIBS=OFF)" + echo " --disable-unity-builds Disable Unity builds" } clean() { @@ -49,11 +50,12 @@ build_metal_tests="OFF" build_umd_tests="OFF" build_programming_examples="OFF" build_static_libs="OFF" +unity_builds="ON" declare -a cmake_args OPTIONS=h,e,c,t,a,m,s,u,b:,p -LONGOPTIONS=help,export-compile-commands,enable-ccache,enable-time-trace,enable-asan,enable-msan,enable-tsan,enable-ubsan,build-type:,enable-profiler,install-prefix:,build-tests,build-ttnn-tests,build-metal-tests,build-umd-tests,build-programming-examples,build-static-libs,release,development,debug,clean +LONGOPTIONS=help,export-compile-commands,enable-ccache,enable-time-trace,enable-asan,enable-msan,enable-tsan,enable-ubsan,build-type:,enable-profiler,install-prefix:,build-tests,build-ttnn-tests,build-metal-tests,build-umd-tests,build-programming-examples,build-static-libs,disable-unity-builds,release,development,debug,clean # Parse the options PARSED=$(getopt --options=$OPTIONS --longoptions=$LONGOPTIONS --name "$0" -- "$@") @@ -70,7 +72,7 @@ while true; do -h|--help) show_help;exit 0;; -e|--export-compile-commands) - export_compile_commands="ON";; + export_compile_commands="ON";unity_builds="OFF";; -c|--enable-ccache) enable_ccache="ON";; -t|--enable-time-trace) @@ -101,6 +103,8 @@ while true; do build_programming_examples="ON";; --build-static-libs) build_static_libs="ON";; + --disable-unity-builds) + unity_builds="OFF";; --release) build_type="Release";; --development) @@ -156,6 +160,7 @@ echo "INFO: Enable UndefinedBehaviorSanitizer: $enable_ubsan" echo "INFO: Build directory: $build_dir" echo "INFO: Install Prefix: $cmake_install_prefix" echo "INFO: Build tests: $build_tests" +echo "INFO: Enable Unity builds: $unity_builds" # Prepare cmake arguments cmake_args+=("-B" "$build_dir") @@ -194,6 +199,8 @@ fi if [ "$export_compile_commands" = "ON" ]; then cmake_args+=("-DCMAKE_EXPORT_COMPILE_COMMANDS=ON") +else + cmake_args+=("-DCMAKE_EXPORT_COMPILE_COMMANDS=OFF") fi if [ "$build_tests" = "ON" ]; then @@ -222,6 +229,12 @@ if [ "$build_static_libs" = "ON" ]; then cmake_args+=("-DBUILD_SHARED_LIBS=OFF") fi +if [ "$unity_builds" = "ON" ]; then + cmake_args+=("-DTT_UNITY_BUILDS=ON") +else + cmake_args+=("-DTT_UNITY_BUILDS=OFF") +fi + # Create and link the build directory mkdir -p $build_dir ln -nsf $build_dir build diff --git a/cmake/helper_functions.cmake b/cmake/helper_functions.cmake index 7214addd901..ac76caea8a7 100644 --- a/cmake/helper_functions.cmake +++ b/cmake/helper_functions.cmake @@ -12,6 +12,7 @@ function(CREATE_EAGER_TEST_EXE TESTLIST) set(TEST_TARGET ${TEST_NAME}) endif() add_executable(${TEST_TARGET} ${TEST_SRC_PATH}) + TT_ENABLE_UNITY_BUILD(${TEST_TARGET}) target_link_libraries( ${TEST_TARGET} diff --git a/cmake/project_options.cmake b/cmake/project_options.cmake index 90851878df4..63a5922f815 100644 --- a/cmake/project_options.cmake +++ b/cmake/project_options.cmake @@ -16,4 +16,16 @@ option(BUILD_PROGRAMMING_EXAMPLES "Enables build of tt_metal programming example option(TT_METAL_BUILD_TESTS "Enables build of tt_metal tests" OFF) option(TTNN_BUILD_TESTS "Enables build of ttnn tests" OFF) option(ENABLE_CCACHE "Build with compiler cache" FALSE) +option(TT_UNITY_BUILDS "Build with Unity builds" ON) 
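# (Editor's sketch, not part of the patch: how TT_UNITY_BUILDS takes effect.
# CMake's built-in unity-build machinery compiles a target's sources as
# batched "unity" translation units. UNITY_BUILD_UNIQUE_ID, added in CMake
# 3.20 (hence the version guard below), defines the named macro to a value
# that is unique per original source file, so file-local symbols can be kept
# from colliding once files are merged into one TU:
#
#   set_target_properties(my_target PROPERTIES   # my_target is a placeholder
#       UNITY_BUILD ON
#       UNITY_BUILD_UNIQUE_ID "CMAKE_UNIQUE_NAMESPACE")
#
# C++ sources then wrap file-local helpers as
#   namespace { namespace CMAKE_UNIQUE_NAMESPACE { /* helpers */ } }
# which is the pattern the test sources in this patch adopt. The real wiring
# is the TT_ENABLE_UNITY_BUILD helper in cmake/unity.cmake later in this
# patch; from the build script the switch surfaces as
# ./build_metal.sh --disable-unity-builds, or -DTT_UNITY_BUILDS=OFF when
# invoking CMake directly.)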
########################################################################################### + +if(TT_UNITY_BUILDS) + if(CMAKE_EXPORT_COMPILE_COMMANDS) + message(STATUS "Disabling Unity builds because CMAKE_EXPORT_COMPILE_COMMANDS is ON") + set(TT_UNITY_BUILDS OFF) + endif() + if(CMAKE_VERSION VERSION_LESS "3.20.0") + message(STATUS "CMake 3.20 or newer is required for Unity builds, disabling") + set(TT_UNITY_BUILDS OFF) + endif() +endif() diff --git a/cmake/unity.cmake b/cmake/unity.cmake new file mode 100644 index 00000000000..644113e26d7 --- /dev/null +++ b/cmake/unity.cmake @@ -0,0 +1,12 @@ +function(TT_ENABLE_UNITY_BUILD TARGET) + if(TT_UNITY_BUILDS) + set_target_properties( + ${TARGET} + PROPERTIES + UNITY_BUILD + ON + UNITY_BUILD_UNIQUE_ID + "CMAKE_UNIQUE_NAMESPACE" + ) + endif() +endfunction() diff --git a/tests/tt_metal/test_utils/df/bfloat16.hpp b/tests/tt_metal/test_utils/df/bfloat16.hpp deleted file mode 100644 index 339e43ce89b..00000000000 --- a/tests/tt_metal/test_utils/df/bfloat16.hpp +++ /dev/null @@ -1,47 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include - -namespace tt::test_utils::df { - -//! Custom type is supported as long as the custom type supports the following custom functions -//! static SIZEOF - indicates byte size of custom type -//! to_float() - get float value from custom type -//! to_packed() - get packed (into an integral type that is of the bitwidth specified by SIZEOF) -//! Constructor(float in) - constructor with a float as the initializer - -class bfloat16 { - private: - uint16_t uint16_data; - - public: - static constexpr size_t SIZEOF = 2; - - constexpr bfloat16() noexcept = default; - - // create from float: no rounding, just truncate - constexpr bfloat16(float float_num) noexcept : bfloat16((std::bit_cast(float_num)) >> 16) { - static_assert(sizeof(float) == sizeof(uint32_t), "Can only support 32bit fp"); - } - - // store lower 16 as 16-bit uint - constexpr bfloat16(uint32_t uint32_data) noexcept : uint16_data(static_cast(uint32_data)) {} - - constexpr float to_float() const noexcept { return std::bit_cast(static_cast(uint16_data) << 16); } - - constexpr uint16_t to_packed() const noexcept { return uint16_data; } - - constexpr bool operator==(const bfloat16& rhs) const noexcept = default; -}; - -inline std::ostream& operator<<(std::ostream& os, const bfloat16& val) { - os << val.to_packed(); - return os; -} - -} // namespace tt::test_utils::df diff --git a/tests/tt_metal/test_utils/df/df.hpp b/tests/tt_metal/test_utils/df/df.hpp index 89afc6bc818..883264f5935 100644 --- a/tests/tt_metal/test_utils/df/df.hpp +++ b/tests/tt_metal/test_utils/df/df.hpp @@ -3,5 +3,4 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "tt_metal/test_utils/df/bfloat16.hpp" #include "tt_metal/test_utils/df/float32.hpp" diff --git a/tests/tt_metal/tt_metal/unit_tests/CMakeLists.txt b/tests/tt_metal/tt_metal/unit_tests/CMakeLists.txt index bee628ad488..0e49d557eae 100644 --- a/tests/tt_metal/tt_metal/unit_tests/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/unit_tests/CMakeLists.txt @@ -48,6 +48,7 @@ add_executable( ${UNIT_TESTS_SRC} $ ) +TT_ENABLE_UNITY_BUILD(unit_tests) add_executable(unit_tests_galaxy ${CMAKE_CURRENT_SOURCE_DIR}/multichip/galaxy_cluster_api.cpp) target_link_libraries( diff --git a/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp b/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp index 62418427c21..9f1d68e7440 100644 --- 
a/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp @@ -227,8 +227,8 @@ bool reader_datacopy_writer(Device* device, const BankedConfig& cfg) { //////////////////////////////////////////////////////////////////////////// // Stimulus Generation //////////////////////////////////////////////////////////////////////////// - std::vector input_packed = tt::test_utils::generate_packed_uniform_random_vector( - -1.0f, 1.0f, cfg.size_bytes / tt::test_utils::df::bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); + std::vector input_packed = tt::test_utils::generate_packed_uniform_random_vector( + -1.0f, 1.0f, cfg.size_bytes / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); //////////////////////////////////////////////////////////////////////////// // Compile and Execute Application diff --git a/tests/tt_metal/tt_metal/unit_tests/common/core_coord_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests/common/core_coord_fixture.hpp index 90963f22e23..e483088a7f7 100644 --- a/tests/tt_metal/tt_metal/unit_tests/common/core_coord_fixture.hpp +++ b/tests/tt_metal/tt_metal/unit_tests/common/core_coord_fixture.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include "gtest/gtest.h" #include "tt_metal/host_api.hpp" #include "tt_metal/test_utils/env_vars.hpp" diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp index d12d89bd88f..43963dc422e 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp @@ -69,7 +69,7 @@ struct BroadcastConfig { MathFidelity math_fidelity = MathFidelity::HiFi4; }; -void mask_src_b_for_broadcast(std::vector& tile, const std::vector &shape, BroadcastDim dim) { +void mask_src_b_for_broadcast(std::vector& tile, const std::vector &shape, BroadcastDim dim) { int num_rows = shape.at(0); int num_cols = shape.at(1); @@ -83,14 +83,14 @@ } } -std::vector gold_broadcast(std::vector& src_a, std::vector& src_b, const std::vector &shape, EltwiseOp op, BroadcastDim dim, MathFidelity math_fidelity = MathFidelity::HiFi4) { +std::vector gold_broadcast(std::vector& src_a, std::vector& src_b, const std::vector &shape, EltwiseOp op, BroadcastDim dim, MathFidelity math_fidelity = MathFidelity::HiFi4) { int num_rows = shape.at(0); int num_cols = shape.at(1); uint16_t srca_fid_mask = 0xFFFF; uint16_t srcb_fid_mask = 0xFFFF; - std::vector golden(num_cols * num_rows); + std::vector golden(num_cols * num_rows); auto arch = get_arch_from_string(get_umd_arch_name()); switch (math_fidelity) { @@ -103,7 +103,7 @@ std::vector gold_broadcast(std::vector gold_broadcast(std::vector(src_a[i * num_cols + j].to_packed() & srca_fid_mask)).to_float() * - tt::test_utils::df::bfloat16(std::bit_cast(broadcast_value.to_packed() & srcb_fid_mask)).to_float(); + bfloat16(std::bit_cast(src_a[i * num_cols + j].to_packed() & srca_fid_mask)).to_float() * + bfloat16(std::bit_cast(broadcast_value.to_packed() & srcb_fid_mask)).to_float(); break; } default: { TT_THROW("Unsupported EltwiseOp={}", op); break; } } @@ -142,7 +142,7 @@ void run_single_core_broadcast(tt_metal::Device* device, const BroadcastConfig& constexpr uint32_t tile_width = 32; constexpr uint32_t tile_height = 32; - constexpr uint32_t single_tile_size = tile_width * tile_height *
tt::test_utils::df::bfloat16::SIZEOF; + constexpr uint32_t single_tile_size = tile_width * tile_height * bfloat16::SIZEOF; tt_metal::InterleavedBufferConfig dram_config{ .device=device, @@ -244,25 +244,25 @@ void run_single_core_broadcast(tt_metal::Device* device, const BroadcastConfig& (uint32_t)1, }); - std::vector input0 = generate_uniform_random_vector( + std::vector input0 = generate_uniform_random_vector( -1.0f, 1.0f, - single_tile_size / tt::test_utils::df::bfloat16::SIZEOF, + single_tile_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); - std::vector input1 = generate_uniform_random_vector( + std::vector input1 = generate_uniform_random_vector( -1.0f, 1.0f, - single_tile_size / tt::test_utils::df::bfloat16::SIZEOF, + single_tile_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); mask_src_b_for_broadcast(input1, {tile_width, tile_height}, test_config.broadcast_dim); - std::vector golden = gold_broadcast(input0, input1, {tile_width, tile_height}, test_config.eltwise_op, test_config.broadcast_dim, test_config.math_fidelity); + std::vector golden = gold_broadcast(input0, input1, {tile_width, tile_height}, test_config.eltwise_op, test_config.broadcast_dim, test_config.math_fidelity); - auto packed_input0 = pack_vector(input0); - auto packed_input1 = pack_vector(input1); - auto packed_golden = pack_vector(golden); + auto packed_input0 = pack_vector(input0); + auto packed_input1 = pack_vector(input1); + auto packed_golden = pack_vector(golden); unit_tests::compute::GoldenConfig config = { .num_tiles_r_dim = tile_width/32, .num_tiles_c_dim = tile_height/32 @@ -279,10 +279,10 @@ void run_single_core_broadcast(tt_metal::Device* device, const BroadcastConfig& tt_metal::detail::ReadFromBuffer(dst_dram_buffer, dest_buffer_data); auto dest_buffer_data_untilized = unit_tests::compute::gold_standard_untilize(dest_buffer_data, config); - bool result = is_close_packed_vectors( + bool result = is_close_packed_vectors( dest_buffer_data_untilized, packed_golden, - [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { + [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.0155); }); ASSERT_TRUE(result); diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_cumsum.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_cumsum.cpp index 7fdfa5d75a6..66119879e6c 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_cumsum.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_cumsum.cpp @@ -24,12 +24,12 @@ struct CumsumConfig { bool rowwise; }; -std::vector gold_cumsum(std::vector& src, const std::vector &shape, bool rowwise) { +std::vector gold_cumsum(std::vector& src, const std::vector &shape, bool rowwise) { int N = shape.at(0); int W = shape.at(1); int H = shape.at(2); - std::vector golden(N * W * H); + std::vector golden(N * W * H); int dim_a = rowwise ? H : W; int dim_b = rowwise ? 
W : H; @@ -57,7 +57,7 @@ void run_single_core_cumsum(tt_metal::Device* device, const CumsumConfig& test_c constexpr uint32_t tile_width = 32; constexpr uint32_t tile_height = 32; - constexpr uint32_t single_tile_size = tile_width * tile_height * tt::test_utils::df::bfloat16::SIZEOF; + constexpr uint32_t single_tile_size = tile_width * tile_height * bfloat16::SIZEOF; uint32_t W = test_config.Wt * tile_width; uint32_t H = test_config.Ht * tile_height; @@ -147,16 +147,16 @@ void run_single_core_cumsum(tt_metal::Device* device, const CumsumConfig& test_c (uint32_t)test_config.Ht * test_config.Wt // Used for transposing kernel }); - std::vector input = generate_uniform_random_vector( + std::vector input = generate_uniform_random_vector( -1.0f, 1.0f, - dram_buffer_size / tt::test_utils::df::bfloat16::SIZEOF, + dram_buffer_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); - std::vector golden = gold_cumsum(input, {test_config.N, W, H}, test_config.rowwise); - auto golden_packed = pack_vector(golden); + std::vector golden = gold_cumsum(input, {test_config.N, W, H}, test_config.rowwise); + auto golden_packed = pack_vector(golden); - auto input_packed = pack_vector(input); + auto input_packed = pack_vector(input); auto input_packed_tilized = unit_tests::compute::gold_standard_tilize(input_packed, {test_config.N * test_config.Ht, test_config.Wt}); tt_metal::detail::WriteToBuffer(src_dram_buffer, input_packed_tilized); @@ -169,10 +169,10 @@ void run_single_core_cumsum(tt_metal::Device* device, const CumsumConfig& test_c log_info(tt::LogTest, "Running test for N = {}, Wt = {}, Ht = {}", test_config.N, test_config.Wt, test_config.Ht); - bool result = is_close_packed_vectors( + bool result = is_close_packed_vectors( output_packed, golden_packed, - [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { + [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.01f); }); ASSERT_TRUE(result); diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp index 0a7822c6fbf..35ffb316d01 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp @@ -39,61 +39,61 @@ const map> sfpu_op_to_op_name = { {"tanh", {{"SFPU_OP_CHAIN_0", "tanh_tile_init(); tanh_tile(0);"}}}, }; -tt::test_utils::df::bfloat16 sfpu_function(const string& op_name, const tt::test_utils::df::bfloat16& input) { +bfloat16 sfpu_function(const string& op_name, const bfloat16& input) { if (op_name == "relu") { - return tt::test_utils::df::bfloat16(fmaxf(input.to_float(), 0.0f)); + return bfloat16(fmaxf(input.to_float(), 0.0f)); } else if (op_name == "exponential") { - return tt::test_utils::df::bfloat16(std::exp(input.to_float())); + return bfloat16(std::exp(input.to_float())); } else if (op_name == "reciprocal") { - return tt::test_utils::df::bfloat16(1 / input.to_float()); + return bfloat16(1 / input.to_float()); } else if (op_name == "gelu") { static constexpr float alpha = M_2_SQRTPI * M_SQRT1_2; auto x = input.to_float(); auto x3 = x * x * x; float result = x * 0.5 * (1.0 + tanhf(alpha * (x + 0.044715 * x3))); - return tt::test_utils::df::bfloat16(result); + return bfloat16(result); } else if (op_name == "sqrt") { - return tt::test_utils::df::bfloat16(sqrtf(input.to_float())); + return bfloat16(sqrtf(input.to_float())); } else if (op_name == "sigmoid") { auto x = input.to_float(); float result = 1 
/ (1 + std::exp(-x)); - return tt::test_utils::df::bfloat16(result); + return bfloat16(result); } else if (op_name == "log") { - return tt::test_utils::df::bfloat16(logf(input.to_float())); + return bfloat16(logf(input.to_float())); } else if (op_name == "tanh") { - return tt::test_utils::df::bfloat16(std::tanh(input.to_float())); + return bfloat16(std::tanh(input.to_float())); } else { TT_THROW("Unsupported op_name in test"); - return tt::test_utils::df::bfloat16(0.0f); + return bfloat16(0.0f); } } vector generate_packed_sfpu_input(const unsigned int numel, const string& op_name, const int seed) { if ((op_name == "sqrt") or (op_name == "log")) { - return generate_packed_uniform_random_vector(0.0001f, 4.0f, numel, seed); + return generate_packed_uniform_random_vector(0.0001f, 4.0f, numel, seed); } else if ((op_name == "exponential") or (op_name == "gelu") or (op_name == "reciprocal")) { - auto possible_values = vector({-1.0f, -0.5f, 0.5f, 1.0f}); - return generate_packed_random_vector_from_vector(possible_values, numel, seed); + auto possible_values = vector({-1.0f, -0.5f, 0.5f, 1.0f}); + return generate_packed_random_vector_from_vector(possible_values, numel, seed); } else { - return generate_packed_uniform_random_vector(-1.0f, 1.0f, numel, seed); + return generate_packed_uniform_random_vector(-1.0f, 1.0f, numel, seed); } } bool is_close_packed_sfpu_output(const std::vector& vec_a, const std::vector& vec_b, const string& op_name) { if (op_name == "tanh") { - return is_close_packed_vectors( - vec_a, vec_b, [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.175f, 0.1f); }); + return is_close_packed_vectors( + vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.175f, 0.1f); }); } else if ((op_name == "gelu") or (op_name == "relu")) { - return is_close_packed_vectors( - vec_a, vec_b, [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.15f); }); + return is_close_packed_vectors( + vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.15f); }); } else if ((op_name == "exponential")) { - return is_close_packed_vectors( - vec_a, vec_b, [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.1f, 0.1f); }); + return is_close_packed_vectors( + vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.1f, 0.1f); }); } else if ((op_name == "log")) { - return is_close_packed_vectors( - vec_a, vec_b, [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.03f, 0.02f); }); + return is_close_packed_vectors( + vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.03f, 0.02f); }); } else { - return is_close_packed_vectors( - vec_a, vec_b, [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.06f, 0.006f); }); + return is_close_packed_vectors( + vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.06f, 0.006f); }); } } @@ -140,15 +140,15 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c // Input std::vector packed_input = sfpu_util::generate_packed_sfpu_input( - byte_size / tt::test_utils::df::bfloat16::SIZEOF, test_config.sfpu_op, std::chrono::system_clock::now().time_since_epoch().count()); + byte_size / bfloat16::SIZEOF, test_config.sfpu_op, 
std::chrono::system_clock::now().time_since_epoch().count()); // Golden output - auto input = unpack_vector(packed_input); - std::vector golden(input.size()); - std::transform(input.begin(), input.end(), golden.begin(), [&](const tt::test_utils::df::bfloat16& val) { + auto input = unpack_vector(packed_input); + std::vector golden(input.size()); + std::transform(input.begin(), input.end(), golden.begin(), [&](const bfloat16& val) { return sfpu_util::sfpu_function(test_config.sfpu_op, val); }); - std::vector packed_golden = pack_vector(golden); + std::vector packed_golden = pack_vector(golden); // Same runtime args for every core vector reader_rt_args = { diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_binary_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_binary_compute.cpp index 19baa412647..3be28d9843e 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_binary_compute.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_binary_compute.cpp @@ -157,27 +157,27 @@ bool single_core_binary(tt_metal::Device* device, const SingleCoreBinaryConfig& //////////////////////////////////////////////////////////////////////////// // Stimulus Generation //////////////////////////////////////////////////////////////////////////// - std::vector packed_input0 = generate_packed_uniform_random_vector( + std::vector packed_input0 = generate_packed_uniform_random_vector( -1.0f, 1.0f, - byte_size / tt::test_utils::df::bfloat16::SIZEOF, + byte_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); - std::vector packed_input1 = generate_packed_uniform_random_vector( + std::vector packed_input1 = generate_packed_uniform_random_vector( -1.0f, 1.0f, - byte_size / tt::test_utils::df::bfloat16::SIZEOF, + byte_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); - std::vector packed_input2 = generate_packed_uniform_random_vector( + std::vector packed_input2 = generate_packed_uniform_random_vector( -1.0f, 1.0f, - byte_size / tt::test_utils::df::bfloat16::SIZEOF, + byte_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); //////////////////////////////////////////////////////////////////////////// // Golden Generation //////////////////////////////////////////////////////////////////////////// - auto input0 = unpack_vector(packed_input0); - auto input1 = unpack_vector(packed_input1); - auto input2 = unpack_vector(packed_input2); + auto input0 = unpack_vector(packed_input0); + auto input1 = unpack_vector(packed_input1); + auto input2 = unpack_vector(packed_input2); std::vector temp_golden(input0.size()); uint16_t srca_fid_mask = 0xFFFF; @@ -188,14 +188,14 @@ bool single_core_binary(tt_metal::Device* device, const SingleCoreBinaryConfig& input0.end(), input1.begin(), temp_golden.begin(), - [&](const tt::test_utils::df::bfloat16& lhs, const tt::test_utils::df::bfloat16& rhs) { + [&](const bfloat16& lhs, const bfloat16& rhs) { if (test_config.binary_op == "add") { return (lhs.to_float() + rhs.to_float()); } else if (test_config.binary_op == "sub") { return (lhs.to_float() - rhs.to_float()); } else if (test_config.binary_op == "mul") { - return ( tt::test_utils::df::bfloat16(std::bit_cast(lhs.to_packed() & srca_fid_mask)).to_float() * - tt::test_utils::df::bfloat16(std::bit_cast(rhs.to_packed() & srcb_fid_mask)).to_float()); + return ( bfloat16(std::bit_cast(lhs.to_packed() & srca_fid_mask)).to_float() * + 
bfloat16(std::bit_cast(rhs.to_packed() & srcb_fid_mask)).to_float()); } else if (test_config.binary_op.find("with_dest_reuse") != std::string::npos) { return lhs.to_float(); } else { @@ -204,26 +204,26 @@ bool single_core_binary(tt_metal::Device* device, const SingleCoreBinaryConfig& } }); - std::vector golden(input0.size()); + std::vector golden(input0.size()); std::transform( input2.begin(), input2.end(), temp_golden.begin(), golden.begin(), - [&](const tt::test_utils::df::bfloat16& lhs, const float& rhs) { + [&](const bfloat16& lhs, const float& rhs) { //acc_to_dest accumulates dest value with binary output, for all binary operations if (test_config.acc_to_dest || test_config.binary_op == "add_with_dest_reuse") { return (lhs.to_float() + rhs); } else if (test_config.binary_op == "sub_with_dest_reuse") { return (lhs.to_float() - rhs); } else if (test_config.binary_op == "mul_with_dest_reuse") { - return (tt::test_utils::df::bfloat16(std::bit_cast(lhs.to_packed() & srca_fid_mask)).to_float() * - tt::test_utils::df::bfloat16(std::bit_cast(tt::test_utils::df::bfloat16(rhs).to_packed() & srcb_fid_mask)).to_float()); + return (bfloat16(std::bit_cast(lhs.to_packed() & srca_fid_mask)).to_float() * + bfloat16(std::bit_cast(bfloat16(rhs).to_packed() & srcb_fid_mask)).to_float()); } else { return rhs; } }); - auto packed_golden = pack_vector(golden); + auto packed_golden = pack_vector(golden); //////////////////////////////////////////////////////////////////////////// // Compile and Execute Application @@ -268,10 +268,10 @@ bool single_core_binary(tt_metal::Device* device, const SingleCoreBinaryConfig& std::vector dest_buffer_data; tt_metal::detail::ReadFromBuffer(output_dram_buffer, dest_buffer_data); - pass &= is_close_packed_vectors( + pass &= is_close_packed_vectors( dest_buffer_data, packed_golden, - [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { + [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.0155f); }); return pass; diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_matmul_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_matmul_compute.cpp index f26e3c55812..140874255df 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_matmul_compute.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_matmul_compute.cpp @@ -204,12 +204,12 @@ bool single_tile_matmul(tt_metal::Device* device) { //////////////////////////////////////////////////////////////////////////// // Stimulus Generation //////////////////////////////////////////////////////////////////////////// - std::vector packed_input0 = generate_packed_uniform_random_vector( - 1.0f, 1.0f, byte_size / tt::test_utils::df::bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); - std::vector packed_input1 = generate_packed_uniform_random_vector( + std::vector packed_input0 = generate_packed_uniform_random_vector( + 1.0f, 1.0f, byte_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); + std::vector packed_input1 = generate_packed_uniform_random_vector( 1.0f / 32.0f, 1.0f / 32.0f, - byte_size / tt::test_utils::df::bfloat16::SIZEOF, + byte_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); // Setup the weights such that final result is the original input. 
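// (Editor's note, not part of the patch: with in0 filled with 1.0f and in1
// filled with 1/32 = 0.03125f, every element of the 32x32 single-tile matmul
// output accumulates 32 * (1.0 * 1/32) = 1.0, so the output reproduces the
// all-ones input exactly; this is why the comparison below can use the tight
// is_close tolerance of 0.015.)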
@@ -258,8 +258,8 @@ bool single_tile_matmul(tt_metal::Device* device) { //////////////////////////////////////////////////////////////////////////// std::vector dest_buffer_data; tt_metal::detail::ReadFromBuffer(output_dram_buffer, dest_buffer_data); - pass &= is_close_packed_vectors( - dest_buffer_data, packed_golden, [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.015f); }); + pass &= is_close_packed_vectors( + dest_buffer_data, packed_golden, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.015f); }); return pass; } // blocked matmul has blocking, but still fits within dst, so no spill/reloads or intermediates @@ -353,20 +353,20 @@ bool single_block_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint3 //////////////////////////////////////////////////////////////////////////// // Stimulus Generation //////////////////////////////////////////////////////////////////////////// - std::vector packed_input0 = generate_packed_uniform_random_vector( - 1.0f, 1.0f, in0_byte_size / tt::test_utils::df::bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); - std::vector packed_input1 = generate_packed_uniform_random_vector( + std::vector packed_input0 = generate_packed_uniform_random_vector( + 1.0f, 1.0f, in0_byte_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); + std::vector packed_input1 = generate_packed_uniform_random_vector( 0.03125f, 0.03125f, - in1_byte_size / tt::test_utils::df::bfloat16::SIZEOF, + in1_byte_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); //////////////////////////////////////////////////////////////////////////// // Golden Generation //////////////////////////////////////////////////////////////////////////// - auto packed_golden = generate_packed_uniform_random_vector( + auto packed_golden = generate_packed_uniform_random_vector( 1.0f * K, 1.0f * K, - (out_byte_size) / tt::test_utils::df::bfloat16::SIZEOF, + (out_byte_size) / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); //////////////////////////////////////////////////////////////////////////// @@ -413,14 +413,14 @@ bool single_block_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint3 std::vector dest_buffer_data; tt_metal::detail::ReadFromBuffer(output_dram_buffer, dest_buffer_data); int failed_index; - pass &= is_close_packed_vectors( + pass &= is_close_packed_vectors( dest_buffer_data, packed_golden, - [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.015f); }, + [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.015f); }, &failed_index); if (not pass) { log_info("Failed Index={}", failed_index); - print_vector_fixed_numel_per_row(unpack_vector(dest_buffer_data), 32); + print_vector_fixed_numel_per_row(unpack_vector(dest_buffer_data), 32); } return pass; } @@ -531,20 +531,20 @@ bool blocked_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint32_t N //////////////////////////////////////////////////////////////////////////// // Stimulus Generation //////////////////////////////////////////////////////////////////////////// - std::vector packed_input0 = generate_packed_uniform_random_vector( - 1.0f, 1.0f, in0_byte_size / tt::test_utils::df::bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); - std::vector packed_input1 = generate_packed_uniform_random_vector( + std::vector 
packed_input0 = generate_packed_uniform_random_vector( + 1.0f, 1.0f, in0_byte_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); + std::vector packed_input1 = generate_packed_uniform_random_vector( 0.03125f, 0.03125f, - in1_byte_size / tt::test_utils::df::bfloat16::SIZEOF, + in1_byte_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); //////////////////////////////////////////////////////////////////////////// // Golden Generation //////////////////////////////////////////////////////////////////////////// - auto packed_golden = generate_packed_uniform_random_vector( + auto packed_golden = generate_packed_uniform_random_vector( 1.0f * K, 1.0f * K, - (out_byte_size) / tt::test_utils::df::bfloat16::SIZEOF, + (out_byte_size) / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); //////////////////////////////////////////////////////////////////////////// @@ -591,14 +591,14 @@ bool blocked_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint32_t N std::vector dest_buffer_data; tt_metal::detail::ReadFromBuffer(output_dram_buffer, dest_buffer_data); int failed_index; - pass &= is_close_packed_vectors( + pass &= is_close_packed_vectors( dest_buffer_data, packed_golden, - [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.015f); }, + [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.015f); }, &failed_index); if (not pass) { log_info("Failed Index={}", failed_index); - print_vector_fixed_numel_per_row(unpack_vector(dest_buffer_data), 32); + print_vector_fixed_numel_per_row(unpack_vector(dest_buffer_data), 32); } return pass; } diff --git a/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp b/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp index 791f033d127..5c86e8feadf 100644 --- a/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp @@ -222,8 +222,8 @@ bool reader_writer(tt_metal::Device* device, const ReaderWriterConfig& test_conf //////////////////////////////////////////////////////////////////////////// // Stimulus Generation //////////////////////////////////////////////////////////////////////////// - std::vector inputs = generate_packed_uniform_random_vector( - -1.0f, 1.0f, byte_size / tt::test_utils::df::bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); + std::vector inputs = generate_packed_uniform_random_vector( + -1.0f, 1.0f, byte_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); //////////////////////////////////////////////////////////////////////////// // Compile and Execute Application //////////////////////////////////////////////////////////////////////////// @@ -333,8 +333,8 @@ bool reader_datacopy_writer(tt_metal::Device* device, const ReaderDatacopyWriter //////////////////////////////////////////////////////////////////////////// // Stimulus Generation //////////////////////////////////////////////////////////////////////////// - std::vector inputs = generate_packed_uniform_random_vector( - -1.0f, 1.0f, byte_size / tt::test_utils::df::bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); + std::vector inputs = generate_packed_uniform_random_vector( + -1.0f, 1.0f, byte_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); //////////////////////////////////////////////////////////////////////////// // Compile and 
Execute Application //////////////////////////////////////////////////////////////////////////// diff --git a/tests/tt_metal/tt_metal/unit_tests/multichip/basic_eth_kernels.cpp b/tests/tt_metal/tt_metal/unit_tests/multichip/basic_eth_kernels.cpp index 4dfe7eeba98..a24f3f64f66 100644 --- a/tests/tt_metal/tt_metal/unit_tests/multichip/basic_eth_kernels.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/multichip/basic_eth_kernels.cpp @@ -22,9 +22,13 @@ using namespace tt; using namespace tt::test_utils; using namespace tt::test_utils::df; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { constexpr std::int32_t WORD_SIZE = 16; // 16 bytes per eth send packet constexpr std::int32_t MAX_NUM_WORDS = (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE) / WORD_SIZE; +} +} namespace unit_tests::erisc::kernels { @@ -186,6 +190,7 @@ bool writer_kernel_no_receive( } TEST_F(N300DeviceFixture, EthKernelsNocReadNoSend) { + using namespace CMAKE_UNIQUE_NAMESPACE; GTEST_SKIP(); const auto& device_0 = devices_.at(0); const auto& device_1 = devices_.at(1); @@ -212,6 +217,7 @@ TEST_F(N300DeviceFixture, EthKernelsNocReadNoSend) { } TEST_F(N300DeviceFixture, EthKernelsNocWriteNoReceive) { + using namespace CMAKE_UNIQUE_NAMESPACE; GTEST_SKIP(); const auto& device_0 = devices_.at(0); const auto& device_1 = devices_.at(1); @@ -354,6 +360,7 @@ bool eth_direct_sender_receiver_kernels( } // namespace unit_tests::erisc::kernels TEST_F(N300DeviceFixture, EthKernelsDirectSendChip0ToChip1) { + using namespace CMAKE_UNIQUE_NAMESPACE; GTEST_SKIP(); const auto& device_0 = devices_.at(0); const auto& device_1 = devices_.at(1); @@ -402,6 +409,7 @@ TEST_F(N300DeviceFixture, EthKernelsDirectSendChip0ToChip1) { } TEST_F(N300DeviceFixture, EthKernelsDirectSendChip1ToChip0) { + using namespace CMAKE_UNIQUE_NAMESPACE; GTEST_SKIP(); const auto& device_0 = devices_.at(0); const auto& device_1 = devices_.at(1); @@ -450,6 +458,7 @@ TEST_F(N300DeviceFixture, EthKernelsDirectSendChip1ToChip0) { } TEST_F(DeviceFixture, EthKernelsDirectSendAllConnectedChips) { + using namespace CMAKE_UNIQUE_NAMESPACE; const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; for (const auto& sender_device : devices_) { @@ -500,6 +509,7 @@ TEST_F(DeviceFixture, EthKernelsDirectSendAllConnectedChips) { } TEST_F(N300DeviceFixture, EthKernelsBidirectionalDirectSend) { + using namespace CMAKE_UNIQUE_NAMESPACE; const auto& device_0 = devices_.at(0); const auto& device_1 = devices_.at(1); @@ -585,6 +595,7 @@ TEST_F(N300DeviceFixture, EthKernelsBidirectionalDirectSend) { } TEST_F(N300DeviceFixture, EthKernelsRepeatedDirectSends) { + using namespace CMAKE_UNIQUE_NAMESPACE; const auto& device_0 = devices_.at(0); const auto& device_1 = devices_.at(1); @@ -617,6 +628,7 @@ TEST_F(N300DeviceFixture, EthKernelsRepeatedDirectSends) { } TEST_F(N300DeviceFixture, EthKernelsRandomDirectSendTests) { + using namespace CMAKE_UNIQUE_NAMESPACE; srand(0); const auto& device_0 = devices_.at(0); const auto& device_1 = devices_.at(1); diff --git a/tests/tt_metal/tt_metal/unit_tests/multichip/buffer_movement_kernels.cpp b/tests/tt_metal/tt_metal/unit_tests/multichip/buffer_movement_kernels.cpp index 10da65cd31b..dd9c95aab8f 100644 --- a/tests/tt_metal/tt_metal/unit_tests/multichip/buffer_movement_kernels.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/multichip/buffer_movement_kernels.cpp @@ -187,10 +187,10 @@ bool 
chip_to_chip_interleaved_buffer_transfer( auto input_packed = generate_uniform_random_vector(0, 100, cfg.size_bytes / sizeof(uint32_t)); /*std::vector input_packed = - tt::test_utils::generate_packed_uniform_random_vector( + tt::test_utils::generate_packed_uniform_random_vector( -1.0f, 1.0f, - cfg.size_bytes / tt::test_utils::df::bfloat16::SIZEOF, + cfg.size_bytes / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count());*/ tt::tt_metal::InterleavedBufferConfig sender_config{ diff --git a/tests/tt_metal/tt_metal/unit_tests/multichip/erisc_app_direct_send.cpp b/tests/tt_metal/tt_metal/unit_tests/multichip/erisc_app_direct_send.cpp index de55e3f17f5..6de3291f65c 100644 --- a/tests/tt_metal/tt_metal/unit_tests/multichip/erisc_app_direct_send.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/multichip/erisc_app_direct_send.cpp @@ -17,13 +17,11 @@ #include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/test_utils/stimulus.hpp" +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { constexpr std::int32_t WORD_SIZE = 16; // 16 bytes per eth send packet constexpr std::int32_t MAX_NUM_WORDS = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_SIZE / WORD_SIZE; -using namespace tt; -using namespace tt::test_utils; -using namespace tt::test_utils::df; - struct erisc_info_t { volatile uint32_t num_bytes; volatile uint32_t mode; @@ -34,6 +32,13 @@ struct erisc_info_t { volatile uint32_t reserverd_3_; volatile uint32_t reserverd_4_; }; +} +} + +using namespace tt; +using namespace tt::test_utils; +using namespace tt::test_utils::df; + namespace unit_tests::erisc::direct_send { // Tests ethernet direct send/receive from ERISC_L1_UNRESERVED_BASE bool send_over_eth( @@ -136,6 +141,7 @@ bool send_over_eth( } // namespace unit_tests::erisc::direct_send TEST_F(N300DeviceFixture, SingleEthCoreDirectSendChip0ToChip1) { + using namespace CMAKE_UNIQUE_NAMESPACE; GTEST_SKIP(); ASSERT_TRUE(this->num_devices_ == 2); const auto& device_0 = devices_.at(0); @@ -165,6 +171,7 @@ TEST_F(N300DeviceFixture, SingleEthCoreDirectSendChip0ToChip1) { } TEST_F(N300DeviceFixture, SingleEthCoreDirectSendChip1ToChip0) { + using namespace CMAKE_UNIQUE_NAMESPACE; GTEST_SKIP(); ASSERT_TRUE(this->num_devices_ == 2); const auto& device_0 = devices_.at(0); @@ -194,6 +201,7 @@ TEST_F(N300DeviceFixture, SingleEthCoreDirectSendChip1ToChip0) { } TEST_F(N300DeviceFixture, BidirectionalEthCoreDirectSend) { + using namespace CMAKE_UNIQUE_NAMESPACE; GTEST_SKIP(); ASSERT_TRUE(this->num_devices_ == 2); const auto& device_0 = devices_.at(0); @@ -239,6 +247,7 @@ TEST_F(N300DeviceFixture, BidirectionalEthCoreDirectSend) { } TEST_F(N300DeviceFixture, RandomDirectSendTests) { + using namespace CMAKE_UNIQUE_NAMESPACE; GTEST_SKIP(); srand(0); ASSERT_TRUE(this->num_devices_ == 2); diff --git a/tests/tt_metal/tt_metal/unit_tests/multichip/ring_gather_kernels.cpp b/tests/tt_metal/tt_metal/unit_tests/multichip/ring_gather_kernels.cpp index 6cad6f5d625..56ed607e79d 100644 --- a/tests/tt_metal/tt_metal/unit_tests/multichip/ring_gather_kernels.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/multichip/ring_gather_kernels.cpp @@ -337,8 +337,8 @@ bool eth_interleaved_ring_gather_sender_receiver_kernels( for (uint32_t i = 0; i < sender_receivers.size(); ++i) { inputs.emplace_back( - tt::test_utils::generate_packed_uniform_random_vector( - -1.0f, 1.0f, cfg.size_bytes / tt::test_utils::df::bfloat16::SIZEOF, i)); + tt::test_utils::generate_packed_uniform_random_vector( + -1.0f, 1.0f, cfg.size_bytes / bfloat16::SIZEOF, i)); 
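// (Editor's note, not part of the patch: each device in the ring generates
// its own packed random bfloat16 buffer, seeded with its ring index i, and
// full_input below splices the per-device buffers together in ring order;
// it appears to serve as the golden reference for the gathered result.)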
full_input.insert(full_input.begin() + i * numel, inputs[i].begin(), inputs[i].end()); const auto& device = std::get<0>(sender_receivers[i]); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/CMakeLists.txt b/tests/tt_metal/tt_metal/unit_tests_common/CMakeLists.txt index ed66195b343..47f4f154fc7 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/unit_tests_common/CMakeLists.txt @@ -29,7 +29,8 @@ set(UNIT_TESTS_COMMON_SRC ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_waypoint.cpp ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_link_training.cpp ) -add_library(unit_tests_common_o OBJECT ${UNIT_TESTS_COMMON_SRC}) +add_library(unit_tests_common_o STATIC ${UNIT_TESTS_COMMON_SRC}) +TT_ENABLE_UNITY_BUILD(unit_tests_common_o) target_link_libraries( unit_tests_common_o PUBLIC diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp index a2adab128e5..1b1b4d6104f 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include "gtest/gtest.h" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/dprint_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_common/common/dprint_fixture.hpp index f65199b756f..829a9feb140 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/common/dprint_fixture.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/common/dprint_fixture.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include "common_fixture.hpp" #include "impl/debug/dprint_server.hpp" #include "tt_metal/common/core_descriptor.hpp" diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/watcher_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_common/common/watcher_fixture.hpp index ba7bfa88920..9d74f94942d 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/common/watcher_fixture.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/common/watcher_fixture.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include #include #include "common_fixture.hpp" diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp index 62165c4b1cb..4eecec45c61 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp @@ -1,6 +1,9 @@ // SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 + +#pragma once + #include #include #include diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_eth_cores.cpp b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_eth_cores.cpp index 850cb14a0ae..38ece0f5ca0 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_eth_cores.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_eth_cores.cpp @@ -14,6 +14,8 @@ using namespace tt; using namespace tt::tt_metal; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { const std::string golden_output = R"(Test Debug Print: ERISC Basic Types: @@ -76,6 +78,8 @@ static void RunTest(DPrintFixture* fixture, Device* device, bool active) { tt::DPrintServerClearLogFile(); } } +} +} TEST_F(DPrintFixture, TestPrintEthCores) { for (Device* device : this->devices_) { @@ -86,7 +90,7 @@ TEST_F(DPrintFixture, TestPrintEthCores) { } this->RunTestOnDevice( [](DPrintFixture *fixture, Device *device){ - RunTest(fixture, device, true); + CMAKE_UNIQUE_NAMESPACE::RunTest(fixture, device, true); }, device ); @@ -105,7 +109,7 @@ TEST_F(DPrintFixture, TestPrintIEthCores) { } this->RunTestOnDevice( [](DPrintFixture *fixture, Device *device){ - RunTest(fixture, device, false); + CMAKE_UNIQUE_NAMESPACE::RunTest(fixture, device, false); }, device ); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_device.cpp b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_device.cpp index 391cfa9f1e2..8440e242ac6 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_device.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_device.cpp @@ -14,6 +14,8 @@ using namespace tt; using namespace tt::tt_metal; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { const std::string golden_output = R"(Test Debug Print: Data0 Basic Types: @@ -72,9 +74,11 @@ static void RunTest(DPrintFixture* fixture, Device* device) { EXPECT_TRUE(OpenFile(file_name, log_file, std::fstream::in)); EXPECT_TRUE(log_file.peek() == std::ifstream::traits_type::eof()); } +} +} TEST_F(DPrintFixtureDisableDevices, TestPrintMuteDevice) { for (Device* device : this->devices_) { - this->RunTestOnDevice(RunTest, device); + this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, device); } } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_print_server.cpp b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_print_server.cpp index 4b76841b498..3798288e27c 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_print_server.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_print_server.cpp @@ -14,6 +14,8 @@ using namespace tt; using namespace tt::tt_metal; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { const std::string golden_output = R"(Printing int from arg: 0 Printing int from arg: 2)"; @@ -61,9 +63,11 @@ static void RunTest(DPrintFixture* fixture, Device* device) { ) ); } +} +} TEST_F(DPrintFixture, TestPrintMuting) { for (Device* device : this->devices_) { - this->RunTestOnDevice(RunTest, device); + this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, device); } } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_all_harts.cpp b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_all_harts.cpp index 26a9489ee83..42d7382b5bb 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_all_harts.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_all_harts.cpp @@ -15,6 +15,8 @@ using namespace tt; using namespace 
tt::tt_metal; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { const std::string golden_output = R"(Test Debug Print: Data0 Basic Types: @@ -187,9 +189,11 @@ static void RunTest(DPrintFixture* fixture, Device* device) { ) ); } +} +} TEST_F(DPrintFixture, TestPrintFromAllHarts) { for (Device* device : this->devices_) { - this->RunTestOnDevice(RunTest, device); + this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, device); } } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_hanging.cpp b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_hanging.cpp index 7014aaf40f0..a707ffff86c 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_hanging.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_hanging.cpp @@ -15,6 +15,8 @@ using namespace tt; using namespace tt::tt_metal; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { // Some machines will run this test on different physical cores, so wildcard the exact coordinates. const std::string golden_output = R"(DPRINT server timed out on Device *, worker core (x=*,y=*), riscv 4, waiting on a RAISE signal: 1 @@ -51,6 +53,8 @@ try { ) ); } +} +} TEST_F(DPrintFixture, TestPrintHanging) { // Skip this test for slow dipatch for now. Due to how llrt currently sits below device, it's @@ -60,5 +64,5 @@ TEST_F(DPrintFixture, TestPrintHanging) { GTEST_SKIP(); // Since the dprint server gets killed from a timeout, only run on one device. - this->RunTestOnDevice(RunTest, this->devices_[0]); + this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, this->devices_[0]); } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_tensix_dest.cpp b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_tensix_dest.cpp index 1767d904469..1f73a7bc736 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_tensix_dest.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_tensix_dest.cpp @@ -145,7 +145,7 @@ static KernelHandle prepare_compute(tt_metal::Program& program, const DestPrintT // Generates input data based on the test configuration static std::vector generate_inputs(const DestPrintTestConfig& config) { if (config.data_format == tt::DataFormat::Float16_b) - return tt::test_utils::generate_packed_increment_vector( + return tt::test_utils::generate_packed_increment_vector( 0.0f, config.get_num_elements(), 0.03125f, -1.1875f); if (config.data_format == tt::DataFormat::Float32) diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_raise_wait.cpp b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_raise_wait.cpp index 4830e39a678..0786c960813 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_raise_wait.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_raise_wait.cpp @@ -14,6 +14,8 @@ using namespace tt; using namespace tt::tt_metal; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { const std::string golden_output = R"(TestConstCharStrNC{0,0} 2 @@ -274,9 +276,11 @@ static void RunTest(DPrintFixture* fixture, Device* device) { ) ); } +} +} TEST_F(DPrintFixture, TestPrintRaiseWait) { for (Device* device : this->devices_) { - this->RunTestOnDevice(RunTest, device); + this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, device); } } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_pause.cpp b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_pause.cpp index 1a21a43a187..f358a30ebad 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_pause.cpp +++ 
b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_pause.cpp @@ -12,6 +12,8 @@ using std::vector; using namespace tt; using namespace tt::tt_metal; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { static void RunTest(WatcherFixture* fixture, Device* device) { // Set up program Program program = Program(); @@ -129,9 +131,11 @@ static void RunTest(WatcherFixture* fixture, Device* device) { // See #10527 // EXPECT_TRUE(FileContainsAllStrings(fixture->log_file_name, expected_strings)); } +} +} TEST_F(WatcherFixture, TestWatcherPause) { for (Device* device : this->devices_) { - this->RunTestOnDevice(RunTest, device); + this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, device); } } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_waypoint.cpp b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_waypoint.cpp index ffc9fb62e57..8da13273c27 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_waypoint.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_waypoint.cpp @@ -14,6 +14,8 @@ using std::vector; using namespace tt; using namespace tt::tt_metal; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { static void RunTest(WatcherFixture* fixture, Device* device) { // Set up program Program program = Program(); @@ -194,9 +196,11 @@ static void RunTest(WatcherFixture* fixture, Device* device) { } } } +} +} TEST_F(WatcherFixture, TestWatcherWaypoints) { for (Device* device : this->devices_) { - this->RunTestOnDevice(RunTest, device); + this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, device); } } diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/CMakeLists.txt b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/CMakeLists.txt index 4e32bc33a45..5f9247bba85 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/CMakeLists.txt @@ -18,6 +18,7 @@ add_executable( ${UNIT_TESTS_FD_SRC} $ ) +TT_ENABLE_UNITY_BUILD(unit_tests_fast_dispatch) target_link_libraries(unit_tests_fast_dispatch PUBLIC test_metal_common_libs) target_include_directories( diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp index 2d3c5a4b6c7..c66a0c33d47 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include "tt_metal/host_api.hpp" #include "tt_metal/common/bfloat16.hpp" #include "tt_metal/impl/buffers/buffer.hpp" diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp index 755f5892db0..b6dbd81212c 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp @@ -22,12 +22,6 @@ using std::vector; using namespace tt; using namespace tt::tt_metal; -struct TestBufferConfig { - uint32_t num_pages; - uint32_t page_size; - BufferType buftype; -}; - Program create_simple_unary_program(Buffer& input, Buffer& output) { Program program = CreateProgram(); Device* device = input.device(); diff --git 
a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp index 5c4aabbb0cd..2bad2d7ba2b 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include #include #include diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp index 83746fe8a54..06cd4a16177 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp @@ -41,58 +41,58 @@ const map<string, std::map<string, string>> sfpu_op_to_op_name = { {"tanh", {{"SFPU_OP_CHAIN_0", "tanh_tile_init(); tanh_tile(0);"}}}, }; -tt::test_utils::df::bfloat16 sfpu_function(const string& op_name, const tt::test_utils::df::bfloat16& input) { +bfloat16 sfpu_function(const string& op_name, const bfloat16& input) { if (op_name == "relu") { - return tt::test_utils::df::bfloat16(fmaxf(input.to_float(), 0.0f)); + return bfloat16(fmaxf(input.to_float(), 0.0f)); } else if (op_name == "exponential") { - return tt::test_utils::df::bfloat16(std::exp(input.to_float())); + return bfloat16(std::exp(input.to_float())); } else if (op_name == "reciprocal") { - return tt::test_utils::df::bfloat16(1 / input.to_float()); + return bfloat16(1 / input.to_float()); } else if (op_name == "gelu") { static constexpr float alpha = M_2_SQRTPI * M_SQRT1_2; auto x = input.to_float(); auto x3 = x * x * x; float result = x * 0.5 * (1.0 + tanhf(alpha * (x + 0.044715 * x3))); - return tt::test_utils::df::bfloat16(result); + return bfloat16(result); } else if (op_name == "sqrt") { - return tt::test_utils::df::bfloat16(sqrtf(input.to_float())); + return bfloat16(sqrtf(input.to_float())); } else if (op_name == "sigmoid") { auto x = input.to_float(); float result = 1 / (1 + std::exp(-x)); - return tt::test_utils::df::bfloat16(result); + return bfloat16(result); } else if (op_name == "log") { - return tt::test_utils::df::bfloat16(logf(input.to_float())); + return bfloat16(logf(input.to_float())); } else if (op_name == "tanh") { - return tt::test_utils::df::bfloat16(std::tanh(input.to_float())); + return bfloat16(std::tanh(input.to_float())); } else { TT_THROW("Unsupported op_name in test"); - return tt::test_utils::df::bfloat16(0.0f); + return bfloat16(0.0f); } } vector<uint32_t> generate_packed_sfpu_input(const unsigned int numel, const string& op_name, const int seed) { if ((op_name == "sqrt") or (op_name == "log")) { - return generate_packed_uniform_random_vector<tt::test_utils::df::bfloat16, uint32_t>(0.0001f, 4.0f, numel, seed); + return generate_packed_uniform_random_vector<bfloat16, uint32_t>(0.0001f, 4.0f, numel, seed); } else if ((op_name == "exponential") or (op_name == "gelu") or (op_name == "reciprocal")) { - auto possible_values = vector<tt::test_utils::df::bfloat16>({-1.0f, -0.5f, 0.5f, 1.0f}); - return generate_packed_random_vector_from_vector<tt::test_utils::df::bfloat16, uint32_t>(possible_values, numel, seed); + auto possible_values = vector<bfloat16>({-1.0f, -0.5f, 0.5f, 1.0f}); + return generate_packed_random_vector_from_vector<bfloat16, uint32_t>(possible_values, numel, seed); } else { - return generate_packed_uniform_random_vector<tt::test_utils::df::bfloat16, uint32_t>(-1.0f, 1.0f, numel, seed); + return generate_packed_uniform_random_vector<bfloat16, uint32_t>(-1.0f, 1.0f, numel, seed); } } bool is_close_packed_sfpu_output(const std::vector<uint32_t>& vec_a, const std::vector<uint32_t>& vec_b, const string& op_name) { if (op_name == "tanh") { - return is_close_packed_vectors<tt::test_utils::df::bfloat16, uint32_t>( - vec_a, vec_b, [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.175f, 0.1f); }); + return is_close_packed_vectors<bfloat16, uint32_t>( + vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.175f, 0.1f); }); } else if ((op_name == "gelu") or (op_name == "relu")) { - return is_close_packed_vectors<tt::test_utils::df::bfloat16, uint32_t>( - vec_a, vec_b, [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.15f); }); + return is_close_packed_vectors<bfloat16, uint32_t>( + vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.15f); }); } else if ((op_name == "exponential")) { - return is_close_packed_vectors<tt::test_utils::df::bfloat16, uint32_t>( - vec_a, vec_b, [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.1f, 0.1f); }); + return is_close_packed_vectors<bfloat16, uint32_t>( + vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.1f, 0.1f); }); } else { - return is_close_packed_vectors<tt::test_utils::df::bfloat16, uint32_t>( - vec_a, vec_b, [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.06f, 0.006f); }); + return is_close_packed_vectors<bfloat16, uint32_t>( + vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.06f, 0.006f); }); } } @@ -138,15 +138,15 @@ bool run_sfpu_all_same_buffer(CommandQueue & cq, const SfpuConfig& test_config) // Input std::vector<uint32_t> packed_input = sfpu_util::generate_packed_sfpu_input( - byte_size / tt::test_utils::df::bfloat16::SIZEOF, test_config.sfpu_op, std::chrono::system_clock::now().time_since_epoch().count()); + byte_size / bfloat16::SIZEOF, test_config.sfpu_op, std::chrono::system_clock::now().time_since_epoch().count()); // Golden output - auto input = unpack_vector<tt::test_utils::df::bfloat16, uint32_t>(packed_input); - std::vector<tt::test_utils::df::bfloat16> golden(input.size()); - std::transform(input.begin(), input.end(), golden.begin(), [&](const tt::test_utils::df::bfloat16& val) { + auto input = unpack_vector<bfloat16, uint32_t>(packed_input); + std::vector<bfloat16> golden(input.size()); + std::transform(input.begin(), input.end(), golden.begin(), [&](const bfloat16& val) { return sfpu_util::sfpu_function(test_config.sfpu_op, val); }); - std::vector<uint32_t> packed_golden = pack_vector<uint32_t, tt::test_utils::df::bfloat16>(golden); + std::vector<uint32_t> packed_golden = pack_vector<uint32_t, bfloat16>(golden); // Same runtime args for every core vector<uint32_t> reader_rt_args = { diff --git 
CoreCoord& eth_receiver_core, - const BankedConfig& cfg, + const CMAKE_UNIQUE_NAMESPACE::BankedConfig& cfg, const uint32_t& max_transfer_size) { bool pass = true; @@ -560,6 +564,7 @@ TEST_F(CommandQueueSingleCardFixture, EnqueueDummyProgramOnEthCore) { } TEST_F(CommandQueueSingleCardFixture, EthKernelsNocReadNoSend) { + using namespace CMAKE_UNIQUE_NAMESPACE; const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; for (const auto& device : devices_) { @@ -575,6 +580,7 @@ TEST_F(CommandQueueSingleCardFixture, EthKernelsNocReadNoSend) { } TEST_F(CommandQueueSingleCardFixture, EthKernelsNocWriteNoReceive) { + using namespace CMAKE_UNIQUE_NAMESPACE; const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; for (const auto& device : devices_) { @@ -590,6 +596,7 @@ TEST_F(CommandQueueSingleCardFixture, EthKernelsNocWriteNoReceive) { } TEST_F(CommandQueueMultiDeviceFixture, EthKernelsDirectSendAllConnectedChips) { + using namespace CMAKE_UNIQUE_NAMESPACE; const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; for (const auto& sender_device : devices_) { @@ -672,6 +679,7 @@ TEST_F(CommandQueueMultiDeviceFixture, EthKernelsSendDramBufferAllConnectedChips } TEST_F(CommandQueueMultiDeviceFixture, EthKernelsSendInterleavedBufferAllConnectedChips) { + using namespace CMAKE_UNIQUE_NAMESPACE; for (const auto& sender_device : devices_) { for (const auto& receiver_device : devices_) { if (sender_device->id() >= receiver_device->id()) { diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp index 53d9f0d5707..df20eec8c8d 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp @@ -24,6 +24,8 @@ using namespace tt; using namespace tt::test_utils; using namespace tt::test_utils::df; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { constexpr std::int32_t WORD_SIZE = 16; // 16 bytes per eth send packet constexpr std::int32_t MAX_NUM_WORDS = (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE) / WORD_SIZE; @@ -160,6 +162,8 @@ std::vector device_ring, - const BankedConfig& cfg, + const CMAKE_UNIQUE_NAMESPACE::BankedConfig& cfg, const size_t& src_eth_l1_byte_address, const size_t& dst_eth_l1_byte_address, const size_t& sem_l1_byte_address, uint32_t num_bytes_per_send = 16) { + using namespace CMAKE_UNIQUE_NAMESPACE; bool pass = true; const auto& sender_receivers = get_sender_receiver_cores(device_ring); @@ -335,8 +341,8 @@ bool eth_interleaved_ring_gather_sender_receiver_kernels( for (uint32_t i = 0; i < sender_receivers.size(); ++i) { inputs.emplace_back( - tt::test_utils::generate_packed_uniform_random_vector( - -1.0f, 1.0f, cfg.size_bytes / tt::test_utils::df::bfloat16::SIZEOF, i)); + tt::test_utils::generate_packed_uniform_random_vector( + -1.0f, 1.0f, cfg.size_bytes / bfloat16::SIZEOF, i)); full_input.insert(full_input.begin() + i * numel, inputs[i].begin(), inputs[i].end()); const auto& device = std::get<0>(sender_receivers[i]); @@ -455,6 +461,7 @@ bool eth_interleaved_ring_gather_sender_receiver_kernels( } // namespace fd_unit_tests::erisc::kernels 
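The recurring wrapper namespace { namespace CMAKE_UNIQUE_NAMESPACE { ... } } seen throughout these test hunks is the core pattern of this patch: a unity build concatenates several .cpp files into one translation unit, so file-local helpers that share a name across files (RunTest, golden_output, BankedConfig) would otherwise collide. Below is a minimal sketch of the mechanism, assuming the build defines CMAKE_UNIQUE_NAMESPACE to a distinct token per source file (CMake 3.20+ can do this via the UNITY_BUILD_UNIQUE_ID target property, which would also explain the raised minimum CMake version; the actual cmake/unity.cmake wiring is not visible in this excerpt, so that is an assumption).

// one_test.cpp -- may be merged with other_test.cpp into a single unity TU
namespace {                           // internal linkage, as before
namespace CMAKE_UNIQUE_NAMESPACE {    // assumed to expand to a per-file unique name
static void RunTest() {}              // other_test.cpp may define its own RunTest
}  // namespace CMAKE_UNIQUE_NAMESPACE
}  // namespace

void Caller() {
    CMAKE_UNIQUE_NAMESPACE::RunTest();  // unambiguously this file's helper
}

Within a single translation unit an anonymous namespace alone no longer isolates merged files from one another; the macro-named inner namespace restores per-file uniqueness, which is why the call sites above are qualified with CMAKE_UNIQUE_NAMESPACE:: or pull the names in with a local using-directive.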
TEST_F(CommandQueueMultiDeviceFixture, EthKernelsDirectRingGatherAllChips) { + using namespace CMAKE_UNIQUE_NAMESPACE; if (num_devices_ < 4) { GTEST_SKIP(); } @@ -470,6 +477,7 @@ TEST_F(CommandQueueMultiDeviceFixture, EthKernelsDirectRingGatherAllChips) { } TEST_F(CommandQueueMultiDeviceFixture, EthKernelsInterleavedRingGatherAllChips) { + using namespace CMAKE_UNIQUE_NAMESPACE; if (num_devices_ < 4) { GTEST_SKIP(); } diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/CMakeLists.txt b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/CMakeLists.txt index 702eb9535f0..9cef200d2fa 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/CMakeLists.txt @@ -6,6 +6,7 @@ set(UNIT_TESTS_FD_SINGLEC_MULTIQ_SRCS ) add_executable(unit_tests_fast_dispatch_single_chip_multi_queue ${UNIT_TESTS_FD_SINGLEC_MULTIQ_SRCS}) +TT_ENABLE_UNITY_BUILD(unit_tests_fast_dispatch_single_chip_multi_queue) target_link_libraries(unit_tests_fast_dispatch_single_chip_multi_queue PUBLIC test_metal_common_libs) target_include_directories( diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp index 1a933c8a2f9..0f9c35adb96 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp @@ -11,12 +11,6 @@ using std::vector; using namespace tt::tt_metal; -struct TestBufferConfig { - uint32_t num_pages; - uint32_t page_size; - BufferType buftype; -}; - Program create_simple_unary_program(const Buffer& input, const Buffer& output) { Program program = CreateProgram(); diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_fixture.hpp index ea9365e4f6a..b3efb0e4f16 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_fixture.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_fixture.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include "gtest/gtest.h" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_test_utils.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_test_utils.hpp index a42dd078797..e1e02ae6e16 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_test_utils.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_test_utils.hpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -// #pragma once +#pragma once #include "tt_metal/host_api.hpp" #include "tt_metal/common/bfloat16.hpp" diff --git a/tests/ttnn/unit_tests/gtests/CMakeLists.txt b/tests/ttnn/unit_tests/gtests/CMakeLists.txt index 783c89d06cd..9d46d75f129 100644 --- a/tests/ttnn/unit_tests/gtests/CMakeLists.txt +++ b/tests/ttnn/unit_tests/gtests/CMakeLists.txt @@ 
-11,6 +11,7 @@ set(TTNN_CCL_UNIT_TESTS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/ccl/test_erisc_data_move set(TTNN_TENSOR_UNIT_TESTS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/tensor/test_create_tensor.cpp) add_executable(unit_tests_ttnn ${TTNN_UNIT_TESTS_SRC}) +TT_ENABLE_UNITY_BUILD(unit_tests_ttnn) add_executable(unit_tests_ttnn_ccl ${TTNN_CCL_UNIT_TESTS_SRC}) add_executable(unit_tests_ttnn_tensor ${TTNN_TENSOR_UNIT_TESTS_SRC}) add_executable(test_multi_device ${CMAKE_CURRENT_SOURCE_DIR}/test_multi_device.cpp) diff --git a/tests/ttnn/unit_tests/gtests/test_add.cpp b/tests/ttnn/unit_tests/gtests/test_add.cpp index 0f4c2ef5d3a..c1be54118a6 100644 --- a/tests/ttnn/unit_tests/gtests/test_add.cpp +++ b/tests/ttnn/unit_tests/gtests/test_add.cpp @@ -31,10 +31,10 @@ TEST_P(Add1DTensorAndScalarFixture, AddsScalarCorrectly) { ttnn::Shape shape(dimensions); { - const auto input_tensor = ttnn::zeros(shape, ttnn::bfloat16, ttnn::TILE_LAYOUT, device); + const auto input_tensor = ttnn::zeros(shape, DataType::BFLOAT16, ttnn::TILE_LAYOUT, device); const auto output_tensor = input_tensor + param.scalar; const auto expected_tensor = - ttnn::operations::creation::full(shape, param.scalar, ttnn::bfloat16, ttnn::TILE_LAYOUT, device); + ttnn::operations::creation::full(shape, param.scalar, DataType::BFLOAT16, ttnn::TILE_LAYOUT, device); TT_FATAL(ttnn::numpy::allclose<::bfloat16>(ttnn::from_device(expected_tensor), ttnn::from_device(output_tensor)), "Error"); } ttnn::close_device(device); diff --git a/tests/ttnn/unit_tests/gtests/test_graph_add.cpp b/tests/ttnn/unit_tests/gtests/test_graph_add.cpp index 8e019a5e95d..5903126db3a 100644 --- a/tests/ttnn/unit_tests/gtests/test_graph_add.cpp +++ b/tests/ttnn/unit_tests/gtests/test_graph_add.cpp @@ -47,8 +47,8 @@ TEST_P(AddOpGraphTestFixture, AddGraphTrace) { auto run_mode = std::get<1>(param_combination); { - const auto input_tensor_a = ttnn::zeros(params.a_Shape, ttnn::bfloat16, ttnn::TILE_LAYOUT, this->getDevice(), params.memory_config); - const auto input_tensor_b = ttnn::zeros(params.b_Shape, ttnn::bfloat16, ttnn::TILE_LAYOUT, this->getDevice(), params.memory_config); + const auto input_tensor_a = ttnn::zeros(params.a_Shape, DataType::BFLOAT16, ttnn::TILE_LAYOUT, this->getDevice(), params.memory_config); + const auto input_tensor_b = ttnn::zeros(params.b_Shape, DataType::BFLOAT16, ttnn::TILE_LAYOUT, this->getDevice(), params.memory_config); auto call = [&] { const auto output_tensor = ttnn::add(input_tensor_a, input_tensor_b); diff --git a/tests/ttnn/unit_tests/gtests/test_multi_device.cpp b/tests/ttnn/unit_tests/gtests/test_multi_device.cpp index 6d6863305f6..7404a7528bb 100644 --- a/tests/ttnn/unit_tests/gtests/test_multi_device.cpp +++ b/tests/ttnn/unit_tests/gtests/test_multi_device.cpp @@ -29,7 +29,7 @@ Tensor create_host_multi_device_tensor(const Tensor& tensor, const ReplicateTens TEST_F(T3kMultiDeviceFixture, TestGetTensorsFromMultiDeviceStorage) { MeshDevice* mesh_device = this->mesh_device_.get(); - const auto input_tensor = ttnn::ones(ttnn::Shape(std::array{32, 32}), ttnn::bfloat16); + const auto input_tensor = ttnn::ones(ttnn::Shape(std::array{32, 32}), DataType::BFLOAT16); const auto replicated_tensor = create_host_multi_device_tensor(input_tensor, ReplicateTensor(8)); const auto device_tensors = get_tensors_from_multi_device_storage(replicated_tensor); @@ -38,7 +38,7 @@ TEST_F(T3kMultiDeviceFixture, TestGetTensorsFromMultiDeviceStorage) { TEST_F(T3kMultiDeviceFixture, TestGetDistributedTensorConfigFromMultiDeviceStorage) { MeshDevice* mesh_device = 
this->mesh_device_.get(); - const auto input_tensor = ttnn::ones(ttnn::Shape(std::array{32, 32}), ttnn::bfloat16); + const auto input_tensor = ttnn::ones(ttnn::Shape(std::array{32, 32}), DataType::BFLOAT16); const auto replicated_tensor = create_host_multi_device_tensor(input_tensor, ReplicateTensor(8)); const auto distributed_tensor_config = get_distributed_tensor_config_from_tensor(replicated_tensor); diff --git a/tests/ttnn/unit_tests/gtests/ttnn_multi_command_queue_fixture.hpp b/tests/ttnn/unit_tests/gtests/ttnn_multi_command_queue_fixture.hpp index 463cd8d1972..debbce04437 100644 --- a/tests/ttnn/unit_tests/gtests/ttnn_multi_command_queue_fixture.hpp +++ b/tests/ttnn/unit_tests/gtests/ttnn_multi_command_queue_fixture.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include "gtest/gtest.h" #include "tt_metal/host_api.hpp" #include "tt_metal/test_utils/env_vars.hpp" diff --git a/tests/ttnn/unit_tests/gtests/ttnn_test_fixtures.hpp b/tests/ttnn/unit_tests/gtests/ttnn_test_fixtures.hpp index 6e517ccdaab..4b4bd0eecdd 100644 --- a/tests/ttnn/unit_tests/gtests/ttnn_test_fixtures.hpp +++ b/tests/ttnn/unit_tests/gtests/ttnn_test_fixtures.hpp @@ -1,9 +1,10 @@ -#include - // SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. // // SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include #include #include #include diff --git a/ttnn/CMakeLists.txt b/ttnn/CMakeLists.txt index c4ab3a8485e..388331a3316 100644 --- a/ttnn/CMakeLists.txt +++ b/ttnn/CMakeLists.txt @@ -645,6 +645,7 @@ target_include_directories(ttnn PUBLIC ${TTNN_PUBLIC_INCLUDE_DIRS}) target_link_libraries(ttnn PUBLIC ${TTNN_PUBLIC_LINK_LIBRARIES}) target_link_directories(ttnn PUBLIC ${TTNN_PUBLIC_LINK_DIRS}) target_precompile_headers(ttnn PRIVATE ${TTNN_PRECOMPILED_HEADERS}) +TT_ENABLE_UNITY_BUILD(ttnn) #We move the library binaries to a different path rather than PROJECT_BINARY_DIR #in the Python wheel diff --git a/ttnn/cpp/pybind11/operations/creation.hpp b/ttnn/cpp/pybind11/operations/creation.hpp index ce6f2a44d55..2f10cf149ec 100644 --- a/ttnn/cpp/pybind11/operations/creation.hpp +++ b/ttnn/cpp/pybind11/operations/creation.hpp @@ -323,7 +323,7 @@ void bind_arange_operation(py::module& module, const creation_operation_t& opera py::arg("start") = 0, py::arg("end"), py::arg("step") = 1, - py::arg("dtype") = ttnn::bfloat16, + py::arg("dtype") = DataType::BFLOAT16, py::arg("device") = std::nullopt, py::arg("memory_config") = ttnn::DRAM_MEMORY_CONFIG}); } diff --git a/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.hpp b/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.hpp index d15841d1442..a085a650432 100644 --- a/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include "ttnn/decorators.hpp" #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp" diff --git a/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp b/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp index 5faf89bd0a6..fed321d9712 100644 --- a/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp @@ -147,8 +147,8 @@ void generate_edm_kernels_for_ring_or_linear_topology( std::vector const& counter_clockwise_edm_builders, std::optional receiver_device_id, std::optional sender_device_id) { - auto sender_noc = detail::GetPreferredNOCForDRAMRead(tt::Cluster::instance().arch()); - auto 
receiver_noc = detail::GetPreferredNOCForDRAMWrite(tt::Cluster::instance().arch()); + auto sender_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMRead(tt::Cluster::instance().arch()); + auto receiver_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMWrite(tt::Cluster::instance().arch()); uint32_t sender_socket_idx = 0; uint32_t receiver_socket_idx = 0; if (receiver_device_id == sender_device_id) { diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp index 6727c3a2349..0924001d006 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp @@ -88,7 +88,9 @@ operation::ProgramWithCallbacks ReduceScatter::create_program( this->user_defined_num_buffers_per_channel); } -static ttnn::operations::binary::BinaryOpType convert_reduce_type_to_eltwise_type(ttnn::operations::reduction::ReduceType reduce_op) { +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { +ttnn::operations::binary::BinaryOpType convert_reduce_type_to_eltwise_type(ttnn::operations::reduction::ReduceType reduce_op) { // Leaving switch statement for future support of additional types. switch (reduce_op) { case ttnn::operations::reduction::ReduceType::Sum: @@ -98,6 +100,8 @@ static ttnn::operations::binary::BinaryOpType convert_reduce_type_to_eltwise_typ return ttnn::operations::binary::BinaryOpType::ADD; } } +} +} namespace operations{ namespace ccl{ @@ -110,6 +114,7 @@ Tensor reduce_scatter( ttnn::ccl::Topology topology, const std::optional user_defined_num_workers, const std::optional user_defined_num_buffers_per_channel) { + using namespace CMAKE_UNIQUE_NAMESPACE; ttnn::operations::binary::BinaryOpType binary_op_type = convert_reduce_type_to_eltwise_type(math_op); TT_FATAL(std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr, "reduce_scatter op is only supported for Fast Dispatch"); @@ -162,6 +167,7 @@ Tensor reduce_scatter( ttnn::ccl::Topology topology, const std::optional user_defined_num_workers, const std::optional user_defined_num_buffers_per_channel) { + using namespace CMAKE_UNIQUE_NAMESPACE; ttnn::operations::binary::BinaryOpType binary_op_type = convert_reduce_type_to_eltwise_type(reduce_op); TT_FATAL(topology == ttnn::ccl::Topology::Linear, "This all_gather API with cluster_axis is currently supported only for the Linear topology"); diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp index dc7295a0d75..9573ff0595c 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp @@ -17,6 +17,8 @@ namespace conv2d { using namespace tt; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { const uint32_t act_cb = CB::c_in0; const uint32_t weight_cb = CB::c_in1; const uint32_t bias_cb = CB::c_in2; @@ -30,6 +32,8 @@ const uint32_t tilize_mode_tilized_act_cb = CB::c_intermed1; const uint32_t untilize_mode_reblock_cb = CB::c_intermed2; const uint32_t out0_cb = CB::c_out0; const uint32_t temp_sum_cb = CB::c_intermed3; +} +} operation::ProgramWithCallbacks multi_core_optimized_conv_width_sharded_v2_impl( @@ -82,6 +86,8 @@ std::tuple create_CBs_for_sharded_input_v2( bool fp32_dest_acc_en, bool packer_l1_acc_en, bool use_non_tile_height) { + using namespace CMAKE_UNIQUE_NAMESPACE; + 
tt::DataFormat interm0_df = packer_l1_acc_en ? (fp32_dest_acc_en ? tt::DataFormat::Float32 : tt::DataFormat::Float16_b) : out_df; @@ -260,6 +266,7 @@ std::tuple create_CBs_for_depthwise_sharded_input( bool split_reader, bool fp32_dest_acc_en, bool packer_l1_acc_en) { + using namespace CMAKE_UNIQUE_NAMESPACE; tt::DataFormat interm0_df = packer_l1_acc_en ? (fp32_dest_acc_en ? tt::DataFormat::Float32 : tt::DataFormat::Float16_b) : out_df; @@ -355,6 +362,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( bool enable_split_reader, bool enable_subblock_padding, bool use_non_tile_height) { + using namespace CMAKE_UNIQUE_NAMESPACE; bool pass = true; tt_metal::Device* device = a.device(); TT_FATAL(a.get_layout() == Layout::ROW_MAJOR, "Conv activation should be in row major layout"); diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp index 9cc49a47f22..ebde538c36a 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp @@ -19,19 +19,6 @@ namespace conv2d { using namespace tt; -const uint32_t act_cb = CB::c_in0; -const uint32_t weight_cb = CB::c_in1; -const uint32_t bias_cb = CB::c_in2; -const uint32_t sharded_act_cb = CB::c_in3; -const uint32_t cb_for_reader_indices = CB::c_in4; -const uint32_t cb_for_l1_array = CB::c_in5; -const uint32_t act_cb_row_major_bfloat16 = CB::c_in6; -const uint32_t act_cb_second_reader = CB::c_in7; -const uint32_t matmul_partials_cb = CB::c_intermed0; -const uint32_t tilize_mode_tilized_act_cb = CB::c_intermed1; -const uint32_t untilize_mode_reblock_cb = CB::c_intermed2; -const uint32_t out0_cb = CB::c_out0; - operation::ProgramWithCallbacks multi_core_optimized_conv_width_sharded_v2_impl( tt_metal::Program& program, const Tensor& a, @@ -54,6 +41,20 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_width_sharded_v2_impl( bool enable_act_double_buffer, bool enable_split_reader, bool enable_subblock_padding) { + + const uint32_t act_cb = CB::c_in0; + const uint32_t weight_cb = CB::c_in1; + const uint32_t bias_cb = CB::c_in2; + const uint32_t sharded_act_cb = CB::c_in3; + const uint32_t cb_for_reader_indices = CB::c_in4; + const uint32_t cb_for_l1_array = CB::c_in5; + const uint32_t act_cb_row_major_bfloat16 = CB::c_in6; + const uint32_t act_cb_second_reader = CB::c_in7; + const uint32_t matmul_partials_cb = CB::c_intermed0; + const uint32_t tilize_mode_tilized_act_cb = CB::c_intermed1; + const uint32_t untilize_mode_reblock_cb = CB::c_intermed2; + const uint32_t out0_cb = CB::c_out0; + bool pass = true; enable_split_reader = false; tt_metal::Device* device = a.device(); diff --git a/ttnn/cpp/ttnn/operations/creation.hpp b/ttnn/cpp/ttnn/operations/creation.hpp index 55f2a81d96e..2c82325a172 100644 --- a/ttnn/cpp/ttnn/operations/creation.hpp +++ b/ttnn/cpp/ttnn/operations/creation.hpp @@ -67,7 +67,7 @@ inline ttnn::Tensor full_impl( std::optional optional_output_tensor = std::nullopt) { Device* device = optional_output_tensor.has_value() ? optional_output_tensor.value().device() : device_arg.has_value() ? &(device_arg.value().get()) : nullptr; Layout layout_value = optional_output_tensor.has_value() ? optional_output_tensor.value().get_layout() : layout.value_or(ttnn::ROW_MAJOR_LAYOUT); - DataType dtype_value = optional_output_tensor.has_value() ? 
optional_output_tensor.value().get_dtype() : dtype.value_or(ttnn::bfloat16); + DataType dtype_value = optional_output_tensor.has_value() ? optional_output_tensor.value().get_dtype() : dtype.value_or(DataType::BFLOAT16); tt::tt_metal::LegacyShape shape_value = optional_output_tensor.has_value() ? optional_output_tensor.value().get_legacy_shape() : shape.value; MemoryConfig mem_cfg = optional_output_tensor.has_value() ? optional_output_tensor.value().memory_config() : memory_config.value_or(ttnn::DRAM_MEMORY_CONFIG); return numpy::full_impl( @@ -330,7 +330,7 @@ struct FullLike { struct Arange { static ttnn::Tensor invoke( const int64_t stop, - const DataType dtype = ttnn::bfloat16, + const DataType dtype = DataType::BFLOAT16, const std::optional>& device = std::nullopt, const MemoryConfig& memory_config = ttnn::DRAM_MEMORY_CONFIG) { return Arange::invoke(0, stop, 1, dtype, device, memory_config); @@ -340,20 +340,20 @@ struct Arange { const int64_t start, const int64_t stop, const int64_t step = 1, - const DataType dtype = ttnn::bfloat16, + const DataType dtype = ttnn::DataType::BFLOAT16, const std::optional>& device_arg = std::nullopt, const MemoryConfig& memory_config = ttnn::DRAM_MEMORY_CONFIG) { Device* device = device_arg.has_value() ? &(device_arg.value().get()) : nullptr; switch (dtype) { - case ttnn::bfloat16: + case DataType::BFLOAT16: return numpy::arange<::bfloat16>(start, stop, step, ttnn::ROW_MAJOR_LAYOUT, device, memory_config); - case ttnn::float32: + case DataType::FLOAT32: return numpy::arange(start, stop, step, ttnn::ROW_MAJOR_LAYOUT, device, memory_config); - case ttnn::uint16: + case DataType::UINT16: return numpy::arange(start, stop, step, ttnn::ROW_MAJOR_LAYOUT, device, memory_config); - case ttnn::uint32: + case DataType::UINT32: return numpy::arange(start, stop, step, ttnn::ROW_MAJOR_LAYOUT, device, memory_config); - case ttnn::int32: + case DataType::INT32: return numpy::arange(start, stop, step, ttnn::ROW_MAJOR_LAYOUT, device, memory_config); default: TT_THROW("Unsupported dtype"); } diff --git a/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_device_operation.hpp b/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_device_operation.hpp index 434e8db04cb..9675b95c63b 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_device_operation.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include "ttnn/decorators.hpp" #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp" diff --git a/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.hpp b/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.hpp index 57b2a50748c..4b53cccebb0 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include "ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_device_operation.hpp" #include "tt_metal/common/work_split.hpp" #include "tt_metal/detail/util.hpp" diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_op.cpp index f320b2a3d52..bb53c8bee61 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_op.cpp +++ 
b/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_op.cpp @@ -18,8 +18,8 @@ uint32_t get_num_cores(CoreCoord grid_size, uint32_t nblocks) { if (nblocks <= ncores) { ncores = nblocks; } else { - uint32_t nblocks_per_core = ceil((float)nblocks / ncores); - ncores = ceil((float)nblocks / nblocks_per_core); + uint32_t nblocks_per_core = std::ceil((float)nblocks / ncores); + ncores = std::ceil((float)nblocks / nblocks_per_core); } return ncores; } @@ -47,7 +47,7 @@ void Untilize::validate(const std::vector& input_tensors) const { TT_FATAL(this->output_mem_config.memory_layout == TensorMemoryLayout::HEIGHT_SHARDED, "Error"); uint32_t ntiles = input_tensor_a.volume() / TILE_HW; uint32_t ntiles_per_block = input_tensor_a.get_legacy_shape()[-1] / TILE_WIDTH; - uint32_t nblocks = ceil((float)ntiles / ntiles_per_block); + uint32_t nblocks = std::ceil((float)ntiles / ntiles_per_block); auto num_cores = untilize_helpers::get_num_cores(input_tensor_a.device()->compute_with_storage_grid_size(), nblocks); uint32_t fused_height = input_tensor_a.volume() / input_tensor_a.get_legacy_shape()[-1]; @@ -82,7 +82,7 @@ std::vector Untilize::create_output_tensors( } else { uint32_t ntiles = input_tensor.volume() / TILE_HW; uint32_t ntiles_per_block = input_tensor.get_legacy_shape()[-1] / TILE_WIDTH; - uint32_t nblocks = ceil((float)ntiles / ntiles_per_block); + uint32_t nblocks = std::ceil((float)ntiles / ntiles_per_block); auto num_cores = untilize_helpers::get_num_cores(input_tensor.device()->compute_with_storage_grid_size(), nblocks); auto shard_grid = diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp index 22826b7cd4d..f746b2fe105 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp @@ -287,7 +287,7 @@ operation::ProgramWithCallbacks untilize_multi_core( uint32_t ntiles = a.volume() / TILE_HW; uint32_t stick_s = a.get_legacy_shape()[-1]; uint32_t ntiles_per_block = a.get_legacy_shape()[-1] / TILE_WIDTH; - uint32_t nblocks = ceil((float)ntiles / ntiles_per_block); + uint32_t nblocks = std::ceil((float)ntiles / ntiles_per_block); uint32_t block_size_nbytes = a.get_legacy_shape()[-1] * output.element_size(); uint32_t max_l1_size = a.device()->l1_size_per_core() / 2 - a.device()->get_base_allocator_addr(HalMemType::L1); diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_op.cpp index 92d31327d3a..8c3f07035de 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_op.cpp @@ -41,7 +41,7 @@ std::vector UntilizeWithHaloV2::compute_output_shapes // output_shape[1] remains same // output_shape[2] changes // output_shape[3] remains same - output_shape[2] = (uint32_t)ceil((float)total_nsticks / nbatch); + output_shape[2] = (uint32_t)std::ceil((float)total_nsticks / nbatch); log_debug( tt::LogOp, "output_shape: [{} {} {} {}]", output_shape[0], output_shape[1], output_shape[2], output_shape[3]); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_and_width_multi_core_program_factory.cpp 
b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_and_width_multi_core_program_factory.cpp index de27f41453a..ef69cacc9d0 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_and_width_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_and_width_multi_core_program_factory.cpp @@ -17,6 +17,8 @@ namespace ttnn::operations::binary { +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { static const BcastOpMath binary_op_type_to_bcast_op_math(const BinaryOpType binary_op_type) { switch (binary_op_type) { case BinaryOpType::ADD: return BcastOpMath::ADD; @@ -25,6 +27,8 @@ static const BcastOpMath binary_op_type_to_bcast_op_math(const BinaryOpType bina default: TT_THROW("BinaryOpType cannot be mapped to BcastOpMath"); } } +} +} BinaryDeviceOperation::BroadcastHeightAndWidthMultiCore::cached_program_t BinaryDeviceOperation::BroadcastHeightAndWidthMultiCore::create( @@ -34,6 +38,7 @@ BinaryDeviceOperation::BroadcastHeightAndWidthMultiCore::create( using namespace tt; using namespace tt::tt_metal; using namespace tt::constants; + using namespace CMAKE_UNIQUE_NAMESPACE; const auto& a = tensor_args.input_tensor_a; const auto& b = tensor_args.input_tensor_b; diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_program_factory.cpp index ac00c26f9a3..91def43dade 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_program_factory.cpp @@ -15,6 +15,8 @@ namespace ttnn::operations::binary { +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { static const BcastOpMath binary_op_type_to_bcast_op_math(const BinaryOpType binary_op_type) { switch (binary_op_type) { case BinaryOpType::ADD: return BcastOpMath::ADD; @@ -23,6 +25,8 @@ static const BcastOpMath binary_op_type_to_bcast_op_math(const BinaryOpType bina default: TT_THROW("BinaryOpType cannot be mapped to BcastOpMath"); } } +} +} BinaryDeviceOperation::BroadcastHeightMultiCore::cached_program_t BinaryDeviceOperation ::BroadcastHeightMultiCore::create( @@ -32,6 +36,7 @@ BinaryDeviceOperation ::BroadcastHeightMultiCore::create( using namespace tt; using namespace tt::tt_metal; using namespace tt::constants; + using namespace CMAKE_UNIQUE_NAMESPACE; const auto& a = tensor_args.input_tensor_a; const auto& b = tensor_args.input_tensor_b; diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_sharded_optimized_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_sharded_optimized_program_factory.cpp index f28c4e01db1..b52c1dd8bbf 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_sharded_optimized_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_sharded_optimized_program_factory.cpp @@ -15,6 +15,8 @@ namespace ttnn::operations::binary { +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { static const BcastOpMath binary_op_type_to_bcast_op_math(const BinaryOpType binary_op_type) { switch (binary_op_type) { case BinaryOpType::ADD: return BcastOpMath::ADD; @@ -23,6 +25,8 @@ static const BcastOpMath binary_op_type_to_bcast_op_math(const BinaryOpType bina default: TT_THROW("BinaryOpType cannot be mapped to BcastOpMath"); } } +} +} 
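The four broadcast program factories each carry a byte-identical static helper, binary_op_type_to_bcast_op_math, and the hunks here wrap every copy in the per-file namespace. The sketch below shows why static alone stops working under a unity build; the merged file name unity_merged.cxx is hypothetical, standing in for whatever unity source CMake generates.

// unity_merged.cxx -- conceptually what a unity build produces:
//   #include "broadcast_height_multi_core_program_factory.cpp"
//   #include "broadcast_width_multi_core_program_factory.cpp"
//
// Both files define, at namespace scope:
//   static const BcastOpMath binary_op_type_to_bcast_op_math(BinaryOpType);
// static only hides a symbol from *other* translation units; once the two
// files share one merged unit, the second definition is a redefinition error.
// Wrapped in namespace { namespace CMAKE_UNIQUE_NAMESPACE { ... } }, each copy
// gets a distinct fully-qualified name, and each create() brings its own copy
// into scope with: using namespace CMAKE_UNIQUE_NAMESPACE;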
BinaryDeviceOperation::BroadcastHeightMultiCoreShardedOptimized::cached_program_t BinaryDeviceOperation::BroadcastHeightMultiCoreShardedOptimized::create( @@ -32,6 +36,7 @@ BinaryDeviceOperation::BroadcastHeightMultiCoreShardedOptimized::create( using namespace tt; using namespace tt::tt_metal; using namespace tt::constants; + using namespace CMAKE_UNIQUE_NAMESPACE; const auto& a = tensor_args.input_tensor_a; const auto& b = tensor_args.input_tensor_b; diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_width_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_width_multi_core_program_factory.cpp index 82c93081909..62e88253d59 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_width_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_width_multi_core_program_factory.cpp @@ -15,6 +15,8 @@ namespace ttnn::operations::binary { +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { static const BcastOpMath binary_op_type_to_bcast_op_math(const BinaryOpType binary_op_type) { switch (binary_op_type) { case BinaryOpType::ADD: return BcastOpMath::ADD; @@ -23,6 +25,8 @@ static const BcastOpMath binary_op_type_to_bcast_op_math(const BinaryOpType bina default: TT_THROW("BinaryOpType cannot be mapped to BcastOpMath"); } } +} +} BinaryDeviceOperation::BroadcastWidthMultiCore::cached_program_t BinaryDeviceOperation::BroadcastWidthMultiCore::create( const operation_attributes_t& operation_attributes, @@ -31,6 +35,7 @@ BinaryDeviceOperation::BroadcastWidthMultiCore::cached_program_t BinaryDeviceOpe using namespace tt; using namespace tt::tt_metal; using namespace tt::constants; + using namespace CMAKE_UNIQUE_NAMESPACE; const auto& a = tensor_args.input_tensor_a; const auto& b = tensor_args.input_tensor_b; diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.cpp index e9d4c2d84e9..e2f771f37f6 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.cpp @@ -50,8 +50,8 @@ UnaryShardedProgramFactory::cached_program_t UnaryShardedProgramFactory::create( uint32_t num_tile_per_core = 0; if (input.get_dtype() == DataType::BFLOAT8_B) { - uint32_t ntiles_along_width = ceil(shard_spec.shape[1] / (float) tt::constants::TILE_WIDTH); - uint32_t ntiles_along_height = ceil(shard_spec.shape[0] / (float) tt::constants::TILE_HEIGHT); + uint32_t ntiles_along_width = std::ceil(shard_spec.shape[1] / (float) tt::constants::TILE_WIDTH); + uint32_t ntiles_along_height = std::ceil(shard_spec.shape[0] / (float) tt::constants::TILE_HEIGHT); num_tile_per_core = ntiles_along_width * ntiles_along_height; } else { TT_FATAL( diff --git a/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp b/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp index f35d2793889..b925b9bf8e5 100644 --- a/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp @@ -133,9 +133,9 @@ operation::ProgramWithCallbacks embeddings_tilized( auto cb_output = tt_metal::CreateCircularBuffer(program, all_cores, cb_output_config); bool input_stick_size_is_power_of_two = is_power_of_two_at_least_32(input_page_size); - uint32_t input_log2_stick_size = input_stick_size_is_power_of_two ? 
(std::uint32_t)log2(input_page_size) : 0; + uint32_t input_log2_stick_size = input_stick_size_is_power_of_two ? (std::uint32_t)std::log2(input_page_size) : 0; bool weight_stick_size_is_power_of_two = is_power_of_two_at_least_32(weight_page_size); - uint32_t weight_log2_stick_size = weight_stick_size_is_power_of_two ? (std::uint32_t)log2(weight_page_size) : 0; + uint32_t weight_log2_stick_size = weight_stick_size_is_power_of_two ? (std::uint32_t)std::log2(weight_page_size) : 0; // Create Kernels // reader @@ -370,9 +370,9 @@ operation::ProgramWithCallbacks embeddings_rm( uint32_t output_cb_index = src0_cb_index; bool input_stick_size_is_power_of_two = is_power_of_two_at_least_32(input_page_size); - uint32_t input_log2_stick_size = input_stick_size_is_power_of_two ? (std::uint32_t)log2(input_page_size) : 0; + uint32_t input_log2_stick_size = input_stick_size_is_power_of_two ? (std::uint32_t)std::log2(input_page_size) : 0; bool weight_stick_size_is_power_of_two = is_power_of_two_at_least_32(weight_page_size); - uint32_t weight_log2_stick_size = weight_stick_size_is_power_of_two ? (std::uint32_t)log2(weight_page_size) : 0; + uint32_t weight_log2_stick_size = weight_stick_size_is_power_of_two ? (std::uint32_t)std::log2(weight_page_size) : 0; // Create Kernels // reader @@ -407,7 +407,7 @@ operation::ProgramWithCallbacks embeddings_rm( embedding_defines)); bool output_stick_size_is_power_of_two = is_power_of_two_at_least_32(output_page_size); - uint32_t output_log2_stick_size = output_stick_size_is_power_of_two ? (std::uint32_t)log2(output_page_size) : 0; + uint32_t output_log2_stick_size = output_stick_size_is_power_of_two ? (std::uint32_t)std::log2(output_page_size) : 0; std::vector writer_compile_time_args = { (std::uint32_t)output_cb_index, (std::uint32_t)out_is_dram, diff --git a/ttnn/cpp/ttnn/operations/embedding_backward/device/embedding_backward_program_factory.cpp b/ttnn/cpp/ttnn/operations/embedding_backward/device/embedding_backward_program_factory.cpp index 171a6ab7266..6d08f349266 100644 --- a/ttnn/cpp/ttnn/operations/embedding_backward/device/embedding_backward_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/embedding_backward/device/embedding_backward_program_factory.cpp @@ -105,7 +105,7 @@ operation::ProgramWithCallbacks embedding_backward_multi_core( // reader bool index_stick_size_is_power_of_two = is_power_of_two_at_least_32(index_page_size); - uint32_t index_log2_stick_size = index_stick_size_is_power_of_two ? log2(index_page_size) : 0; + uint32_t index_log2_stick_size = index_stick_size_is_power_of_two ? std::log2(index_page_size) : 0; std::vector reader_compile_time_args = { grad_is_dram, diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce/device/all_reduce_op.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce/device/all_reduce_op.cpp index f630d8de757..f64a761ff8e 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce/device/all_reduce_op.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce/device/all_reduce_op.cpp @@ -46,7 +46,9 @@ operation::ProgramWithCallbacks AllReduce::create_program( this->user_defined_num_buffers_per_channel); } -static ttnn::operations::binary::BinaryOpType convert_reduce_type_to_eltwise_type(ttnn::operations::reduction::ReduceType reduce_op) { +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { +ttnn::operations::binary::BinaryOpType convert_reduce_type_to_eltwise_type(ttnn::operations::reduction::ReduceType reduce_op) { // Leaving switch statement for future support of additional types. 
switch (reduce_op) { case ttnn::operations::reduction::ReduceType::Sum: @@ -56,6 +58,8 @@ static ttnn::operations::binary::BinaryOpType convert_reduce_type_to_eltwise_typ return ttnn::operations::binary::BinaryOpType::ADD; } } +} +} namespace operations{ namespace experimental{ @@ -174,6 +178,7 @@ Tensor all_reduce( ttnn::ccl::Topology topology, const std::optional user_defined_num_workers, const std::optional user_defined_num_buffers_per_channel) { + using namespace CMAKE_UNIQUE_NAMESPACE; ttnn::operations::binary::BinaryOpType binary_op_type = convert_reduce_type_to_eltwise_type(math_op); TT_FATAL(std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr, "All Reduce op is only supported for Fast Dispatch"); TT_FATAL(topology == ttnn::ccl::Topology::Ring, "All Reduce op is currently supported only on Ring topology"); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp index e1b2660ef51..50a9012ceb7 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp @@ -13,8 +13,6 @@ #include "tt_metal/detail/util.hpp" #include "tt_metal/host_api.hpp" -using namespace tt::constants; - namespace tt { namespace tt_metal { @@ -27,6 +25,8 @@ operation::ProgramWithCallbacks rotary_embedding_multi_core( std::optional token_idx, ttnn::DeviceComputeKernelConfig compute_kernel_config ) { + using namespace tt::constants; + Program program{}; tt::DataFormat input_cb_data_format = tt_metal::datatype_to_dataformat_converter(input.get_dtype()); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_device_operation.cpp index 401227158f0..cfc5d3970d1 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_device_operation.cpp @@ -8,13 +8,12 @@ #include "tt_metal/common/constants.hpp" #include "tt_metal/host_api.hpp" -using namespace tt::constants; - namespace tt { namespace tt_metal { void RotaryEmbeddingLlama::validate(const std::vector& input_tensors) const { + using namespace tt::constants; const auto& input_tensor = input_tensors.at(0); const auto& cos = input_tensors.at(1); const auto& sin = input_tensors.at(2); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_program_factory.cpp index 0f1cd8a6131..6c075f6b7c8 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_program_factory.cpp @@ -10,8 +10,6 @@ #include "tt_metal/detail/util.hpp" #include "tt_metal/host_api.hpp" -using namespace tt::constants; - namespace tt { namespace tt_metal { @@ -24,6 +22,7 @@ operation::ProgramWithCallbacks 
 rotary_embedding_llama_multi_core(
     Tensor &output,
     ttnn::DeviceComputeKernelConfig compute_kernel_config
 ) {
+    using namespace tt::constants;
     Program program{};

     const tt::DataFormat input_cb_data_format = tt_metal::datatype_to_dataformat_converter(input.get_dtype());
diff --git a/ttnn/cpp/ttnn/operations/full/device/full_device_operation.hpp b/ttnn/cpp/ttnn/operations/full/device/full_device_operation.hpp
index 1a12849b814..a7efe354c34 100644
--- a/ttnn/cpp/ttnn/operations/full/device/full_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/full/device/full_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include

 #include "ttnn/decorators.hpp"
diff --git a/ttnn/cpp/ttnn/operations/full_like/device/full_like_device_operation.hpp b/ttnn/cpp/ttnn/operations/full_like/device/full_like_device_operation.hpp
index c4c59a724d1..7b8341ef7db 100644
--- a/ttnn/cpp/ttnn/operations/full_like/device/full_like_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/full_like/device/full_like_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include
 #include
diff --git a/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.hpp b/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.hpp
index 7ed3cb413c3..757c88962b2 100644
--- a/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.hpp
@@ -1,6 +1,9 @@
 // SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
 #include
 #include
diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp
index 32eb87cd13a..93e36fee7f2 100644
--- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp
+++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp
@@ -164,7 +164,7 @@ struct Matmul {
     const bool user_run_batched = false;
     const bool transpose_a = false;
     const bool transpose_b = false;
-    const std::optional output_tile;
+    const std::optional output_tile;

     void validate(
         const std::vector &input_tensors,
diff --git a/ttnn/cpp/ttnn/operations/matmul/matmul.cpp b/ttnn/cpp/ttnn/operations/matmul/matmul.cpp
index 227ab4af119..c2d14cddedc 100644
--- a/ttnn/cpp/ttnn/operations/matmul/matmul.cpp
+++ b/ttnn/cpp/ttnn/operations/matmul/matmul.cpp
@@ -104,7 +104,7 @@ Tensor MatmulOperation::invoke(
     const std::optional& activation,
     const std::optional compute_kernel_config,
     const std::optional core_grid,
-    const std::optional& output_tile) {
+    const std::optional& output_tile) {
     std::optional user_core_coord;
     if (core_grid.has_value()) {
         user_core_coord = CoreCoord(core_grid->x, core_grid->y);
@@ -142,7 +142,7 @@ Tensor LinearOperation::invoke(
     const std::optional& activation,
     const std::optional compute_kernel_config,
     const std::optional core_grid,
-    const std::optional& output_tile) {
+    const std::optional& output_tile) {
     std::optional user_core_coord;
     if (core_grid.has_value()) {
         user_core_coord = CoreCoord(core_grid->x, core_grid->y);
diff --git a/ttnn/cpp/ttnn/operations/matmul/matmul.hpp b/ttnn/cpp/ttnn/operations/matmul/matmul.hpp
index 8559de2d18d..eb450bd2896 100644
--- a/ttnn/cpp/ttnn/operations/matmul/matmul.hpp
+++ b/ttnn/cpp/ttnn/operations/matmul/matmul.hpp
@@ -47,7 +47,7 @@ struct MatmulOperation {
         const std::optional& activation = std::nullopt,
         const std::optional compute_kernel_config = std::nullopt,
         const std::optional core_grid = std::nullopt,
-        const std::optional& output_tile = std::nullopt);
+        const std::optional& output_tile = std::nullopt);
 };

 struct LinearOperation {
@@ -63,7 +63,7 @@ struct LinearOperation {
         const std::optional& activation = std::nullopt,
         const std::optional compute_kernel_config = std::nullopt,
         const std::optional core_grid = std::nullopt,
-        const std::optional& output_tile = std::nullopt);
+        const std::optional& output_tile = std::nullopt);
 };

 } // namespace matmul
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.hpp
index b39450d24ad..8795ae3e254 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include
 #include
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/moreh_arange_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/moreh_arange_device_operation.hpp
index 23540113725..a4eae4dd393 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/moreh_arange_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/moreh_arange_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"

 namespace ttnn::operations::moreh::moreh_arange {
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/moreh_cumsum_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/moreh_cumsum_device_operation.hpp
index 04d3f356647..0b4006a0cc7 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/moreh_cumsum_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/moreh_cumsum_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/device_operation.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_device_operation.hpp
index 92bef5f3ac0..5fe907df84f 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/device_operation.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_device_operation.hpp
index 34ae223deb8..f51df643baf 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include
 #include
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_device_operation.hpp
index 1cbae242656..0ca74ffcf42 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include
 #include
 #include
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_rm_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_rm_factory.cpp
index 8f74b1cbe4f..c0f7e7ae8f4 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_rm_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_rm_factory.cpp
@@ -5,12 +5,16 @@
 #include "moreh_getitem_device_operation.hpp"
 #include "ttnn/operations/moreh/moreh_helper_functions.hpp"

+namespace {
+namespace CMAKE_UNIQUE_NAMESPACE {
 struct IndexInfo {
     bool is_defined;
     bool is_dram;
     uint32_t address;
     uint32_t unit_size;
 };
+}
+}

 namespace ttnn::operations::moreh::moreh_getitem {

 MorehGetItemOperation::MorehGetItemRmFactory::cached_program_t MorehGetItemOperation::MorehGetItemRmFactory::create(
@@ -20,6 +24,7 @@ MorehGetItemOperation::MorehGetItemRmFactory::cached_program_t MorehGetItemOpera
     using namespace tt;
     using namespace tt::tt_metal;
     using namespace tt::operations::primary;
+    using namespace CMAKE_UNIQUE_NAMESPACE;

     auto input = tensor_args.input;
     auto index_tensors = tensor_args.index_tensors;
@@ -234,6 +239,7 @@ void MorehGetItemOperation::MorehGetItemRmFactory::override_runtime_arguments(
     const operation_attributes_t &operation_attributes,
     const tensor_args_t &tensor_args,
     tensor_return_value_t &tensor_return_value) {
+    using namespace CMAKE_UNIQUE_NAMESPACE;
     auto &program = cached_program.program;
     auto &reader_kernel_id = cached_program.shared_variables.unary_reader_kernel_id;
     auto &writer_kernel_id = cached_program.shared_variables.unary_writer_kernel_id;
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_factory.cpp
index 74749bc4636..e3c81d815c7 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_factory.cpp
@@ -8,12 +8,16 @@
 #include "moreh_getitem_device_operation.hpp"
 #include "ttnn/operations/moreh/moreh_helper_functions.hpp"

+namespace {
+namespace CMAKE_UNIQUE_NAMESPACE {
 struct IndexInfo {
     bool is_defined;
     bool is_dram;
     uint32_t address;
     uint32_t unit_size;
 };
+}
+}

 namespace ttnn::operations::moreh::moreh_getitem {

 MorehGetItemOperation::MorehGetItemTilizedFactory::cached_program_t
@@ -24,6 +28,7 @@ MorehGetItemOperation::MorehGetItemTilizedFactory::create(
     using namespace tt;
     using namespace tt::tt_metal;
     using namespace tt::operations::primary;
+    using namespace CMAKE_UNIQUE_NAMESPACE;

     auto input = tensor_args.input;
     auto index_tensors = tensor_args.index_tensors;
@@ -547,6 +552,7 @@ void MorehGetItemOperation::MorehGetItemTilizedFactory::override_runtime_argumen
     const operation_attributes_t &operation_attributes,
     const tensor_args_t &tensor_args,
     tensor_return_value_t &tensor_return_value) {
+    using namespace CMAKE_UNIQUE_NAMESPACE;
     auto &program = cached_program.program;
     auto &reader_kernel_id = cached_program.shared_variables.unary_reader_kernel_id;
     auto &writer_kernel_id = cached_program.shared_variables.unary_writer_kernel_id;
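Note on the `namespace { namespace CMAKE_UNIQUE_NAMESPACE { ... } }` wrapper, which recurs throughout this patch: under a unity build CMake concatenates several .cpp files into one generated translation unit, so two file-local helpers with the same name (such as the two `IndexInfo` structs above) would otherwise collide. An anonymous namespace alone does not help, because the merged TU has a single anonymous namespace. `CMAKE_UNIQUE_NAMESPACE` is the macro named by the `UNITY_BUILD_UNIQUE_ID` target property (CMake 3.20+, matching the minimum-version bump in this patch), which CMake defines to a different identifier for each source file. A minimal sketch, with illustrative names that are not from the repo:

#include <cstdint>

// CMake expands CMAKE_UNIQUE_NAMESPACE to something like unity_0x1a2b,
// unique per source file, before compiling each chunk of the unity TU.
// (Without the macro defined, this is simply a namespace with that literal
// name, so the sketch compiles either way.)
namespace {
namespace CMAKE_UNIQUE_NAMESPACE {
struct IndexInfo {
    bool is_defined;
    bool is_dram;
    std::uint32_t address;
    std::uint32_t unit_size;
};
}  // namespace CMAKE_UNIQUE_NAMESPACE
}  // namespace

void create_sketch() {  // hypothetical factory function
    using namespace CMAKE_UNIQUE_NAMESPACE;
    IndexInfo info{};   // unambiguous even after TU merging
    (void)info;
}

With this pattern, a second source file in the same unity batch can define its own `IndexInfo` in its own uniquely named inner namespace without triggering a redefinition error.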
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm/device/moreh_group_norm_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm/device/moreh_group_norm_device_operation.hpp
index 480aac7cf01..f69fdb3acad 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm/device/moreh_group_norm_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm/device/moreh_group_norm_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/device_operation.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/gamma_beta_grad/moreh_group_norm_backward_gamma_beta_grad_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/gamma_beta_grad/moreh_group_norm_backward_gamma_beta_grad_device_operation.hpp
index 4eda36fccbb..5b5fb240f15 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/gamma_beta_grad/moreh_group_norm_backward_gamma_beta_grad_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/gamma_beta_grad/moreh_group_norm_backward_gamma_beta_grad_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include
 #include
 #include
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/input_grad/moreh_group_norm_backward_input_grad_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/input_grad/moreh_group_norm_backward_input_grad_device_operation.hpp
index 160e3425aae..2d6bfa7450d 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/input_grad/moreh_group_norm_backward_input_grad_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/input_grad/moreh_group_norm_backward_input_grad_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/moreh_layer_norm_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/moreh_layer_norm_device_operation.hpp
index e7bef713e90..2c0334abc35 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/moreh_layer_norm_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/moreh_layer_norm_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/device_operation.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_gamma_beta_grad_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_gamma_beta_grad_device_operation.hpp
index cbceeaefb10..aec3c8a6bf4 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_gamma_beta_grad_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_gamma_beta_grad_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/device_operation.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_input_grad_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_input_grad_device_operation.hpp
index c3167583676..fdfb4621253 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_input_grad_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_input_grad_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/device_operation.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_device_operation.hpp
index 1ec9b4ddd74..55a88691a6a 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include

 #include "ttnn/decorators.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_device_operation.hpp
index 92d083eef03..743c2e1c162 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include
 #include
 #include
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_device_operation.hpp
index 376bd1759e6..54849e02229 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include
 #include
 #include
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.hpp
index 1b8b4da8ab2..ffa38825da0 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm_backward/device/moreh_norm_backward_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm_backward/device/moreh_norm_backward_device_operation.hpp
index a955818f626..ff8d5df97f6 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_norm_backward/device/moreh_norm_backward_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_norm_backward/device/moreh_norm_backward_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_device_operation.hpp
index b9bfd28c7b7..ababc16fd38 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/device_operation.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/moreh_softmax_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/moreh_softmax_device_operation.hpp
index 42ee2f45b6e..e8829852fdf 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/moreh_softmax_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/moreh_softmax_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/moreh_softmax_backward_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/moreh_softmax_backward_device_operation.hpp
index a6906e46ef3..68da88b09eb 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/moreh_softmax_backward_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/moreh_softmax_backward_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_device_operation.hpp
index a62db287a25..f3ce1416459 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include

 #include "ttnn/decorators.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/moreh_sum_backward_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/moreh_sum_backward_device_operation.hpp
index e848e61fa2b..9d5e86c3873 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/moreh_sum_backward_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/moreh_sum_backward_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/device_operation.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp
index 94f5706c5c7..e493824e0d1 100644
--- a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp
+++ b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp
@@ -18,6 +18,8 @@ using namespace tt::constants;

 namespace ttnn::operations::normalization {

+namespace {
+namespace CMAKE_UNIQUE_NAMESPACE {
 inline bool is_dram(const Tensor& input_tensor) { return input_tensor.memory_config().buffer_type == BufferType::DRAM; }
 inline bool is_dram(const std::optional input_tensor) {
     return input_tensor.has_value() ? is_dram(input_tensor.value()) : true;
@@ -111,6 +113,8 @@ std::pair find_max_tile_span(uint32_t W, uint32_t group_size
     return {max_tile_span, num_groups_before_start_again_at_tile_beginning};
 }
+}
+}

 operation::ProgramWithCallbacks groupnorm_multi_core_sharded(
     const Tensor &a,
@@ -126,6 +130,7 @@ operation::ProgramWithCallbacks groupnorm_multi_core_sharded(
     CoreCoord grid_size,
     bool inplace
 ) {
+    using namespace CMAKE_UNIQUE_NAMESPACE;
     if (gamma.has_value()) {
         TT_ASSERT(gamma.value().get_layout() == Layout::ROW_MAJOR);
     }
@@ -477,8 +482,8 @@ operation::ProgramWithCallbacks groupnorm_multi_core_sharded(
         (std::uint32_t) per_core_Mt,
         (std::uint32_t) TILE_HEIGHT
     };
-    tt::tt_metal::NOC reader_noc = detail::GetPreferredNOCForDRAMWrite(device->arch());
-    tt::tt_metal::NOC writer_noc = detail::GetPreferredNOCForDRAMRead(device->arch());
+    tt::tt_metal::NOC reader_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMWrite(device->arch());
+    tt::tt_metal::NOC writer_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMRead(device->arch());
     // reader kernel
     auto reader_mcast_sender_kernels_id = CreateKernel(
         program,
diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp
index d9e387165e4..affb95045eb 100644
--- a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp
+++ b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp
@@ -18,6 +18,8 @@ using namespace tt::constants;

 namespace ttnn::operations::normalization {

+namespace {
+namespace CMAKE_UNIQUE_NAMESPACE {
 inline bool is_dram(const Tensor& input_tensor) { return input_tensor.memory_config().buffer_type == BufferType::DRAM; }
 inline bool is_dram(const std::optional input_tensor) {
     return input_tensor.has_value() ? is_dram(input_tensor.value()) : true;
@@ -40,6 +42,8 @@ inline uint32_t pack_two_bfloat16_into_uint32(std::pair two_
     // second -> upper 16
     return (uint32_t)two_bfloats.first | ((uint32_t)two_bfloats.second << 16);
 }
+}
+}

 // computes layernorm(a+*b)*gamma + beta
 // if b is nullptr it's treated as zero (no addition)
@@ -53,6 +57,7 @@ operation::ProgramWithCallbacks layernorm_multi_core(
     float eps,
     DeviceComputeKernelConfig compute_kernel_config
 ) {
+    using namespace CMAKE_UNIQUE_NAMESPACE;
     bool rms_norm = norm_type == LayerNormType::RMSNORM;
     const auto shape = a.get_legacy_shape();
     uint32_t W = shape[-1], H = shape[-2];
@@ -192,7 +197,7 @@ operation::ProgramWithCallbacks layernorm_multi_core(
     bool gamma_stick_size_is_power_of_two = is_power_of_two_at_least_32(gamma_stick_size);
     reader_compile_time_args.push_back((std::uint32_t) gamma_stick_size_is_power_of_two);
     if (gamma_stick_size_is_power_of_two) {
-        uint32_t gamma_log2_stick_size = gamma_stick_size_is_power_of_two ? (std::uint32_t)log2(gamma_stick_size) : 0;
+        uint32_t gamma_log2_stick_size = gamma_stick_size_is_power_of_two ? (std::uint32_t)std::log2(gamma_stick_size) : 0;
         reader_compile_time_args.push_back((std::uint32_t) gamma_log2_stick_size);
     } else {
         reader_compile_time_args.push_back(gamma_stick_size);
@@ -202,7 +207,7 @@ operation::ProgramWithCallbacks layernorm_multi_core(
     bool beta_stick_size_is_power_of_two = is_power_of_two_at_least_32(beta_stick_size);
     reader_compile_time_args.push_back((std::uint32_t) beta_stick_size_is_power_of_two);
     if (beta_stick_size_is_power_of_two) {
-        uint32_t beta_log2_stick_size = beta_stick_size_is_power_of_two ? (std::uint32_t)log2(beta_stick_size) : 0;
+        uint32_t beta_log2_stick_size = beta_stick_size_is_power_of_two ? (std::uint32_t)std::log2(beta_stick_size) : 0;
         reader_compile_time_args.push_back((std::uint32_t) beta_log2_stick_size);
     } else {
         reader_compile_time_args.push_back(beta_stick_size);
@@ -407,6 +412,7 @@ operation::ProgramWithCallbacks layernorm_multi_core_sharded(
     uint32_t block_wt,
     DeviceComputeKernelConfig compute_kernel_config
 ) {
+    using namespace CMAKE_UNIQUE_NAMESPACE;
     bool rms_norm = norm_type == LayerNormType::RMSNORM;
     bool is_pre_all_gather = distributed_norm_stage == DistributedLayerNormStage::PRE_ALL_GATHER;
     bool is_post_all_gather = distributed_norm_stage == DistributedLayerNormStage::POST_ALL_GATHER;
@@ -904,7 +910,7 @@ operation::ProgramWithCallbacks layernorm_multi_core_sharded(
     writer_mcast_sender_compile_time_args.push_back((std::uint32_t) gamma_stick_size_is_power_of_two);
     writer_mcast_receiver_compile_time_args.push_back((std::uint32_t) gamma_stick_size_is_power_of_two);
     if (gamma_stick_size_is_power_of_two) {
-        uint32_t gamma_log2_stick_size = gamma_stick_size_is_power_of_two ? (std::uint32_t)log2(gamma_stick_size) : 0;
+        uint32_t gamma_log2_stick_size = gamma_stick_size_is_power_of_two ? (std::uint32_t)std::log2(gamma_stick_size) : 0;
         writer_mcast_sender_compile_time_args.push_back((std::uint32_t) gamma_log2_stick_size);
         writer_mcast_receiver_compile_time_args.push_back((std::uint32_t) gamma_log2_stick_size);
     } else {
@@ -917,7 +923,7 @@ operation::ProgramWithCallbacks layernorm_multi_core_sharded(
     writer_mcast_sender_compile_time_args.push_back((std::uint32_t) beta_stick_size_is_power_of_two);
     writer_mcast_receiver_compile_time_args.push_back((std::uint32_t) beta_stick_size_is_power_of_two);
     if (beta_stick_size_is_power_of_two) {
-        uint32_t beta_log2_stick_size = beta_stick_size_is_power_of_two ? (std::uint32_t)log2(beta_stick_size) : 0;
+        uint32_t beta_log2_stick_size = beta_stick_size_is_power_of_two ? (std::uint32_t)std::log2(beta_stick_size) : 0;
         writer_mcast_sender_compile_time_args.push_back((std::uint32_t) beta_log2_stick_size);
         writer_mcast_receiver_compile_time_args.push_back((std::uint32_t) beta_log2_stick_size);
     } else {
diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp
index 3daf51392fc..6fb5f6e2ba6 100644
--- a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp
+++ b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp
@@ -19,6 +19,8 @@ using namespace tt::tt_metal;

 namespace ttnn::operations::normalization {

+namespace {
+namespace CMAKE_UNIQUE_NAMESPACE {
 inline bool is_dram(const Tensor& input_tensor) { return input_tensor.memory_config().buffer_type == BufferType::DRAM; }
 inline bool is_dram(const std::optional input_tensor) {
     return input_tensor.has_value() ? is_dram(input_tensor.value()) : true;
@@ -41,6 +43,8 @@ inline uint32_t pack_two_bfloat16_into_uint32(std::pair two_
     // second -> upper 16
     return (uint32_t)two_bfloats.first | ((uint32_t)two_bfloats.second << 16);
 }
+}
+}

 // computes layernorm(a)*gamma + beta
 operation::ProgramWithCallbacks layernorm_post_allgather_multi_core(
@@ -53,6 +57,7 @@ operation::ProgramWithCallbacks layernorm_post_allgather_multi_core(
     float eps,
     ttnn::DeviceComputeKernelConfig compute_kernel_config
 ) {
+    using namespace CMAKE_UNIQUE_NAMESPACE;
     const bool is_rmsnorm = norm_type == LayerNormDistributedType::RMSNORM;
     const auto shape = a.get_legacy_shape();
     const uint32_t W = shape[-1], H = shape[-2];
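The repeated `log2` to `std::log2` changes above (and the `ceil` to `std::ceil` change further down) qualify C math calls explicitly for the same unity-build reason: once translation units are merged, another source's using-directives or same-named helpers can change what an unqualified call resolves to. A minimal sketch, with hypothetical names, of how silently this can go wrong:

#include <cmath>
#include <cstdint>

namespace other_op {
// Some other source in the same unity batch defines its own helper...
inline int log2(std::uint32_t v) { int n = 0; while (v >>= 1) { ++n; } return n; }
}
using namespace other_op;  // ...and pulls it into scope with a using-directive.

std::uint32_t stick_size_log2(std::uint32_t stick_size) {
    // An unqualified log2(stick_size) would now prefer other_op::log2
    // (exact match on std::uint32_t) over the C math function, or become
    // ambiguous, depending on what else lands in the merged TU. Explicit
    // qualification pins the call to the standard library.
    return (std::uint32_t)std::log2(stick_size);
}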
diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_pre_all_gather_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_pre_all_gather_op_multi_core.cpp
index aedca4b1c2a..92f7d7601ef 100644
--- a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_pre_all_gather_op_multi_core.cpp
+++ b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_pre_all_gather_op_multi_core.cpp
@@ -18,6 +18,8 @@ using namespace tt::constants;

 namespace ttnn::operations::normalization {

+namespace {
+namespace CMAKE_UNIQUE_NAMESPACE {
 inline bool is_dram(const Tensor& input_tensor) { return input_tensor.memory_config().buffer_type == BufferType::DRAM; }
 inline bool is_dram(const std::optional input_tensor) {
     return input_tensor.has_value() ? is_dram(input_tensor.value()) : true;
@@ -40,6 +42,8 @@ inline uint32_t pack_two_bfloat16_into_uint32(std::pair two_
     // second -> upper 16
     return (uint32_t)two_bfloats.first | ((uint32_t)two_bfloats.second << 16);
 }
+}
+}

 operation::ProgramWithCallbacks layernorm_pre_allgather_multi_core(
     const Tensor &a,
@@ -47,6 +51,7 @@ operation::ProgramWithCallbacks layernorm_pre_allgather_multi_core(
     LayerNormDistributedType norm_type,
     DeviceComputeKernelConfig compute_kernel_config
 ) {
+    using namespace CMAKE_UNIQUE_NAMESPACE;
     const bool is_rmsnorm = norm_type == LayerNormDistributedType::RMSNORM;
     const auto shape = a.get_legacy_shape();
     const uint32_t W = shape[-1], H = shape[-2];
diff --git a/ttnn/cpp/ttnn/operations/normalization/softmax/device/multi_core/softmax_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/softmax/device/multi_core/softmax_op_multi_core.cpp
index a0b49812a56..69cce8381b9 100644
--- a/ttnn/cpp/ttnn/operations/normalization/softmax/device/multi_core/softmax_op_multi_core.cpp
+++ b/ttnn/cpp/ttnn/operations/normalization/softmax/device/multi_core/softmax_op_multi_core.cpp
@@ -20,11 +20,15 @@ using namespace tt::constants;

 namespace ttnn::operations::normalization {

+namespace {
+namespace CMAKE_UNIQUE_NAMESPACE {
 inline bool is_dram(const Tensor& input_tensor) { return input_tensor.memory_config().buffer_type == BufferType::DRAM; }
 inline bool is_dram(const std::optional input_tensor) {
     return input_tensor.has_value() ? is_dram(input_tensor.value()) : true;
 }
 inline bool is_dram(const Buffer* b) { return b->buffer_type() == BufferType::DRAM; }
+}
+}

 // implementation of softmax with optional scale/mask (see the header for input_tensor more detailed description)
 operation::ProgramWithCallbacks scale_mask_softmax_multi_core(
@@ -36,7 +40,7 @@ operation::ProgramWithCallbacks scale_mask_softmax_multi_core(
     DeviceComputeKernelConfig compute_kernel_config,
     bool numeric_stable
 ) {
-
+    using namespace CMAKE_UNIQUE_NAMESPACE;
     const auto shape = input_tensor.get_legacy_shape();
     uint32_t W = shape[-1], H = (input_tensor.volume() / (shape[0] * shape[-1])), NC = shape[0];
     uint32_t HW = H*W;
diff --git a/ttnn/cpp/ttnn/operations/sliding_window/halo/device/halo_device_operation.cpp b/ttnn/cpp/ttnn/operations/sliding_window/halo/device/halo_device_operation.cpp
index e1b38036ca1..dfcc7372f8e 100644
--- a/ttnn/cpp/ttnn/operations/sliding_window/halo/device/halo_device_operation.cpp
+++ b/ttnn/cpp/ttnn/operations/sliding_window/halo/device/halo_device_operation.cpp
@@ -37,7 +37,7 @@ std::vector HaloDeviceOperation::compute_output_shape
     // output_shape[1] remains same
     // output_shape[2] changes
     // output_shape[3] remains same
-    output_shape[2] = (uint32_t) ceil((float) total_nsticks / nbatch);
+    output_shape[2] = (uint32_t) std::ceil((float) total_nsticks / nbatch);

     log_debug(tt::LogOp, "output_shape: [{} {} {} {}]", output_shape[0], output_shape[1], output_shape[2], output_shape[3]);
     log_debug(tt::LogOp, "max_out_nsticks_per_core: {}", max_out_nsticks_per_core_);
diff --git a/ttnn/cpp/ttnn/operations/uniform/device/uniform_device_operation.hpp b/ttnn/cpp/ttnn/operations/uniform/device/uniform_device_operation.hpp
index 372c9dcc6ae..72bf4f0b7b3 100644
--- a/ttnn/cpp/ttnn/operations/uniform/device/uniform_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/uniform/device/uniform_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
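Note also that every `using namespace CMAKE_UNIQUE_NAMESPACE;` in this patch is added inside a function body rather than at file scope. That is deliberate: a file-scope using-directive would still be active for every source appended after this one in the generated unity file, so the first file's `is_dram` helpers would remain visible while the next file's code (with its own same-named helpers) is compiled, producing ambiguous calls. Scoped to the function, the directive ends at the closing brace. A minimal sketch, with hypothetical names:

#include <cstdint>

namespace {
namespace CMAKE_UNIQUE_NAMESPACE {  // expanded per file by CMake
inline bool is_dram(std::uint32_t buffer_type) { return buffer_type == 1; }
}
}

// File-scope alternative (commented out): this would remain in effect for
// every .cpp concatenated after this one in the unity source, so a later
// file defining its own is_dram() would suddenly see two candidates.
// using namespace CMAKE_UNIQUE_NAMESPACE;

void scale_mask_softmax_sketch() {           // hypothetical function
    using namespace CMAKE_UNIQUE_NAMESPACE;  // visibility ends at the brace
    bool b = is_dram(1);
    (void)b;
}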
diff --git a/ttnn/cpp/ttnn/tensor/types.cpp b/ttnn/cpp/ttnn/tensor/types.cpp
index a476bf6a608..69856466bb6 100644
--- a/ttnn/cpp/ttnn/tensor/types.cpp
+++ b/ttnn/cpp/ttnn/tensor/types.cpp
@@ -8,7 +8,7 @@

 namespace ttnn {

-SimpleShape get_physical_shape(const SimpleShape& logical_shape, DataType data_type, Layout layout, const std::optional& tile) {
+SimpleShape get_physical_shape(const SimpleShape& logical_shape, DataType data_type, Layout layout, const std::optional& tile) {
     SimpleShape physical_shape = logical_shape;
     auto rank = physical_shape.rank();
     if (layout == Layout::TILE) {
diff --git a/ttnn/cpp/ttnn/types.hpp b/ttnn/cpp/ttnn/types.hpp
index 72e212d1c21..ec1e760a06a 100644
--- a/ttnn/cpp/ttnn/types.hpp
+++ b/ttnn/cpp/ttnn/types.hpp
@@ -19,15 +19,6 @@ using Device = tt::tt_metal::Device;
 constexpr auto TILE_SIZE = 32;

 using tt::tt_metal::DataType;
-static constexpr auto uint8 = DataType::UINT8;
-static constexpr auto uint16 = DataType::UINT16;
-static constexpr auto int32 = DataType::INT32;
-static constexpr auto uint32 = DataType::UINT32;
-static constexpr auto float32 = DataType::FLOAT32;
-static constexpr auto bfloat16 = DataType::BFLOAT16;
-static constexpr auto bfloat8_b = DataType::BFLOAT8_B;
-static constexpr auto bfloat4_b = DataType::BFLOAT4_B;
-
 using tt::tt_metal::BufferType;
 using tt::tt_metal::MemoryConfig;
 using tt::tt_metal::TensorMemoryLayout;
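The removed constants gave every TTNN source short names like `ttnn::bfloat16` and `ttnn::uint32` that coexist badly with types of the same name elsewhere in the tree (the duplicate `bfloat16` test header is deleted in this same patch). In separately compiled files this rarely bit anyone, but once sources share a translation unit an unqualified `bfloat16` can find both the constant and a class. A minimal sketch, with illustrative namespaces standing in for the real ones:

#include <cstdint>

enum class DataType { BFLOAT16, UINT32 };

// A DataType *constant* named bfloat16, as ttnn/types.hpp used to define:
namespace ttnn_sketch { inline constexpr auto bfloat16 = DataType::BFLOAT16; }

// A bfloat16 *type*, as the test utilities define:
namespace df_sketch { struct bfloat16 { std::uint16_t bits; }; }

using namespace ttnn_sketch;
using namespace df_sketch;

// bfloat16 value;  // error: reference to 'bfloat16' is ambiguous --
//                  // the constant and the struct are both candidates.
//                  // Dropping the constants removes the clash.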