From 948fafb081ace2fd966295548128616b409903cf Mon Sep 17 00:00:00 2001 From: Stanislav Minakov Date: Wed, 30 Oct 2024 16:09:41 -0700 Subject: [PATCH] #0: Faster builds by enabling Unity build for TTNN and tests (#14461) * #0: Unity builds * #0: Unity build for tests * #0: More unity builds * #0: Cleanup * #0: Rename ANON_NAMESPACE to CMAKE_UNIQUE_NAMESPACE * #0: Allow to disable unity builds * #0: Disable unity builds if export commands is on * #0: Raise min cmake version to 3.20 * #0: CMake fixes * #0: Review fixes - cmake cleanup * #0: Disable unity builds on older cmake * #0: Build script fixup * #0: Build fix --- CMakeLists.txt | 2 + build_metal.sh | 17 +++++- cmake/helper_functions.cmake | 1 + cmake/project_options.cmake | 12 ++++ cmake/unity.cmake | 12 ++++ tests/tt_metal/test_utils/df/bfloat16.hpp | 47 --------------- tests/tt_metal/test_utils/df/df.hpp | 1 - .../tt_metal/unit_tests/CMakeLists.txt | 1 + .../unit_tests/buffer/test_banked.cpp | 4 +- .../unit_tests/common/core_coord_fixture.hpp | 2 + .../unit_tests/compute/test_broadcast.cpp | 34 +++++------ .../unit_tests/compute/test_cumsum.cpp | 20 +++---- .../unit_tests/compute/test_sfpu_compute.cpp | 58 +++++++++---------- .../test_single_core_binary_compute.cpp | 38 ++++++------ .../test_single_core_matmul_compute.cpp | 48 +++++++-------- .../tt_metal/unit_tests/dram/direct.cpp | 8 +-- .../multichip/basic_eth_kernels.cpp | 12 ++++ .../multichip/buffer_movement_kernels.cpp | 4 +- .../multichip/erisc_app_direct_send.cpp | 17 ++++-- .../multichip/ring_gather_kernels.cpp | 4 +- .../tt_metal/unit_tests_common/CMakeLists.txt | 3 +- .../common/common_fixture.hpp | 2 + .../common/dprint_fixture.hpp | 2 + .../common/watcher_fixture.hpp | 2 + .../compute/matmul/matmul_utils.hpp | 3 + .../dprint/test_eth_cores.cpp | 8 ++- .../dprint/test_mute_device.cpp | 6 +- .../dprint/test_mute_print_server.cpp | 6 +- .../dprint/test_print_all_harts.cpp | 6 +- .../dprint/test_print_hanging.cpp | 6 +- .../dprint/test_print_tensix_dest.cpp | 2 +- .../dprint/test_raise_wait.cpp | 6 +- .../unit_tests_common/watcher/test_pause.cpp | 6 +- .../watcher/test_waypoint.cpp | 6 +- .../unit_tests_fast_dispatch/CMakeLists.txt | 1 + .../command_queue_test_utils.hpp | 2 + .../command_queue/test_EnqueueTrace.cpp | 6 -- .../common/command_queue_fixture.hpp | 2 + .../compute/sfpu/sfpu_compute.cpp | 54 ++++++++--------- .../multichip/test_eth_EnqueueProgram.cpp | 10 +++- .../test_eth_ring_gather_EnqueueProgram.cpp | 14 ++++- .../CMakeLists.txt | 1 + .../command_queue/test_EnqueueTrace.cpp | 6 -- .../common/command_queue_fixture.hpp | 2 + .../common/command_queue_test_utils.hpp | 2 +- tests/ttnn/unit_tests/gtests/CMakeLists.txt | 1 + tests/ttnn/unit_tests/gtests/test_add.cpp | 4 +- .../ttnn/unit_tests/gtests/test_graph_add.cpp | 4 +- .../unit_tests/gtests/test_multi_device.cpp | 4 +- .../ttnn_multi_command_queue_fixture.hpp | 2 + .../unit_tests/gtests/ttnn_test_fixtures.hpp | 5 +- ttnn/CMakeLists.txt | 1 + ttnn/cpp/pybind11/operations/creation.hpp | 2 +- .../device/bernoulli_device_operation.hpp | 2 + ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp | 4 +- .../device/reduce_scatter_op.cpp | 8 ++- .../conv2d_op_sharded_program_factory.cpp | 8 +++ ...onv2d_op_width_sharded_program_factory.cpp | 27 ++++----- ttnn/cpp/ttnn/operations/creation.hpp | 16 ++--- .../clone/device/clone_device_operation.hpp | 2 + .../concat/device/concat_program_factory.hpp | 2 + .../untilize/device/untilize_op.cpp | 8 +-- .../device/untilize_program_factory.cpp | 2 +- 
.../device/untilize_with_halo_v2_op.cpp | 2 +- ...t_and_width_multi_core_program_factory.cpp | 5 ++ ...cast_height_multi_core_program_factory.cpp | 5 ++ ...core_sharded_optimized_program_factory.cpp | 5 ++ ...dcast_width_multi_core_program_factory.cpp | 5 ++ .../device/unary_sharded_program_factory.cpp | 4 +- .../device/embedding_program_factory.hpp | 10 ++-- .../embedding_backward_program_factory.cpp | 2 +- .../ccl/all_reduce/device/all_reduce_op.cpp | 7 ++- .../rotary_embedding_program_factory.cpp | 4 +- ...otary_embedding_llama_device_operation.cpp | 3 +- ...rotary_embedding_llama_program_factory.cpp | 3 +- .../full/device/full_device_operation.hpp | 2 + .../device/full_like_device_operation.hpp | 2 + .../device/index_fill_device_operation.hpp | 3 + .../operations/matmul/device/matmul_op.hpp | 2 +- ttnn/cpp/ttnn/operations/matmul/matmul.cpp | 4 +- ttnn/cpp/ttnn/operations/matmul/matmul.hpp | 4 +- .../device/moreh_adam_device_operation.hpp | 2 + .../device/moreh_arange_device_operation.hpp | 2 + .../device/moreh_cumsum_device_operation.hpp | 2 + .../device/moreh_dot_device_operation.hpp | 2 + .../moreh_dot_backward_device_operation.hpp | 2 + .../device/moreh_getitem_device_operation.hpp | 2 + .../device/moreh_getitem_rm_factory.cpp | 6 ++ .../device/moreh_getitem_tilized_factory.cpp | 6 ++ .../moreh_group_norm_device_operation.hpp | 2 + ...kward_gamma_beta_grad_device_operation.hpp | 2 + ...m_backward_input_grad_device_operation.hpp | 2 + .../moreh_layer_norm_device_operation.hpp | 2 + ...kward_gamma_beta_grad_device_operation.hpp | 2 + ...m_backward_input_grad_device_operation.hpp | 2 + .../device/moreh_matmul_device_operation.hpp | 2 + .../device/moreh_mean_device_operation.hpp | 2 + .../moreh_mean_backward_device_operation.hpp | 2 + .../device/moreh_norm_device_operation.hpp | 2 + .../moreh_norm_backward_device_operation.hpp | 2 + .../device/moreh_sgd_device_operation.hpp | 2 + .../device/moreh_softmax_device_operation.hpp | 2 + ...oreh_softmax_backward_device_operation.hpp | 2 + .../device/moreh_sum_device_operation.hpp | 2 + .../moreh_sum_backward_device_operation.hpp | 2 + .../multi_core/groupnorm_op_multi_core.cpp | 9 ++- .../multi_core/layernorm_op_multi_core.cpp | 14 +++-- ...ayernorm_post_all_gather_op_multi_core.cpp | 5 ++ ...layernorm_pre_all_gather_op_multi_core.cpp | 5 ++ .../multi_core/softmax_op_multi_core.cpp | 6 +- .../halo/device/halo_device_operation.cpp | 2 +- .../device/uniform_device_operation.hpp | 2 + ttnn/cpp/ttnn/tensor/types.cpp | 2 +- ttnn/cpp/ttnn/types.hpp | 9 --- 114 files changed, 495 insertions(+), 295 deletions(-) create mode 100644 cmake/unity.cmake delete mode 100644 tests/tt_metal/test_utils/df/bfloat16.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index a0e9d25c10a..77be383575f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,6 +43,7 @@ endif() list(PREPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) include(project_options) +include(unity) set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -121,6 +122,7 @@ message(STATUS "Build Python bindings: ${WITH_PYTHON_BINDINGS}") message(STATUS "Build Programming Examples: ${BUILD_PROGRAMMING_EXAMPLES}") message(STATUS "Build TT METAL Tests: ${TT_METAL_BUILD_TESTS}") message(STATUS "Build TTNN Tests: ${TTNN_BUILD_TESTS}") +message(STATUS "Build with Unity builds: ${TT_UNITY_BUILDS}") ############################################################################################################################ if(ENABLE_BUILD_TIME_TRACE) diff --git a/build_metal.sh 
b/build_metal.sh index 2b09410eb20..a3653d8c1ed 100755 --- a/build_metal.sh +++ b/build_metal.sh @@ -26,6 +26,7 @@ show_help() { echo " --debug Set the build type as Debug." echo " --clean Remove build workspaces." echo " --build-static-libs Build tt_metal (not ttnn) as a static lib (BUILD_SHARED_LIBS=OFF)" + echo " --disable-unity-builds Disable Unity builds" } clean() { @@ -49,11 +50,12 @@ build_metal_tests="OFF" build_umd_tests="OFF" build_programming_examples="OFF" build_static_libs="OFF" +unity_builds="ON" declare -a cmake_args OPTIONS=h,e,c,t,a,m,s,u,b:,p -LONGOPTIONS=help,export-compile-commands,enable-ccache,enable-time-trace,enable-asan,enable-msan,enable-tsan,enable-ubsan,build-type:,enable-profiler,install-prefix:,build-tests,build-ttnn-tests,build-metal-tests,build-umd-tests,build-programming-examples,build-static-libs,release,development,debug,clean +LONGOPTIONS=help,export-compile-commands,enable-ccache,enable-time-trace,enable-asan,enable-msan,enable-tsan,enable-ubsan,build-type:,enable-profiler,install-prefix:,build-tests,build-ttnn-tests,build-metal-tests,build-umd-tests,build-programming-examples,build-static-libs,disable-unity-builds,release,development,debug,clean # Parse the options PARSED=$(getopt --options=$OPTIONS --longoptions=$LONGOPTIONS --name "$0" -- "$@") @@ -70,7 +72,7 @@ while true; do -h|--help) show_help;exit 0;; -e|--export-compile-commands) - export_compile_commands="ON";; + export_compile_commands="ON";unity_builds="OFF";; -c|--enable-ccache) enable_ccache="ON";; -t|--enable-time-trace) @@ -101,6 +103,8 @@ while true; do build_programming_examples="ON";; --build-static-libs) build_static_libs="ON";; + --disable-unity-builds) + unity_builds="OFF";; --release) build_type="Release";; --development) @@ -156,6 +160,7 @@ echo "INFO: Enable UndefinedBehaviorSanitizer: $enable_ubsan" echo "INFO: Build directory: $build_dir" echo "INFO: Install Prefix: $cmake_install_prefix" echo "INFO: Build tests: $build_tests" +echo "INFO: Enable Unity builds: $unity_builds" # Prepare cmake arguments cmake_args+=("-B" "$build_dir") @@ -194,6 +199,8 @@ fi if [ "$export_compile_commands" = "ON" ]; then cmake_args+=("-DCMAKE_EXPORT_COMPILE_COMMANDS=ON") +else + cmake_args+=("-DCMAKE_EXPORT_COMPILE_COMMANDS=OFF") fi if [ "$build_tests" = "ON" ]; then @@ -222,6 +229,12 @@ if [ "$build_static_libs" = "ON" ]; then cmake_args+=("-DBUILD_SHARED_LIBS=OFF") fi +if [ "$unity_builds" = "ON" ]; then + cmake_args+=("-DTT_UNITY_BUILDS=ON") +else + cmake_args+=("-DTT_UNITY_BUILDS=OFF") +fi + # Create and link the build directory mkdir -p $build_dir ln -nsf $build_dir build diff --git a/cmake/helper_functions.cmake b/cmake/helper_functions.cmake index 7214addd901..ac76caea8a7 100644 --- a/cmake/helper_functions.cmake +++ b/cmake/helper_functions.cmake @@ -12,6 +12,7 @@ function(CREATE_EAGER_TEST_EXE TESTLIST) set(TEST_TARGET ${TEST_NAME}) endif() add_executable(${TEST_TARGET} ${TEST_SRC_PATH}) + TT_ENABLE_UNITY_BUILD(${TEST_TARGET}) target_link_libraries( ${TEST_TARGET} diff --git a/cmake/project_options.cmake b/cmake/project_options.cmake index 90851878df4..63a5922f815 100644 --- a/cmake/project_options.cmake +++ b/cmake/project_options.cmake @@ -16,4 +16,16 @@ option(BUILD_PROGRAMMING_EXAMPLES "Enables build of tt_metal programming example option(TT_METAL_BUILD_TESTS "Enables build of tt_metal tests" OFF) option(TTNN_BUILD_TESTS "Enables build of ttnn tests" OFF) option(ENABLE_CCACHE "Build with compiler cache" FALSE) +option(TT_UNITY_BUILDS "Build with Unity builds" ON) 
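# (Editor's sketch, not part of the patch: how TT_UNITY_BUILDS takes effect.
# CMake's built-in unity-build machinery compiles a target's sources as
# batched "unity" translation units. UNITY_BUILD_UNIQUE_ID, added in CMake
# 3.20 (hence the version guard below), defines the named macro to a value
# that is unique per original source file, so file-local symbols can be kept
# from colliding once files are merged into one TU:
#
#   set_target_properties(my_target PROPERTIES   # my_target is a placeholder
#       UNITY_BUILD ON
#       UNITY_BUILD_UNIQUE_ID "CMAKE_UNIQUE_NAMESPACE")
#
# C++ sources then wrap file-local helpers as
#   namespace { namespace CMAKE_UNIQUE_NAMESPACE { /* helpers */ } }
# which is the pattern the test sources in this patch adopt. The real wiring
# is the TT_ENABLE_UNITY_BUILD helper in cmake/unity.cmake later in this
# patch; from the build script the switch surfaces as
# ./build_metal.sh --disable-unity-builds, or -DTT_UNITY_BUILDS=OFF when
# invoking CMake directly.)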
########################################################################################### + +if(TT_UNITY_BUILDS) + if(CMAKE_EXPORT_COMPILE_COMMANDS) + message(STATUS "Disabling Unity builds because CMAKE_EXPORT_COMPILE_COMMANDS is ON") + set(TT_UNITY_BUILDS OFF) + endif() + if(CMAKE_VERSION VERSION_LESS "3.20.0") + message(STATUS "CMake 3.20 or newer is required for Unity builds, disabling") + set(TT_UNITY_BUILDS OFF) + endif() +endif() diff --git a/cmake/unity.cmake b/cmake/unity.cmake new file mode 100644 index 00000000000..644113e26d7 --- /dev/null +++ b/cmake/unity.cmake @@ -0,0 +1,12 @@ +function(TT_ENABLE_UNITY_BUILD TARGET) + if(TT_UNITY_BUILDS) + set_target_properties( + ${TARGET} + PROPERTIES + UNITY_BUILD + ON + UNITY_BUILD_UNIQUE_ID + "CMAKE_UNIQUE_NAMESPACE" + ) + endif() +endfunction() diff --git a/tests/tt_metal/test_utils/df/bfloat16.hpp b/tests/tt_metal/test_utils/df/bfloat16.hpp deleted file mode 100644 index 339e43ce89b..00000000000 --- a/tests/tt_metal/test_utils/df/bfloat16.hpp +++ /dev/null @@ -1,47 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include - -namespace tt::test_utils::df { - -//! Custom type is supported as long as the custom type supports the following custom functions -//! static SIZEOF - indicates byte size of custom type -//! to_float() - get float value from custom type -//! to_packed() - get packed (into an integral type that is of the bitwidth specified by SIZEOF) -//! Constructor(float in) - constructor with a float as the initializer - -class bfloat16 { - private: - uint16_t uint16_data; - - public: - static constexpr size_t SIZEOF = 2; - - constexpr bfloat16() noexcept = default; - - // create from float: no rounding, just truncate - constexpr bfloat16(float float_num) noexcept : bfloat16((std::bit_cast(float_num)) >> 16) { - static_assert(sizeof(float) == sizeof(uint32_t), "Can only support 32bit fp"); - } - - // store lower 16 as 16-bit uint - constexpr bfloat16(uint32_t uint32_data) noexcept : uint16_data(static_cast(uint32_data)) {} - - constexpr float to_float() const noexcept { return std::bit_cast(static_cast(uint16_data) << 16); } - - constexpr uint16_t to_packed() const noexcept { return uint16_data; } - - constexpr bool operator==(const bfloat16& rhs) const noexcept = default; -}; - -inline std::ostream& operator<<(std::ostream& os, const bfloat16& val) { - os << val.to_packed(); - return os; -} - -} // namespace tt::test_utils::df diff --git a/tests/tt_metal/test_utils/df/df.hpp b/tests/tt_metal/test_utils/df/df.hpp index 89afc6bc818..883264f5935 100644 --- a/tests/tt_metal/test_utils/df/df.hpp +++ b/tests/tt_metal/test_utils/df/df.hpp @@ -3,5 +3,4 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "tt_metal/test_utils/df/bfloat16.hpp" #include "tt_metal/test_utils/df/float32.hpp" diff --git a/tests/tt_metal/tt_metal/unit_tests/CMakeLists.txt b/tests/tt_metal/tt_metal/unit_tests/CMakeLists.txt index bee628ad488..0e49d557eae 100644 --- a/tests/tt_metal/tt_metal/unit_tests/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/unit_tests/CMakeLists.txt @@ -48,6 +48,7 @@ add_executable( ${UNIT_TESTS_SRC} $ ) +TT_ENABLE_UNITY_BUILD(unit_tests) add_executable(unit_tests_galaxy ${CMAKE_CURRENT_SOURCE_DIR}/multichip/galaxy_cluster_api.cpp) target_link_libraries( diff --git a/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp b/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp index 62418427c21..9f1d68e7440 100644 --- 
a/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp @@ -227,8 +227,8 @@ bool reader_datacopy_writer(Device* device, const BankedConfig& cfg) { //////////////////////////////////////////////////////////////////////////// // Stimulus Generation //////////////////////////////////////////////////////////////////////////// - std::vector input_packed = tt::test_utils::generate_packed_uniform_random_vector( - -1.0f, 1.0f, cfg.size_bytes / tt::test_utils::df::bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); + std::vector input_packed = tt::test_utils::generate_packed_uniform_random_vector( + -1.0f, 1.0f, cfg.size_bytes / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); //////////////////////////////////////////////////////////////////////////// // Compile and Execute Application diff --git a/tests/tt_metal/tt_metal/unit_tests/common/core_coord_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests/common/core_coord_fixture.hpp index 90963f22e23..e483088a7f7 100644 --- a/tests/tt_metal/tt_metal/unit_tests/common/core_coord_fixture.hpp +++ b/tests/tt_metal/tt_metal/unit_tests/common/core_coord_fixture.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include "gtest/gtest.h" #include "tt_metal/host_api.hpp" #include "tt_metal/test_utils/env_vars.hpp" diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp index d12d89bd88f..43963dc422e 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp @@ -69,7 +69,7 @@ struct BroadcastConfig { MathFidelity math_fidelity = MathFidelity::HiFi4; }; -void mask_src_b_for_broadcast(std::vector& tile, const std::vector &shape, BroadcastDim dim) { +void mask_src_b_for_broadcast(std::vector& tile, const std::vector &shape, BroadcastDim dim) { int num_rows = shape.at(0); int num_cols = shape.at(1); @@ -83,14 +83,14 @@ } } -std::vector gold_broadcast(std::vector& src_a, std::vector& src_b, const std::vector &shape, EltwiseOp op, BroadcastDim dim, MathFidelity math_fidelity = MathFidelity::HiFi4) { +std::vector gold_broadcast(std::vector& src_a, std::vector& src_b, const std::vector &shape, EltwiseOp op, BroadcastDim dim, MathFidelity math_fidelity = MathFidelity::HiFi4) { int num_rows = shape.at(0); int num_cols = shape.at(1); uint16_t srca_fid_mask = 0xFFFF; uint16_t srcb_fid_mask = 0xFFFF; - std::vector golden(num_cols * num_rows); + std::vector golden(num_cols * num_rows); auto arch = get_arch_from_string(get_umd_arch_name()); switch (math_fidelity) { @@ -103,7 +103,7 @@ std::vector gold_broadcast(std::vector gold_broadcast(std::vector(src_a[i * num_cols + j].to_packed() & srca_fid_mask)).to_float() * - tt::test_utils::df::bfloat16(std::bit_cast(broadcast_value.to_packed() & srcb_fid_mask)).to_float(); + bfloat16(std::bit_cast(src_a[i * num_cols + j].to_packed() & srca_fid_mask)).to_float() * + bfloat16(std::bit_cast(broadcast_value.to_packed() & srcb_fid_mask)).to_float(); break; } default: { TT_THROW("Unsupported EltwiseOp={}", op); break; } } @@ -142,7 +142,7 @@ void run_single_core_broadcast(tt_metal::Device* device, const BroadcastConfig& constexpr uint32_t tile_width = 32; constexpr uint32_t tile_height = 32; - constexpr uint32_t single_tile_size = tile_width * tile_height *
tt::test_utils::df::bfloat16::SIZEOF; + constexpr uint32_t single_tile_size = tile_width * tile_height * bfloat16::SIZEOF; tt_metal::InterleavedBufferConfig dram_config{ .device=device, @@ -244,25 +244,25 @@ void run_single_core_broadcast(tt_metal::Device* device, const BroadcastConfig& (uint32_t)1, }); - std::vector input0 = generate_uniform_random_vector( + std::vector input0 = generate_uniform_random_vector( -1.0f, 1.0f, - single_tile_size / tt::test_utils::df::bfloat16::SIZEOF, + single_tile_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); - std::vector input1 = generate_uniform_random_vector( + std::vector input1 = generate_uniform_random_vector( -1.0f, 1.0f, - single_tile_size / tt::test_utils::df::bfloat16::SIZEOF, + single_tile_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); mask_src_b_for_broadcast(input1, {tile_width, tile_height}, test_config.broadcast_dim); - std::vector golden = gold_broadcast(input0, input1, {tile_width, tile_height}, test_config.eltwise_op, test_config.broadcast_dim, test_config.math_fidelity); + std::vector golden = gold_broadcast(input0, input1, {tile_width, tile_height}, test_config.eltwise_op, test_config.broadcast_dim, test_config.math_fidelity); - auto packed_input0 = pack_vector(input0); - auto packed_input1 = pack_vector(input1); - auto packed_golden = pack_vector(golden); + auto packed_input0 = pack_vector(input0); + auto packed_input1 = pack_vector(input1); + auto packed_golden = pack_vector(golden); unit_tests::compute::GoldenConfig config = { .num_tiles_r_dim = tile_width/32, .num_tiles_c_dim = tile_height/32 @@ -279,10 +279,10 @@ void run_single_core_broadcast(tt_metal::Device* device, const BroadcastConfig& tt_metal::detail::ReadFromBuffer(dst_dram_buffer, dest_buffer_data); auto dest_buffer_data_untilized = unit_tests::compute::gold_standard_untilize(dest_buffer_data, config); - bool result = is_close_packed_vectors( + bool result = is_close_packed_vectors( dest_buffer_data_untilized, packed_golden, - [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { + [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.0155); }); ASSERT_TRUE(result); diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_cumsum.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_cumsum.cpp index 7fdfa5d75a6..66119879e6c 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_cumsum.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_cumsum.cpp @@ -24,12 +24,12 @@ struct CumsumConfig { bool rowwise; }; -std::vector gold_cumsum(std::vector& src, const std::vector &shape, bool rowwise) { +std::vector gold_cumsum(std::vector& src, const std::vector &shape, bool rowwise) { int N = shape.at(0); int W = shape.at(1); int H = shape.at(2); - std::vector golden(N * W * H); + std::vector golden(N * W * H); int dim_a = rowwise ? H : W; int dim_b = rowwise ? 
W : H; @@ -57,7 +57,7 @@ void run_single_core_cumsum(tt_metal::Device* device, const CumsumConfig& test_c constexpr uint32_t tile_width = 32; constexpr uint32_t tile_height = 32; - constexpr uint32_t single_tile_size = tile_width * tile_height * tt::test_utils::df::bfloat16::SIZEOF; + constexpr uint32_t single_tile_size = tile_width * tile_height * bfloat16::SIZEOF; uint32_t W = test_config.Wt * tile_width; uint32_t H = test_config.Ht * tile_height; @@ -147,16 +147,16 @@ void run_single_core_cumsum(tt_metal::Device* device, const CumsumConfig& test_c (uint32_t)test_config.Ht * test_config.Wt // Used for transposing kernel }); - std::vector input = generate_uniform_random_vector( + std::vector input = generate_uniform_random_vector( -1.0f, 1.0f, - dram_buffer_size / tt::test_utils::df::bfloat16::SIZEOF, + dram_buffer_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); - std::vector golden = gold_cumsum(input, {test_config.N, W, H}, test_config.rowwise); - auto golden_packed = pack_vector(golden); + std::vector golden = gold_cumsum(input, {test_config.N, W, H}, test_config.rowwise); + auto golden_packed = pack_vector(golden); - auto input_packed = pack_vector(input); + auto input_packed = pack_vector(input); auto input_packed_tilized = unit_tests::compute::gold_standard_tilize(input_packed, {test_config.N * test_config.Ht, test_config.Wt}); tt_metal::detail::WriteToBuffer(src_dram_buffer, input_packed_tilized); @@ -169,10 +169,10 @@ void run_single_core_cumsum(tt_metal::Device* device, const CumsumConfig& test_c log_info(tt::LogTest, "Running test for N = {}, Wt = {}, Ht = {}", test_config.N, test_config.Wt, test_config.Ht); - bool result = is_close_packed_vectors( + bool result = is_close_packed_vectors( output_packed, golden_packed, - [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { + [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.01f); }); ASSERT_TRUE(result); diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp index 0a7822c6fbf..35ffb316d01 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp @@ -39,61 +39,61 @@ const map> sfpu_op_to_op_name = { {"tanh", {{"SFPU_OP_CHAIN_0", "tanh_tile_init(); tanh_tile(0);"}}}, }; -tt::test_utils::df::bfloat16 sfpu_function(const string& op_name, const tt::test_utils::df::bfloat16& input) { +bfloat16 sfpu_function(const string& op_name, const bfloat16& input) { if (op_name == "relu") { - return tt::test_utils::df::bfloat16(fmaxf(input.to_float(), 0.0f)); + return bfloat16(fmaxf(input.to_float(), 0.0f)); } else if (op_name == "exponential") { - return tt::test_utils::df::bfloat16(std::exp(input.to_float())); + return bfloat16(std::exp(input.to_float())); } else if (op_name == "reciprocal") { - return tt::test_utils::df::bfloat16(1 / input.to_float()); + return bfloat16(1 / input.to_float()); } else if (op_name == "gelu") { static constexpr float alpha = M_2_SQRTPI * M_SQRT1_2; auto x = input.to_float(); auto x3 = x * x * x; float result = x * 0.5 * (1.0 + tanhf(alpha * (x + 0.044715 * x3))); - return tt::test_utils::df::bfloat16(result); + return bfloat16(result); } else if (op_name == "sqrt") { - return tt::test_utils::df::bfloat16(sqrtf(input.to_float())); + return bfloat16(sqrtf(input.to_float())); } else if (op_name == "sigmoid") { auto x = input.to_float(); float result = 1 
/ (1 + std::exp(-x)); - return tt::test_utils::df::bfloat16(result); + return bfloat16(result); } else if (op_name == "log") { - return tt::test_utils::df::bfloat16(logf(input.to_float())); + return bfloat16(logf(input.to_float())); } else if (op_name == "tanh") { - return tt::test_utils::df::bfloat16(std::tanh(input.to_float())); + return bfloat16(std::tanh(input.to_float())); } else { TT_THROW("Unsupported op_name in test"); - return tt::test_utils::df::bfloat16(0.0f); + return bfloat16(0.0f); } } vector generate_packed_sfpu_input(const unsigned int numel, const string& op_name, const int seed) { if ((op_name == "sqrt") or (op_name == "log")) { - return generate_packed_uniform_random_vector(0.0001f, 4.0f, numel, seed); + return generate_packed_uniform_random_vector(0.0001f, 4.0f, numel, seed); } else if ((op_name == "exponential") or (op_name == "gelu") or (op_name == "reciprocal")) { - auto possible_values = vector({-1.0f, -0.5f, 0.5f, 1.0f}); - return generate_packed_random_vector_from_vector(possible_values, numel, seed); + auto possible_values = vector({-1.0f, -0.5f, 0.5f, 1.0f}); + return generate_packed_random_vector_from_vector(possible_values, numel, seed); } else { - return generate_packed_uniform_random_vector(-1.0f, 1.0f, numel, seed); + return generate_packed_uniform_random_vector(-1.0f, 1.0f, numel, seed); } } bool is_close_packed_sfpu_output(const std::vector& vec_a, const std::vector& vec_b, const string& op_name) { if (op_name == "tanh") { - return is_close_packed_vectors( - vec_a, vec_b, [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.175f, 0.1f); }); + return is_close_packed_vectors( + vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.175f, 0.1f); }); } else if ((op_name == "gelu") or (op_name == "relu")) { - return is_close_packed_vectors( - vec_a, vec_b, [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.15f); }); + return is_close_packed_vectors( + vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.15f); }); } else if ((op_name == "exponential")) { - return is_close_packed_vectors( - vec_a, vec_b, [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.1f, 0.1f); }); + return is_close_packed_vectors( + vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.1f, 0.1f); }); } else if ((op_name == "log")) { - return is_close_packed_vectors( - vec_a, vec_b, [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.03f, 0.02f); }); + return is_close_packed_vectors( + vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.03f, 0.02f); }); } else { - return is_close_packed_vectors( - vec_a, vec_b, [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.06f, 0.006f); }); + return is_close_packed_vectors( + vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.06f, 0.006f); }); } } @@ -140,15 +140,15 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c // Input std::vector packed_input = sfpu_util::generate_packed_sfpu_input( - byte_size / tt::test_utils::df::bfloat16::SIZEOF, test_config.sfpu_op, std::chrono::system_clock::now().time_since_epoch().count()); + byte_size / bfloat16::SIZEOF, test_config.sfpu_op, 
std::chrono::system_clock::now().time_since_epoch().count()); // Golden output - auto input = unpack_vector(packed_input); - std::vector golden(input.size()); - std::transform(input.begin(), input.end(), golden.begin(), [&](const tt::test_utils::df::bfloat16& val) { + auto input = unpack_vector(packed_input); + std::vector golden(input.size()); + std::transform(input.begin(), input.end(), golden.begin(), [&](const bfloat16& val) { return sfpu_util::sfpu_function(test_config.sfpu_op, val); }); - std::vector packed_golden = pack_vector(golden); + std::vector packed_golden = pack_vector(golden); // Same runtime args for every core vector reader_rt_args = { diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_binary_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_binary_compute.cpp index 19baa412647..3be28d9843e 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_binary_compute.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_binary_compute.cpp @@ -157,27 +157,27 @@ bool single_core_binary(tt_metal::Device* device, const SingleCoreBinaryConfig& //////////////////////////////////////////////////////////////////////////// // Stimulus Generation //////////////////////////////////////////////////////////////////////////// - std::vector packed_input0 = generate_packed_uniform_random_vector( + std::vector packed_input0 = generate_packed_uniform_random_vector( -1.0f, 1.0f, - byte_size / tt::test_utils::df::bfloat16::SIZEOF, + byte_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); - std::vector packed_input1 = generate_packed_uniform_random_vector( + std::vector packed_input1 = generate_packed_uniform_random_vector( -1.0f, 1.0f, - byte_size / tt::test_utils::df::bfloat16::SIZEOF, + byte_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); - std::vector packed_input2 = generate_packed_uniform_random_vector( + std::vector packed_input2 = generate_packed_uniform_random_vector( -1.0f, 1.0f, - byte_size / tt::test_utils::df::bfloat16::SIZEOF, + byte_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); //////////////////////////////////////////////////////////////////////////// // Golden Generation //////////////////////////////////////////////////////////////////////////// - auto input0 = unpack_vector(packed_input0); - auto input1 = unpack_vector(packed_input1); - auto input2 = unpack_vector(packed_input2); + auto input0 = unpack_vector(packed_input0); + auto input1 = unpack_vector(packed_input1); + auto input2 = unpack_vector(packed_input2); std::vector temp_golden(input0.size()); uint16_t srca_fid_mask = 0xFFFF; @@ -188,14 +188,14 @@ bool single_core_binary(tt_metal::Device* device, const SingleCoreBinaryConfig& input0.end(), input1.begin(), temp_golden.begin(), - [&](const tt::test_utils::df::bfloat16& lhs, const tt::test_utils::df::bfloat16& rhs) { + [&](const bfloat16& lhs, const bfloat16& rhs) { if (test_config.binary_op == "add") { return (lhs.to_float() + rhs.to_float()); } else if (test_config.binary_op == "sub") { return (lhs.to_float() - rhs.to_float()); } else if (test_config.binary_op == "mul") { - return ( tt::test_utils::df::bfloat16(std::bit_cast(lhs.to_packed() & srca_fid_mask)).to_float() * - tt::test_utils::df::bfloat16(std::bit_cast(rhs.to_packed() & srcb_fid_mask)).to_float()); + return ( bfloat16(std::bit_cast(lhs.to_packed() & srca_fid_mask)).to_float() * + 
bfloat16(std::bit_cast(rhs.to_packed() & srcb_fid_mask)).to_float()); } else if (test_config.binary_op.find("with_dest_reuse") != std::string::npos) { return lhs.to_float(); } else { @@ -204,26 +204,26 @@ bool single_core_binary(tt_metal::Device* device, const SingleCoreBinaryConfig& } }); - std::vector golden(input0.size()); + std::vector golden(input0.size()); std::transform( input2.begin(), input2.end(), temp_golden.begin(), golden.begin(), - [&](const tt::test_utils::df::bfloat16& lhs, const float& rhs) { + [&](const bfloat16& lhs, const float& rhs) { //acc_to_dest accumulates dest value with binary output, for all binary operations if (test_config.acc_to_dest || test_config.binary_op == "add_with_dest_reuse") { return (lhs.to_float() + rhs); } else if (test_config.binary_op == "sub_with_dest_reuse") { return (lhs.to_float() - rhs); } else if (test_config.binary_op == "mul_with_dest_reuse") { - return (tt::test_utils::df::bfloat16(std::bit_cast(lhs.to_packed() & srca_fid_mask)).to_float() * - tt::test_utils::df::bfloat16(std::bit_cast(tt::test_utils::df::bfloat16(rhs).to_packed() & srcb_fid_mask)).to_float()); + return (bfloat16(std::bit_cast(lhs.to_packed() & srca_fid_mask)).to_float() * + bfloat16(std::bit_cast(bfloat16(rhs).to_packed() & srcb_fid_mask)).to_float()); } else { return rhs; } }); - auto packed_golden = pack_vector(golden); + auto packed_golden = pack_vector(golden); //////////////////////////////////////////////////////////////////////////// // Compile and Execute Application @@ -268,10 +268,10 @@ bool single_core_binary(tt_metal::Device* device, const SingleCoreBinaryConfig& std::vector dest_buffer_data; tt_metal::detail::ReadFromBuffer(output_dram_buffer, dest_buffer_data); - pass &= is_close_packed_vectors( + pass &= is_close_packed_vectors( dest_buffer_data, packed_golden, - [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { + [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.0155f); }); return pass; diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_matmul_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_matmul_compute.cpp index f26e3c55812..140874255df 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_matmul_compute.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_matmul_compute.cpp @@ -204,12 +204,12 @@ bool single_tile_matmul(tt_metal::Device* device) { //////////////////////////////////////////////////////////////////////////// // Stimulus Generation //////////////////////////////////////////////////////////////////////////// - std::vector packed_input0 = generate_packed_uniform_random_vector( - 1.0f, 1.0f, byte_size / tt::test_utils::df::bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); - std::vector packed_input1 = generate_packed_uniform_random_vector( + std::vector packed_input0 = generate_packed_uniform_random_vector( + 1.0f, 1.0f, byte_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); + std::vector packed_input1 = generate_packed_uniform_random_vector( 1.0f / 32.0f, 1.0f / 32.0f, - byte_size / tt::test_utils::df::bfloat16::SIZEOF, + byte_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); // Setup the weights such that final result is the original input. 
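// (Editor's note, not part of the patch: with in0 filled with 1.0f and in1
// filled with 1/32 = 0.03125f, every element of the 32x32 single-tile matmul
// output accumulates 32 * (1.0 * 1/32) = 1.0, so the output reproduces the
// all-ones input exactly; this is why the comparison below can use the tight
// is_close tolerance of 0.015.)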
@@ -258,8 +258,8 @@ bool single_tile_matmul(tt_metal::Device* device) { //////////////////////////////////////////////////////////////////////////// std::vector dest_buffer_data; tt_metal::detail::ReadFromBuffer(output_dram_buffer, dest_buffer_data); - pass &= is_close_packed_vectors( - dest_buffer_data, packed_golden, [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.015f); }); + pass &= is_close_packed_vectors( + dest_buffer_data, packed_golden, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.015f); }); return pass; } // blocked matmul has blocking, but still fits within dst, so no spill/reloads or intermediates @@ -353,20 +353,20 @@ bool single_block_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint3 //////////////////////////////////////////////////////////////////////////// // Stimulus Generation //////////////////////////////////////////////////////////////////////////// - std::vector packed_input0 = generate_packed_uniform_random_vector( - 1.0f, 1.0f, in0_byte_size / tt::test_utils::df::bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); - std::vector packed_input1 = generate_packed_uniform_random_vector( + std::vector packed_input0 = generate_packed_uniform_random_vector( + 1.0f, 1.0f, in0_byte_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); + std::vector packed_input1 = generate_packed_uniform_random_vector( 0.03125f, 0.03125f, - in1_byte_size / tt::test_utils::df::bfloat16::SIZEOF, + in1_byte_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); //////////////////////////////////////////////////////////////////////////// // Golden Generation //////////////////////////////////////////////////////////////////////////// - auto packed_golden = generate_packed_uniform_random_vector( + auto packed_golden = generate_packed_uniform_random_vector( 1.0f * K, 1.0f * K, - (out_byte_size) / tt::test_utils::df::bfloat16::SIZEOF, + (out_byte_size) / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); //////////////////////////////////////////////////////////////////////////// @@ -413,14 +413,14 @@ bool single_block_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint3 std::vector dest_buffer_data; tt_metal::detail::ReadFromBuffer(output_dram_buffer, dest_buffer_data); int failed_index; - pass &= is_close_packed_vectors( + pass &= is_close_packed_vectors( dest_buffer_data, packed_golden, - [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.015f); }, + [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.015f); }, &failed_index); if (not pass) { log_info("Failed Index={}", failed_index); - print_vector_fixed_numel_per_row(unpack_vector(dest_buffer_data), 32); + print_vector_fixed_numel_per_row(unpack_vector(dest_buffer_data), 32); } return pass; } @@ -531,20 +531,20 @@ bool blocked_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint32_t N //////////////////////////////////////////////////////////////////////////// // Stimulus Generation //////////////////////////////////////////////////////////////////////////// - std::vector packed_input0 = generate_packed_uniform_random_vector( - 1.0f, 1.0f, in0_byte_size / tt::test_utils::df::bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); - std::vector packed_input1 = generate_packed_uniform_random_vector( + std::vector 
packed_input0 = generate_packed_uniform_random_vector( + 1.0f, 1.0f, in0_byte_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); + std::vector packed_input1 = generate_packed_uniform_random_vector( 0.03125f, 0.03125f, - in1_byte_size / tt::test_utils::df::bfloat16::SIZEOF, + in1_byte_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); //////////////////////////////////////////////////////////////////////////// // Golden Generation //////////////////////////////////////////////////////////////////////////// - auto packed_golden = generate_packed_uniform_random_vector( + auto packed_golden = generate_packed_uniform_random_vector( 1.0f * K, 1.0f * K, - (out_byte_size) / tt::test_utils::df::bfloat16::SIZEOF, + (out_byte_size) / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); //////////////////////////////////////////////////////////////////////////// @@ -591,14 +591,14 @@ bool blocked_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint32_t N std::vector dest_buffer_data; tt_metal::detail::ReadFromBuffer(output_dram_buffer, dest_buffer_data); int failed_index; - pass &= is_close_packed_vectors( + pass &= is_close_packed_vectors( dest_buffer_data, packed_golden, - [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.015f); }, + [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.015f); }, &failed_index); if (not pass) { log_info("Failed Index={}", failed_index); - print_vector_fixed_numel_per_row(unpack_vector(dest_buffer_data), 32); + print_vector_fixed_numel_per_row(unpack_vector(dest_buffer_data), 32); } return pass; } diff --git a/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp b/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp index 791f033d127..5c86e8feadf 100644 --- a/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp @@ -222,8 +222,8 @@ bool reader_writer(tt_metal::Device* device, const ReaderWriterConfig& test_conf //////////////////////////////////////////////////////////////////////////// // Stimulus Generation //////////////////////////////////////////////////////////////////////////// - std::vector inputs = generate_packed_uniform_random_vector( - -1.0f, 1.0f, byte_size / tt::test_utils::df::bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); + std::vector inputs = generate_packed_uniform_random_vector( + -1.0f, 1.0f, byte_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); //////////////////////////////////////////////////////////////////////////// // Compile and Execute Application //////////////////////////////////////////////////////////////////////////// @@ -333,8 +333,8 @@ bool reader_datacopy_writer(tt_metal::Device* device, const ReaderDatacopyWriter //////////////////////////////////////////////////////////////////////////// // Stimulus Generation //////////////////////////////////////////////////////////////////////////// - std::vector inputs = generate_packed_uniform_random_vector( - -1.0f, 1.0f, byte_size / tt::test_utils::df::bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); + std::vector inputs = generate_packed_uniform_random_vector( + -1.0f, 1.0f, byte_size / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count()); //////////////////////////////////////////////////////////////////////////// // Compile and 
Execute Application //////////////////////////////////////////////////////////////////////////// diff --git a/tests/tt_metal/tt_metal/unit_tests/multichip/basic_eth_kernels.cpp b/tests/tt_metal/tt_metal/unit_tests/multichip/basic_eth_kernels.cpp index 4dfe7eeba98..a24f3f64f66 100644 --- a/tests/tt_metal/tt_metal/unit_tests/multichip/basic_eth_kernels.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/multichip/basic_eth_kernels.cpp @@ -22,9 +22,13 @@ using namespace tt; using namespace tt::test_utils; using namespace tt::test_utils::df; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { constexpr std::int32_t WORD_SIZE = 16; // 16 bytes per eth send packet constexpr std::int32_t MAX_NUM_WORDS = (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE) / WORD_SIZE; +} +} namespace unit_tests::erisc::kernels { @@ -186,6 +190,7 @@ bool writer_kernel_no_receive( } TEST_F(N300DeviceFixture, EthKernelsNocReadNoSend) { + using namespace CMAKE_UNIQUE_NAMESPACE; GTEST_SKIP(); const auto& device_0 = devices_.at(0); const auto& device_1 = devices_.at(1); @@ -212,6 +217,7 @@ TEST_F(N300DeviceFixture, EthKernelsNocReadNoSend) { } TEST_F(N300DeviceFixture, EthKernelsNocWriteNoReceive) { + using namespace CMAKE_UNIQUE_NAMESPACE; GTEST_SKIP(); const auto& device_0 = devices_.at(0); const auto& device_1 = devices_.at(1); @@ -354,6 +360,7 @@ bool eth_direct_sender_receiver_kernels( } // namespace unit_tests::erisc::kernels TEST_F(N300DeviceFixture, EthKernelsDirectSendChip0ToChip1) { + using namespace CMAKE_UNIQUE_NAMESPACE; GTEST_SKIP(); const auto& device_0 = devices_.at(0); const auto& device_1 = devices_.at(1); @@ -402,6 +409,7 @@ TEST_F(N300DeviceFixture, EthKernelsDirectSendChip0ToChip1) { } TEST_F(N300DeviceFixture, EthKernelsDirectSendChip1ToChip0) { + using namespace CMAKE_UNIQUE_NAMESPACE; GTEST_SKIP(); const auto& device_0 = devices_.at(0); const auto& device_1 = devices_.at(1); @@ -450,6 +458,7 @@ TEST_F(N300DeviceFixture, EthKernelsDirectSendChip1ToChip0) { } TEST_F(DeviceFixture, EthKernelsDirectSendAllConnectedChips) { + using namespace CMAKE_UNIQUE_NAMESPACE; const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; for (const auto& sender_device : devices_) { @@ -500,6 +509,7 @@ TEST_F(DeviceFixture, EthKernelsDirectSendAllConnectedChips) { } TEST_F(N300DeviceFixture, EthKernelsBidirectionalDirectSend) { + using namespace CMAKE_UNIQUE_NAMESPACE; const auto& device_0 = devices_.at(0); const auto& device_1 = devices_.at(1); @@ -585,6 +595,7 @@ TEST_F(N300DeviceFixture, EthKernelsBidirectionalDirectSend) { } TEST_F(N300DeviceFixture, EthKernelsRepeatedDirectSends) { + using namespace CMAKE_UNIQUE_NAMESPACE; const auto& device_0 = devices_.at(0); const auto& device_1 = devices_.at(1); @@ -617,6 +628,7 @@ TEST_F(N300DeviceFixture, EthKernelsRepeatedDirectSends) { } TEST_F(N300DeviceFixture, EthKernelsRandomDirectSendTests) { + using namespace CMAKE_UNIQUE_NAMESPACE; srand(0); const auto& device_0 = devices_.at(0); const auto& device_1 = devices_.at(1); diff --git a/tests/tt_metal/tt_metal/unit_tests/multichip/buffer_movement_kernels.cpp b/tests/tt_metal/tt_metal/unit_tests/multichip/buffer_movement_kernels.cpp index 10da65cd31b..dd9c95aab8f 100644 --- a/tests/tt_metal/tt_metal/unit_tests/multichip/buffer_movement_kernels.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/multichip/buffer_movement_kernels.cpp @@ -187,10 +187,10 @@ bool 
chip_to_chip_interleaved_buffer_transfer( auto input_packed = generate_uniform_random_vector(0, 100, cfg.size_bytes / sizeof(uint32_t)); /*std::vector input_packed = - tt::test_utils::generate_packed_uniform_random_vector( + tt::test_utils::generate_packed_uniform_random_vector( -1.0f, 1.0f, - cfg.size_bytes / tt::test_utils::df::bfloat16::SIZEOF, + cfg.size_bytes / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count());*/ tt::tt_metal::InterleavedBufferConfig sender_config{ diff --git a/tests/tt_metal/tt_metal/unit_tests/multichip/erisc_app_direct_send.cpp b/tests/tt_metal/tt_metal/unit_tests/multichip/erisc_app_direct_send.cpp index de55e3f17f5..6de3291f65c 100644 --- a/tests/tt_metal/tt_metal/unit_tests/multichip/erisc_app_direct_send.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/multichip/erisc_app_direct_send.cpp @@ -17,13 +17,11 @@ #include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/test_utils/stimulus.hpp" +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { constexpr std::int32_t WORD_SIZE = 16; // 16 bytes per eth send packet constexpr std::int32_t MAX_NUM_WORDS = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_SIZE / WORD_SIZE; -using namespace tt; -using namespace tt::test_utils; -using namespace tt::test_utils::df; - struct erisc_info_t { volatile uint32_t num_bytes; volatile uint32_t mode; @@ -34,6 +32,13 @@ struct erisc_info_t { volatile uint32_t reserverd_3_; volatile uint32_t reserverd_4_; }; +} +} + +using namespace tt; +using namespace tt::test_utils; +using namespace tt::test_utils::df; + namespace unit_tests::erisc::direct_send { // Tests ethernet direct send/receive from ERISC_L1_UNRESERVED_BASE bool send_over_eth( @@ -136,6 +141,7 @@ bool send_over_eth( } // namespace unit_tests::erisc::direct_send TEST_F(N300DeviceFixture, SingleEthCoreDirectSendChip0ToChip1) { + using namespace CMAKE_UNIQUE_NAMESPACE; GTEST_SKIP(); ASSERT_TRUE(this->num_devices_ == 2); const auto& device_0 = devices_.at(0); @@ -165,6 +171,7 @@ TEST_F(N300DeviceFixture, SingleEthCoreDirectSendChip0ToChip1) { } TEST_F(N300DeviceFixture, SingleEthCoreDirectSendChip1ToChip0) { + using namespace CMAKE_UNIQUE_NAMESPACE; GTEST_SKIP(); ASSERT_TRUE(this->num_devices_ == 2); const auto& device_0 = devices_.at(0); @@ -194,6 +201,7 @@ TEST_F(N300DeviceFixture, SingleEthCoreDirectSendChip1ToChip0) { } TEST_F(N300DeviceFixture, BidirectionalEthCoreDirectSend) { + using namespace CMAKE_UNIQUE_NAMESPACE; GTEST_SKIP(); ASSERT_TRUE(this->num_devices_ == 2); const auto& device_0 = devices_.at(0); @@ -239,6 +247,7 @@ TEST_F(N300DeviceFixture, BidirectionalEthCoreDirectSend) { } TEST_F(N300DeviceFixture, RandomDirectSendTests) { + using namespace CMAKE_UNIQUE_NAMESPACE; GTEST_SKIP(); srand(0); ASSERT_TRUE(this->num_devices_ == 2); diff --git a/tests/tt_metal/tt_metal/unit_tests/multichip/ring_gather_kernels.cpp b/tests/tt_metal/tt_metal/unit_tests/multichip/ring_gather_kernels.cpp index 6cad6f5d625..56ed607e79d 100644 --- a/tests/tt_metal/tt_metal/unit_tests/multichip/ring_gather_kernels.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/multichip/ring_gather_kernels.cpp @@ -337,8 +337,8 @@ bool eth_interleaved_ring_gather_sender_receiver_kernels( for (uint32_t i = 0; i < sender_receivers.size(); ++i) { inputs.emplace_back( - tt::test_utils::generate_packed_uniform_random_vector( - -1.0f, 1.0f, cfg.size_bytes / tt::test_utils::df::bfloat16::SIZEOF, i)); + tt::test_utils::generate_packed_uniform_random_vector( + -1.0f, 1.0f, cfg.size_bytes / bfloat16::SIZEOF, i)); 
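// (Editor's note, not part of the patch: each device in the ring generates
// its own packed random bfloat16 buffer, seeded with its ring index i, and
// full_input below splices the per-device buffers together in ring order;
// it appears to serve as the golden reference for the gathered result.)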
full_input.insert(full_input.begin() + i * numel, inputs[i].begin(), inputs[i].end()); const auto& device = std::get<0>(sender_receivers[i]); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/CMakeLists.txt b/tests/tt_metal/tt_metal/unit_tests_common/CMakeLists.txt index ed66195b343..47f4f154fc7 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/unit_tests_common/CMakeLists.txt @@ -29,7 +29,8 @@ set(UNIT_TESTS_COMMON_SRC ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_waypoint.cpp ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_link_training.cpp ) -add_library(unit_tests_common_o OBJECT ${UNIT_TESTS_COMMON_SRC}) +add_library(unit_tests_common_o STATIC ${UNIT_TESTS_COMMON_SRC}) +TT_ENABLE_UNITY_BUILD(unit_tests_common_o) target_link_libraries( unit_tests_common_o PUBLIC diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp index a2adab128e5..1b1b4d6104f 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include "gtest/gtest.h" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/dprint_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_common/common/dprint_fixture.hpp index f65199b756f..829a9feb140 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/common/dprint_fixture.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/common/dprint_fixture.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include "common_fixture.hpp" #include "impl/debug/dprint_server.hpp" #include "tt_metal/common/core_descriptor.hpp" diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/watcher_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_common/common/watcher_fixture.hpp index ba7bfa88920..9d74f94942d 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/common/watcher_fixture.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/common/watcher_fixture.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include #include #include "common_fixture.hpp" diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp index 62165c4b1cb..4eecec45c61 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp @@ -1,6 +1,9 @@ // SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 + +#pragma once + #include #include #include diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_eth_cores.cpp b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_eth_cores.cpp index 850cb14a0ae..38ece0f5ca0 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_eth_cores.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_eth_cores.cpp @@ -14,6 +14,8 @@ using namespace tt; using namespace tt::tt_metal; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { const std::string golden_output = R"(Test Debug Print: ERISC Basic Types: @@ -76,6 +78,8 @@ static void RunTest(DPrintFixture* fixture, Device* device, bool active) { tt::DPrintServerClearLogFile(); } } +} +} TEST_F(DPrintFixture, TestPrintEthCores) { for (Device* device : this->devices_) { @@ -86,7 +90,7 @@ TEST_F(DPrintFixture, TestPrintEthCores) { } this->RunTestOnDevice( [](DPrintFixture *fixture, Device *device){ - RunTest(fixture, device, true); + CMAKE_UNIQUE_NAMESPACE::RunTest(fixture, device, true); }, device ); @@ -105,7 +109,7 @@ TEST_F(DPrintFixture, TestPrintIEthCores) { } this->RunTestOnDevice( [](DPrintFixture *fixture, Device *device){ - RunTest(fixture, device, false); + CMAKE_UNIQUE_NAMESPACE::RunTest(fixture, device, false); }, device ); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_device.cpp b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_device.cpp index 391cfa9f1e2..8440e242ac6 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_device.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_device.cpp @@ -14,6 +14,8 @@ using namespace tt; using namespace tt::tt_metal; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { const std::string golden_output = R"(Test Debug Print: Data0 Basic Types: @@ -72,9 +74,11 @@ static void RunTest(DPrintFixture* fixture, Device* device) { EXPECT_TRUE(OpenFile(file_name, log_file, std::fstream::in)); EXPECT_TRUE(log_file.peek() == std::ifstream::traits_type::eof()); } +} +} TEST_F(DPrintFixtureDisableDevices, TestPrintMuteDevice) { for (Device* device : this->devices_) { - this->RunTestOnDevice(RunTest, device); + this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, device); } } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_print_server.cpp b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_print_server.cpp index 4b76841b498..3798288e27c 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_print_server.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_print_server.cpp @@ -14,6 +14,8 @@ using namespace tt; using namespace tt::tt_metal; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { const std::string golden_output = R"(Printing int from arg: 0 Printing int from arg: 2)"; @@ -61,9 +63,11 @@ static void RunTest(DPrintFixture* fixture, Device* device) { ) ); } +} +} TEST_F(DPrintFixture, TestPrintMuting) { for (Device* device : this->devices_) { - this->RunTestOnDevice(RunTest, device); + this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, device); } } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_all_harts.cpp b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_all_harts.cpp index 26a9489ee83..42d7382b5bb 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_all_harts.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_all_harts.cpp @@ -15,6 +15,8 @@ using namespace tt; using namespace 
tt::tt_metal; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { const std::string golden_output = R"(Test Debug Print: Data0 Basic Types: @@ -187,9 +189,11 @@ static void RunTest(DPrintFixture* fixture, Device* device) { ) ); } +} +} TEST_F(DPrintFixture, TestPrintFromAllHarts) { for (Device* device : this->devices_) { - this->RunTestOnDevice(RunTest, device); + this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, device); } } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_hanging.cpp b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_hanging.cpp index 7014aaf40f0..a707ffff86c 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_hanging.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_hanging.cpp @@ -15,6 +15,8 @@ using namespace tt; using namespace tt::tt_metal; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { // Some machines will run this test on different physical cores, so wildcard the exact coordinates. const std::string golden_output = R"(DPRINT server timed out on Device *, worker core (x=*,y=*), riscv 4, waiting on a RAISE signal: 1 @@ -51,6 +53,8 @@ try { ) ); } +} +} TEST_F(DPrintFixture, TestPrintHanging) { // Skip this test for slow dipatch for now. Due to how llrt currently sits below device, it's @@ -60,5 +64,5 @@ TEST_F(DPrintFixture, TestPrintHanging) { GTEST_SKIP(); // Since the dprint server gets killed from a timeout, only run on one device. - this->RunTestOnDevice(RunTest, this->devices_[0]); + this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, this->devices_[0]); } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_tensix_dest.cpp b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_tensix_dest.cpp index 1767d904469..1f73a7bc736 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_tensix_dest.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_tensix_dest.cpp @@ -145,7 +145,7 @@ static KernelHandle prepare_compute(tt_metal::Program& program, const DestPrintT // Generates input data based on the test configuration static std::vector generate_inputs(const DestPrintTestConfig& config) { if (config.data_format == tt::DataFormat::Float16_b) - return tt::test_utils::generate_packed_increment_vector( + return tt::test_utils::generate_packed_increment_vector( 0.0f, config.get_num_elements(), 0.03125f, -1.1875f); if (config.data_format == tt::DataFormat::Float32) diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_raise_wait.cpp b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_raise_wait.cpp index 4830e39a678..0786c960813 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_raise_wait.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_raise_wait.cpp @@ -14,6 +14,8 @@ using namespace tt; using namespace tt::tt_metal; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { const std::string golden_output = R"(TestConstCharStrNC{0,0} 2 @@ -274,9 +276,11 @@ static void RunTest(DPrintFixture* fixture, Device* device) { ) ); } +} +} TEST_F(DPrintFixture, TestPrintRaiseWait) { for (Device* device : this->devices_) { - this->RunTestOnDevice(RunTest, device); + this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, device); } } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_pause.cpp b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_pause.cpp index 1a21a43a187..f358a30ebad 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_pause.cpp +++ 
b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_pause.cpp @@ -12,6 +12,8 @@ using std::vector; using namespace tt; using namespace tt::tt_metal; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { static void RunTest(WatcherFixture* fixture, Device* device) { // Set up program Program program = Program(); @@ -129,9 +131,11 @@ static void RunTest(WatcherFixture* fixture, Device* device) { // See #10527 // EXPECT_TRUE(FileContainsAllStrings(fixture->log_file_name, expected_strings)); } +} +} TEST_F(WatcherFixture, TestWatcherPause) { for (Device* device : this->devices_) { - this->RunTestOnDevice(RunTest, device); + this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, device); } } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_waypoint.cpp b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_waypoint.cpp index ffc9fb62e57..8da13273c27 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_waypoint.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_waypoint.cpp @@ -14,6 +14,8 @@ using std::vector; using namespace tt; using namespace tt::tt_metal; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { static void RunTest(WatcherFixture* fixture, Device* device) { // Set up program Program program = Program(); @@ -194,9 +196,11 @@ static void RunTest(WatcherFixture* fixture, Device* device) { } } } +} +} TEST_F(WatcherFixture, TestWatcherWaypoints) { for (Device* device : this->devices_) { - this->RunTestOnDevice(RunTest, device); + this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, device); } } diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/CMakeLists.txt b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/CMakeLists.txt index 4e32bc33a45..5f9247bba85 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/CMakeLists.txt @@ -18,6 +18,7 @@ add_executable( ${UNIT_TESTS_FD_SRC} $ ) +TT_ENABLE_UNITY_BUILD(unit_tests_fast_dispatch) target_link_libraries(unit_tests_fast_dispatch PUBLIC test_metal_common_libs) target_include_directories( diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp index 2d3c5a4b6c7..c66a0c33d47 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include "tt_metal/host_api.hpp" #include "tt_metal/common/bfloat16.hpp" #include "tt_metal/impl/buffers/buffer.hpp" diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp index 755f5892db0..b6dbd81212c 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp @@ -22,12 +22,6 @@ using std::vector; using namespace tt; using namespace tt::tt_metal; -struct TestBufferConfig { - uint32_t num_pages; - uint32_t page_size; - BufferType buftype; -}; - Program create_simple_unary_program(Buffer& input, Buffer& output) { Program program = CreateProgram(); Device* device = input.device(); diff --git 
a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp index 5c4aabbb0cd..2bad2d7ba2b 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include #include #include diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp index 83746fe8a54..06cd4a16177 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp @@ -41,58 +41,58 @@ const map<string, std::map<string, string>> sfpu_op_to_op_name = { {"tanh", {{"SFPU_OP_CHAIN_0", "tanh_tile_init(); tanh_tile(0);"}}}, }; -tt::test_utils::df::bfloat16 sfpu_function(const string& op_name, const tt::test_utils::df::bfloat16& input) { +bfloat16 sfpu_function(const string& op_name, const bfloat16& input) { if (op_name == "relu") { - return tt::test_utils::df::bfloat16(fmaxf(input.to_float(), 0.0f)); + return bfloat16(fmaxf(input.to_float(), 0.0f)); } else if (op_name == "exponential") { - return tt::test_utils::df::bfloat16(std::exp(input.to_float())); + return bfloat16(std::exp(input.to_float())); } else if (op_name == "reciprocal") { - return tt::test_utils::df::bfloat16(1 / input.to_float()); + return bfloat16(1 / input.to_float()); } else if (op_name == "gelu") { static constexpr float alpha = M_2_SQRTPI * M_SQRT1_2; auto x = input.to_float(); auto x3 = x * x * x; float result = x * 0.5 * (1.0 + tanhf(alpha * (x + 0.044715 * x3))); - return tt::test_utils::df::bfloat16(result); + return bfloat16(result); } else if (op_name == "sqrt") { - return tt::test_utils::df::bfloat16(sqrtf(input.to_float())); + return bfloat16(sqrtf(input.to_float())); } else if (op_name == "sigmoid") { auto x = input.to_float(); float result = 1 / (1 + std::exp(-x)); - return tt::test_utils::df::bfloat16(result); + return bfloat16(result); } else if (op_name == "log") { - return tt::test_utils::df::bfloat16(logf(input.to_float())); + return bfloat16(logf(input.to_float())); } else if (op_name == "tanh") { - return tt::test_utils::df::bfloat16(std::tanh(input.to_float())); + return bfloat16(std::tanh(input.to_float())); } else { TT_THROW("Unsupported op_name in test"); - return tt::test_utils::df::bfloat16(0.0f); + return bfloat16(0.0f); } } vector<uint32_t> generate_packed_sfpu_input(const unsigned int numel, const string& op_name, const int seed) { if ((op_name == "sqrt") or (op_name == "log")) { - return generate_packed_uniform_random_vector<tt::test_utils::df::bfloat16, uint32_t>(0.0001f, 4.0f, numel, seed); + return generate_packed_uniform_random_vector<bfloat16, uint32_t>(0.0001f, 4.0f, numel, seed); } else if ((op_name == "exponential") or (op_name == "gelu") or (op_name == "reciprocal")) { - auto possible_values = vector<tt::test_utils::df::bfloat16>({-1.0f, -0.5f, 0.5f, 1.0f}); - return generate_packed_random_vector_from_vector<tt::test_utils::df::bfloat16, uint32_t>(possible_values, numel, seed); + auto possible_values = vector<bfloat16>({-1.0f, -0.5f, 0.5f, 1.0f}); + return generate_packed_random_vector_from_vector<bfloat16, uint32_t>(possible_values, numel, seed); } else { - return generate_packed_uniform_random_vector<tt::test_utils::df::bfloat16, uint32_t>(-1.0f, 1.0f, numel, seed); + return generate_packed_uniform_random_vector<bfloat16, uint32_t>(-1.0f, 1.0f, numel, seed); } } bool is_close_packed_sfpu_output(const std::vector<uint32_t>& vec_a, const std::vector<uint32_t>& vec_b, const string& op_name) { if (op_name == "tanh") { - return is_close_packed_vectors<tt::test_utils::df::bfloat16, uint32_t>( - vec_a, vec_b, [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.175f, 0.1f); }); + return is_close_packed_vectors<bfloat16, uint32_t>( + vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.175f, 0.1f); }); } else if ((op_name == "gelu") or (op_name == "relu")) { - return is_close_packed_vectors<tt::test_utils::df::bfloat16, uint32_t>( - vec_a, vec_b, [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.15f); }); + return is_close_packed_vectors<bfloat16, uint32_t>( + vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.15f); }); } else if ((op_name == "exponential")) { - return is_close_packed_vectors<tt::test_utils::df::bfloat16, uint32_t>( - vec_a, vec_b, [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.1f, 0.1f); }); + return is_close_packed_vectors<bfloat16, uint32_t>( + vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.1f, 0.1f); }); } else { - return is_close_packed_vectors<tt::test_utils::df::bfloat16, uint32_t>( - vec_a, vec_b, [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) { return is_close(a, b, 0.06f, 0.006f); }); + return is_close_packed_vectors<bfloat16, uint32_t>( + vec_a, vec_b, [&](const bfloat16& a, const bfloat16& b) { return is_close(a, b, 0.06f, 0.006f); }); } } @@ -138,15 +138,15 @@ bool run_sfpu_all_same_buffer(CommandQueue & cq, const SfpuConfig& test_config) // Input std::vector<uint32_t> packed_input = sfpu_util::generate_packed_sfpu_input( - byte_size / tt::test_utils::df::bfloat16::SIZEOF, test_config.sfpu_op, std::chrono::system_clock::now().time_since_epoch().count()); + byte_size / bfloat16::SIZEOF, test_config.sfpu_op, std::chrono::system_clock::now().time_since_epoch().count()); // Golden output - auto input = unpack_vector<tt::test_utils::df::bfloat16, uint32_t>(packed_input); - std::vector<tt::test_utils::df::bfloat16> golden(input.size()); - std::transform(input.begin(), input.end(), golden.begin(), [&](const tt::test_utils::df::bfloat16& val) { + auto input = unpack_vector<bfloat16, uint32_t>(packed_input); + std::vector<bfloat16> golden(input.size()); + std::transform(input.begin(), input.end(), golden.begin(), [&](const bfloat16& val) { return sfpu_util::sfpu_function(test_config.sfpu_op, val); }); - std::vector<uint32_t> packed_golden = pack_vector<uint32_t, tt::test_utils::df::bfloat16>(golden); + std::vector<uint32_t> packed_golden = pack_vector<uint32_t, bfloat16>(golden); // Same runtime args for every core vector<uint32_t> reader_rt_args = { diff --git 
CoreCoord& eth_receiver_core, - const BankedConfig& cfg, + const CMAKE_UNIQUE_NAMESPACE::BankedConfig& cfg, const uint32_t& max_transfer_size) { bool pass = true; @@ -560,6 +564,7 @@ TEST_F(CommandQueueSingleCardFixture, EnqueueDummyProgramOnEthCore) { } TEST_F(CommandQueueSingleCardFixture, EthKernelsNocReadNoSend) { + using namespace CMAKE_UNIQUE_NAMESPACE; const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; for (const auto& device : devices_) { @@ -575,6 +580,7 @@ TEST_F(CommandQueueSingleCardFixture, EthKernelsNocReadNoSend) { } TEST_F(CommandQueueSingleCardFixture, EthKernelsNocWriteNoReceive) { + using namespace CMAKE_UNIQUE_NAMESPACE; const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; for (const auto& device : devices_) { @@ -590,6 +596,7 @@ TEST_F(CommandQueueSingleCardFixture, EthKernelsNocWriteNoReceive) { } TEST_F(CommandQueueMultiDeviceFixture, EthKernelsDirectSendAllConnectedChips) { + using namespace CMAKE_UNIQUE_NAMESPACE; const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; for (const auto& sender_device : devices_) { @@ -672,6 +679,7 @@ TEST_F(CommandQueueMultiDeviceFixture, EthKernelsSendDramBufferAllConnectedChips } TEST_F(CommandQueueMultiDeviceFixture, EthKernelsSendInterleavedBufferAllConnectedChips) { + using namespace CMAKE_UNIQUE_NAMESPACE; for (const auto& sender_device : devices_) { for (const auto& receiver_device : devices_) { if (sender_device->id() >= receiver_device->id()) { diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp index 53d9f0d5707..df20eec8c8d 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp @@ -24,6 +24,8 @@ using namespace tt; using namespace tt::test_utils; using namespace tt::test_utils::df; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { constexpr std::int32_t WORD_SIZE = 16; // 16 bytes per eth send packet constexpr std::int32_t MAX_NUM_WORDS = (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE) / WORD_SIZE; @@ -160,6 +162,8 @@ std::vector device_ring, - const BankedConfig& cfg, + const CMAKE_UNIQUE_NAMESPACE::BankedConfig& cfg, const size_t& src_eth_l1_byte_address, const size_t& dst_eth_l1_byte_address, const size_t& sem_l1_byte_address, uint32_t num_bytes_per_send = 16) { + using namespace CMAKE_UNIQUE_NAMESPACE; bool pass = true; const auto& sender_receivers = get_sender_receiver_cores(device_ring); @@ -335,8 +341,8 @@ bool eth_interleaved_ring_gather_sender_receiver_kernels( for (uint32_t i = 0; i < sender_receivers.size(); ++i) { inputs.emplace_back( - tt::test_utils::generate_packed_uniform_random_vector( - -1.0f, 1.0f, cfg.size_bytes / tt::test_utils::df::bfloat16::SIZEOF, i)); + tt::test_utils::generate_packed_uniform_random_vector( + -1.0f, 1.0f, cfg.size_bytes / bfloat16::SIZEOF, i)); full_input.insert(full_input.begin() + i * numel, inputs[i].begin(), inputs[i].end()); const auto& device = std::get<0>(sender_receivers[i]); @@ -455,6 +461,7 @@ bool eth_interleaved_ring_gather_sender_receiver_kernels( } // namespace fd_unit_tests::erisc::kernels 
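The recurring wrapper namespace { namespace CMAKE_UNIQUE_NAMESPACE { ... } } seen throughout these test hunks is the core pattern of this patch: a unity build concatenates several .cpp files into one translation unit, so file-local helpers that share a name across files (RunTest, golden_output, BankedConfig) would otherwise collide. Below is a minimal sketch of the mechanism, assuming the build defines CMAKE_UNIQUE_NAMESPACE to a distinct token per source file (CMake 3.20+ can do this via the UNITY_BUILD_UNIQUE_ID target property, which would also explain the raised minimum CMake version; the actual cmake/unity.cmake wiring is not visible in this excerpt, so that is an assumption).

// one_test.cpp -- may be merged with other_test.cpp into a single unity TU
namespace {                           // internal linkage, as before
namespace CMAKE_UNIQUE_NAMESPACE {    // assumed to expand to a per-file unique name
static void RunTest() {}              // other_test.cpp may define its own RunTest
}  // namespace CMAKE_UNIQUE_NAMESPACE
}  // namespace

void Caller() {
    CMAKE_UNIQUE_NAMESPACE::RunTest();  // unambiguously this file's helper
}

Within a single translation unit an anonymous namespace alone no longer isolates merged files from one another; the macro-named inner namespace restores per-file uniqueness, which is why the call sites above are qualified with CMAKE_UNIQUE_NAMESPACE:: or pull the names in with a local using-directive.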
TEST_F(CommandQueueMultiDeviceFixture, EthKernelsDirectRingGatherAllChips) { + using namespace CMAKE_UNIQUE_NAMESPACE; if (num_devices_ < 4) { GTEST_SKIP(); } @@ -470,6 +477,7 @@ TEST_F(CommandQueueMultiDeviceFixture, EthKernelsDirectRingGatherAllChips) { } TEST_F(CommandQueueMultiDeviceFixture, EthKernelsInterleavedRingGatherAllChips) { + using namespace CMAKE_UNIQUE_NAMESPACE; if (num_devices_ < 4) { GTEST_SKIP(); } diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/CMakeLists.txt b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/CMakeLists.txt index 702eb9535f0..9cef200d2fa 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/CMakeLists.txt @@ -6,6 +6,7 @@ set(UNIT_TESTS_FD_SINGLEC_MULTIQ_SRCS ) add_executable(unit_tests_fast_dispatch_single_chip_multi_queue ${UNIT_TESTS_FD_SINGLEC_MULTIQ_SRCS}) +TT_ENABLE_UNITY_BUILD(unit_tests_fast_dispatch_single_chip_multi_queue) target_link_libraries(unit_tests_fast_dispatch_single_chip_multi_queue PUBLIC test_metal_common_libs) target_include_directories( diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp index 1a933c8a2f9..0f9c35adb96 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp @@ -11,12 +11,6 @@ using std::vector; using namespace tt::tt_metal; -struct TestBufferConfig { - uint32_t num_pages; - uint32_t page_size; - BufferType buftype; -}; - Program create_simple_unary_program(const Buffer& input, const Buffer& output) { Program program = CreateProgram(); diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_fixture.hpp index ea9365e4f6a..b3efb0e4f16 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_fixture.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_fixture.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include "gtest/gtest.h" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_test_utils.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_test_utils.hpp index a42dd078797..e1e02ae6e16 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_test_utils.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_test_utils.hpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -// #pragma once +#pragma once #include "tt_metal/host_api.hpp" #include "tt_metal/common/bfloat16.hpp" diff --git a/tests/ttnn/unit_tests/gtests/CMakeLists.txt b/tests/ttnn/unit_tests/gtests/CMakeLists.txt index 783c89d06cd..9d46d75f129 100644 --- a/tests/ttnn/unit_tests/gtests/CMakeLists.txt +++ b/tests/ttnn/unit_tests/gtests/CMakeLists.txt @@ 
-11,6 +11,7 @@ set(TTNN_CCL_UNIT_TESTS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/ccl/test_erisc_data_move set(TTNN_TENSOR_UNIT_TESTS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/tensor/test_create_tensor.cpp) add_executable(unit_tests_ttnn ${TTNN_UNIT_TESTS_SRC}) +TT_ENABLE_UNITY_BUILD(unit_tests_ttnn) add_executable(unit_tests_ttnn_ccl ${TTNN_CCL_UNIT_TESTS_SRC}) add_executable(unit_tests_ttnn_tensor ${TTNN_TENSOR_UNIT_TESTS_SRC}) add_executable(test_multi_device ${CMAKE_CURRENT_SOURCE_DIR}/test_multi_device.cpp) diff --git a/tests/ttnn/unit_tests/gtests/test_add.cpp b/tests/ttnn/unit_tests/gtests/test_add.cpp index 0f4c2ef5d3a..c1be54118a6 100644 --- a/tests/ttnn/unit_tests/gtests/test_add.cpp +++ b/tests/ttnn/unit_tests/gtests/test_add.cpp @@ -31,10 +31,10 @@ TEST_P(Add1DTensorAndScalarFixture, AddsScalarCorrectly) { ttnn::Shape shape(dimensions); { - const auto input_tensor = ttnn::zeros(shape, ttnn::bfloat16, ttnn::TILE_LAYOUT, device); + const auto input_tensor = ttnn::zeros(shape, DataType::BFLOAT16, ttnn::TILE_LAYOUT, device); const auto output_tensor = input_tensor + param.scalar; const auto expected_tensor = - ttnn::operations::creation::full(shape, param.scalar, ttnn::bfloat16, ttnn::TILE_LAYOUT, device); + ttnn::operations::creation::full(shape, param.scalar, DataType::BFLOAT16, ttnn::TILE_LAYOUT, device); TT_FATAL(ttnn::numpy::allclose<::bfloat16>(ttnn::from_device(expected_tensor), ttnn::from_device(output_tensor)), "Error"); } ttnn::close_device(device); diff --git a/tests/ttnn/unit_tests/gtests/test_graph_add.cpp b/tests/ttnn/unit_tests/gtests/test_graph_add.cpp index 8e019a5e95d..5903126db3a 100644 --- a/tests/ttnn/unit_tests/gtests/test_graph_add.cpp +++ b/tests/ttnn/unit_tests/gtests/test_graph_add.cpp @@ -47,8 +47,8 @@ TEST_P(AddOpGraphTestFixture, AddGraphTrace) { auto run_mode = std::get<1>(param_combination); { - const auto input_tensor_a = ttnn::zeros(params.a_Shape, ttnn::bfloat16, ttnn::TILE_LAYOUT, this->getDevice(), params.memory_config); - const auto input_tensor_b = ttnn::zeros(params.b_Shape, ttnn::bfloat16, ttnn::TILE_LAYOUT, this->getDevice(), params.memory_config); + const auto input_tensor_a = ttnn::zeros(params.a_Shape, DataType::BFLOAT16, ttnn::TILE_LAYOUT, this->getDevice(), params.memory_config); + const auto input_tensor_b = ttnn::zeros(params.b_Shape, DataType::BFLOAT16, ttnn::TILE_LAYOUT, this->getDevice(), params.memory_config); auto call = [&] { const auto output_tensor = ttnn::add(input_tensor_a, input_tensor_b); diff --git a/tests/ttnn/unit_tests/gtests/test_multi_device.cpp b/tests/ttnn/unit_tests/gtests/test_multi_device.cpp index 6d6863305f6..7404a7528bb 100644 --- a/tests/ttnn/unit_tests/gtests/test_multi_device.cpp +++ b/tests/ttnn/unit_tests/gtests/test_multi_device.cpp @@ -29,7 +29,7 @@ Tensor create_host_multi_device_tensor(const Tensor& tensor, const ReplicateTens TEST_F(T3kMultiDeviceFixture, TestGetTensorsFromMultiDeviceStorage) { MeshDevice* mesh_device = this->mesh_device_.get(); - const auto input_tensor = ttnn::ones(ttnn::Shape(std::array{32, 32}), ttnn::bfloat16); + const auto input_tensor = ttnn::ones(ttnn::Shape(std::array{32, 32}), DataType::BFLOAT16); const auto replicated_tensor = create_host_multi_device_tensor(input_tensor, ReplicateTensor(8)); const auto device_tensors = get_tensors_from_multi_device_storage(replicated_tensor); @@ -38,7 +38,7 @@ TEST_F(T3kMultiDeviceFixture, TestGetTensorsFromMultiDeviceStorage) { TEST_F(T3kMultiDeviceFixture, TestGetDistributedTensorConfigFromMultiDeviceStorage) { MeshDevice* mesh_device = 
this->mesh_device_.get(); - const auto input_tensor = ttnn::ones(ttnn::Shape(std::array{32, 32}), ttnn::bfloat16); + const auto input_tensor = ttnn::ones(ttnn::Shape(std::array{32, 32}), DataType::BFLOAT16); const auto replicated_tensor = create_host_multi_device_tensor(input_tensor, ReplicateTensor(8)); const auto distributed_tensor_config = get_distributed_tensor_config_from_tensor(replicated_tensor); diff --git a/tests/ttnn/unit_tests/gtests/ttnn_multi_command_queue_fixture.hpp b/tests/ttnn/unit_tests/gtests/ttnn_multi_command_queue_fixture.hpp index 463cd8d1972..debbce04437 100644 --- a/tests/ttnn/unit_tests/gtests/ttnn_multi_command_queue_fixture.hpp +++ b/tests/ttnn/unit_tests/gtests/ttnn_multi_command_queue_fixture.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include "gtest/gtest.h" #include "tt_metal/host_api.hpp" #include "tt_metal/test_utils/env_vars.hpp" diff --git a/tests/ttnn/unit_tests/gtests/ttnn_test_fixtures.hpp b/tests/ttnn/unit_tests/gtests/ttnn_test_fixtures.hpp index 6e517ccdaab..4b4bd0eecdd 100644 --- a/tests/ttnn/unit_tests/gtests/ttnn_test_fixtures.hpp +++ b/tests/ttnn/unit_tests/gtests/ttnn_test_fixtures.hpp @@ -1,9 +1,10 @@ -#include - // SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. // // SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include #include #include #include diff --git a/ttnn/CMakeLists.txt b/ttnn/CMakeLists.txt index c4ab3a8485e..388331a3316 100644 --- a/ttnn/CMakeLists.txt +++ b/ttnn/CMakeLists.txt @@ -645,6 +645,7 @@ target_include_directories(ttnn PUBLIC ${TTNN_PUBLIC_INCLUDE_DIRS}) target_link_libraries(ttnn PUBLIC ${TTNN_PUBLIC_LINK_LIBRARIES}) target_link_directories(ttnn PUBLIC ${TTNN_PUBLIC_LINK_DIRS}) target_precompile_headers(ttnn PRIVATE ${TTNN_PRECOMPILED_HEADERS}) +TT_ENABLE_UNITY_BUILD(ttnn) #We move the library binaries to a different path rather than PROJECT_BINARY_DIR #in the Python wheel diff --git a/ttnn/cpp/pybind11/operations/creation.hpp b/ttnn/cpp/pybind11/operations/creation.hpp index ce6f2a44d55..2f10cf149ec 100644 --- a/ttnn/cpp/pybind11/operations/creation.hpp +++ b/ttnn/cpp/pybind11/operations/creation.hpp @@ -323,7 +323,7 @@ void bind_arange_operation(py::module& module, const creation_operation_t& opera py::arg("start") = 0, py::arg("end"), py::arg("step") = 1, - py::arg("dtype") = ttnn::bfloat16, + py::arg("dtype") = DataType::BFLOAT16, py::arg("device") = std::nullopt, py::arg("memory_config") = ttnn::DRAM_MEMORY_CONFIG}); } diff --git a/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.hpp b/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.hpp index d15841d1442..a085a650432 100644 --- a/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include "ttnn/decorators.hpp" #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp" diff --git a/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp b/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp index 5faf89bd0a6..fed321d9712 100644 --- a/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/ccl_common.cpp @@ -147,8 +147,8 @@ void generate_edm_kernels_for_ring_or_linear_topology( std::vector const& counter_clockwise_edm_builders, std::optional receiver_device_id, std::optional sender_device_id) { - auto sender_noc = detail::GetPreferredNOCForDRAMRead(tt::Cluster::instance().arch()); - auto 
receiver_noc = detail::GetPreferredNOCForDRAMWrite(tt::Cluster::instance().arch()); + auto sender_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMRead(tt::Cluster::instance().arch()); + auto receiver_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMWrite(tt::Cluster::instance().arch()); uint32_t sender_socket_idx = 0; uint32_t receiver_socket_idx = 0; if (receiver_device_id == sender_device_id) { diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp index 6727c3a2349..0924001d006 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp @@ -88,7 +88,9 @@ operation::ProgramWithCallbacks ReduceScatter::create_program( this->user_defined_num_buffers_per_channel); } -static ttnn::operations::binary::BinaryOpType convert_reduce_type_to_eltwise_type(ttnn::operations::reduction::ReduceType reduce_op) { +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { +ttnn::operations::binary::BinaryOpType convert_reduce_type_to_eltwise_type(ttnn::operations::reduction::ReduceType reduce_op) { // Leaving switch statement for future support of additional types. switch (reduce_op) { case ttnn::operations::reduction::ReduceType::Sum: @@ -98,6 +100,8 @@ static ttnn::operations::binary::BinaryOpType convert_reduce_type_to_eltwise_typ return ttnn::operations::binary::BinaryOpType::ADD; } } +} +} namespace operations{ namespace ccl{ @@ -110,6 +114,7 @@ Tensor reduce_scatter( ttnn::ccl::Topology topology, const std::optional user_defined_num_workers, const std::optional user_defined_num_buffers_per_channel) { + using namespace CMAKE_UNIQUE_NAMESPACE; ttnn::operations::binary::BinaryOpType binary_op_type = convert_reduce_type_to_eltwise_type(math_op); TT_FATAL(std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr, "reduce_scatter op is only supported for Fast Dispatch"); @@ -162,6 +167,7 @@ Tensor reduce_scatter( ttnn::ccl::Topology topology, const std::optional user_defined_num_workers, const std::optional user_defined_num_buffers_per_channel) { + using namespace CMAKE_UNIQUE_NAMESPACE; ttnn::operations::binary::BinaryOpType binary_op_type = convert_reduce_type_to_eltwise_type(reduce_op); TT_FATAL(topology == ttnn::ccl::Topology::Linear, "This all_gather API with cluster_axis is currently supported only for the Linear topology"); diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp index dc7295a0d75..9573ff0595c 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp @@ -17,6 +17,8 @@ namespace conv2d { using namespace tt; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { const uint32_t act_cb = CB::c_in0; const uint32_t weight_cb = CB::c_in1; const uint32_t bias_cb = CB::c_in2; @@ -30,6 +32,8 @@ const uint32_t tilize_mode_tilized_act_cb = CB::c_intermed1; const uint32_t untilize_mode_reblock_cb = CB::c_intermed2; const uint32_t out0_cb = CB::c_out0; const uint32_t temp_sum_cb = CB::c_intermed3; +} +} operation::ProgramWithCallbacks multi_core_optimized_conv_width_sharded_v2_impl( @@ -82,6 +86,8 @@ std::tuple create_CBs_for_sharded_input_v2( bool fp32_dest_acc_en, bool packer_l1_acc_en, bool use_non_tile_height) { + using namespace CMAKE_UNIQUE_NAMESPACE; + 
tt::DataFormat interm0_df = packer_l1_acc_en ? (fp32_dest_acc_en ? tt::DataFormat::Float32 : tt::DataFormat::Float16_b) : out_df; @@ -260,6 +266,7 @@ std::tuple create_CBs_for_depthwise_sharded_input( bool split_reader, bool fp32_dest_acc_en, bool packer_l1_acc_en) { + using namespace CMAKE_UNIQUE_NAMESPACE; tt::DataFormat interm0_df = packer_l1_acc_en ? (fp32_dest_acc_en ? tt::DataFormat::Float32 : tt::DataFormat::Float16_b) : out_df; @@ -355,6 +362,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( bool enable_split_reader, bool enable_subblock_padding, bool use_non_tile_height) { + using namespace CMAKE_UNIQUE_NAMESPACE; bool pass = true; tt_metal::Device* device = a.device(); TT_FATAL(a.get_layout() == Layout::ROW_MAJOR, "Conv activation should be in row major layout"); diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp index 9cc49a47f22..ebde538c36a 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp @@ -19,19 +19,6 @@ namespace conv2d { using namespace tt; -const uint32_t act_cb = CB::c_in0; -const uint32_t weight_cb = CB::c_in1; -const uint32_t bias_cb = CB::c_in2; -const uint32_t sharded_act_cb = CB::c_in3; -const uint32_t cb_for_reader_indices = CB::c_in4; -const uint32_t cb_for_l1_array = CB::c_in5; -const uint32_t act_cb_row_major_bfloat16 = CB::c_in6; -const uint32_t act_cb_second_reader = CB::c_in7; -const uint32_t matmul_partials_cb = CB::c_intermed0; -const uint32_t tilize_mode_tilized_act_cb = CB::c_intermed1; -const uint32_t untilize_mode_reblock_cb = CB::c_intermed2; -const uint32_t out0_cb = CB::c_out0; - operation::ProgramWithCallbacks multi_core_optimized_conv_width_sharded_v2_impl( tt_metal::Program& program, const Tensor& a, @@ -54,6 +41,20 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_width_sharded_v2_impl( bool enable_act_double_buffer, bool enable_split_reader, bool enable_subblock_padding) { + + const uint32_t act_cb = CB::c_in0; + const uint32_t weight_cb = CB::c_in1; + const uint32_t bias_cb = CB::c_in2; + const uint32_t sharded_act_cb = CB::c_in3; + const uint32_t cb_for_reader_indices = CB::c_in4; + const uint32_t cb_for_l1_array = CB::c_in5; + const uint32_t act_cb_row_major_bfloat16 = CB::c_in6; + const uint32_t act_cb_second_reader = CB::c_in7; + const uint32_t matmul_partials_cb = CB::c_intermed0; + const uint32_t tilize_mode_tilized_act_cb = CB::c_intermed1; + const uint32_t untilize_mode_reblock_cb = CB::c_intermed2; + const uint32_t out0_cb = CB::c_out0; + bool pass = true; enable_split_reader = false; tt_metal::Device* device = a.device(); diff --git a/ttnn/cpp/ttnn/operations/creation.hpp b/ttnn/cpp/ttnn/operations/creation.hpp index 55f2a81d96e..2c82325a172 100644 --- a/ttnn/cpp/ttnn/operations/creation.hpp +++ b/ttnn/cpp/ttnn/operations/creation.hpp @@ -67,7 +67,7 @@ inline ttnn::Tensor full_impl( std::optional optional_output_tensor = std::nullopt) { Device* device = optional_output_tensor.has_value() ? optional_output_tensor.value().device() : device_arg.has_value() ? &(device_arg.value().get()) : nullptr; Layout layout_value = optional_output_tensor.has_value() ? optional_output_tensor.value().get_layout() : layout.value_or(ttnn::ROW_MAJOR_LAYOUT); - DataType dtype_value = optional_output_tensor.has_value() ? 
optional_output_tensor.value().get_dtype() : dtype.value_or(ttnn::bfloat16); + DataType dtype_value = optional_output_tensor.has_value() ? optional_output_tensor.value().get_dtype() : dtype.value_or(DataType::BFLOAT16); tt::tt_metal::LegacyShape shape_value = optional_output_tensor.has_value() ? optional_output_tensor.value().get_legacy_shape() : shape.value; MemoryConfig mem_cfg = optional_output_tensor.has_value() ? optional_output_tensor.value().memory_config() : memory_config.value_or(ttnn::DRAM_MEMORY_CONFIG); return numpy::full_impl( @@ -330,7 +330,7 @@ struct FullLike { struct Arange { static ttnn::Tensor invoke( const int64_t stop, - const DataType dtype = ttnn::bfloat16, + const DataType dtype = DataType::BFLOAT16, const std::optional>& device = std::nullopt, const MemoryConfig& memory_config = ttnn::DRAM_MEMORY_CONFIG) { return Arange::invoke(0, stop, 1, dtype, device, memory_config); @@ -340,20 +340,20 @@ struct Arange { const int64_t start, const int64_t stop, const int64_t step = 1, - const DataType dtype = ttnn::bfloat16, + const DataType dtype = ttnn::DataType::BFLOAT16, const std::optional>& device_arg = std::nullopt, const MemoryConfig& memory_config = ttnn::DRAM_MEMORY_CONFIG) { Device* device = device_arg.has_value() ? &(device_arg.value().get()) : nullptr; switch (dtype) { - case ttnn::bfloat16: + case DataType::BFLOAT16: return numpy::arange<::bfloat16>(start, stop, step, ttnn::ROW_MAJOR_LAYOUT, device, memory_config); - case ttnn::float32: + case DataType::FLOAT32: return numpy::arange(start, stop, step, ttnn::ROW_MAJOR_LAYOUT, device, memory_config); - case ttnn::uint16: + case DataType::UINT16: return numpy::arange(start, stop, step, ttnn::ROW_MAJOR_LAYOUT, device, memory_config); - case ttnn::uint32: + case DataType::UINT32: return numpy::arange(start, stop, step, ttnn::ROW_MAJOR_LAYOUT, device, memory_config); - case ttnn::int32: + case DataType::INT32: return numpy::arange(start, stop, step, ttnn::ROW_MAJOR_LAYOUT, device, memory_config); default: TT_THROW("Unsupported dtype"); } diff --git a/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_device_operation.hpp b/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_device_operation.hpp index 434e8db04cb..9675b95c63b 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_device_operation.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include "ttnn/decorators.hpp" #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp" diff --git a/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.hpp b/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.hpp index 57b2a50748c..4b53cccebb0 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include "ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_device_operation.hpp" #include "tt_metal/common/work_split.hpp" #include "tt_metal/detail/util.hpp" diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_op.cpp index f320b2a3d52..bb53c8bee61 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_op.cpp +++ 
b/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_op.cpp @@ -18,8 +18,8 @@ uint32_t get_num_cores(CoreCoord grid_size, uint32_t nblocks) { if (nblocks <= ncores) { ncores = nblocks; } else { - uint32_t nblocks_per_core = ceil((float)nblocks / ncores); - ncores = ceil((float)nblocks / nblocks_per_core); + uint32_t nblocks_per_core = std::ceil((float)nblocks / ncores); + ncores = std::ceil((float)nblocks / nblocks_per_core); } return ncores; } @@ -47,7 +47,7 @@ void Untilize::validate(const std::vector& input_tensors) const { TT_FATAL(this->output_mem_config.memory_layout == TensorMemoryLayout::HEIGHT_SHARDED, "Error"); uint32_t ntiles = input_tensor_a.volume() / TILE_HW; uint32_t ntiles_per_block = input_tensor_a.get_legacy_shape()[-1] / TILE_WIDTH; - uint32_t nblocks = ceil((float)ntiles / ntiles_per_block); + uint32_t nblocks = std::ceil((float)ntiles / ntiles_per_block); auto num_cores = untilize_helpers::get_num_cores(input_tensor_a.device()->compute_with_storage_grid_size(), nblocks); uint32_t fused_height = input_tensor_a.volume() / input_tensor_a.get_legacy_shape()[-1]; @@ -82,7 +82,7 @@ std::vector Untilize::create_output_tensors( } else { uint32_t ntiles = input_tensor.volume() / TILE_HW; uint32_t ntiles_per_block = input_tensor.get_legacy_shape()[-1] / TILE_WIDTH; - uint32_t nblocks = ceil((float)ntiles / ntiles_per_block); + uint32_t nblocks = std::ceil((float)ntiles / ntiles_per_block); auto num_cores = untilize_helpers::get_num_cores(input_tensor.device()->compute_with_storage_grid_size(), nblocks); auto shard_grid = diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp index 22826b7cd4d..f746b2fe105 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp @@ -287,7 +287,7 @@ operation::ProgramWithCallbacks untilize_multi_core( uint32_t ntiles = a.volume() / TILE_HW; uint32_t stick_s = a.get_legacy_shape()[-1]; uint32_t ntiles_per_block = a.get_legacy_shape()[-1] / TILE_WIDTH; - uint32_t nblocks = ceil((float)ntiles / ntiles_per_block); + uint32_t nblocks = std::ceil((float)ntiles / ntiles_per_block); uint32_t block_size_nbytes = a.get_legacy_shape()[-1] * output.element_size(); uint32_t max_l1_size = a.device()->l1_size_per_core() / 2 - a.device()->get_base_allocator_addr(HalMemType::L1); diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_op.cpp index 92d31327d3a..8c3f07035de 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_op.cpp @@ -41,7 +41,7 @@ std::vector UntilizeWithHaloV2::compute_output_shapes // output_shape[1] remains same // output_shape[2] changes // output_shape[3] remains same - output_shape[2] = (uint32_t)ceil((float)total_nsticks / nbatch); + output_shape[2] = (uint32_t)std::ceil((float)total_nsticks / nbatch); log_debug( tt::LogOp, "output_shape: [{} {} {} {}]", output_shape[0], output_shape[1], output_shape[2], output_shape[3]); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_and_width_multi_core_program_factory.cpp 
b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_and_width_multi_core_program_factory.cpp index de27f41453a..ef69cacc9d0 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_and_width_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_and_width_multi_core_program_factory.cpp @@ -17,6 +17,8 @@ namespace ttnn::operations::binary { +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { static const BcastOpMath binary_op_type_to_bcast_op_math(const BinaryOpType binary_op_type) { switch (binary_op_type) { case BinaryOpType::ADD: return BcastOpMath::ADD; @@ -25,6 +27,8 @@ static const BcastOpMath binary_op_type_to_bcast_op_math(const BinaryOpType bina default: TT_THROW("BinaryOpType cannot be mapped to BcastOpMath"); } } +} +} BinaryDeviceOperation::BroadcastHeightAndWidthMultiCore::cached_program_t BinaryDeviceOperation::BroadcastHeightAndWidthMultiCore::create( @@ -34,6 +38,7 @@ BinaryDeviceOperation::BroadcastHeightAndWidthMultiCore::create( using namespace tt; using namespace tt::tt_metal; using namespace tt::constants; + using namespace CMAKE_UNIQUE_NAMESPACE; const auto& a = tensor_args.input_tensor_a; const auto& b = tensor_args.input_tensor_b; diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_program_factory.cpp index ac00c26f9a3..91def43dade 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_program_factory.cpp @@ -15,6 +15,8 @@ namespace ttnn::operations::binary { +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { static const BcastOpMath binary_op_type_to_bcast_op_math(const BinaryOpType binary_op_type) { switch (binary_op_type) { case BinaryOpType::ADD: return BcastOpMath::ADD; @@ -23,6 +25,8 @@ static const BcastOpMath binary_op_type_to_bcast_op_math(const BinaryOpType bina default: TT_THROW("BinaryOpType cannot be mapped to BcastOpMath"); } } +} +} BinaryDeviceOperation::BroadcastHeightMultiCore::cached_program_t BinaryDeviceOperation ::BroadcastHeightMultiCore::create( @@ -32,6 +36,7 @@ BinaryDeviceOperation ::BroadcastHeightMultiCore::create( using namespace tt; using namespace tt::tt_metal; using namespace tt::constants; + using namespace CMAKE_UNIQUE_NAMESPACE; const auto& a = tensor_args.input_tensor_a; const auto& b = tensor_args.input_tensor_b; diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_sharded_optimized_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_sharded_optimized_program_factory.cpp index f28c4e01db1..b52c1dd8bbf 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_sharded_optimized_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_sharded_optimized_program_factory.cpp @@ -15,6 +15,8 @@ namespace ttnn::operations::binary { +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { static const BcastOpMath binary_op_type_to_bcast_op_math(const BinaryOpType binary_op_type) { switch (binary_op_type) { case BinaryOpType::ADD: return BcastOpMath::ADD; @@ -23,6 +25,8 @@ static const BcastOpMath binary_op_type_to_bcast_op_math(const BinaryOpType bina default: TT_THROW("BinaryOpType cannot be mapped to BcastOpMath"); } } +} +} 
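The four broadcast program factories each carry a byte-identical static helper, binary_op_type_to_bcast_op_math, and the hunks here wrap every copy in the per-file namespace. The sketch below shows why static alone stops working under a unity build; the merged file name unity_merged.cxx is hypothetical, standing in for whatever unity source CMake generates.

// unity_merged.cxx -- conceptually what a unity build produces:
//   #include "broadcast_height_multi_core_program_factory.cpp"
//   #include "broadcast_width_multi_core_program_factory.cpp"
//
// Both files define, at namespace scope:
//   static const BcastOpMath binary_op_type_to_bcast_op_math(BinaryOpType);
// static only hides a symbol from *other* translation units; once the two
// files share one merged unit, the second definition is a redefinition error.
// Wrapped in namespace { namespace CMAKE_UNIQUE_NAMESPACE { ... } }, each copy
// gets a distinct fully-qualified name, and each create() brings its own copy
// into scope with: using namespace CMAKE_UNIQUE_NAMESPACE;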
BinaryDeviceOperation::BroadcastHeightMultiCoreShardedOptimized::cached_program_t BinaryDeviceOperation::BroadcastHeightMultiCoreShardedOptimized::create( @@ -32,6 +36,7 @@ BinaryDeviceOperation::BroadcastHeightMultiCoreShardedOptimized::create( using namespace tt; using namespace tt::tt_metal; using namespace tt::constants; + using namespace CMAKE_UNIQUE_NAMESPACE; const auto& a = tensor_args.input_tensor_a; const auto& b = tensor_args.input_tensor_b; diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_width_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_width_multi_core_program_factory.cpp index 82c93081909..62e88253d59 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_width_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_width_multi_core_program_factory.cpp @@ -15,6 +15,8 @@ namespace ttnn::operations::binary { +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { static const BcastOpMath binary_op_type_to_bcast_op_math(const BinaryOpType binary_op_type) { switch (binary_op_type) { case BinaryOpType::ADD: return BcastOpMath::ADD; @@ -23,6 +25,8 @@ static const BcastOpMath binary_op_type_to_bcast_op_math(const BinaryOpType bina default: TT_THROW("BinaryOpType cannot be mapped to BcastOpMath"); } } +} +} BinaryDeviceOperation::BroadcastWidthMultiCore::cached_program_t BinaryDeviceOperation::BroadcastWidthMultiCore::create( const operation_attributes_t& operation_attributes, @@ -31,6 +35,7 @@ BinaryDeviceOperation::BroadcastWidthMultiCore::cached_program_t BinaryDeviceOpe using namespace tt; using namespace tt::tt_metal; using namespace tt::constants; + using namespace CMAKE_UNIQUE_NAMESPACE; const auto& a = tensor_args.input_tensor_a; const auto& b = tensor_args.input_tensor_b; diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.cpp index e9d4c2d84e9..e2f771f37f6 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.cpp @@ -50,8 +50,8 @@ UnaryShardedProgramFactory::cached_program_t UnaryShardedProgramFactory::create( uint32_t num_tile_per_core = 0; if (input.get_dtype() == DataType::BFLOAT8_B) { - uint32_t ntiles_along_width = ceil(shard_spec.shape[1] / (float) tt::constants::TILE_WIDTH); - uint32_t ntiles_along_height = ceil(shard_spec.shape[0] / (float) tt::constants::TILE_HEIGHT); + uint32_t ntiles_along_width = std::ceil(shard_spec.shape[1] / (float) tt::constants::TILE_WIDTH); + uint32_t ntiles_along_height = std::ceil(shard_spec.shape[0] / (float) tt::constants::TILE_HEIGHT); num_tile_per_core = ntiles_along_width * ntiles_along_height; } else { TT_FATAL( diff --git a/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp b/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp index f35d2793889..b925b9bf8e5 100644 --- a/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp @@ -133,9 +133,9 @@ operation::ProgramWithCallbacks embeddings_tilized( auto cb_output = tt_metal::CreateCircularBuffer(program, all_cores, cb_output_config); bool input_stick_size_is_power_of_two = is_power_of_two_at_least_32(input_page_size); - uint32_t input_log2_stick_size = input_stick_size_is_power_of_two ? 
(std::uint32_t)log2(input_page_size) : 0; + uint32_t input_log2_stick_size = input_stick_size_is_power_of_two ? (std::uint32_t)std::log2(input_page_size) : 0; bool weight_stick_size_is_power_of_two = is_power_of_two_at_least_32(weight_page_size); - uint32_t weight_log2_stick_size = weight_stick_size_is_power_of_two ? (std::uint32_t)log2(weight_page_size) : 0; + uint32_t weight_log2_stick_size = weight_stick_size_is_power_of_two ? (std::uint32_t)std::log2(weight_page_size) : 0; // Create Kernels // reader @@ -370,9 +370,9 @@ operation::ProgramWithCallbacks embeddings_rm( uint32_t output_cb_index = src0_cb_index; bool input_stick_size_is_power_of_two = is_power_of_two_at_least_32(input_page_size); - uint32_t input_log2_stick_size = input_stick_size_is_power_of_two ? (std::uint32_t)log2(input_page_size) : 0; + uint32_t input_log2_stick_size = input_stick_size_is_power_of_two ? (std::uint32_t)std::log2(input_page_size) : 0; bool weight_stick_size_is_power_of_two = is_power_of_two_at_least_32(weight_page_size); - uint32_t weight_log2_stick_size = weight_stick_size_is_power_of_two ? (std::uint32_t)log2(weight_page_size) : 0; + uint32_t weight_log2_stick_size = weight_stick_size_is_power_of_two ? (std::uint32_t)std::log2(weight_page_size) : 0; // Create Kernels // reader @@ -407,7 +407,7 @@ operation::ProgramWithCallbacks embeddings_rm( embedding_defines)); bool output_stick_size_is_power_of_two = is_power_of_two_at_least_32(output_page_size); - uint32_t output_log2_stick_size = output_stick_size_is_power_of_two ? (std::uint32_t)log2(output_page_size) : 0; + uint32_t output_log2_stick_size = output_stick_size_is_power_of_two ? (std::uint32_t)std::log2(output_page_size) : 0; std::vector writer_compile_time_args = { (std::uint32_t)output_cb_index, (std::uint32_t)out_is_dram, diff --git a/ttnn/cpp/ttnn/operations/embedding_backward/device/embedding_backward_program_factory.cpp b/ttnn/cpp/ttnn/operations/embedding_backward/device/embedding_backward_program_factory.cpp index 171a6ab7266..6d08f349266 100644 --- a/ttnn/cpp/ttnn/operations/embedding_backward/device/embedding_backward_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/embedding_backward/device/embedding_backward_program_factory.cpp @@ -105,7 +105,7 @@ operation::ProgramWithCallbacks embedding_backward_multi_core( // reader bool index_stick_size_is_power_of_two = is_power_of_two_at_least_32(index_page_size); - uint32_t index_log2_stick_size = index_stick_size_is_power_of_two ? log2(index_page_size) : 0; + uint32_t index_log2_stick_size = index_stick_size_is_power_of_two ? std::log2(index_page_size) : 0; std::vector reader_compile_time_args = { grad_is_dram, diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce/device/all_reduce_op.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce/device/all_reduce_op.cpp index f630d8de757..f64a761ff8e 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce/device/all_reduce_op.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_reduce/device/all_reduce_op.cpp @@ -46,7 +46,9 @@ operation::ProgramWithCallbacks AllReduce::create_program( this->user_defined_num_buffers_per_channel); } -static ttnn::operations::binary::BinaryOpType convert_reduce_type_to_eltwise_type(ttnn::operations::reduction::ReduceType reduce_op) { +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { +ttnn::operations::binary::BinaryOpType convert_reduce_type_to_eltwise_type(ttnn::operations::reduction::ReduceType reduce_op) { // Leaving switch statement for future support of additional types. 
switch (reduce_op) { case ttnn::operations::reduction::ReduceType::Sum: @@ -56,6 +58,8 @@ static ttnn::operations::binary::BinaryOpType convert_reduce_type_to_eltwise_typ return ttnn::operations::binary::BinaryOpType::ADD; } } +} +} namespace operations{ namespace experimental{ @@ -174,6 +178,7 @@ Tensor all_reduce( ttnn::ccl::Topology topology, const std::optional user_defined_num_workers, const std::optional user_defined_num_buffers_per_channel) { + using namespace CMAKE_UNIQUE_NAMESPACE; ttnn::operations::binary::BinaryOpType binary_op_type = convert_reduce_type_to_eltwise_type(math_op); TT_FATAL(std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr, "All Reduce op is only supported for Fast Dispatch"); TT_FATAL(topology == ttnn::ccl::Topology::Ring, "All Reduce op is currently supported only on Ring topology"); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp index e1b2660ef51..50a9012ceb7 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp @@ -13,8 +13,6 @@ #include "tt_metal/detail/util.hpp" #include "tt_metal/host_api.hpp" -using namespace tt::constants; - namespace tt { namespace tt_metal { @@ -27,6 +25,8 @@ operation::ProgramWithCallbacks rotary_embedding_multi_core( std::optional token_idx, ttnn::DeviceComputeKernelConfig compute_kernel_config ) { + using namespace tt::constants; + Program program{}; tt::DataFormat input_cb_data_format = tt_metal::datatype_to_dataformat_converter(input.get_dtype()); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_device_operation.cpp index 401227158f0..cfc5d3970d1 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_device_operation.cpp @@ -8,13 +8,12 @@ #include "tt_metal/common/constants.hpp" #include "tt_metal/host_api.hpp" -using namespace tt::constants; - namespace tt { namespace tt_metal { void RotaryEmbeddingLlama::validate(const std::vector& input_tensors) const { + using namespace tt::constants; const auto& input_tensor = input_tensors.at(0); const auto& cos = input_tensors.at(1); const auto& sin = input_tensors.at(2); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_program_factory.cpp index 0f1cd8a6131..6c075f6b7c8 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_program_factory.cpp @@ -10,8 +10,6 @@ #include "tt_metal/detail/util.hpp" #include "tt_metal/host_api.hpp" -using namespace tt::constants; - namespace tt { namespace tt_metal { @@ -24,6 +22,7 @@ operation::ProgramWithCallbacks 
 rotary_embedding_llama_multi_core(
     Tensor &output,
     ttnn::DeviceComputeKernelConfig compute_kernel_config
 ) {
+    using namespace tt::constants;
     Program program{};

     const tt::DataFormat input_cb_data_format = tt_metal::datatype_to_dataformat_converter(input.get_dtype());
diff --git a/ttnn/cpp/ttnn/operations/full/device/full_device_operation.hpp b/ttnn/cpp/ttnn/operations/full/device/full_device_operation.hpp
index 1a12849b814..a7efe354c34 100644
--- a/ttnn/cpp/ttnn/operations/full/device/full_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/full/device/full_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include

 #include "ttnn/decorators.hpp"
diff --git a/ttnn/cpp/ttnn/operations/full_like/device/full_like_device_operation.hpp b/ttnn/cpp/ttnn/operations/full_like/device/full_like_device_operation.hpp
index c4c59a724d1..7b8341ef7db 100644
--- a/ttnn/cpp/ttnn/operations/full_like/device/full_like_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/full_like/device/full_like_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include
 #include
diff --git a/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.hpp b/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.hpp
index 7ed3cb413c3..757c88962b2 100644
--- a/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_device_operation.hpp
@@ -1,6 +1,9 @@
 // SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
 #include
 #include
diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp
index 32eb87cd13a..93e36fee7f2 100644
--- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp
+++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp
@@ -164,7 +164,7 @@ struct Matmul {
     const bool user_run_batched = false;
     const bool transpose_a = false;
     const bool transpose_b = false;
-    const std::optional output_tile;
+    const std::optional output_tile;

     void validate(
         const std::vector &input_tensors,
diff --git a/ttnn/cpp/ttnn/operations/matmul/matmul.cpp b/ttnn/cpp/ttnn/operations/matmul/matmul.cpp
index 227ab4af119..c2d14cddedc 100644
--- a/ttnn/cpp/ttnn/operations/matmul/matmul.cpp
+++ b/ttnn/cpp/ttnn/operations/matmul/matmul.cpp
@@ -104,7 +104,7 @@ Tensor MatmulOperation::invoke(
     const std::optional& activation,
     const std::optional compute_kernel_config,
     const std::optional core_grid,
-    const std::optional& output_tile) {
+    const std::optional& output_tile) {
     std::optional user_core_coord;
     if (core_grid.has_value()) {
         user_core_coord = CoreCoord(core_grid->x, core_grid->y);
@@ -142,7 +142,7 @@ Tensor LinearOperation::invoke(
     const std::optional& activation,
     const std::optional compute_kernel_config,
     const std::optional core_grid,
-    const std::optional& output_tile) {
+    const std::optional& output_tile) {
     std::optional user_core_coord;
     if (core_grid.has_value()) {
         user_core_coord = CoreCoord(core_grid->x, core_grid->y);
diff --git a/ttnn/cpp/ttnn/operations/matmul/matmul.hpp b/ttnn/cpp/ttnn/operations/matmul/matmul.hpp
index 8559de2d18d..eb450bd2896 100644
--- a/ttnn/cpp/ttnn/operations/matmul/matmul.hpp
+++ b/ttnn/cpp/ttnn/operations/matmul/matmul.hpp
@@ -47,7 +47,7 @@ struct MatmulOperation {
         const std::optional& activation = std::nullopt,
         const std::optional compute_kernel_config = std::nullopt,
         const std::optional core_grid = std::nullopt,
-        const std::optional& output_tile = std::nullopt);
+        const std::optional& output_tile = std::nullopt);
 };

 struct LinearOperation {
@@ -63,7 +63,7 @@ struct LinearOperation {
         const std::optional& activation = std::nullopt,
         const std::optional compute_kernel_config = std::nullopt,
         const std::optional core_grid = std::nullopt,
-        const std::optional& output_tile = std::nullopt);
+        const std::optional& output_tile = std::nullopt);
 };

 } // namespace matmul
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.hpp
index b39450d24ad..8795ae3e254 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include
 #include
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/moreh_arange_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/moreh_arange_device_operation.hpp
index 23540113725..a4eae4dd393 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/moreh_arange_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/moreh_arange_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"

 namespace ttnn::operations::moreh::moreh_arange {
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/moreh_cumsum_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/moreh_cumsum_device_operation.hpp
index 04d3f356647..0b4006a0cc7 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/moreh_cumsum_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/moreh_cumsum_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/device_operation.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_device_operation.hpp
index 92bef5f3ac0..5fe907df84f 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/device_operation.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_device_operation.hpp
index 34ae223deb8..f51df643baf 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include
 #include
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_device_operation.hpp
index 1cbae242656..0ca74ffcf42 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include
 #include
 #include
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_rm_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_rm_factory.cpp
index 8f74b1cbe4f..c0f7e7ae8f4 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_rm_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_rm_factory.cpp
@@ -5,12 +5,16 @@
 #include "moreh_getitem_device_operation.hpp"
 #include "ttnn/operations/moreh/moreh_helper_functions.hpp"

+namespace {
+namespace CMAKE_UNIQUE_NAMESPACE {
 struct IndexInfo {
     bool is_defined;
     bool is_dram;
     uint32_t address;
     uint32_t unit_size;
 };
+}
+}

 namespace ttnn::operations::moreh::moreh_getitem {

 MorehGetItemOperation::MorehGetItemRmFactory::cached_program_t MorehGetItemOperation::MorehGetItemRmFactory::create(
@@ -20,6 +24,7 @@ MorehGetItemOperation::MorehGetItemRmFactory::cached_program_t MorehGetItemOpera
     using namespace tt;
     using namespace tt::tt_metal;
     using namespace tt::operations::primary;
+    using namespace CMAKE_UNIQUE_NAMESPACE;

     auto input = tensor_args.input;
     auto index_tensors = tensor_args.index_tensors;
@@ -234,6 +239,7 @@ void MorehGetItemOperation::MorehGetItemRmFactory::override_runtime_arguments(
     const operation_attributes_t &operation_attributes,
     const tensor_args_t &tensor_args,
     tensor_return_value_t &tensor_return_value) {
+    using namespace CMAKE_UNIQUE_NAMESPACE;
     auto &program = cached_program.program;
     auto &reader_kernel_id = cached_program.shared_variables.unary_reader_kernel_id;
     auto &writer_kernel_id = cached_program.shared_variables.unary_writer_kernel_id;
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_factory.cpp
index 74749bc4636..e3c81d815c7 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_factory.cpp
@@ -8,12 +8,16 @@
 #include "moreh_getitem_device_operation.hpp"
 #include "ttnn/operations/moreh/moreh_helper_functions.hpp"

+namespace {
+namespace CMAKE_UNIQUE_NAMESPACE {
 struct IndexInfo {
     bool is_defined;
     bool is_dram;
     uint32_t address;
     uint32_t unit_size;
 };
+}
+}

 namespace ttnn::operations::moreh::moreh_getitem {

 MorehGetItemOperation::MorehGetItemTilizedFactory::cached_program_t
@@ -24,6 +28,7 @@ MorehGetItemOperation::MorehGetItemTilizedFactory::create(
     using namespace tt;
     using namespace tt::tt_metal;
     using namespace tt::operations::primary;
+    using namespace CMAKE_UNIQUE_NAMESPACE;

     auto input = tensor_args.input;
     auto index_tensors = tensor_args.index_tensors;
@@ -547,6 +552,7 @@ void MorehGetItemOperation::MorehGetItemTilizedFactory::override_runtime_argumen
     const operation_attributes_t &operation_attributes,
     const tensor_args_t &tensor_args,
     tensor_return_value_t &tensor_return_value) {
+    using namespace CMAKE_UNIQUE_NAMESPACE;
     auto &program = cached_program.program;
     auto &reader_kernel_id = cached_program.shared_variables.unary_reader_kernel_id;
     auto &writer_kernel_id = cached_program.shared_variables.unary_writer_kernel_id;
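Note on the `namespace { namespace CMAKE_UNIQUE_NAMESPACE { ... } }` wrapper, which recurs throughout this patch: under a unity build CMake concatenates several .cpp files into one generated translation unit, so two file-local helpers with the same name (such as the two `IndexInfo` structs above) would otherwise collide. An anonymous namespace alone does not help, because the merged TU has a single anonymous namespace. `CMAKE_UNIQUE_NAMESPACE` is the macro named by the `UNITY_BUILD_UNIQUE_ID` target property (CMake 3.20+, matching the minimum-version bump in this patch), which CMake defines to a different identifier for each source file. A minimal sketch, with illustrative names that are not from the repo:

#include <cstdint>

// CMake expands CMAKE_UNIQUE_NAMESPACE to something like unity_0x1a2b,
// unique per source file, before compiling each chunk of the unity TU.
// (Without the macro defined, this is simply a namespace with that literal
// name, so the sketch compiles either way.)
namespace {
namespace CMAKE_UNIQUE_NAMESPACE {
struct IndexInfo {
    bool is_defined;
    bool is_dram;
    std::uint32_t address;
    std::uint32_t unit_size;
};
}  // namespace CMAKE_UNIQUE_NAMESPACE
}  // namespace

void create_sketch() {  // hypothetical factory function
    using namespace CMAKE_UNIQUE_NAMESPACE;
    IndexInfo info{};   // unambiguous even after TU merging
    (void)info;
}

With this pattern, a second source file in the same unity batch can define its own `IndexInfo` in its own uniquely named inner namespace without triggering a redefinition error.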
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm/device/moreh_group_norm_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm/device/moreh_group_norm_device_operation.hpp
index 480aac7cf01..f69fdb3acad 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm/device/moreh_group_norm_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm/device/moreh_group_norm_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/device_operation.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/gamma_beta_grad/moreh_group_norm_backward_gamma_beta_grad_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/gamma_beta_grad/moreh_group_norm_backward_gamma_beta_grad_device_operation.hpp
index 4eda36fccbb..5b5fb240f15 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/gamma_beta_grad/moreh_group_norm_backward_gamma_beta_grad_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/gamma_beta_grad/moreh_group_norm_backward_gamma_beta_grad_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include
 #include
 #include
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/input_grad/moreh_group_norm_backward_input_grad_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/input_grad/moreh_group_norm_backward_input_grad_device_operation.hpp
index 160e3425aae..2d6bfa7450d 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/input_grad/moreh_group_norm_backward_input_grad_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/input_grad/moreh_group_norm_backward_input_grad_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/moreh_layer_norm_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/moreh_layer_norm_device_operation.hpp
index e7bef713e90..2c0334abc35 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/moreh_layer_norm_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/moreh_layer_norm_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/device_operation.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_gamma_beta_grad_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_gamma_beta_grad_device_operation.hpp
index cbceeaefb10..aec3c8a6bf4 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_gamma_beta_grad_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_gamma_beta_grad_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/device_operation.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_input_grad_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_input_grad_device_operation.hpp
index c3167583676..fdfb4621253 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_input_grad_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_input_grad_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/device_operation.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_device_operation.hpp
index 1ec9b4ddd74..55a88691a6a 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include

 #include "ttnn/decorators.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_device_operation.hpp
index 92d083eef03..743c2e1c162 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include
 #include
 #include
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_device_operation.hpp
index 376bd1759e6..54849e02229 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include
 #include
 #include
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.hpp
index 1b8b4da8ab2..ffa38825da0 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm_backward/device/moreh_norm_backward_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm_backward/device/moreh_norm_backward_device_operation.hpp
index a955818f626..ff8d5df97f6 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_norm_backward/device/moreh_norm_backward_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_norm_backward/device/moreh_norm_backward_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_device_operation.hpp
index b9bfd28c7b7..ababc16fd38 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/device_operation.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/moreh_softmax_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/moreh_softmax_device_operation.hpp
index 42ee2f45b6e..e8829852fdf 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/moreh_softmax_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/moreh_softmax_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/moreh_softmax_backward_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/moreh_softmax_backward_device_operation.hpp
index a6906e46ef3..68da88b09eb 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/moreh_softmax_backward_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/moreh_softmax_backward_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_device_operation.hpp
index a62db287a25..f3ce1416459 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include

 #include "ttnn/decorators.hpp"
diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/moreh_sum_backward_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/moreh_sum_backward_device_operation.hpp
index e848e61fa2b..9d5e86c3873 100644
--- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/moreh_sum_backward_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/moreh_sum_backward_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/device_operation.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
diff --git a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp
index 94f5706c5c7..e493824e0d1 100644
--- a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp
+++ b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp
@@ -18,6 +18,8 @@ using namespace tt::constants;

 namespace ttnn::operations::normalization {

+namespace {
+namespace CMAKE_UNIQUE_NAMESPACE {
 inline bool is_dram(const Tensor& input_tensor) { return input_tensor.memory_config().buffer_type == BufferType::DRAM; }
 inline bool is_dram(const std::optional input_tensor) {
     return input_tensor.has_value() ? is_dram(input_tensor.value()) : true;
@@ -111,6 +113,8 @@ std::pair find_max_tile_span(uint32_t W, uint32_t group_size
     return {max_tile_span, num_groups_before_start_again_at_tile_beginning};
 }
+}
+}

 operation::ProgramWithCallbacks groupnorm_multi_core_sharded(
     const Tensor &a,
@@ -126,6 +130,7 @@ operation::ProgramWithCallbacks groupnorm_multi_core_sharded(
     CoreCoord grid_size,
     bool inplace
 ) {
+    using namespace CMAKE_UNIQUE_NAMESPACE;
     if (gamma.has_value()) {
         TT_ASSERT(gamma.value().get_layout() == Layout::ROW_MAJOR);
     }
@@ -477,8 +482,8 @@ operation::ProgramWithCallbacks groupnorm_multi_core_sharded(
         (std::uint32_t) per_core_Mt,
         (std::uint32_t) TILE_HEIGHT
     };
-    tt::tt_metal::NOC reader_noc = detail::GetPreferredNOCForDRAMWrite(device->arch());
-    tt::tt_metal::NOC writer_noc = detail::GetPreferredNOCForDRAMRead(device->arch());
+    tt::tt_metal::NOC reader_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMWrite(device->arch());
+    tt::tt_metal::NOC writer_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMRead(device->arch());
     // reader kernel
     auto reader_mcast_sender_kernels_id = CreateKernel(
         program,
diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp
index d9e387165e4..affb95045eb 100644
--- a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp
+++ b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp
@@ -18,6 +18,8 @@ using namespace tt::constants;

 namespace ttnn::operations::normalization {

+namespace {
+namespace CMAKE_UNIQUE_NAMESPACE {
 inline bool is_dram(const Tensor& input_tensor) { return input_tensor.memory_config().buffer_type == BufferType::DRAM; }
 inline bool is_dram(const std::optional input_tensor) {
     return input_tensor.has_value() ? is_dram(input_tensor.value()) : true;
@@ -40,6 +42,8 @@ inline uint32_t pack_two_bfloat16_into_uint32(std::pair two_
     // second -> upper 16
     return (uint32_t)two_bfloats.first | ((uint32_t)two_bfloats.second << 16);
 }
+}
+}

 // computes layernorm(a+*b)*gamma + beta
 // if b is nullptr it's treated as zero (no addition)
@@ -53,6 +57,7 @@ operation::ProgramWithCallbacks layernorm_multi_core(
     float eps,
     DeviceComputeKernelConfig compute_kernel_config
 ) {
+    using namespace CMAKE_UNIQUE_NAMESPACE;
     bool rms_norm = norm_type == LayerNormType::RMSNORM;
     const auto shape = a.get_legacy_shape();
     uint32_t W = shape[-1], H = shape[-2];
@@ -192,7 +197,7 @@ operation::ProgramWithCallbacks layernorm_multi_core(
     bool gamma_stick_size_is_power_of_two = is_power_of_two_at_least_32(gamma_stick_size);
     reader_compile_time_args.push_back((std::uint32_t) gamma_stick_size_is_power_of_two);
     if (gamma_stick_size_is_power_of_two) {
-        uint32_t gamma_log2_stick_size = gamma_stick_size_is_power_of_two ? (std::uint32_t)log2(gamma_stick_size) : 0;
+        uint32_t gamma_log2_stick_size = gamma_stick_size_is_power_of_two ? (std::uint32_t)std::log2(gamma_stick_size) : 0;
         reader_compile_time_args.push_back((std::uint32_t) gamma_log2_stick_size);
     } else {
         reader_compile_time_args.push_back(gamma_stick_size);
@@ -202,7 +207,7 @@ operation::ProgramWithCallbacks layernorm_multi_core(
     bool beta_stick_size_is_power_of_two = is_power_of_two_at_least_32(beta_stick_size);
     reader_compile_time_args.push_back((std::uint32_t) beta_stick_size_is_power_of_two);
     if (beta_stick_size_is_power_of_two) {
-        uint32_t beta_log2_stick_size = beta_stick_size_is_power_of_two ? (std::uint32_t)log2(beta_stick_size) : 0;
+        uint32_t beta_log2_stick_size = beta_stick_size_is_power_of_two ? (std::uint32_t)std::log2(beta_stick_size) : 0;
         reader_compile_time_args.push_back((std::uint32_t) beta_log2_stick_size);
     } else {
         reader_compile_time_args.push_back(beta_stick_size);
@@ -407,6 +412,7 @@ operation::ProgramWithCallbacks layernorm_multi_core_sharded(
     uint32_t block_wt,
     DeviceComputeKernelConfig compute_kernel_config
 ) {
+    using namespace CMAKE_UNIQUE_NAMESPACE;
     bool rms_norm = norm_type == LayerNormType::RMSNORM;
     bool is_pre_all_gather = distributed_norm_stage == DistributedLayerNormStage::PRE_ALL_GATHER;
     bool is_post_all_gather = distributed_norm_stage == DistributedLayerNormStage::POST_ALL_GATHER;
@@ -904,7 +910,7 @@ operation::ProgramWithCallbacks layernorm_multi_core_sharded(
     writer_mcast_sender_compile_time_args.push_back((std::uint32_t) gamma_stick_size_is_power_of_two);
     writer_mcast_receiver_compile_time_args.push_back((std::uint32_t) gamma_stick_size_is_power_of_two);
     if (gamma_stick_size_is_power_of_two) {
-        uint32_t gamma_log2_stick_size = gamma_stick_size_is_power_of_two ? (std::uint32_t)log2(gamma_stick_size) : 0;
+        uint32_t gamma_log2_stick_size = gamma_stick_size_is_power_of_two ? (std::uint32_t)std::log2(gamma_stick_size) : 0;
         writer_mcast_sender_compile_time_args.push_back((std::uint32_t) gamma_log2_stick_size);
         writer_mcast_receiver_compile_time_args.push_back((std::uint32_t) gamma_log2_stick_size);
     } else {
@@ -917,7 +923,7 @@ operation::ProgramWithCallbacks layernorm_multi_core_sharded(
     writer_mcast_sender_compile_time_args.push_back((std::uint32_t) beta_stick_size_is_power_of_two);
     writer_mcast_receiver_compile_time_args.push_back((std::uint32_t) beta_stick_size_is_power_of_two);
     if (beta_stick_size_is_power_of_two) {
-        uint32_t beta_log2_stick_size = beta_stick_size_is_power_of_two ? (std::uint32_t)log2(beta_stick_size) : 0;
+        uint32_t beta_log2_stick_size = beta_stick_size_is_power_of_two ? (std::uint32_t)std::log2(beta_stick_size) : 0;
         writer_mcast_sender_compile_time_args.push_back((std::uint32_t) beta_log2_stick_size);
         writer_mcast_receiver_compile_time_args.push_back((std::uint32_t) beta_log2_stick_size);
     } else {
diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp
index 3daf51392fc..6fb5f6e2ba6 100644
--- a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp
+++ b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp
@@ -19,6 +19,8 @@ using namespace tt::tt_metal;

 namespace ttnn::operations::normalization {

+namespace {
+namespace CMAKE_UNIQUE_NAMESPACE {
 inline bool is_dram(const Tensor& input_tensor) { return input_tensor.memory_config().buffer_type == BufferType::DRAM; }
 inline bool is_dram(const std::optional input_tensor) {
     return input_tensor.has_value() ? is_dram(input_tensor.value()) : true;
@@ -41,6 +43,8 @@ inline uint32_t pack_two_bfloat16_into_uint32(std::pair two_
     // second -> upper 16
     return (uint32_t)two_bfloats.first | ((uint32_t)two_bfloats.second << 16);
 }
+}
+}

 // computes layernorm(a)*gamma + beta
 operation::ProgramWithCallbacks layernorm_post_allgather_multi_core(
@@ -53,6 +57,7 @@ operation::ProgramWithCallbacks layernorm_post_allgather_multi_core(
     float eps,
     ttnn::DeviceComputeKernelConfig compute_kernel_config
 ) {
+    using namespace CMAKE_UNIQUE_NAMESPACE;
     const bool is_rmsnorm = norm_type == LayerNormDistributedType::RMSNORM;
     const auto shape = a.get_legacy_shape();
     const uint32_t W = shape[-1], H = shape[-2];
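The repeated `log2` to `std::log2` changes above (and the `ceil` to `std::ceil` change further down) qualify C math calls explicitly for the same unity-build reason: once translation units are merged, another source's using-directives or same-named helpers can change what an unqualified call resolves to. A minimal sketch, with hypothetical names, of how silently this can go wrong:

#include <cmath>
#include <cstdint>

namespace other_op {
// Some other source in the same unity batch defines its own helper...
inline int log2(std::uint32_t v) { int n = 0; while (v >>= 1) { ++n; } return n; }
}
using namespace other_op;  // ...and pulls it into scope with a using-directive.

std::uint32_t stick_size_log2(std::uint32_t stick_size) {
    // An unqualified log2(stick_size) would now prefer other_op::log2
    // (exact match on std::uint32_t) over the C math function, or become
    // ambiguous, depending on what else lands in the merged TU. Explicit
    // qualification pins the call to the standard library.
    return (std::uint32_t)std::log2(stick_size);
}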
diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_pre_all_gather_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_pre_all_gather_op_multi_core.cpp
index aedca4b1c2a..92f7d7601ef 100644
--- a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_pre_all_gather_op_multi_core.cpp
+++ b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_pre_all_gather_op_multi_core.cpp
@@ -18,6 +18,8 @@ using namespace tt::constants;

 namespace ttnn::operations::normalization {

+namespace {
+namespace CMAKE_UNIQUE_NAMESPACE {
 inline bool is_dram(const Tensor& input_tensor) { return input_tensor.memory_config().buffer_type == BufferType::DRAM; }
 inline bool is_dram(const std::optional input_tensor) {
     return input_tensor.has_value() ? is_dram(input_tensor.value()) : true;
@@ -40,6 +42,8 @@ inline uint32_t pack_two_bfloat16_into_uint32(std::pair two_
     // second -> upper 16
     return (uint32_t)two_bfloats.first | ((uint32_t)two_bfloats.second << 16);
 }
+}
+}

 operation::ProgramWithCallbacks layernorm_pre_allgather_multi_core(
     const Tensor &a,
@@ -47,6 +51,7 @@ operation::ProgramWithCallbacks layernorm_pre_allgather_multi_core(
     LayerNormDistributedType norm_type,
     DeviceComputeKernelConfig compute_kernel_config
 ) {
+    using namespace CMAKE_UNIQUE_NAMESPACE;
     const bool is_rmsnorm = norm_type == LayerNormDistributedType::RMSNORM;
     const auto shape = a.get_legacy_shape();
     const uint32_t W = shape[-1], H = shape[-2];
diff --git a/ttnn/cpp/ttnn/operations/normalization/softmax/device/multi_core/softmax_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/softmax/device/multi_core/softmax_op_multi_core.cpp
index a0b49812a56..69cce8381b9 100644
--- a/ttnn/cpp/ttnn/operations/normalization/softmax/device/multi_core/softmax_op_multi_core.cpp
+++ b/ttnn/cpp/ttnn/operations/normalization/softmax/device/multi_core/softmax_op_multi_core.cpp
@@ -20,11 +20,15 @@ using namespace tt::constants;

 namespace ttnn::operations::normalization {

+namespace {
+namespace CMAKE_UNIQUE_NAMESPACE {
 inline bool is_dram(const Tensor& input_tensor) { return input_tensor.memory_config().buffer_type == BufferType::DRAM; }
 inline bool is_dram(const std::optional input_tensor) {
     return input_tensor.has_value() ? is_dram(input_tensor.value()) : true;
 }
 inline bool is_dram(const Buffer* b) { return b->buffer_type() == BufferType::DRAM; }
+}
+}

 // implementation of softmax with optional scale/mask (see the header for input_tensor more detailed description)
 operation::ProgramWithCallbacks scale_mask_softmax_multi_core(
@@ -36,7 +40,7 @@ operation::ProgramWithCallbacks scale_mask_softmax_multi_core(
     DeviceComputeKernelConfig compute_kernel_config,
     bool numeric_stable
 ) {
-
+    using namespace CMAKE_UNIQUE_NAMESPACE;
     const auto shape = input_tensor.get_legacy_shape();
     uint32_t W = shape[-1], H = (input_tensor.volume() / (shape[0] * shape[-1])), NC = shape[0];
     uint32_t HW = H*W;
diff --git a/ttnn/cpp/ttnn/operations/sliding_window/halo/device/halo_device_operation.cpp b/ttnn/cpp/ttnn/operations/sliding_window/halo/device/halo_device_operation.cpp
index e1b38036ca1..dfcc7372f8e 100644
--- a/ttnn/cpp/ttnn/operations/sliding_window/halo/device/halo_device_operation.cpp
+++ b/ttnn/cpp/ttnn/operations/sliding_window/halo/device/halo_device_operation.cpp
@@ -37,7 +37,7 @@ std::vector HaloDeviceOperation::compute_output_shape
     // output_shape[1] remains same
     // output_shape[2] changes
     // output_shape[3] remains same
-    output_shape[2] = (uint32_t) ceil((float) total_nsticks / nbatch);
+    output_shape[2] = (uint32_t) std::ceil((float) total_nsticks / nbatch);

     log_debug(tt::LogOp, "output_shape: [{} {} {} {}]", output_shape[0], output_shape[1], output_shape[2], output_shape[3]);
     log_debug(tt::LogOp, "max_out_nsticks_per_core: {}", max_out_nsticks_per_core_);
diff --git a/ttnn/cpp/ttnn/operations/uniform/device/uniform_device_operation.hpp b/ttnn/cpp/ttnn/operations/uniform/device/uniform_device_operation.hpp
index 372c9dcc6ae..72bf4f0b7b3 100644
--- a/ttnn/cpp/ttnn/operations/uniform/device/uniform_device_operation.hpp
+++ b/ttnn/cpp/ttnn/operations/uniform/device/uniform_device_operation.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0

+#pragma once
+
 #include "ttnn/decorators.hpp"
 #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
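Note also that every `using namespace CMAKE_UNIQUE_NAMESPACE;` in this patch is added inside a function body rather than at file scope. That is deliberate: a file-scope using-directive would still be active for every source appended after this one in the generated unity file, so the first file's `is_dram` helpers would remain visible while the next file's code (with its own same-named helpers) is compiled, producing ambiguous calls. Scoped to the function, the directive ends at the closing brace. A minimal sketch, with hypothetical names:

#include <cstdint>

namespace {
namespace CMAKE_UNIQUE_NAMESPACE {  // expanded per file by CMake
inline bool is_dram(std::uint32_t buffer_type) { return buffer_type == 1; }
}
}

// File-scope alternative (commented out): this would remain in effect for
// every .cpp concatenated after this one in the unity source, so a later
// file defining its own is_dram() would suddenly see two candidates.
// using namespace CMAKE_UNIQUE_NAMESPACE;

void scale_mask_softmax_sketch() {           // hypothetical function
    using namespace CMAKE_UNIQUE_NAMESPACE;  // visibility ends at the brace
    bool b = is_dram(1);
    (void)b;
}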
diff --git a/ttnn/cpp/ttnn/tensor/types.cpp b/ttnn/cpp/ttnn/tensor/types.cpp
index a476bf6a608..69856466bb6 100644
--- a/ttnn/cpp/ttnn/tensor/types.cpp
+++ b/ttnn/cpp/ttnn/tensor/types.cpp
@@ -8,7 +8,7 @@

 namespace ttnn {

-SimpleShape get_physical_shape(const SimpleShape& logical_shape, DataType data_type, Layout layout, const std::optional& tile) {
+SimpleShape get_physical_shape(const SimpleShape& logical_shape, DataType data_type, Layout layout, const std::optional& tile) {
     SimpleShape physical_shape = logical_shape;
     auto rank = physical_shape.rank();
     if (layout == Layout::TILE) {
diff --git a/ttnn/cpp/ttnn/types.hpp b/ttnn/cpp/ttnn/types.hpp
index 72e212d1c21..ec1e760a06a 100644
--- a/ttnn/cpp/ttnn/types.hpp
+++ b/ttnn/cpp/ttnn/types.hpp
@@ -19,15 +19,6 @@ using Device = tt::tt_metal::Device;
 constexpr auto TILE_SIZE = 32;

 using tt::tt_metal::DataType;
-static constexpr auto uint8 = DataType::UINT8;
-static constexpr auto uint16 = DataType::UINT16;
-static constexpr auto int32 = DataType::INT32;
-static constexpr auto uint32 = DataType::UINT32;
-static constexpr auto float32 = DataType::FLOAT32;
-static constexpr auto bfloat16 = DataType::BFLOAT16;
-static constexpr auto bfloat8_b = DataType::BFLOAT8_B;
-static constexpr auto bfloat4_b = DataType::BFLOAT4_B;
-
 using tt::tt_metal::BufferType;
 using tt::tt_metal::MemoryConfig;
 using tt::tt_metal::TensorMemoryLayout;
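The removed constants gave every TTNN source short names like `ttnn::bfloat16` and `ttnn::uint32` that coexist badly with types of the same name elsewhere in the tree (the duplicate `bfloat16` test header is deleted in this same patch). In separately compiled files this rarely bit anyone, but once sources share a translation unit an unqualified `bfloat16` can find both the constant and a class. A minimal sketch, with illustrative namespaces standing in for the real ones:

#include <cstdint>

enum class DataType { BFLOAT16, UINT32 };

// A DataType *constant* named bfloat16, as ttnn/types.hpp used to define:
namespace ttnn_sketch { inline constexpr auto bfloat16 = DataType::BFLOAT16; }

// A bfloat16 *type*, as the test utilities define:
namespace df_sketch { struct bfloat16 { std::uint16_t bits; }; }

using namespace ttnn_sketch;
using namespace df_sketch;

// bfloat16 value;  // error: reference to 'bfloat16' is ambiguous --
//                  // the constant and the struct are both candidates.
//                  // Dropping the constants removes the clash.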