#0: Faster builds by enabling Unity build for TTNN and tests (#14461)

* #0: Unity builds
* #0: Unity build for tests
* #0: More unity builds
* #0: Cleanup
* #0: Rename ANON_NAMESPACE to CMAKE_UNIQUE_NAMESPACE
* #0: Allow to disable unity builds
* #0: Disable unity builds if export commands is on
* #0: Raise min cmake version to 3.20
* #0: CMake fixes
* #0: Review fixes - cmake cleanup
* #0: Disable unity builds on older cmake
* #0: Build script fixup
* #0: Build fix
tenstorrent · Oct 30, 2024 · 948fafb · 948fafb
1 parent 5e590a3
commit 948fafb
Show file tree

Hide file tree

Showing 114 changed files with 495 additions and 295 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -43,6 +43,7 @@ endif()
 list(PREPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
 
 include(project_options)
+include(unity)
 
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -121,6 +122,7 @@ message(STATUS "Build Python bindings: ${WITH_PYTHON_BINDINGS}")
 message(STATUS "Build Programming Examples: ${BUILD_PROGRAMMING_EXAMPLES}")
 message(STATUS "Build TT METAL Tests: ${TT_METAL_BUILD_TESTS}")
 message(STATUS "Build TTNN Tests: ${TTNN_BUILD_TESTS}")
+message(STATUS "Build with Unity builds: ${TT_UNITY_BUILDS}")
 ############################################################################################################################
 
 if(ENABLE_BUILD_TIME_TRACE)

diff --git a/build_metal.sh b/build_metal.sh
@@ -26,6 +26,7 @@ show_help() {
     echo "  --debug                          Set the build type as Debug."
     echo "  --clean                          Remove build workspaces."
     echo "  --build-static-libs              Build tt_metal (not ttnn) as a static lib (BUILD_SHARED_LIBS=OFF)"
+    echo "  --disable-unity-builds           Disable Unity builds"
 }
 
 clean() {
@@ -49,11 +50,12 @@ build_metal_tests="OFF"
 build_umd_tests="OFF"
 build_programming_examples="OFF"
 build_static_libs="OFF"
+unity_builds="ON"
 
 declare -a cmake_args
 
 OPTIONS=h,e,c,t,a,m,s,u,b:,p
-LONGOPTIONS=help,export-compile-commands,enable-ccache,enable-time-trace,enable-asan,enable-msan,enable-tsan,enable-ubsan,build-type:,enable-profiler,install-prefix:,build-tests,build-ttnn-tests,build-metal-tests,build-umd-tests,build-programming-examples,build-static-libs,release,development,debug,clean
+LONGOPTIONS=help,export-compile-commands,enable-ccache,enable-time-trace,enable-asan,enable-msan,enable-tsan,enable-ubsan,build-type:,enable-profiler,install-prefix:,build-tests,build-ttnn-tests,build-metal-tests,build-umd-tests,build-programming-examples,build-static-libs,disable-unity-builds,release,development,debug,clean
 
 # Parse the options
 PARSED=$(getopt --options=$OPTIONS --longoptions=$LONGOPTIONS --name "$0" -- "$@")
@@ -70,7 +72,7 @@ while true; do
         -h|--help)
             show_help;exit 0;;
         -e|--export-compile-commands)
-            export_compile_commands="ON";;
+            export_compile_commands="ON";unity_builds="OFF";;
         -c|--enable-ccache)
             enable_ccache="ON";;
         -t|--enable-time-trace)
@@ -101,6 +103,8 @@ while true; do
             build_programming_examples="ON";;
         --build-static-libs)
             build_static_libs="ON";;
+        --disable-unity-builds)
+            unity_builds="OFF";;
         --release)
             build_type="Release";;
         --development)
@@ -156,6 +160,7 @@ echo "INFO: Enable UndefinedBehaviorSanitizer: $enable_ubsan"
 echo "INFO: Build directory: $build_dir"
 echo "INFO: Install Prefix: $cmake_install_prefix"
 echo "INFO: Build tests: $build_tests"
+echo "INFO: Enable Unity builds: $unity_builds"
 
 # Prepare cmake arguments
 cmake_args+=("-B" "$build_dir")
@@ -194,6 +199,8 @@ fi
 
 if [ "$export_compile_commands" = "ON" ]; then
     cmake_args+=("-DCMAKE_EXPORT_COMPILE_COMMANDS=ON")
+else
+    cmake_args+=("-DCMAKE_EXPORT_COMPILE_COMMANDS=OFF")
 fi
 
 if [ "$build_tests" = "ON" ]; then
@@ -222,6 +229,12 @@ if [ "$build_static_libs" = "ON" ]; then
     cmake_args+=("-DBUILD_SHARED_LIBS=OFF")
 fi
 
+if [ "$unity_builds" = "ON" ]; then
+    cmake_args+=("-DTT_UNITY_BUILDS=ON")
+else
+    cmake_args+=("-DTT_UNITY_BUILDS=OFF")
+fi
+
 # Create and link the build directory
 mkdir -p $build_dir
 ln -nsf $build_dir build

diff --git a/cmake/helper_functions.cmake b/cmake/helper_functions.cmake
@@ -12,6 +12,7 @@ function(CREATE_EAGER_TEST_EXE TESTLIST)
             set(TEST_TARGET ${TEST_NAME})
         endif()
         add_executable(${TEST_TARGET} ${TEST_SRC_PATH})
+        TT_ENABLE_UNITY_BUILD(${TEST_TARGET})
 
         target_link_libraries(
             ${TEST_TARGET}

diff --git a/cmake/project_options.cmake b/cmake/project_options.cmake
@@ -16,4 +16,16 @@ option(BUILD_PROGRAMMING_EXAMPLES "Enables build of tt_metal programming example
 option(TT_METAL_BUILD_TESTS "Enables build of tt_metal tests" OFF)
 option(TTNN_BUILD_TESTS "Enables build of ttnn tests" OFF)
 option(ENABLE_CCACHE "Build with compiler cache" FALSE)
+option(TT_UNITY_BUILDS "Build with Unity builds" ON)
 ###########################################################################################
+
+if(TT_UNITY_BUILDS) # The vetoes below only matter when unity builds were requested.
+    if(CMAKE_EXPORT_COMPILE_COMMANDS) # Unity builds are switched off whenever a compilation database is exported.
+        message(STATUS "Disabling Unity builds because CMAKE_EXPORT_COMPILE_COMMANDS is ON")
+        set(TT_UNITY_BUILDS OFF) # Normal (non-cache) variable: overrides the option's value in this scope without rewriting the cache.
+    endif()
+    if(CMAKE_VERSION VERSION_LESS "3.20.0") # On older CMake, fall back to regular per-file compilation instead of failing.
+        message(STATUS "CMake 3.20 or newer is required for Unity builds, disabling")
+        set(TT_UNITY_BUILDS OFF)
+    endif()
+endif()
diff --git a/cmake/unity.cmake b/cmake/unity.cmake
@@ -0,0 +1,12 @@
+# Opt TARGET into CMake unity (jumbo) builds.
+# Does nothing unless TT_UNITY_BUILDS evaluates to true.
+function(TT_ENABLE_UNITY_BUILD TARGET)
+    if(NOT TT_UNITY_BUILDS)
+        return()
+    endif()
+
+    set_property(TARGET ${TARGET} PROPERTY UNITY_BUILD ON)
+    # Name of the identifier CMake sets to a unique per-file value inside the
+    # generated unity sources.
+    set_property(TARGET ${TARGET} PROPERTY UNITY_BUILD_UNIQUE_ID "CMAKE_UNIQUE_NAMESPACE")
+endfunction()
diff --git a/tests/tt_metal/test_utils/df/bfloat16.hpp b/tests/tt_metal/test_utils/df/bfloat16.hpp
diff --git a/tests/tt_metal/test_utils/df/df.hpp b/tests/tt_metal/test_utils/df/df.hpp
@@ -3,5 +3,4 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
-#include "tt_metal/test_utils/df/bfloat16.hpp"
 #include "tt_metal/test_utils/df/float32.hpp"
diff --git a/tests/tt_metal/tt_metal/unit_tests/CMakeLists.txt b/tests/tt_metal/tt_metal/unit_tests/CMakeLists.txt
@@ -48,6 +48,7 @@ add_executable(
     ${UNIT_TESTS_SRC}
     $<TARGET_OBJECTS:unit_tests_common_o>
 )
+TT_ENABLE_UNITY_BUILD(unit_tests)
 add_executable(unit_tests_galaxy ${CMAKE_CURRENT_SOURCE_DIR}/multichip/galaxy_cluster_api.cpp)
 
 target_link_libraries(

diff --git a/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp b/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp
@@ -227,8 +227,8 @@ bool reader_datacopy_writer(Device* device, const BankedConfig& cfg) {
     ////////////////////////////////////////////////////////////////////////////
     //                      Stimulus Generation
     ////////////////////////////////////////////////////////////////////////////
-    std::vector<uint32_t> input_packed = tt::test_utils::generate_packed_uniform_random_vector<uint32_t, tt::test_utils::df::bfloat16>(
-        -1.0f, 1.0f, cfg.size_bytes / tt::test_utils::df::bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count());
+    std::vector<uint32_t> input_packed = tt::test_utils::generate_packed_uniform_random_vector<uint32_t, bfloat16>(
+        -1.0f, 1.0f, cfg.size_bytes / bfloat16::SIZEOF, std::chrono::system_clock::now().time_since_epoch().count());
 
     ////////////////////////////////////////////////////////////////////////////
     //                      Compile and Execute Application

diff --git a/tests/tt_metal/tt_metal/unit_tests/common/core_coord_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests/common/core_coord_fixture.hpp
@@ -2,6 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
+#pragma once
+
 #include "gtest/gtest.h"
 #include "tt_metal/host_api.hpp"
 #include "tt_metal/test_utils/env_vars.hpp"

diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp
@@ -69,7 +69,7 @@ struct BroadcastConfig {
     MathFidelity math_fidelity = MathFidelity::HiFi4;
 };
 
-void mask_src_b_for_broadcast(std::vector<tt::test_utils::df::bfloat16>& tile, const std::vector<uint32_t> &shape, BroadcastDim dim) {
+void mask_src_b_for_broadcast(std::vector<bfloat16>& tile, const std::vector<uint32_t> &shape, BroadcastDim dim) {
     int num_rows = shape.at(0);
     int num_cols = shape.at(1);
 
@@ -83,14 +83,14 @@ void mask_src_b_for_broadcast(std::vector<tt::test_utils::df::bfloat16>& tile, c
     }
 }
 
-std::vector<tt::test_utils::df::bfloat16> gold_broadcast(std::vector<tt::test_utils::df::bfloat16>& src_a, std::vector<tt::test_utils::df::bfloat16>& src_b, const std::vector<uint32_t> &shape, EltwiseOp op, BroadcastDim dim, MathFidelity math_fidelity = MathFidelity::HiFi4) {
+std::vector<bfloat16> gold_broadcast(std::vector<bfloat16>& src_a, std::vector<bfloat16>& src_b, const std::vector<uint32_t> &shape, EltwiseOp op, BroadcastDim dim, MathFidelity math_fidelity = MathFidelity::HiFi4) {
     int num_rows = shape.at(0);
     int num_cols = shape.at(1);
 
     uint16_t srca_fid_mask = 0xFFFF;
     uint16_t srcb_fid_mask = 0xFFFF;
 
-    std::vector<tt::test_utils::df::bfloat16> golden(num_cols * num_rows);
+    std::vector<bfloat16> golden(num_cols * num_rows);
     auto arch = get_arch_from_string(get_umd_arch_name());
 
     switch (math_fidelity) {
@@ -103,7 +103,7 @@ std::vector<tt::test_utils::df::bfloat16> gold_broadcast(std::vector<tt::test_ut
 
     for (int i = 0; i < num_rows; i++) {
         for (int j = 0; j < num_cols; j++) {
-            tt::test_utils::df::bfloat16 broadcast_value;
+            bfloat16 broadcast_value;
             switch (dim)
             {
             case BroadcastDim::ROW: { broadcast_value = src_b[j]; break; }
@@ -118,8 +118,8 @@ std::vector<tt::test_utils::df::bfloat16> gold_broadcast(std::vector<tt::test_ut
             case EltwiseOp::SUB: { golden[i * num_cols + j] = src_a[i * num_cols + j].to_float() - broadcast_value.to_float(); break; }
             case EltwiseOp::MUL: {
                 golden[i * num_cols + j] =
-                    tt::test_utils::df::bfloat16(std::bit_cast<uint32_t>(src_a[i * num_cols + j].to_packed() & srca_fid_mask)).to_float() *
-                    tt::test_utils::df::bfloat16(std::bit_cast<uint32_t>(broadcast_value.to_packed() & srcb_fid_mask)).to_float();
+                    bfloat16(std::bit_cast<uint32_t>(src_a[i * num_cols + j].to_packed() & srca_fid_mask)).to_float() *
+                    bfloat16(std::bit_cast<uint32_t>(broadcast_value.to_packed() & srcb_fid_mask)).to_float();
                 break;
             }
             default: { TT_THROW("Unsupported EltwiseOp={}", op); break; }
@@ -142,7 +142,7 @@ void run_single_core_broadcast(tt_metal::Device* device, const BroadcastConfig&
     constexpr uint32_t tile_width = 32;
     constexpr uint32_t tile_height = 32;
 
-    constexpr uint32_t single_tile_size = tile_width * tile_height * tt::test_utils::df::bfloat16::SIZEOF;
+    constexpr uint32_t single_tile_size = tile_width * tile_height * bfloat16::SIZEOF;
 
     tt_metal::InterleavedBufferConfig dram_config{
         .device=device,
@@ -244,25 +244,25 @@ void run_single_core_broadcast(tt_metal::Device* device, const BroadcastConfig&
             (uint32_t)1,
         });
 
-    std::vector<tt::test_utils::df::bfloat16> input0 = generate_uniform_random_vector<tt::test_utils::df::bfloat16>(
+    std::vector<bfloat16> input0 = generate_uniform_random_vector<bfloat16>(
         -1.0f,
         1.0f,
-        single_tile_size / tt::test_utils::df::bfloat16::SIZEOF,
+        single_tile_size / bfloat16::SIZEOF,
         std::chrono::system_clock::now().time_since_epoch().count());
 
-    std::vector<tt::test_utils::df::bfloat16> input1 = generate_uniform_random_vector<tt::test_utils::df::bfloat16>(
+    std::vector<bfloat16> input1 = generate_uniform_random_vector<bfloat16>(
         -1.0f,
         1.0f,
-        single_tile_size / tt::test_utils::df::bfloat16::SIZEOF,
+        single_tile_size / bfloat16::SIZEOF,
         std::chrono::system_clock::now().time_since_epoch().count());
 
     mask_src_b_for_broadcast(input1, {tile_width, tile_height}, test_config.broadcast_dim);
 
-    std::vector<tt::test_utils::df::bfloat16> golden = gold_broadcast(input0, input1, {tile_width, tile_height}, test_config.eltwise_op, test_config.broadcast_dim, test_config.math_fidelity);
+    std::vector<bfloat16> golden = gold_broadcast(input0, input1, {tile_width, tile_height}, test_config.eltwise_op, test_config.broadcast_dim, test_config.math_fidelity);
 
-    auto packed_input0 = pack_vector<uint32_t, tt::test_utils::df::bfloat16>(input0);
-    auto packed_input1 = pack_vector<uint32_t, tt::test_utils::df::bfloat16>(input1);
-    auto packed_golden = pack_vector<uint32_t, tt::test_utils::df::bfloat16>(golden);
+    auto packed_input0 = pack_vector<uint32_t, bfloat16>(input0);
+    auto packed_input1 = pack_vector<uint32_t, bfloat16>(input1);
+    auto packed_golden = pack_vector<uint32_t, bfloat16>(golden);
     unit_tests::compute::GoldenConfig config = {
         .num_tiles_r_dim = tile_width/32,
         .num_tiles_c_dim = tile_height/32
@@ -279,10 +279,10 @@ void run_single_core_broadcast(tt_metal::Device* device, const BroadcastConfig&
     tt_metal::detail::ReadFromBuffer(dst_dram_buffer, dest_buffer_data);
     auto dest_buffer_data_untilized = unit_tests::compute::gold_standard_untilize(dest_buffer_data, config);
 
-    bool result = is_close_packed_vectors<tt::test_utils::df::bfloat16, uint32_t>(
+    bool result = is_close_packed_vectors<bfloat16, uint32_t>(
         dest_buffer_data_untilized,
         packed_golden,
-        [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) {
+        [&](const bfloat16& a, const bfloat16& b) {
             return is_close(a, b, 0.0155);
         });
     ASSERT_TRUE(result);

diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_cumsum.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_cumsum.cpp
@@ -24,12 +24,12 @@ struct CumsumConfig {
     bool rowwise;
 };
 
-std::vector<tt::test_utils::df::bfloat16> gold_cumsum(std::vector<tt::test_utils::df::bfloat16>& src, const std::vector<uint32_t> &shape, bool rowwise) {
+std::vector<bfloat16> gold_cumsum(std::vector<bfloat16>& src, const std::vector<uint32_t> &shape, bool rowwise) {
     int N = shape.at(0);
     int W = shape.at(1);
     int H = shape.at(2);
 
-    std::vector<tt::test_utils::df::bfloat16> golden(N * W * H);
+    std::vector<bfloat16> golden(N * W * H);
 
     int dim_a = rowwise ? H : W;
     int dim_b = rowwise ? W : H;
@@ -57,7 +57,7 @@ void run_single_core_cumsum(tt_metal::Device* device, const CumsumConfig& test_c
     constexpr uint32_t tile_width = 32;
     constexpr uint32_t tile_height = 32;
 
-    constexpr uint32_t single_tile_size = tile_width * tile_height * tt::test_utils::df::bfloat16::SIZEOF;
+    constexpr uint32_t single_tile_size = tile_width * tile_height * bfloat16::SIZEOF;
 
     uint32_t W = test_config.Wt * tile_width;
     uint32_t H = test_config.Ht * tile_height;
@@ -147,16 +147,16 @@ void run_single_core_cumsum(tt_metal::Device* device, const CumsumConfig& test_c
             (uint32_t)test_config.Ht * test_config.Wt                  // Used for transposing kernel
         });
 
-    std::vector<tt::test_utils::df::bfloat16> input = generate_uniform_random_vector<tt::test_utils::df::bfloat16>(
+    std::vector<bfloat16> input = generate_uniform_random_vector<bfloat16>(
         -1.0f,
         1.0f,
-        dram_buffer_size / tt::test_utils::df::bfloat16::SIZEOF,
+        dram_buffer_size / bfloat16::SIZEOF,
         std::chrono::system_clock::now().time_since_epoch().count());
 
-    std::vector<tt::test_utils::df::bfloat16> golden = gold_cumsum(input, {test_config.N, W, H}, test_config.rowwise);
-    auto golden_packed = pack_vector<uint32_t, tt::test_utils::df::bfloat16>(golden);
+    std::vector<bfloat16> golden = gold_cumsum(input, {test_config.N, W, H}, test_config.rowwise);
+    auto golden_packed = pack_vector<uint32_t, bfloat16>(golden);
 
-    auto input_packed = pack_vector<uint32_t, tt::test_utils::df::bfloat16>(input);
+    auto input_packed = pack_vector<uint32_t, bfloat16>(input);
     auto input_packed_tilized = unit_tests::compute::gold_standard_tilize(input_packed, {test_config.N * test_config.Ht, test_config.Wt});
 
     tt_metal::detail::WriteToBuffer(src_dram_buffer, input_packed_tilized);
@@ -169,10 +169,10 @@ void run_single_core_cumsum(tt_metal::Device* device, const CumsumConfig& test_c
 
     log_info(tt::LogTest, "Running test for N = {}, Wt = {}, Ht = {}", test_config.N, test_config.Wt, test_config.Ht);
 
-    bool result = is_close_packed_vectors<tt::test_utils::df::bfloat16, uint32_t>(
+    bool result = is_close_packed_vectors<bfloat16, uint32_t>(
         output_packed,
         golden_packed,
-        [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) {
+        [&](const bfloat16& a, const bfloat16& b) {
             return is_close(a, b, 0.01f);
         });
     ASSERT_TRUE(result);