From d500ed7530f0bc09562372206e4727033a0031c8 Mon Sep 17 00:00:00 2001
From: Aswinmcw <azayasankaran@tenstorrent.com>
Date: Wed, 6 Nov 2024 15:48:32 +0000
Subject: [PATCH 1/6] #14406: Initial commit to test ccl perf in pipeline

---
 .../t3000-model-perf-tests-impl.yaml          | 10 ++++++-
 .github/workflows/t3000-model-perf-tests.yaml |  8 ++++-
 .../t3000/run_t3000_model_perf_tests.sh       | 29 ++++++++++++++++++-
 3 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/t3000-model-perf-tests-impl.yaml b/.github/workflows/t3000-model-perf-tests-impl.yaml
index 91e208c214b..489994bf9f0 100644
--- a/.github/workflows/t3000-model-perf-tests-impl.yaml
+++ b/.github/workflows/t3000-model-perf-tests-impl.yaml
@@ -22,6 +22,7 @@ jobs:
           { name: "t3k LLM llama3 model perf tests", model: "llama3", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_llama3_tests, timeout: 60, owner_id: U03PUAKE719}, # Miguel Tairum
           { name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 75, owner_id: U053W15B6JF}, # Djordje Ivanovic
           { name: "t3k CNN resnet50 model perf tests", model: "resnet50", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_resnet50_tests, timeout: 75, owner_id: U013121KDH9}, # Austin Ho
+          { name: "t3k CCL perf tests", arch: wormhole_b0, cmd: run_t3000_ccl_perf_tests, timeout: 75, tracy: true, owner_id: ULMEPM2MA}, # Sean Nijjar
           #{ name: "t3k CNN model perf tests ", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_cnn_tests, timeout: 120, owner_id: }, #No tests are being run?
         ]
     name: ${{ matrix.test-group.name }}
@@ -45,7 +46,14 @@ jobs:
         run: |
           echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
           echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
-      - uses: actions/download-artifact@v4
+      - name: Download profiler build artifact
+        if: ${{ matrix.test-group.tracy }}
+        uses: actions/download-artifact@v4
+        with:
+          name: TTMetal_build_${{ matrix.test-group.arch }}_profiler
+      - name: Download regular build artifact
+        if: ${{ !matrix.test-group.tracy }}
+        uses: actions/download-artifact@v4
         with:
           name: TTMetal_build_${{ matrix.test-group.arch }}
       - name: Extract files
diff --git a/.github/workflows/t3000-model-perf-tests.yaml b/.github/workflows/t3000-model-perf-tests.yaml
index 0a8759af27c..15d96746889 100644
--- a/.github/workflows/t3000-model-perf-tests.yaml
+++ b/.github/workflows/t3000-model-perf-tests.yaml
@@ -11,7 +11,13 @@ jobs:
     with:
       arch: '["wormhole_b0"]'
     secrets: inherit
+  build-artifact-profiler:
+    uses: ./.github/workflows/build-artifact.yaml
+    with:
+      arch: '["wormhole_b0"]'
+      tracy: true
+    secrets: inherit
   t3000-model-perf-tests:
-    needs: build-artifact
+    needs: [build-artifact, build-artifact-profiler]
     secrets: inherit
     uses: ./.github/workflows/t3000-model-perf-tests-impl.yaml
diff --git a/tests/scripts/t3000/run_t3000_model_perf_tests.sh b/tests/scripts/t3000/run_t3000_model_perf_tests.sh
index 70baaa85ae3..7c0d0757a09 100755
--- a/tests/scripts/t3000/run_t3000_model_perf_tests.sh
+++ b/tests/scripts/t3000/run_t3000_model_perf_tests.sh
@@ -142,6 +142,25 @@ run_t3000_resnet50_tests() {
   fi
 }
 
+run_t3000_ccl_perf_tests() {
+  # Record the start time
+  fail=0
+  start_time=$(date +%s)
+
+  echo "LOG_METAL: Running run_t3000_ccl_perf_tests"
+
+  tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh -t t3000
+  fail+=$?
+
+  # Record the end time
+  end_time=$(date +%s)
+  duration=$((end_time - start_time))
+  echo "LOG_METAL: run_t3000_ccl_perf_tests $duration seconds to complete"
+  if [[ $fail -ne 0 ]]; then
+    exit 1
+  fi
+}
+
 run_t3000_llm_tests() {
   # Run falcon7b tests
   run_t3000_falcon7b_tests
@@ -173,6 +192,12 @@ run_t3000_cnn_tests() {
   env python models/perf/merge_perf_results.py
 }
 
+run_t3000_ccl_tests() {
+  # Run ccl performance tests
+  run_t3000_ccl_perf_tests
+
+}
+
 fail=0
 main() {
   # For CI pipeline - source func commands but don't execute tests if not invoked directly
@@ -219,8 +244,10 @@ main() {
     run_t3000_llm_tests
   elif [[ "$pipeline_type" == "cnn_model_perf_t3000_device" ]]; then
     run_t3000_cnn_tests
+  elif [[ "$pipeline_type" == "ccl_perf_t3000_device" ]]; then
+    run_t3000_ccl_tests
   else
-    echo "$pipeline_type is invalid (supported: [cnn_model_perf_t3000_device, cnn_model_perf_t3000_device])" 2>&1
+    echo "$pipeline_type is invalid (supported: [llm_model_perf_t3000_device, cnn_model_perf_t3000_device, ccl_perf_t3000_device])" >&2
     exit 1
   fi
 

From 41154875f2d9697dd7dec110ae8b9e9035566762 Mon Sep 17 00:00:00 2001
From: Aswinmcw <azayasankaran@tenstorrent.com>
Date: Thu, 7 Nov 2024 06:20:13 +0000
Subject: [PATCH 2/6] #14406: Upload perf report to GH

---
 .../t3000-model-perf-tests-impl.yaml          | 20 +++++++++++++++----
 .../operations/ccl/perf/perf_csv.py           |  9 ++++++---
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/t3000-model-perf-tests-impl.yaml b/.github/workflows/t3000-model-perf-tests-impl.yaml
index 489994bf9f0..f1d0090bda3 100644
--- a/.github/workflows/t3000-model-perf-tests-impl.yaml
+++ b/.github/workflows/t3000-model-perf-tests-impl.yaml
@@ -51,7 +51,7 @@ jobs:
         uses: actions/download-artifact@v4
         with:
           name: TTMetal_build_${{ matrix.test-group.arch }}_profiler
-      - name: Download regular build artifact
+      - name: Download build artifact
         if: ${{ !matrix.test-group.tracy }}
         uses: actions/download-artifact@v4
         with:
@@ -74,9 +74,21 @@ jobs:
         if: ${{ !cancelled() }}
         run: |
           ls -hal
-          export PERF_REPORT_FILENAME="Models_Perf_$(date +%Y_%m_%d).csv"
-          ls -hal $PERF_REPORT_FILENAME
-          echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT"
+          TODAY=$(date +%Y_%m_%d)
+          PERF_REPORT_FILENAME_MODELS="Models_Perf_${TODAY}.csv"
+          PERF_REPORT_FILENAME_CCL="CCL_Perf_${TODAY}.csv"
+          if [ -f "$PERF_REPORT_FILENAME_MODELS" ]; then
+            echo "Found Models Perf report: $PERF_REPORT_FILENAME_MODELS"
+            echo "perf_report_filename=$PERF_REPORT_FILENAME_MODELS" >> "$GITHUB_OUTPUT"
+            ls -hal "$PERF_REPORT_FILENAME_MODELS"
+          elif [ -f "$PERF_REPORT_FILENAME_CCL" ]; then
+            echo "Found CCL Perf report: $PERF_REPORT_FILENAME_CCL"
+            echo "perf_report_filename=$PERF_REPORT_FILENAME_CCL" >> "$GITHUB_OUTPUT"
+            ls -hal "$PERF_REPORT_FILENAME_CCL"
+          else
+            echo "No perf report found."
+            exit 1
+          fi
       - name: Upload perf report
         if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
         uses: actions/upload-artifact@v4
diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py b/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py
index 31f4636aa66..3d5cc2aaeb5 100644
--- a/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py
+++ b/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py
@@ -5,6 +5,7 @@
 import pandas as pd
 import os
 import re
+import time
 
 
 def perf_report(file_path):
@@ -214,10 +215,12 @@ def calculate_bandwidth(row):
 
     averages_df = pd.DataFrame(averages_data)
 
-    averages_file_path = file_path.replace(".csv", "_averages.csv")
+    today = time.strftime("%Y_%m_%d")
+    ccl_perf_file_path = f"CCL_Perf_{today}.csv"
+    # Leave the raw ops CSV at file_path untouched; only the averages are written to the dated report.
 
-    averages_df.to_csv(averages_file_path, index=False)
+    averages_df.to_csv(ccl_perf_file_path, index=False)
 
-    print(f"Averages CSV saved to: {averages_file_path}")
+    print(f"CCL Perf report CSV saved to: {ccl_perf_file_path}")
 
     return averages_df

From 18d4bf41d6184a0d8846ceccf617f6a64370344f Mon Sep 17 00:00:00 2001
From: Aswinmcw <azayasankaran@tenstorrent.com>
Date: Thu, 7 Nov 2024 06:39:11 +0000
Subject: [PATCH 3/6] #14406: Upload perf report to GH

---
 .../t3000-model-perf-tests-impl.yaml          | 26 +++++++++++--------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/t3000-model-perf-tests-impl.yaml b/.github/workflows/t3000-model-perf-tests-impl.yaml
index f1d0090bda3..3c0bd44cdcf 100644
--- a/.github/workflows/t3000-model-perf-tests-impl.yaml
+++ b/.github/workflows/t3000-model-perf-tests-impl.yaml
@@ -73,21 +73,25 @@ jobs:
         id: check-perf-report
         if: ${{ !cancelled() }}
         run: |
-          ls -hal
           TODAY=$(date +%Y_%m_%d)
           PERF_REPORT_FILENAME_MODELS="Models_Perf_${TODAY}.csv"
           PERF_REPORT_FILENAME_CCL="CCL_Perf_${TODAY}.csv"
-          if [ -f "$PERF_REPORT_FILENAME_MODELS" ]; then
-            echo "Found Models Perf report: $PERF_REPORT_FILENAME_MODELS"
-            echo "perf_report_filename=$PERF_REPORT_FILENAME_MODELS" >> "$GITHUB_OUTPUT"
-            ls -hal "$PERF_REPORT_FILENAME_MODELS"
-          elif [ -f "$PERF_REPORT_FILENAME_CCL" ]; then
-            echo "Found CCL Perf report: $PERF_REPORT_FILENAME_CCL"
-            echo "perf_report_filename=$PERF_REPORT_FILENAME_CCL" >> "$GITHUB_OUTPUT"
-            ls -hal "$PERF_REPORT_FILENAME_CCL"
+          if [ "${{ matrix.test-group.tracy }}" == "true" ]; then
+            if [ -f "$PERF_REPORT_FILENAME_CCL" ]; then
+              echo "Found CCL Perf report: $PERF_REPORT_FILENAME_CCL"
+              echo "perf_report_filename=$PERF_REPORT_FILENAME_CCL" >> "$GITHUB_OUTPUT"
+            else
+              echo "No CCL perf report found for today."
+              exit 1
+            fi
           else
-            echo "No perf report found."
-            exit 1
+            if [ -f "$PERF_REPORT_FILENAME_MODELS" ]; then
+              echo "Found Models Perf report: $PERF_REPORT_FILENAME_MODELS"
+              echo "perf_report_filename=$PERF_REPORT_FILENAME_MODELS" >> "$GITHUB_OUTPUT"
+            else
+              echo "No Models perf report found for today."
+              exit 1
+            fi
           fi
       - name: Upload perf report
         if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}

From 4550bbc32bb35f57093c1cfd4f4756f86d70757d Mon Sep 17 00:00:00 2001
From: Aswinmcw <azayasankaran@tenstorrent.com>
Date: Thu, 7 Nov 2024 07:37:17 +0000
Subject: [PATCH 4/6] #14406: Mark fail if csv not generated

---
 .../unit_tests/operations/ccl/perf/run_all_gather_profile.sh     | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh b/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh
index 8422bde56d0..69e34a86b22 100755
--- a/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh
+++ b/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh
@@ -90,6 +90,7 @@ print(tabulate(average_df, headers='keys', tablefmt='pretty'))
         echo "$average_values"
     else
         echo "CSV path not found in the command output."
+        exit 1
     fi
 }
 

From a25003a013842f3e6db8c0b6145f4820001581de Mon Sep 17 00:00:00 2001
From: Aswinmcw <azayasankaran@tenstorrent.com>
Date: Fri, 8 Nov 2024 08:08:56 +0000
Subject: [PATCH 5/6] #14406: Improved print messages

---
 .../t3000-model-perf-tests-impl.yaml          |  2 +-
 .../t3000/run_t3000_model_perf_tests.sh       | 10 +++----
 .../ccl/perf/run_all_gather_profile.sh        | 29 +++++++++++-------
 .../ccl/perf/run_reduce_scatter_profile.sh    | 30 ++++++++++++-------
 4 files changed, 45 insertions(+), 26 deletions(-)

diff --git a/.github/workflows/t3000-model-perf-tests-impl.yaml b/.github/workflows/t3000-model-perf-tests-impl.yaml
index 3c0bd44cdcf..f092abf18d1 100644
--- a/.github/workflows/t3000-model-perf-tests-impl.yaml
+++ b/.github/workflows/t3000-model-perf-tests-impl.yaml
@@ -22,7 +22,7 @@ jobs:
           { name: "t3k LLM llama3 model perf tests", model: "llama3", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_llama3_tests, timeout: 60, owner_id: U03PUAKE719}, # Miguel Tairum
           { name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 75, owner_id: U053W15B6JF}, # Djordje Ivanovic
           { name: "t3k CNN resnet50 model perf tests", model: "resnet50", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_resnet50_tests, timeout: 75, owner_id: U013121KDH9}, # Austin Ho
-          { name: "t3k CCL perf tests", arch: wormhole_b0, cmd: run_t3000_ccl_perf_tests, timeout: 75, tracy: true, owner_id: ULMEPM2MA}, # Sean Nijjar
+          { name: "t3k CCL all_gather perf tests", arch: wormhole_b0, cmd: run_t3000_ccl_all_gather_perf_tests, timeout: 75, tracy: true, owner_id: ULMEPM2MA}, # Sean Nijjar
           #{ name: "t3k CNN model perf tests ", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_cnn_tests, timeout: 120, owner_id: }, #No tests are being run?
         ]
     name: ${{ matrix.test-group.name }}
diff --git a/tests/scripts/t3000/run_t3000_model_perf_tests.sh b/tests/scripts/t3000/run_t3000_model_perf_tests.sh
index 7c0d0757a09..19a54d710b1 100755
--- a/tests/scripts/t3000/run_t3000_model_perf_tests.sh
+++ b/tests/scripts/t3000/run_t3000_model_perf_tests.sh
@@ -142,20 +142,20 @@ run_t3000_resnet50_tests() {
   fi
 }
 
-run_t3000_ccl_perf_tests() {
+run_t3000_ccl_all_gather_perf_tests() {
   # Record the start time
   fail=0
   start_time=$(date +%s)
 
-  echo "LOG_METAL: Running run_t3000_ccl_perf_tests"
+  echo "LOG_METAL: Running run_t3000_ccl_all_gather_perf_tests"
 
-  tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh -t t3000
-  fail+=$?
+  tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh -t t3000
+  fail=$((fail + $?))
 
   # Record the end time
   end_time=$(date +%s)
   duration=$((end_time - start_time))
-  echo "LOG_METAL: run_t3000_ccl_perf_tests $duration seconds to complete"
+  echo "LOG_METAL: run_t3000_ccl_all_gather_perf_tests $duration seconds to complete"
   if [[ $fail -ne 0 ]]; then
     exit 1
   fi
@@ -194,7 +194,7 @@ run_t3000_cnn_tests() {
 
 run_t3000_ccl_tests() {
   # Run ccl performance tests
-  run_t3000_ccl_perf_tests
+  run_t3000_ccl_all_gather_perf_tests
 
 }
 
diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh b/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh
index 69e34a86b22..0e714429b88 100755
--- a/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh
+++ b/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh
@@ -72,22 +72,31 @@ run_profile_and_extract_csv() {
 
     if [ -n "$csv_path" ]; then
         echo "CSV path found: $csv_path"
+        echo "Generating performance report..."
 
-        # Run the Python script to generate performance report
-        average_values=$(PYTHONPATH="$MODULE_DIR" python3 -c "
+        report_status=0
+        PYTHONPATH="$MODULE_DIR" python3 -c "
+import sys
 import pandas as pd
 from perf_csv import perf_report
 from tabulate import tabulate
 
-# Generate the report and convert it to a DataFrame
-average_df = perf_report('$csv_path')
-# Print the DataFrame in a pretty table format
-print(tabulate(average_df, headers='keys', tablefmt='pretty'))
-")
+try:
+    # Generate the report and convert it to a DataFrame
+    average_df = perf_report('$csv_path')
+    # Print the DataFrame in a pretty table format
+    print('Min - Avg - Max by Common Runs:')
+    print(tabulate(average_df, headers='keys', tablefmt='pretty'))
+except Exception as e:
+    print(f'Error in performance report generation: {e}', file=sys.stderr)
+    sys.exit(1)
+" 2>&1 || report_status=$?
+        # Check the interpreter's exit status so failures outside the try (e.g. ImportError) are caught too.
+        if [ "$report_status" -ne 0 ]; then
+            echo "Error: Performance report generation failed."
+            exit 1
+        fi
 
-        # Print the output
-        echo "Min - Avg - Max by Common Runs:"
-        echo "$average_values"
     else
         echo "CSV path not found in the command output."
         exit 1
diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh b/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh
index 23071225ac1..2f054ca348c 100755
--- a/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh
+++ b/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh
@@ -72,24 +72,34 @@ run_profile_and_extract_csv() {
 
     if [ -n "$csv_path" ]; then
         echo "CSV path found: $csv_path"
+        echo "Generating performance report..."
 
-        # Run the Python script to generate performance report
-        average_values=$(PYTHONPATH="$MODULE_DIR" python3 -c "
+        report_status=0
+        PYTHONPATH="$MODULE_DIR" python3 -c "
+import sys
 import pandas as pd
 from perf_csv import perf_report
 from tabulate import tabulate
 
-# Generate the report and convert it to a DataFrame
-average_df = perf_report('$csv_path')
-# Print the DataFrame in a pretty table format
-print(tabulate(average_df, headers='keys', tablefmt='pretty'))
-")
+try:
+    # Generate the report and convert it to a DataFrame
+    average_df = perf_report('$csv_path')
+    # Print the DataFrame in a pretty table format
+    print('Min - Avg - Max by Common Runs:')
+    print(tabulate(average_df, headers='keys', tablefmt='pretty'))
+except Exception as e:
+    print(f'Error in performance report generation: {e}', file=sys.stderr)
+    sys.exit(1)
+" 2>&1 || report_status=$?
+        # Check the interpreter's exit status so failures outside the try (e.g. ImportError) are caught too.
+        if [ "$report_status" -ne 0 ]; then
+            echo "Error: Performance report generation failed."
+            exit 1
+        fi
 
-        # Print the output
-        echo "Min - Avg - Max by Common Runs:"
-        echo "$average_values"
     else
         echo "CSV path not found in the command output."
+        exit 1
     fi
 }
 

From 200e5d804600ddaa9ddecb3c0960913c7f908f72 Mon Sep 17 00:00:00 2001
From: Aswinmcw <azayasankaran@tenstorrent.com>
Date: Fri, 8 Nov 2024 10:13:57 +0000
Subject: [PATCH 6/6] #0: ci check

---
 .github/workflows/t3000-model-perf-tests-impl.yaml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/t3000-model-perf-tests-impl.yaml b/.github/workflows/t3000-model-perf-tests-impl.yaml
index f092abf18d1..c104d01fbaa 100644
--- a/.github/workflows/t3000-model-perf-tests-impl.yaml
+++ b/.github/workflows/t3000-model-perf-tests-impl.yaml
@@ -47,19 +47,24 @@ jobs:
           echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
           echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
       - name: Download profiler build artifact
+        id: download-profiler-artifact
         if: ${{ matrix.test-group.tracy }}
         uses: actions/download-artifact@v4
         with:
           name: TTMetal_build_${{ matrix.test-group.arch }}_profiler
+        continue-on-error: true
       - name: Download build artifact
+        id: download-artifact
         if: ${{ !matrix.test-group.tracy }}
         uses: actions/download-artifact@v4
         with:
           name: TTMetal_build_${{ matrix.test-group.arch }}
       - name: Extract files
+        if: ${{ !matrix.test-group.tracy || steps.download-profiler-artifact.outcome == 'success' }}
         run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
       - uses: ./.github/actions/install-python-deps
       - name: Run model perf regression tests
+        if: ${{ !matrix.test-group.tracy || steps.download-profiler-artifact.outcome == 'success' }}
         shell: bash {0}
         timeout-minutes: ${{ matrix.test-group.timeout }}
         run: |
@@ -71,7 +76,7 @@ jobs:
           env python models/perf/merge_perf_results.py
       - name: Check perf report exists
         id: check-perf-report
-        if: ${{ !cancelled() }}
+        if: ${{ !cancelled() && (!matrix.test-group.tracy || steps.download-profiler-artifact.outcome == 'success') }}
         run: |
           TODAY=$(date +%Y_%m_%d)
           PERF_REPORT_FILENAME_MODELS="Models_Perf_${TODAY}.csv"