#14406: Add CCL Perf tests to pipeline (PR #14836)

Open · wants to merge 6 commits into main
41 changes: 35 additions & 6 deletions .github/workflows/t3000-model-perf-tests-impl.yaml
@@ -22,6 +22,7 @@ jobs:
           { name: "t3k LLM llama3 model perf tests", model: "llama3", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_llama3_tests, timeout: 60, owner_id: U03PUAKE719}, # Miguel Tairum
           { name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 75, owner_id: U053W15B6JF}, # Djordje Ivanovic
           { name: "t3k CNN resnet50 model perf tests", model: "resnet50", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_resnet50_tests, timeout: 75, owner_id: U013121KDH9}, # Austin Ho
+          { name: "t3k CCL all_gather perf tests", arch: wormhole_b0, cmd: run_t3000_ccl_all_gather_perf_tests, timeout: 75, tracy: true, owner_id: ULMEPM2MA}, # Sean Nijjar
           #{ name: "t3k CNN model perf tests ", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_cnn_tests, timeout: 120, owner_id: }, #No tests are being run?
         ]
     name: ${{ matrix.test-group.name }}
@@ -45,13 +46,25 @@ jobs:
        run: |
          echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
          echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
-      - uses: actions/download-artifact@v4
+      - name: Download profiler build artifact
+        id: download-profiler-artifact
+        if: ${{ matrix.test-group.tracy }}
+        uses: actions/download-artifact@v4
+        with:
+          name: TTMetal_build_${{ matrix.test-group.arch }}_profiler
+        continue-on-error: true
+      - name: Download build artifact
+        id: download-artifact
+        if: ${{ !matrix.test-group.tracy }}
+        uses: actions/download-artifact@v4
        with:
          name: TTMetal_build_${{ matrix.test-group.arch }}
      - name: Extract files
+        if: ${{ matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy }}
        run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
      - uses: ./.github/actions/install-python-deps
      - name: Run model perf regression tests
+        if: ${{ matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy }}
        shell: bash {0}
        timeout-minutes: ${{ matrix.test-group.timeout }}
        run: |
@@ -63,12 +76,28 @@ jobs:
          env python models/perf/merge_perf_results.py
      - name: Check perf report exists
        id: check-perf-report
-        if: ${{ !cancelled() }}
+        if: ${{ !cancelled() && (matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy) }}
        run: |
          ls -hal
-          export PERF_REPORT_FILENAME="Models_Perf_$(date +%Y_%m_%d).csv"
-          ls -hal $PERF_REPORT_FILENAME
-          echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT"
+          TODAY=$(date +%Y_%m_%d)
+          PERF_REPORT_FILENAME_MODELS="Models_Perf_${TODAY}.csv"
+          PERF_REPORT_FILENAME_CCL="CCL_Perf_${TODAY}.csv"
+          if [ "${{ matrix.test-group.tracy }}" == "true" ]; then
+            if [ -f "$PERF_REPORT_FILENAME_CCL" ]; then
+              echo "Found CCL Perf report: $PERF_REPORT_FILENAME_CCL"
+              echo "perf_report_filename=$PERF_REPORT_FILENAME_CCL" >> "$GITHUB_OUTPUT"
+            else
+              echo "No CCL perf report found for today."
+              exit 1
+            fi
+          else
+            if [ -f "$PERF_REPORT_FILENAME_MODELS" ]; then
+              echo "Found Models Perf report: $PERF_REPORT_FILENAME_MODELS"
+              echo "perf_report_filename=$PERF_REPORT_FILENAME_MODELS" >> "$GITHUB_OUTPUT"
+            else
+              echo "No Models perf report found for today."
+              exit 1
+            fi
+          fi
      - name: Upload perf report
        if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
        uses: actions/upload-artifact@v4
8 changes: 7 additions & 1 deletion .github/workflows/t3000-model-perf-tests.yaml
@@ -11,7 +11,13 @@ jobs:
    with:
      arch: '["wormhole_b0"]'
    secrets: inherit
+  build-artifact-profiler:
+    uses: ./.github/workflows/build-artifact.yaml
+    with:
+      arch: '["wormhole_b0"]'
+      tracy: true
+    secrets: inherit
  t3000-model-perf-tests:
-    needs: build-artifact
+    needs: [build-artifact, build-artifact-profiler]
Contributor:
Can we just use only the build-artifact-profiler here?
I don't think we need build-artifact anymore if we want to use profiler tools here.

Contributor:
Scratch that, I realized I misread the impl.

Is it possible to introduce an if statement to determine the correct preceding step?

Contributor (Author):

I have introduced an if statement in the impl file to choose which build to use for that job.

secrets: inherit
uses: ./.github/workflows/t3000-model-perf-tests-impl.yaml
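
For reference, the if statement mentioned above lives in the impl workflow. Condensed from that diff, the artifact selection looks roughly like this (trimmed to the relevant keys):

```yaml
# Each job downloads exactly one build artifact, selected by the
# matrix flag `tracy` (set only on the CCL perf test group).
- name: Download profiler build artifact
  id: download-profiler-artifact
  if: ${{ matrix.test-group.tracy }}
  uses: actions/download-artifact@v4
  with:
    name: TTMetal_build_${{ matrix.test-group.arch }}_profiler
  continue-on-error: true
- name: Download build artifact
  if: ${{ !matrix.test-group.tracy }}
  uses: actions/download-artifact@v4
  with:
    name: TTMetal_build_${{ matrix.test-group.arch }}
```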
29 changes: 28 additions & 1 deletion tests/scripts/t3000/run_t3000_model_perf_tests.sh
@@ -142,6 +142,25 @@ run_t3000_resnet50_tests() {
fi
}

+run_t3000_ccl_all_gather_perf_tests() {
+  # Record the start time
+  fail=0
+  start_time=$(date +%s)
+
+  echo "LOG_METAL: Running run_t3000_ccl_all_gather_perf_tests"
+
+  tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh -t t3000
+  fail+=$?
+
+  # Record the end time
+  end_time=$(date +%s)
+  duration=$((end_time - start_time))
+  echo "LOG_METAL: run_t3000_ccl_all_gather_perf_tests $duration seconds to complete"
+  if [[ $fail -ne 0 ]]; then
+    exit 1
+  fi
+}

run_t3000_llm_tests() {
# Run falcon7b tests
run_t3000_falcon7b_tests
@@ -173,6 +192,12 @@ run_t3000_cnn_tests() {
env python models/perf/merge_perf_results.py
}

+run_t3000_ccl_tests() {
+  # Run ccl performance tests
+  run_t3000_ccl_all_gather_perf_tests
+
+}

fail=0
main() {
# For CI pipeline - source func commands but don't execute tests if not invoked directly
@@ -219,8 +244,10 @@ main() {
    run_t3000_llm_tests
  elif [[ "$pipeline_type" == "cnn_model_perf_t3000_device" ]]; then
    run_t3000_cnn_tests
+  elif [[ "$pipeline_type" == "ccl_perf_t3000_device" ]]; then
+    run_t3000_ccl_tests
  else
-    echo "$pipeline_type is invalid (supported: [cnn_model_perf_t3000_device, cnn_model_perf_t3000_device])" 2>&1
+    echo "$pipeline_type is invalid (supported: [cnn_model_perf_t3000_device, cnn_model_perf_t3000_device, ccl_perf_t3000_device])" 2>&1
    exit 1
  fi

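A minimal sketch of how the new pipeline type could be exercised locally. The exact CLI for passing `pipeline_type` is not shown in this diff, so the flag name below is an assumption; check `main()` in the script for the real interface.

```bash
# Hypothetical local run of the new CCL perf pipeline type.
# --pipeline-type is an assumed flag name; main()'s argument parsing
# is elided from this diff. Env vars mirror the workflow's setup steps.
export TT_METAL_HOME=$(pwd)
export PYTHONPATH=$(pwd)
tests/scripts/t3000/run_t3000_model_perf_tests.sh --pipeline-type ccl_perf_t3000_device
```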
9 changes: 6 additions & 3 deletions tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py
@@ -5,6 +5,7 @@
import pandas as pd
import os
import re
+import time


def perf_report(file_path):
@@ -214,10 +215,12 @@ def calculate_bandwidth(row):

averages_df = pd.DataFrame(averages_data)

-    averages_file_path = file_path.replace(".csv", "_averages.csv")
+    today = time.strftime("%Y_%m_%d")
+    ccl_perf_file_path = f"CCL_Perf_{today}.csv"
+    os.rename(file_path, ccl_perf_file_path)

-    averages_df.to_csv(averages_file_path, index=False)
+    averages_df.to_csv(ccl_perf_file_path, index=False)

-    print(f"Averages CSV saved to: {averages_file_path}")
+    print(f"CCL Perf report CSV saved to: {ccl_perf_file_path}")

return averages_df
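
Net effect of this change: instead of writing a `<input>_averages.csv` next to the input file, `perf_report()` now renames the raw profiler CSV to a dated `CCL_Perf_<YYYY_MM_DD>.csv` and then overwrites that same path with the averages DataFrame. This is the filename the workflow's "Check perf report exists" step looks for. A quick local sanity check might look like this (a sketch; the report is assumed to land in the current working directory):

```bash
# The generated report name uses the same date format as the workflow.
ls -hal "CCL_Perf_$(date +%Y_%m_%d).csv"
```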
(modified file; filename not captured in this view)
@@ -72,24 +72,34 @@ run_profile_and_extract_csv() {

if [ -n "$csv_path" ]; then
    echo "CSV path found: $csv_path"
    echo "Generating performance report..."

-   # Run the Python script to generate performance report
-   average_values=$(PYTHONPATH="$MODULE_DIR" python3 -c "
+   tmp_file="/tmp/perf_report_output.log"
+   PYTHONPATH="$MODULE_DIR" python3 -c "
+import sys
import pandas as pd
from perf_csv import perf_report
from tabulate import tabulate

-# Generate the report and convert it to a DataFrame
-average_df = perf_report('$csv_path')
-# Print the DataFrame in a pretty table format
-print(tabulate(average_df, headers='keys', tablefmt='pretty'))
-")
+try:
+    # Generate the report and convert it to a DataFrame
+    average_df = perf_report('$csv_path')
+    # Print the DataFrame in a pretty table format
+    print('Min - Avg - Max by Common Runs:')
+    print(tabulate(average_df, headers='keys', tablefmt='pretty'))
+except Exception as e:
+    print(f'Error in performance report generation: {e}', file=sys.stderr)
+    sys.exit(1)
+" 2>&1 | tee "$tmp_file"
+
+   if grep -q "Error in performance report generation" "$tmp_file"; then
+       echo "Error: Performance report generation failed."
+       exit 1
+   fi

-   # Print the output
-   echo "Min - Avg - Max by Common Runs:"
-   echo "$average_values"
else
    echo "CSV path not found in the command output."
    exit 1
fi
}

(second modified file; filename not captured in this view)
@@ -72,24 +72,34 @@ run_profile_and_extract_csv() {

if [ -n "$csv_path" ]; then
    echo "CSV path found: $csv_path"
    echo "Generating performance report..."

-   # Run the Python script to generate performance report
-   average_values=$(PYTHONPATH="$MODULE_DIR" python3 -c "
+   tmp_file="/tmp/perf_report_output.log"
+   PYTHONPATH="$MODULE_DIR" python3 -c "
+import sys
import pandas as pd
from perf_csv import perf_report
from tabulate import tabulate
-# Generate the report and convert it to a DataFrame
-average_df = perf_report('$csv_path')
-# Print the DataFrame in a pretty table format
-print(tabulate(average_df, headers='keys', tablefmt='pretty'))
-")
+try:
+    # Generate the report and convert it to a DataFrame
+    average_df = perf_report('$csv_path')
+    # Print the DataFrame in a pretty table format
+    print('Min - Avg - Max by Common Runs:')
+    print(tabulate(average_df, headers='keys', tablefmt='pretty'))
+except Exception as e:
+    print(f'Error in performance report generation: {e}', file=sys.stderr)
+    sys.exit(1)
+" 2>&1 | tee "$tmp_file"
+
+   if grep -q "Error in performance report generation" "$tmp_file"; then
+       echo "Error: Performance report generation failed."
+       exit 1
+   fi

-   # Print the output
-   echo "Min - Avg - Max by Common Runs:"
-   echo "$average_values"
else
    echo "CSV path not found in the command output."
    exit 1
fi
}

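For context, these two near-identical profile scripts are what the new CI function drives. The all_gather variant is invoked from `run_t3000_ccl_all_gather_perf_tests` as shown in the script diff above (a sketch; `-t` is taken to select the target system):

```bash
# Run the all_gather profiling sweep against a T3000 system and let
# run_profile_and_extract_csv produce the tabulated report.
tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh -t t3000
```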