From 974ee3a2f65764526769fc302cfb3f906cdf8841 Mon Sep 17 00:00:00 2001
From: Yi Zhang
Date: Fri, 25 Oct 2024 22:07:34 +0800
Subject: [PATCH] Update tests for the combined CUDA/DML GPU package

---
 .../test/contrib_ops/matmul_4bits_test.cc     | 27 ++++++++++++++++-------
 .../matmul_integer_to_float_test.cc           |  2 +-
 onnxruntime/test/lora/lora_test.cc            | 16 +++++++++++++
 onnxruntime/test/providers/cpu/model_tests.cc | 12 ++++++++++
 tools/ci_build/build.py                       |  2 +-
 5 files changed, 49 insertions(+), 10 deletions(-)

diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc
index 8138829b057f2..9fa1e155f0d7a 100644
--- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc
@@ -485,13 +485,17 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accura
   std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
   if (use_float16) {
 #ifdef USE_CUDA
-    execution_providers.push_back(DefaultCudaExecutionProvider());
+    if (DefaultCudaExecutionProvider() != nullptr) {
+      execution_providers.push_back(DefaultCudaExecutionProvider());
+    }
 #endif
 #ifdef USE_ROCM
     execution_providers.push_back(DefaultRocmExecutionProvider());
 #endif
 #ifdef USE_DML
-    execution_providers.push_back(DefaultDmlExecutionProvider());
+    if (DefaultDmlExecutionProvider() != nullptr) {
+      execution_providers.push_back(DefaultDmlExecutionProvider());
+    }
 #endif
 
     RunTest(opts, std::move(execution_providers));
@@ -506,8 +510,11 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accura
 } // namespace
 
 TEST(MatMulNBits, Float16Cuda) {
-#if defined(USE_CUDA) || defined(USE_ROCM)
-  auto has_gidx_options = {true, false};
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML)
+  std::vector<bool> has_gidx_options = {true, false};
+  if (DefaultDmlExecutionProvider() != nullptr) {
+    has_gidx_options = {false};
+  }
 #else
-  auto has_gidx_options = {false};
+  std::vector<bool> has_gidx_options = {false};
 #endif
@@ -518,7 +525,9 @@ TEST(MatMulNBits, Float16Cuda) {
   for (auto block_size : {16, 32, 64, 128}) {
     for (auto has_gidx : has_gidx_options) {
 #ifdef USE_DML
-      RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f);
+      if (DefaultDmlExecutionProvider() != nullptr) {
+        RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f);
+      }
 #else
       RunTest(M, N, K, block_size, 0, false, true, has_gidx);
       RunTest(M, N, K, block_size, 0, true, true, has_gidx, false);
@@ -531,12 +540,15 @@ TEST(MatMulNBits, Float16Cuda) {
 }
 
 TEST(MatMulNBits, Float16Large) {
-#ifdef USE_DML
+#if defined(USE_CUDA) || defined(USE_DML)
   // For some reason, the A10 machine that runs these tests during CI has a much bigger error than all retail
   // machines we tested on. All consumer-grade machines from Nvidia/AMD/Intel seem to pass these tests with an
   // absolute error of 0.08, but the A10 has errors going as high as 0.22. Ultimately, given the large number
   // of elements in this test, ULPs should probably be used instead of absolute/relative tolerances.
   float abs_error = 0.3f;
+  if (DefaultDmlExecutionProvider() != nullptr) {
+    abs_error = 0.05f;
+  }
 #else
   float abs_error = 0.05f;
 #endif
@@ -549,7 +561,6 @@ TEST(MatMulNBits, Float16Large) {
     }
   }
 }
-
 #endif // defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML)
 } // namespace test
 } // namespace onnxruntime
diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
index 8d7629b5fda1c..d88c3131a4ca5 100644
--- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
@@ -227,7 +227,7 @@ TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8S8) {
 }
 
 // DML EP supports Float16 output type and Signed A Matrix and Unsigned B Matric for Float32 output
-#if defined(USE_DML)
+#if defined(USE_DML) && !defined(USE_CUDA)
 
 TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8U8) {
   RunMatMulIntegerToFloatTest();
diff --git a/onnxruntime/test/lora/lora_test.cc b/onnxruntime/test/lora/lora_test.cc
index fde603858f9a9..4155cb7abc279 100644
--- a/onnxruntime/test/lora/lora_test.cc
+++ b/onnxruntime/test/lora/lora_test.cc
@@ -201,6 +201,14 @@ TEST(LoraAdapterTest, Load) {
 
 #ifdef USE_CUDA
 TEST(LoraAdapterTest, VerifyCudaDeviceCopy) {
+  if (DefaultCudaExecutionProvider() == nullptr) {
+    GTEST_SKIP() << "Skipping test since the CUDA EP is not available";
+  }
+#ifdef USE_DML
+  if (DefaultDmlExecutionProvider() != nullptr) {
+    GTEST_SKIP() << "This test should not run when the DML EP is enabled";
+  }
+#endif
   auto cpu_ep = DefaultCpuExecutionProvider();
   auto cpu_allocator = cpu_ep->CreatePreferredAllocators()[0];
   auto cuda_allocator = DefaultCudaExecutionProvider()->CreatePreferredAllocators()[0];
@@ -234,6 +242,14 @@ TEST(LoraAdapterTest, VerifyCudaDeviceCopy) {
 
 #ifdef USE_DML
 TEST(LoraAdapterTest, VerifyDmlDeviceCopy) {
+  if (DefaultDmlExecutionProvider() == nullptr) {
+    GTEST_SKIP() << "Skipping test since the DML EP is not available";
+  }
+#ifdef USE_CUDA
+  if (DefaultCudaExecutionProvider() != nullptr) {
+    GTEST_SKIP() << "This test should not run when the CUDA EP is enabled";
+  }
+#endif
   auto cpu_ep = DefaultCpuExecutionProvider();
   auto cpu_allocator = cpu_ep->CreatePreferredAllocators()[0];
 
diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc
index e3c86a137484f..b46c253fb8ed9 100644
--- a/onnxruntime/test/providers/cpu/model_tests.cc
+++ b/onnxruntime/test/providers/cpu/model_tests.cc
@@ -491,6 +491,18 @@ ::std::vector<::std::basic_string<ORTCHAR_T>> GetParameterStrings() {
   // the number of times these are run to reduce the CI time.
   provider_names.erase(provider_name_cpu);
 #endif
+
+#if defined(USE_CUDA) && defined(USE_DML)
+  const std::string no_cuda_ep_test = Env::Default().GetEnvironmentVar("NO_CUDA_TEST");
+  if (no_cuda_ep_test == "1") {
+    provider_names.erase(provider_name_cuda);
+  }
+  const std::string no_dml_ep_test = Env::Default().GetEnvironmentVar("NO_DML_TEST");
+  if (no_dml_ep_test == "1") {
+    provider_names.erase(provider_name_dml);
+  }
+#endif
+
   std::vector<std::basic_string<ORTCHAR_T>> v;
   // Permanently exclude following tests because ORT support only opset starting from 7,
   // Please make no more changes to the list
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 2bc7be6b0115c..9624f9112c49f 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -2072,7 +2072,7 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs):
             executables.append("onnxruntime_global_thread_pools_test")
             executables.append("onnxruntime_customopregistration_test")
         for exe in executables:
-            test_output = f"--gtest_filter=*FusedMatMulOpTest* --gtest_output=xml:{cwd}/{exe}.{config}.results.xml"
+            test_output = f"--gtest_output=xml:{cwd}/{exe}.{config}.results.xml"
             run_subprocess([os.path.join(cwd, exe), test_output], cwd=cwd, dll_path=dll_path)
         else:
             ctest_cmd = [ctest_path, "--build-config", config, "--verbose", "--timeout", args.test_all_timeout]
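
Note: every nullptr check and GTEST_SKIP guard in this patch assumes that the test-utility factories
(DefaultCudaExecutionProvider() / DefaultDmlExecutionProvider()) can return a null EP at runtime. That
gating is not part of this diff; it belongs to the companion change in
onnxruntime/test/util/default_providers.cc. Below is a minimal self-contained sketch of the idea,
assuming the same NO_CUDA_TEST variable that model_tests.cc reads above; the type and factory names
are stand-ins for illustration, not ORT's actual implementation.

// Sketch only: shows how an EP factory could be gated on NO_CUDA_TEST,
// which is what the nullptr checks in this patch rely on.
#include <cstdlib>
#include <iostream>
#include <memory>
#include <string>

// Stand-in for onnxruntime::IExecutionProvider (illustration only).
struct IExecutionProvider {
  virtual ~IExecutionProvider() = default;
};

// Return nullptr when the CUDA round is disabled via NO_CUDA_TEST=1, so
// callers such as RunTest() and the LoraAdapterTest guards can skip
// CUDA-specific work instead of dereferencing a missing provider.
std::unique_ptr<IExecutionProvider> DefaultCudaExecutionProvider() {
  const char* no_cuda = std::getenv("NO_CUDA_TEST");
  if (no_cuda != nullptr && std::string(no_cuda) == "1") {
    return nullptr;
  }
  // The real implementation would create the actual CUDA execution provider here.
  return std::make_unique<IExecutionProvider>();
}

int main() {
  std::cout << (DefaultCudaExecutionProvider() != nullptr ? "CUDA EP enabled"
                                                          : "CUDA EP disabled")
            << '\n';
  return 0;
}

With gating like this in place, a single combined CUDA+DML test binary can be run twice in CI, once
with NO_DML_TEST=1 (the CUDA round) and once with NO_CUDA_TEST=1 (the DML round), and the guards added
above make each EP-specific test execute in exactly one of the two rounds.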