consolidate gpu data transfer
tianleiwu committed Oct 25, 2024
1 parent b4afc62 commit 5aa867a
Showing 8 changed files with 45 additions and 42 deletions.
1 change: 0 additions & 1 deletion onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -50,7 +50,6 @@ class Memcpy final : public OpKernel {
ORT_ENFORCE(X != nullptr, "Memcpy: Input tensor is nullptr.");
Tensor* Y = ctx->Output(0, X->Shape());
ORT_ENFORCE(Y != nullptr, "Memcpy: Failed to allocate output tensor.");
// do we support async copy?
// cudaMemcpyAsync handles both pinned and non-pinned memory,
// so we don't need the check here.
auto* gpu_data_transfer = Info().GetDataTransferManager().GetDataTransfer(X->Location().device, Y->Location().device);
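For background on the comment above: cudaMemcpyAsync accepts both pinned and pageable host pointers, but only pinned transfers can overlap with device work; pageable transfers are staged by the driver and generally behave synchronously. A minimal standalone sketch (not part of this commit; buffer names are made up):

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
  constexpr size_t bytes = 1 << 20;
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  float* dev = nullptr;
  cudaMalloc(&dev, bytes);

  // Pageable host buffer: cudaMemcpyAsync still works, but the copy is
  // staged through a driver buffer and generally performed synchronously.
  std::vector<float> pageable(bytes / sizeof(float), 1.0f);
  cudaMemcpyAsync(dev, pageable.data(), bytes, cudaMemcpyHostToDevice, stream);

  // Pinned host buffer: the same call can truly overlap with device work.
  float* pinned = nullptr;
  cudaMallocHost(&pinned, bytes);
  cudaMemcpyAsync(dev, pinned, bytes, cudaMemcpyHostToDevice, stream);

  cudaStreamSynchronize(stream);  // wait for both copies before cleanup
  cudaFreeHost(pinned);
  cudaFree(dev);
  cudaStreamDestroy(stream);
  printf("copies done\n");
  return 0;
}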
14 changes: 5 additions & 9 deletions onnxruntime/core/providers/cuda/gpu_data_transfer.cc
@@ -7,10 +7,6 @@
#include "cuda_common.h"

namespace onnxruntime {
GPUDataTransfer::GPUDataTransfer() {}

GPUDataTransfer::~GPUDataTransfer() {}

bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const {
return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::CUDA_PINNED ||
dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED;
@@ -30,19 +26,17 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const
// Copy only if the two addresses are different.
if (dst_data != src_data) {
CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyDeviceToDevice));
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(nullptr));
}
} else {
// copy from other CPU memory to GPU, this is blocking
CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyHostToDevice));
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(nullptr));
}
} else if (src_device.Type() == OrtDevice::GPU) {
// copying from GPU to CPU memory, this is blocking
CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyDeviceToHost));
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(nullptr));
} else {
// copying between cpu memory
ORT_ENFORCE(dst_data != src_data);
memcpy(dst_data, src_data, bytes);
}

@@ -59,7 +53,7 @@ common::Status GPUDataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst,

if (dst_device.Type() == OrtDevice::GPU) {
if (src_device.Type() == OrtDevice::CPU) {
// copy from pinned memory to GPU, this is non-blocking
// copy from pinned or non-pinned CPU memory to GPU
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, static_cast<cudaStream_t>(stream.GetHandle())));
} else if (src_device.Type() == OrtDevice::GPU) {
// copying between GPU, this is non-blocking
@@ -69,14 +63,16 @@ common::Status GPUDataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst,
}
} else if (src_device.Type() == OrtDevice::GPU) {
if (dst_device.Type() == OrtDevice::CPU) {
// copying from GPU to pinned memory, this is non-blocking
// copy from GPU to pinned or non-pinned CPU memory.
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, static_cast<cudaStream_t>(stream.GetHandle())));
}
} else {
if (src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
// sync the stream first to make sure the data arrived
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(static_cast<cudaStream_t>(stream.GetHandle())));
}

ORT_ENFORCE(dst_data != src_data);
memcpy(dst_data, src_data, bytes);
}

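Two behaviors in this file are worth spelling out. First, the cudaStreamSynchronize(nullptr) calls dropped from CopyTensor were redundant: cudaMemcpy blocks the host for copies involving pageable host memory, and device-to-device copies are ordered on the default stream, so later work on that stream still observes the result. A standalone sketch of the host-blocking case (assumed names, not ORT code):

#include <cuda_runtime.h>
#include <cassert>
#include <vector>

int main() {
  constexpr size_t n = 1024;
  std::vector<float> host(n, 2.0f), out(n, 0.0f);
  float* dev = nullptr;
  cudaMalloc(&dev, n * sizeof(float));

  // cudaMemcpy blocks until the transfer completes when pageable host
  // memory is involved, so no trailing cudaStreamSynchronize is needed.
  cudaMemcpy(dev, host.data(), n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(out.data(), dev, n * sizeof(float), cudaMemcpyDeviceToHost);
  assert(out[0] == 2.0f);

  cudaFree(dev);
  return 0;
}

Second, the CPU-to-CPU branch of CopyTensorAsync synchronizes the stream before the memcpy when the source is CUDA_PINNED, since a device-to-host copy into that pinned buffer may still be in flight on the stream.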
4 changes: 2 additions & 2 deletions onnxruntime/core/providers/cuda/gpu_data_transfer.h
@@ -10,8 +10,8 @@ namespace onnxruntime {

class GPUDataTransfer : public IDataTransfer {
public:
GPUDataTransfer();
~GPUDataTransfer();
GPUDataTransfer() = default;
~GPUDataTransfer() = default;

bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override;

38 changes: 24 additions & 14 deletions onnxruntime/core/providers/migraphx/gpu_data_transfer.cc
@@ -2,12 +2,16 @@
// Licensed under the MIT License.

#include "core/providers/shared_library/provider_api.h"
#include "gpu_data_transfer.h"
#include "migraphx_call.h"
#include "core/providers/migraphx/gpu_data_transfer.h"
#include "core/providers/migraphx/migraphx_call.h"

// If you make changes below, please also update onnxruntime/core/providers/rocm/gpu_data_transfer.cc

namespace onnxruntime {

bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const {
return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::HIP_PINNED || dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::HIP_PINNED;
return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::HIP_PINNED ||
dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::HIP_PINNED;
}

common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const {
@@ -23,17 +27,18 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const
if (src_device.Type() == OrtDevice::GPU) {
// Copy only if the two addresses are different.
if (dst_data != src_data) {
HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToDevice));
HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToDevice));
}
} else {
// copy from other CPU memory to GPU, this is blocking
HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyHostToDevice));
HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyHostToDevice));
}
} else if (src_device.Type() == OrtDevice::GPU) {
// copying from GPU to CPU memory, this is blocking
HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToHost));
HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToHost));
} else {
// copying between cpu memory
ORT_ENFORCE(dst_data != src_data);
memcpy(dst_data, src_data, bytes);
}

@@ -49,23 +54,28 @@ common::Status GPUDataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst,
auto& dst_device = dst.Location().device;

if (dst_device.Type() == OrtDevice::GPU) {
if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::HIP_PINNED) {
// copy from pinned memory to GPU, this is non-blocking
HIP_CALL_THROW(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyHostToDevice, static_cast<hipStream_t>(stream.GetHandle())));
if (src_device.Type() == OrtDevice::CPU) {
// If the source is not pinned, the memory copy will be performed synchronously.
// For best performance, use hipHostMalloc to allocate host memory that can be transferred asynchronously.
HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyHostToDevice, static_cast<hipStream_t>(stream.GetHandle())));
} else if (src_device.Type() == OrtDevice::GPU) {
// copying between GPU, this is non-blocking
HIP_CALL_THROW(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToDevice, static_cast<hipStream_t>(stream.GetHandle())));
} else {
// copy from other CPU memory to GPU, this is blocking
HIP_CALL_THROW(hipMemcpyWithStream(dst_data, src_data, bytes, hipMemcpyHostToDevice, static_cast<hipStream_t>(stream.GetHandle())));
}
} else if (src_device.Type() == OrtDevice::GPU) {
HIP_CALL_THROW(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast<hipStream_t>(stream.GetHandle())));
// If the destination is not pinned, the memory copy will be performed synchronously.
// For best performance, use hipHostMalloc to allocate host memory that can be transferred asynchronously.
HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast<hipStream_t>(stream.GetHandle())));
} else {
// copying between cpu memory
if (src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
// sync the stream first to make sure the data arrived
HIP_RETURN_IF_ERROR(hipStreamSynchronize(static_cast<hipStream_t>(stream.GetHandle())));
}
ORT_ENFORCE(dst_data != src_data);
memcpy(dst_data, src_data, bytes);
}

return Status::OK();
}

} // namespace onnxruntime
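The new comments recommend hipHostMalloc for asynchronous transfers. A minimal standalone HIP sketch (not part of this commit) of allocating pinned host memory so hipMemcpyAsync can actually run asynchronously:

#include <hip/hip_runtime.h>

int main() {
  constexpr size_t bytes = 1 << 20;
  hipStream_t stream;
  hipStreamCreate(&stream);

  void* dev = nullptr;
  hipMalloc(&dev, bytes);

  // Pinned host allocation; pageable memory would degrade the async copy
  // below to a synchronous one.
  void* pinned = nullptr;
  hipHostMalloc(&pinned, bytes, hipHostMallocDefault);

  // Truly non-blocking host-to-device copy on the given stream.
  hipMemcpyAsync(dev, pinned, bytes, hipMemcpyHostToDevice, stream);

  hipStreamSynchronize(stream);  // wait before freeing the buffers
  hipHostFree(pinned);
  hipFree(dev);
  hipStreamDestroy(stream);
  return 0;
}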
2 changes: 2 additions & 0 deletions onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
@@ -49,6 +49,8 @@ class Memcpy final : public OpKernel {
const IDataTransfer* gpu_data_transfer = Info().GetDataTransferManager().GetDataTransfer(X->Location().device, Y->Location().device);
if (!gpu_data_transfer)
return Status(common::ONNXRUNTIME, common::EP_FAIL, "gpu data transfer is missing in Migraphx EP.");
// CopyTensorAsync can handle both pinned and non-pinned CPU memory.
// For non-pinned CPU memory, the copy is synchronous.
return gpu_data_transfer->CopyTensorAsync(*X, *Y, *(ctx->GetComputeStream()));
}
};
19 changes: 8 additions & 11 deletions onnxruntime/core/providers/rocm/gpu_data_transfer.cc
@@ -6,10 +6,8 @@
#include "core/providers/rocm/gpu_data_transfer.h"
#include "core/providers/rocm/rocm_common.h"

// If you make changes below, please also update onnxruntime/core/providers/migraphx/gpu_data_transfer.cc
namespace onnxruntime {
GPUDataTransfer::GPUDataTransfer() {}

GPUDataTransfer::~GPUDataTransfer() {}

bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const {
return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::HIP_PINNED ||
@@ -30,19 +28,17 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const
// Copy only if the two addresses are different.
if (dst_data != src_data) {
HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToDevice));
HIP_RETURN_IF_ERROR(hipStreamSynchronize(nullptr));
}
} else {
// copy from other CPU memory to GPU, this is blocking
HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyHostToDevice));
HIP_RETURN_IF_ERROR(hipStreamSynchronize(nullptr));
}
} else if (src_device.Type() == OrtDevice::GPU) {
// copying from GPU to CPU memory, this is blocking
HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToHost));
HIP_RETURN_IF_ERROR(hipStreamSynchronize(nullptr));
} else {
// copying between cpu memory
ORT_ENFORCE(dst_data != src_data);
memcpy(dst_data, src_data, bytes);
}

@@ -59,7 +55,8 @@ common::Status GPUDataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst,

if (dst_device.Type() == OrtDevice::GPU) {
if (src_device.Type() == OrtDevice::CPU) {
// copy from pinned memory to GPU, this is non-blocking
// If the source is not pinned, the memory copy will be performed synchronously.
// For best performance, use hipHostMalloc to allocate host memory that can be transferred asynchronously.
HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyHostToDevice, static_cast<hipStream_t>(stream.GetHandle())));
} else if (src_device.Type() == OrtDevice::GPU) {
// copying between GPU, this is non-blocking
@@ -68,15 +65,15 @@ common::Status GPUDataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst,
}
}
} else if (src_device.Type() == OrtDevice::GPU) {
if (dst_device.Type() == OrtDevice::CPU) {
// copying from GPU to pinned memory, this is non-blocking
HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast<hipStream_t>(stream.GetHandle())));
}
// If the destination is not pinned, the memory copy will be performed synchronously.
// For best performance, use hipHostMalloc to allocate host memory that can be transferred asynchronously.
HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast<hipStream_t>(stream.GetHandle())));
} else {
if (src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
// sync the stream first to make sure the data arrived
HIP_RETURN_IF_ERROR(hipStreamSynchronize(static_cast<hipStream_t>(stream.GetHandle())));
}
ORT_ENFORCE(dst_data != src_data);
memcpy(dst_data, src_data, bytes);
}

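One subtlety in the CPU-to-CPU branch above: when the source is pinned, an earlier device-to-host hipMemcpyAsync into that buffer may still be in flight on the stream, so the code synchronizes before the plain memcpy. A sketch of the hazard (standalone; names are hypothetical):

#include <hip/hip_runtime.h>
#include <cstring>

// Copy GPU data to a pageable host buffer via a pinned staging buffer.
// Without the hipStreamSynchronize, the memcpy could read the staging
// buffer before the async device-to-host copy has finished writing it.
void staged_d2h_copy(const void* dev_src, void* pinned_staging,
                     void* pageable_dst, size_t bytes, hipStream_t stream) {
  hipMemcpyAsync(pinned_staging, dev_src, bytes, hipMemcpyDeviceToHost, stream);
  hipStreamSynchronize(stream);  // make sure the data arrived
  memcpy(pageable_dst, pinned_staging, bytes);
}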
4 changes: 2 additions & 2 deletions onnxruntime/core/providers/rocm/gpu_data_transfer.h
@@ -10,8 +10,8 @@ namespace onnxruntime {

class GPUDataTransfer : public IDataTransfer {
public:
GPUDataTransfer();
~GPUDataTransfer();
GPUDataTransfer() = default;
~GPUDataTransfer() = default;

bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override;

5 changes: 2 additions & 3 deletions onnxruntime/core/providers/rocm/rocm_execution_provider.cc
@@ -41,10 +41,9 @@ class Memcpy final : public OpKernel {
ORT_ENFORCE(X != nullptr, "Memcpy: Input tensor is nullptr.");
Tensor* Y = ctx->Output(0, X->Shape());
ORT_ENFORCE(Y != nullptr, "Memcpy: Failed to allocate output tensor.");
// do we support async copy?
// The rocmMemCpyAsync will handle the pinned memory and non-pinned memory,
// so we don't need the check here.
auto* gpu_data_transfer = Info().GetDataTransferManager().GetDataTransfer(X->Location().device, Y->Location().device);
// CopyTensorAsync can handle both pinned and non-pinned CPU memory.
// For non-pinned CPU memory, the copy is synchronous.
ORT_RETURN_IF_ERROR(gpu_data_transfer->CopyTensorAsync(*X, *Y, *ctx->GetComputeStream()));
return Status::OK();
} else {
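With these changes, the Memcpy kernels in the CUDA, MIGraphX, and ROCm EPs share one dispatch pattern; a condensed paraphrase of the code shown in this diff (error handling abbreviated, not a verbatim copy):

Status Compute(OpKernelContext* ctx) const {
  const auto* X = ctx->Input<Tensor>(0);
  ORT_ENFORCE(X != nullptr, "Memcpy: Input tensor is nullptr.");
  Tensor* Y = ctx->Output(0, X->Shape());
  ORT_ENFORCE(Y != nullptr, "Memcpy: Failed to allocate output tensor.");
  const IDataTransfer* gpu_data_transfer =
      Info().GetDataTransferManager().GetDataTransfer(X->Location().device, Y->Location().device);
  if (!gpu_data_transfer)
    return Status(common::ONNXRUNTIME, common::EP_FAIL, "gpu data transfer is missing");
  // CopyTensorAsync handles pinned and non-pinned CPU memory alike;
  // non-pinned copies simply complete synchronously.
  return gpu_data_transfer->CopyTensorAsync(*X, *Y, *(ctx->GetComputeStream()));
}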
