consolidate gpu data transfer
tianleiwu committed Oct 25, 2024
1 parent b4afc62 commit 5aa867a
Showing 8 changed files with 45 additions and 42 deletions.
1 change: 0 additions & 1 deletion onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -50,7 +50,6 @@ class Memcpy final : public OpKernel {
ORT_ENFORCE(X != nullptr, "Memcpy: Input tensor is nullptr.");
Tensor* Y = ctx->Output(0, X->Shape());
ORT_ENFORCE(Y != nullptr, "Memcpy: Failed to allocate output tensor.");
// do we support async copy?
// cudaMemcpyAsync handles both pinned and non-pinned memory,
// so we don't need the check here.
auto* gpu_data_transfer = Info().GetDataTransferManager().GetDataTransfer(X->Location().device, Y->Location().device);
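For background on the comment above: cudaMemcpyAsync accepts both pinned and pageable host pointers, but only pinned transfers can overlap with device work; pageable transfers are staged by the driver and generally behave synchronously. A minimal standalone sketch (not part of this commit; buffer names are made up):

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
  constexpr size_t bytes = 1 << 20;
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  float* dev = nullptr;
  cudaMalloc(&dev, bytes);

  // Pageable host buffer: cudaMemcpyAsync still works, but the copy is
  // staged through a driver buffer and generally performed synchronously.
  std::vector<float> pageable(bytes / sizeof(float), 1.0f);
  cudaMemcpyAsync(dev, pageable.data(), bytes, cudaMemcpyHostToDevice, stream);

  // Pinned host buffer: the same call can truly overlap with device work.
  float* pinned = nullptr;
  cudaMallocHost(&pinned, bytes);
  cudaMemcpyAsync(dev, pinned, bytes, cudaMemcpyHostToDevice, stream);

  cudaStreamSynchronize(stream);  // wait for both copies before cleanup
  cudaFreeHost(pinned);
  cudaFree(dev);
  cudaStreamDestroy(stream);
  printf("copies done\n");
  return 0;
}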
14 changes: 5 additions & 9 deletions onnxruntime/core/providers/cuda/gpu_data_transfer.cc
@@ -7,10 +7,6 @@
#include "cuda_common.h"

namespace onnxruntime {
GPUDataTransfer::GPUDataTransfer() {}

GPUDataTransfer::~GPUDataTransfer() {}

bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const {
return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::CUDA_PINNED ||
dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED;
@@ -30,19 +26,17 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const
// Copy only if the two addresses are different.
if (dst_data != src_data) {
CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyDeviceToDevice));
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(nullptr));
}
} else {
// copy from other CPU memory to GPU, this is blocking
CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyHostToDevice));
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(nullptr));
}
} else if (src_device.Type() == OrtDevice::GPU) {
// copying from GPU to CPU memory, this is blocking
CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyDeviceToHost));
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(nullptr));
} else {
// copying between cpu memory
ORT_ENFORCE(dst_data != src_data);
memcpy(dst_data, src_data, bytes);
}

@@ -59,7 +53,7 @@ common::Status GPUDataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst,

if (dst_device.Type() == OrtDevice::GPU) {
if (src_device.Type() == OrtDevice::CPU) {
// copy from pinned memory to GPU, this is non-blocking
// copy from pinned or non-pinned CPU memory to GPU
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, static_cast<cudaStream_t>(stream.GetHandle())));
} else if (src_device.Type() == OrtDevice::GPU) {
// copying between GPU, this is non-blocking
@@ -69,14 +63,16 @@ common::Status GPUDataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst,
}
} else if (src_device.Type() == OrtDevice::GPU) {
if (dst_device.Type() == OrtDevice::CPU) {
// copying from GPU to pinned memory, this is non-blocking
// copy from GPU to pinned or non-pinned CPU memory.
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, static_cast<cudaStream_t>(stream.GetHandle())));
}
} else {
if (src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
// sync the stream first to make sure the data arrived
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(static_cast<cudaStream_t>(stream.GetHandle())));
}

ORT_ENFORCE(dst_data != src_data);
memcpy(dst_data, src_data, bytes);
}

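Two behaviors in this file are worth spelling out. First, the cudaStreamSynchronize(nullptr) calls dropped from CopyTensor were redundant: cudaMemcpy blocks the host for copies involving pageable host memory, and device-to-device copies are ordered on the default stream, so later work on that stream still observes the result. A standalone sketch of the host-blocking case (assumed names, not ORT code):

#include <cuda_runtime.h>
#include <cassert>
#include <vector>

int main() {
  constexpr size_t n = 1024;
  std::vector<float> host(n, 2.0f), out(n, 0.0f);
  float* dev = nullptr;
  cudaMalloc(&dev, n * sizeof(float));

  // cudaMemcpy blocks until the transfer completes when pageable host
  // memory is involved, so no trailing cudaStreamSynchronize is needed.
  cudaMemcpy(dev, host.data(), n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(out.data(), dev, n * sizeof(float), cudaMemcpyDeviceToHost);
  assert(out[0] == 2.0f);

  cudaFree(dev);
  return 0;
}

Second, the CPU-to-CPU branch of CopyTensorAsync synchronizes the stream before the memcpy when the source is CUDA_PINNED, since a device-to-host copy into that pinned buffer may still be in flight on the stream.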
4 changes: 2 additions & 2 deletions onnxruntime/core/providers/cuda/gpu_data_transfer.h
@@ -10,8 +10,8 @@ namespace onnxruntime {

class GPUDataTransfer : public IDataTransfer {
public:
GPUDataTransfer();
~GPUDataTransfer();
GPUDataTransfer() = default;
~GPUDataTransfer() = default;

bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override;

38 changes: 24 additions & 14 deletions onnxruntime/core/providers/migraphx/gpu_data_transfer.cc
@@ -2,12 +2,16 @@
// Licensed under the MIT License.

#include "core/providers/shared_library/provider_api.h"
#include "gpu_data_transfer.h"
#include "migraphx_call.h"
#include "core/providers/migraphx/gpu_data_transfer.h"
#include "core/providers/migraphx/migraphx_call.h"

// If you make changes below, please also update onnxruntime/core/providers/rocm/gpu_data_transfer.cc

namespace onnxruntime {

bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const {
return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::HIP_PINNED || dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::HIP_PINNED;
return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::HIP_PINNED ||
dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::HIP_PINNED;
}

common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const {
@@ -23,17 +27,18 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const
if (src_device.Type() == OrtDevice::GPU) {
// Copy only if the two addresses are different.
if (dst_data != src_data) {
HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToDevice));
HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToDevice));
}
} else {
// copy from other CPU memory to GPU, this is blocking
HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyHostToDevice));
HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyHostToDevice));
}
} else if (src_device.Type() == OrtDevice::GPU) {
// copying from GPU to CPU memory, this is blocking
HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToHost));
HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToHost));
} else {
// copying between cpu memory
ORT_ENFORCE(dst_data != src_data);
memcpy(dst_data, src_data, bytes);
}

@@ -49,23 +54,28 @@ common::Status GPUDataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst,
auto& dst_device = dst.Location().device;

if (dst_device.Type() == OrtDevice::GPU) {
if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::HIP_PINNED) {
// copy from pinned memory to GPU, this is non-blocking
HIP_CALL_THROW(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyHostToDevice, static_cast<hipStream_t>(stream.GetHandle())));
if (src_device.Type() == OrtDevice::CPU) {
// If the source is not pinned, the memory copy will be performed synchronously.
// For best performance, use hipHostMalloc to allocate host memory that can be transferred asynchronously.
HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyHostToDevice, static_cast<hipStream_t>(stream.GetHandle())));
} else if (src_device.Type() == OrtDevice::GPU) {
// copying between GPU, this is non-blocking
HIP_CALL_THROW(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToDevice, static_cast<hipStream_t>(stream.GetHandle())));
} else {
// copy from other CPU memory to GPU, this is blocking
HIP_CALL_THROW(hipMemcpyWithStream(dst_data, src_data, bytes, hipMemcpyHostToDevice, static_cast<hipStream_t>(stream.GetHandle())));
}
} else if (src_device.Type() == OrtDevice::GPU) {
HIP_CALL_THROW(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast<hipStream_t>(stream.GetHandle())));
// If the destination is not pinned, the memory copy will be performed synchronously.
// For best performance, use hipHostMalloc to allocate host memory that can be transferred asynchronously.
HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast<hipStream_t>(stream.GetHandle())));
} else {
// copying between cpu memory
if (src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
// sync the stream first to make sure the data arrived
HIP_RETURN_IF_ERROR(hipStreamSynchronize(static_cast<hipStream_t>(stream.GetHandle())));
}
ORT_ENFORCE(dst_data != src_data);
memcpy(dst_data, src_data, bytes);
}

return Status::OK();
}

} // namespace onnxruntime
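The new comments recommend hipHostMalloc for asynchronous transfers. A minimal standalone HIP sketch (not part of this commit) of allocating pinned host memory so hipMemcpyAsync can actually run asynchronously:

#include <hip/hip_runtime.h>

int main() {
  constexpr size_t bytes = 1 << 20;
  hipStream_t stream;
  hipStreamCreate(&stream);

  void* dev = nullptr;
  hipMalloc(&dev, bytes);

  // Pinned host allocation; pageable memory would degrade the async copy
  // below to a synchronous one.
  void* pinned = nullptr;
  hipHostMalloc(&pinned, bytes, hipHostMallocDefault);

  // Truly non-blocking host-to-device copy on the given stream.
  hipMemcpyAsync(dev, pinned, bytes, hipMemcpyHostToDevice, stream);

  hipStreamSynchronize(stream);  // wait before freeing the buffers
  hipHostFree(pinned);
  hipFree(dev);
  hipStreamDestroy(stream);
  return 0;
}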
2 changes: 2 additions & 0 deletions onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
@@ -49,6 +49,8 @@ class Memcpy final : public OpKernel {
const IDataTransfer* gpu_data_transfer = Info().GetDataTransferManager().GetDataTransfer(X->Location().device, Y->Location().device);
if (!gpu_data_transfer)
return Status(common::ONNXRUNTIME, common::EP_FAIL, "gpu data transfer is missing in Migraphx EP.");
// CopyTensorAsync can handle both pinned and non-pinned CPU memory.
// For non-pinned CPU memory, the copy is synchronous.
return gpu_data_transfer->CopyTensorAsync(*X, *Y, *(ctx->GetComputeStream()));
}
};
19 changes: 8 additions & 11 deletions onnxruntime/core/providers/rocm/gpu_data_transfer.cc
@@ -6,10 +6,8 @@
#include "core/providers/rocm/gpu_data_transfer.h"
#include "core/providers/rocm/rocm_common.h"

// If you make changes below, please also update onnxruntime/core/providers/migraphx/gpu_data_transfer.cc
namespace onnxruntime {
GPUDataTransfer::GPUDataTransfer() {}

GPUDataTransfer::~GPUDataTransfer() {}

bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const {
return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::HIP_PINNED ||
@@ -30,19 +28,17 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const
// Copy only if the two addresses are different.
if (dst_data != src_data) {
HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToDevice));
HIP_RETURN_IF_ERROR(hipStreamSynchronize(nullptr));
}
} else {
// copy from other CPU memory to GPU, this is blocking
HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyHostToDevice));
HIP_RETURN_IF_ERROR(hipStreamSynchronize(nullptr));
}
} else if (src_device.Type() == OrtDevice::GPU) {
// copying from GPU to CPU memory, this is blocking
HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToHost));
HIP_RETURN_IF_ERROR(hipStreamSynchronize(nullptr));
} else {
// copying between cpu memory
ORT_ENFORCE(dst_data != src_data);
memcpy(dst_data, src_data, bytes);
}

@@ -59,7 +55,8 @@ common::Status GPUDataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst,

if (dst_device.Type() == OrtDevice::GPU) {
if (src_device.Type() == OrtDevice::CPU) {
// copy from pinned memory to GPU, this is non-blocking
// If the source is not pinned, the memory copy will be performed synchronously.
// For best performance, use hipHostMalloc to allocate host memory that can be transferred asynchronously.
HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyHostToDevice, static_cast<hipStream_t>(stream.GetHandle())));
} else if (src_device.Type() == OrtDevice::GPU) {
// copying between GPU, this is non-blocking
@@ -68,15 +65,15 @@ common::Status GPUDataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst,
}
}
} else if (src_device.Type() == OrtDevice::GPU) {
if (dst_device.Type() == OrtDevice::CPU) {
// copying from GPU to pinned memory, this is non-blocking
HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast<hipStream_t>(stream.GetHandle())));
}
// If the destination is not pinned, the memory copy will be performed synchronously.
// For best performance, use hipHostMalloc to allocate host memory that can be transferred asynchronously.
HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast<hipStream_t>(stream.GetHandle())));
} else {
if (src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) {
// sync the stream first to make sure the data arrived
HIP_RETURN_IF_ERROR(hipStreamSynchronize(static_cast<hipStream_t>(stream.GetHandle())));
}
ORT_ENFORCE(dst_data != src_data);
memcpy(dst_data, src_data, bytes);
}

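One subtlety in the CPU-to-CPU branch above: when the source is pinned, an earlier device-to-host hipMemcpyAsync into that buffer may still be in flight on the stream, so the code synchronizes before the plain memcpy. A sketch of the hazard (standalone; names are hypothetical):

#include <hip/hip_runtime.h>
#include <cstring>

// Copy GPU data to a pageable host buffer via a pinned staging buffer.
// Without the hipStreamSynchronize, the memcpy could read the staging
// buffer before the async device-to-host copy has finished writing it.
void staged_d2h_copy(const void* dev_src, void* pinned_staging,
                     void* pageable_dst, size_t bytes, hipStream_t stream) {
  hipMemcpyAsync(pinned_staging, dev_src, bytes, hipMemcpyDeviceToHost, stream);
  hipStreamSynchronize(stream);  // make sure the data arrived
  memcpy(pageable_dst, pinned_staging, bytes);
}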
4 changes: 2 additions & 2 deletions onnxruntime/core/providers/rocm/gpu_data_transfer.h
@@ -10,8 +10,8 @@ namespace onnxruntime {

class GPUDataTransfer : public IDataTransfer {
public:
GPUDataTransfer();
~GPUDataTransfer();
GPUDataTransfer() = default;
~GPUDataTransfer() = default;

bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override;

5 changes: 2 additions & 3 deletions onnxruntime/core/providers/rocm/rocm_execution_provider.cc
@@ -41,10 +41,9 @@ class Memcpy final : public OpKernel {
ORT_ENFORCE(X != nullptr, "Memcpy: Input tensor is nullptr.");
Tensor* Y = ctx->Output(0, X->Shape());
ORT_ENFORCE(Y != nullptr, "Memcpy: Failed to allocate output tensor.");
// do we support async copy?
// The rocmMemCpyAsync will handle the pinned memory and non-pinned memory,
// so we don't need the check here.
auto* gpu_data_transfer = Info().GetDataTransferManager().GetDataTransfer(X->Location().device, Y->Location().device);
// CopyTensorAsync can handle both pinned and non-pinned CPU memory.
// For non-pinned CPU memory, the copy is synchronous.
ORT_RETURN_IF_ERROR(gpu_data_transfer->CopyTensorAsync(*X, *Y, *ctx->GetComputeStream()));
return Status::OK();
} else {
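With these changes, the Memcpy kernels in the CUDA, MIGraphX, and ROCm EPs share one dispatch pattern; a condensed paraphrase of the code shown in this diff (error handling abbreviated, not a verbatim copy):

Status Compute(OpKernelContext* ctx) const {
  const auto* X = ctx->Input<Tensor>(0);
  ORT_ENFORCE(X != nullptr, "Memcpy: Input tensor is nullptr.");
  Tensor* Y = ctx->Output(0, X->Shape());
  ORT_ENFORCE(Y != nullptr, "Memcpy: Failed to allocate output tensor.");
  const IDataTransfer* gpu_data_transfer =
      Info().GetDataTransferManager().GetDataTransfer(X->Location().device, Y->Location().device);
  if (!gpu_data_transfer)
    return Status(common::ONNXRUNTIME, common::EP_FAIL, "gpu data transfer is missing");
  // CopyTensorAsync handles pinned and non-pinned CPU memory alike;
  // non-pinned copies simply complete synchronously.
  return gpu_data_transfer->CopyTensorAsync(*X, *Y, *(ctx->GetComputeStream()));
}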
