From 2a74f77cd8d6a03edc9b228bc1968cd1559facff Mon Sep 17 00:00:00 2001
From: wangna11BD
Date: Thu, 16 May 2024 12:55:12 +0000
Subject: [PATCH 1/2] aclnn for argsort_grad

---
 backends/npu/kernels/argsort_grad_kernel.cc | 171 ++++++++++++++++++--
 1 file changed, 158 insertions(+), 13 deletions(-)

diff --git a/backends/npu/kernels/argsort_grad_kernel.cc b/backends/npu/kernels/argsort_grad_kernel.cc
index 5e204a901..92f930940 100644
--- a/backends/npu/kernels/argsort_grad_kernel.cc
+++ b/backends/npu/kernels/argsort_grad_kernel.cc
@@ -15,9 +15,28 @@
 #include "kernels/funcs/npu_funcs.h"
 #include "kernels/funcs/npu_op_runner.h"
 #include "paddle/phi/backends/custom/custom_context.h"
+#include "paddle/phi/kernels/funcs/tensor_formatter.h"
 
 namespace custom_kernel {
 
+template <typename T, typename Context>
+void CastKernel(const Context& dev_ctx,
+                const phi::DenseTensor& x,
+                phi::DataType dtype,
+                phi::DenseTensor* out);
+
+template <typename T, typename Context>
+void TransposeKernel(const Context& dev_ctx,
+                     const phi::DenseTensor& x,
+                     const std::vector<int>& axis,
+                     phi::DenseTensor* out);
+
+template <typename T, typename Context>
+void AddKernel(const Context& dev_ctx,
+               const phi::DenseTensor& x,
+               const phi::DenseTensor& y,
+               phi::DenseTensor* out);
+
 template <typename T, typename Context>
 static void TranposeNPU(const Context& dev_ctx,
                         const aclrtStream& stream,
@@ -34,12 +53,12 @@ static void TranposeNPU(const Context& dev_ctx,
 }
 
 template <typename T, typename Type, typename Context>
-static void FullAssignNPU(const Context& dev_ctx,
-                          const aclrtStream& stream,
-                          const phi::DDim in_dims,
-                          const phi::DenseTensor& input,
-                          const phi::DenseTensor& indices,
-                          phi::DenseTensor* t_out) {
+static void AclopFullAssignNPU(const Context& dev_ctx,
+                               const aclrtStream& stream,
+                               const phi::DDim in_dims,
+                               const phi::DenseTensor& input,
+                               const phi::DenseTensor& indices,
+                               phi::DenseTensor* t_out) {
   const int64_t input_height =
       phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
   const int64_t input_width = in_dims[in_dims.size() - 1];
@@ -86,6 +105,129 @@ static void FullAssignNPU(const Context& dev_ctx,
   runner.Run(stream);
 }
 
+template <typename T, typename Type, typename Context>
+static void FullAssignNPU(const Context& dev_ctx,
+                          const phi::DDim in_dims,
+                          const phi::DenseTensor& input,
+                          const phi::DenseTensor& indices,
+                          phi::DenseTensor* t_out) {
+  DO_COMPATIBILITY(
+      aclnnScatterNd,
+      (custom_kernel::AclopFullAssignNPU<T, Type, Context>(
+          dev_ctx, dev_ctx.stream(), in_dims, input, indices, t_out)));
+  const int64_t input_height =
+      phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
+  const int64_t input_width = in_dims[in_dims.size() - 1];
+
+  phi::DenseTensor input_tmp(input);
+  input_tmp.Resize(
+      phi::make_ddim(std::vector<int64_t>{input_height * input_width, 1}));
+
+  phi::DenseTensor indices_tmp(indices);
+  indices_tmp.Resize(
+      phi::make_ddim(std::vector<int64_t>{input_height, input_width}));
+
+  std::vector<Type> indexs_value;
+  for (Type i = 0; i < input_height; i++) {
+    indexs_value.push_back(i * input_width);
+  }
+  phi::DenseTensor indexs_tmp;
+  phi::DenseTensorMeta indexs_tmp_meta = {
+      indices.dtype(), phi::make_ddim(std::vector<int64_t>{input_height, 1})};
+  indexs_tmp.set_meta(indexs_tmp_meta);
+  dev_ctx.template Alloc<Type>(&indexs_tmp);
+  TensorFromVector(dev_ctx, indexs_value, dev_ctx, &indexs_tmp);
+  indexs_tmp.Resize(phi::make_ddim(std::vector<int64_t>{input_height, 1}));
+
+  phi::DenseTensor indices_index;
+  phi::DenseTensorMeta indices_index_meta = {indices.dtype(),
+                                             indices_tmp.dims()};
+  indices_index.set_meta(indices_index_meta);
+  dev_ctx.template Alloc<Type>(&indices_index);
+  custom_kernel::AddKernel<Type, Context>(
+      dev_ctx, indices_tmp, indexs_tmp, &indices_index);
+
+  indices_index.Resize(
+      phi::make_ddim(std::vector<int64_t>{input_height * input_width, 1}));
+
+  phi::DenseTensor indices_index_int;
+  phi::DenseTensorMeta meta = {phi::DataType::INT64, indices_index.dims()};
+  indices_index_int.set_meta(meta);
+  custom_kernel::CastKernel<Type, Context>(
+      dev_ctx, indices_index, phi::DataType::INT64, &indices_index_int);
+
+  dev_ctx.template Alloc<T>(t_out);
+  phi::DenseTensor out_tmp(*t_out);
+  out_tmp.Resize(input_tmp.dims());
+  EXEC_NPU_CMD(aclnnScatterNd,
+               dev_ctx,
+               input_tmp,
+               indices_index_int,
+               input_tmp,
+               out_tmp);
+  out_tmp.Resize(t_out->dims());
+}
+
+template <typename T, typename Context>
+void AclopArgsortGradKernel(const Context& dev_ctx,
+                            const phi::DenseTensor& indices,
+                            const phi::DenseTensor& input,
+                            const phi::DenseTensor& out_grad,
+                            int axis,
+                            bool descending,
+                            phi::DenseTensor* in_grad) {
+  auto stream = dev_ctx.stream();
+  auto in_dims = indices.dims();
+  auto rank = input.dims().size();
+  axis = (axis < 0) ? (in_dims.size() + axis) : axis;
+  dev_ctx.template Alloc<T>(in_grad);
+  if (out_grad.numel() == 0) return;
+
+  if (rank == 0) {
+    phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, in_grad);
+    return;
+  }
+
+  // Do full assign
+  if (axis == -1 || axis + 1 == in_dims.size()) {
+    AclopFullAssignNPU<T, int64_t, Context>(
+        dev_ctx, stream, in_dims, out_grad, indices, in_grad);
+  } else {
+    std::vector<int64_t> perm;
+    for (int64_t i = 0; i < in_dims.size(); i++) {
+      perm.emplace_back(i);
+    }
+    std::swap(perm[axis], perm[in_dims.size() - 1]);
+
+    std::vector<int64_t> shape;
+    for (size_t i = 0; i < perm.size(); i++) {
+      shape.emplace_back(in_dims[perm[i]]);
+    }
+    auto trans_dims = phi::make_ddim(shape);
+    phi::DenseTensor trans_dout;
+    phi::DenseTensor trans_ids;
+    phi::DenseTensorMeta trans_dout_meta = {out_grad.dtype(), trans_dims};
+    phi::DenseTensorMeta trans_ids_meta = {indices.dtype(), trans_dims};
+    trans_dout.set_meta(trans_dout_meta);
+    trans_ids.set_meta(trans_ids_meta);
+    dev_ctx.template Alloc<T>(&trans_dout);
+    dev_ctx.template Alloc<int64_t>(&trans_ids);
+
+    TranposeNPU<T>(dev_ctx, stream, &perm, out_grad, &trans_dout);
+    TranposeNPU<int64_t>(dev_ctx, stream, &perm, indices, &trans_ids);
+
+    phi::DenseTensor trans_dx;
+    phi::DenseTensorMeta trans_dx_meta = {out_grad.dtype(), trans_dims};
+    trans_dx.set_meta(trans_dx_meta);
+    dev_ctx.template Alloc<T>(&trans_dx);
+
+    AclopFullAssignNPU<T, int64_t, Context>(
+        dev_ctx, stream, trans_dims, trans_dout, trans_ids, &trans_dx);
+
+    TranposeNPU<T>(dev_ctx, stream, &perm, trans_dx, in_grad);
+  }
+}
+
 template <typename T, typename Context>
 void ArgsortGradKernel(const Context& dev_ctx,
                        const phi::DenseTensor& indices,
@@ -109,10 +251,10 @@ void ArgsortGradKernel(const Context& dev_ctx,
   // Do full assign
   if (axis == -1 || axis + 1 == in_dims.size()) {
     FullAssignNPU<T, int64_t, Context>(
-        dev_ctx, stream, in_dims, out_grad, indices, in_grad);
+        dev_ctx, in_dims, out_grad, indices, in_grad);
   } else {
-    std::vector<int64_t> perm;
-    for (int64_t i = 0; i < in_dims.size(); i++) {
+    std::vector<int> perm;
+    for (int i = 0; i < in_dims.size(); i++) {
       perm.emplace_back(i);
     }
     std::swap(perm[axis], perm[in_dims.size() - 1]);
@@ -131,8 +273,10 @@
     dev_ctx.template Alloc<T>(&trans_dout);
     dev_ctx.template Alloc<int64_t>(&trans_ids);
 
-    TranposeNPU<T>(dev_ctx, stream, &perm, out_grad, &trans_dout);
-    TranposeNPU<int64_t>(dev_ctx, stream, &perm, indices, &trans_ids);
+    custom_kernel::TransposeKernel<T, Context>(
+        dev_ctx, out_grad, perm, &trans_dout);
+    custom_kernel::TransposeKernel<int64_t, Context>(
+        dev_ctx, indices, perm, &trans_ids);
 
     phi::DenseTensor trans_dx;
     phi::DenseTensorMeta trans_dx_meta = {out_grad.dtype(), trans_dims};
@@ -140,9 +284,10 @@
     dev_ctx.template Alloc<T>(&trans_dx);
 
     FullAssignNPU<T, int64_t, Context>(
-        dev_ctx, stream, trans_dims, trans_dout, trans_ids, &trans_dx);
+        dev_ctx, trans_dims, trans_dout, trans_ids, &trans_dx);
 
-    TranposeNPU<T>(dev_ctx, stream, &perm, trans_dx, in_grad);
+    custom_kernel::TransposeKernel<T, Context>(
+        dev_ctx, trans_dx, perm, in_grad);
   }
 }
 

From baa3e69b971c736695ea9bc91fd048c565449b4b Mon Sep 17 00:00:00 2001
From: wangna11BD
Date: Mon, 20 May 2024 07:01:57 +0000
Subject: [PATCH 2/2] fix stable

---
 backends/npu/kernels/argsort_grad_kernel.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/backends/npu/kernels/argsort_grad_kernel.cc b/backends/npu/kernels/argsort_grad_kernel.cc
index 92f930940..9f503596b 100644
--- a/backends/npu/kernels/argsort_grad_kernel.cc
+++ b/backends/npu/kernels/argsort_grad_kernel.cc
@@ -175,6 +175,7 @@ void AclopArgsortGradKernel(const Context& dev_ctx,
                             const phi::DenseTensor& out_grad,
                             int axis,
                             bool descending,
+                            bool stable,
                             phi::DenseTensor* in_grad) {
   auto stream = dev_ctx.stream();
   auto in_dims = indices.dims();
@@ -236,6 +236,7 @@ void ArgsortGradKernel(const Context& dev_ctx,
                        const phi::DenseTensor& out_grad,
                        int axis,
                        bool descending,
+                       bool stable,
                        phi::DenseTensor* in_grad) {
   auto stream = dev_ctx.stream();
   auto in_dims = indices.dims();
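
Background on the dispatch pattern introduced in PATCH 1/2: kernels in this
backend first call DO_COMPATIBILITY, which falls back to the legacy aclop
implementation when the named aclnn symbol is missing from the installed CANN
runtime, and otherwise continue to an EXEC_NPU_CMD launch. A minimal sketch of
that shape is below; DO_COMPATIBILITY and EXEC_NPU_CMD are the macros the
patch itself uses, while aclnnFoo, FooKernel, and AclopFooKernel are
hypothetical placeholder names, not part of these commits.

    // Sketch only: aclnnFoo/FooKernel/AclopFooKernel are placeholders.
    template <typename T, typename Context>
    void FooKernel(const Context& dev_ctx,
                   const phi::DenseTensor& x,
                   phi::DenseTensor* out) {
      // If the aclnn symbol is unavailable, run the aclop path and return.
      DO_COMPATIBILITY(
          aclnnFoo,
          (custom_kernel::AclopFooKernel<T, Context>(dev_ctx, x, out)));
      dev_ctx.template Alloc<T>(out);
      // Otherwise launch the aclnn operator directly.
      EXEC_NPU_CMD(aclnnFoo, dev_ctx, x, *out);
    }

This is the same shape FullAssignNPU takes in PATCH 1/2: it guards on
aclnnScatterNd, routes to AclopFullAssignNPU when that symbol is absent, and
otherwise builds flattened indices and launches aclnnScatterNd via
EXEC_NPU_CMD.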