From eba4da5ef87ab62937619f979e252791b85ff718 Mon Sep 17 00:00:00 2001
From: Kris Hung
Date: Fri, 7 Jul 2023 14:41:36 -0700
Subject: [PATCH] Query the memory type for each request (#202)

---
 src/onnxruntime.cc | 93 +++++++++++++++++++---------------------------
 1 file changed, 38 insertions(+), 55 deletions(-)

diff --git a/src/onnxruntime.cc b/src/onnxruntime.cc
index 6d2e2c5..6fb9899 100644
--- a/src/onnxruntime.cc
+++ b/src/onnxruntime.cc
@@ -1682,71 +1682,54 @@ ModelInstanceState::ProcessRequests(
       TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
       int64_t memory_type_id = 0;
 
-      if (output_device_info_.size() !=
-          StateForModel()->ModelOutputs().size()) {
-        // Get data type for this output. If this is a string then
-        // use CPU for binding output otherwise, query the preferred location
-        // for this output and bind accordingly. In case of any errors we
-        // fallback to binding the output to CPU.
-        auto iit = output_tensor_infos_.find(output_name.first);
-        if (iit == output_tensor_infos_.end()) {
+      // Get data type for this output. If this is a string then
+      // use CPU for binding output otherwise, query the preferred location
+      // for this output and bind accordingly. In case of any errors we
+      // fallback to binding the output to CPU.
+      auto iit = output_tensor_infos_.find(output_name.first);
+      if (iit == output_tensor_infos_.end()) {
+        LOG_MESSAGE(
+            TRITONSERVER_LOG_VERBOSE,
+            (std::string(
+                 "Error while retrieving output data type. Using cpu "
+                 "as preferred location for output: " +
+                 output_name.first)
+                .c_str()));
+      } else if (iit->second.type_ != ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING) {
+        // Query the memory type of destination output buffer. Bind the
+        // output to this destination memory type. The destination memory type
+        // for an output for all requests should be same. So use any request
+        // for this query.
+        memory_type = preferred_memory_type;
+        memory_type_id = preferred_memory_type_id;
+        auto err = TRITONBACKEND_RequestOutputBufferProperties(
+            requests[0], output_name.first.c_str(), /*byte_size*/ nullptr,
+            &memory_type, &memory_type_id);
+
+        if (err != nullptr) {
           LOG_MESSAGE(
               TRITONSERVER_LOG_VERBOSE,
               (std::string(
-                   "Error while retrieving output data type. Using cpu "
-                   "as preferred location for output: " +
-                   output_name.first)
+                   "Output Properties Unavailable. Using cpu as "
+                   "preferred location for output: " +
+                   output_name.first +
+                   " Error: " + TRITONSERVER_ErrorMessage(err))
                   .c_str()));
-        } else if (iit->second.type_ != ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING) {
-          // Query the memory type of destination output buffer. Bind the
-          // output to this destination memory type. The destination memory type
-          // for an output for all requests should be same. So use any request
-          // for this query.
-          memory_type = preferred_memory_type;
-          memory_type_id = preferred_memory_type_id;
-          auto err = TRITONBACKEND_RequestOutputBufferProperties(
-              requests[0], output_name.first.c_str(), /*byte_size*/ nullptr,
-              &memory_type, &memory_type_id);
-
-          if (err != nullptr) {
-            LOG_MESSAGE(
-                TRITONSERVER_LOG_VERBOSE,
-                (std::string(
-                     "Output Properties Unavailable. Using cpu as "
-                     "preferred location for output: " +
-                     output_name.first)
-                    .c_str()));
-            memory_type = TRITONSERVER_MEMORY_CPU;
-            memory_type_id = 0;
-          }
-        }
-
-        // If the cuda allocator is not set, bind the output to CPU.
-        if (cuda_allocator_info_ == nullptr) {
          memory_type = TRITONSERVER_MEMORY_CPU;
          memory_type_id = 0;
        }
+      }
 
-        // finally save the derived mem type and device id as we need it for
-        // reading the outputs.
-        output_device_info_.insert(
-            {output_name.first, {memory_type, memory_type_id}});
-      } else {
-        auto output_device_info_iter =
-            output_device_info_.find(output_name.first);
-        if (output_device_info_iter == output_device_info_.end()) {
-          RESPOND_ALL_AND_SET_TRUE_IF_ERROR(
-              responses, request_count, all_response_failed,
-              TRITONSERVER_ErrorNew(
-                  TRITONSERVER_ERROR_INTERNAL,
-                  (std::string("device info for output tensor '") +
-                   output_name.first + "' not found")
-                      .c_str()));
-        }
-        memory_type = output_device_info_iter->second.first;
-        memory_type_id = output_device_info_iter->second.second;
+      // If the cuda allocator is not set, bind the output to CPU.
+      if (cuda_allocator_info_ == nullptr) {
+        memory_type = TRITONSERVER_MEMORY_CPU;
+        memory_type_id = 0;
      }
 
+      // finally save the derived mem type and device id as we need it for
+      // reading the outputs.
+      output_device_info_[output_name.first] = {memory_type, memory_type_id};
+
       RESPOND_ALL_AND_SET_TRUE_IF_ORT_ERROR(
           responses, request_count, all_response_failed,
           ort_api->BindOutputToDevice(
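
The hunk above boils down to a query-then-fall-back chain for choosing where to bind each non-string output: start from the instance's preferred memory type, let the destination buffer reported by the first request override it, and drop back to CPU when the properties are unavailable or no CUDA allocator is configured. The standalone sketch below restates that chain under assumed names: GetOutputMemoryType and have_cuda_allocator are hypothetical, while TRITONBACKEND_RequestOutputBufferProperties, TRITONSERVER_MemoryType, and TRITONSERVER_ErrorDelete come from the Triton core headers.

#include "triton/core/tritonbackend.h"
#include "triton/core/tritonserver.h"

// Hypothetical helper mirroring the patch's fallback chain (not part of
// the patch itself).
static void
GetOutputMemoryType(
    TRITONBACKEND_Request* request, const char* output_name,
    TRITONSERVER_MemoryType preferred_memory_type,
    int64_t preferred_memory_type_id, bool have_cuda_allocator,
    TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id)
{
  *memory_type = preferred_memory_type;
  *memory_type_id = preferred_memory_type_id;

  // The destination is expected to be the same for every request in the
  // batch, so querying any single request is sufficient.
  TRITONSERVER_Error* err = TRITONBACKEND_RequestOutputBufferProperties(
      request, output_name, /*byte_size*/ nullptr, memory_type,
      memory_type_id);
  if (err != nullptr) {
    // Properties unavailable: bind the output to CPU instead. Releasing
    // the error here is an addition for hygiene; the patch only logs it.
    *memory_type = TRITONSERVER_MEMORY_CPU;
    *memory_type_id = 0;
    TRITONSERVER_ErrorDelete(err);
  }

  // Without a CUDA allocator the output cannot be bound to GPU memory,
  // so force CPU regardless of what the query returned.
  if (!have_cuda_allocator) {
    *memory_type = TRITONSERVER_MEMORY_CPU;
    *memory_type_id = 0;
  }
}

Running this chain per request batch, rather than caching the first answer in output_device_info_ as the removed code did, is the point of the patch: a later request may want the same output delivered to a different device, so the derived type and id are recomputed and overwritten on every pass.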