triton-inference-server · yinggeh · Oct 23, 2024 · Oct 11, 2024 · Oct 15, 2024 · Oct 16, 2024
diff --git a/docs/user_guide/metrics.md b/docs/user_guide/metrics.md
@@ -204,6 +204,42 @@ metrics are used for latencies:
 
 To disable these metrics specifically, you can set `--metrics-config counter_latencies=false`
 
+#### Histograms
+
+> **Note**
+>
+> The following Histogram feature is experimental for the time being and may be
+> subject to change based on user feedback.
+
+The following
+[Histogram](https://prometheus.io/docs/concepts/metric_types/#histogram)
+metrics are available for latencies:
+
+|Category      |Metric          |Metric Name |Description                |Granularity|Frequency    |Model Type   |
+|--------------|----------------|------------|---------------------------|-----------|-------------|-------------|
+|Latency       |Request to First Response Time    |`nv_inference_first_response_histogram_ms` |Histogram of end-to-end inference request to the first response time |Per model  |Per request  | Decoupled |
+
+To enable these metrics specifically, you can set `--metrics-config histogram_latencies=true`
+
+Each histogram above is composed of several sub-metrics. For each
+metric, there is a set of `le` metrics tracking the counter for each
+bucket. Additionally, there are `_count` and `_sum` metrics that report
+the number of observations and the sum of the observed values,
+respectively. For example, see the following information exposed by the
+Request to First Response Time histogram metric:
+```
+# HELP nv_inference_first_response_histogram_ms Duration from request to first response in milliseconds
+# TYPE nv_inference_first_response_histogram_ms histogram
+nv_inference_first_response_histogram_ms_count{model="my_model",version="1"} 101
+nv_inference_first_response_histogram_ms_sum{model="my_model",version="1"} 3685
+nv_inference_first_response_histogram_ms_bucket{model="my_model",version="1", le="10"} 55
+nv_inference_first_response_histogram_ms_bucket{model="my_model",version="1", le="100"} 97
+nv_inference_first_response_histogram_ms_bucket{model="my_model",version="1", le="500"} 98
+nv_inference_first_response_histogram_ms_bucket{model="my_model",version="1", le="1000"} 101
+nv_inference_first_response_histogram_ms_bucket{model="my_model",version="1", le="+Inf"} 101
+```
+
+Triton initializes histograms with default buckets for each, as shown above. Customization of buckets per metric is currently unsupported.
+
 #### Summaries
 
 > **Note**

diff --git a/qa/L0_metrics/ensemble_decoupled/async_execute_decouple/1/model.py b/qa/L0_metrics/ensemble_decoupled/async_execute_decouple/1/model.py
@@ -0,0 +1,76 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import asyncio
+
+import numpy as np
+import triton_python_backend_utils as pb_utils
+
+
+class TritonPythonModel:
+    async def execute(self, requests):
+        processed_requests = []
+        async_tasks = []
+        for request in requests:
+            wait_secs_tensor = pb_utils.get_input_tensor_by_name(
+                request, "WAIT_SECONDS"
+            ).as_numpy()
+            for wait_secs in wait_secs_tensor:
+                if wait_secs < 0:
+                    self.raise_value_error(requests)
+                async_tasks.append(asyncio.create_task(asyncio.sleep(wait_secs)))
+            processed_requests.append(
+                {
+                    "wait_secs": wait_secs,
+                    "response_sender": request.get_response_sender(),
+                }
+            )
+
+        # This decoupled execute should be scheduled to run in the background
+        # concurrently with other instances of decoupled execute, as long as the event
+        # loop is not blocked.
+        await asyncio.gather(*async_tasks)
+
+        for p_req in processed_requests:
+            wait_secs = p_req["wait_secs"]
+            response_sender = p_req["response_sender"]
+
+            output_tensors = pb_utils.Tensor(
+                "DUMMY_OUT", np.array([wait_secs], np.float32)
+            )
+            response = pb_utils.InferenceResponse(output_tensors=[output_tensors])
+            response_sender.send(
+                response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
+            )
+
+        return None
+
+    def raise_value_error(self, requests):
+        # TODO: Model may raise exception without sending complete final
+        for request in requests:
+            response_sender = request.get_response_sender()
+            response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
+        raise ValueError("wait_secs cannot be negative")
diff --git a/qa/L0_metrics/ensemble_decoupled/async_execute_decouple/config.pbtxt b/qa/L0_metrics/ensemble_decoupled/async_execute_decouple/config.pbtxt
@@ -0,0 +1,45 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Python-backend model; its implementation is in 1/model.py.
+backend: "python"
+# WAIT_SECONDS: value(s) that model.py passes to asyncio.sleep().
+input [
+  {
+    name: "WAIT_SECONDS"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+  }
+]
+# DUMMY_OUT: model.py echoes the wait value back in this tensor.
+output [
+  {
+    name: "DUMMY_OUT"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+  }
+]
+
+# Run the model instance on CPU.
+instance_group [{ kind: KIND_CPU }]
+# Decoupled policy: model.py delivers responses through a response sender
+# instead of returning them from execute().
+model_transaction_policy { decoupled: True }
+
diff --git a/qa/L0_metrics/ensemble_decoupled/ensemble/config.pbtxt b/qa/L0_metrics/ensemble_decoupled/ensemble/config.pbtxt
@@ -0,0 +1,72 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Two-step ensemble: INPUT is fed to "async_execute_decouple", whose output
+# ("temp_output") is fed to "async_execute", whose output becomes OUTPUT.
+name: "ensemble"
+platform: "ensemble"
+input [
+ {
+  name: "INPUT"
+  data_type: TYPE_FP32
+  dims: [ 1 ]
+ }
+]
+output [
+ {
+  name: "OUTPUT"
+  data_type: TYPE_FP32
+  dims: [ 1 ]
+ }
+]
+ensemble_scheduling {
+ step [
+  {
+   # decoupled model
+   # Step 1: ensemble INPUT -> WAIT_SECONDS; DUMMY_OUT -> temp_output.
+   model_name: "async_execute_decouple"
+   model_version: 1
+   input_map {
+    key: "WAIT_SECONDS"
+    value: "INPUT"
+   }
+   output_map {
+    key: "DUMMY_OUT"
+    value: "temp_output"
+   }
+  },
+  {
+   # non-decoupled model
+   # Step 2: temp_output -> WAIT_SECONDS; DUMMY_OUT -> ensemble OUTPUT.
+   model_name: "async_execute"
+   model_version: 1
+   input_map {
+    key: "WAIT_SECONDS"
+    value: "temp_output"
+   }
+   output_map {
+    key: "DUMMY_OUT"
+    value: "OUTPUT"
+   }
+  }
+ ]
+}