Add histogram support and new TTFT metric
yinggeh committed Oct 11, 2024
1 parent 623d0a5 commit 0c2893d
Showing 11 changed files with 198 additions and 27 deletions.
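In short, the diff wires a time-to-first-response (TTFT) histogram into the response path: InferenceResponseFactory records the request start time, each InferenceResponse carries a sequence number, and the first response of a request observes the elapsed milliseconds into a per-model histogram. Below is a minimal standalone sketch of that timing convention; the NowNs helper and main function are illustrative, not Triton code.

#include <chrono>
#include <cstdint>

// Illustrative helper (not from the commit): steady_clock timestamp in ns,
// matching the convention used by the factory and UpdateResponseMetrics().
static uint64_t
NowNs()
{
  return std::chrono::duration_cast<std::chrono::nanoseconds>(
             std::chrono::steady_clock::now().time_since_epoch())
      .count();
}

int
main()
{
  const uint64_t infer_start_ns = NowNs();  // captured when the factory is built
  // ... model executes and produces its first response ...
  const uint64_t first_response_ns = NowNs();
  // The diff reports the delta in milliseconds (integer division by 1000000).
  const uint64_t ttft_ms = (first_response_ns - infer_start_ns) / 1000000;
  (void)ttft_ms;  // in the commit this value goes to ObserveHistogram("first_response_histogram", ...)
  return 0;
}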
2 changes: 1 addition & 1 deletion src/backend_model_instance.cc
@@ -192,7 +192,7 @@ TritonModelInstance::TritonModelInstance(
model_->Server()->ResponseCacheEnabled();
MetricModelReporter::Create(
model_->ModelId(), model_->Version(), id, response_cache_enabled,
model_->Config().metric_tags(), &reporter_);
model_->Decoupled(), model_->Config().metric_tags(), &reporter_);
}
#endif // TRITON_ENABLE_METRICS
}
5 changes: 3 additions & 2 deletions src/ensemble_scheduler/ensemble_scheduler.cc
@@ -1469,12 +1469,13 @@ EnsembleScheduler::EnsembleScheduler(
}
#endif // TRITON_ENABLE_GPU

const bool is_decoupled = config.model_transaction_policy().decoupled();
#ifdef TRITON_ENABLE_METRICS
if (Metrics::Enabled()) {
// Ensemble scheduler doesn't currently support response cache at top level.
MetricModelReporter::Create(
model_id, 1 /* model_version */, METRIC_REPORTER_ID_CPU,
false /* response_cache_enabled */, config.metric_tags(),
false /* response_cache_enabled */, is_decoupled, config.metric_tags(),
&metric_reporter_);
}
#endif // TRITON_ENABLE_METRICS
@@ -1485,7 +1486,7 @@ EnsembleScheduler::EnsembleScheduler(
info_->ensemble_name_ = config.name();

// This config field is filled internally for ensemble models
info_->is_decoupled_ = config.model_transaction_policy().decoupled();
info_->is_decoupled_ = is_decoupled;

// field to check if response cache enabled in the ensemble model config.
info_->is_cache_enabled_ =
53 changes: 47 additions & 6 deletions src/infer_response.cc
@@ -1,4 +1,4 @@
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -38,14 +38,20 @@ namespace triton { namespace core {
//
Status
InferenceResponseFactory::CreateResponse(
std::unique_ptr<InferenceResponse>* response) const
std::unique_ptr<InferenceResponse>* response)
{
response->reset(new InferenceResponse(
model_, id_, allocator_, alloc_userp_, response_fn_, response_userp_,
response_delegator_));
response_delegator_, response_cnt_
#ifdef TRITON_ENABLE_METRICS
,
infer_start_ns_
#endif // TRITON_ENABLE_METRICS
));
#ifdef TRITON_ENABLE_TRACING
(*response)->SetTrace(trace_);
#endif // TRITON_ENABLE_TRACING
response_cnt_++;
return Status::Success;
}

@@ -72,10 +78,20 @@ InferenceResponse::InferenceResponse(
TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
void* response_userp,
const std::function<
void(std::unique_ptr<InferenceResponse>&&, const uint32_t)>& delegator)
void(std::unique_ptr<InferenceResponse>&&, const uint32_t)>& delegator,
uint64_t seq_num
#ifdef TRITON_ENABLE_METRICS
,
uint64_t infer_start_ns
#endif // TRITON_ENABLE_METRICS
)
: model_(model), id_(id), allocator_(allocator), alloc_userp_(alloc_userp),
response_fn_(response_fn), response_userp_(response_userp),
response_delegator_(delegator), null_response_(false)
response_delegator_(delegator), seq_num_(seq_num),
#ifdef TRITON_ENABLE_METRICS
infer_start_ns_(infer_start_ns),
#endif // TRITON_ENABLE_METRICS
null_response_(false)
{
// If the allocator has a start_fn then invoke it.
TRITONSERVER_ResponseAllocatorStartFn_t start_fn = allocator_->StartFn();
@@ -92,7 +108,10 @@ InferenceResponse::InferenceResponse(
InferenceResponse::InferenceResponse(
TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
void* response_userp)
: response_fn_(response_fn), response_userp_(response_userp),
: response_fn_(response_fn), response_userp_(response_userp), seq_num_(0),
#ifdef TRITON_ENABLE_METRICS
infer_start_ns_(0),
#endif // TRITON_ENABLE_METRICS
null_response_(true)
{
}
@@ -214,6 +233,10 @@ InferenceResponse::Send(
TRITONSERVER_TRACE_TENSOR_BACKEND_OUTPUT, "InferenceResponse Send");
#endif // TRITON_ENABLE_TRACING

#ifdef TRITON_ENABLE_METRICS
response->UpdateResponseMetrics();
#endif // TRITON_ENABLE_METRICS

if (response->response_delegator_ != nullptr) {
auto ldelegator = std::move(response->response_delegator_);
ldelegator(std::move(response), flags);
@@ -282,6 +305,24 @@ InferenceResponse::TraceOutputTensors(
}
#endif // TRITON_ENABLE_TRACING

#ifdef TRITON_ENABLE_METRICS
void
InferenceResponse::UpdateResponseMetrics() const
{
if (model_ != nullptr && seq_num_ == 0) {
auto first_response_ns =
std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::steady_clock::now().time_since_epoch())
.count();
if (auto reporter = model_->MetricReporter()) {
reporter->ObserveHistogram(
"first_response_histogram",
(first_response_ns - infer_start_ns_) / 1000000);
}
}
}
#endif // TRITON_ENABLE_METRICS

//
// InferenceResponse::Output
//
35 changes: 32 additions & 3 deletions src/infer_response.h
@@ -60,12 +60,17 @@ class InferenceResponseFactory {
: model_(model), id_(id), allocator_(allocator),
alloc_userp_(alloc_userp), response_fn_(response_fn),
response_userp_(response_userp), response_delegator_(delegator),
is_cancelled_(false)
is_cancelled_(false), response_cnt_(0)
#ifdef TRITON_ENABLE_STATS
,
response_stats_index_(0)
#endif // TRITON_ENABLE_STATS
{
#ifdef TRITON_ENABLE_METRICS
infer_start_ns_ = std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::steady_clock::now().time_since_epoch())
.count();
#endif // TRITON_ENABLE_METRICS
}

void Cancel() { is_cancelled_ = true; }
@@ -84,7 +89,7 @@ class InferenceResponseFactory {
}

// Create a new response.
Status CreateResponse(std::unique_ptr<InferenceResponse>* response) const;
Status CreateResponse(std::unique_ptr<InferenceResponse>* response);

// Send a "null" response with 'flags'.
Status SendFlags(const uint32_t flags) const;
@@ -134,6 +139,14 @@ class InferenceResponseFactory {

std::atomic<bool> is_cancelled_;

// The number of responses created by this factory.
std::atomic<uint64_t> response_cnt_;

#ifdef TRITON_ENABLE_METRICS
// The start time of the associated request, in ns.
uint64_t infer_start_ns_;
#endif // TRITON_ENABLE_METRICS

#ifdef TRITON_ENABLE_TRACING
// Inference trace associated with this response.
std::shared_ptr<InferenceTraceProxy> trace_;
@@ -247,7 +260,13 @@ class InferenceResponse {
TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
void* response_userp,
const std::function<void(
std::unique_ptr<InferenceResponse>&&, const uint32_t)>& delegator);
std::unique_ptr<InferenceResponse>&&, const uint32_t)>& delegator,
uint64_t seq_num
#ifdef TRITON_ENABLE_METRICS
,
uint64_t infer_start_ns
#endif // TRITON_ENABLE_METRICS
);

// "null" InferenceResponse is a special instance of InferenceResponse which
// contains minimal information for calling InferenceResponse::Send,
@@ -324,6 +343,11 @@ class InferenceResponse {
TRITONSERVER_InferenceTraceActivity activity, const std::string& msg);
#endif // TRITON_ENABLE_TRACING


#ifdef TRITON_ENABLE_METRICS
void UpdateResponseMetrics() const;
#endif // TRITON_ENABLE_METRICS

// The model associated with this factory. For normal
// requests/responses this will always be defined and acts to keep
// the model loaded as long as this factory is live. It may be
@@ -358,6 +382,11 @@ class InferenceResponse {
std::function<void(std::unique_ptr<InferenceResponse>&&, const uint32_t)>
response_delegator_;

const uint64_t seq_num_;
#ifdef TRITON_ENABLE_METRICS
const uint64_t infer_start_ns_;
#endif // TRITON_ENABLE_METRICS

bool null_response_;

#ifdef TRITON_ENABLE_TRACING
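The factory hands its running response_cnt_ to each new response as seq_num, so only the first response of a request (seq_num_ == 0) reports TTFT. For reference, a small sketch of that counter pattern in isolation, using the conventional fetch_add idiom rather than the exact code above:

#include <atomic>
#include <cstdint>

// Illustrative sketch, not code from the commit: an atomic counter hands out
// 0 to the first response, 1 to the second, and so on, even if responses are
// created from multiple threads.
std::atomic<uint64_t> response_cnt{0};

uint64_t
NextSeqNum()
{
  // fetch_add returns the previous value, so the first caller sees 0.
  return response_cnt.fetch_add(1);
}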
65 changes: 60 additions & 5 deletions src/metric_model_reporter.cc
@@ -41,7 +41,8 @@ namespace triton { namespace core {
// MetricReporterConfig
//
void
MetricReporterConfig::ParseConfig(bool response_cache_enabled)
MetricReporterConfig::ParseConfig(
bool response_cache_enabled, bool is_decoupled)
{
// Global config only for now in config map
auto metrics_config_map = Metrics::ConfigMap();
@@ -53,6 +54,10 @@ MetricReporterConfig::ParseConfig(bool response_cache_enabled)
latency_counters_enabled_ = false;
}

if (pair.first == "histogram_latencies" && pair.second == "false") {
latency_histograms_enabled_ = false;
}

if (pair.first == "summary_latencies" && pair.second == "true") {
latency_summaries_enabled_ = true;
}
@@ -68,6 +73,7 @@ MetricReporterConfig::ParseConfig(bool response_cache_enabled)

// Set flag to signal to stats aggregator if caching is enabled or not
cache_enabled_ = response_cache_enabled;
is_decoupled_ = is_decoupled;
}

prometheus::Summary::Quantiles
@@ -112,7 +118,7 @@ const std::map<FailureReason, std::string>
Status
MetricModelReporter::Create(
const ModelIdentifier& model_id, const int64_t model_version,
const int device, bool response_cache_enabled,
const int device, bool response_cache_enabled, bool is_decoupled,
const triton::common::MetricTagsMap& model_tags,
std::shared_ptr<MetricModelReporter>* metric_model_reporter)
{
@@ -141,25 +147,27 @@ MetricModelReporter::Create(
}

metric_model_reporter->reset(new MetricModelReporter(
model_id, model_version, device, response_cache_enabled, model_tags));
model_id, model_version, device, response_cache_enabled, is_decoupled,
model_tags));
reporter_map.insert({hash_labels, *metric_model_reporter});
return Status::Success;
}

MetricModelReporter::MetricModelReporter(
const ModelIdentifier& model_id, const int64_t model_version,
const int device, bool response_cache_enabled,
const int device, bool response_cache_enabled, bool is_decoupled,
const triton::common::MetricTagsMap& model_tags)
{
std::map<std::string, std::string> labels;
GetMetricLabels(&labels, model_id, model_version, device, model_tags);

// Parse metrics config to control metric setup and behavior
config_.ParseConfig(response_cache_enabled);
config_.ParseConfig(response_cache_enabled, is_decoupled);

// Initialize families and metrics
InitializeCounters(labels);
InitializeGauges(labels);
InitializeHistograms(labels);
InitializeSummaries(labels);
}

@@ -182,6 +190,14 @@ MetricModelReporter::~MetricModelReporter()
}
}

for (auto& iter : histogram_families_) {
const auto& name = iter.first;
auto family_ptr = iter.second;
if (family_ptr) {
family_ptr->Remove(histograms_[name]);
}
}

for (auto& iter : summary_families_) {
const auto& name = iter.first;
auto family_ptr = iter.second;
@@ -261,6 +277,28 @@ MetricModelReporter::InitializeGauges(
}
}

void
MetricModelReporter::InitializeHistograms(
const std::map<std::string, std::string>& labels)
{
// Only create response metrics for decoupled models, to reduce metric output
if (config_.latency_histograms_enabled_) {
if (config_.is_decoupled_) {
histogram_families_["first_response_histogram"] =
&Metrics::FamilyFirstResponseDuration();
}
}

for (auto& iter : histogram_families_) {
const auto& name = iter.first;
auto family_ptr = iter.second;
if (family_ptr) {
histograms_[name] = CreateMetric<prometheus::Histogram>(
*family_ptr, labels, config_.buckets_);
}
}
}

void
MetricModelReporter::InitializeSummaries(
const std::map<std::string, std::string>& labels)
@@ -398,6 +436,23 @@ MetricModelReporter::DecrementGauge(const std::string& name, double value)
IncrementGauge(name, -1 * value);
}

void
MetricModelReporter::ObserveHistogram(const std::string& name, double value)
{
auto iter = histograms_.find(name);
if (iter == histograms_.end()) {
// No histogram metric exists with this name
return;
}

auto histogram = iter->second;
if (!histogram) {
// histogram is uninitialized/nullptr
return;
}
histogram->Observe(value);
}

void
MetricModelReporter::ObserveSummary(const std::string& name, double value)
{
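CreateMetric<prometheus::Histogram> above adds a labeled histogram instance, built with the reporter's configured buckets, to a prometheus-cpp family obtained from Metrics::FamilyFirstResponseDuration(). A minimal standalone sketch of the underlying prometheus-cpp calls follows; the registry, metric name, labels, and bucket boundaries here are illustrative placeholders, not the ones defined in metrics.cc.

#include <prometheus/histogram.h>
#include <prometheus/registry.h>

#include <memory>

int
main()
{
  auto registry = std::make_shared<prometheus::Registry>();

  // Build a histogram family; name and help text are placeholders, not the
  // family Triton actually registers.
  auto& family = prometheus::BuildHistogram()
                     .Name("example_first_response_duration_ms")
                     .Help("Illustrative TTFT histogram")
                     .Register(*registry);

  // Add one labeled instance with explicit bucket boundaries (milliseconds),
  // analogous to CreateMetric<prometheus::Histogram>(family, labels, buckets).
  auto& histogram = family.Add(
      {{"model", "example_model"}, {"version", "1"}},
      prometheus::Histogram::BucketBoundaries{100, 500, 2000, 5000});

  // Each observation increments the matching bucket and updates sum/count.
  histogram.Observe(237.0);
  return 0;
}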