Merge pull request #1 from triton-inference-server/main
Merge main from tif
CGranger-sorenson authored May 17, 2024
2 parents 2f848ee + 0089bb7 commit 2585eb9
Showing 27 changed files with 405 additions and 144 deletions.
12 changes: 11 additions & 1 deletion include/triton/core/tritonserver.h
@@ -91,7 +91,7 @@ struct TRITONSERVER_MetricFamily;
 /// }
 ///
 #define TRITONSERVER_API_VERSION_MAJOR 1
-#define TRITONSERVER_API_VERSION_MINOR 30
+#define TRITONSERVER_API_VERSION_MINOR 31
 
 /// Get the TRITONBACKEND API version supported by the Triton shared
 /// library. This value can be compared against the
@@ -1828,6 +1828,16 @@ TRITONSERVER_DECLSPEC struct TRITONSERVER_Error*
 TRITONSERVER_ServerOptionsSetStrictModelConfig(
 struct TRITONSERVER_ServerOptions* options, bool strict);
 
+/// Set the custom model configuration name to load for all models.
+/// Fall back to default config file if empty.
+///
+/// \param options The server options object.
+/// \param config_name The name of the config file to load for all models.
+/// \return a TRITONSERVER_Error indicating success or failure.
+TRITONSERVER_DECLSPEC struct TRITONSERVER_Error*
+TRITONSERVER_ServerOptionsSetModelConfigName(
+struct TRITONSERVER_ServerOptions* options, const char* model_config_name);
+
 /// Set the rate limit mode in a server options.
 ///
 /// TRITONSERVER_RATE_LIMIT_EXEC_COUNT: The rate limiting prioritizes the
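The new entry point extends the server options for custom model configuration (the C API minor version bump above reflects the addition). Below is a minimal sketch of setting it through the in-process C API; the repository path and the config name "custom_config" are illustrative only, and error handling is collapsed into a single check macro.

```cpp
#include <cstdlib>
#include <iostream>

#include "triton/core/tritonserver.h"

// Print and abort on any TRITONSERVER_Error returned by an API call.
#define CHECK_TRITON(expr)                                        \
  do {                                                            \
    TRITONSERVER_Error* err__ = (expr);                           \
    if (err__ != nullptr) {                                       \
      std::cerr << TRITONSERVER_ErrorMessage(err__) << std::endl; \
      TRITONSERVER_ErrorDelete(err__);                            \
      std::exit(1);                                               \
    }                                                             \
  } while (false)

int
main()
{
  TRITONSERVER_ServerOptions* options = nullptr;
  CHECK_TRITON(TRITONSERVER_ServerOptionsNew(&options));
  CHECK_TRITON(TRITONSERVER_ServerOptionsSetModelRepositoryPath(
      options, "/workspace/models"));  // illustrative repository path

  // New in API version 1.31: load "custom_config" for every model instead
  // of the default config.pbtxt; an empty string keeps the default.
  CHECK_TRITON(
      TRITONSERVER_ServerOptionsSetModelConfigName(options, "custom_config"));

  TRITONSERVER_Server* server = nullptr;
  CHECK_TRITON(TRITONSERVER_ServerNew(&server, options));
  CHECK_TRITON(TRITONSERVER_ServerOptionsDelete(options));

  // ... run inference via TRITONSERVER_ServerInferAsync, etc. ...

  CHECK_TRITON(TRITONSERVER_ServerStop(server));
  CHECK_TRITON(TRITONSERVER_ServerDelete(server));
  return 0;
}
```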
2 changes: 1 addition & 1 deletion python/test/test_api.py
@@ -346,7 +346,7 @@ def test_ready(self):
 self.assertTrue(server.ready())
 
 @pytest.mark.xfail(
-tritonserver.__version__ <= "2.43.0",
+tritonserver.__version__ <= "2.46.0",
 reason="Known issue on stop: Exit timeout expired. Exiting immediately",
 raises=tritonserver.InternalError,
 )
6 changes: 3 additions & 3 deletions python/tritonserver/_api/_server.py
@@ -137,7 +137,7 @@ class Options:
 List of models to load at startup. Only relevant with ModelControlMode.EXPLICIT.
 See :c:func:`TRITONSERVER_ServerOptionsSetStartupModel`
-strict_model_config : bool, default True
+strict_model_config : bool, default False
 Enable or disable strict model configuration.
 See :c:func:`TRITONSERVER_ServerOptionsSetStrictModelConfig`
@@ -275,7 +275,7 @@ class Options:
 server_id: str = "triton"
 model_control_mode: ModelControlMode = ModelControlMode.NONE
 startup_models: list[str] = field(default_factory=list[str])
-strict_model_config: bool = True
+strict_model_config: bool = False
 
 rate_limiter_mode: RateLimitMode = RateLimitMode.OFF
 rate_limiter_resources: list[RateLimiterResource] = field(
@@ -507,7 +507,7 @@ def __init__(
 Options(server_id='triton', model_repository='/workspace/models',
 model_control_mode=<TRITONSERVER_ModelControlMode.NONE: 0>,
-startup_models=[], strict_model_config=True,
+startup_models=[], strict_model_config=False,
 rate_limiter_mode=<TRITONSERVER_RateLimitMode.OFF: 0>,
 rate_limiter_resources=[], pinned_memory_pool_size=268435456,
 cuda_memory_pool_sizes={}, cache_config={},
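The Python `strict_model_config` option wraps the C API setter named in the docstring above, and the new default of `False` means model configurations may be auto-completed. As a small illustration only, reusing the `options` handle built in the earlier sketch, the equivalent C API call would be:

```cpp
// strict = false lets Triton auto-complete missing model configuration
// fields, matching the new Python default of strict_model_config=False.
// `options` is a TRITONSERVER_ServerOptions* as built in the earlier sketch.
TRITONSERVER_Error* err =
    TRITONSERVER_ServerOptionsSetStrictModelConfig(options, false);
if (err != nullptr) {
  std::cerr << TRITONSERVER_ErrorMessage(err) << std::endl;
  TRITONSERVER_ErrorDelete(err);
}
```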
17 changes: 10 additions & 7 deletions src/backend_model.cc
@@ -61,8 +61,9 @@ TritonModel::Create(
 InferenceServer* server, const std::string& model_path,
 const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
 const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
-const int64_t version, inference::ModelConfig model_config,
-const bool is_config_provided, std::unique_ptr<TritonModel>* model)
+const ModelIdentifier& model_id, const int64_t version,
+inference::ModelConfig model_config, const bool is_config_provided,
+std::unique_ptr<TritonModel>* model)
 {
 model->reset();
 
@@ -143,8 +144,8 @@ TritonModel::Create(
 
 // Create and initialize the model.
 std::unique_ptr<TritonModel> local_model(new TritonModel(
-server, localized_model_dir, backend, min_compute_capability, version,
-model_config, auto_complete_config, backend_cmdline_config_map,
+server, localized_model_dir, backend, min_compute_capability, model_id,
+version, model_config, auto_complete_config, backend_cmdline_config_map,
 host_policy_map));
 
 TritonModel* raw_local_model = local_model.get();
@@ -929,12 +930,14 @@ TritonModel::TritonModel(
 InferenceServer* server,
 const std::shared_ptr<LocalizedPath>& localized_model_dir,
 const std::shared_ptr<TritonBackend>& backend,
-const double min_compute_capability, const int64_t version,
-const inference::ModelConfig& config, const bool auto_complete_config,
+const double min_compute_capability, const ModelIdentifier& model_id,
+const int64_t version, const inference::ModelConfig& config,
+const bool auto_complete_config,
 const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
 const triton::common::HostPolicyCmdlineConfigMap& host_policy_map)
 : Model(
-min_compute_capability, localized_model_dir->Path(), version, config),
+min_compute_capability, localized_model_dir->Path(), model_id,
+version, config),
 server_(server), min_compute_capability_(min_compute_capability),
 auto_complete_config_(auto_complete_config),
 backend_cmdline_config_map_(backend_cmdline_config_map),
10 changes: 6 additions & 4 deletions src/backend_model.h
@@ -61,8 +61,9 @@ class TritonModel : public Model {
 InferenceServer* server, const std::string& model_path,
 const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
 const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
-const int64_t version, inference::ModelConfig model_config,
-const bool is_config_provided, std::unique_ptr<TritonModel>* model);
+const ModelIdentifier& model_id, const int64_t version,
+inference::ModelConfig model_config, const bool is_config_provided,
+std::unique_ptr<TritonModel>* model);
 ~TritonModel();
 
 // Return path to the localized model directory.
@@ -118,8 +119,9 @@ class TritonModel : public Model {
 InferenceServer* server,
 const std::shared_ptr<LocalizedPath>& localized_model_dir,
 const std::shared_ptr<TritonBackend>& backend,
-const double min_compute_capability, const int64_t version,
-const inference::ModelConfig& config, const bool auto_complete_config,
+const double min_compute_capability, const ModelIdentifier& model_id,
+const int64_t version, const inference::ModelConfig& config,
+const bool auto_complete_config,
 const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
 const triton::common::HostPolicyCmdlineConfigMap& host_policy_map);
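The recurring new parameter is a `ModelIdentifier`, which the `Model` base class now receives alongside the version and exposes through `ModelId()` (as used by the metric reporter below). Its definition is not part of this diff; the following is only a hypothetical sketch, assuming it pairs a model namespace (cf. the new "namespace" metric label added in constants.h) with a model name.

```cpp
#include <string>

// Hypothetical sketch only: the real ModelIdentifier is defined elsewhere
// in Triton core and may differ in members and helpers.
struct ModelIdentifier {
  std::string namespace_;  // empty when model namespacing is disabled
  std::string name_;

  bool operator==(const ModelIdentifier& rhs) const
  {
    return (namespace_ == rhs.namespace_) && (name_ == rhs.name_);
  }

  std::string str() const
  {
    return namespace_.empty() ? name_ : (namespace_ + "::" + name_);
  }
};
```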

2 changes: 1 addition & 1 deletion src/backend_model_instance.cc
@@ -190,7 +190,7 @@ TritonModelInstance::TritonModelInstance(
 model_->ResponseCacheEnabled() &&
 model_->Server()->ResponseCacheEnabled();
 MetricModelReporter::Create(
-model_->Name(), model_->Version(), id, response_cache_enabled,
+model_->ModelId(), model_->Version(), id, response_cache_enabled,
 model_->Config().metric_tags(), &reporter_);
 }
 #endif // TRITON_ENABLE_METRICS
5 changes: 4 additions & 1 deletion src/constants.h
@@ -1,4 +1,4 @@
-// Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -71,7 +71,10 @@ constexpr char kAutoMixedPrecisionExecutionAccelerator[] =
 "auto_mixed_precision";
 
 constexpr char kModelConfigPbTxt[] = "config.pbtxt";
+constexpr char kPbTxtExtension[] = ".pbtxt";
+constexpr char kModelConfigFolder[] = "configs";
 
+constexpr char kMetricsLabelModelNamespace[] = "namespace";
 constexpr char kMetricsLabelModelName[] = "model";
 constexpr char kMetricsLabelModelVersion[] = "version";
 constexpr char kMetricsLabelGpuUuid[] = "gpu_uuid";
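The added constants support the custom model configuration lookup: a named config is presumably resolved under a per-model `configs/` folder with a `.pbtxt` extension, falling back to `config.pbtxt` when no name is set. The sketch below shows that path composition under this assumed layout; the `ModelConfigPath` helper is purely illustrative, and the real resolution logic lives elsewhere in the repository-management code.

```cpp
#include <string>

constexpr char kModelConfigPbTxt[] = "config.pbtxt";
constexpr char kPbTxtExtension[] = ".pbtxt";
constexpr char kModelConfigFolder[] = "configs";

// Hypothetical helper: pick the config file to read for one model directory.
// An empty config_name falls back to the default config.pbtxt.
std::string
ModelConfigPath(const std::string& model_dir, const std::string& config_name)
{
  if (config_name.empty()) {
    return model_dir + "/" + kModelConfigPbTxt;
  }
  return model_dir + "/" + kModelConfigFolder + "/" + config_name +
      kPbTxtExtension;
}
```

Under these assumptions, `ModelConfigPath("/repo/resnet50", "custom_config")` yields `/repo/resnet50/configs/custom_config.pbtxt`.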
35 changes: 2 additions & 33 deletions src/dynamic_batch_scheduler.cc
@@ -39,14 +39,6 @@
 
 namespace triton { namespace core {
 
-uint64_t
-CaptureTimeNs()
-{
-return std::chrono::duration_cast<std::chrono::nanoseconds>(
-std::chrono::steady_clock::now().time_since_epoch())
-.count();
-}
-
 bool
 IsStaleState(Payload::State payload_state)
 {
@@ -753,32 +745,9 @@ DynamicBatchScheduler::CacheLookUp(
 std::unique_ptr<InferenceRequest>& request,
 std::unique_ptr<InferenceResponse>& cached_response)
 {
-Status status;
 auto cache = model_->Server()->CacheManager()->Cache();
-std::unique_ptr<InferenceResponse> local_response;
-request->ResponseFactory()->CreateResponse(&local_response);
-// Hash request into cache key
-std::string key = "";
-if (!request->CacheKeyIsSet()) {
-status = cache->Hash(*request, &key);
-if (!status.IsOk()) {
-LOG_ERROR << "Failed to hash request: " << status.Message();
-return;
-}
-request->SetCacheKey(key);
-} else {
-key = request->CacheKey();
-}
-
-// Lookup and capture timestamps
-{
-request->CaptureCacheLookupStartNs();
-status = cache->Lookup(local_response.get(), key);
-request->CaptureCacheLookupEndNs();
-}
-
-if (status.IsOk() && (local_response != nullptr)) {
-cached_response = std::move(local_response);
+bool is_lookup_success = CacheLookUpUtil(request, cached_response, cache);
+if (is_lookup_success) {
 #ifdef TRITON_ENABLE_STATS
 // Update model metrics/stats on cache hits
 // Backends will update metrics as normal on cache misses
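The request hashing, key handling, and timed lookup that used to live inline here are folded into a shared `CacheLookUpUtil` helper that returns whether a cached response was found. Its definition is outside this hunk, so the following is only a sketch reconstructed from the removed lines, with the signature inferred from the call site and the cache handle type assumed.

```cpp
// Sketch only: reconstructed from the removed inline code. The real helper
// is defined elsewhere (so it can be shared by other schedulers) and its
// exact signature and cache handle type may differ.
bool
CacheLookUpUtil(
    std::unique_ptr<InferenceRequest>& request,
    std::unique_ptr<InferenceResponse>& cached_response,
    std::shared_ptr<TritonCache> cache)
{
  std::unique_ptr<InferenceResponse> local_response;
  request->ResponseFactory()->CreateResponse(&local_response);

  // Hash the request into a cache key unless one is already attached.
  std::string key = "";
  if (!request->CacheKeyIsSet()) {
    Status status = cache->Hash(*request, &key);
    if (!status.IsOk()) {
      LOG_ERROR << "Failed to hash request: " << status.Message();
      return false;
    }
    request->SetCacheKey(key);
  } else {
    key = request->CacheKey();
  }

  // Look up the key, capturing timestamps around the lookup.
  request->CaptureCacheLookupStartNs();
  const Status status = cache->Lookup(local_response.get(), key);
  request->CaptureCacheLookupEndNs();

  if (status.IsOk() && (local_response != nullptr)) {
    cached_response = std::move(local_response);
    return true;
  }
  return false;
}
```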
12 changes: 7 additions & 5 deletions src/ensemble_scheduler/ensemble_model.cc
@@ -1,4 +1,4 @@
-// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -37,19 +37,21 @@ namespace triton { namespace core {
 
 Status
 EnsembleModel::Create(
-InferenceServer* server, const std::string& path, const int64_t version,
+InferenceServer* server, const std::string& path,
+const ModelIdentifier& model_id, const int64_t version,
 const inference::ModelConfig& model_config, const bool is_config_provided,
 const double min_compute_capability, std::unique_ptr<Model>* model)
 {
 // Create the ensemble model.
-std::unique_ptr<EnsembleModel> local_model(
-new EnsembleModel(min_compute_capability, path, version, model_config));
+std::unique_ptr<EnsembleModel> local_model(new EnsembleModel(
+min_compute_capability, path, model_id, version, model_config));
 
 RETURN_IF_ERROR(local_model->Init(is_config_provided));
 
 std::unique_ptr<Scheduler> scheduler;
 RETURN_IF_ERROR(EnsembleScheduler::Create(
-local_model->MutableStatsAggregator(), server, model_config, &scheduler));
+local_model->MutableStatsAggregator(), server, local_model->ModelId(),
+model_config, &scheduler));
 RETURN_IF_ERROR(local_model->SetScheduler(std::move(scheduler)));
 
 LOG_VERBOSE(1) << "ensemble model for " << local_model->Name() << std::endl;
10 changes: 6 additions & 4 deletions src/ensemble_scheduler/ensemble_model.h
@@ -1,4 +1,4 @@
-// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -39,7 +39,8 @@ class EnsembleModel : public Model {
 EnsembleModel(EnsembleModel&&) = default;
 
 static Status Create(
-InferenceServer* server, const std::string& path, const int64_t version,
+InferenceServer* server, const std::string& path,
+const ModelIdentifier& model_id, const int64_t version,
 const inference::ModelConfig& model_config, const bool is_config_provided,
 const double min_compute_capability, std::unique_ptr<Model>* model);
 
@@ -48,8 +49,9 @@ class EnsembleModel : public Model {
 
 explicit EnsembleModel(
 const double min_compute_capability, const std::string& model_dir,
-const int64_t version, const inference::ModelConfig& config)
-: Model(min_compute_capability, model_dir, version, config)
+const ModelIdentifier& model_id, const int64_t version,
+const inference::ModelConfig& config)
+: Model(min_compute_capability, model_dir, model_id, version, config)
 {
 }
 friend std::ostream& operator<<(std::ostream&, const EnsembleModel&);
(The diffs for the remaining 17 of the 27 changed files are not shown here.)
