diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 5fce435168a..67c57bd3154 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -26,7 +26,7 @@ jobs: - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: '3' - - uses: isort/isort-action@master + - uses: isort/isort-action@f14e57e1d457956c45a19c05a89cccdf087846e5 # v1.1.0 with: requirementsFiles: "requirements.txt" - name: Run on SConstruct file. diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml new file mode 100644 index 00000000000..8f5524d4513 --- /dev/null +++ b/.github/workflows/trivy.yml @@ -0,0 +1,70 @@ +name: Trivy scan + +on: + workflow_dispatch: + push: + branches: ["master", "release/**"] + pull_request: + branches: ["master", "release/**"] + +# Declare default permissions as nothing. +permissions: {} + +jobs: + build: + name: Build + runs-on: ubuntu-20.04 + steps: + - name: Checkout code + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + + - name: Run Trivy vulnerability scanner in repo mode + uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8 # 0.24.0 + with: + scan-type: 'fs' + scan-ref: '.' + trivy-config: 'utils/trivy/trivy.yaml' + + - name: Prepare the report to be uploaded to the GitHub artifact store + run: | + mkdir report + cp trivy-report-daos.txt report + cp utils/trivy/.trivyignore report/trivyignore.txt + + - name: Upload the report to the GitHub artifact store + uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 + with: + path: report/* + name: trivy-report-daos + + - name: Adjust config file to use sarif format + run: | + sed -i 's/output: "trivy-report-daos.txt"/output: "trivy-results.sarif"/g' \ + utils/trivy/trivy.yaml + sed -i 's/format: template/format: sarif/g' utils/trivy/trivy.yaml + + - name: Run Trivy vulnerability scanner in repo mode + uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8 # 0.24.0 + with: + scan-type: 'fs' + scan-ref: '.' + trivy-config: 'utils/trivy/trivy.yaml' + + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@afb54ba388a7dca6ecae48f608c4ff05ff4cc77a + # 3.25.15 (v3) + with: + sarif_file: 'trivy-results.sarif' + + - name: Adjust config file to show and validate scan results + run: | + sed -i 's/output: "trivy-results.sarif"//g' utils/trivy/trivy.yaml + sed -i 's/format: sarif/format: table/g' utils/trivy/trivy.yaml + sed -i 's/exit-code: 0/exit-code: 1/g' utils/trivy/trivy.yaml + + - name: Run Trivy vulnerability scanner in repo mode + uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8 # 0.24.0 + with: + scan-type: 'fs' + scan-ref: '.' + trivy-config: 'utils/trivy/trivy.yaml' diff --git a/TAG b/TAG index 47d92ef13d9..045cac3ed40 100644 --- a/TAG +++ b/TAG @@ -1 +1 @@ -2.6.1-rc3 +2.6.2-tb1 diff --git a/VERSION b/VERSION index 6a6a3d8e35c..097a15a2af3 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.6.1 +2.6.2 diff --git a/debian/changelog b/debian/changelog index 9790dcbee3e..4a48f35bab2 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +daos (2.6.2-1) unstable; urgency=medium + [ Phillip Henderson ] + * First test build for 2.6.2 + + -- Phillip Henderson Tue, 05 Nov 2024 23:25:00 -0500 + daos (2.6.1-4) unstable; urgency=medium [ Tomasz Gromadzki ] * Add support of the PMDK package 2.1.0 with NDCTL enabled. 
diff --git a/docs/admin/env_variables.md b/docs/admin/env_variables.md index 060c3790d57..3de0a079203 100644 --- a/docs/admin/env_variables.md +++ b/docs/admin/env_variables.md @@ -44,6 +44,7 @@ Environment variables in this section only apply to the server side. |DAOS\_MD\_CAP |Size of a metadata pmem pool/file in MBs. INTEGER. Default to 128 MB.| |DAOS\_START\_POOL\_SVC|Determines whether to start existing pool services when starting a daos\_server. BOOL. Default to true.| |CRT\_DISABLE\_MEM\_PIN|Disable memory pinning workaround on a server side. BOOL. Default to 0.| +|CRT\_EVENT\_DELAY|Delay in seconds before handling a set of CaRT events. INTEGER. Default to 10 s. A longer delay enables batching of successive CaRT events, leading to fewer pool map changes when multiple engines become unavailable at around the same time.| |DAOS\_SCHED\_PRIO\_DISABLED|Disable server ULT prioritizing. BOOL. Default to 0.| |DAOS\_SCHED\_RELAX\_MODE|The mode of CPU relaxing on idle. "disabled":disable relaxing; "net":wait on network request for INTVL; "sleep":sleep for INTVL. STRING. Default to "net"| |DAOS\_SCHED\_RELAX\_INTVL|CPU relax interval in milliseconds. INTEGER. Default to 1 ms.| diff --git a/src/cart/README.env b/src/cart/README.env index 00f270d7a41..b90939c8c72 100644 --- a/src/cart/README.env +++ b/src/cart/README.env @@ -1,13 +1,10 @@ This file lists the environment variables used in CaRT. . D_PROVIDER (Deprecated: CRT_PHY_ADDR_STR) - It determines which mercury NA plugin to be used: + It determines which mercury NA plugin and transport to be used: - set it as "ofi+verbs;ofi_rxm" to use OFI verbs;ofi_rxm provider - - set it as "ofi+gni" to use OFI gni provider - set it as "sm" to use SM plugin which only works within single node - - set it as "ofi+tcp;ofi_rxm" to use OFI tcp;ofi_rxm provider. - - set it as "ofi+sockets" to use OFI sockets provider - NOTE: This provider is deprecated in favor of "ofi+tcp;ofi_rxm" + - set it as "ofi+tcp" to use OFI tcp provider. - by default (not set or set as any other value) it will use ofi tcp provider. @@ -205,3 +202,8 @@ This file lists the environment variables used in CaRT. start copying data in an effort to release multi-recv buffers. Copy will occur when at most D_MRECV_BUF_COPY buffers remain. + SWIM_TRAFFIC_CLASS + (server only) Select a traffic class for the SWIM protocol to use and prevent potential + traffic congestion. Available options are: "unspec" (default), "best_effort", + "low_latency", "bulk_data". + diff --git a/src/cart/crt_hg.c b/src/cart/crt_hg.c index 789d75ceb31..26b54b52ec3 100644 --- a/src/cart/crt_hg.c +++ b/src/cart/crt_hg.c @@ -863,6 +863,9 @@ crt_hg_class_init(crt_provider_t provider, int ctx_idx, bool primary, int iface_ init_info.request_post_incr = crt_gdata.cg_post_incr; init_info.multi_recv_op_max = crt_gdata.cg_mrecv_buf; init_info.multi_recv_copy_threshold = crt_gdata.cg_mrecv_buf_copy; + /* Separate SWIM traffic in an effort to prevent potential congestion. 
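+	 * Only the dedicated SWIM context (ctx_idx == cg_swim_crt_idx) on a service rank gets this class; client contexts keep the provider default.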
*/ + if (crt_is_service() && ctx_idx == crt_gdata.cg_swim_crt_idx) + init_info.traffic_class = (enum na_traffic_class)crt_gdata.cg_swim_tc; hg_class = HG_Init_opt2(info_string, crt_is_service(), HG_VERSION(2, 4), &init_info); if (hg_class == NULL) { @@ -1479,6 +1482,16 @@ crt_hg_reply_send(struct crt_rpc_priv *rpc_priv) rc = crt_hgret_2_der(hg_ret); } + /* Release input buffer */ + if (rpc_priv->crp_release_input_early && !rpc_priv->crp_forward) { + hg_ret = HG_Release_input_buf(rpc_priv->crp_hg_hdl); + if (hg_ret != HG_SUCCESS) { + RPC_ERROR(rpc_priv, "HG_Release_input_buf failed, hg_ret: " DF_HG_RC "\n", + DP_HG_RC(hg_ret)); + /* Fall through */ + } + } + return rc; } diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index 21fc184d446..48d2090a5b3 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -18,6 +18,10 @@ static volatile int gdata_init_flag; struct crt_plugin_gdata crt_plugin_gdata; static bool g_prov_settings_applied[CRT_PROV_COUNT]; +#define X(a, b) b, +static const char *const crt_tc_name[] = {CRT_TRAFFIC_CLASSES}; +#undef X + static void crt_lib_init(void) __attribute__((__constructor__)); @@ -237,18 +241,30 @@ crt_gdata_dump(void) DUMP_GDATA_FIELD("%d", cg_rpc_quota); } +static enum crt_traffic_class +crt_str_to_tc(const char *str) +{ + enum crt_traffic_class i = 0; + + while (str != NULL && strcmp(crt_tc_name[i], str) != 0 && i < CRT_TC_UNKNOWN) + i++; + + return i == CRT_TC_UNKNOWN ? CRT_TC_UNSPEC : i; +} + /* first step init - for initializing crt_gdata */ static int data_init(int server, crt_init_options_t *opt) { - uint32_t timeout = 0; - uint32_t credits; - uint32_t fi_univ_size = 0; - uint32_t mem_pin_enable = 0; - uint32_t is_secondary; - uint32_t post_init = CRT_HG_POST_INIT, post_incr = CRT_HG_POST_INCR; - unsigned int mrecv_buf = CRT_HG_MRECV_BUF; - unsigned int mrecv_buf_copy = 0; /* buf copy disabled by default */ - int rc = 0; + uint32_t timeout = 0; + uint32_t credits; + uint32_t fi_univ_size = 0; + uint32_t mem_pin_enable = 0; + uint32_t is_secondary; + uint32_t post_init = CRT_HG_POST_INIT, post_incr = CRT_HG_POST_INCR; + unsigned int mrecv_buf = CRT_HG_MRECV_BUF; + unsigned int mrecv_buf_copy = 0; /* buf copy disabled by default */ + char *swim_traffic_class = NULL; + int rc = 0; crt_env_dump(); @@ -261,6 +277,8 @@ static int data_init(int server, crt_init_options_t *opt) crt_gdata.cg_mrecv_buf = mrecv_buf; crt_env_get(D_MRECV_BUF_COPY, &mrecv_buf_copy); crt_gdata.cg_mrecv_buf_copy = mrecv_buf_copy; + crt_env_get(SWIM_TRAFFIC_CLASS, &swim_traffic_class); + crt_gdata.cg_swim_tc = crt_str_to_tc(swim_traffic_class); is_secondary = 0; /* Apply CART-890 workaround for server side only */ diff --git a/src/cart/crt_internal_types.h b/src/cart/crt_internal_types.h index 513d6c3db96..0c480b06c2c 100644 --- a/src/cart/crt_internal_types.h +++ b/src/cart/crt_internal_types.h @@ -42,6 +42,17 @@ struct crt_na_config { char **noc_domain_str; /* Array of domains */ }; +#define CRT_TRAFFIC_CLASSES \ + X(CRT_TC_UNSPEC, "unspec") /* Leave it upon plugin to choose */ \ + X(CRT_TC_BEST_EFFORT, "best_effort") /* Best effort */ \ + X(CRT_TC_LOW_LATENCY, "low_latency") /* Low latency */ \ + X(CRT_TC_BULK_DATA, "bulk_data") /* Bulk data */ \ + X(CRT_TC_UNKNOWN, "unknown") /* Unknown */ + +#define X(a, b) a, +enum crt_traffic_class { CRT_TRAFFIC_CLASSES }; +#undef X + struct crt_prov_gdata { /** NA plugin type */ int cpg_provider; @@ -105,6 +116,9 @@ struct crt_gdata { /** global swim index for all servers */ int32_t cg_swim_crt_idx; + /** traffic class used by SWIM 
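+	 * (parsed from the SWIM_TRAFFIC_CLASS environment variable via crt_str_to_tc() in data_init())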
*/ + enum crt_traffic_class cg_swim_tc; + /** credits limitation for #in-flight RPCs per target EP CTX */ uint32_t cg_credit_ep_ctx; @@ -220,6 +234,7 @@ struct crt_event_cb_priv { ENV(SWIM_PING_TIMEOUT) \ ENV(SWIM_PROTOCOL_PERIOD_LEN) \ ENV(SWIM_SUSPECT_TIMEOUT) \ + ENV_STR(SWIM_TRAFFIC_CLASS) \ ENV_STR(UCX_IB_FORK_INIT) /* uint env */ diff --git a/src/cart/crt_iv.c b/src/cart/crt_iv.c index 603e565ac1f..bf8124a8a6a 100644 --- a/src/cart/crt_iv.c +++ b/src/cart/crt_iv.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -2911,8 +2911,12 @@ bulk_update_transfer_done_aux(const struct crt_bulk_cb_info *info) return rc; send_error: - rc = crt_bulk_free(cb_info->buc_bulk_hdl); + /* send back whatever error got us here */ output->rc = rc; + rc = crt_bulk_free(cb_info->buc_bulk_hdl); + if (rc != 0) + DL_ERROR(rc, "crt_bulk_free() failed"); + iv_ops->ivo_on_put(ivns_internal, &cb_info->buc_iv_value, cb_info->buc_user_priv); diff --git a/src/cart/crt_rpc.c b/src/cart/crt_rpc.c index 72d8214aa09..ead03d1bf29 100644 --- a/src/cart/crt_rpc.c +++ b/src/cart/crt_rpc.c @@ -1550,6 +1550,26 @@ crt_req_send(crt_rpc_t *req, crt_cb_t complete_cb, void *arg) return rc; } +int +crt_reply_send_input_free(crt_rpc_t *req) +{ + struct crt_rpc_priv *rpc_priv = NULL; + int rc = 0; + + if (req == NULL) { + D_ERROR("invalid parameter (NULL req).\n"); + D_GOTO(out, rc = -DER_INVAL); + } + + rpc_priv = container_of(req, struct crt_rpc_priv, crp_pub); + rpc_priv->crp_release_input_early = 1; + + return crt_reply_send(req); + +out: + return rc; +} + int crt_reply_send(crt_rpc_t *req) { diff --git a/src/cart/crt_rpc.h b/src/cart/crt_rpc.h index a5e6afee780..3a590933103 100644 --- a/src/cart/crt_rpc.h +++ b/src/cart/crt_rpc.h @@ -166,29 +166,30 @@ struct crt_rpc_priv { * match with crp_req_hdr.cch_flags. 
*/ uint32_t crp_flags; - uint32_t crp_srv:1, /* flag of server received request */ - crp_output_got:1, - crp_input_got:1, - /* flag of collective RPC request */ - crp_coll:1, - /* flag of crp_tgt_uri need to be freed */ - crp_uri_free:1, - /* flag of forwarded rpc for corpc */ - crp_forward:1, - /* flag of in timeout binheap */ - crp_in_binheap:1, - /* set if a call to crt_req_reply pending */ - crp_reply_pending:1, - /* set to 1 if target ep is set */ - crp_have_ep:1, - /* RPC is tracked by the context */ - crp_ctx_tracked:1, - /* 1 if RPC fails HLC epsilon check */ - crp_fail_hlc:1, - /* RPC completed flag */ - crp_completed:1, - /* RPC originated from a primary provider */ - crp_src_is_primary:1; + uint32_t crp_srv : 1, /* flag of server received request */ + crp_output_got : 1, crp_input_got : 1, + /* flag of collective RPC request */ + crp_coll : 1, + /* flag of crp_tgt_uri need to be freed */ + crp_uri_free : 1, + /* flag of forwarded rpc for corpc */ + crp_forward : 1, + /* flag of in timeout binheap */ + crp_in_binheap : 1, + /* set if a call to crt_req_reply pending */ + crp_reply_pending : 1, + /* set to 1 if target ep is set */ + crp_have_ep : 1, + /* RPC is tracked by the context */ + crp_ctx_tracked : 1, + /* 1 if RPC fails HLC epsilon check */ + crp_fail_hlc : 1, + /* RPC completed flag */ + crp_completed : 1, + /* RPC originated from a primary provider */ + crp_src_is_primary : 1, + /* release input buffer early */ + crp_release_input_early : 1; struct crt_opc_info *crp_opc_info; /* corpc info, only valid when (crp_coll == 1) */ diff --git a/src/chk/chk_common.c b/src/chk/chk_common.c index ace1c736791..39821ec6328 100644 --- a/src/chk/chk_common.c +++ b/src/chk/chk_common.c @@ -403,7 +403,7 @@ chk_pool_restart_svc(struct chk_pool_rec *cpr) if (cpr->cpr_started) chk_pool_shutdown(cpr, true); - rc = ds_pool_start_after_check(cpr->cpr_uuid); + rc = ds_pool_start_after_check(cpr->cpr_uuid, cpr->cpr_immutable); if (rc != 0) { D_WARN("Cannot start full PS for "DF_UUIDF" after CR check: "DF_RC"\n", DP_UUID(cpr->cpr_uuid), DP_RC(rc)); diff --git a/src/chk/chk_engine.c b/src/chk/chk_engine.c index 56e6da3ad9b..ad61af851ce 100644 --- a/src/chk/chk_engine.c +++ b/src/chk/chk_engine.c @@ -1797,10 +1797,8 @@ chk_engine_pool_ult(void *args) } rc = chk_engine_cont_cleanup(cpr, svc, &aggregator); - if (rc != 0) - goto out; - - rc = ds_pool_svc_schedule_reconf(svc); + if (rc == 0 && !cpr->cpr_immutable) + rc = ds_pool_svc_schedule_reconf(svc); out: chk_engine_cont_list_fini(&aggregator); @@ -2113,6 +2111,11 @@ chk_engine_start_post(struct chk_instance *ins) if (pool_cbk->cb_phase == CHK__CHECK_SCAN_PHASE__CSP_DONE) continue; + if (ins->ci_prop.cp_flags & CHK__CHECK_FLAG__CF_DRYRUN) + cpr->cpr_immutable = 1; + else + cpr->cpr_immutable = 0; + if (phase > pool_cbk->cb_phase) phase = pool_cbk->cb_phase; @@ -2950,7 +2953,7 @@ chk_engine_pool_start(uint64_t gen, uuid_t uuid, uint32_t phase, uint32_t flags) cbk = &cpr->cpr_bk; chk_pool_get(cpr); - rc = ds_pool_start(uuid, false); + rc = ds_pool_start(uuid, false, cpr->cpr_immutable); if (rc != 0) D_GOTO(put, rc = (rc == -DER_NONEXIST ? 
1 : rc)); diff --git a/src/chk/chk_internal.h b/src/chk/chk_internal.h index 86868e305ee..9ab16b060b3 100644 --- a/src/chk/chk_internal.h +++ b/src/chk/chk_internal.h @@ -596,6 +596,7 @@ struct chk_pool_rec { cpr_stop:1, cpr_done:1, cpr_skip:1, + cpr_immutable:1, cpr_dangling:1, cpr_for_orphan:1, cpr_notified_exit:1, diff --git a/src/chk/chk_vos.c b/src/chk/chk_vos.c index fdefc2995f7..af2904affd1 100644 --- a/src/chk/chk_vos.c +++ b/src/chk/chk_vos.c @@ -19,11 +19,20 @@ chk_db_fetch(char *key, int key_size, void *val, int val_size) { d_iov_t key_iov; d_iov_t val_iov; + int rc; d_iov_set(&key_iov, key, key_size); d_iov_set(&val_iov, val, val_size); - return chk_db->sd_fetch(chk_db, CHK_DB_TABLE, &key_iov, &val_iov); + if (chk_db->sd_lock) + chk_db->sd_lock(chk_db); + + rc = chk_db->sd_fetch(chk_db, CHK_DB_TABLE, &key_iov, &val_iov); + + if (chk_db->sd_unlock) + chk_db->sd_unlock(chk_db); + + return rc; } static int @@ -33,21 +42,17 @@ chk_db_update(char *key, int key_size, void *val, int val_size) d_iov_t val_iov; int rc; - if (chk_db->sd_tx_begin) { - rc = chk_db->sd_tx_begin(chk_db); - if (rc != 0) - goto out; - } - d_iov_set(&key_iov, key, key_size); d_iov_set(&val_iov, val, val_size); + if (chk_db->sd_lock) + chk_db->sd_lock(chk_db); + rc = chk_db->sd_upsert(chk_db, CHK_DB_TABLE, &key_iov, &val_iov); - if (chk_db->sd_tx_end) - rc = chk_db->sd_tx_end(chk_db, rc); + if (chk_db->sd_unlock) + chk_db->sd_unlock(chk_db); -out: return rc; } @@ -57,27 +62,33 @@ chk_db_delete(char *key, int key_size) d_iov_t key_iov; int rc; - if (chk_db->sd_tx_begin) { - rc = chk_db->sd_tx_begin(chk_db); - if (rc != 0) - goto out; - } - d_iov_set(&key_iov, key, key_size); + if (chk_db->sd_lock) + chk_db->sd_lock(chk_db); + rc = chk_db->sd_delete(chk_db, CHK_DB_TABLE, &key_iov); - if (chk_db->sd_tx_end) - rc = chk_db->sd_tx_end(chk_db, rc); + if (chk_db->sd_unlock) + chk_db->sd_unlock(chk_db); -out: return rc; } static int chk_db_traverse(sys_db_trav_cb_t cb, void *args) { - return chk_db->sd_traverse(chk_db, CHK_DB_TABLE, cb, args); + int rc; + + if (chk_db->sd_lock) + chk_db->sd_lock(chk_db); + + rc = chk_db->sd_traverse(chk_db, CHK_DB_TABLE, cb, args); + + if (chk_db->sd_unlock) + chk_db->sd_unlock(chk_db); + + return rc; } int @@ -243,11 +254,8 @@ chk_prop_update(struct chk_property *cpp, d_rank_list_t *rank_list) d_iov_t val_iov; int rc; - if (chk_db->sd_tx_begin) { - rc = chk_db->sd_tx_begin(chk_db); - if (rc != 0) - goto out; - } + if (chk_db->sd_lock) + chk_db->sd_lock(chk_db); if (cpp->cp_rank_nr != 0 && rank_list != NULL) { D_ASSERTF(cpp->cp_rank_nr == rank_list->rl_nr, "Invalid rank nr %u/%u\n", @@ -259,7 +267,7 @@ chk_prop_update(struct chk_property *cpp, d_rank_list_t *rank_list) rc = chk_db->sd_upsert(chk_db, CHK_DB_TABLE, &key_iov, &val_iov); if (rc != 0) - goto end; + goto out; } d_iov_set(&key_iov, CHK_PROPERTY, strlen(CHK_PROPERTY)); @@ -267,11 +275,10 @@ chk_prop_update(struct chk_property *cpp, d_rank_list_t *rank_list) rc = chk_db->sd_upsert(chk_db, CHK_DB_TABLE, &key_iov, &val_iov); -end: - if (chk_db->sd_tx_end) - rc = chk_db->sd_tx_end(chk_db, rc); - out: + if (chk_db->sd_unlock) + chk_db->sd_unlock(chk_db); + if (rc != 0) D_ERROR("Failed to update check property on rank %u: "DF_RC"\n", dss_self_rank(), DP_RC(rc)); diff --git a/src/client/dfs/cont.c b/src/client/dfs/cont.c index 910e1819aa3..27822914056 100644 --- a/src/client/dfs/cont.c +++ b/src/client/dfs/cont.c @@ -970,7 +970,9 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char * out_snap: 
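	/* common exit: free the OIT args and destroy the snapshot backing the OID table */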
D_FREE(oit_args); epr.epr_hi = epr.epr_lo = snap_epoch; - rc2 = daos_cont_destroy_snap(coh, epr, NULL); + rc2 = daos_cont_destroy_snap(coh, epr, NULL); + if (rc2 != 0) + D_ERROR("Failed to destroy OID table: " DF_RC "\n", DP_RC(rc2)); if (rc == 0) rc = daos_der2errno(rc2); out_dfs: diff --git a/src/client/dfuse/dfuse_core.c b/src/client/dfuse/dfuse_core.c index 4f654fa3209..6397b283e97 100644 --- a/src/client/dfuse/dfuse_core.c +++ b/src/client/dfuse/dfuse_core.c @@ -53,7 +53,7 @@ dfuse_progress_thread(void *arg) return NULL; } - rc = daos_eq_poll(eqt->de_eq, 1, DAOS_EQ_WAIT, 128, &dev[0]); + rc = daos_eq_poll(eqt->de_eq, 1, DAOS_EQ_NOWAIT, 128, &dev[0]); if (rc >= 1) { for (i = 0; i < rc; i++) { struct dfuse_event *ev; diff --git a/src/client/dfuse/pil4dfs/hook.c b/src/client/dfuse/pil4dfs/hook.c index 0ec1a0b5374..c30061387c1 100644 --- a/src/client/dfuse/pil4dfs/hook.c +++ b/src/client/dfuse/pil4dfs/hook.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "hook.h" #include "hook_int.h" @@ -89,10 +90,15 @@ static uint64_t lib_base_addr[MAX_NUM_LIB]; /* List of names of loaded libraries */ static char **lib_name_list; +/* libc version number in current process. e.g., 2.28 */ +static float libc_version; +static char *libc_version_str; + /* end to compile list of memory blocks in /proc/pid/maps */ static char *path_ld; static char *path_libc; +static char *path_libdl; static char *path_libpthread; /* This holds the path of libpil4dfs.so. It is needed when we want to * force child processes append libpil4dfs.so to env LD_PRELOAD. */ @@ -212,7 +218,7 @@ determine_lib_path(void) { int path_offset = 0, read_size, i, rc; char *read_buff_map = NULL; - char *pos, *start, *end, lib_ver_str[32] = "", *lib_dir_str = NULL; + char *pos, *start, *end, *lib_dir_str = NULL; read_size = read_map_file(&read_buff_map); @@ -289,19 +295,17 @@ determine_lib_path(void) goto err; path_libc[end - start] = 0; - pos = strstr(path_libc, "libc-2."); - if (pos) { - /* containing version in name. example, 2.17 */ - memcpy(lib_ver_str, pos + 5, 4); - lib_ver_str[4] = 0; + if (libc_version_str == NULL) { + libc_version_str = (char *)gnu_get_libc_version(); + if (libc_version_str == NULL) { + DS_ERROR(errno, "Failed to determine libc version"); + goto err; + } + libc_version = atof(libc_version_str); } - if (lib_ver_str[0]) { - /* with version in name */ - rc = asprintf(&path_libpthread, "%s/libpthread-%s.so", lib_dir_str, lib_ver_str); - } else { - rc = asprintf(&path_libpthread, "%s/libpthread.so.0", lib_dir_str); - } + /* with version in name */ + rc = asprintf(&path_libpthread, "%s/libpthread-%s.so", lib_dir_str, libc_version_str); if (rc < 0) { DS_ERROR(ENOMEM, "Failed to allocate memory for path_libpthread"); goto err_1; @@ -311,7 +315,18 @@ determine_lib_path(void) path_libpthread = NULL; DS_ERROR(ENAMETOOLONG, "path_libpthread is too long"); goto err_1; - } + } + rc = asprintf(&path_libdl, "%s/libdl-%s.so", lib_dir_str, libc_version_str); + if (rc < 0) { + DS_ERROR(ENOMEM, "Failed to allocate memory for path_libdl"); + goto err_1; + } + if (rc >= PATH_MAX) { + free(path_libdl); + path_libdl = NULL; + DS_ERROR(ENAMETOOLONG, "path_libdl is too long"); + goto err_1; + } D_FREE(lib_dir_str); pos = strstr(read_buff_map, "libpil4dfs.so"); @@ -348,6 +363,11 @@ query_pil4dfs_path(void) return path_libpil4dfs; } +float +query_libc_version(void) +{ + return libc_version; +} /* * query_func_addr - Determine the addresses and code sizes of functions in func_name_list[]. 
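Reviewer note: the determine_lib_path() rework above stops parsing version numbers out of "libc-2.*" file names and instead asks glibc directly, then builds the versioned libpthread/libdl paths from the reported string. A minimal standalone sketch of that approach, assuming a glibc system; the /usr/lib64 directory below is an illustrative placeholder rather than anything taken from the patch:

```c
#define _GNU_SOURCE /* for asprintf() */
#include <gnu/libc-version.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	/* Ask the running glibc for its version instead of parsing file names. */
	const char *ver_str = gnu_get_libc_version(); /* e.g. "2.28" */
	float       ver     = atof(ver_str);          /* numeric form for comparisons */
	char       *path_libdl = NULL;

	/* Older glibc installs version-suffixed DSOs such as libdl-2.28.so;
	 * from 2.34 on dlsym() lives in libc itself, so a caller would pick
	 * the hook target accordingly (as init_myhook() does below). */
	if (asprintf(&path_libdl, "/usr/lib64/libdl-%s.so", ver_str) < 0)
		return 1;

	printf("glibc %s (%.2f) -> %s\n", ver_str, (double)ver, path_libdl);
	free(path_libdl);
	return 0;
}
```

Caching the string once (libc_version_str in the patch) keeps later callers such as query_libc_version() cheap.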
@@ -754,6 +774,7 @@ free_memory_in_hook(void) D_FREE(path_ld); D_FREE(path_libc); D_FREE(module_list); + free(path_libdl); free(path_libpthread); if (lib_name_list) { @@ -1034,6 +1055,8 @@ register_a_hook(const char *module_name, const char *func_name, const void *new_ module_name_local = path_ld; else if (strncmp(module_name, "libc", 5) == 0) module_name_local = path_libc; + else if (strncmp(module_name, "libdl", 6) == 0) + module_name_local = path_libdl; else if (strncmp(module_name, "libpthread", 11) == 0) module_name_local = path_libpthread; else diff --git a/src/client/dfuse/pil4dfs/hook.h b/src/client/dfuse/pil4dfs/hook.h index 7742faaff53..b686d99ce4e 100644 --- a/src/client/dfuse/pil4dfs/hook.h +++ b/src/client/dfuse/pil4dfs/hook.h @@ -60,4 +60,10 @@ free_memory_in_hook(void); char * query_pil4dfs_path(void); +/** + * return glibc version in current process + */ +float +query_libc_version(void); + #endif diff --git a/src/client/dfuse/pil4dfs/int_dfs.c b/src/client/dfuse/pil4dfs/int_dfs.c index fff49d9f0ae..84567ac5f2d 100644 --- a/src/client/dfuse/pil4dfs/int_dfs.c +++ b/src/client/dfuse/pil4dfs/int_dfs.c @@ -161,6 +161,7 @@ static long int page_size; #define DAOS_INIT_RUNNING 1 static _Atomic uint64_t mpi_init_count; +static _Atomic int64_t zeInit_count; static long int daos_initing; _Atomic bool d_daos_inited; @@ -470,6 +471,10 @@ static int (*next_tcgetattr)(int fd, void *termios_p); /* end NOT supported by DAOS */ static int (*next_mpi_init)(int *argc, char ***argv); +static int (*next_pmpi_init)(int *argc, char ***argv); +static int (*next_ze_init)(int flags); +static void *(*next_dlsym)(void *handle, const char *symbol); +static void *(*new_dlsym)(void *handle, const char *symbol); /* to do!! */ /** @@ -1041,6 +1046,159 @@ MPI_Init(int *argc, char ***argv) return rc; } +int +PMPI_Init(int *argc, char ***argv) +{ + int rc; + + if (next_pmpi_init == NULL) { + next_pmpi_init = dlsym(RTLD_NEXT, "PMPI_Init"); + D_ASSERT(next_pmpi_init != NULL); + } + + atomic_fetch_add_relaxed(&mpi_init_count, 1); + rc = next_pmpi_init(argc, argv); + atomic_fetch_add_relaxed(&mpi_init_count, -1); + return rc; +} + +int +zeInit(int flags) +{ + int rc; + + if (next_ze_init == NULL) { + if (d_hook_enabled) + next_ze_init = next_dlsym(RTLD_NEXT, "zeInit"); + else + next_ze_init = dlsym(RTLD_NEXT, "zeInit"); + } + D_ASSERT(next_ze_init != NULL); + atomic_fetch_add_relaxed(&zeInit_count, 1); + rc = next_ze_init(flags); + atomic_fetch_add_relaxed(&zeInit_count, -1); + return rc; +} + +#if defined(__x86_64__) +/* This is used to work around compiling warning and limitations of using asm function. */ +static void * +query_new_dlsym_addr(void *addr) +{ + int i; + + /* assume little endian */ + for (i = 0; i < 64; i++) { + /* 0x56579090 is corresponding to the first four instructions at new_dlsym_asm. + * 0x90 - nop, 0x90 - nop, 0x57 - push %rdi, 0x56 - push %rsi + */ + if (*((int *)(addr + i)) == 0x56579090) { + /* two nop are added for easier positioning. offset +2 here to skip two + * nop and start from the real entry. + */ + return ((void *)(addr + i + 2)); + } + } + return NULL; +} + +_Pragma("GCC diagnostic push") +_Pragma("GCC diagnostic ignored \"-Wunused-function\"") +_Pragma("GCC diagnostic ignored \"-Wunused-variable\"") + +_Pragma("GCC push_options") +_Pragma("GCC optimize(\"-O0\")") +static char str_zeinit[] = "zeInit"; + +static int +is_hook_enabled(void) +{ + return (d_hook_enabled ? 
(1) : (0)); +} + +/* This wrapper function is introduced to avoid compiling issue with Intel-C on Leap 15.5 */ +static int +my_strcmp(const char *s1, const char *s2) +{ + return strcmp(s1, s2); +} + +static void * +get_zeinit_addr(void) +{ + return (void *)zeInit; +} + +__attribute__((aligned(16))) static void +new_dlsym_marker(void) +{ +} + +__asm__( + "new_dlsym_asm:\n" + "nop\n" + "nop\n" + "push %rdi\n" + "push %rsi\n" + + "call is_hook_enabled\n" + "test %eax,%eax\n" + "je org_dlsym\n" + + "mov %rsi, %rdi\n" + "lea str_zeinit(%rip), %rsi\n" + "call my_strcmp\n" + "test %eax,%eax\n" + "jne org_dlsym\n" + + "pop %rsi\n" + "pop %rdi\n" + "call *next_dlsym(%rip)\n" + "mov %rax, next_ze_init(%rip)\n" + + "test %eax,%eax\n" + "jne found\n" + "ret\n" + + "found:\n" + "call get_zeinit_addr\n" + "ret\n" + + "org_dlsym:\n" + "pop %rsi\n" + "pop %rdi\n" + "jmp *next_dlsym(%rip)\n" +); +_Pragma("GCC pop_options") +_Pragma("GCC diagnostic pop") + +#else +/* c code for other architecture. caller info could be wrong inside libc dlsym() when handle is set + * RTLD_NEXT. Assembly version implementation similar to above is needed to fix the issue by using + * jump instead of call instruction. + */ +static void * +new_dlsym_c(void *handle, const char *symbol) +{ + if (!d_hook_enabled) + goto org_dlsym; + printf("Inside my dlsym().\n"); + if (strcmp(symbol, "zeInit") != 0) + goto org_dlsym; + + next_ze_init = next_dlsym(handle, symbol); + if (next_ze_init) + /* dlsym() finished successfully, then intercept zeInit() */ + return zeInit; + else + return next_ze_init; + +org_dlsym: + /* Ideally we need to adjust stack and jump to next_dlsym(). */ + return next_dlsym(handle, symbol); +} +#endif + /** determine whether a path (both relative and absolute) is on DAOS or not. If yes, * returns parent object, item name, full path of parent dir, full absolute path, and * the pointer to struct dfs_mt. @@ -1147,6 +1305,15 @@ query_path(const char *szInput, int *is_target_path, struct dcache_rec **parent, goto out_normal; } + /* Check whether zeInit() is running. If yes, pass to the original + * libc functions. Avoid possible zeInit reentrancy/nested call. + */ + + if (atomic_load_relaxed(&zeInit_count) > 0) { + *is_target_path = 0; + goto out_normal; + } + /* daos_init() is expensive to call. We call it only when necessary. */ /* Check whether daos_init() is running. If yes, pass to the original @@ -2017,6 +2184,7 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char if (!is_target_path) goto org_func; + atomic_fetch_add_relaxed(&num_open, 1); if (oflags & O_CREAT && (oflags & O_DIRECTORY || oflags & O_PATH)) { /* Create a dir is not supported. */ errno = ENOENT; @@ -2044,7 +2212,6 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char } /* Need to create a fake fd and associate with fd_kernel */ - atomic_fetch_add_relaxed(&num_open, 1); dfs_get_mode(dfs_obj, &mode_query); /* regular file */ @@ -2220,7 +2387,6 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char return (idx_dirfd + FD_DIR_BASE); } - atomic_fetch_add_relaxed(&num_open, 1); rc = find_next_available_fd(NULL, &idx_fd); if (rc) @@ -5997,7 +6163,7 @@ ioctl(int fd, unsigned long request, ...) va_list arg; void *param; struct dfuse_user_reply *reply; - int fd_directed; + int fd_directed = fd; va_start(arg, request); param = va_arg(arg, void *); @@ -6023,12 +6189,11 @@ ioctl(int fd, unsigned long request, ...) 
return next_ioctl(fd, request, param); fd_directed = d_get_fd_redirected(fd); - if (fd_directed < FD_FILE_BASE) + if ((fd_directed < FD_FILE_BASE) || (fd_directed >= (FD_DIR_BASE + MAX_OPENED_DIR))) return next_ioctl(fd, request, param); errno = ENOTSUP; - - return -1; + return (-1); } int @@ -6224,6 +6389,14 @@ new_mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) atomic_fetch_add_relaxed(&num_mmap, 1); + if ((fd < FD_FILE_BASE) && (fd_directed >= FD_FILE_BASE) && d_compatible_mode) { + /* DAOS-14494: Force the kernel to update the size before mapping. */ + rc = next_fxstat(1, fd, &stat_buf); + if (rc == -1) + return MAP_FAILED; + return next_mmap(addr, length, prot, flags, fd, offset); + } + addr_ret = next_mmap(addr, length, prot, flags | MAP_ANONYMOUS, -1, offset); if (addr_ret == MAP_FAILED) return MAP_FAILED; @@ -6693,6 +6866,18 @@ check_exe_sh_bash(void) return; } +#define SMALL_DIFF (0.0001) +static int +libc_ver_cmp(float ver_a, float ver_b) +{ + if ((ver_a + SMALL_DIFF) < ver_b) + return (-1); + else if (ver_a > (ver_b + SMALL_DIFF)) + return (1); + else + return (0); +} + static __attribute__((constructor)) void init_myhook(void) { @@ -6700,6 +6885,7 @@ init_myhook(void) char *env_log; int rc; uint64_t eq_count_loc = 0; + float libc_version; umask_old = umask(0); umask(umask_old); @@ -6842,14 +7028,24 @@ init_myhook(void) register_a_hook("libc", "fcntl", (void *)new_fcntl, (long int *)(&libc_fcntl)); - if (d_compatible_mode == false) { - register_a_hook("libc", "mmap", (void *)new_mmap, (long int *)(&next_mmap)); - register_a_hook("libc", "munmap", (void *)new_munmap, (long int *)(&next_munmap)); - } + register_a_hook("libc", "mmap", (void *)new_mmap, (long int *)(&next_mmap)); + register_a_hook("libc", "munmap", (void *)new_munmap, (long int *)(&next_munmap)); register_a_hook("libc", "exit", (void *)new_exit, (long int *)(&next_exit)); register_a_hook("libc", "dup3", (void *)new_dup3, (long int *)(&libc_dup3)); +#if defined(__x86_64__) + new_dlsym = query_new_dlsym_addr(new_dlsym_marker); +#else + new_dlsym = new_dlsym_c; +#endif + D_ASSERT(new_dlsym != NULL); + libc_version = query_libc_version(); + if (libc_ver_cmp(libc_version, 2.34) < 0) + register_a_hook("libdl", "dlsym", (void *)new_dlsym, (long int *)(&next_dlsym)); + else + register_a_hook("libc", "dlsym", (void *)new_dlsym, (long int *)(&next_dlsym)); + init_fd_dup2_list(); /* Need to check whether current process is bash or not under regular & compatible modes.*/ @@ -6861,6 +7057,10 @@ init_myhook(void) dcache_rec_timeout = 0; install_hook(); + + /* Check it here to minimize the work in function new_dlsym() written in assembly */ + D_ASSERT(next_dlsym != NULL); + d_hook_enabled = 1; hook_enabled_bak = d_hook_enabled; } diff --git a/src/client/dfuse/pil4dfs/pil4dfs_int.h b/src/client/dfuse/pil4dfs/pil4dfs_int.h index a9c54b55555..0693123b51f 100644 --- a/src/client/dfuse/pil4dfs/pil4dfs_int.h +++ b/src/client/dfuse/pil4dfs/pil4dfs_int.h @@ -30,7 +30,7 @@ /* FD_FILE_BASE - The base number of the file descriptor for a directory. * The fd allocate from this lib is always larger than FD_FILE_BASE. 
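 * (FD_DIR_BASE below is now derived as FD_FILE_BASE + MAX_OPENED_FILE, keeping directory fds in a contiguous range directly above file fds so range checks such as ioctl()'s FD_DIR_BASE + MAX_OPENED_DIR upper bound remain valid.)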
*/ -#define FD_DIR_BASE (0x40000000) +#define FD_DIR_BASE (FD_FILE_BASE + MAX_OPENED_FILE) /* structure allocated for a FD for a file */ struct file_obj { diff --git a/src/common/lru.c b/src/common/lru.c index de86d367e0e..87dbdaddaa9 100644 --- a/src/common/lru.c +++ b/src/common/lru.c @@ -255,7 +255,9 @@ void daos_lru_ref_release(struct daos_lru_cache *lcache, struct daos_llink *llink) { D_ASSERT(lcache != NULL && llink != NULL && llink->ll_ref > 1); - D_ASSERT(d_list_empty(&llink->ll_qlink)); + D_ASSERTF(d_list_empty(&llink->ll_qlink), + "May hit corrupted item in LRU cache %p: llink %p, refs %d, prev %p, next %p\n", + lcache, llink, llink->ll_ref, llink->ll_qlink.prev, llink->ll_qlink.next); lru_hop_rec_decref(&lcache->dlc_htable, &llink->ll_link); diff --git a/src/container/srv_container.c b/src/container/srv_container.c index 372da43afe4..91d87d9b978 100644 --- a/src/container/srv_container.c +++ b/src/container/srv_container.c @@ -1555,9 +1555,9 @@ cont_destroy(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, * - Users who can delete any container in the pool * - Users who have been given access to delete the specific container */ - if (!ds_sec_pool_can_delete_cont(pool_hdl->sph_sec_capas) && - !ds_sec_cont_can_delete(pool_hdl->sph_flags, &pool_hdl->sph_cred, - &owner, acl)) { + if (pool_hdl->sph_pool->sp_immutable || + (!ds_sec_pool_can_delete_cont(pool_hdl->sph_sec_capas) && + !ds_sec_cont_can_delete(pool_hdl->sph_flags, &pool_hdl->sph_cred, &owner, acl))) { D_ERROR(DF_CONT": permission denied to delete cont\n", DP_CONT(pool_hdl->sph_pool->sp_uuid, cont->c_uuid)); D_GOTO(out_prop, rc = -DER_NO_PERM); @@ -2253,6 +2253,15 @@ cont_open(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, cr goto out; } + if (pool_hdl->sph_pool->sp_immutable && (flags & DAOS_COO_IO_BASE_MASK) != DAOS_COO_RO) { + rc = -DER_NO_PERM; + D_ERROR(DF_UUID "/" DF_UUID "/" DF_UUID ": failed to open the immutable " + "container with flags " DF_X64 ", sec_capas " DF_X64 ": " DF_RC "\n", + DP_UUID(cont->c_svc->cs_pool_uuid), DP_UUID(pool_hdl->sph_uuid), + DP_UUID(cont->c_uuid), flags, pool_hdl->sph_sec_capas, DP_RC(rc)); + goto out; + } + /* * Need props to check for pool redundancy requirements and access * control. 
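Reviewer note: the cont_open() hunk above denies any open of a container in an immutable pool unless the base I/O mode is read-only, mirroring the cont_destroy() gate added earlier in this file. A self-contained sketch of that gating logic, with illustrative stand-in values for the DAOS_COO_* flags and error code (the real definitions live in the DAOS headers):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in flag values for illustration only; not the daos_cont.h values. */
#define DAOS_COO_RO           (1U << 0)
#define DAOS_COO_RW           (1U << 1)
#define DAOS_COO_EX           (1U << 2)
#define DAOS_COO_IO_BASE_MASK (DAOS_COO_RO | DAOS_COO_RW | DAOS_COO_EX)
#define DER_NO_PERM           1007 /* placeholder error number */

static int
check_immutable_open(bool pool_immutable, uint64_t flags)
{
	/* Same shape as the patch: only the base I/O mode bits are inspected,
	 * and anything beyond read-only on an immutable pool is refused. */
	if (pool_immutable && (flags & DAOS_COO_IO_BASE_MASK) != DAOS_COO_RO)
		return -DER_NO_PERM;
	return 0;
}

int
main(void)
{
	printf("RO open: %d\n", check_immutable_open(true, DAOS_COO_RO)); /* allowed: 0 */
	printf("RW open: %d\n", check_immutable_open(true, DAOS_COO_RW)); /* denied */
	return 0;
}
```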
@@ -2274,6 +2283,11 @@ cont_open(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, cr D_GOTO(out, rc); } + D_DEBUG(DB_MD, DF_UUID "/" DF_UUID "/" DF_UUID ": opening container with flags " + DF_X64", sec_capas " DF_X64 "/" DF_X64 "\n", + DP_UUID(cont->c_svc->cs_pool_uuid), DP_UUID(pool_hdl->sph_uuid), + DP_UUID(cont->c_uuid), flags, pool_hdl->sph_sec_capas, sec_capas); + if ((flags & DAOS_COO_EVICT_ALL) && !ds_sec_cont_can_evict_all(sec_capas)) { D_ERROR(DF_CONT": permission denied evicting all handles\n", DP_CONT(cont->c_svc->cs_pool_uuid, cont->c_uuid)); @@ -2282,11 +2296,15 @@ cont_open(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, cr goto out; } - if ((flags & DAOS_COO_EX) && !ds_sec_cont_can_open_ex(sec_capas)) { - D_ERROR(DF_CONT": permission denied opening exclusively\n", - DP_CONT(cont->c_svc->cs_pool_uuid, cont->c_uuid)); - daos_prop_free(prop); + if (((flags & DAOS_COO_EX) && !ds_sec_cont_can_open_ex(sec_capas)) || + ((flags & DAOS_COO_RW) && !ds_sec_cont_can_modify(sec_capas))) { rc = -DER_NO_PERM; + D_ERROR(DF_UUID "/" DF_UUID "/" DF_UUID ": failed to open the container " + "with flags " DF_X64 ", capas " DF_X64 "/" DF_X64 ": " DF_RC "\n", + DP_UUID(cont->c_svc->cs_pool_uuid), DP_UUID(pool_hdl->sph_uuid), + DP_UUID(cont->c_uuid), flags, pool_hdl->sph_sec_capas, sec_capas, + DP_RC(rc)); + daos_prop_free(prop); goto out; } diff --git a/src/container/srv_target.c b/src/container/srv_target.c index b0f3b693580..caf2a4b2ee7 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -475,7 +475,7 @@ cont_aggregate_interval(struct ds_cont_child *cont, cont_aggregate_cb_t cb, if (rc == -DER_SHUTDOWN) { break; /* pool destroyed */ } else if (rc < 0) { - DL_CDEBUG(rc == -DER_BUSY, DB_EPC, DLOG_ERR, rc, + DL_CDEBUG(rc == -DER_BUSY || rc == -DER_INPROGRESS, DB_EPC, DLOG_ERR, rc, DF_CONT ": %s aggregate failed", DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), param->ap_vos_agg ? "VOS" : "EC"); @@ -669,6 +669,7 @@ cont_child_alloc_ref(void *co_uuid, unsigned int ksize, void *po_uuid, cont->sc_dtx_committable_coll_count = 0; D_INIT_LIST_HEAD(&cont->sc_dtx_cos_list); D_INIT_LIST_HEAD(&cont->sc_dtx_coll_list); + D_INIT_LIST_HEAD(&cont->sc_dtx_batched_list); *link = &cont->sc_list; return 0; @@ -935,7 +936,7 @@ cont_child_start(struct ds_pool_child *pool_child, const uuid_t co_uuid, cont_child->sc_stopping, cont_child->sc_destroying); rc = -DER_SHUTDOWN; } else if (!cont_child_started(cont_child)) { - if (!ds_pool_skip_for_check(pool_child->spc_pool)) { + if (!ds_pool_restricted(pool_child->spc_pool, false)) { rc = cont_start_agg(cont_child); if (rc != 0) goto out; @@ -1591,11 +1592,15 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid, * but for creating rebuild global container handle. 
*/ D_ASSERT(hdl->sch_cont != NULL); + D_ASSERT(hdl->sch_cont->sc_pool != NULL); hdl->sch_cont->sc_open++; if (hdl->sch_cont->sc_open > 1) goto opened; + if (ds_pool_restricted(hdl->sch_cont->sc_pool->spc_pool, false)) + goto csum_init; + rc = dtx_cont_open(hdl->sch_cont); if (rc != 0) { D_ASSERTF(hdl->sch_cont->sc_open == 1, "Unexpected open count for cont " @@ -1623,10 +1628,8 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid, D_GOTO(err_dtx, rc); } - D_ASSERT(hdl->sch_cont != NULL); - D_ASSERT(hdl->sch_cont->sc_pool != NULL); +csum_init: rc = ds_cont_csummer_init(hdl->sch_cont); - if (rc != 0) D_GOTO(err_dtx, rc); } diff --git a/src/control/common/proto/ctl/storage_nvme.pb.go b/src/control/common/proto/ctl/storage_nvme.pb.go index 62fede43ed4..cb2dc5099d4 100644 --- a/src/control/common/proto/ctl/storage_nvme.pb.go +++ b/src/control/common/proto/ctl/storage_nvme.pb.go @@ -95,11 +95,12 @@ type ScanNvmeReq struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields - Health bool `protobuf:"varint,1,opt,name=Health,proto3" json:"Health,omitempty"` // Retrieve NVMe device health statistics - Meta bool `protobuf:"varint,2,opt,name=Meta,proto3" json:"Meta,omitempty"` // Retrieve metadata relating to NVMe device - Basic bool `protobuf:"varint,3,opt,name=Basic,proto3" json:"Basic,omitempty"` // Strip NVMe device details to only basic - MetaSize uint64 `protobuf:"varint,4,opt,name=MetaSize,proto3" json:"MetaSize,omitempty"` // Size of the metadata blob - RdbSize uint64 `protobuf:"varint,5,opt,name=RdbSize,proto3" json:"RdbSize,omitempty"` // Size of the RDB blob + Health bool `protobuf:"varint,1,opt,name=Health,proto3" json:"Health,omitempty"` // Retrieve NVMe device health statistics + Meta bool `protobuf:"varint,2,opt,name=Meta,proto3" json:"Meta,omitempty"` // Retrieve metadata relating to NVMe device + Basic bool `protobuf:"varint,3,opt,name=Basic,proto3" json:"Basic,omitempty"` // Strip NVMe device details to only basic + MetaSize uint64 `protobuf:"varint,4,opt,name=MetaSize,proto3" json:"MetaSize,omitempty"` // Size of the metadata blob + RdbSize uint64 `protobuf:"varint,5,opt,name=RdbSize,proto3" json:"RdbSize,omitempty"` // Size of the RDB blob + LinkStats bool `protobuf:"varint,6,opt,name=LinkStats,proto3" json:"LinkStats,omitempty"` // Populate PCIe link info in health statistics } func (x *ScanNvmeReq) Reset() { @@ -169,6 +170,13 @@ func (x *ScanNvmeReq) GetRdbSize() uint64 { return 0 } +func (x *ScanNvmeReq) GetLinkStats() bool { + if x != nil { + return x.LinkStats + } + return false +} + type ScanNvmeResp struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache @@ -276,7 +284,7 @@ var file_ctl_storage_nvme_proto_rawDesc = []byte{ 0x32, 0x12, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x1b, 0x0a, 0x09, 0x72, 0x6f, 0x6c, 0x65, 0x5f, 0x62, 0x69, 0x74, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x08, - 0x72, 0x6f, 0x6c, 0x65, 0x42, 0x69, 0x74, 0x73, 0x22, 0x85, 0x01, 0x0a, 0x0b, 0x53, 0x63, 0x61, + 0x72, 0x6f, 0x6c, 0x65, 0x42, 0x69, 0x74, 0x73, 0x22, 0xa3, 0x01, 0x0a, 0x0b, 0x53, 0x63, 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x12, 0x16, 0x0a, 0x06, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x18, 0x01, 0x20, 0x01, 0x28, 0x08, 0x52, 0x06, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x12, 0x12, 0x0a, 0x04, 0x4d, 0x65, 0x74, 0x61, 0x18, 0x02, 0x20, 0x01, 0x28, 0x08, 0x52, 0x04, @@ -285,18 +293,20 @@ var 
file_ctl_storage_nvme_proto_rawDesc = []byte{ 0x74, 0x61, 0x53, 0x69, 0x7a, 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x08, 0x4d, 0x65, 0x74, 0x61, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x52, 0x64, 0x62, 0x53, 0x69, 0x7a, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x52, 0x64, 0x62, 0x53, 0x69, 0x7a, 0x65, - 0x22, 0x65, 0x0a, 0x0c, 0x53, 0x63, 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x73, 0x70, - 0x12, 0x2b, 0x0a, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, - 0x32, 0x13, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x4e, 0x76, 0x6d, 0x65, 0x43, 0x6f, 0x6e, 0x74, 0x72, - 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x52, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x12, 0x28, 0x0a, - 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x63, - 0x74, 0x6c, 0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, - 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x22, 0x0f, 0x0a, 0x0d, 0x46, 0x6f, 0x72, 0x6d, 0x61, - 0x74, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x42, 0x39, 0x5a, 0x37, 0x67, 0x69, 0x74, 0x68, - 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, - 0x6b, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, - 0x6f, 0x6c, 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, - 0x63, 0x74, 0x6c, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, + 0x12, 0x1c, 0x0a, 0x09, 0x4c, 0x69, 0x6e, 0x6b, 0x53, 0x74, 0x61, 0x74, 0x73, 0x18, 0x06, 0x20, + 0x01, 0x28, 0x08, 0x52, 0x09, 0x4c, 0x69, 0x6e, 0x6b, 0x53, 0x74, 0x61, 0x74, 0x73, 0x22, 0x65, + 0x0a, 0x0c, 0x53, 0x63, 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x2b, + 0x0a, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x13, + 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x4e, 0x76, 0x6d, 0x65, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, + 0x6c, 0x65, 0x72, 0x52, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x12, 0x28, 0x0a, 0x05, 0x73, + 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x63, 0x74, 0x6c, + 0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, + 0x73, 0x74, 0x61, 0x74, 0x65, 0x22, 0x0f, 0x0a, 0x0d, 0x46, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4e, + 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x42, 0x39, 0x5a, 0x37, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, + 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x2f, + 0x64, 0x61, 0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, + 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x63, 0x74, + 0x6c, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( diff --git a/src/control/lib/control/storage.go b/src/control/lib/control/storage.go index 65942a6c12d..9d5fe470de6 100644 --- a/src/control/lib/control/storage.go +++ b/src/control/lib/control/storage.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -258,6 +258,8 @@ func StorageScan(ctx context.Context, rpcClient UnaryInvoker, req *StorageScanRe // Health and meta details required to populate usage statistics. Health: req.NvmeHealth || req.Usage, Meta: req.Usage, + // Only request link stats if health explicitly requested. 
+ LinkStats: req.NvmeHealth, }, }) }) diff --git a/src/control/lib/hardware/pciutils/bindings.go b/src/control/lib/hardware/pciutils/bindings.go index 86686896c80..f4d43c50fa3 100644 --- a/src/control/lib/hardware/pciutils/bindings.go +++ b/src/control/lib/hardware/pciutils/bindings.go @@ -40,6 +40,7 @@ var ( ErrMultiDevices = errors.New("want single device config got multiple") ErrCfgNotTerminated = errors.New("device config content not new-line terminated") ErrCfgMissing = errors.New("incomplete device config") + ErrNoPCIeCaps = errors.New("no pci-express capabilities found") ) // api provides the PCIeLinkStatsProvider interface by exposing a concrete implementation of @@ -150,7 +151,7 @@ func (ap *api) PCIeCapsFromConfig(cfgBytes []byte, dev *hardware.PCIDevice) erro var cp *C.struct_pci_cap = C.pci_find_cap(pciDev, C.PCI_CAP_ID_EXP, C.PCI_CAP_NORMAL) if cp == nil { - return errors.New("no pci-express capabilities found") + return ErrNoPCIeCaps } cpAddr := uint32(cp.addr) diff --git a/src/control/lib/telemetry/counter.go b/src/control/lib/telemetry/counter.go index 81549a32daf..e6e59a8d0ea 100644 --- a/src/control/lib/telemetry/counter.go +++ b/src/control/lib/telemetry/counter.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -37,18 +37,22 @@ func (c *Counter) FloatValue() float64 { } func (c *Counter) Value() uint64 { + ctrVal := BadUintVal if c.handle == nil || c.node == nil { - return BadUintVal + return ctrVal } - var val C.uint64_t - - res := C.d_tm_get_counter(c.handle.ctx, &val, c.node) - if res == C.DER_SUCCESS { - return uint64(val) + fetch := func() C.int { + var val C.uint64_t + res := C.d_tm_get_counter(c.handle.ctx, &val, c.node) + if res == C.DER_SUCCESS { + ctrVal = uint64(val) + } + return res } + c.fetchValWithRetry(fetch) - return BadUintVal + return ctrVal } func newCounter(hdl *handle, path string, name *string, node *C.struct_d_tm_node_t) *Counter { diff --git a/src/control/lib/telemetry/duration.go b/src/control/lib/telemetry/duration.go index 1f32125bc90..3cfd240bde7 100644 --- a/src/control/lib/telemetry/duration.go +++ b/src/control/lib/telemetry/duration.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -34,18 +34,22 @@ func (d *Duration) Type() MetricType { } func (d *Duration) Value() time.Duration { + durValue := BadDuration if d.handle == nil || d.node == nil { - return BadDuration + return durValue } - var tms C.struct_timespec - - res := C.d_tm_get_duration(d.handle.ctx, &tms, &d.stats, d.node) - if res == C.DER_SUCCESS { - return time.Duration(tms.tv_sec)*time.Second + time.Duration(tms.tv_nsec)*time.Nanosecond + fetch := func() C.int { + var tms C.struct_timespec + res := C.d_tm_get_duration(d.handle.ctx, &tms, &d.stats, d.node) + if res == C.DER_SUCCESS { + durValue = time.Duration(tms.tv_sec)*time.Second + time.Duration(tms.tv_nsec)*time.Nanosecond + } + return res } + d.fetchValWithRetry(fetch) - return BadDuration + return durValue } func (d *Duration) FloatValue() float64 { diff --git a/src/control/lib/telemetry/gauge.go b/src/control/lib/telemetry/gauge.go index ea84ff90504..93db24ab9fc 100644 --- a/src/control/lib/telemetry/gauge.go +++ b/src/control/lib/telemetry/gauge.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -41,18 +41,22 @@ func (g *Gauge) FloatValue() float64 { // Value returns the value as an unsigned integer. func (g *Gauge) Value() uint64 { + gaugeVal := BadUintVal if g.handle == nil || g.node == nil { - return BadUintVal + return gaugeVal } - var val C.uint64_t - - res := C.d_tm_get_gauge(g.handle.ctx, &val, nil, g.node) - if res == C.DER_SUCCESS { - return uint64(val) + fetch := func() C.int { + var val C.uint64_t + res := C.d_tm_get_gauge(g.handle.ctx, &val, nil, g.node) + if res == C.DER_SUCCESS { + gaugeVal = uint64(val) + } + return res } + g.fetchValWithRetry(fetch) - return BadUintVal + return gaugeVal } func newGauge(hdl *handle, path string, name *string, node *C.struct_d_tm_node_t) *Gauge { @@ -103,18 +107,22 @@ func (g *StatsGauge) FloatValue() float64 { // Value returns the gauge value as an unsigned integer. func (g *StatsGauge) Value() uint64 { + gaugeVal := BadUintVal if g.handle == nil || g.node == nil { - return BadUintVal + return gaugeVal } - var val C.uint64_t - - res := C.d_tm_get_gauge(g.handle.ctx, &val, &g.stats, g.node) - if res == C.DER_SUCCESS { - return uint64(val) + fetch := func() C.int { + var val C.uint64_t + res := C.d_tm_get_gauge(g.handle.ctx, &val, &g.stats, g.node) + if res == C.DER_SUCCESS { + gaugeVal = uint64(val) + } + return res } + g.fetchValWithRetry(fetch) - return BadUintVal + return gaugeVal } func newStatsGauge(hdl *handle, path string, name *string, node *C.struct_d_tm_node_t) *StatsGauge { diff --git a/src/control/lib/telemetry/snapshot.go b/src/control/lib/telemetry/snapshot.go index 2ffa23296c3..5b2af9f0747 100644 --- a/src/control/lib/telemetry/snapshot.go +++ b/src/control/lib/telemetry/snapshot.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -34,18 +34,22 @@ func (s *Snapshot) Type() MetricType { } func (s *Snapshot) Value() time.Time { + timeVal := time.Time{} // zero val if s.handle == nil || s.node == nil { - return time.Time{} + return timeVal } - var tms C.struct_timespec - - res := C.d_tm_get_timer_snapshot(s.handle.ctx, &tms, s.node) - if res == C.DER_SUCCESS { - return time.Unix(int64(tms.tv_sec), int64(tms.tv_nsec)) + fetch := func() C.int { + var tms C.struct_timespec + res := C.d_tm_get_timer_snapshot(s.handle.ctx, &tms, s.node) + if res == C.DER_SUCCESS { + timeVal = time.Unix(int64(tms.tv_sec), int64(tms.tv_nsec)) + } + return res } + s.fetchValWithRetry(fetch) - return time.Time{} + return timeVal } func (s *Snapshot) FloatValue() float64 { diff --git a/src/control/lib/telemetry/telemetry.go b/src/control/lib/telemetry/telemetry.go index bb0593240b6..479c41e2aab 100644 --- a/src/control/lib/telemetry/telemetry.go +++ b/src/control/lib/telemetry/telemetry.go @@ -84,6 +84,8 @@ const ( BadDuration = time.Duration(BadIntVal) PathSep = filepath.Separator + + maxFetchRetries = 1 ) type ( @@ -304,6 +306,16 @@ func (mb *metricBase) String() string { return strings.TrimSpace(string(buf[:bytes.Index(buf, []byte{0})])) } +func (mb *metricBase) fetchValWithRetry(fetchFn func() C.int) C.int { + var rc C.int + for i := 0; i < maxFetchRetries; i++ { + if rc = fetchFn(); rc == C.DER_SUCCESS { + return rc + } + } + return rc +} + func (sm *statsMetric) Min() uint64 { return uint64(sm.stats.dtm_min) } diff --git a/src/control/lib/telemetry/timestamp.go b/src/control/lib/telemetry/timestamp.go index 97ef5bb1ed9..c787aed488d 100644 --- a/src/control/lib/telemetry/timestamp.go +++ b/src/control/lib/telemetry/timestamp.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -34,16 +34,22 @@ func (t *Timestamp) Type() MetricType { } func (t *Timestamp) Value() time.Time { - zero := time.Time{} + timeVal := time.Time{} // zero val if t.handle == nil || t.node == nil { - return zero + return timeVal } - var clk C.time_t - res := C.d_tm_get_timestamp(t.handle.ctx, &clk, t.node) - if res == C.DER_SUCCESS { - return time.Unix(int64(clk), 0) + + fetch := func() C.int { + var clk C.time_t + res := C.d_tm_get_timestamp(t.handle.ctx, &clk, t.node) + if res == C.DER_SUCCESS { + timeVal = time.Unix(int64(clk), 0) + } + return res } - return zero + t.fetchValWithRetry(fetch) + + return timeVal } // FloatValue converts the timestamp to time in seconds since the UNIX epoch. 
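Reviewer note: all of the telemetry Value() rewrites above share one shape: the native d_tm_get_*() call moves into a fetch closure driven by fetchValWithRetry(), and the result variable is pre-seeded with the "bad" sentinel so a failed fetch needs no extra branch. With maxFetchRetries currently 1 the loop makes a single attempt, so the wrapper is really a seam for raising the budget later. A rough C rendering of the same pattern, with a dummy callback standing in for d_tm_get_counter():

```c
#include <stdint.h>
#include <stdio.h>

#define DER_SUCCESS       0
#define MAX_FETCH_RETRIES 1           /* mirrors maxFetchRetries in telemetry.go */
#define BAD_UINT_VAL      UINT64_MAX  /* stand-in for the Go BadUintVal sentinel */

typedef int (*fetch_fn_t)(uint64_t *val, void *arg);

/* Drive a fetch callback until it succeeds or the attempt budget runs out. */
static int
fetch_val_with_retry(fetch_fn_t fetch, uint64_t *val, void *arg)
{
	int rc = -1;

	for (int i = 0; i < MAX_FETCH_RETRIES; i++) {
		rc = fetch(val, arg);
		if (rc == DER_SUCCESS)
			break;
	}
	return rc;
}

/* Dummy fetch standing in for d_tm_get_counter(ctx, &val, node). */
static int
dummy_fetch(uint64_t *val, void *arg)
{
	(void)arg;
	*val = 42;
	return DER_SUCCESS;
}

int
main(void)
{
	uint64_t val = BAD_UINT_VAL; /* pre-seed the sentinel, as the Go code does */

	if (fetch_val_with_retry(dummy_fetch, &val, NULL) != DER_SUCCESS)
		printf("fetch failed, keeping sentinel\n");
	printf("val=%llu\n", (unsigned long long)val);
	return 0;
}
```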
diff --git a/src/control/server/ctl_smd_rpc_test.go b/src/control/server/ctl_smd_rpc_test.go index 6378e81d23c..06f1276fa25 100644 --- a/src/control/server/ctl_smd_rpc_test.go +++ b/src/control/server/ctl_smd_rpc_test.go @@ -17,6 +17,7 @@ import ( "github.com/daos-stack/daos/src/control/common/test" "github.com/daos-stack/daos/src/control/drpc" "github.com/daos-stack/daos/src/control/lib/daos" + "github.com/daos-stack/daos/src/control/lib/hardware/pciutils" "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/logging" "github.com/daos-stack/daos/src/control/server/config" @@ -88,6 +89,7 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) { drpcResps map[int][]*mockDrpcResponse harnessStopped bool ioStopped bool + pciDevErr error expResp *ctlpb.SmdQueryResp expErr error }{ @@ -658,6 +660,46 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) { }, expErr: daos.FreeMemError, }, + "list-devices; with health; update link stats": { + req: &ctlpb.SmdQueryReq{ + OmitPools: true, + Rank: uint32(ranklist.NilRank), + Uuid: test.MockUUID(1), + IncludeBioHealth: true, + }, + drpcResps: map[int][]*mockDrpcResponse{ + 0: { + { + Message: &ctlpb.SmdDevResp{ + Devices: []*ctlpb.SmdDevice{pbNormDev(0)}, + }, + }, + }, + 1: { + { + Message: &ctlpb.SmdDevResp{ + Devices: []*ctlpb.SmdDevice{ + func() *ctlpb.SmdDevice { + sd := pbFaultDev(1) + sd.Ctrlr.PciCfg = "ABCD" + return sd + }(), + }, + }, + }, + { + Message: &ctlpb.BioHealthResp{ + Temperature: 1000000, + TempWarn: true, + }, + }, + }, + }, + // Prove mock link stats provider gets called when IncludeBioHealth + // flag is set and Ctrlr.PciCfg string is not empty. + pciDevErr: errors.New("link stats provider fail"), + expErr: errors.New("link stats provider fail"), + }, "ambiguous UUID": { req: &ctlpb.SmdQueryReq{ Rank: uint32(ranklist.NilRank), @@ -680,6 +722,13 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) + linkStatsProv = &mockPCIeLinkStatsProvider{ + pciDevErr: tc.pciDevErr, + } + defer func() { + linkStatsProv = pciutils.NewPCIeLinkStatsProvider() + }() + engineCount := len(tc.drpcResps) if engineCount == 0 { engineCount = 1 diff --git a/src/control/server/instance_storage_rpc.go b/src/control/server/instance_storage_rpc.go index 90d66600dad..3097440f500 100644 --- a/src/control/server/instance_storage_rpc.go +++ b/src/control/server/instance_storage_rpc.go @@ -28,9 +28,14 @@ import ( ) var ( - scanSmd = listSmdDevices - getCtrlrHealth = getBioHealth + // Function pointers to enable mocking. + scanSmd = listSmdDevices + scanHealth = getBioHealth + linkStatsProv = pciutils.NewPCIeLinkStatsProvider() + + // Sentinel errors to enable comparison. errEngineBdevScanEmptyDevList = errors.New("empty device list for engine instance") + errCtrlrHealthSkipped = errors.New("controller health update was skipped") ) // newMntRet creates and populates SCM mount result. @@ -168,13 +173,17 @@ func (ei *EngineInstance) StorageFormatSCM(ctx context.Context, force bool) (mRe } func addLinkInfoToHealthStats(prov hardware.PCIeLinkStatsProvider, pciCfg string, health *ctlpb.BioHealthResp) error { + if health == nil { + return errors.New("nil BioHealthResp") + } + // Convert byte-string to lspci-format. 
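	// (PciCfg carries the controller's PCIe config space as a byte-string;
	// formatBytestring() rewrites it into the lspci-style dump that
	// PCIeCapsFromConfig() expects.)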
sb := new(strings.Builder) formatBytestring(pciCfg, sb) pciDev := &hardware.PCIDevice{} if err := prov.PCIeCapsFromConfig([]byte(sb.String()), pciDev); err != nil { - return err + return errors.Wrap(err, "pciutils lib") } // Copy link details from PCIDevice to health stats. @@ -243,31 +252,74 @@ func publishLinkStatEvents(engine Engine, pciAddr string, stats *ctlpb.BioHealth lastMaxWidthStr, lastWidthStr, stats.LinkPortId) } -func populateCtrlrHealth(ctx context.Context, engine Engine, req *ctlpb.BioHealthReq, ctrlr *ctlpb.NvmeController, prov hardware.PCIeLinkStatsProvider) (bool, error) { - stateName := ctlpb.NvmeDevState_name[int32(ctrlr.DevState)] - if !ctrlr.CanSupplyHealthStats() { - engine.Debugf("skip fetching health stats on device %q in %q state", - ctrlr.PciAddr, stateName) - return false, nil +type ctrlrHealthReq struct { + meta bool + engine Engine + bhReq *ctlpb.BioHealthReq + ctrlr *ctlpb.NvmeController + linkStatsProv hardware.PCIeLinkStatsProvider +} + +// Retrieve NVMe controller health statistics for those in an acceptable state. Return nil health +// resp if in a bad state. +func getCtrlrHealth(ctx context.Context, req ctrlrHealthReq) (*ctlpb.BioHealthResp, error) { + stateName := ctlpb.NvmeDevState_name[int32(req.ctrlr.DevState)] + if !req.ctrlr.CanSupplyHealthStats() { + req.engine.Debugf("skip fetching health stats on device %q in %q state", + req.ctrlr.PciAddr, stateName) + return nil, errCtrlrHealthSkipped } - health, err := getCtrlrHealth(ctx, engine, req) + health, err := scanHealth(ctx, req.engine, req.bhReq) if err != nil { - return false, errors.Wrapf(err, "retrieve health stats for %q (state %q)", ctrlr, + return nil, errors.Wrapf(err, "retrieve health stats for %q (state %q)", req.ctrlr, stateName) } - if ctrlr.PciCfg != "" { - if err := addLinkInfoToHealthStats(prov, ctrlr.PciCfg, health); err != nil { - return false, errors.Wrapf(err, "add link stats for %q", ctrlr) + return health, nil +} + +// Add link state and capability information to input health statistics for the given controller +// then if successful publish events based on link statistic changes. Link updated health stats to +// controller. +func setCtrlrHealthWithLinkInfo(req ctrlrHealthReq, health *ctlpb.BioHealthResp) error { + err := addLinkInfoToHealthStats(req.linkStatsProv, req.ctrlr.PciCfg, health) + if err == nil { + publishLinkStatEvents(req.engine, req.ctrlr.PciAddr, health) + } else { + if errors.Cause(err) != pciutils.ErrNoPCIeCaps { + return errors.Wrapf(err, "add link stats for %q", req.ctrlr) + } + req.engine.Debugf("device %q not reporting PCIe capabilities", req.ctrlr.PciAddr) + } + + return nil +} + +// Update controller health statistics and include link info if required and available. +func populateCtrlrHealth(ctx context.Context, req ctrlrHealthReq) (bool, error) { + health, err := getCtrlrHealth(ctx, req) + if err != nil { + if err == errCtrlrHealthSkipped { + // Nothing to do. 
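			// The controller keeps its existing HealthStats (nil here)
			// and the caller reports healthUpdated == false.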
+ return false, nil } - publishLinkStatEvents(engine, ctrlr.PciAddr, health) + return false, errors.Wrap(err, "get ctrlr health") + } + + if req.linkStatsProv == nil { + req.engine.Debugf("device %q skip adding link stats; nil provider", + req.ctrlr.PciAddr) + } else if req.ctrlr.PciCfg == "" { + req.engine.Debugf("device %q skip adding link stats; empty pci cfg", + req.ctrlr.PciAddr) } else { - engine.Debugf("no pcie config space received for %q, skip add link stats", ctrlr) + if err = setCtrlrHealthWithLinkInfo(req, health); err != nil { + return false, errors.Wrap(err, "set ctrlr health") + } } - ctrlr.HealthStats = health - ctrlr.PciCfg = "" + req.ctrlr.HealthStats = health return true, nil } @@ -305,12 +357,13 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc c := seenCtrlrs[addr] - // Only minimal info provided in standard scan to enable result aggregation across - // homogeneous hosts. engineRank, err := engine.GetRank() if err != nil { return nil, errors.Wrapf(err, "instance %d GetRank", engine.Index()) } + + // Only provide minimal info in standard scan to enable result aggregation across + // homogeneous hosts. nsd := &ctlpb.SmdDevice{ RoleBits: sd.RoleBits, CtrlrNamespaceId: sd.CtrlrNamespaceId, @@ -326,18 +379,29 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc // Populate health if requested. healthUpdated := false if pbReq.Health && c.HealthStats == nil { - bhReq := &ctlpb.BioHealthReq{ - DevUuid: sd.Uuid, - MetaSize: pbReq.MetaSize, - RdbSize: pbReq.RdbSize, + bhReq := &ctlpb.BioHealthReq{DevUuid: sd.Uuid} + if pbReq.Meta { + bhReq.MetaSize = pbReq.MetaSize + bhReq.RdbSize = pbReq.RdbSize } - upd, err := populateCtrlrHealth(ctx, engine, bhReq, c, - pciutils.NewPCIeLinkStatsProvider()) + + chReq := ctrlrHealthReq{ + engine: engine, + bhReq: bhReq, + ctrlr: c, + } + if pbReq.LinkStats { + // Add link stats to health if flag set. + chReq.linkStatsProv = linkStatsProv + } + + healthUpdated, err = populateCtrlrHealth(ctx, chReq) if err != nil { return nil, err } - healthUpdated = upd } + // Used to update health with link stats, now redundant. + c.PciCfg = "" // Populate usage data if requested. if pbReq.Meta { @@ -510,12 +574,20 @@ func smdQueryEngine(ctx context.Context, engine Engine, pbReq *ctlpb.SmdQueryReq continue // Skip health query if UUID doesn't match requested. } if pbReq.IncludeBioHealth { - bhReq := &ctlpb.BioHealthReq{DevUuid: dev.Uuid} - if _, err := populateCtrlrHealth(ctx, engine, bhReq, dev.Ctrlr, - pciutils.NewPCIeLinkStatsProvider()); err != nil { + chReq := ctrlrHealthReq{ + engine: engine, + bhReq: &ctlpb.BioHealthReq{DevUuid: dev.Uuid}, + ctrlr: dev.Ctrlr, + linkStatsProv: linkStatsProv, + } + + if _, err = populateCtrlrHealth(ctx, chReq); err != nil { return nil, err } } + // Used to update health with link stats, now redundant. 
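// populateCtrlrHealth above degrades gracefully: a nil linkStatsProv means
// link stats were not requested, an empty PciCfg means no config space was
// returned, and only a genuine provider failure propagates as an error. A
// small sketch of that guard ladder under hypothetical types, not the DAOS API:

package main

import "fmt"

// Stand-in for hardware.PCIeLinkStatsProvider; for illustration only.
type linkStatsProvider interface {
	capsFromConfig(cfg string) (string, error)
}

type okProvider struct{}

func (okProvider) capsFromConfig(string) (string, error) { return "speed/width", nil }

// Mirrors the guard ladder: nil provider and empty config skip quietly;
// only a provider failure is treated as an error.
func addLinkStats(prov linkStatsProvider, pciCfg string) (string, error) {
	switch {
	case prov == nil:
		return "skip link stats: nil provider", nil
	case pciCfg == "":
		return "skip link stats: empty pci cfg", nil
	default:
		caps, err := prov.capsFromConfig(pciCfg)
		if err != nil {
			return "", err
		}
		return "added " + caps, nil
	}
}

func main() {
	fmt.Println(addLinkStats(nil, "ABCD"))          // nil provider
	fmt.Println(addLinkStats(okProvider{}, ""))     // empty cfg
	fmt.Println(addLinkStats(okProvider{}, "ABCD")) // added speed/width
}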
+ dev.Ctrlr.PciCfg = "" + if pbReq.Uuid != "" && dev.Uuid == pbReq.Uuid { rResp.Devices = []*ctlpb.SmdDevice{dev} found = true diff --git a/src/control/server/instance_storage_rpc_test.go b/src/control/server/instance_storage_rpc_test.go index b199adb6b8d..0f72f739970 100644 --- a/src/control/server/instance_storage_rpc_test.go +++ b/src/control/server/instance_storage_rpc_test.go @@ -21,6 +21,7 @@ import ( "github.com/daos-stack/daos/src/control/common/test" "github.com/daos-stack/daos/src/control/events" "github.com/daos-stack/daos/src/control/lib/hardware" + "github.com/daos-stack/daos/src/control/lib/hardware/pciutils" "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/logging" "github.com/daos-stack/daos/src/control/server/config" @@ -59,24 +60,27 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { } for name, tc := range map[string]struct { - badDevState bool - noPciCfgSpc bool - pciDev *hardware.PCIDevice - pciDevErr error - emptyHealthRes bool - healthErr error - lastStats map[string]*ctlpb.BioHealthResp - expCtrlr *ctlpb.NvmeController - expNotUpdated bool - expErr error - expDispatched []*events.RASEvent - expLastStats map[string]*ctlpb.BioHealthResp + badDevState bool + nilLinkStatsProv bool + noPciCfgSpc bool + pciDev *hardware.PCIDevice + pciDevErr error + healthReq *ctlpb.BioHealthReq + healthRes *ctlpb.BioHealthResp + nilHealthRes bool + healthErr error + lastStats map[string]*ctlpb.BioHealthResp + expCtrlr *ctlpb.NvmeController + expNotUpdated bool + expErr error + expDispatched []*events.RASEvent + expLastStats map[string]*ctlpb.BioHealthResp }{ "bad state; skip health": { badDevState: true, - noPciCfgSpc: true, expCtrlr: &ctlpb.NvmeController{ PciAddr: pciAddr, + PciCfg: "ABCD", }, expNotUpdated: true, }, @@ -88,11 +92,16 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { HealthStats: healthWithLinkStats(0, 0, 0, 0), }, }, + "nil bio health response": { + nilHealthRes: true, + expErr: errors.New("nil BioHealthResp"), + }, "empty bio health response; empty link stats": { - emptyHealthRes: true, - pciDev: new(hardware.PCIDevice), + healthRes: new(ctlpb.BioHealthResp), + pciDev: new(hardware.PCIDevice), expCtrlr: &ctlpb.NvmeController{ PciAddr: pciAddr, + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: new(ctlpb.BioHealthResp), }, @@ -106,9 +115,19 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { pciDevErr: errors.New("fail"), expErr: errors.New("fail"), }, + "update health; add link stats; pciutils lib error; missing pcie caps": { + pciDevErr: pciutils.ErrNoPCIeCaps, + expCtrlr: &ctlpb.NvmeController{ + PciAddr: pciAddr, + PciCfg: "ABCD", + DevState: ctlpb.NvmeDevState_NORMAL, + HealthStats: healthWithLinkStats(0, 0, 0, 0), + }, + }, "update health; add link stats; normal link state; no event published": { expCtrlr: &ctlpb.NvmeController{ PciAddr: pciAddr, + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: proto.MockNvmeHealth(), }, @@ -128,6 +147,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { }, expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(2.5e+9, 1e+9, 4, 4), }, @@ -152,6 +172,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { }, expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(1e+9, 1e+9, 8, 4), }, @@ -167,6 
+188,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(proto.MockNvmeHealth()), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: proto.MockNvmeHealth(), }, @@ -176,6 +198,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(1e+9, 0.5e+9, 4, 4)), expCtrlr: &ctlpb.NvmeController{ PciAddr: pciAddr, + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: proto.MockNvmeHealth(), }, @@ -191,6 +214,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(1e+9, 1e+9, 4, 1)), expCtrlr: &ctlpb.NvmeController{ PciAddr: pciAddr, + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: proto.MockNvmeHealth(), }, @@ -213,6 +237,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(2.5e+9, 1e+9, 4, 4)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(2.5e+9, 1e+9, 4, 4), }, @@ -229,6 +254,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(1e+9, 1e+9, 8, 4)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(1e+9, 1e+9, 8, 4), }, @@ -245,6 +271,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(2.5e+9, 2.5e+9, 8, 4)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(2.5e+9, 1e+9, 8, 8), }, @@ -271,6 +298,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(2.5e+9, 1e+9, 8, 8)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(2.5e+9, 2.5e+9, 8, 4), }, @@ -297,6 +325,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(8e+9, 2.5e+9, 4, 4)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(8e+9, 1e+9, 4, 4), }, @@ -319,6 +348,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(1e+9, 1e+9, 16, 8)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(1e+9, 1e+9, 16, 4), }, @@ -335,15 +365,11 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) - healthRes := healthWithLinkStats(0, 0, 0, 0) - if tc.emptyHealthRes { - healthRes = new(ctlpb.BioHealthResp) - } - getCtrlrHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) { - return healthRes, tc.healthErr + scanHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) { + return tc.healthRes, tc.healthErr } defer func() { - getCtrlrHealth = getBioHealth + scanHealth = getBioHealth }() var devState ctlpb.NvmeDevState @@ -363,10 +389,16 @@ func TestIOEngineInstance_populateCtrlrHealth(t 
*testing.T) { LinkMaxWidth: 4, } } + if tc.healthRes == nil && !tc.nilHealthRes { + tc.healthRes = healthWithLinkStats(0, 0, 0, 0) + } - mockProv := &mockPCIeLinkStatsProvider{ - pciDev: tc.pciDev, - pciDevErr: tc.pciDevErr, + var mockProv *mockPCIeLinkStatsProvider + if !tc.nilLinkStatsProv { + mockProv = &mockPCIeLinkStatsProvider{ + pciDev: tc.pciDev, + pciDevErr: tc.pciDevErr, + } } ctrlr := &ctlpb.NvmeController{ @@ -387,8 +419,14 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { subscriber := newMockSubscriber(2) ps.Subscribe(events.RASTypeInfoOnly, subscriber) - upd, err := populateCtrlrHealth(test.Context(t), ei, - &ctlpb.BioHealthReq{}, ctrlr, mockProv) + chReq := ctrlrHealthReq{ + engine: ei, + bhReq: tc.healthReq, + ctrlr: ctrlr, + linkStatsProv: mockProv, + } + + upd, err := populateCtrlrHealth(test.Context(t), chReq) test.CmpErr(t, tc.expErr, err) if err != nil { return @@ -436,6 +474,7 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { } defSmdScanRes := func() *ctlpb.SmdDevResp { sd := proto.MockSmdDevice(c, 2) + sd.Rank = 2 return &ctlpb.SmdDevResp{Devices: []*ctlpb.SmdDevice{sd}} } healthRespWithUsage := func() *ctlpb.BioHealthResp { @@ -444,6 +483,19 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { mh.MetaWalSize, mh.RdbWalSize = 4, 5 return mh } + ctrlrWithUsageAndMeta := func() *ctlpb.NvmeController { + c := proto.MockNvmeController(2) + c.HealthStats = healthRespWithUsage() + sd := proto.MockSmdDevice(nil, 2) + sd.Rank = 1 + sd.TotalBytes = c.HealthStats.TotalBytes + sd.AvailBytes = c.HealthStats.AvailBytes + sd.ClusterSize = c.HealthStats.ClusterSize + sd.MetaWalSize = c.HealthStats.MetaWalSize + sd.RdbWalSize = c.HealthStats.RdbWalSize + c.SmdDevices = []*ctlpb.SmdDevice{sd} + return c + } for name, tc := range map[string]struct { req ctlpb.ScanNvmeReq @@ -640,28 +692,14 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { State: new(ctlpb.ResponseState), }, }, - "scan over drpc; with smd and health; usage and wal size reported": { + "scan over drpc; with meta and health; usage and wal size reported": { req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, rank: 1, smdRes: defSmdScanRes(), healthRes: healthRespWithUsage(), expResp: &ctlpb.ScanNvmeResp{ - Ctrlrs: proto.NvmeControllers{ - func() *ctlpb.NvmeController { - c := proto.MockNvmeController(2) - c.HealthStats = healthRespWithUsage() - sd := proto.MockSmdDevice(nil, 2) - sd.Rank = 1 - sd.TotalBytes = c.HealthStats.TotalBytes - sd.AvailBytes = c.HealthStats.AvailBytes - sd.ClusterSize = c.HealthStats.ClusterSize - sd.MetaWalSize = c.HealthStats.MetaWalSize - sd.RdbWalSize = c.HealthStats.RdbWalSize - c.SmdDevices = []*ctlpb.SmdDevice{sd} - return c - }(), - }, - State: new(ctlpb.ResponseState), + Ctrlrs: proto.NvmeControllers{ctrlrWithUsageAndMeta()}, + State: new(ctlpb.ResponseState), }, }, "scan over drpc; only ctrlrs with valid states shown": { @@ -703,7 +741,7 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { State: new(ctlpb.ResponseState), }, }, - "scan over drpc; with smd and health; missing ctrlr in smd": { + "scan over drpc; with meta and health; missing ctrlr in smd": { req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, smdRes: func() *ctlpb.SmdDevResp { ssr := defSmdScanRes() @@ -713,23 +751,48 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { healthRes: healthRespWithUsage(), expErr: errors.New("no ctrlr ref"), }, - "scan over drpc; with smd and health; health scan fails": { + "scan over drpc; with meta and health; health scan fails": { req: 
ctlpb.ScanNvmeReq{Meta: true, Health: true}, smdRes: defSmdScanRes(), healthErr: errors.New("health scan failed"), expErr: errors.New("health scan failed"), }, - "scan over drpc; with smd and health; smd list fails": { + "scan over drpc; with meta and health; smd list fails": { req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, smdErr: errors.New("smd scan failed"), healthRes: healthRespWithUsage(), expErr: errors.New("smd scan failed"), }, - "scan over drpc; with smd and health; nil smd list returned": { + "scan over drpc; with meta and health; nil smd list returned": { req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, healthRes: healthRespWithUsage(), expErr: errors.New("nil smd scan resp"), }, + "scan over drpc; with meta and health; link info update skipped": { + req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, + rank: 1, + smdRes: func() *ctlpb.SmdDevResp { + ssr := defSmdScanRes() + ssr.Devices[0].Ctrlr.PciCfg = "ABCD" + return ssr + }(), + healthRes: healthRespWithUsage(), + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ctrlrWithUsageAndMeta()}, + State: new(ctlpb.ResponseState), + }, + }, + "scan over drpc; with health; link info update run but failed": { + req: ctlpb.ScanNvmeReq{Health: true, LinkStats: true}, + smdRes: func() *ctlpb.SmdDevResp { + ssr := defSmdScanRes() + ssr.Devices[0].Ctrlr.PciCfg = "ABCD" + return ssr + }(), + healthRes: healthRespWithUsage(), + // Prove link stat provider gets called when LinkStats flag set. + expErr: errors.New("link stats provider fail"), + }, } { t.Run(name, func(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) @@ -741,11 +804,17 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { defer func() { scanSmd = listSmdDevices }() - getCtrlrHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) { + scanHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) { return tc.healthRes, tc.healthErr } defer func() { - getCtrlrHealth = getBioHealth + scanHealth = getBioHealth + }() + linkStatsProv = &mockPCIeLinkStatsProvider{ + pciDevErr: errors.New("link stats provider fail"), + } + defer func() { + linkStatsProv = pciutils.NewPCIeLinkStatsProvider() }() if tc.provRes == nil { @@ -776,7 +845,8 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { ei.setSuperblock(nil) } else { ei.setSuperblock(&Superblock{ - Rank: ranklist.NewRankPtr(uint32(tc.rank)), ValidRank: true, + Rank: ranklist.NewRankPtr(uint32(tc.rank)), + ValidRank: true, }) } diff --git a/src/control/server/mgmt_system.go b/src/control/server/mgmt_system.go index e85091b5529..a35753d95ba 100644 --- a/src/control/server/mgmt_system.go +++ b/src/control/server/mgmt_system.go @@ -1002,6 +1002,8 @@ func (svc *mgmtSvc) SystemExclude(ctx context.Context, req *mgmtpb.SystemExclude }) } + svc.reqGroupUpdate(ctx, false) + return resp, nil } diff --git a/src/control/server/mgmt_system_test.go b/src/control/server/mgmt_system_test.go index f482b1943a1..0cffe7f63d8 100644 --- a/src/control/server/mgmt_system_test.go +++ b/src/control/server/mgmt_system_test.go @@ -1765,6 +1765,7 @@ func TestServer_MgmtSvc_SystemExclude(t *testing.T) { mockMember(t, 3, 2, "joined"), }, }, + "unexclude hosts": { req: &mgmtpb.SystemExcludeReq{Hosts: test.MockHostAddr(1).String(), Clear: true}, members: system.Members{ @@ -1795,12 +1796,32 @@ func TestServer_MgmtSvc_SystemExclude(t *testing.T) { if tc.req != nil && tc.req.Sys == "" { tc.req.Sys = build.DefaultSystemName } + + startMapVer, err := 
svc.sysdb.CurMapVersion() + if err != nil { + t.Fatalf("startMapVer CurMapVersion() failed\n") + return + } gotResp, gotAPIErr := svc.SystemExclude(ctx, tc.req) test.CmpErr(t, tc.expAPIErr, gotAPIErr) if tc.expAPIErr != nil { return } + // Check for any system map version increase by the (asynchronous) update. + // Test will time out if it never happens, thus choice of an infinite loop here. + for { + curMapVer, err := svc.sysdb.CurMapVersion() + if err != nil { + t.Fatalf("CurMapVersion() failed\n") + return + } + + if curMapVer > startMapVer { + break + } + } + checkRankResults(t, tc.expResults, gotResp.Results) checkMembers(t, tc.expMembers, svc.membership) }) diff --git a/src/dtx/dtx_coll.c b/src/dtx/dtx_coll.c index 863307e9a7f..ba45aaa2616 100644 --- a/src/dtx/dtx_coll.c +++ b/src/dtx/dtx_coll.c @@ -80,8 +80,14 @@ dtx_coll_prep_ult(void *arg) DP_UUID(cont->sc_uuid), DP_RC(rc)); } - if (dcpa->dcpa_result != 0) + if (dcpa->dcpa_result != 0) { + if (dcpa->dcpa_result < 0 && + dcpa->dcpa_result != -DER_INPROGRESS && dcpa->dcpa_result != -DER_NONEXIST) + D_ERROR("Failed to load mbs for "DF_DTI" in "DF_UUID"/"DF_UUID", opc %u: " + DF_RC"\n", DP_DTI(&dci->dci_xid), DP_UUID(dci->dci_po_uuid), + DP_UUID(dci->dci_co_uuid), opc, DP_RC(dcpa->dcpa_result)); goto out; + } dcpa->dcpa_result = dtx_coll_prep(dci->dci_po_uuid, dcpa->dcpa_oid, &dci->dci_xid, mbs, -1, dci->dci_version, cont->sc_pool->spc_map_version, diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c index ff4f2dfe4ef..1ee74ae11a4 100644 --- a/src/dtx/dtx_common.c +++ b/src/dtx/dtx_common.c @@ -392,11 +392,11 @@ dtx_cleanup(void *arg) if (rc == 0) { D_ASSERT(dce != NULL); - rc = dtx_coll_commit(cont, dce, NULL); + rc = dtx_coll_commit(cont, dce, NULL, false); dtx_coll_entry_put(dce); } } else { - rc = dtx_commit(cont, &dte, NULL, 1); + rc = dtx_commit(cont, &dte, NULL, 1, false); } } @@ -620,17 +620,16 @@ dtx_batched_commit_one(void *arg) tls->dt_batched_ult_cnt++; /* dbca->dbca_reg_gen != cont->sc_dtx_batched_gen means someone reopen the container. */ - while (!dss_ult_exiting(dbca->dbca_commit_req) && + while (!dss_ult_exiting(dbca->dbca_commit_req) && dtx_cont_opened(cont) && dbca->dbca_reg_gen == cont->sc_dtx_batched_gen) { struct dtx_entry **dtes = NULL; - struct dtx_cos_key *dcks = NULL; struct dtx_coll_entry *dce = NULL; struct dtx_stat stat = { 0 }; int cnt; int rc; cnt = dtx_fetch_committable(cont, DTX_THRESHOLD_COUNT, NULL, - DAOS_EPOCH_MAX, false, &dtes, &dcks, &dce); + DAOS_EPOCH_MAX, false, &dtes, NULL, &dce); if (cnt == 0) break; @@ -644,11 +643,11 @@ dtx_batched_commit_one(void *arg) /* Currently, commit collective DTX one by one. */ D_ASSERT(cnt == 1); - rc = dtx_coll_commit(cont, dce, dcks); + rc = dtx_coll_commit(cont, dce, NULL, true); } else { - rc = dtx_commit(cont, dtes, dcks, cnt); + rc = dtx_commit(cont, dtes, NULL, cnt, true); } - dtx_free_committable(dtes, dcks, dce, cnt); + dtx_free_committable(dtes, NULL, dce, cnt); if (rc != 0) { D_WARN("Fail to batched commit %d entries for "DF_UUID": "DF_RC"\n", cnt, DP_UUID(cont->sc_uuid), DP_RC(rc)); @@ -1271,9 +1270,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul uint32_t flags; int status = -1; int rc = 0; - int i; bool aborted = false; - bool unpin = false; D_ASSERT(cont != NULL); @@ -1341,7 +1338,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul * it persistently. Otherwise, the subsequent DTX resync may not find it as * to regard it as failed transaction and abort it. 
*/ - if (result == 0 && !dth->dth_active && !dth->dth_prepared && !dth->dth_solo && + if (!dth->dth_active && !dth->dth_prepared && (dth->dth_dist || dth->dth_modification_cnt > 0)) { result = vos_dtx_attach(dth, true, dth->dth_ent != NULL ? true : false); if (unlikely(result < 0)) { @@ -1351,7 +1348,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul } } - if (dth->dth_prepared || dtx_batched_ult_max == 0) { + if ((dth->dth_prepared && !dlh->dlh_coll) || dtx_batched_ult_max == 0) { dth->dth_sync = 1; goto sync; } @@ -1365,14 +1362,12 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul if (DAOS_FAIL_CHECK(DAOS_DTX_MISS_COMMIT)) dth->dth_sync = 1; - /* For synchronous DTX, do not add it into CoS cache, otherwise, - * we may have no way to remove it from the cache. - */ if (dth->dth_sync) goto sync; D_ASSERT(dth->dth_mbs != NULL); +cache: if (dlh->dlh_coll) { rc = dtx_cos_add(cont, dlh->dlh_coll_entry, &dth->dth_leader_oid, dth->dth_dkey_hash, dth->dth_epoch, DCF_EXP_CMT | DCF_COLL); @@ -1380,38 +1375,47 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul size = sizeof(*dte) + sizeof(*mbs) + dth->dth_mbs->dm_data_size; D_ALLOC(dte, size); if (dte == NULL) { - dth->dth_sync = 1; - goto sync; - } - - mbs = (struct dtx_memberships *)(dte + 1); - memcpy(mbs, dth->dth_mbs, size - sizeof(*dte)); - - dte->dte_xid = dth->dth_xid; - dte->dte_ver = dth->dth_ver; - dte->dte_refs = 1; - dte->dte_mbs = mbs; + rc = -DER_NOMEM; + } else { + mbs = (struct dtx_memberships *)(dte + 1); + memcpy(mbs, dth->dth_mbs, size - sizeof(*dte)); + + dte->dte_xid = dth->dth_xid; + dte->dte_ver = dth->dth_ver; + dte->dte_refs = 1; + dte->dte_mbs = mbs; + + if (!(mbs->dm_flags & DMF_SRDG_REP)) + flags = DCF_EXP_CMT; + else if (dth->dth_modify_shared) + flags = DCF_SHARED; + else + flags = 0; - if (!(mbs->dm_flags & DMF_SRDG_REP)) - flags = DCF_EXP_CMT; - else if (dth->dth_modify_shared) - flags = DCF_SHARED; - else - flags = 0; + rc = dtx_cos_add(cont, dte, &dth->dth_leader_oid, dth->dth_dkey_hash, + dth->dth_epoch, flags); + dtx_entry_put(dte); + } + } - rc = dtx_cos_add(cont, dte, &dth->dth_leader_oid, dth->dth_dkey_hash, - dth->dth_epoch, flags); - dtx_entry_put(dte); + /* + * NOTE: If we failed to add the committable DTX into the CoS cache, then we also have no + * way to commit (or abort) the DTX because of the out-of-memory condition. Such a + * DTX will finally be committed via the next DTX resync (after recovering from OOM). + * + * Here, we only log a warning about the trouble instead of failing the transaction. + */ + if (rc != 0) { + D_WARN(DF_UUID": Fail to cache %s DTX "DF_DTI": "DF_RC"\n", + DP_UUID(cont->sc_uuid), dlh->dlh_coll ?
"collective" : "regular", + DP_DTI(&dth->dth_xid), DP_RC(rc)); + D_GOTO(out, result = 0); } - if (rc == 0) { - if (!DAOS_FAIL_CHECK(DAOS_DTX_NO_COMMITTABLE)) { - vos_dtx_mark_committable(dth); - if (cont->sc_dtx_committable_count > DTX_THRESHOLD_COUNT || dlh->dlh_coll) - sched_req_wakeup(dss_get_module_info()->dmi_dtx_cmt_req); - } - } else { - dth->dth_sync = 1; + if (!DAOS_FAIL_CHECK(DAOS_DTX_NO_COMMITTABLE)) { + vos_dtx_mark_committable(dth); + if (cont->sc_dtx_committable_count > DTX_THRESHOLD_COUNT || dlh->dlh_coll) + sched_req_wakeup(dss_get_module_info()->dmi_dtx_cmt_req); } sync: @@ -1424,16 +1428,21 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul vos_dtx_mark_committable(dth); if (dlh->dlh_coll) { - rc = dtx_coll_commit(cont, dlh->dlh_coll_entry, NULL); + rc = dtx_coll_commit(cont, dlh->dlh_coll_entry, NULL, false); } else { dte = &dth->dth_dte; - rc = dtx_commit(cont, &dte, NULL, 1); + rc = dtx_commit(cont, &dte, NULL, 1, false); } - if (rc != 0) + if (rc != 0) { D_WARN(DF_UUID": Fail to sync %s commit DTX "DF_DTI": "DF_RC"\n", DP_UUID(cont->sc_uuid), dlh->dlh_coll ? "collective" : "regular", DP_DTI(&dth->dth_xid), DP_RC(rc)); + if (likely(dtx_batched_ult_max != 0)) { + dth->dth_sync = 0; + goto cache; + } + } /* * NOTE: The semantics of 'sync' commit does not guarantee that all @@ -1453,7 +1462,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul * to locally retry for avoiding related forwarded RPC timeout, instead, * The leader will trigger retry globally without abort 'prepared' ones. */ - if (unpin || (result < 0 && result != -DER_AGAIN && !dth->dth_solo)) { + if (result < 0 && result != -DER_AGAIN && !dth->dth_solo) { /* 1. Drop partial modification for distributed transaction. * 2. Remove the pinned DTX entry. */ @@ -1487,15 +1496,9 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul /* If piggyback DTX has been done everywhere, then need to handle CoS cache. * It is harmless to keep some partially committed DTX entries in CoS cache. */ - if (result == 0 && dth->dth_cos_done) { - for (i = 0; i < dth->dth_dti_cos_count; i++) - dtx_cos_del(cont, &dth->dth_dti_cos[i], - &dth->dth_leader_oid, dth->dth_dkey_hash); - } else { - for (i = 0; i < dth->dth_dti_cos_count; i++) - dtx_cos_put_piggyback(cont, &dth->dth_dti_cos[i], - &dth->dth_leader_oid, dth->dth_dkey_hash); - } + dtx_cos_put_piggyback(cont, &dth->dth_leader_oid, dth->dth_dkey_hash, dth->dth_dti_cos, + dth->dth_dti_cos_count, + (result == 0 && dth->dth_cos_done) ? true : false); D_DEBUG(DB_IO, "Stop the DTX "DF_DTI" ver %u, dkey %lu, %s, cos %d/%d: result "DF_RC"\n", DP_DTI(&dth->dth_xid), dth->dth_ver, (unsigned long)dth->dth_dkey_hash, @@ -1654,7 +1657,8 @@ dtx_flush_on_close(struct dss_module_info *dmi, struct dtx_batched_cont_args *db struct dtx_coll_entry *dce = NULL; cnt = dtx_fetch_committable(cont, DTX_THRESHOLD_COUNT, - NULL, DAOS_EPOCH_MAX, true, &dtes, &dcks, &dce); + NULL, DAOS_EPOCH_MAX, true, &dtes, + dbca->dbca_commit_req != NULL ? 
&dcks : NULL, &dce); if (cnt <= 0) D_GOTO(out, rc = cnt); @@ -1675,9 +1679,9 @@ dtx_flush_on_close(struct dss_module_info *dmi, struct dtx_batched_cont_args *db if (dce != NULL) { D_ASSERT(cnt == 1); - rc = dtx_coll_commit(cont, dce, dcks); + rc = dtx_coll_commit(cont, dce, dcks, true); } else { - rc = dtx_commit(cont, dtes, dcks, cnt); + rc = dtx_commit(cont, dtes, dcks, cnt, true); } dtx_free_committable(dtes, dcks, dce, cnt); } @@ -2365,9 +2369,9 @@ dtx_obj_sync(struct ds_cont_child *cont, daos_unit_oid_t *oid, if (dce != NULL) { D_ASSERT(cnt == 1); - rc = dtx_coll_commit(cont, dce, dcks); + rc = dtx_coll_commit(cont, dce, dcks, true); } else { - rc = dtx_commit(cont, dtes, dcks, cnt); + rc = dtx_commit(cont, dtes, dcks, cnt, true); } dtx_free_committable(dtes, dcks, dce, cnt); if (rc < 0) { diff --git a/src/dtx/dtx_cos.c b/src/dtx/dtx_cos.c index 6e1d042b82b..4c165f94d0c 100644 --- a/src/dtx/dtx_cos.c +++ b/src/dtx/dtx_cos.c @@ -54,6 +54,8 @@ struct dtx_cos_rec_child { d_list_t dcrc_gl_committable; /* Link into related dcr_{reg,prio}_list. */ d_list_t dcrc_lo_link; + /* Link into container::sc_dtx_batched_list. */ + d_list_t dcrc_batched_link; union { struct dtx_entry *dcrc_dte; struct dtx_coll_entry *dcrc_dce; @@ -61,8 +63,12 @@ struct dtx_cos_rec_child { /* The DTX epoch. */ daos_epoch_t dcrc_epoch; struct dtx_cos_rec *dcrc_ptr; + uint64_t dcrc_ready_time; uint32_t dcrc_piggyback_refs; - uint32_t dcrc_coll:1; /* For collective DTX. */ + uint32_t dcrc_expcmt:1, + dcrc_prio:1, + dcrc_reg:1, + dcrc_coll:1; /* For collective DTX. */ }; struct dtx_cos_rec_bundle { @@ -129,6 +135,8 @@ dtx_cos_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, return -DER_NOMEM; } + D_INIT_LIST_HEAD(&dcrc->dcrc_batched_link); + dcrc->dcrc_ready_time = daos_getmtime_coarse(); dcrc->dcrc_epoch = rbund->epoch; dcrc->dcrc_ptr = dcr; if (rbund->flags & DCF_COLL) { @@ -144,12 +152,15 @@ dtx_cos_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, d_tm_inc_gauge(tls->dt_committable, 1); if (rbund->flags & DCF_EXP_CMT) { + dcrc->dcrc_expcmt = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_expcmt_list); dcr->dcr_expcmt_count = 1; } else if (rbund->flags & DCF_SHARED) { + dcrc->dcrc_prio = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list); dcr->dcr_prio_count = 1; } else { + dcrc->dcrc_reg = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_reg_list); dcr->dcr_reg_count = 1; } @@ -177,6 +188,7 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) dcrc_lo_link) { d_list_del(&dcrc->dcrc_lo_link); d_list_del(&dcrc->dcrc_gl_committable); + d_list_del(&dcrc->dcrc_batched_link); if (dcrc->dcrc_coll) { dtx_coll_entry_put(dcrc->dcrc_dce); coll++; @@ -190,6 +202,7 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) dcrc_lo_link) { d_list_del(&dcrc->dcrc_lo_link); d_list_del(&dcrc->dcrc_gl_committable); + d_list_del(&dcrc->dcrc_batched_link); if (dcrc->dcrc_coll) { dtx_coll_entry_put(dcrc->dcrc_dce); coll++; @@ -203,6 +216,7 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) dcrc_lo_link) { d_list_del(&dcrc->dcrc_lo_link); d_list_del(&dcrc->dcrc_gl_committable); + d_list_del(&dcrc->dcrc_batched_link); if (dcrc->dcrc_coll) { dtx_coll_entry_put(dcrc->dcrc_dce); coll++; @@ -256,6 +270,8 @@ dtx_cos_rec_update(struct btr_instance *tins, struct btr_record *rec, if (dcrc == NULL) return -DER_NOMEM; + D_INIT_LIST_HEAD(&dcrc->dcrc_batched_link); + dcrc->dcrc_ready_time = daos_getmtime_coarse(); dcrc->dcrc_epoch = 
rbund->epoch; dcrc->dcrc_ptr = dcr; if (rbund->flags & DCF_COLL) { @@ -271,12 +287,15 @@ dtx_cos_rec_update(struct btr_instance *tins, struct btr_record *rec, d_tm_inc_gauge(tls->dt_committable, 1); if (rbund->flags & DCF_EXP_CMT) { + dcrc->dcrc_expcmt = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_expcmt_list); dcr->dcr_expcmt_count++; } else if (rbund->flags & DCF_SHARED) { + dcrc->dcrc_prio = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list); dcr->dcr_prio_count++; } else { + dcrc->dcrc_reg = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_reg_list); dcr->dcr_reg_count++; } @@ -294,6 +313,53 @@ btr_ops_t dtx_btr_cos_ops = { .to_rec_update = dtx_cos_rec_update, }; +static int +dtx_cos_del_one(struct ds_cont_child *cont, struct dtx_cos_rec_child *dcrc) +{ + struct dtx_cos_key key; + d_iov_t kiov; + struct dtx_cos_rec *dcr = dcrc->dcrc_ptr; + uint64_t time = daos_getmtime_coarse() - dcrc->dcrc_ready_time; + int rc = 0; + + d_list_del(&dcrc->dcrc_gl_committable); + d_list_del(&dcrc->dcrc_lo_link); + if (!d_list_empty(&dcrc->dcrc_batched_link)) + d_list_del_init(&dcrc->dcrc_batched_link); + + if (dcrc->dcrc_expcmt) + dcr->dcr_expcmt_count--; + else if (dcrc->dcrc_prio) + dcr->dcr_prio_count--; + else + dcr->dcr_reg_count--; + + if (dcrc->dcrc_coll) + cont->sc_dtx_committable_coll_count--; + cont->sc_dtx_committable_count--; + + d_tm_set_gauge(dtx_tls_get()->dt_async_cmt_lat, time); + + if (dcr->dcr_reg_count == 0 && dcr->dcr_prio_count == 0 && dcr->dcr_expcmt_count == 0) { + key.oid = dcr->dcr_oid; + key.dkey_hash = dcr->dcr_dkey_hash; + d_iov_set(&kiov, &key, sizeof(key)); + rc = dbtree_delete(cont->sc_dtx_cos_hdl, BTR_PROBE_EQ, &kiov, NULL); + } + + DL_CDEBUG(rc != 0, DLOG_ERR, DB_IO, rc, + "Remove DTX "DF_DTI" from CoS cache", DP_DTI(&dcrc->dcrc_dte->dte_xid)); + + if (dcrc->dcrc_coll) + dtx_coll_entry_put(dcrc->dcrc_dce); + else + dtx_entry_put(dcrc->dcrc_dte); + + D_FREE(dcrc); + + return rc; +} + int dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, daos_unit_oid_t *oid, daos_epoch_t epoch, bool force, @@ -306,18 +372,45 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, uint32_t count; uint32_t i = 0; + /* Last batched commit failed, let's re-commit them. */ + if (dcks == NULL && !d_list_empty(&cont->sc_dtx_batched_list)) { + dcrc = d_list_entry(cont->sc_dtx_batched_list.next, struct dtx_cos_rec_child, + dcrc_batched_link); + if (unlikely(dcrc->dcrc_coll)) { + *p_dce = dtx_coll_entry_get(dcrc->dcrc_dce); + return 1; + } + + D_ALLOC_ARRAY(dte_buf, max_cnt); + if (dte_buf == NULL) + return -DER_NOMEM; + + d_list_for_each_entry(dcrc, &cont->sc_dtx_batched_list, dcrc_batched_link) { + D_ASSERT(i < max_cnt); + dte_buf[i++] = dtx_entry_get(dcrc->dcrc_dte); + } + + *dtes = dte_buf; + return i; + } + /* Process collective DTX with higher priority.
*/ if (!d_list_empty(&cont->sc_dtx_coll_list) && oid == NULL) { d_list_for_each_entry(dcrc, &cont->sc_dtx_coll_list, dcrc_gl_committable) { if (epoch >= dcrc->dcrc_epoch && (dcrc->dcrc_piggyback_refs == 0 || force)) { - D_ALLOC_PTR(dck_buf); - if (dck_buf == NULL) - return -DER_NOMEM; - - dck_buf->oid = dcrc->dcrc_ptr->dcr_oid; - dck_buf->dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; - *dcks = dck_buf; + if (dcks != NULL) { + D_ALLOC_PTR(dck_buf); + if (dck_buf == NULL) + return -DER_NOMEM; + + dck_buf->oid = dcrc->dcrc_ptr->dcr_oid; + dck_buf->dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; + *dcks = dck_buf; + } else { + d_list_add_tail(&dcrc->dcrc_batched_link, + &cont->sc_dtx_batched_list); + } *p_dce = dtx_coll_entry_get(dcrc->dcrc_dce); return 1; @@ -326,19 +419,19 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, } count = min(cont->sc_dtx_committable_count, max_cnt); - if (count == 0) { - *dtes = NULL; + if (count == 0) return 0; - } D_ALLOC_ARRAY(dte_buf, count); if (dte_buf == NULL) return -DER_NOMEM; - D_ALLOC_ARRAY(dck_buf, count); - if (dck_buf == NULL) { - D_FREE(dte_buf); - return -DER_NOMEM; + if (dcks != NULL) { + D_ALLOC_ARRAY(dck_buf, count); + if (dck_buf == NULL) { + D_FREE(dte_buf); + return -DER_NOMEM; + } } d_list_for_each_entry(dcrc, &cont->sc_dtx_cos_list, dcrc_gl_committable) { @@ -353,17 +446,26 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, continue; D_FREE(dte_buf); - dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid; - dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; - *dcks = dck_buf; + if (dcks != NULL) { + dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid; + dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; + *dcks = dck_buf; + } else { + d_list_add_tail(&dcrc->dcrc_batched_link, + &cont->sc_dtx_batched_list); + } *p_dce = dtx_coll_entry_get(dcrc->dcrc_dce); return 1; } dte_buf[i] = dtx_entry_get(dcrc->dcrc_dte); - dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid; - dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; + if (dcks != NULL) { + dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid; + dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; + } else { + d_list_add_tail(&dcrc->dcrc_batched_link, &cont->sc_dtx_batched_list); + } if (++i >= count) break; @@ -372,10 +474,10 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, if (i == 0) { D_FREE(dte_buf); D_FREE(dck_buf); - *dtes = NULL; } else { *dtes = dte_buf; - *dcks = dck_buf; + if (dcks != NULL) + *dcks = dck_buf; } return i; @@ -436,32 +538,44 @@ dtx_cos_get_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid, } void -dtx_cos_put_piggyback(struct ds_cont_child *cont, struct dtx_id *xid, - daos_unit_oid_t *oid, uint64_t dkey_hash) +dtx_cos_put_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid, uint64_t dkey_hash, + struct dtx_id xid[], uint32_t count, bool rm) { struct dtx_cos_key key; d_iov_t kiov; d_iov_t riov; struct dtx_cos_rec *dcr; struct dtx_cos_rec_child *dcrc; + int del = 0; int rc; + int i; key.oid = *oid; key.dkey_hash = dkey_hash; d_iov_set(&kiov, &key, sizeof(key)); d_iov_set(&riov, NULL, 0); - /* It is normal that the DTX entry (to be put) in CoS has already been removed by race. 
*/ - rc = dbtree_lookup(cont->sc_dtx_cos_hdl, &kiov, &riov); if (rc == 0) { dcr = (struct dtx_cos_rec *)riov.iov_buf; - d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) { - if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { - dcrc->dcrc_piggyback_refs--; - return; + for (i = 0; i < count; i++) { + d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) { + if (memcmp(&dcrc->dcrc_dte->dte_xid, &xid[i], + sizeof(struct dtx_id)) == 0) { + if (rm) { + rc = dtx_cos_del_one(cont, dcrc); + if (rc == 0) + del++; + } else { + dcrc->dcrc_piggyback_refs--; + } + break; + } } } + + if (del > 0) + d_tm_dec_gauge(dtx_tls_get()->dt_committable, del); } } @@ -493,12 +607,12 @@ dtx_cos_add(struct ds_cont_child *cont, void *entry, daos_unit_oid_t *oid, DAOS_INTENT_UPDATE, &kiov, &riov, NULL); if (flags & DCF_COLL) - D_CDEBUG(rc != 0, DLOG_ERR, DB_IO, "Insert coll DTX "DF_DTI" to CoS cache, " + D_CDEBUG(rc != 0, DLOG_ERR, DB_TRACE, "Insert coll DTX "DF_DTI" to CoS cache, " DF_UOID", key %lu, flags %x: "DF_RC"\n", DP_DTI(&((struct dtx_coll_entry *)entry)->dce_xid), DP_UOID(*oid), (unsigned long)dkey_hash, flags, DP_RC(rc)); else - D_CDEBUG(rc != 0, DLOG_ERR, DB_IO, "Insert reg DTX "DF_DTI" to CoS cache, " + D_CDEBUG(rc != 0, DLOG_ERR, DB_TRACE, "Insert reg DTX "DF_DTI" to CoS cache, " DF_UOID", key %lu, flags %x: "DF_RC"\n", DP_DTI(&((struct dtx_entry *)entry)->dte_xid), DP_UOID(*oid), (unsigned long)dkey_hash, flags, DP_RC(rc)); @@ -530,82 +644,36 @@ dtx_cos_del(struct ds_cont_child *cont, struct dtx_id *xid, dcr = (struct dtx_cos_rec *)riov.iov_buf; d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) { - if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) != 0) - continue; - - d_list_del(&dcrc->dcrc_gl_committable); - d_list_del(&dcrc->dcrc_lo_link); - if (dcrc->dcrc_coll) { - dtx_coll_entry_put(dcrc->dcrc_dce); - cont->sc_dtx_committable_coll_count--; - } else { - dtx_entry_put(dcrc->dcrc_dte); + if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { + rc = dtx_cos_del_one(cont, dcrc); + D_GOTO(out, found = 1); } - D_FREE(dcrc); - - cont->sc_dtx_committable_count--; - dcr->dcr_prio_count--; - - D_GOTO(out, found = 1); } d_list_for_each_entry(dcrc, &dcr->dcr_reg_list, dcrc_lo_link) { - if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) != 0) - continue; - - d_list_del(&dcrc->dcrc_gl_committable); - d_list_del(&dcrc->dcrc_lo_link); - if (dcrc->dcrc_coll) { - dtx_coll_entry_put(dcrc->dcrc_dce); - cont->sc_dtx_committable_coll_count--; - } else { - dtx_entry_put(dcrc->dcrc_dte); + if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { + rc = dtx_cos_del_one(cont, dcrc); + D_GOTO(out, found = 2); } - D_FREE(dcrc); - - cont->sc_dtx_committable_count--; - dcr->dcr_reg_count--; - - D_GOTO(out, found = 2); } d_list_for_each_entry(dcrc, &dcr->dcr_expcmt_list, dcrc_lo_link) { - if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) != 0) - continue; - - d_list_del(&dcrc->dcrc_gl_committable); - d_list_del(&dcrc->dcrc_lo_link); - if (dcrc->dcrc_coll) { - dtx_coll_entry_put(dcrc->dcrc_dce); - cont->sc_dtx_committable_coll_count--; - } else { - dtx_entry_put(dcrc->dcrc_dte); + if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { + rc = dtx_cos_del_one(cont, dcrc); + D_GOTO(out, found = 3); } - D_FREE(dcrc); - - cont->sc_dtx_committable_count--; - dcr->dcr_expcmt_count--; - - D_GOTO(out, found = 3); } out: - if (found > 0) { + if (found > 0) d_tm_dec_gauge(dtx_tls_get()->dt_committable, 1); - if (dcr->dcr_reg_count == 0 && 
dcr->dcr_prio_count == 0 && - dcr->dcr_expcmt_count == 0) - rc = dbtree_delete(cont->sc_dtx_cos_hdl, BTR_PROBE_EQ, &kiov, NULL); - } - if (rc == 0 && found == 0) rc = -DER_NONEXIST; - D_CDEBUG(rc != 0 && rc != -DER_NONEXIST, DLOG_ERR, DB_IO, - "Remove DTX "DF_DTI" from CoS " - "cache, "DF_UOID", key %lu, %s shared entry: rc = "DF_RC"\n", - DP_DTI(xid), DP_UOID(*oid), (unsigned long)dkey_hash, - found == 1 ? "has" : "has not", DP_RC(rc)); + DL_CDEBUG(rc != 0 && rc != -DER_NONEXIST, DLOG_ERR, DB_TRACE, rc, + "Remove DTX from CoS cache "DF_UOID", key %lu", + DP_UOID(*oid), (unsigned long)dkey_hash); return rc == -DER_NONEXIST ? 0 : rc; } @@ -624,6 +692,12 @@ dtx_cos_oldest(struct ds_cont_child *cont) return dcrc->dcrc_epoch; } +/* + * It is inefficient to search some item on a very long list. So let's skip + * the search if the length exceeds DTX_COS_SEARCH_MAX. That is not fatal. + */ +#define DTX_COS_SEARCH_MAX 32 + void dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid, daos_unit_oid_t *oid, uint64_t dkey_hash) @@ -647,8 +721,13 @@ dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid, dcr = (struct dtx_cos_rec *)riov.iov_buf; + if (dcr->dcr_reg_count > DTX_COS_SEARCH_MAX) + goto expcmt; + d_list_for_each_entry(dcrc, &dcr->dcr_reg_list, dcrc_lo_link) { if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { + dcrc->dcrc_reg = 0; + dcrc->dcrc_prio = 1; d_list_del(&dcrc->dcrc_lo_link); d_list_add(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list); dcr->dcr_reg_count--; @@ -658,14 +737,9 @@ dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid, } } - d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) { - if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { - d_list_del(&dcrc->dcrc_lo_link); - d_list_add(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list); - - D_GOTO(out, found = true); - } - } +expcmt: + if (dcr->dcr_expcmt_count > DTX_COS_SEARCH_MAX) + goto out; d_list_for_each_entry(dcrc, &dcr->dcr_expcmt_list, dcrc_lo_link) { if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) @@ -683,3 +757,39 @@ dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid, /* It is normal that the DTX entry (for priority) in CoS has been committed by race. */ } + +void +dtx_cos_batched_del(struct ds_cont_child *cont, struct dtx_id xid[], bool rm[], uint32_t count) +{ + struct dtx_cos_rec_child *dcrc; + int del = 0; + int rc; + int i = 0; + bool found; + + while ((dcrc = d_list_pop_entry(&cont->sc_dtx_batched_list, struct dtx_cos_rec_child, + dcrc_batched_link)) != NULL) { + for (found = false; i < count && !found; i++) { + /* + * Some entries in the sc_dtx_batched_list may have been committed by + * others by race. Since the entries order in the sc_dtx_batched_list + * will not be changed, let's compare with xid[i] via one cycle scan. + */ + if (memcmp(&dcrc->dcrc_dte->dte_xid, &xid[i], sizeof(struct dtx_id)) == 0) { + found = true; + + if (rm[i]) { + rc = dtx_cos_del_one(cont, dcrc); + if (rc == 0) + del++; + } + } + } + + /* There must be one in xid array that matches current dcrc. 
*/ + D_ASSERT(found); + } + + if (del > 0) + d_tm_dec_gauge(dtx_tls_get()->dt_committable, del); +} diff --git a/src/dtx/dtx_internal.h b/src/dtx/dtx_internal.h index a42bcc1d7d6..06d0333dd77 100644 --- a/src/dtx/dtx_internal.h +++ b/src/dtx/dtx_internal.h @@ -213,6 +213,7 @@ struct dtx_pool_metrics { struct dtx_tls { struct d_tm_node_t *dt_committable; struct d_tm_node_t *dt_dtx_leader_total; + struct d_tm_node_t *dt_async_cmt_lat; uint64_t dt_agg_gen; uint32_t dt_batched_ult_cnt; }; @@ -263,6 +264,9 @@ uint64_t dtx_cos_oldest(struct ds_cont_child *cont); void dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid, daos_unit_oid_t *oid, uint64_t dkey_hash); +void dtx_cos_batched_del(struct ds_cont_child *cont, struct dtx_id xid[], bool rm[], + uint32_t count); + /* dtx_rpc.c */ int dtx_check(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch); diff --git a/src/dtx/dtx_resync.c b/src/dtx/dtx_resync.c index b98a6469954..756dfe0ca44 100644 --- a/src/dtx/dtx_resync.c +++ b/src/dtx/dtx_resync.c @@ -115,7 +115,7 @@ dtx_resync_commit(struct ds_cont_child *cont, } if (j > 0) { - rc = dtx_commit(cont, dtes, dcks, j); + rc = dtx_commit(cont, dtes, dcks, j, true); if (rc < 0) D_ERROR("Failed to commit the DTXs: rc = "DF_RC"\n", DP_RC(rc)); @@ -359,7 +359,7 @@ dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, daos_un dck.oid = oid; dck.dkey_hash = dkey_hash; - rc = dtx_coll_commit(cont, dce, &dck); + rc = dtx_coll_commit(cont, dce, &dck, true); } dtx_coll_entry_put(dce); diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c index e654047a621..6d34e871269 100644 --- a/src/dtx/dtx_rpc.c +++ b/src/dtx/dtx_rpc.c @@ -225,9 +225,10 @@ dtx_req_cb(const struct crt_cb_info *cb_info) } out: - D_DEBUG(DB_TRACE, "DTX req for opc %x (req %p future %p) got reply from %d/%d: " - "epoch :"DF_X64", result %d\n", dra->dra_opc, req, dra->dra_future, - drr->drr_rank, drr->drr_tag, din != NULL ? din->di_epoch : 0, rc); + DL_CDEBUG(rc < 0 && rc != -DER_NONEXIST, DLOG_ERR, DB_TRACE, rc, + "DTX req for opc %x (req %p future %p) got reply from %d/%d: " + "epoch :"DF_X64, dra->dra_opc, req, dra->dra_future, + drr->drr_rank, drr->drr_tag, din != NULL ? din->di_epoch : 0); drr->drr_comp = 1; drr->drr_result = rc; @@ -397,19 +398,14 @@ dtx_req_list_send(struct dtx_common_args *dca, bool is_reentrance) if (unlikely(dra->dra_opc == DTX_COMMIT && dca->dca_i == 0 && DAOS_FAIL_CHECK(DAOS_DTX_FAIL_COMMIT))) - rc = dtx_req_send(dca->dca_drr, 1); + dtx_req_send(dca->dca_drr, 1); else - rc = dtx_req_send(dca->dca_drr, dca->dca_epoch); - if (rc != 0) { - /* If the first sub-RPC failed, then break, otherwise - * other remote replicas may have already received the - * RPC and executed it, so have to go ahead. - */ - if (dca->dca_i == 0) { - ABT_future_free(&dra->dra_future); - return rc; - } - } + dtx_req_send(dca->dca_drr, dca->dca_epoch); + /* + * Do not care about the dtx_req_send() result; either it or its completion callback + * will set dra->dra_future. Each RPC is independent from the others, so let's go + * ahead to handle the other RPCs and set dra->dra_future, which avoids blocking the + * RPC sponsor - dtx_req_wait. + */ /* dca->dca_drr may not point to a real entry if all RPCs have been sent.
*/ dca->dca_drr = d_list_entry(dca->dca_drr->drr_link.next, @@ -616,12 +612,8 @@ dtx_rpc_helper(struct dss_chore *chore, bool is_reentrance) rc = dtx_req_list_send(dca, is_reentrance); if (rc == DSS_CHORE_YIELD) return DSS_CHORE_YIELD; - if (rc == DSS_CHORE_DONE) - rc = 0; - if (rc != 0) - dca->dca_dra.dra_result = rc; - D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE, "%p: DTX RPC chore for %u done: %d\n", chore, - dca->dca_dra.dra_opc, rc); + D_ASSERTF(rc == DSS_CHORE_DONE, "Unexpected helper return value for RPC %u: %d\n", + dca->dca_dra.dra_opc, rc); if (dca->dca_chore_eventual != ABT_EVENTUAL_NULL) { rc = ABT_eventual_set(dca->dca_chore_eventual, NULL, 0); D_ASSERTF(rc == ABT_SUCCESS, "ABT_eventual_set: %d\n", rc); @@ -737,8 +729,10 @@ dtx_rpc(struct ds_cont_child *cont,d_list_t *dti_list, struct dtx_entry **dtes, } rc = dss_chore_delegate(&dca->dca_chore, dtx_rpc_helper); - if (rc != 0) + if (rc != 0) { + ABT_eventual_free(&dca->dca_chore_eventual); goto out; + } rc = ABT_eventual_wait(dca->dca_chore_eventual, NULL); D_ASSERTF(rc == ABT_SUCCESS, "ABT_eventual_wait: %d\n", rc); @@ -809,7 +803,7 @@ dtx_rpc(struct ds_cont_child *cont,d_list_t *dti_list, struct dtx_entry **dtes, */ int dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, - struct dtx_cos_key *dcks, int count) + struct dtx_cos_key *dcks, int count, bool has_cos) { struct dtx_common_args dca; struct dtx_req_args *dra = &dca.dca_dra; @@ -842,7 +836,7 @@ dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, rc1 = vos_dtx_set_flags(cont->sc_hdl, dca.dca_dtis, count, DTE_PARTIAL_COMMITTED); } else { - if (dcks != NULL) { + if (has_cos) { if (count > 1) { D_ALLOC_ARRAY(rm_cos, count); if (rm_cos == NULL) @@ -862,12 +856,16 @@ dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, } if (rc1 == 0 && rm_cos != NULL) { - for (i = 0; i < count; i++) { - if (rm_cos[i]) { - D_ASSERT(!daos_oid_is_null(dcks[i].oid.id_pub)); - dtx_cos_del(cont, &dca.dca_dtis[i], &dcks[i].oid, - dcks[i].dkey_hash); + if (dcks != NULL) { + for (i = 0; i < count; i++) { + if (rm_cos[i]) { + D_ASSERT(!daos_oid_is_null(dcks[i].oid.id_pub)); + dtx_cos_del(cont, &dca.dca_dtis[i], &dcks[i].oid, + dcks[i].dkey_hash); + } } + } else { + dtx_cos_batched_del(cont, dca.dca_dtis, rm_cos, count); } } @@ -976,8 +974,11 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che if (dsp->dsp_mbs == NULL) { rc = vos_dtx_load_mbs(cont->sc_hdl, &dsp->dsp_xid, NULL, &dsp->dsp_mbs); if (rc != 0) { - if (rc < 0 && rc != -DER_NONEXIST && for_io) + if (rc < 0 && rc != -DER_NONEXIST && for_io) { + D_ERROR("Failed to load mbs for "DF_DTI": "DF_RC"\n", + DP_DTI(&dsp->dsp_xid), DP_RC(rc)); goto out; + } drop = true; goto next; @@ -1237,7 +1238,7 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che case DTX_ST_COMMITTABLE: dck.oid = dsp->dsp_oid; dck.dkey_hash = dsp->dsp_dkey_hash; - rc = dtx_commit(cont, &pdte, &dck, 1); + rc = dtx_commit(cont, &pdte, &dck, 1, true); if (rc < 0 && rc != -DER_NONEXIST && for_io) d_list_add_tail(&dsp->dsp_link, cmt_list); else @@ -1258,7 +1259,7 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che case DSHR_NEED_COMMIT: { dck.oid = dsp->dsp_oid; dck.dkey_hash = dsp->dsp_dkey_hash; - rc = dtx_commit(cont, &pdte, &dck, 1); + rc = dtx_commit(cont, &pdte, &dck, 1, true); if (rc < 0 && rc != -DER_NONEXIST && for_io) d_list_add_tail(&dsp->dsp_link, cmt_list); else @@ -1571,17 +1572,20 @@ dtx_coll_rpc_post(struct dtx_coll_rpc_args *dcra, int ret) { int rc; 
- rc = ABT_future_wait(dcra->dcra_future); - D_CDEBUG(rc != ABT_SUCCESS, DLOG_ERR, DB_TRACE, - "Collective DTX wait req for opc %u, future %p done, rc %d, result %d\n", - dcra->dcra_opc, dcra->dcra_future, rc, dcra->dcra_result); - ABT_future_free(&dcra->dcra_future); + if (dcra->dcra_future != ABT_FUTURE_NULL) { + rc = ABT_future_wait(dcra->dcra_future); + D_CDEBUG(rc != ABT_SUCCESS, DLOG_ERR, DB_TRACE, + "Collective DTX wait req for opc %u, future %p done, rc %d, result %d\n", + dcra->dcra_opc, dcra->dcra_future, rc, dcra->dcra_result); + ABT_future_free(&dcra->dcra_future); + } return ret != 0 ? ret : dcra->dcra_result; } int -dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck) +dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck, + bool has_cos) { struct dtx_coll_rpc_args dcra = { 0 }; int *results = NULL; @@ -1591,11 +1595,22 @@ dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct d int rc1 = 0; int rc2 = 0; int i; + bool cos = true; if (dce->dce_ranks != NULL) rc = dtx_coll_rpc_prep(cont, dce, DTX_COLL_COMMIT, 0, &dcra); + /* + * NOTE: Before committing the DTX on remote participants, we cannot remove the active + * DTX locally; otherwise, the locally committed DTX entry may be removed via DTX + * aggregation before the remote participants have committed. In that case, if some + * remote DTX participant triggers DTX_REFRESH for such DTX during the interval, + * then it will get -DER_TX_UNCERTAIN, which may cause the related application to + * fail. So here, we let the remote participants commit first; if that fails, we + * will ask the leader to retry the commit until all participants have committed. + */ if (dce->dce_bitmap != NULL) { + clrbit(dce->dce_bitmap, dss_get_module_info()->dmi_tgt_id); len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, 0, DTX_COLL_COMMIT, dce->dce_bitmap_sz, dce->dce_bitmap, &results); @@ -1634,12 +1649,17 @@ dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct d * to remove the collective DTX entry from the CoS even if the commit failed remotely. * Otherwise, the batched commit ULT may be blocked by such "bad" entry. */ - if (rc2 == 0 && dck != NULL) - dtx_cos_del(cont, &dce->dce_xid, &dck->oid, dck->dkey_hash); + if (rc2 == 0 && has_cos) { + if (dck != NULL) + dtx_cos_del(cont, &dce->dce_xid, &dck->oid, dck->dkey_hash); + else + dtx_cos_batched_del(cont, &dce->dce_xid, &cos, 1); + } D_CDEBUG(rc != 0 || rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE, - "Collectively commit DTX "DF_DTI": %d/%d/%d\n", - DP_DTI(&dce->dce_xid), rc, rc1, rc2); + "Collectively commit DTX "DF_DTI" in "DF_UUID"/"DF_UUID": %d/%d/%d\n", + DP_DTI(&dce->dce_xid), DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid), + rc, rc1, rc2); return rc != 0 ? rc : rc1 != 0 ? rc1 : rc2; } @@ -1658,7 +1678,17 @@ dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc if (dce->dce_ranks != NULL) rc = dtx_coll_rpc_prep(cont, dce, DTX_COLL_ABORT, epoch, &dcra); + /* + * NOTE: The DTX abort may be triggered by dtx_leader_end() because of a timeout on + * some DTX participant(s). In that case, the client side RPC sponsor may also + * hit the RPC timeout and resend the related RPC to the leader.
Here, to avoid the DTX + * abort and the forwarding of the resent RPC being executed in parallel, we abort + * the local DTX only after the remote abort is done; before that, the server side + * logic for handling the resent RPC will find the locally pinned DTX entry and + * notify the related client to resend sometime later. + */ if (dce->dce_bitmap != NULL) { + clrbit(dce->dce_bitmap, dss_get_module_info()->dmi_tgt_id); len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, epoch, DTX_COLL_ABORT, dce->dce_bitmap_sz, dce->dce_bitmap, &results); @@ -1688,8 +1718,9 @@ dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc rc2 = 0; D_CDEBUG(rc != 0 || rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE, - "Collectively abort DTX "DF_DTI": %d/%d/%d\n", - DP_DTI(&dce->dce_xid), rc, rc1, rc2); + "Collectively abort DTX "DF_DTI" with epoch "DF_X64" in " + DF_UUID"/"DF_UUID": %d/%d/%d\n", DP_DTI(&dce->dce_xid), epoch, + DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid), rc, rc1, rc2); return rc != 0 ? rc : rc1 != 0 ? rc1 : rc2; } @@ -1737,8 +1768,9 @@ dtx_coll_check(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc } D_CDEBUG((rc < 0 && rc != -DER_NONEXIST) || (rc1 < 0 && rc1 != -DER_NONEXIST), DLOG_ERR, - DB_TRACE, "Collectively check DTX "DF_DTI": %d/%d/\n", - DP_DTI(&dce->dce_xid), rc, rc1); + DB_TRACE, "Collectively check DTX "DF_DTI" in "DF_UUID"/"DF_UUID": %d/%d/\n", - DP_DTI(&dce->dce_xid), DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid), + rc, rc1); return dce->dce_ranks != NULL ? rc : rc1; } diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c index 93cc6744a2d..084f804ec81 100644 --- a/src/dtx/dtx_srv.c +++ b/src/dtx/dtx_srv.c @@ -34,8 +34,8 @@ dtx_tls_init(int tags, int xs_id, int tgt_id) tls->dt_agg_gen = 1; rc = d_tm_add_metric(&tls->dt_committable, D_TM_STATS_GAUGE, - "total number of committable DTX entries", - "entries", "io/dtx/committable/tgt_%u", tgt_id); + "total number of committable DTX entries", "entry", + "io/dtx/committable/tgt_%u", tgt_id); if (rc != DER_SUCCESS) D_WARN("Failed to create DTX committable metric: " DF_RC"\n", DP_RC(rc)); @@ -48,6 +48,13 @@ D_WARN("Failed to create DTX leader metric: " DF_RC"\n", DP_RC(rc)); + rc = d_tm_add_metric(&tls->dt_async_cmt_lat, D_TM_STATS_GAUGE, + "DTX async commit latency", "ms", + "io/dtx/async_cmt_lat/tgt_%u", tgt_id); + if (rc != DER_SUCCESS) + D_WARN("Failed to create DTX async commit latency metric: " DF_RC"\n", + DP_RC(rc)); + return tls; } @@ -117,7 +124,7 @@ dtx_metrics_alloc(const char *path, int tgt_id) rc = d_tm_add_metric(&metrics->dpm_total[DTX_PROTO_SRV_RPC_COUNT], D_TM_COUNTER, "total number of processed sync DTX_COMMIT", "ops", - "%s/ops/sync_dtx_commit/tgt_%u", path, tgt_id); + "%s/ops/dtx_sync_commit/tgt_%u", path, tgt_id); if (rc != DER_SUCCESS) D_WARN("Failed to create sync DTX_COMMIT RPC cnt metric: "DF_RC"\n", DP_RC(rc)); @@ -330,7 +337,7 @@ dtx_handler(crt_rpc_t *rpc) dout->do_status = rc; /* For DTX_COMMIT, it is the count of real committed DTX entries.
*/ dout->do_misc = committed; - rc = crt_reply_send(rpc); + rc = crt_reply_send_input_free(rpc); if (rc != 0) D_ERROR("send reply failed for DTX rpc %u: rc = "DF_RC"\n", opc, DP_RC(rc)); @@ -467,8 +474,9 @@ dtx_coll_handler(crt_rpc_t *rpc) out: D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE, - "Handled collective DTX PRC %u on rank %u for "DF_DTI": "DF_RC"\n", - opc, myrank, DP_DTI(&dci->dci_xid), DP_RC(rc)); + "Handled collective DTX RPC %u on rank %u for "DF_DTI" in " + DF_UUID"/"DF_UUID": "DF_RC"\n", opc, myrank, DP_DTI(&dci->dci_xid), + DP_UUID(dci->dci_po_uuid), DP_UUID(dci->dci_co_uuid), DP_RC(rc)); dco->dco_status = rc; rc = crt_reply_send(rpc); diff --git a/src/engine/ult.c b/src/engine/ult.c index 94a2a0f9390..fbeb3f538fa 100644 --- a/src/engine/ult.c +++ b/src/engine/ult.c @@ -458,7 +458,7 @@ sched_ult2xs(int xs_type, int tgt_id) break; case DSS_XS_OFFLOAD: if (dss_numa_nr > 1) - xs_id = sched_ult2xs_multisocket(xs_type, tgt_id); + return sched_ult2xs_multisocket(xs_type, tgt_id); if (!dss_helper_pool) { if (dss_tgt_offload_xs_nr > 0) xs_id = DSS_MAIN_XS_ID(tgt_id) + dss_tgt_offload_xs_nr / dss_tgt_nr; diff --git a/src/gurt/telemetry.c b/src/gurt/telemetry.c index 8f08192d072..be7762184a9 100644 --- a/src/gurt/telemetry.c +++ b/src/gurt/telemetry.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -624,6 +625,7 @@ alloc_node(struct d_tm_shmem_hdr *shmem, struct d_tm_node_t **newnode, goto out; tmp->dtn_metric = NULL; tmp->dtn_sibling = NULL; + atomic_store_relaxed(&tmp->dtn_readable, false); *newnode = node; out: @@ -2409,6 +2411,9 @@ add_metric(struct d_tm_context *ctx, struct d_tm_node_t **node, int metric_type, pthread_mutexattr_destroy(&mattr); temp->dtn_protect = true; } + + atomic_store_relaxed(&temp->dtn_readable, true); + if (node != NULL) *node = temp; @@ -3090,6 +3095,15 @@ d_tm_try_del_ephemeral_dir(const char *fmt, ...) return rc; } +static bool +node_is_readable(struct d_tm_node_t *node) +{ + if (node == NULL) + return false; + + return atomic_load_relaxed(&node->dtn_readable); +} + /** * Creates histogram counters for the given node. It calculates the * extents of each bucket and creates counters at the path specified that @@ -3278,6 +3292,9 @@ d_tm_get_num_buckets(struct d_tm_context *ctx, if (ctx == NULL || histogram == NULL || node == NULL) return -DER_INVAL; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + rc = validate_node_ptr(ctx, node, &shmem); if (rc != 0) return rc; @@ -3341,6 +3358,9 @@ d_tm_get_bucket_range(struct d_tm_context *ctx, struct d_tm_bucket_t *bucket, if (rc != 0) return rc; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + if (!has_stats(node)) return -DER_OP_NOT_PERMITTED; @@ -3392,6 +3412,9 @@ d_tm_get_counter(struct d_tm_context *ctx, uint64_t *val, if (node->dtn_type != D_TM_COUNTER) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + /* "ctx == NULL" is server side fast version to read the counter.
*/ if (ctx == NULL) { metric_data = node->dtn_metric; @@ -3441,6 +3464,9 @@ d_tm_get_timestamp(struct d_tm_context *ctx, time_t *val, if (node->dtn_type != D_TM_TIMESTAMP) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data != NULL) { d_tm_node_lock(node); @@ -3470,6 +3496,9 @@ d_tm_get_meminfo(struct d_tm_context *ctx, struct d_tm_meminfo_t *meminfo, if (node->dtn_type != D_TM_MEMINFO) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data != NULL) { d_tm_node_lock(node); @@ -3513,6 +3542,9 @@ d_tm_get_timer_snapshot(struct d_tm_context *ctx, struct timespec *tms, if (!(node->dtn_type & D_TM_TIMER_SNAPSHOT)) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data != NULL) { d_tm_node_lock(node); @@ -3563,6 +3595,9 @@ d_tm_get_duration(struct d_tm_context *ctx, struct timespec *tms, if (!(node->dtn_type & D_TM_DURATION)) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data == NULL) return -DER_METRIC_NOT_FOUND; @@ -3628,6 +3663,9 @@ d_tm_get_gauge(struct d_tm_context *ctx, uint64_t *val, if (!is_gauge(node)) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data != NULL) { dtm_stats = conv_ptr(shmem, metric_data->dtm_stats); @@ -3700,6 +3738,9 @@ int d_tm_get_metadata(struct d_tm_context *ctx, char **desc, char **units, if (node->dtn_type == D_TM_DIRECTORY) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data != NULL) { d_tm_node_lock(node); diff --git a/src/include/cart/api.h b/src/include/cart/api.h index 21941d7cb98..16fc2091d4b 100644 --- a/src/include/cart/api.h +++ b/src/include/cart/api.h @@ -474,6 +474,21 @@ crt_req_send(crt_rpc_t *req, crt_cb_t complete_cb, void *arg); int crt_reply_send(crt_rpc_t *req); +/** + * Send an RPC reply and free the input buffer immediately. + * Only to be called on the server side. + * + * \param[in] req pointer to RPC request + * + * \return DER_SUCCESS on success, negative value if error + * + * \note the crt_rpc_t is exported to user, caller should fill the + * crt_rpc_t::cr_output before sending the RPC reply. + * See \ref crt_req_create. + */ +int +crt_reply_send_input_free(crt_rpc_t *req); + /** * Return request buffer * diff --git a/src/include/daos/container.h b/src/include/daos/container.h index 282dfb8ec49..57ef06f63a2 100644 --- a/src/include/daos/container.h +++ b/src/include/daos/container.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -130,7 +130,7 @@ dc_cont_open_flags_valid(uint64_t flags) f = flags; /* One and only one of DAOS_COO_RO, DAOS_COO_RW, and DAOS_COO_EX. 
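Regarding the DAOS_COO_IO_BASE_MASK change just below: the validity check reduces to masking out the three base-mode bits and requiring the result to equal exactly one of them. A tiny stand-alone illustration with made-up flag values (the real bit definitions live in daos_cont.h):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define COO_RO (1U << 0)
    #define COO_RW (1U << 1)
    #define COO_EX (1U << 2)
    #define COO_IO_BASE_MASK (COO_RO | COO_RW | COO_EX)

    static bool open_mode_valid(uint64_t flags)
    {
        uint64_t m = flags & COO_IO_BASE_MASK;

        // Exactly one of RO/RW/EX must be set.
        return m == COO_RO || m == COO_RW || m == COO_EX;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               open_mode_valid(COO_RO),          // 1
               open_mode_valid(COO_RO | COO_RW), // 0
               open_mode_valid(0));              // 0
        return 0;
    }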
*/ - m = f & (DAOS_COO_RO | DAOS_COO_RW | DAOS_COO_EX); + m = f & DAOS_COO_IO_BASE_MASK; if (m != DAOS_COO_RO && m != DAOS_COO_RW && m != DAOS_COO_EX) return false; diff --git a/src/include/daos_cont.h b/src/include/daos_cont.h index c3bda9837d8..632eafe8130 100644 --- a/src/include/daos_cont.h +++ b/src/include/daos_cont.h @@ -79,6 +79,9 @@ extern "C" { /** Mask for all of the bits in the container open mode flag, DAOS_COO_ bits */ #define DAOS_COO_MASK ((1U << DAOS_COO_NBITS) - 1) +/** The basic IO mode: read-only, read-write or exclusively read-write. */ +#define DAOS_COO_IO_BASE_MASK (DAOS_COO_RO | DAOS_COO_RW | DAOS_COO_EX) + /** Maximum length for container hints */ #define DAOS_CONT_HINT_MAX_LEN 128 diff --git a/src/include/daos_srv/container.h b/src/include/daos_srv/container.h index 793d585c07a..9fc615c2a8b 100644 --- a/src/include/daos_srv/container.h +++ b/src/include/daos_srv/container.h @@ -108,6 +108,9 @@ struct ds_cont_child { uint32_t sc_dtx_committable_count; uint32_t sc_dtx_committable_coll_count; + /* Last timestamp when EC aggregation reports -DER_INPROGRESS. */ + uint64_t sc_ec_agg_busy_ts; + /* The global minimum EC aggregation epoch, which will be upper * limit for VOS aggregation, i.e. EC object VOS aggregation can * not cross this limit. For simplification purpose, all objects @@ -134,6 +137,8 @@ struct ds_cont_child { d_list_t sc_dtx_cos_list; /* The global list for committable collective DTXs. */ d_list_t sc_dtx_coll_list; + /* The list for current DTX batched commit. */ + d_list_t sc_dtx_batched_list; /* the pool map version of updating DAOS_PROP_CO_STATUS prop */ uint32_t sc_status_pm_ver; }; diff --git a/src/include/daos_srv/dtx_srv.h b/src/include/daos_srv/dtx_srv.h index 7b3f7e9ee57..7c60d2deaa0 100644 --- a/src/include/daos_srv/dtx_srv.h +++ b/src/include/daos_srv/dtx_srv.h @@ -318,8 +318,8 @@ int dtx_cos_get_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid, uint64_t dkey_hash, int max, struct dtx_id **dtis); void -dtx_cos_put_piggyback(struct ds_cont_child *cont, struct dtx_id *xid, - daos_unit_oid_t *oid, uint64_t dkey_hash); +dtx_cos_put_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid, uint64_t dkey_hash, + struct dtx_id xid[], uint32_t count, bool rm); int dtx_leader_exec_ops(struct dtx_leader_handle *dlh, dtx_sub_func_t func, dtx_agg_cb_t agg_cb, int allow_failure, void *func_arg); @@ -338,14 +338,15 @@ int dtx_obj_sync(struct ds_cont_child *cont, daos_unit_oid_t *oid, daos_epoch_t epoch); int dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, - struct dtx_cos_key *dcks, int count); + struct dtx_cos_key *dcks, int count, bool has_cos); int dtx_abort(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch); int dtx_refresh(struct dtx_handle *dth, struct ds_cont_child *cont); int -dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck); +dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck, + bool has_cos); int dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoch_t epoch); diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index 6cbe3873f0a..de22d55ed5d 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -84,6 +84,7 @@ struct ds_pool { uuid_t sp_srv_pool_hdl; uint32_t sp_stopping:1, sp_cr_checked:1, + sp_immutable:1, sp_fetch_hdls:1, sp_need_discard:1, sp_disable_rebuild:1; @@ -277,9 +278,9 @@ int ds_pool_tgt_finish_rebuild(uuid_t pool_uuid, struct 
pool_target_id_list *lis int ds_pool_tgt_map_update(struct ds_pool *pool, struct pool_buf *buf, unsigned int map_version); -bool ds_pool_skip_for_check(struct ds_pool *pool); -int ds_pool_start_after_check(uuid_t uuid); -int ds_pool_start(uuid_t uuid, bool aft_chk); +bool ds_pool_restricted(struct ds_pool *pool, bool immutable); +int ds_pool_start_after_check(uuid_t uuid, bool immutable); +int ds_pool_start(uuid_t uuid, bool aft_chk, bool immutable); int ds_pool_stop(uuid_t uuid); int dsc_pool_svc_extend(uuid_t pool_uuid, d_rank_list_t *svc_ranks, uint64_t deadline, int ntargets, const d_rank_list_t *rank_list, int ndomains, const uint32_t *domains); diff --git a/src/include/daos_srv/security.h b/src/include/daos_srv/security.h index 203b9b548ee..28fe83852b3 100644 --- a/src/include/daos_srv/security.h +++ b/src/include/daos_srv/security.h @@ -1,5 +1,5 @@ /* - * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -304,6 +304,19 @@ ds_sec_cont_can_open_ex(uint64_t cont_capas); bool ds_sec_cont_can_evict_all(uint64_t cont_capas); +/** + * Determine if the container can be modified based on the container security + * capabilities. + * + * \param[in] cont_capas Capability bits acquired via + * ds_sec_cont_get_capabilities + * + * \return True Access allowed + * False Access denied + */ +bool +ds_sec_cont_can_modify(uint64_t cont_capas); + /** * Get the security capabilities for a rebuild container handle created by the * DAOS server. diff --git a/src/include/gurt/telemetry_common.h b/src/include/gurt/telemetry_common.h index 4e7d3b3a02d..2068c1c0cce 100644 --- a/src/include/gurt/telemetry_common.h +++ b/src/include/gurt/telemetry_common.h @@ -8,6 +8,8 @@ #include +#include + #define D_TM_VERSION 1 #define D_TM_MAX_NAME_LEN 256 #define D_TM_MAX_DESC_LEN 128 @@ -236,6 +238,7 @@ struct d_tm_node_t { pthread_mutex_t dtn_lock; /** individual mutex */ struct d_tm_metric_t *dtn_metric; /** values */ bool dtn_protect; /** synchronized access */ + _Atomic bool dtn_readable; /** fully initialized and ready for reads */ }; struct d_tm_nodeList_t { diff --git a/src/mgmt/srv_target.c b/src/mgmt/srv_target.c index e1d6ee79bce..7975a2115d4 100644 --- a/src/mgmt/srv_target.c +++ b/src/mgmt/srv_target.c @@ -1212,7 +1212,7 @@ ds_mgmt_hdlr_tgt_create(crt_rpc_t *tc_req) tc_out->tc_ranks.ca_arrays = rank; tc_out->tc_ranks.ca_count = 1; - rc = ds_pool_start(tc_in->tc_pool_uuid, false); + rc = ds_pool_start(tc_in->tc_pool_uuid, false, false); if (rc) { D_ERROR(DF_UUID": failed to start pool: "DF_RC"\n", DP_UUID(tc_in->tc_pool_uuid), DP_RC(rc)); diff --git a/src/object/cli_coll.c b/src/object/cli_coll.c index e05abadf3cf..d517e3269d6 100644 --- a/src/object/cli_coll.c +++ b/src/object/cli_coll.c @@ -178,6 +178,10 @@ obj_coll_oper_args_init(struct coll_oper_args *coa, struct dc_object *obj, bool if (coa->coa_sparse == 0) coa->coa_dct_cap = obj_ranks; } + + /* Save obj->cob_min_rank for verification during subsequent obj_coll_prep_one(). 
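The coa_min_rank snapshot mentioned here is a capture-then-assert consistency check: record a property of the layout under the object lock at init time, then assert later that the live value still matches, so an unexpected layout change fails loudly instead of silently corrupting the target array. A minimal sketch of the pattern with a plain pthread rwlock (illustrative types, not the DAOS structures; compile with -pthread):

    #include <assert.h>
    #include <pthread.h>
    #include <stdint.h>

    struct object {
        pthread_rwlock_t lock;
        uint32_t         min_rank; // may change if the layout is refreshed
    };

    struct op_args {
        uint32_t saved_min_rank; // snapshot taken at op init
    };

    static void op_init(struct op_args *a, struct object *o)
    {
        pthread_rwlock_rdlock(&o->lock);
        a->saved_min_rank = o->min_rank; // snapshot under the lock
        pthread_rwlock_unlock(&o->lock);
    }

    static void op_prep_one(struct op_args *a, struct object *o)
    {
        pthread_rwlock_rdlock(&o->lock);
        // The layout must not have changed between init and prep.
        assert(a->saved_min_rank == o->min_rank);
        pthread_rwlock_unlock(&o->lock);
    }

    int main(void)
    {
        struct object o = { .min_rank = 3 };
        struct op_args a;

        pthread_rwlock_init(&o.lock, NULL);
        op_init(&a, &o);
        op_prep_one(&a, &o); // passes while the layout is stable
        return 0;
    }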
*/ + coa->coa_min_rank = obj->cob_min_rank; + D_RWLOCK_UNLOCK(&obj->cob_lock); if (coa->coa_sparse) { @@ -208,7 +212,6 @@ obj_coll_oper_args_init(struct coll_oper_args *coa, struct dc_object *obj, bool coa->coa_dct_nr = -1; } - coa->coa_max_dct_sz = 0; coa->coa_max_shard_nr = 0; coa->coa_max_bitmap_sz = 0; coa->coa_target_nr = 0; @@ -236,17 +239,55 @@ obj_coll_oper_args_fini(struct coll_oper_args *coa) coa->coa_dct_nr = 0; } +static void +obj_coll_collapse_one(struct coll_oper_args *coa, struct daos_coll_target *dct, + uint32_t *size, bool copy) +{ + struct daos_coll_shard *dcs; + uint32_t dct_size; + int i; + + /* The size may be over estimated, no matter. */ + dct_size = sizeof(*dct) + dct->dct_bitmap_sz + + sizeof(dct->dct_shards[0]) * (dct->dct_max_shard + 1); + + for (i = 0; i <= dct->dct_max_shard; i++) { + dcs = &dct->dct_shards[i]; + if (dcs->dcs_nr > 1) + dct_size += sizeof(dcs->dcs_buf[0]) * dcs->dcs_nr; + } + + if (coa->coa_for_modify) + dct_size += sizeof(dct->dct_tgt_ids[0]) * dct->dct_tgt_nr; + + if (coa->coa_max_dct_sz < dct_size) + coa->coa_max_dct_sz = dct_size; + + if (copy) + memcpy(&coa->coa_dcts[coa->coa_dct_nr], dct, sizeof(*dct)); + + coa->coa_dct_nr++; + *size += dct_size; +} + +struct obj_coll_tree_args { + struct coll_oper_args *coa; + uint32_t *size; +}; + static int obj_coll_tree_cb(daos_handle_t ih, d_iov_t *key, d_iov_t *val, void *arg) { - struct coll_oper_args *coa = arg; - struct daos_coll_target *dct = val->iov_buf; + struct obj_coll_tree_args *octa = arg; + struct coll_oper_args *coa = octa->coa; + struct daos_coll_target *dct = val->iov_buf; D_ASSERTF(coa->coa_dct_nr < coa->coa_dct_cap, "Too short pre-allcoated dct_array: %u vs %u\n", coa->coa_dct_nr, coa->coa_dct_cap); + D_ASSERT(dct->dct_bitmap != NULL); - memcpy(&coa->coa_dcts[coa->coa_dct_nr++], dct, sizeof(*dct)); + obj_coll_collapse_one(coa, dct, octa->size, true); /* The following members have been migrated into coa->coa_dcts. */ dct->dct_bitmap = NULL; @@ -259,6 +300,7 @@ obj_coll_tree_cb(daos_handle_t ih, d_iov_t *key, d_iov_t *val, void *arg) static int obj_coll_collapse_tree(struct coll_oper_args *coa, uint32_t *size) { + struct obj_coll_tree_args octa; struct coll_sparse_targets *tree = coa->coa_tree; int rc = 0; @@ -270,7 +312,14 @@ obj_coll_collapse_tree(struct coll_oper_args *coa, uint32_t *size) D_GOTO(out, rc = -DER_NOMEM); coa->coa_sparse = 0; - rc = dbtree_iterate(tree->cst_tree_hdl, DAOS_INTENT_DEFAULT, false, obj_coll_tree_cb, coa); + coa->coa_raw_sparse = 1; + coa->coa_dct_nr = 0; + coa->coa_max_dct_sz = 0; + + octa.coa = coa; + octa.size = size; + rc = dbtree_iterate(tree->cst_tree_hdl, DAOS_INTENT_DEFAULT, false, + obj_coll_tree_cb, &octa); if (rc == 0) D_ASSERTF(coa->coa_dct_nr == coa->coa_dct_cap, "Something is wrong when prepare coll target array: %u vs %u\n", @@ -287,36 +336,13 @@ static int obj_coll_collapse_array(struct coll_oper_args *coa, uint32_t *size) { struct daos_coll_target *dct; - struct daos_coll_shard *dcs; - uint32_t dct_size; int i; - int j; - for (i = 0, *size = 0, coa->coa_dct_nr = 0; i < coa->coa_dct_cap; i++) { + for (i = 0, *size = 0, coa->coa_dct_nr = 0, coa->coa_max_dct_sz = 0; + i < coa->coa_dct_cap; i++) { dct = &coa->coa_dcts[i]; - if (dct->dct_bitmap != NULL) { - /* The size may be over estimated, no matter. 
*/ - dct_size = sizeof(*dct) + dct->dct_bitmap_sz + - sizeof(dct->dct_shards[0]) * (dct->dct_max_shard + 1); - - for (j = 0; j <= dct->dct_max_shard; j++) { - dcs = &dct->dct_shards[j]; - if (dcs->dcs_nr > 1) - dct_size += sizeof(dcs->dcs_buf[0]) * dcs->dcs_nr; - } - - if (coa->coa_for_modify) - dct_size += sizeof(dct->dct_tgt_ids[0]) * dct->dct_tgt_nr; - - if (coa->coa_max_dct_sz < dct_size) - coa->coa_max_dct_sz = dct_size; - - if (coa->coa_dct_nr < i) - memcpy(&coa->coa_dcts[coa->coa_dct_nr], dct, sizeof(*dct)); - - coa->coa_dct_nr++; - *size += dct_size; - } + if (dct->dct_bitmap != NULL) + obj_coll_collapse_one(coa, dct, size, coa->coa_dct_nr < i); } /* Reset the other dct slots to avoid double free during cleanup. */ @@ -373,8 +399,9 @@ obj_coll_prep_one(struct coll_oper_args *coa, struct dc_object *obj, D_RWLOCK_RDLOCK(&obj->cob_lock); - D_ASSERTF(shard->do_target_rank <= obj->cob_max_rank, - "Unexpected shard with rank %u > %u\n", shard->do_target_rank, obj->cob_max_rank); + D_ASSERTF(coa->coa_min_rank == obj->cob_min_rank, + "Object "DF_OID" layout has been changed unexpectedly %u => %u, idx %u, ver %u\n", + DP_OID(obj->cob_md.omd_id), coa->coa_min_rank, obj->cob_min_rank, idx, map_ver); D_ASSERTF(shard->do_target_rank >= obj->cob_min_rank, "Unexpected shard with rank %u < %u\n", shard->do_target_rank, obj->cob_min_rank); @@ -669,7 +696,6 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo uint32_t tgt_size = 0; uint32_t mbs_max_size; uint32_t inline_size; - uint32_t flags = ORF_LEADER; uint32_t leader = -1; uint32_t len; int rc; @@ -746,6 +772,12 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo memcpy(dct, &tmp_tgt, sizeof(tmp_tgt)); } + /* 'shard' is on the leader target, which must be coa->coa_dcts[0]. */ + D_ASSERTF(shard->do_target_rank == coa->coa_dcts[0].dct_rank, + "Object "DF_OID" target array corrupted: rank %u vs %u, nr %u\n", + DP_OID(obj->cob_md.omd_id), shard->do_target_rank, + coa->coa_dcts[0].dct_rank, coa->coa_dct_nr); + rc = dc_obj_coll_punch_mbs(coa, obj, shard->do_target_id, &mbs); if (rc < 0) goto out; @@ -767,12 +799,14 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo if (rc != 0) goto out; + auxi->flags = ORF_LEADER; if (auxi->io_retry) { - flags |= ORF_RESEND; + auxi->flags |= ORF_RESEND; /* Reset @enqueue_id if resend to new leader.
*/ if (spa->pa_auxi.target != shard->do_target_id) spa->pa_auxi.enqueue_id = 0; } else { + auxi->flags &= ~ORF_RESEND; spa->pa_auxi.obj_auxi = auxi; daos_dti_gen(&spa->pa_dti, false); } @@ -781,14 +815,15 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo spa->pa_auxi.shard = shard->do_shard_idx; if (obj_is_ec(obj)) - flags |= ORF_EC; + auxi->flags |= ORF_EC; mbs_max_size = sizeof(*mbs) + mbs->dm_data_size + sizeof(coa->coa_targets[0]) * coa->coa_max_shard_nr + coa->coa_max_bitmap_sz; return dc_obj_shard_coll_punch(shard, spa, mbs, mbs_max_size, cpca.cpca_bulks, tgt_size, coa->coa_dcts, coa->coa_dct_nr, coa->coa_max_dct_sz, epoch, - args->flags, flags, map_ver, &auxi->map_ver_reply, task); + args->flags, auxi->flags, map_ver, + &auxi->map_ver_reply, task); out: if (rc > 0) @@ -838,7 +873,7 @@ queue_coll_query_task(tse_task_t *api_task, struct obj_auxi_args *obj_auxi, stru 0, 0, ocdc); for (i = 0; i < ocdc->grp_nr; i++) { - obj_coll_disp_dest(ocdc, coa->coa_dcts, &tgt_ep); + obj_coll_disp_dest(ocdc, coa->coa_dcts, &tgt_ep, obj->cob_md.omd_id); tmp = coa->coa_dcts[ocdc->cur_pos].dct_shards[tgt_ep.ep_tag].dcs_idx; rc = queue_shard_query_key_task(api_task, obj_auxi, epoch, tmp, map_ver, diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index 318ba0cb51e..46161bf7a2d 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -1718,15 +1718,20 @@ dc_obj_retry_delay(tse_task_t *task, int err, uint16_t *retry_cnt, uint16_t *inp uint32_t timeout_sec) { uint32_t delay = 0; + uint32_t limit = 4; /* - * Randomly delay 5 - 68 us if it is not the first retry for + * Randomly delay 5 ~ 1028 us if it is not the first retry for * -DER_INPROGRESS || -DER_UPDATE_AGAIN cases. */ ++(*retry_cnt); if (err == -DER_INPROGRESS || err == -DER_UPDATE_AGAIN) { if (++(*inprogress_cnt) > 1) { - delay = (d_rand() & ((1 << 6) - 1)) + 5; + limit += *inprogress_cnt; + if (limit > 10) + limit = 10; + + delay = (d_rand() & ((1 << limit) - 1)) + 5; /* Rebuild is being established on the server side, wait a bit longer */ if (err == -DER_UPDATE_AGAIN) delay <<= 10; @@ -4856,11 +4861,14 @@ obj_comp_cb(tse_task_t *task, void *data) D_ASSERT(daos_handle_is_inval(obj_auxi->th)); D_ASSERT(obj_is_modification_opc(obj_auxi->opc)); - if (task->dt_result == -DER_TX_ID_REUSED && obj_auxi->retry_cnt != 0) - /* XXX: it is must because miss to set "RESEND" flag, that is bug. 
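On the dc_obj_retry_delay() change above: the random backoff window now grows with the number of -DER_INPROGRESS retries, from 4 + inprogress_cnt mask bits up to a cap of 10 bits, i.e. delays between 5 us and (1 << 10) - 1 + 5 = 1028 us. A stand-alone sketch of the same arithmetic, with rand() standing in for d_rand():

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    // Mirrors the capped, growing random backoff in dc_obj_retry_delay().
    static uint32_t retry_delay_us(uint16_t inprogress_cnt)
    {
        uint32_t limit = 4;

        if (inprogress_cnt <= 1)
            return 0; // no delay on the first retry

        limit += inprogress_cnt;
        if (limit > 10)
            limit = 10;

        // Random value in [5, (1 << limit) - 1 + 5].
        return ((uint32_t)rand() & ((1U << limit) - 1)) + 5;
    }

    int main(void)
    {
        for (uint16_t c = 2; c <= 8; c++)
            printf("retry %u -> %u us\n", c, retry_delay_us(c));
        return 0;
    }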
*/ - D_ASSERTF(0, - "Miss 'RESEND' flag (%x) when resend the RPC for task %p: %u\n", - obj_auxi->flags, task, obj_auxi->retry_cnt); + if (task->dt_result == -DER_TX_ID_REUSED && obj_auxi->retry_cnt != 0) { + D_ERROR("TX ID may be reused for unknown reason, " + "task %p, opc %u, flags %x, retry_cnt %u\n", + task, obj_auxi->opc, obj_auxi->flags, obj_auxi->retry_cnt); + task->dt_result = -DER_IO; + obj_auxi->io_retry = 0; + goto args_fini; + } if (obj_auxi->opc == DAOS_OBJ_RPC_UPDATE) { daos_obj_rw_t *api_args = dc_task_get_args(obj_auxi->obj_task); @@ -4886,6 +4894,7 @@ obj_comp_cb(tse_task_t *task, void *data) } } +args_fini: if (obj_auxi->opc == DAOS_OBJ_RPC_COLL_PUNCH) obj_coll_oper_args_fini(&obj_auxi->p_args.pa_coa); diff --git a/src/object/cli_shard.c b/src/object/cli_shard.c index 36c5c5f1e0c..9f084140f80 100644 --- a/src/object/cli_shard.c +++ b/src/object/cli_shard.c @@ -1451,11 +1451,15 @@ obj_shard_coll_punch_cb(tse_task_t *task, void *data) shard_args->pa_auxi.obj_auxi->max_delay = timeout; } - DL_CDEBUG(task->dt_result < 0, DLOG_ERR, DB_IO, task->dt_result, - "DAOS_OBJ_RPC_COLL_PUNCH RPC %p for "DF_UOID" with DTX " - DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x", rpc, DP_UOID(ocpi->ocpi_oid), - DP_DTI(&ocpi->ocpi_xid), task, ocpi->ocpi_map_ver, *cb_args->cpca_ver, - (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags); + DL_CDEBUG(task->dt_result < 0 && task->dt_result != -DER_INPROGRESS, + DLOG_ERR, DB_IO, task->dt_result, + "DAOS_OBJ_RPC_COLL_PUNCH RPC %p for "DF_UOID" in "DF_UUID"/"DF_UUID"/" + DF_UUID" with DTX "DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x, %s layout", + rpc, DP_UOID(ocpi->ocpi_oid), DP_UUID(ocpi->ocpi_po_uuid), + DP_UUID(ocpi->ocpi_co_hdl), DP_UUID(ocpi->ocpi_co_uuid), DP_DTI(&ocpi->ocpi_xid), + task, ocpi->ocpi_map_ver, *cb_args->cpca_ver, + (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags, + cb_args->cpca_shard_args->pa_coa.coa_raw_sparse ? "sparse" : "continuous"); crt_req_decref(rpc); diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h index ae5c9c82fd1..c0df21dd009 100644 --- a/src/object/obj_internal.h +++ b/src/object/obj_internal.h @@ -293,10 +293,16 @@ struct coll_oper_args { struct shard_auxi_args coa_auxi; int coa_dct_nr; uint32_t coa_dct_cap; - uint32_t coa_max_dct_sz; + union { + /* Save obj->cob_min_rank for verification during obj_coll_prep_one(). */ + uint32_t coa_min_rank; + /* Can only be used after obj_coll_oper_args_collapse(), once the object layout scan is done.
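About the union introduced above: it overlays two 32-bit fields with disjoint lifetimes, coa_min_rank during the layout scan and coa_max_dct_sz after the collapse step overwrites it. A minimal stand-alone illustration of such a phase-split union (illustrative names, not the DAOS structures):

    #include <stdint.h>
    #include <stdio.h>

    struct coll_args {
        union {
            uint32_t min_rank;   // valid during the layout scan
            uint32_t max_dct_sz; // valid only after collapse() overwrites it
        };
    };

    static void scan(struct coll_args *a)     { a->min_rank = 7; }
    static void collapse(struct coll_args *a) { a->max_dct_sz = 4096; } // ends min_rank's lifetime

    int main(void)
    {
        struct coll_args a;

        scan(&a);
        printf("scan phase: min_rank=%u\n", a.min_rank);
        collapse(&a);
        printf("post-collapse: max_dct_sz=%u\n", a.max_dct_sz);
        return 0;
    }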
*/ + uint32_t coa_max_dct_sz; + }; uint8_t coa_max_shard_nr; uint8_t coa_max_bitmap_sz; uint8_t coa_for_modify:1, + coa_raw_sparse:1, coa_sparse:1; uint8_t coa_target_nr; /* @@ -1094,7 +1100,7 @@ int daos_obj_query_merge(struct obj_query_merge_args *oqma); void obj_coll_disp_init(uint32_t tgt_nr, uint32_t max_tgt_size, uint32_t inline_size, uint32_t start, uint32_t max_width, struct obj_coll_disp_cursor *ocdc); void obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *tgts, - crt_endpoint_t *tgt_ep); + crt_endpoint_t *tgt_ep, daos_obj_id_t oid); void obj_coll_disp_move(struct obj_coll_disp_cursor *ocdc); int obj_utils_init(void); void obj_utils_fini(void); diff --git a/src/object/obj_utils.c b/src/object/obj_utils.c index 7bf0ef4aaf9..c01947a05a1 100644 --- a/src/object/obj_utils.c +++ b/src/object/obj_utils.c @@ -616,24 +616,23 @@ obj_coll_disp_init(uint32_t tgt_nr, uint32_t max_tgt_size, uint32_t inline_size, void obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *tgts, - crt_endpoint_t *tgt_ep) + crt_endpoint_t *tgt_ep, daos_obj_id_t oid) { struct daos_coll_target *dct = &tgts[ocdc->cur_pos]; struct daos_coll_target tmp; - unsigned long rand = 0; uint32_t size; int pos; int i; if (ocdc->cur_step > 2) { - rand = d_rand(); /* - * Randomly choose an engine as the relay one for load balance. - * If the one corresponding to "pos" is former moved one, then - * use the "cur_pos" as the relay engine. + * Choose an engine (according to the given oid) as the relay one for load balance. + * If the one corresponding to "pos" is a formerly moved one, then use the "cur_pos" as + * the relay engine. Then, even if the related RPC is resent without a pool map change, + * the relay engine will be the same as in the original case. */ - pos = rand % (ocdc->tgt_nr - ocdc->cur_pos) + ocdc->cur_pos; - if (pos != ocdc->cur_pos && tgts[pos].dct_rank > dct->dct_rank) { + pos = oid.lo % (ocdc->tgt_nr - ocdc->cur_pos) + ocdc->cur_pos; + if (pos > ocdc->cur_pos && tgts[pos].dct_rank > dct->dct_rank) { memcpy(&tmp, &tgts[pos], sizeof(tmp)); memcpy(&tgts[pos], dct, sizeof(tmp)); memcpy(dct, &tmp, sizeof(tmp)); @@ -642,8 +641,8 @@ obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *t size = dct->dct_bitmap_sz << 3; - /* Randomly choose a XS as the local leader on target engine for load balance. */ - for (i = 0, pos = (rand != 0 ? rand : d_rand()) % dct->dct_tgt_nr; i < size; i++) { + /* Choose a target as the local agent on the engine for load balance. */ + for (i = 0, pos = oid.lo % dct->dct_tgt_nr; i < size; i++) { if (isset(dct->dct_bitmap, i)) { pos -= dct->dct_shards[i].dcs_nr; if (pos < 0) diff --git a/src/object/srv_coll.c b/src/object/srv_coll.c index 2a152b47bd6..5b59f954f86 100644 --- a/src/object/srv_coll.c +++ b/src/object/srv_coll.c @@ -239,9 +239,15 @@ obj_coll_punch_prep(struct obj_coll_punch_in *ocpi, struct daos_coll_target *dct int i; int j; - /* dcts[0] is for current engine. */ - if (dcts[0].dct_bitmap == NULL || dcts[0].dct_bitmap_sz == 0 || - dcts[0].dct_shards == NULL) { + /* dcts[0] must be for current engine.
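The obj_coll_disp_dest() rework above swaps d_rand() for oid.lo modulo arithmetic, making relay and local-agent selection a pure function of the object ID: a resent RPC with the same oid and an unchanged pool map deterministically lands on the same relay engine, while different objects still spread the load. A small sketch of the selection rule:

    #include <stdint.h>
    #include <stdio.h>

    // Deterministic pick: the same (id, n) always yields the same index,
    // unlike a random pick, while different ids still spread the load.
    static uint32_t pick_relay(uint64_t id_lo, uint32_t n_candidates)
    {
        return (uint32_t)(id_lo % n_candidates);
    }

    int main(void)
    {
        uint64_t oid_lo = 0x12345678abcdULL;

        // A resend with the same oid picks the same relay.
        printf("first:  %u\n", pick_relay(oid_lo, 7));
        printf("resend: %u\n", pick_relay(oid_lo, 7));
        printf("other:  %u\n", pick_relay(oid_lo + 1, 7));
        return 0;
    }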
*/ + if (unlikely(dcts[0].dct_rank != dss_self_rank())) { + D_ERROR("Invalid targets array: rank %u vs %u, nr %u, flags %x\n", + dcts[0].dct_rank, dss_self_rank(), dct_nr, ocpi->ocpi_flags); + D_GOTO(out, rc = -DER_INVAL); + } + + if (unlikely(dcts[0].dct_bitmap == NULL || dcts[0].dct_bitmap_sz == 0 || + dcts[0].dct_shards == NULL)) { D_ERROR("Invalid input for current engine: bitmap %s, bitmap_sz %u, shards %s\n", dcts[0].dct_bitmap == NULL ? "empty" : "non-empty", dcts[0].dct_bitmap_sz, dcts[0].dct_shards == NULL ? "empty" : "non-empty"); diff --git a/src/object/srv_ec_aggregate.c b/src/object/srv_ec_aggregate.c index 71c630fa947..50b513d1612 100644 --- a/src/object/srv_ec_aggregate.c +++ b/src/object/srv_ec_aggregate.c @@ -2667,8 +2667,13 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr, struct ec_agg_param *ec_agg_param = agg_param->ap_data; vos_iter_param_t iter_param = { 0 }; struct vos_iter_anchors anchors = { 0 }; + struct dtx_handle *dth = NULL; + struct dtx_share_peer *dsp; + struct dtx_id dti = { 0 }; + struct dtx_epoch epoch = { 0 }; + daos_unit_oid_t oid = { 0 }; + int blocks = 0; int rc = 0; - int blocks = 0; /* * Avoid calling into vos_aggregate() when aborting aggregation @@ -2715,8 +2720,32 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr, agg_reset_entry(&ec_agg_param->ap_agg_entry, NULL, NULL); retry: + epoch.oe_value = epr->epr_hi; + rc = dtx_begin(cont->sc_hdl, &dti, &epoch, 0, cont->sc_pool->spc_map_version, &oid, + NULL, 0, 0, NULL, &dth); + if (rc != 0) + goto update_hae; + rc = vos_iterate(&iter_param, VOS_ITER_OBJ, true, &anchors, agg_iterate_pre_cb, - agg_iterate_post_cb, ec_agg_param, NULL); + agg_iterate_post_cb, ec_agg_param, dth); + if (rc == -DER_INPROGRESS && !d_list_empty(&dth->dth_share_tbd_list)) { + uint64_t now = daos_gettime_coarse(); + + /* Report warnings at most once per 10 seconds to avoid flooding the log.
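sc_ec_agg_busy_ts, used just below, is a coarse per-container log throttle: the warning burst is emitted only when more than 10 seconds have elapsed since the previous one, then the timestamp advances. A generic stand-alone sketch, with time() standing in for daos_gettime_coarse():

    #include <stdio.h>
    #include <time.h>

    struct throttle {
        time_t last_ts; // last time a warning burst was emitted
    };

    // Emit at most one warning burst per 'interval' seconds.
    static void warn_throttled(struct throttle *t, time_t interval, const char *msg)
    {
        time_t now = time(NULL);

        if (now - t->last_ts <= interval)
            return; // suppressed to avoid flooding the log
        fprintf(stderr, "WARN: %s\n", msg);
        t->last_ts = now;
    }

    int main(void)
    {
        struct throttle t = { 0 };

        warn_throttled(&t, 10, "EC aggregate hit non-committed DTX"); // printed
        warn_throttled(&t, 10, "EC aggregate hit non-committed DTX"); // suppressed
        return 0;
    }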
*/ + if (now - cont->sc_ec_agg_busy_ts > 10) { + while ((dsp = d_list_pop_entry(&dth->dth_share_tbd_list, + struct dtx_share_peer, dsp_link)) != NULL) { + D_WARN(DF_CONT ": EC aggregate hit non-committed DTX " DF_DTI "\n", + DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), + DP_DTI(&dsp->dsp_xid)); + dtx_dsp_free(dsp); + } + + cont->sc_ec_agg_busy_ts = now; + } + } + + dtx_end(dth, cont, rc); /* Post_cb may not being executed in some cases */ agg_clear_extents(&ec_agg_param->ap_agg_entry); diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index a51682b4785..4762e04b898 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -162,7 +162,7 @@ obj_rw_complete(crt_rpc_t *rpc, struct obj_io_context *ioc, } static void -obj_rw_reply(crt_rpc_t *rpc, int status, uint64_t epoch, +obj_rw_reply(crt_rpc_t *rpc, int status, uint64_t epoch, bool release_input, struct obj_io_context *ioc) { struct obj_rw_out *orwo = crt_reply_get(rpc); @@ -187,7 +187,11 @@ obj_rw_reply(crt_rpc_t *rpc, int status, uint64_t epoch, ioc->ioc_map_ver, orwo->orw_epoch, status); if (!ioc->ioc_lost_reply) { - rc = crt_reply_send(rpc); + if (release_input) + rc = crt_reply_send_input_free(rpc); + else + rc = crt_reply_send(rpc); + if (rc != 0) D_ERROR("send reply failed: "DF_RC"\n", DP_RC(rc)); } else { @@ -2079,6 +2083,8 @@ obj_ioc_init(uuid_t pool_uuid, uuid_t coh_uuid, uuid_t cont_uuid, crt_rpc_t *rpc D_ASSERT(ioc != NULL); memset(ioc, 0, sizeof(*ioc)); + + crt_req_addref(rpc); ioc->ioc_rpc = rpc; ioc->ioc_opc = opc_get(rpc->cr_opc); rc = ds_cont_find_hdl(pool_uuid, coh_uuid, &coh); @@ -2154,6 +2160,10 @@ obj_ioc_fini(struct obj_io_context *ioc, int err) ds_cont_child_put(ioc->ioc_coc); ioc->ioc_coc = NULL; } + if (ioc->ioc_rpc) { + crt_req_decref(ioc->ioc_rpc); + ioc->ioc_rpc = NULL; + } } /* Setup lite IO context, it is only for compound RPC so far: @@ -2508,7 +2518,7 @@ ds_obj_ec_rep_handler(crt_rpc_t *rpc) rc = vos_obj_array_remove(ioc.ioc_coc->sc_hdl, oer->er_oid, &oer->er_epoch_range, dkey, &iod->iod_name, &recx); out: - obj_rw_reply(rpc, rc, 0, &ioc); + obj_rw_reply(rpc, rc, 0, false, &ioc); obj_ioc_end(&ioc, rc); } @@ -2604,7 +2614,7 @@ ds_obj_ec_agg_handler(crt_rpc_t *rpc) D_ERROR(DF_UOID ": array_remove failed: " DF_RC "\n", DP_UOID(oea->ea_oid), DP_RC(rc1)); out: - obj_rw_reply(rpc, rc, 0, &ioc); + obj_rw_reply(rpc, rc, 0, false, &ioc); obj_ioc_end(&ioc, rc); } @@ -2733,7 +2743,7 @@ ds_obj_tgt_update_handler(crt_rpc_t *rpc) out: if (dth != NULL) rc = dtx_end(dth, ioc.ioc_coc, rc); - obj_rw_reply(rpc, rc, 0, &ioc); + obj_rw_reply(rpc, rc, 0, true, &ioc); D_FREE(mbs); obj_ioc_end(&ioc, rc); } @@ -2943,8 +2953,11 @@ ds_obj_rw_handler(crt_rpc_t *rpc) d_tm_inc_counter(opm->opm_update_resent, 1); -again1: - e = 0; +again: + if (flags & ORF_RESEND) + e = orw->orw_epoch; + else + e = 0; rc = dtx_handle_resend(ioc.ioc_vos_coh, &orw->orw_dti, &e, &version); switch (rc) { @@ -2955,8 +2968,13 @@ ds_obj_rw_handler(crt_rpc_t *rpc) orw->orw_epoch = e; /* TODO: Also recover the epoch uncertainty. 
*/ break; + case -DER_MISMATCH: + rc = vos_dtx_abort(ioc.ioc_vos_coh, &orw->orw_dti, e); + if (rc < 0 && rc != -DER_NONEXIST) + D_GOTO(out, rc); + /* Fall through */ case -DER_NONEXIST: - rc = 0; + flags = 0; break; default: D_GOTO(out, rc); @@ -2966,7 +2984,6 @@ ds_obj_rw_handler(crt_rpc_t *rpc) D_GOTO(out, rc); } -again2: /* For leader case, we need to find out the potential conflict * (or share the same non-committed object/dkey) DTX(s) in the * CoS (committable) cache, piggyback them via the dispdatched @@ -3011,7 +3028,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc) exec_arg.rpc = rpc; exec_arg.ioc = &ioc; - exec_arg.flags = flags; + exec_arg.flags |= flags; exec_arg.start = orw->orw_start_shard; /* Execute the operation on all targets */ @@ -3026,28 +3043,25 @@ ds_obj_rw_handler(crt_rpc_t *rpc) case -DER_TX_RESTART: /* * If this is a standalone operation, we can restart the - * internal transaction right here. Otherwise, we have to defer - * the restart to the RPC client. + * internal transaction right here. Otherwise we have to + * defer the restart to the RPC sponsor. */ - if (opc == DAOS_OBJ_RPC_UPDATE) { - /* - * Only standalone updates use this RPC. Retry with - * newer epoch. - */ - orw->orw_epoch = d_hlc_get(); - orw->orw_flags &= ~ORF_RESEND; - flags = 0; - d_tm_inc_counter(opm->opm_update_restart, 1); - goto again2; - } + if (opc != DAOS_OBJ_RPC_UPDATE) + break; - break; + /* Only standalone updates use this RPC. Retry with newer epoch. */ + orw->orw_epoch = d_hlc_get(); + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; + d_tm_inc_counter(opm->opm_update_restart, 1); + goto again; case -DER_AGAIN: - orw->orw_flags |= ORF_RESEND; need_abort = true; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; d_tm_inc_counter(opm->opm_update_retry, 1); ABT_thread_yield(); - goto again1; + goto again; default: break; } @@ -3073,7 +3087,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc) if (ioc.ioc_map_ver < max_ver) ioc.ioc_map_ver = max_ver; - obj_rw_reply(rpc, rc, epoch.oe_value, &ioc); + obj_rw_reply(rpc, rc, epoch.oe_value, false, &ioc); D_FREE(mbs); D_FREE(dti_cos); obj_ioc_end(&ioc, rc); @@ -3865,8 +3879,11 @@ ds_obj_punch_handler(crt_rpc_t *rpc) if (opi->opi_flags & ORF_RESEND) { daos_epoch_t e; -again1: - e = 0; +again: + if (flags & ORF_RESEND) + e = opi->opi_epoch; + else + e = 0; rc = dtx_handle_resend(ioc.ioc_vos_coh, &opi->opi_dti, &e, &version); switch (rc) { @@ -3877,8 +3894,13 @@ ds_obj_punch_handler(crt_rpc_t *rpc) flags |= ORF_RESEND; /* TODO: Also recovery the epoch uncertainty. */ break; + case -DER_MISMATCH: + rc = vos_dtx_abort(ioc.ioc_vos_coh, &opi->opi_dti, e); + if (rc < 0 && rc != -DER_NONEXIST) + D_GOTO(out, rc); + /* Fall through */ case -DER_NONEXIST: - rc = 0; + flags = 0; break; default: D_GOTO(out, rc); @@ -3888,7 +3910,6 @@ ds_obj_punch_handler(crt_rpc_t *rpc) goto cleanup; } -again2: /* For leader case, we need to find out the potential conflict * (or share the same non-committed object/dkey) DTX(s) in the * CoS (committable) cache, piggyback them via the dispdatched @@ -3933,7 +3954,7 @@ ds_obj_punch_handler(crt_rpc_t *rpc) exec_arg.rpc = rpc; exec_arg.ioc = &ioc; - exec_arg.flags = flags; + exec_arg.flags |= flags; /* Execute the operation on all shards */ if (opi->opi_api_flags & DAOS_COND_PUNCH) @@ -3949,19 +3970,17 @@ ds_obj_punch_handler(crt_rpc_t *rpc) rc = dtx_leader_end(dlh, ioc.ioc_coh, rc); switch (rc) { case -DER_TX_RESTART: - /* - * Only standalone punches use this RPC. Retry with newer - * epoch. - */ + /* Only standalone punches use this RPC. 
Retry with newer epoch. */ opi->opi_epoch = d_hlc_get(); - opi->opi_flags &= ~ORF_RESEND; - flags = 0; - goto again2; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; + goto again; case -DER_AGAIN: - opi->opi_flags |= ORF_RESEND; need_abort = true; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; ABT_thread_yield(); - goto again1; + goto again; default: break; } @@ -5605,12 +5624,12 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) D_DEBUG(DB_IO, "(%s) handling collective punch RPC %p for obj " DF_UOID" on XS %u/%u epc "DF_X64" pmv %u, with dti " - DF_DTI", forward width %u, forward depth %u\n", + DF_DTI", forward width %u, forward depth %u, flags %x\n", (ocpi->ocpi_flags & ORF_LEADER) ? "leader" : (ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"), rpc, DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, ocpi->ocpi_epoch, ocpi->ocpi_map_ver, DP_DTI(&ocpi->ocpi_xid), - ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth); + ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth, ocpi->ocpi_flags); D_ASSERT(dmi->dmi_xs_id != 0); @@ -5653,8 +5672,11 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) if (ocpi->ocpi_flags & ORF_RESEND) { -again1: - tmp = 0; +again: + if (!(ocpi->ocpi_flags & ORF_LEADER) || (flags & ORF_RESEND)) + tmp = ocpi->ocpi_epoch; + else + tmp = 0; rc = dtx_handle_resend(ioc.ioc_vos_coh, &ocpi->ocpi_xid, &tmp, &version); switch (rc) { case -DER_ALREADY: @@ -5664,7 +5686,13 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) flags |= ORF_RESEND; /* TODO: Also recovery the epoch uncertainty. */ break; + case -DER_MISMATCH: + rc = vos_dtx_abort(ioc.ioc_vos_coh, &ocpi->ocpi_xid, tmp); + if (rc < 0 && rc != -DER_NONEXIST) + D_GOTO(out, rc); + /* Fall through */ case -DER_NONEXIST: + flags = 0; break; default: D_GOTO(out, rc); @@ -5673,7 +5701,6 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) dce->dce_ver = version; } -again2: epoch.oe_value = ocpi->ocpi_epoch; epoch.oe_first = epoch.oe_value; epoch.oe_flags = orf_to_dtx_epoch_flags(ocpi->ocpi_flags); @@ -5685,7 +5712,7 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) exec_arg.rpc = rpc; exec_arg.ioc = &ioc; - exec_arg.flags = flags; + exec_arg.flags |= flags; exec_arg.coll_shards = dcts[0].dct_shards; exec_arg.coll_tgts = dcts; obj_coll_disp_init(dct_nr, ocpi->ocpi_max_tgt_sz, @@ -5718,14 +5745,15 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) switch (rc) { case -DER_TX_RESTART: ocpi->ocpi_epoch = d_hlc_get(); - ocpi->ocpi_flags &= ~ORF_RESEND; - flags = 0; - goto again2; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; + goto again; case -DER_AGAIN: - ocpi->ocpi_flags |= ORF_RESEND; need_abort = true; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; ABT_thread_yield(); - goto again1; + goto again; default: break; } @@ -5745,15 +5773,17 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) max_ver = version; DL_CDEBUG(rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART, DLOG_ERR, DB_IO, rc, - "(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u epc " - DF_X64" pmv %u/%u, with dti "DF_DTI", bulk_tgt_sz %u, bulk_tgt_nr %u, " - "tgt_nr %u, forward width %u, forward depth %u", + "(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u in "DF_UUID"/" + DF_UUID"/"DF_UUID" with epc "DF_X64", pmv %u/%u, dti "DF_DTI", bulk_tgt_sz %u, " + "bulk_tgt_nr %u, tgt_nr %u, forward width %u, forward depth %u, flags %x", (ocpi->ocpi_flags & ORF_LEADER) ? "leader" : (ocpi->ocpi_tgts.ca_count == 1 ? 
"non-leader" : "relay-engine"), rpc, - DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, ocpi->ocpi_epoch, + DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, + DP_UUID(ocpi->ocpi_po_uuid), DP_UUID(ocpi->ocpi_co_hdl), + DP_UUID(ocpi->ocpi_co_uuid), ocpi->ocpi_epoch, ocpi->ocpi_map_ver, max_ver, DP_DTI(&ocpi->ocpi_xid), ocpi->ocpi_bulk_tgt_sz, ocpi->ocpi_bulk_tgt_nr, (unsigned int)ocpi->ocpi_tgts.ca_count, - ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth); + ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth, ocpi->ocpi_flags); obj_punch_complete(rpc, rc, max_ver); diff --git a/src/object/srv_obj_remote.c b/src/object/srv_obj_remote.c index ce06723621b..f64d851e5b4 100644 --- a/src/object/srv_obj_remote.c +++ b/src/object/srv_obj_remote.c @@ -136,7 +136,7 @@ ds_obj_remote_update(struct dtx_leader_handle *dlh, void *data, int idx, *orw = *orw_parent; orw->orw_oid.id_shard = shard_tgt->st_shard_id; - orw->orw_flags |= ORF_BULK_BIND | obj_exec_arg->flags; + orw->orw_flags |= (ORF_BULK_BIND | obj_exec_arg->flags) & ~ORF_LEADER; if (shard_tgt->st_flags & DTF_DELAY_FORWARD && dlh->dlh_drop_cond) orw->orw_api_flags &= ~DAOS_COND_MASK; orw->orw_dti_cos.ca_count = dth->dth_dti_cos_count; @@ -247,7 +247,7 @@ ds_obj_remote_punch(struct dtx_leader_handle *dlh, void *data, int idx, *opi = *opi_parent; opi->opi_oid.id_shard = shard_tgt->st_shard_id; - opi->opi_flags |= obj_exec_arg->flags; + opi->opi_flags |= obj_exec_arg->flags & ~ORF_LEADER; if (shard_tgt->st_flags & DTF_DELAY_FORWARD && dlh->dlh_drop_cond) opi->opi_api_flags &= ~DAOS_COND_PUNCH; opi->opi_dti_cos.ca_count = dth->dth_dti_cos_count; @@ -495,7 +495,7 @@ ds_obj_coll_punch_remote(struct dtx_leader_handle *dlh, void *data, int idx, crt_endpoint_t tgt_ep = { 0 }; crt_rpc_t *parent_req = exec_arg->rpc; crt_rpc_t *req; - struct obj_coll_punch_in *ocpi_parent; + struct obj_coll_punch_in *ocpi_parent = crt_req_get(parent_req); struct obj_coll_punch_in *ocpi; int tag; int rc = 0; @@ -509,7 +509,7 @@ ds_obj_coll_punch_remote(struct dtx_leader_handle *dlh, void *data, int idx, if (remote_arg == NULL) D_GOTO(out, rc = -DER_NOMEM); - obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep); + obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep, ocpi_parent->ocpi_oid.id_pub); tag = tgt_ep.ep_tag; crt_req_addref(parent_req); @@ -524,9 +524,7 @@ ds_obj_coll_punch_remote(struct dtx_leader_handle *dlh, void *data, int idx, D_GOTO(out, rc); } - ocpi_parent = crt_req_get(parent_req); ocpi = crt_req_get(req); - ocpi->ocpi_odm = ocpi_parent->ocpi_odm; uuid_copy(ocpi->ocpi_po_uuid, ocpi_parent->ocpi_po_uuid); uuid_copy(ocpi->ocpi_co_hdl, ocpi_parent->ocpi_co_hdl); @@ -634,7 +632,7 @@ ds_obj_coll_query_remote(struct dtx_leader_handle *dlh, void *data, int idx, if (remote_arg == NULL) D_GOTO(out, rc = -DER_NOMEM); - obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep); + obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep, ocqi_parent->ocqi_oid.id_pub); tag = tgt_ep.ep_tag; remote_arg->dlh = dlh; diff --git a/src/pool/srv_internal.h b/src/pool/srv_internal.h index 8f864c8c11a..612f7760fd1 100644 --- a/src/pool/srv_internal.h +++ b/src/pool/srv_internal.h @@ -57,6 +57,12 @@ pool_tls_get() return tls; } +static inline bool +ds_pool_skip_for_check(struct ds_pool *pool) +{ + return engine_in_check() && !pool->sp_cr_checked; +} + struct pool_iv_map { d_rank_t piv_master_rank; uint32_t piv_pool_map_ver; diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c index 667e4bc6ed6..6e3a01379fa 100644 --- a/src/pool/srv_pool.c +++ b/src/pool/srv_pool.c @@ 
-62,7 +62,6 @@ ds_pool_get_vos_df_version(uint32_t pool_global_version) /* Pool service crt event */ struct pool_svc_event { - d_list_t psv_link; d_rank_t psv_rank; uint64_t psv_incarnation; enum crt_event_source psv_src; @@ -72,15 +71,40 @@ struct pool_svc_event { #define DF_PS_EVENT "rank=%u inc="DF_U64" src=%d type=%d" #define DP_PS_EVENT(e) e->psv_rank, e->psv_incarnation, e->psv_src, e->psv_type -#define RECHOOSE_SLEEP_MS 250 +/* + * Pool service crt event set + * + * This stores an unordered array of pool_svc_event objects. For all different + * i and j, we have pss_buf[i].psv_rank != pss_buf[j].psv_rank. + * + * An event set facilitates the merging of a sequence of events. For instance, + * sequence (in the format <rank, type>) + * <3, D>, <5, D>, <1, D>, <5, A>, <1, A>, <1, D> + * will merge into set + * <3, D>, <5, A>, <1, D> + * (that is, during the merge, an event overrides a previous event of the same + * rank in the set). + */ +struct pool_svc_event_set { + struct pool_svc_event *pss_buf; + uint32_t pss_len; + uint32_t pss_cap; +}; + +#define DF_PS_EVENT_SET "len=%u" +#define DP_PS_EVENT_SET(s) s->pss_len /* Pool service crt-event-handling state */ struct pool_svc_events { - ABT_mutex pse_mutex; - ABT_cond pse_cv; - d_list_t pse_queue; - ABT_thread pse_handler; - bool pse_stop; + ABT_mutex pse_mutex; + ABT_cond pse_cv; + struct pool_svc_event_set *pse_pending; + uint64_t pse_timeout; /* s */ + uint64_t pse_time; /* s */ + struct sched_request *pse_timer; + ABT_thread pse_handler; + bool pse_stop; + bool pse_paused; }; /* Pool service schedule state */ @@ -1162,6 +1186,15 @@ pool_svc_locate_cb(d_iov_t *id, char **path) return 0; } +static unsigned int +get_crt_event_delay(void) +{ + unsigned int t = 10 /* s */; + + d_getenv_uint("CRT_EVENT_DELAY", &t); + return t; +} + static int pool_svc_alloc_cb(d_iov_t *id, struct ds_rsvc **rsvc) { @@ -1182,7 +1215,7 @@ pool_svc_alloc_cb(d_iov_t *id, struct ds_rsvc **rsvc) d_iov_set(&svc->ps_rsvc.s_id, svc->ps_uuid, sizeof(uuid_t)); uuid_copy(svc->ps_uuid, id->iov_buf); - D_INIT_LIST_HEAD(&svc->ps_events.pse_queue); + svc->ps_events.pse_timeout = get_crt_event_delay(); svc->ps_events.pse_handler = ABT_THREAD_NULL; svc->ps_svc_rf = -1; svc->ps_force_notify = false; @@ -1300,98 +1333,221 @@ ds_pool_enable_exclude(void) pool_disable_exclude = false; } +static int +alloc_event_set(struct pool_svc_event_set **event_set) +{ + D_ALLOC_PTR(*event_set); + if (*event_set == NULL) + return -DER_NOMEM; + return 0; +} + +static void +free_event_set(struct pool_svc_event_set **event_set) +{ + D_FREE((*event_set)->pss_buf); + D_FREE(*event_set); +} + +static int +add_to_event_set(struct pool_svc_event_set *event_set, d_rank_t rank, uint64_t incarnation, + enum crt_event_source src, enum crt_event_type type) +{ + int i; + + /* Find rank in event_set. */ + for (i = 0; i < event_set->pss_len; i++) + if (event_set->pss_buf[i].psv_rank == rank) + break; + + /* If not found, prepare to add a new event.
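add_to_event_set(), continued below, gives the last-writer-wins-per-rank semantics described in the struct comment above, backed by a doubling array. A compact stand-alone model that reproduces the documented example (D = dead, A = alive):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct ev { uint32_t rank; char type; };

    struct ev_set { struct ev *buf; uint32_t len, cap; };

    // Insert or override: at most one event per rank (the last one wins).
    static int ev_set_add(struct ev_set *s, uint32_t rank, char type)
    {
        uint32_t i;

        for (i = 0; i < s->len; i++)
            if (s->buf[i].rank == rank)
                break;
        if (i == s->len) {
            if (s->len == s->cap) {
                uint32_t   cap = s->cap == 0 ? 1 : 2 * s->cap;
                struct ev *buf = realloc(s->buf, cap * sizeof(*buf));

                if (buf == NULL)
                    return -1;
                s->buf = buf;
                s->cap = cap;
            }
            s->len++;
        }
        s->buf[i] = (struct ev){ rank, type };
        return 0;
    }

    int main(void)
    {
        struct ev_set s = { 0 };
        struct ev seq[] = { {3,'D'}, {5,'D'}, {1,'D'}, {5,'A'}, {1,'A'}, {1,'D'} };

        for (unsigned i = 0; i < sizeof(seq) / sizeof(seq[0]); i++)
            ev_set_add(&s, seq[i].rank, seq[i].type);
        for (unsigned i = 0; i < s.len; i++)
            printf("<%u, %c> ", s.buf[i].rank, s.buf[i].type); // <3, D> <5, A> <1, D>
        printf("\n");
        free(s.buf);
        return 0;
    }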
*/ + if (i == event_set->pss_len) { + if (event_set->pss_len == event_set->pss_cap) { + uint32_t cap; + struct pool_svc_event *buf; + + if (event_set->pss_cap == 0) + cap = 1; + else + cap = 2 * event_set->pss_cap; + D_REALLOC_ARRAY(buf, event_set->pss_buf, event_set->pss_cap, cap); + if (buf == NULL) + return -DER_NOMEM; + event_set->pss_buf = buf; + event_set->pss_cap = cap; + } + event_set->pss_len++; + } + + event_set->pss_buf[i].psv_rank = rank; + event_set->pss_buf[i].psv_incarnation = incarnation; + event_set->pss_buf[i].psv_src = src; + event_set->pss_buf[i].psv_type = type; + return 0; +} + +/* Merge next into prev. */ +static int +merge_event_sets(struct pool_svc_event_set *prev, struct pool_svc_event_set *next) +{ + int i; + + for (i = 0; i < next->pss_len; i++) { + struct pool_svc_event *event = &next->pss_buf[i]; + int rc; + + rc = add_to_event_set(prev, event->psv_rank, event->psv_incarnation, event->psv_src, + event->psv_type); + if (rc != 0) + return rc; + } + return 0; +} + static int queue_event(struct pool_svc *svc, d_rank_t rank, uint64_t incarnation, enum crt_event_source src, enum crt_event_type type) { struct pool_svc_events *events = &svc->ps_events; - struct pool_svc_event *event; + int rc; + bool allocated = false; - D_ALLOC_PTR(event); - if (event == NULL) - return -DER_NOMEM; + D_DEBUG(DB_MD, DF_UUID ": queuing event: " DF_PS_EVENT "\n", DP_UUID(svc->ps_uuid), rank, + incarnation, src, type); - event->psv_rank = rank; - event->psv_incarnation = incarnation; - event->psv_src = src; - event->psv_type = type; + ABT_mutex_lock(events->pse_mutex); - D_DEBUG(DB_MD, DF_UUID": queuing event: "DF_PS_EVENT"\n", DP_UUID(svc->ps_uuid), - DP_PS_EVENT(event)); + if (events->pse_pending == NULL) { + rc = alloc_event_set(&events->pse_pending); + if (rc != 0) + goto out; + allocated = true; + } + + rc = add_to_event_set(events->pse_pending, rank, incarnation, src, type); + if (rc != 0) + goto out; + + events->pse_time = daos_gettime_coarse(); + + if (events->pse_paused) { + D_DEBUG(DB_MD, DF_UUID ": resuming event handling\n", DP_UUID(svc->ps_uuid)); + events->pse_paused = false; + } - ABT_mutex_lock(events->pse_mutex); - d_list_add_tail(&event->psv_link, &events->pse_queue); ABT_cond_broadcast(events->pse_cv); + +out: + if (rc != 0 && allocated) + free_event_set(&events->pse_pending); ABT_mutex_unlock(events->pse_mutex); - return 0; + return rc; } static void -discard_events(d_list_t *queue) +resume_event_handling(struct pool_svc *svc) { - struct pool_svc_event *event; - struct pool_svc_event *tmp; + struct pool_svc_events *events = &svc->ps_events; - d_list_for_each_entry_safe(event, tmp, queue, psv_link) { - D_DEBUG(DB_MD, "discard event: "DF_PS_EVENT"\n", DP_PS_EVENT(event)); - d_list_del_init(&event->psv_link); - D_FREE(event); + ABT_mutex_lock(events->pse_mutex); + if (events->pse_paused) { + D_DEBUG(DB_MD, DF_UUID ": resuming event handling\n", DP_UUID(svc->ps_uuid)); + events->pse_paused = false; + ABT_cond_broadcast(events->pse_cv); } + ABT_mutex_unlock(events->pse_mutex); } -static int pool_svc_exclude_rank(struct pool_svc *svc, d_rank_t rank); +static int pool_svc_exclude_ranks(struct pool_svc *svc, struct pool_svc_event_set *event_set); -static void -handle_event(struct pool_svc *svc, struct pool_svc_event *event) +static int +handle_event(struct pool_svc *svc, struct pool_svc_event_set *event_set) { + int i; int rc; - if ((event->psv_src != CRT_EVS_GRPMOD && event->psv_src != CRT_EVS_SWIM) || - (event->psv_type == CRT_EVT_DEAD && pool_disable_exclude)) { - 
D_DEBUG(DB_MD, "ignore event: "DF_PS_EVENT" exclude=%d\n", DP_PS_EVENT(event), - pool_disable_exclude); - goto out; + D_INFO(DF_UUID ": handling event set: " DF_PS_EVENT_SET "\n", DP_UUID(svc->ps_uuid), + DP_PS_EVENT_SET(event_set)); + + if (!pool_disable_exclude) { + rc = pool_svc_exclude_ranks(svc, event_set); + if (rc != 0) { + DL_ERROR(rc, DF_UUID ": failed to exclude ranks", DP_UUID(svc->ps_uuid)); + return rc; + } } - if (event->psv_rank == dss_self_rank() && event->psv_src == CRT_EVS_GRPMOD && - event->psv_type == CRT_EVT_DEAD) { - D_DEBUG(DB_MD, "ignore exclusion of self\n"); - goto out; + /* + * Check if the alive ranks are up in the pool map. If in the future we + * add automatic reintegration below, for instance, we may need + * to not only take svc->ps_lock, but also employ an RDB TX by + * the book. + */ + ABT_rwlock_rdlock(svc->ps_pool->sp_lock); + for (i = 0; i < event_set->pss_len; i++) { + struct pool_svc_event *event = &event_set->pss_buf[i]; + + if (event->psv_src != CRT_EVS_SWIM || event->psv_type != CRT_EVT_ALIVE) + continue; + if (ds_pool_map_rank_up(svc->ps_pool->sp_map, event->psv_rank)) { + /* + * The rank is up in the pool map. Request a pool map + * distribution just in case the rank has recently + * restarted and does not have a copy of the pool map. + */ + ds_rsvc_request_map_dist(&svc->ps_rsvc); + D_DEBUG(DB_MD, DF_UUID ": requested map dist for rank %u\n", + DP_UUID(svc->ps_uuid), event->psv_rank); + break; + } } + ABT_rwlock_unlock(svc->ps_pool->sp_lock); - D_INFO(DF_UUID": handling event: "DF_PS_EVENT"\n", DP_UUID(svc->ps_uuid), - DP_PS_EVENT(event)); + return 0; +} - if (event->psv_src == CRT_EVS_SWIM && event->psv_type == CRT_EVT_ALIVE) { - /* - * Check if the rank is up in the pool map. If in the future we - * add automatic reintegration below, for instance, we may need - * to not only take svc->ps_lock, but also employ an RDB TX by - * the book. - */ - ABT_rwlock_rdlock(svc->ps_pool->sp_lock); - rc = ds_pool_map_rank_up(svc->ps_pool->sp_map, event->psv_rank); - ABT_rwlock_unlock(svc->ps_pool->sp_lock); - if (!rc) - goto out; +struct event_timer_arg { + struct pool_svc_events *eta_events; + uint64_t eta_deadline; +}; - /* - * The rank is up in the pool map. Request a pool map - * distribution just in case the rank has recently restarted - * and does not have a copy of the pool map. 
- */ - ds_rsvc_request_map_dist(&svc->ps_rsvc); - D_DEBUG(DB_MD, DF_UUID": requested map dist for rank %u\n", - DP_UUID(svc->ps_uuid), event->psv_rank); - } else if (event->psv_type == CRT_EVT_DEAD) { - rc = pool_svc_exclude_rank(svc, event->psv_rank); - if (rc != 0) - D_ERROR(DF_UUID": failed to exclude rank %u: "DF_RC"\n", - DP_UUID(svc->ps_uuid), event->psv_rank, DP_RC(rc)); - } +static void +event_timer(void *varg) +{ + struct event_timer_arg *arg = varg; + struct pool_svc_events *events = arg->eta_events; + int64_t time_left = arg->eta_deadline - daos_gettime_coarse(); -out: - return; + if (time_left > 0) + sched_req_sleep(events->pse_timer, time_left * 1000); + ABT_cond_broadcast(events->pse_cv); +} + +static int +start_event_timer(struct event_timer_arg *arg) +{ + struct pool_svc_events *events = arg->eta_events; + uuid_t uuid; + struct sched_req_attr attr; + + D_ASSERT(events->pse_timer == NULL); + uuid_clear(uuid); + sched_req_attr_init(&attr, SCHED_REQ_ANONYM, &uuid); + events->pse_timer = sched_create_ult(&attr, event_timer, arg, 0); + if (events->pse_timer == NULL) + return -DER_NOMEM; + return 0; +} + +static void +stop_event_timer(struct event_timer_arg *arg) +{ + struct pool_svc_events *events = arg->eta_events; + + D_ASSERT(events->pse_timer != NULL); + sched_req_wait(events->pse_timer, true /* abort */); + sched_req_put(events->pse_timer); + events->pse_timer = NULL; } static void @@ -1403,31 +1559,83 @@ events_handler(void *arg) D_DEBUG(DB_MD, DF_UUID": starting\n", DP_UUID(svc->ps_uuid)); for (;;) { - struct pool_svc_event *event = NULL; - bool stop; + struct pool_svc_event_set *event_set = NULL; + bool stop; + int rc; ABT_mutex_lock(events->pse_mutex); for (;;) { + struct event_timer_arg timer_arg; + int64_t time_left; + stop = events->pse_stop; if (stop) { - discard_events(&events->pse_queue); + events->pse_paused = false; + if (events->pse_pending != NULL) + free_event_set(&events->pse_pending); break; } - if (!d_list_empty(&events->pse_queue)) { - event = d_list_entry(events->pse_queue.next, struct pool_svc_event, - psv_link); - d_list_del_init(&event->psv_link); + + timer_arg.eta_events = events; + timer_arg.eta_deadline = events->pse_time + events->pse_timeout; + + time_left = timer_arg.eta_deadline - daos_gettime_coarse(); + if (events->pse_pending != NULL && !events->pse_paused && time_left <= 0) { + event_set = events->pse_pending; + events->pse_pending = NULL; break; } + + /* A simple timed cond_wait without polling. */ + if (time_left > 0) { + rc = start_event_timer(&timer_arg); + if (rc != 0) { + /* No delay then. */ + DL_ERROR(rc, DF_UUID ": failed to start event timer", + DP_UUID(svc->ps_uuid)); + events->pse_time = 0; + continue; + } + } sched_cond_wait(events->pse_cv, events->pse_mutex); + if (time_left > 0) + stop_event_timer(&timer_arg); } ABT_mutex_unlock(events->pse_mutex); if (stop) break; - handle_event(svc, event); + rc = handle_event(svc, event_set); + if (rc != 0) { + /* Put event_set back to events->pse_pending. */ + D_DEBUG(DB_MD, DF_UUID ": returning event set\n", DP_UUID(svc->ps_uuid)); + ABT_mutex_lock(events->pse_mutex); + if (events->pse_pending == NULL) { + /* + * No pending events; pause the handling until + * next event or pool map change. + */ + D_DEBUG(DB_MD, DF_UUID ": pausing event handling\n", + DP_UUID(svc->ps_uuid)); + events->pse_paused = true; + } else { + /* + * There are pending events; do not pause the + * handling. 
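On the timer machinery above: since sched_cond_wait() takes no timeout, the patch composes a timed wait from parts, a helper ULT that sleeps until the deadline and then broadcasts the CV, so the handler wakes either on a new event or on deadline expiry, without polling. A portable pthread sketch of the same shape (in plain POSIX one would normally just use pthread_cond_timedwait; compile with -pthread):

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cv  = PTHREAD_COND_INITIALIZER;
    static int             fired;

    // Timer thread: sleep until the deadline, then wake the waiter.
    static void *timer_fn(void *arg)
    {
        sleep(*(unsigned *)arg);
        pthread_mutex_lock(&mtx);
        fired = 1;
        pthread_cond_broadcast(&cv);
        pthread_mutex_unlock(&mtx);
        return NULL;
    }

    int main(void)
    {
        pthread_t tid;
        unsigned  delay = 1; // s

        pthread_create(&tid, NULL, timer_fn, &delay);
        pthread_mutex_lock(&mtx);
        while (!fired) // would also wake early on a newly queued event
            pthread_cond_wait(&cv, &mtx);
        pthread_mutex_unlock(&mtx);
        pthread_join(tid, NULL);
        printf("deadline reached\n");
        return 0;
    }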
+ */ + rc = merge_event_sets(event_set, events->pse_pending); + if (rc != 0) + DL_ERROR(rc, DF_UUID ": failed to merge events", + DP_UUID(svc->ps_uuid)); + free_event_set(&events->pse_pending); + } + events->pse_pending = event_set; + event_set = NULL; + ABT_mutex_unlock(events->pse_mutex); + } - D_FREE(event); + if (event_set != NULL) + free_event_set(&event_set); ABT_thread_yield(); } @@ -1437,7 +1645,7 @@ events_handler(void *arg) static bool events_pending(struct pool_svc *svc) { - return !d_list_empty(&svc->ps_events.pse_queue); + return svc->ps_events.pse_pending != NULL; } static void @@ -1461,11 +1669,13 @@ init_events(struct pool_svc *svc) struct pool_svc_events *events = &svc->ps_events; int rc; - D_ASSERT(d_list_empty(&events->pse_queue)); + D_ASSERT(events->pse_pending == NULL); + D_ASSERT(events->pse_timer == NULL); D_ASSERT(events->pse_handler == ABT_THREAD_NULL); - D_ASSERT(events->pse_stop == false); + D_ASSERT(!events->pse_stop); + D_ASSERT(!events->pse_paused); - if (!ds_pool_skip_for_check(svc->ps_pool)) { + if (!ds_pool_restricted(svc->ps_pool, false)) { rc = crt_register_event_cb(ds_pool_crt_event_cb, svc); if (rc != 0) { D_ERROR(DF_UUID": failed to register event callback: "DF_RC"\n", @@ -1496,9 +1706,10 @@ init_events(struct pool_svc *svc) return 0; err_cb: - if (!ds_pool_skip_for_check(svc->ps_pool)) + if (!ds_pool_restricted(svc->ps_pool, false)) crt_unregister_event_cb(ds_pool_crt_event_cb, svc); - discard_events(&events->pse_queue); + if (events->pse_pending != NULL) + free_event_set(&events->pse_pending); err: return rc; } @@ -1507,11 +1718,10 @@ static void fini_events(struct pool_svc *svc) { struct pool_svc_events *events = &svc->ps_events; - int rc; D_ASSERT(events->pse_handler != ABT_THREAD_NULL); - if (!ds_pool_skip_for_check(svc->ps_pool)) + if (!ds_pool_restricted(svc->ps_pool, false)) crt_unregister_event_cb(ds_pool_crt_event_cb, svc); ABT_mutex_lock(events->pse_mutex); @@ -1519,8 +1729,6 @@ fini_events(struct pool_svc *svc) ABT_cond_broadcast(events->pse_cv); ABT_mutex_unlock(events->pse_mutex); - rc = ABT_thread_join(events->pse_handler); - D_ASSERTF(rc == 0, DF_RC"\n", DP_RC(rc)); ABT_thread_free(&events->pse_handler); events->pse_handler = ABT_THREAD_NULL; events->pse_stop = false; @@ -2357,6 +2565,11 @@ int ds_pool_failed_lookup(uuid_t uuid) return 0; } +struct pool_start_args { + bool psa_aft_chk; + bool psa_immutable; +}; + /* * Try to start the pool. Continue the iteration upon errors as other pools may * still be able to work. @@ -2364,13 +2577,26 @@ int ds_pool_failed_lookup(uuid_t uuid) static int start_one(uuid_t uuid, void *varg) { - int rc; + struct pool_start_args *psa = varg; + bool aft_chk; + bool immutable; + int rc; + + if (psa != NULL) { + aft_chk = psa->psa_aft_chk; + immutable = psa->psa_immutable; + } else { + aft_chk = false; + immutable = false; + } - D_DEBUG(DB_MD, DF_UUID ": starting pool\n", DP_UUID(uuid)); + D_DEBUG(DB_MD, DF_UUID ": starting pool, aft_chk %s, immutable %s\n", + DP_UUID(uuid), aft_chk ? "yes" : "no", immutable ? "yes" : "no"); - rc = ds_pool_start(uuid, varg != NULL ? true : false); + rc = ds_pool_start(uuid, aft_chk, immutable); if (rc != 0) { - DL_ERROR(rc, DF_UUID ": failed to start pool", DP_UUID(uuid)); + DL_ERROR(rc, DF_UUID ": failed to start pool, aft_chk %s, immutable %s", + DP_UUID(uuid), aft_chk ? "yes" : "no", immutable ? 
"yes" : "no"); ds_pool_failed_add(uuid, rc); } @@ -2389,12 +2615,27 @@ pool_start_all(void *arg) DP_RC(rc)); } +bool +ds_pool_restricted(struct ds_pool *pool, bool immutable) +{ + if (ds_pool_skip_for_check(pool)) + return true; + + if (pool->sp_immutable && !immutable) + return true; + + return false; +} + int -ds_pool_start_after_check(uuid_t uuid) +ds_pool_start_after_check(uuid_t uuid, bool immutable) { - bool aft_chk = true; + struct pool_start_args psa; + + psa.psa_aft_chk = true; + psa.psa_immutable = immutable; - return start_one(uuid, &aft_chk); + return start_one(uuid, &psa); } /* Note that this function is currently called from the main xstream. */ @@ -3612,7 +3853,6 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version) &ds_pool_prop_connectable, &value); if (rc != 0) goto out_lock; - D_DEBUG(DB_MD, DF_UUID ": connectable=%u\n", DP_UUID(in->pci_op.pi_uuid), connectable); if (!connectable) { D_ERROR(DF_UUID": being destroyed, not accepting connections\n", DP_UUID(in->pci_op.pi_uuid)); @@ -3621,12 +3861,21 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version) /* * NOTE: Under check mode, there is a small race window between ds_pool_mark_connectable() - * the PS restart for full service. If some client tries to connect the pool during + * and PS restart with full service. If some client tries to connect the pool during * such internal, it will get -DER_BUSY temporarily. */ if (unlikely(ds_pool_skip_for_check(svc->ps_pool))) { - D_ERROR(DF_UUID" is not ready for full pool service\n", DP_UUID(in->pci_op.pi_uuid)); - D_GOTO(out_lock, rc = -DER_BUSY); + rc = -DER_BUSY; + D_ERROR(DF_UUID " is not ready for full pool service: " DF_RC "\n", + DP_UUID(in->pci_op.pi_uuid), DP_RC(rc)); + goto out_lock; + } + + if (svc->ps_pool->sp_immutable && flags != DAOS_PC_RO) { + rc = -DER_NO_PERM; + D_ERROR(DF_UUID " failed to connect immutable pool, flags " DF_X64 ": " DF_RC "\n", + DP_UUID(in->pci_op.pi_uuid), flags, DP_RC(rc)); + goto out_lock; } /* Check existing pool handles. */ @@ -3789,6 +4038,11 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version) } } + D_DEBUG(DB_MD, DF_UUID "/" DF_UUID ": connecting to %s pool with flags " + DF_X64", sec_capas " DF_X64 "\n", + DP_UUID(in->pci_op.pi_uuid), DP_UUID(in->pci_op.pi_hdl), + svc->ps_pool->sp_immutable ? "immutable" : "regular", flags, sec_capas); + rc = pool_connect_iv_dist(svc, in->pci_op.pi_hdl, flags, sec_capas, credp, global_ver, obj_layout_ver); if (rc == 0 && DAOS_FAIL_CHECK(DAOS_POOL_CONNECT_FAIL_CORPC)) { @@ -4332,7 +4586,7 @@ ds_pool_svc_list_cont(uuid_t uuid, d_rank_list_t *ranks, list_cont_bulk_destroy(bulk); D_FREE(resp_cont); crt_req_decref(rpc); - dss_sleep(RECHOOSE_SLEEP_MS); + dss_sleep(250); D_GOTO(rechoose, rc); } @@ -6314,7 +6568,7 @@ pool_svc_schedule(struct pool_svc *svc, struct pool_svc_sched *sched, void (*fun D_DEBUG(DB_MD, DF_UUID": begin\n", DP_UUID(svc->ps_uuid)); - if (ds_pool_skip_for_check(svc->ps_pool)) { + if (ds_pool_restricted(svc->ps_pool, false)) { D_DEBUG(DB_MD, DF_UUID": end: skip in check mode\n", DP_UUID(svc->ps_uuid)); return -DER_OP_CANCELED; } @@ -6501,7 +6755,7 @@ pool_svc_schedule_reconf(struct pool_svc *svc, struct pool_map *map, uint32_t ma } static int -pool_map_crit_prompt(struct pool_svc *svc, struct pool_map *map, d_rank_t rank) +pool_map_crit_prompt(struct pool_svc *svc, struct pool_map *map) { crt_group_t *primary_grp; struct pool_domain *doms; @@ -6517,13 +6771,10 @@ pool_map_crit_prompt(struct pool_svc *svc, struct pool_map *map, d_rank_t rank) D_CRIT("!!! 
Please try to recover these engines in top priority -\n"); D_CRIT("!!! Please refer \"Pool-Wise Redundancy Factor\" section in pool_operations.md\n"); - D_CRIT("!!! pool "DF_UUID": intolerable unavailability: engine rank %u\n", - DP_UUID(svc->ps_uuid), rank); for (i = 0; i < doms_cnt; i++) { struct swim_member_state state; - if (!(doms[i].do_comp.co_status & PO_COMP_ST_UPIN) || - (doms[i].do_comp.co_rank == rank)) + if (!(doms[i].do_comp.co_status & PO_COMP_ST_UPIN)) continue; rc = crt_rank_state_get(primary_grp, doms[i].do_comp.co_rank, &state); @@ -6701,8 +6952,7 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc, * with CRIT log message to ask administrator to bring back the engine. */ if (src == MUS_SWIM && opc == MAP_EXCLUDE) { - d_rank_t rank; - int failed_cnt; + int failed_cnt; rc = pool_map_update_failed_cnt(map); if (rc != 0) { @@ -6711,15 +6961,19 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc, goto out_map; } - D_ASSERT(tgt_addrs->pta_number == 1); - rank = tgt_addrs->pta_addrs->pta_rank; failed_cnt = pool_map_get_failed_cnt(map, PO_COMP_TP_NODE); - D_INFO(DF_UUID": SWIM exclude rank %d, failed NODE %d\n", - DP_UUID(svc->ps_uuid), rank, failed_cnt); + D_INFO(DF_UUID": SWIM exclude %d ranks, failed NODE %d\n", + DP_UUID(svc->ps_uuid), tgt_addrs->pta_number, failed_cnt); if (failed_cnt > pw_rf) { - D_CRIT(DF_UUID": exclude rank %d will break pw_rf %d, failed_cnt %d\n", - DP_UUID(svc->ps_uuid), rank, pw_rf, failed_cnt); - rc = pool_map_crit_prompt(svc, map, rank); + D_CRIT(DF_UUID": exclude %d ranks will break pool RF %d, failed_cnt %d\n", + DP_UUID(svc->ps_uuid), tgt_addrs->pta_number, pw_rf, failed_cnt); + ABT_rwlock_rdlock(svc->ps_pool->sp_lock); + rc = pool_map_crit_prompt(svc, svc->ps_pool->sp_map); + ABT_rwlock_unlock(svc->ps_pool->sp_lock); + if (rc != 0) + DL_ERROR(rc, DF_UUID ": failed to log prompt", + DP_UUID(svc->ps_uuid)); + rc = -DER_RF; goto out_map; } } @@ -6768,6 +7022,9 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc, ds_rsvc_request_map_dist(&svc->ps_rsvc); + /* See events_handler. 
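	 * Resume handling of any CaRT events that were deferred while this map
	 * update was in flight (a best-effort summary of the pse_pending /
	 * pse_paused logic there).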
*/ + resume_event_handling(svc); + rc = pool_svc_schedule_reconf(svc, NULL /* map */, map_version, false /* sync_remove */); if (rc != 0) { DL_INFO(rc, DF_UUID": failed to schedule pool service reconfiguration", @@ -7167,28 +7424,51 @@ ds_pool_update_handler_v5(crt_rpc_t *rpc) } static int -pool_svc_exclude_rank(struct pool_svc *svc, d_rank_t rank) +pool_svc_exclude_ranks(struct pool_svc *svc, struct pool_svc_event_set *event_set) { struct pool_target_addr_list list; struct pool_target_addr_list inval_list_out = { 0 }; - struct pool_target_addr tgt_rank; + struct pool_target_addr *addrs; + d_rank_t self_rank = dss_self_rank(); uint32_t map_version = 0; + int n = 0; + int i; int rc; - tgt_rank.pta_rank = rank; - tgt_rank.pta_target = -1; - list.pta_number = 1; - list.pta_addrs = &tgt_rank; + D_ALLOC_ARRAY(addrs, event_set->pss_len); + if (addrs == NULL) + return -DER_NOMEM; + for (i = 0; i < event_set->pss_len; i++) { + struct pool_svc_event *event = &event_set->pss_buf[i]; + + if (event->psv_type != CRT_EVT_DEAD) + continue; + if (event->psv_src == CRT_EVS_GRPMOD && event->psv_rank == self_rank) { + D_DEBUG(DB_MD, DF_UUID ": ignore exclusion of self\n", + DP_UUID(svc->ps_uuid)); + continue; + } + addrs[n].pta_rank = event->psv_rank; + addrs[n].pta_target = -1; + n++; + } + if (n == 0) { + rc = 0; + goto out; + } + list.pta_number = n; + list.pta_addrs = addrs; rc = pool_svc_update_map(svc, pool_opc_2map_opc(POOL_EXCLUDE), true /* exclude_rank */, NULL, NULL, 0, &list, &inval_list_out, &map_version, NULL /* hint */, MUS_SWIM); - D_DEBUG(DB_MD, "Exclude pool "DF_UUID"/%u rank %u: rc %d\n", - DP_UUID(svc->ps_uuid), map_version, rank, rc); + D_DEBUG(DB_MD, "Exclude pool "DF_UUID"/%u ranks %u: rc %d\n", + DP_UUID(svc->ps_uuid), map_version, n, rc); pool_target_addr_list_free(&inval_list_out); - +out: + D_FREE(addrs); return rc; } @@ -8527,9 +8807,3 @@ ds_pool_svc_upgrade_vos_pool(struct ds_pool *pool) ds_rsvc_put(rsvc); return rc; } - -bool -ds_pool_skip_for_check(struct ds_pool *pool) -{ - return engine_in_check() && !pool->sp_cr_checked; -} diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index cfa837e8b2a..113fb757fd9 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -505,7 +505,7 @@ pool_child_start(struct ds_pool_child *child, bool recreate) goto done; } - if (!ds_pool_skip_for_check(child->spc_pool)) { + if (!ds_pool_restricted(child->spc_pool, false)) { rc = start_gc_ult(child); if (rc != 0) goto out_close; @@ -1128,7 +1128,7 @@ pool_fetch_hdls_ult_abort(struct ds_pool *pool) * till ds_pool_stop. Only for mgmt and pool modules. 
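+ * The "aft_chk" flag starts the pool for after-check service; the "immutable"
+ * flag marks the pool read-only via sp_immutable (see ds_pool_restricted()),
+ * e.g. for a pool left in maintenance mode after a dry-run check.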
*/ int -ds_pool_start(uuid_t uuid, bool aft_chk) +ds_pool_start(uuid_t uuid, bool aft_chk, bool immutable) { struct ds_pool *pool; struct daos_llink *llink; @@ -1185,6 +1185,11 @@ ds_pool_start(uuid_t uuid, bool aft_chk) else pool->sp_cr_checked = 0; + if (immutable) + pool->sp_immutable = 1; + else + pool->sp_immutable = 0; + rc = pool_child_add_all(pool); if (rc != 0) goto failure_pool; @@ -1199,7 +1204,9 @@ ds_pool_start(uuid_t uuid, bool aft_chk) } pool->sp_fetch_hdls = 1; + } + if (!ds_pool_restricted(pool, false)) { rc = ds_pool_start_ec_eph_query_ult(pool); if (rc != 0) { D_ERROR(DF_UUID": failed to start ec eph query ult: "DF_RC"\n", @@ -1882,13 +1889,10 @@ ds_pool_tgt_map_update(struct ds_pool *pool, struct pool_buf *buf, DP_UUID(pool->sp_uuid), map_version_before, map_version); } - if (map_updated) { + if (map_updated && !ds_pool_restricted(pool, false)) { struct dtx_scan_args *arg; int ret; - if (ds_pool_skip_for_check(pool)) - D_GOTO(out, rc = 0); - D_ALLOC_PTR(arg); if (arg == NULL) D_GOTO(out, rc = -DER_NOMEM); diff --git a/src/proto/ctl/storage_nvme.proto b/src/proto/ctl/storage_nvme.proto index 944d8e943ba..be068e5274d 100644 --- a/src/proto/ctl/storage_nvme.proto +++ b/src/proto/ctl/storage_nvme.proto @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2023 Intel Corporation. +// (C) Copyright 2019-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -28,6 +28,7 @@ message ScanNvmeReq { bool Basic = 3; // Strip NVMe device details to only basic uint64 MetaSize = 4; // Size of the metadata blob uint64 RdbSize = 5; // Size of the RDB blob + bool LinkStats = 6; // Populate PCIe link info in health statistics } message ScanNvmeResp { diff --git a/src/rebuild/rebuild_iv.c b/src/rebuild/rebuild_iv.c index cc585e037e1..f4b9f75f407 100644 --- a/src/rebuild/rebuild_iv.c +++ b/src/rebuild/rebuild_iv.c @@ -167,6 +167,7 @@ rebuild_iv_ent_refresh(struct ds_iv_entry *entry, struct ds_iv_key *key, struct rebuild_tgt_pool_tracker *rpt; struct rebuild_iv *dst_iv = entry->iv_value.sg_iovs[0].iov_buf; struct rebuild_iv *src_iv = src->sg_iovs[0].iov_buf; + uint32_t old_ver; int rc = 0; rpt = rpt_lookup(src_iv->riv_pool_uuid, -1, src_iv->riv_ver, @@ -200,16 +201,18 @@ rebuild_iv_ent_refresh(struct ds_iv_entry *entry, struct ds_iv_key *key, dst_iv->riv_stable_epoch); rpt->rt_global_done = dst_iv->riv_global_done; rpt->rt_global_scan_done = dst_iv->riv_global_scan_done; - if (rpt->rt_global_dtx_resync_version < rpt->rt_rebuild_ver && + old_ver = rpt->rt_global_dtx_resync_version; + if (rpt->rt_global_dtx_resync_version < dst_iv->riv_global_dtx_resyc_version) + rpt->rt_global_dtx_resync_version = dst_iv->riv_global_dtx_resyc_version; + if (old_ver < rpt->rt_rebuild_ver && dst_iv->riv_global_dtx_resyc_version >= rpt->rt_rebuild_ver) { D_INFO(DF_UUID " global/iv/rebuild_ver %u/%u/%u signal wait cond\n", - DP_UUID(src_iv->riv_pool_uuid), rpt->rt_global_dtx_resync_version, + DP_UUID(src_iv->riv_pool_uuid), old_ver, dst_iv->riv_global_dtx_resyc_version, rpt->rt_rebuild_ver); ABT_mutex_lock(rpt->rt_lock); ABT_cond_signal(rpt->rt_global_dtx_wait_cond); ABT_mutex_unlock(rpt->rt_lock); } - rpt->rt_global_dtx_resync_version = dst_iv->riv_global_dtx_resyc_version; } out: diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index b1f722b8254..5c463af9e90 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -1941,7 +1941,7 @@ ds_rebuild_schedule(struct ds_pool *pool, uint32_t map_ver, return 0; } - if (ds_pool_skip_for_check(pool)) { + if (ds_pool_restricted(pool, false)) { 
D_DEBUG(DB_REBUILD, DF_UUID" skip rebuild under check mode\n",
			DP_UUID(pool->sp_uuid));
		return 0;
diff --git a/src/security/srv_acl.c b/src/security/srv_acl.c
index ad3c0107e3b..8f05216e971 100644
--- a/src/security/srv_acl.c
+++ b/src/security/srv_acl.c
@@ -1,5 +1,5 @@
 /*
- * (C) Copyright 2019-2023 Intel Corporation.
+ * (C) Copyright 2019-2024 Intel Corporation.
 *
 * SPDX-License-Identifier: BSD-2-Clause-Patent
 */
@@ -754,6 +754,12 @@ ds_sec_cont_can_evict_all(uint64_t cont_capas)
 	return (cont_capas & CONT_CAPA_EVICT_ALL) != 0;
 }
 
+bool
+ds_sec_cont_can_modify(uint64_t cont_capas)
+{
+	return (cont_capas & CONT_CAPAS_W_MASK) != 0;
+}
+
 uint64_t
 ds_sec_get_rebuild_cont_capabilities(void)
 {
diff --git a/src/tests/ftest/erasurecode/multiple_failure.yaml b/src/tests/ftest/erasurecode/multiple_failure.yaml
index 78f132474b5..95aab541329 100644
--- a/src/tests/ftest/erasurecode/multiple_failure.yaml
+++ b/src/tests/ftest/erasurecode/multiple_failure.yaml
@@ -25,6 +25,7 @@ server_config:
     storage: auto
 pool:
   size: 93%
+  set_logmasks: False
 container:
   type: POSIX
   control_method: daos
diff --git a/src/tests/ftest/erasurecode/rebuild_fio.yaml b/src/tests/ftest/erasurecode/rebuild_fio.yaml
index a895c356707..a3539d86579 100644
--- a/src/tests/ftest/erasurecode/rebuild_fio.yaml
+++ b/src/tests/ftest/erasurecode/rebuild_fio.yaml
@@ -39,6 +39,7 @@ pool:
   aggregation:
     threshold: 50000000
     aggr_timeout: 180
+  set_logmasks: False
 container:
   type: POSIX
   control_method: daos
diff --git a/src/tests/ftest/util/apricot/apricot/test.py b/src/tests/ftest/util/apricot/apricot/test.py
index 42e05937f37..563ff7adece 100644
--- a/src/tests/ftest/util/apricot/apricot/test.py
+++ b/src/tests/ftest/util/apricot/apricot/test.py
@@ -643,6 +643,7 @@ def __init__(self, *args, **kwargs):
         self.setup_start_agents = True
         self.slurm_exclude_servers = False
         self.slurm_exclude_nodes = NodeSet()
+        self.max_test_dir_usage_check = 90
         self.host_info = HostInfo()
         self.hostlist_servers = NodeSet()
         self.hostlist_clients = NodeSet()
@@ -693,6 +694,11 @@ def setUp(self):
         self.slurm_exclude_servers = self.params.get(
             "slurm_exclude_servers", "/run/setup/*", self.slurm_exclude_servers)
 
+        # Max test directory usage percentage - when exceeded, the sizes of the files in the
+        # test directory will be displayed
+        self.max_test_dir_usage_check = self.params.get(
+            "max_test_dir_usage_check", "/run/setup/*", self.max_test_dir_usage_check)
+
         # The server config name should be obtained from each ServerManager
         # object, but some tests still use this TestWithServers attribute.
self.server_group = self.params.get("name", "/run/server_config/*", "daos_server") @@ -765,12 +771,20 @@ def setUp(self): # List common test directory contents before running the test self.log.info("-" * 100) - self.log.debug("Common test directory (%s) contents:", os.path.dirname(self.test_dir)) + self.log.debug( + "Common test directory (%s) contents (check > %s%%):", + os.path.dirname(self.test_dir), self.max_test_dir_usage_check) all_hosts = include_local_host(self.host_info.all_hosts) test_dir_parent = os.path.dirname(self.test_dir) - result = run_remote(self.log, all_hosts, f"df -h {test_dir_parent}") - if int(max(re.findall(r" ([\d+])% ", result.joined_stdout) + ["0"])) > 90: - run_remote(self.log, all_hosts, f"du -sh {test_dir_parent}/*") + _result = run_remote(self.log, all_hosts, f"df -h {test_dir_parent}") + _details = NodeSet() + for _host, _stdout in _result.all_stdout.items(): + _test_dir_usage = re.findall(r"\s+([\d]+)%\s+", _stdout) + _test_dir_usage_int = int(max(_test_dir_usage + ["0"])) + if _test_dir_usage_int > self.max_test_dir_usage_check: + _details.add(_host) + if _details: + run_remote(self.log, _details, f"du -sh {test_dir_parent}/*") self.log.info("-" * 100) if not self.start_servers_once or self.name.uid == 1: diff --git a/src/tests/ftest/util/file_count_test_base.py b/src/tests/ftest/util/file_count_test_base.py index 12c66d76b8c..f95e22bd4bd 100644 --- a/src/tests/ftest/util/file_count_test_base.py +++ b/src/tests/ftest/util/file_count_test_base.py @@ -17,15 +17,15 @@ class FileCountTestBase(IorTestBase, MdtestBase): :avocado: recursive """ - def add_containers(self, file_oclass=None, dir_oclass=None): - """Create a list of containers that the various jobs use for storage. + def get_file_write_container(self, file_oclass=None, dir_oclass=None): + """Create a container, set oclass, dir_oclass, and add rd_fac property based on oclass. Args: - file_oclass (str, optional): file object class of container. - Defaults to None. - dir_oclass (str, optional): dir object class of container. - Defaults to None. + file_oclass (str, optional): file object class of container. Defaults to None. + dir_oclass (str, optional): dir object class of container. Defaults to None. + Returns: + TestContainer: Created container with oclass, dir_oclass, and rd_fac set. """ # Create a container and add it to the overall list of containers @@ -92,7 +92,7 @@ def run_file_count(self): rd_fac = extract_redundancy_factor(oclass) dir_oclass = self.get_diroclass(rd_fac) self.mdtest_cmd.dfs_dir_oclass.update(dir_oclass) - self.container = self.add_containers(oclass, dir_oclass) + self.container = self.get_file_write_container(oclass, dir_oclass) try: self.processes = mdtest_np self.ppn = mdtest_ppn @@ -111,14 +111,27 @@ def run_file_count(self): # run ior self.log.info("=======>>>Starting IOR with %s and %s", api, oclass) self.ior_cmd.dfs_oclass.update(oclass) - self.container = self.add_containers(oclass) + self.container = self.get_file_write_container(oclass) self.update_ior_cmd_with_pool(False) try: self.processes = ior_np self.ppn = ior_ppn if api == 'HDF5-VOL': + # Format the container properties so that it works with HDF5-VOL env var. + # Each entry:value pair needs to be separated by a semicolon. Since we're + # using this in the mpirun command, semicolon would indicate the end of the + # command, so quote the whole thing. 
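+                            # For example (a hypothetical properties string):
+                            #   "rd_fac:1,cksum:crc32" becomes '"rd_fac:1;cksum:crc32"'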
+ cont_props = self.container.properties.value + cont_props_hdf5_vol = '"' + cont_props.replace(",", ";") + '"' + self.log.info("cont_props_hdf5_vol = %s", cont_props_hdf5_vol) + env = self.ior_cmd.env.copy() + env.update({ + "HDF5_DAOS_OBJ_CLASS": oclass, + "HDF5_DAOS_FILE_PROP": cont_props_hdf5_vol + }) self.ior_cmd.api.update('HDF5') - self.run_ior_with_pool(create_pool=False, plugin_path=hdf5_plugin_path) + self.run_ior_with_pool( + create_pool=False, plugin_path=hdf5_plugin_path, env=env) elif self.ior_cmd.api.value == 'POSIX': self.run_ior_with_pool(create_pool=False, intercept=intercept) else: diff --git a/src/tests/ftest/util/server_utils_params.py b/src/tests/ftest/util/server_utils_params.py index 440ffe68f82..7cda958d242 100644 --- a/src/tests/ftest/util/server_utils_params.py +++ b/src/tests/ftest/util/server_utils_params.py @@ -435,6 +435,7 @@ class EngineYamlParameters(YamlParameters): "common": [ "D_LOG_FILE_APPEND_PID=1", "DAOS_POOL_RF=4", + "CRT_EVENT_DELAY=1", "COVFILE=/tmp/test.cov"], "ofi+tcp": [], "ofi+tcp;ofi_rxm": [], diff --git a/src/tests/ftest/util/soak_test_base.py b/src/tests/ftest/util/soak_test_base.py index f32e068cb16..2d1c11e722e 100644 --- a/src/tests/ftest/util/soak_test_base.py +++ b/src/tests/ftest/util/soak_test_base.py @@ -25,8 +25,8 @@ from soak_utils import (SoakTestError, add_pools, build_job_script, cleanup_dfuse, create_app_cmdline, create_dm_cmdline, create_fio_cmdline, create_ior_cmdline, create_macsio_cmdline, create_mdtest_cmdline, - create_racer_cmdline, ddhhmmss_format, get_daos_server_logs, get_harassers, - get_journalctl, launch_exclude_reintegrate, launch_extend, launch_reboot, + create_racer_cmdline, ddhhmmss_format, get_harassers, + launch_exclude_reintegrate, launch_extend, launch_reboot, launch_server_stop_start, launch_snapshot, launch_vmd_identify_check, reserved_file_copy, run_event_check, run_metrics_check, run_monitor_check) @@ -164,17 +164,6 @@ def pre_tear_down(self): # display final metrics run_metrics_check(self, prefix="final") - # Gather server logs - try: - get_daos_server_logs(self) - except SoakTestError as error: - errors.append(f"<>") - # Gather journalctl logs - hosts = list(set(self.hostlist_servers)) - since = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.start_time)) - until = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.end_time)) - for journalctl_type in ["kernel", "daos_server"]: - get_journalctl(self, hosts, since, until, journalctl_type, logging=True) if self.all_failed_harassers: errors.extend(self.all_failed_harassers) diff --git a/src/tests/ftest/util/telemetry_utils.py b/src/tests/ftest/util/telemetry_utils.py index d59c8d39e81..70d2352881d 100644 --- a/src/tests/ftest/util/telemetry_utils.py +++ b/src/tests/ftest/util/telemetry_utils.py @@ -90,6 +90,7 @@ class TelemetryUtils(): "engine_pool_ops_dtx_coll_commit", "engine_pool_ops_dtx_commit", "engine_pool_ops_dtx_refresh", + "engine_pool_ops_dtx_sync_commit", "engine_pool_ops_ec_agg", "engine_pool_ops_ec_rep", "engine_pool_ops_fetch", @@ -200,6 +201,8 @@ class TelemetryUtils(): "engine_dmabuff_queued_reqs", "engine_dmabuff_grab_errs", *_gen_stats_metrics("engine_dmabuff_grab_retries")] + ENGINE_IO_DTX_ASYNC_CMT_LAT_METRICS = \ + _gen_stats_metrics("engine_io_dtx_async_cmt_lat") ENGINE_IO_DTX_COMMITTABLE_METRICS = \ _gen_stats_metrics("engine_io_dtx_committable") ENGINE_IO_DTX_COMMITTED_METRICS = \ @@ -304,7 +307,8 @@ class TelemetryUtils(): _gen_stats_metrics("engine_io_ops_tgt_update_active") ENGINE_IO_OPS_UPDATE_ACTIVE_METRICS = \ 
_gen_stats_metrics("engine_io_ops_update_active")
-    ENGINE_IO_METRICS = ENGINE_IO_DTX_COMMITTABLE_METRICS +\
+    ENGINE_IO_METRICS = ENGINE_IO_DTX_ASYNC_CMT_LAT_METRICS +\
+        ENGINE_IO_DTX_COMMITTABLE_METRICS +\
         ENGINE_IO_DTX_COMMITTED_METRICS +\
         ENGINE_IO_LATENCY_FETCH_METRICS +\
         ENGINE_IO_LATENCY_BULK_FETCH_METRICS +\
diff --git a/src/tests/suite/daos_container.c b/src/tests/suite/daos_container.c
index 958f9fd1d16..243a829c317 100644
--- a/src/tests/suite/daos_container.c
+++ b/src/tests/suite/daos_container.c
@@ -3835,7 +3835,7 @@ co_op_dup_timing(void **state)
 	size_t const	   in_sizes[] = {strlen(in_values[0]), strlen(in_values[1])};
 	int		   n = (int)ARRAY_SIZE(names);
 	const unsigned int NUM_FP = 3;
-	const uint32_t	   NUM_OPS = 200;
+	const uint32_t	   NUM_OPS = 500;
 	uint32_t	   num_failures = 0;
 	const uint32_t	   SVC_OPS_ENABLED = 1;
 	const uint32_t	   SVC_OPS_ENTRY_AGE = 60;
diff --git a/src/tests/suite/daos_cr.c b/src/tests/suite/daos_cr.c
index 3827a0eda6a..4c981f29645 100644
--- a/src/tests/suite/daos_cr.c
+++ b/src/tests/suite/daos_cr.c
@@ -18,6 +18,8 @@
 #include 
 
+#include "daos_iotest.h"
+
 /*
  * Will enable accurate query result verification after DAOS-13520 resolved.
  * #define CR_ACCURATE_QUERY_RESULT	1
@@ -3573,6 +3575,146 @@ cr_handle_fail_pool2(void **state)
 	cr_cleanup(arg, &pool, 1);
 }
 
+#define CR_IO_SIZE	16
+
+/*
+ * 1. Create pool and container without inconsistency.
+ * 2. Write something to the container.
+ * 3. Start checker with --dry-run option.
+ * 4. Query the checker; it should complete without any inconsistency reported.
+ * 5. Verify the pool can only be connected with read-only permission.
+ * 6. Verify the container can only be opened with read-only permission.
+ * 7. Verify the object can only be opened with read-only permission.
+ * 8. Verify the object cannot be written.
+ * 9. Read the formerly written data and verify its correctness.
+ * 10. Switch to normal mode and cleanup.
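+ *
+ * While in maintenance mode the expected access results (asserted below) are:
+ *	daos_pool_connect(..., DAOS_PC_RW, ...)	=> -DER_NO_PERM
+ *	daos_pool_connect(..., DAOS_PC_RO, ...)	=> 0
+ *	daos_cont_open(..., DAOS_COO_RW, ...)	=> -DER_NO_PERM
+ *	daos_cont_open(..., DAOS_COO_RO, ...)	=> 0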
+ */
+static void
+cr_maintenance_mode(void **state)
+{
+	test_arg_t		*arg = *state;
+	struct test_pool	 pool = { 0 };
+	struct test_cont	 cont = { 0 };
+	daos_handle_t		 coh;
+	daos_obj_id_t		 oid;
+	struct ioreq		 req;
+	const char		*dkey = "cr_dkey";
+	const char		*akey = "cr_akey";
+	char			 update_buf[CR_IO_SIZE];
+	char			 fetch_buf[CR_IO_SIZE];
+	struct daos_check_info	 dci = { 0 };
+	int			 rc;
+
+	print_message("CR28: maintenance mode after dry-run check\n");
+
+	rc = cr_pool_create(state, &pool, true, TCC_NONE);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_cont_create(state, &pool, &cont, 0);
+	assert_rc_equal(rc, 0);
+
+	rc = daos_cont_open(pool.poh, cont.label, DAOS_COO_RW, &coh, NULL, NULL);
+	assert_rc_equal(rc, 0);
+
+	oid = daos_test_oid_gen(coh, OC_SX, 0, 0, arg->myrank);
+	arg->fail_loc = DAOS_DTX_COMMIT_SYNC | DAOS_FAIL_ALWAYS;
+	arg->async = 0;
+	ioreq_init(&req, coh, oid, DAOS_IOD_SINGLE, arg);
+	dts_buf_render(update_buf, CR_IO_SIZE);
+
+	print_message("Generate some data.\n");
+
+	insert_single(dkey, akey, 0, update_buf, CR_IO_SIZE, DAOS_TX_NONE, &req);
+	daos_fail_loc_set(0);
+
+	rc = daos_obj_close(req.oh, NULL);
+	assert_rc_equal(rc, 0);
+
+	rc = daos_cont_close(coh, NULL);
+	assert_rc_equal(rc, 0);
+
+	rc = daos_pool_disconnect(pool.poh, NULL);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_system_stop(false);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_mode_switch(true);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_check_start(TCSF_DRYRUN, 0, NULL, NULL);
+	assert_rc_equal(rc, 0);
+
+	cr_ins_wait(1, &pool.pool_uuid, &dci);
+
+	rc = cr_ins_verify(&dci, TCIS_COMPLETED);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_CHECKED, 0, NULL, NULL, NULL);
+	assert_rc_equal(rc, 0);
+
+	print_message("Verify the pool can only be connected in RDONLY mode.\n");
+
+	rc = daos_pool_connect(pool.pool_str, arg->group, DAOS_PC_RW, &pool.poh, NULL, NULL);
+	assert_rc_equal(rc, -DER_NO_PERM);
+
+	rc = daos_pool_connect(pool.pool_str, arg->group, DAOS_PC_RO, &pool.poh, NULL, NULL);
+	assert_rc_equal(rc, 0);
+
+	print_message("Verify the container can only be opened in RDONLY mode.\n");
+
+	rc = daos_cont_open(pool.poh, cont.label, DAOS_COO_RW, &coh, NULL, NULL);
+	assert_rc_equal(rc, -DER_NO_PERM);
+
+	rc = daos_cont_open(pool.poh, cont.label, DAOS_COO_RO, &coh, NULL, NULL);
+	assert_rc_equal(rc, 0);
+
+#if 0
+	/*
+	 * NOTE: Currently, the DAOS security check is server-side logic, but an object open
+	 *	 may not talk with the server. So the related permission check will not happen
+	 *	 until a read or write actually occurs. If we do not consider the exclusive
+	 *	 open and cache cases, that seems fine; otherwise, some enhancement is needed
+	 *	 in the future.
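+	 *
+	 *	 The disabled daos_obj_open(DAOS_OO_RW) check below illustrates the behavior
+	 *	 intended once client-side enforcement is added.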
+	 */
+
+	print_message("Verify the object can only be opened in RDONLY mode.\n");
+
+	rc = daos_obj_open(coh, oid, DAOS_OO_RW, &req.oh, NULL);
+	assert_rc_equal(rc, -DER_NO_PERM);
+#endif
+
+	rc = daos_obj_open(coh, oid, DAOS_OO_RO, &req.oh, NULL);
+	assert_rc_equal(rc, 0);
+
+	print_message("Verify the object cannot be modified.\n");
+
+	req.arg->expect_result = -DER_NO_PERM;
+	insert_single(dkey, akey, 100, update_buf, CR_IO_SIZE, DAOS_TX_NONE, &req);
+	daos_fail_loc_set(0);
+
+	print_message("Verify the formerly written data.\n");
+
+	req.arg->expect_result = 0;
+	lookup_single(dkey, akey, 0, fetch_buf, CR_IO_SIZE, DAOS_TX_NONE, &req);
+	assert_int_equal(req.iod[0].iod_size, CR_IO_SIZE);
+	assert_memory_equal(update_buf, fetch_buf, CR_IO_SIZE);
+
+	rc = daos_cont_close(coh, NULL);
+	assert_rc_equal(rc, 0);
+
+	rc = daos_pool_disconnect(pool.poh, NULL);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_mode_switch(false);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_system_start();
+	assert_rc_equal(rc, 0);
+
+	cr_dci_fini(&dci);
+	cr_cleanup(arg, &pool, 1);
+}
+
 static const struct CMUnitTest cr_tests[] = {
 	{ "CR1: start checker for specified pools",
 	  cr_start_specified, async_disable, test_case_teardown},
@@ -3628,6 +3770,8 @@ static const struct CMUnitTest cr_tests[] = {
 	  cr_handle_fail_pool1, async_disable, test_case_teardown},
 	{ "CR27: handle the pool if some engine failed to report some pool service",
 	  cr_handle_fail_pool2, async_disable, test_case_teardown},
+	{ "CR28: maintenance mode after dry-run check",
+	  cr_maintenance_mode, async_disable, test_case_teardown},
 };
 
 static int
diff --git a/src/vos/sys_db.c b/src/vos/sys_db.c
index e7b4a2baa20..d1f6d4bce98 100644
--- a/src/vos/sys_db.c
+++ b/src/vos/sys_db.c
@@ -349,8 +349,9 @@ vos_db_init(const char *db_path)
 int
 vos_db_init_ex(const char *db_path, const char *db_name, bool force_create, bool destroy_db_on_fini)
 {
-	int	create;
-	int	rc;
+	ABT_mutex_attr	attr = ABT_MUTEX_ATTR_NULL;
+	int		create;
+	int		rc;
 
 	D_ASSERT(db_path != NULL);
 
@@ -373,12 +374,26 @@ vos_db_init_ex(const char *db_path, const char *db_name, bool force_create, bool
 		goto failed;
 	}
 
-	rc = ABT_mutex_create(&vos_db.db_lock);
+	rc = ABT_mutex_attr_create(&attr);
 	if (rc != ABT_SUCCESS) {
-		rc = -DER_NOMEM;
-		goto failed;
+		D_ERROR("Failed to create mutex attr: %d\n", rc);
+		D_GOTO(failed, rc = dss_abterr2der(rc));
+	}
+
+	rc = ABT_mutex_attr_set_recursive(attr, ABT_TRUE);
+	if (rc != ABT_SUCCESS) {
+		D_ERROR("Failed to set mutex attr: %d\n", rc);
+		D_GOTO(failed, rc = dss_abterr2der(rc));
 	}
 
+	rc = ABT_mutex_create_with_attr(attr, &vos_db.db_lock);
+	if (rc != ABT_SUCCESS) {
+		D_ERROR("Failed to create mutex with attr: %d\n", rc);
+		D_GOTO(failed, rc = dss_abterr2der(rc));
+	}
+
+	ABT_mutex_attr_free(&attr);
+
 	vos_db.db_poh = DAOS_HDL_INVAL;
 	vos_db.db_coh = DAOS_HDL_INVAL;
 
@@ -416,6 +431,8 @@ vos_db_init_ex(const char *db_path, const char *db_name, bool force_create, bool
 	}
 	return 0;
 failed:
+	if (attr != ABT_MUTEX_ATTR_NULL)
+		ABT_mutex_attr_free(&attr);
 	vos_db_fini();
 	return rc;
 }
diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c
index 1c60f781507..47e410fc1e7 100644
--- a/src/vos/vos_dtx.c
+++ b/src/vos/vos_dtx.c
@@ -15,11 +15,10 @@
 #include "vos_layout.h"
 #include "vos_internal.h"
 
-/* 128 KB per SCM blob */
-#define DTX_BLOB_SIZE		(1 << 17)
-/** Ensure 16-bit signed int is sufficient to store record index */
-D_CASSERT((DTX_BLOB_SIZE / sizeof(struct vos_dtx_act_ent_df)) < (1 << 15));
-D_CASSERT((DTX_BLOB_SIZE / sizeof(struct vos_dtx_cmt_ent_df)) < (1 << 15));
+/* 16 KB for each active DTX blob */
+#define DTX_ACT_BLOB_SIZE	(1 << 14)
+/* 4 KB for each committed DTX blob */
+#define DTX_CMT_BLOB_SIZE	(1 << 12)
 
 #define DTX_ACT_BLOB_MAGIC	0x14130a2b
 #define DTX_CMT_BLOB_MAGIC	0x2502191c
@@ -313,16 +312,38 @@ dtx_act_ent_update(struct btr_instance *tins, struct btr_record *rec,
 
 	if (unlikely(!dae_old->dae_aborted)) {
 		/*
-		 * XXX: There are two possible reasons for that:
-		 *
-		 *	1. Client resent the RPC but without set 'RESEND' flag.
-		 *	2. Client reused the DTX ID for different modifications.
-		 *
-		 *	Currently, the 1st case is more suspected.
+		 * If the new entry and the old entry are for the same transaction, then the RPC
+		 * for the new one will carry the 'RESEND' flag, which would have caused the old
+		 * one to be aborted before we arrive here. So it is quite possible that the new
+		 * one and the old one are for different transactions.
 		 */
-		D_ERROR("The TX ID "DF_DTI" may be reused for epoch "DF_X64" vs "DF_X64"\n",
-			DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old), DAE_EPOCH(dae_new));
-		return -DER_TX_ID_REUSED;
+		if (DAE_EPOCH(dae_old) < DAE_EPOCH(dae_new)) {
+			D_ERROR("The TX ID "DF_DTI" may be reused for epoch "DF_X64" vs "DF_X64"\n",
+				DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old), DAE_EPOCH(dae_new));
+			return -DER_TX_ID_REUSED;
+		}
+
+		/*
+		 * If the old entry has the higher epoch, it is quite possible that the resent RPC
+		 * was handled before the original RPC (corresponding to 'dae_new'). Return
+		 * -DER_INPROGRESS to make the RPC sponsor retry the RPC with the 'RESEND' flag;
+		 * the related RPC handler logic will then handle such a case.
+		 */
+		if (DAE_EPOCH(dae_old) > DAE_EPOCH(dae_new)) {
+			D_ERROR("Resent RPC may be handled before original one for DTX "DF_DTI
+				" with epoch "DF_X64" vs "DF_X64"\n",
+				DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old), DAE_EPOCH(dae_new));
+			return -DER_INPROGRESS;
+		}
+
+		/*
+		 * The two entries use the same epoch; this may be caused by repeated RPCs from
+		 * different sources, such as multiple relay engines forwarding the same RPC to
+		 * the current target. We need to notify the related caller about such a buggy
+		 * case.
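+		 *
+		 * In summary, when the old entry has not been aborted:
+		 *	DAE_EPOCH(old) <  DAE_EPOCH(new)  => -DER_TX_ID_REUSED
+		 *	DAE_EPOCH(old) >  DAE_EPOCH(new)  => -DER_INPROGRESS (retry with 'RESEND')
+		 *	DAE_EPOCH(old) == DAE_EPOCH(new)  => -DER_MISC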
+		 */
+		D_ERROR("Received repeated DTX "DF_DTI" with epoch "DF_X64"\n",
+			DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old));
+		return -DER_MISC;
 	}
 
 	rec->rec_off = umem_ptr2off(&tins->ti_umm, dae_new);
@@ -767,7 +788,7 @@ dtx_rec_release(struct vos_container *cont, struct vos_dtx_act_ent *dae,
 static int
 vos_dtx_commit_one(struct vos_container *cont, struct dtx_id *dti, daos_epoch_t epoch,
 		   daos_epoch_t cmt_time, struct vos_dtx_cmt_ent **dce_p,
-		   struct vos_dtx_act_ent **dae_p, bool *rm_cos, bool *fatal)
+		   struct vos_dtx_act_ent **dae_p, bool *rm_cos)
 {
 	struct vos_dtx_act_ent		*dae = NULL;
 	struct vos_dtx_cmt_ent		*dce = NULL;
@@ -866,10 +887,8 @@ vos_dtx_commit_one(struct vos_container *cont, struct dtx_id *dti, daos_epoch_t
 		goto out;
 
 	rc = dtx_rec_release(cont, dae, false);
-	if (rc != 0) {
-		*fatal = true;
+	if (rc != 0)
 		goto out;
-	}
 
 	D_ASSERT(dae_p != NULL);
 	*dae_p = dae;
@@ -915,7 +934,7 @@ vos_dtx_extend_act_table(struct vos_container *cont)
 	umem_off_t			 dbd_off;
 	int				 rc;
 
-	dbd_off = umem_zalloc(umm, DTX_BLOB_SIZE);
+	dbd_off = umem_zalloc(umm, DTX_ACT_BLOB_SIZE);
 	if (UMOFF_IS_NULL(dbd_off)) {
 		D_ERROR("No space when create active DTX table.\n");
 		return -DER_NOSPACE;
@@ -923,7 +942,7 @@ vos_dtx_extend_act_table(struct vos_container *cont)
 
 	dbd = umem_off2ptr(umm, dbd_off);
 	dbd->dbd_magic = DTX_ACT_BLOB_MAGIC;
-	dbd->dbd_cap = (DTX_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) /
+	dbd->dbd_cap = (DTX_ACT_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) /
 		       sizeof(struct vos_dtx_act_ent_df);
 	dbd->dbd_count = 0;
 	dbd->dbd_index = 0;
@@ -1175,16 +1194,20 @@ vos_dtx_check_availability(daos_handle_t coh, uint32_t entry,
 	}
 
 	if (intent == DAOS_INTENT_PURGE) {
-		uint32_t	age = d_hlc_age2sec(DAE_XID(dae).dti_hlc);
+		uint64_t	now = daos_gettime_coarse();
 
 		/*
 		 * The DTX entry still references related data record,
 		 * then we cannot (vos) aggregate related data record.
+		 * Report the warning at most once per 10 seconds to avoid flooding the log.
 		 */
-		if (age >= DAOS_AGG_THRESHOLD)
-			D_WARN("DTX "DF_DTI" (state:%u, age:%u) still references the data, "
-			       "cannot be (VOS) aggregated\n",
-			       DP_DTI(&DAE_XID(dae)), vos_dtx_status(dae), age);
+		if (now - cont->vc_agg_busy_ts > 10) {
+			D_WARN("DTX "DF_DTI" (state:%u, flags:%x, age:%u) still references "
+			       "the modification, cannot be (VOS) aggregated\n",
+			       DP_DTI(&DAE_XID(dae)), vos_dtx_status(dae), DAE_FLAGS(dae),
+			       (unsigned int)d_hlc_age2sec(DAE_XID(dae).dti_hlc));
+			cont->vc_agg_busy_ts = now;
+		}
 
 		return ALB_AVAILABLE_DIRTY;
 	}
@@ -1912,8 +1935,13 @@ vos_dtx_check(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch,
 			daos_epoch_t	e = *epoch;
 
 			*epoch = DAE_EPOCH(dae);
-			if (e != 0 && e != DAE_EPOCH(dae))
-				return -DER_MISMATCH;
+			if (e != 0) {
+				if (e > DAE_EPOCH(dae))
+					return -DER_MISMATCH;
+
+				if (e < DAE_EPOCH(dae))
+					return -DER_TX_RESTART;
+			}
 		}
 
 		return vos_dae_is_prepare(dae) ? 
DTX_ST_PREPARED : DTX_ST_INITED; @@ -1976,9 +2004,6 @@ vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, daos_unit_oid_t *oid, rc = -DER_INPROGRESS; } - if (rc < 0) - D_ERROR("Failed to load mbs for "DF_DTI": "DF_RC"\n", DP_DTI(dti), DP_RC(rc)); - return rc; } @@ -1994,12 +2019,11 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[], umem_off_t dbd_off; uint64_t cmt_time = daos_gettime_coarse(); int committed = 0; - int cur = 0; int rc = 0; - int rc1 = 0; + int p = 0; int i = 0; int j; - bool fatal = false; + int k; bool allocated = false; dbd = umem_off2ptr(umm, cont_df->cd_dtx_committed_tail); @@ -2017,67 +2041,64 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[], goto new_blob; again: - for (j = dbd->dbd_count; i < count && j < dbd->dbd_cap && rc1 == 0; - i++, cur++) { - struct vos_dtx_cmt_ent *dce = NULL; - - rc = vos_dtx_commit_one(cont, &dtis[cur], epoch, cmt_time, &dce, - daes != NULL ? &daes[cur] : NULL, - rm_cos != NULL ? &rm_cos[cur] : NULL, &fatal); - if (dces != NULL) - dces[cur] = dce; - - if (fatal) - goto out; - - if (rc == 0 && (daes == NULL || daes[cur] != NULL)) + for (j = dbd->dbd_count; j < dbd->dbd_cap && i < count; i++) { + rc = vos_dtx_commit_one(cont, &dtis[i], epoch, cmt_time, &dces[i], + daes != NULL ? &daes[i] : NULL, + rm_cos != NULL ? &rm_cos[i] : NULL); + if (rc == 0 && (daes == NULL || daes[i] != NULL)) committed++; if (rc == -DER_ALREADY || rc == -DER_NONEXIST) rc = 0; - if (rc1 == 0) - rc1 = rc; + if (rc != 0) + goto out; + + if (dces[i] != NULL) + j++; + } + + if (j > dbd->dbd_count) { + if (!allocated) { + rc = umem_tx_xadd_ptr(umm, &dbd->dbd_committed_data[dbd->dbd_count], + sizeof(struct vos_dtx_cmt_ent_df) * + (j - dbd->dbd_count), UMEM_XADD_NO_SNAPSHOT); + if (rc != 0) + goto out; - if (dce != NULL) { - rc = umem_tx_xadd_ptr(umm, &dbd->dbd_committed_data[j], - sizeof(struct vos_dtx_cmt_ent_df), - UMEM_XADD_NO_SNAPSHOT); + /* Only need to add range for the first partial blob. */ + rc = umem_tx_add_ptr(umm, &dbd->dbd_count, sizeof(dbd->dbd_count)); if (rc != 0) - D_GOTO(out, fatal = true); + goto out; + } - memcpy(&dbd->dbd_committed_data[j++], &dce->dce_base, + for (k = dbd->dbd_count; k < j; k++, p++) { + while (dces[p] == NULL) + p++; + + memcpy(&dbd->dbd_committed_data[k], &dces[p]->dce_base, sizeof(struct vos_dtx_cmt_ent_df)); } - } - if (!allocated) { - /* Only need to add range for the first partial blob. 
*/ - rc = umem_tx_add_ptr(umm, &dbd->dbd_count, - sizeof(dbd->dbd_count)); - if (rc != 0) - D_GOTO(out, fatal = true); + dbd->dbd_count = j; } - dbd->dbd_count = j; - - if (i == count || rc1 != 0) + if (i == count) goto out; new_blob: dbd_prev = dbd; /* Need new @dbd */ - dbd_off = umem_zalloc(umm, DTX_BLOB_SIZE); + dbd_off = umem_zalloc(umm, DTX_CMT_BLOB_SIZE); if (UMOFF_IS_NULL(dbd_off)) { D_ERROR("No space to store committed DTX %d "DF_DTI"\n", - count, DP_DTI(&dtis[cur])); - fatal = true; + count, DP_DTI(&dtis[i])); D_GOTO(out, rc = -DER_NOSPACE); } dbd = umem_off2ptr(umm, dbd_off); dbd->dbd_magic = DTX_CMT_BLOB_MAGIC; - dbd->dbd_cap = (DTX_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) / + dbd->dbd_cap = (DTX_CMT_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) / sizeof(struct vos_dtx_cmt_ent_df); dbd->dbd_prev = umem_ptr2off(umm, dbd_prev); @@ -2090,21 +2111,21 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[], sizeof(cont_df->cd_dtx_committed_head) + sizeof(cont_df->cd_dtx_committed_tail)); if (rc != 0) - D_GOTO(out, fatal = true); + goto out; cont_df->cd_dtx_committed_head = dbd_off; } else { rc = umem_tx_add_ptr(umm, &dbd_prev->dbd_next, sizeof(dbd_prev->dbd_next)); if (rc != 0) - D_GOTO(out, fatal = true); + goto out; dbd_prev->dbd_next = dbd_off; rc = umem_tx_add_ptr(umm, &cont_df->cd_dtx_committed_tail, sizeof(cont_df->cd_dtx_committed_tail)); if (rc != 0) - D_GOTO(out, fatal = true); + goto out; } D_DEBUG(DB_IO, "Allocated DTX committed blob %p ("UMOFF_PF") for cont "DF_UUID"\n", @@ -2115,7 +2136,7 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[], goto again; out: - return fatal ? rc : (committed > 0 ? committed : rc1); + return rc < 0 ? rc : committed; } void diff --git a/src/vos/vos_internal.h b/src/vos/vos_internal.h index 7d4dd3ac166..5aa0a1fadbb 100644 --- a/src/vos/vos_internal.h +++ b/src/vos/vos_internal.h @@ -353,6 +353,8 @@ struct vos_container { daos_epoch_range_t vc_epr_aggregation; /* Current ongoing discard EPR */ daos_epoch_range_t vc_epr_discard; + /* Last timestamp when VOS aggregation reports -DER_TX_BUSY */ + uint64_t vc_agg_busy_ts; /* Last timestamp when VOS aggregation reporting ENOSPACE */ uint64_t vc_agg_nospc_ts; /* Last timestamp when IO reporting ENOSPACE */ diff --git a/src/vos/vos_obj_index.c b/src/vos/vos_obj_index.c index ea47cf4454c..58699453f40 100644 --- a/src/vos/vos_obj_index.c +++ b/src/vos/vos_obj_index.c @@ -791,13 +791,14 @@ oi_iter_process(struct vos_iterator *iter, vos_iter_proc_op_t op, void *args) int oi_iter_check_punch(daos_handle_t ih) { - struct vos_iterator *iter = vos_hdl2iter(ih); - struct vos_oi_iter *oiter = iter2oiter(iter); - struct vos_obj_df *obj; - struct oi_delete_arg del_arg; - daos_unit_oid_t oid; - d_iov_t rec_iov; - int rc; + struct vos_iterator *iter = vos_hdl2iter(ih); + struct vos_oi_iter *oiter = iter2oiter(iter); + struct vos_container *cont = oiter->oit_cont; + struct vos_obj_df *obj; + struct oi_delete_arg del_arg; + daos_unit_oid_t oid; + d_iov_t rec_iov; + int rc; D_ASSERT(iter->it_type == VOS_ITER_OBJ); @@ -811,10 +812,22 @@ oi_iter_check_punch(daos_handle_t ih) obj = (struct vos_obj_df *)rec_iov.iov_buf; oid = obj->vo_id; - if (!vos_ilog_is_punched(vos_cont2hdl(oiter->oit_cont), &obj->vo_ilog, &oiter->oit_epr, - NULL, &oiter->oit_ilog_info)) + if (!vos_ilog_is_punched(vos_cont2hdl(cont), &obj->vo_ilog, &oiter->oit_epr, NULL, + &oiter->oit_ilog_info)) return 0; + rc = vos_obj_hold(vos_obj_cache_current(cont->vc_pool->vp_sysdb), cont, oid, + &oiter->oit_epr, 
iter->it_bound, VOS_OBJ_AGGREGATE | VOS_OBJ_NO_HOLD,
+			  DAOS_INTENT_PURGE, NULL, NULL);
+	if (rc != 0) {
+		/** -DER_BUSY means the object is in use already. We will retry after a yield
+		 *  in this case.
+		 */
+		D_CDEBUG(rc == -DER_BUSY, DB_EPC, DLOG_ERR, "Hold check failed for " DF_UOID "\n",
+			 DP_UOID(oid));
+		return rc;
+	}
+
 	/** Ok, ilog is fully punched, so we can move it to gc heap */
 	rc = umem_tx_begin(vos_cont2umm(oiter->oit_cont), NULL);
 	if (rc != 0)
diff --git a/utils/build.config b/utils/build.config
index 4d770e6c924..a14ad039c15 100644
--- a/utils/build.config
+++ b/utils/build.config
@@ -8,8 +8,8 @@ pmdk=2.1.0
 isal=v2.30.0
 isal_crypto=v2.23.0
 spdk=v22.01.2
-ofi=v1.19.1
-mercury=v2.4.0rc5
+ofi=v1.22.0
+mercury=v2.4.0
 protobufc=v1.3.3
 ucx=v1.14.1
 
@@ -27,7 +27,6 @@ ucx=https://github.com/openucx/ucx.git
 [patch_versions]
 spdk=https://github.com/spdk/spdk/commit/b0aba3fcd5aceceea530a702922153bc75664978.diff,https://github.com/spdk/spdk/commit/445a4c808badbad3942696ecf16fa60e8129a747.diff
-ofi=https://github.com/ofiwg/libfabric/commit/d827c6484cc5bf67dfbe395890e258860c3f0979.diff
-mercury=https://raw.githubusercontent.com/daos-stack/mercury/f3dc286fb40ec1a3a38a2e17c45497bc2aa6290d/na_ucx.patch
 fuse=https://github.com/libfuse/libfuse/commit/c9905341ea34ff9acbc11b3c53ba8bcea35eeed8.diff
+mercury=https://raw.githubusercontent.com/daos-stack/mercury/f3dc286fb40ec1a3a38a2e17c45497bc2aa6290d/na_ucx.patch
 pmdk=https://github.com/pmem/pmdk/commit/2abe15ac0b4eed894b6768cd82a3b0a7c4336284.diff
diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec
index e58af4e6d66..59376d93d40 100644
--- a/utils/rpms/daos.spec
+++ b/utils/rpms/daos.spec
@@ -14,8 +14,8 @@
 %endif
 
 Name:          daos
-Version:       2.6.1
-Release:       4%{?relval}%{?dist}
+Version:       2.6.2
+Release:       1%{?relval}%{?dist}
 Summary:       DAOS Storage Engine
 
 License:       BSD-2-Clause-Patent
@@ -602,6 +602,9 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent
 
 %changelog
+* Tue Nov 05 2024 Phillip Henderson 2.6.2-1
+- First test build for 2.6.2
+
 * Wed Oct 02 2024 Tomasz Gromadzki 2.6.1-4
 - Add support of the PMDK package 2.1.0 with NDCTL enabled.
* Increase the default ULT stack size to 20KiB if the engine uses diff --git a/utils/trivy/.trivyignore b/utils/trivy/.trivyignore new file mode 100644 index 00000000000..3a3b4cff1ce --- /dev/null +++ b/utils/trivy/.trivyignore @@ -0,0 +1,27 @@ +## Ignored hadoop 3.3.6 related CVE +## CVE-2023-52428,MEDIUM,,"Denial of Service in Connect2id Nimbus JOSE+JWT","com.nimbusds:nimbus-jose-jwt","9.8.1","9.37.2",https://avd.aquasec.com/nvd/cve-2023-52428 +CVE-2023-52428 +## CVE-2023-39410,HIGH,7.5,"apache-avro: Apache Avro Java SDK: Memory when deserializing untrusted data in Avro Java SDK","org.apache.avro:avro","1.7.7","1.11.3",https://avd.aquasec.com/nvd/cve-2023-39410 +CVE-2023-39410 +## CVE-2024-25710,HIGH,5.5,"commons-compress: Denial of service caused by an infinite loop for a corrupted DUMP file","org.apache.commons:commons-compress","1.21","1.26.0",https://avd.aquasec.com/nvd/cve-2024-25710 +CVE-2024-25710 +## CVE-2024-26308,HIGH,5.5,"commons-compress: OutOfMemoryError unpacking broken Pack200 file","org.apache.commons:commons-compress","1.21","1.26.0",https://avd.aquasec.com/nvd/cve-2024-26308 +CVE-2024-26308 +## CVE-2024-29131,MEDIUM,,"commons-configuration: StackOverflowError adding property in AbstractListDelimiterHandler.flattenIterator()","org.apache.commons:commons-configuration2","2.8.0","2.10.1",https://avd.aquasec.com/nvd/cve-2024-29131 +CVE-2024-29131 +## CVE-2024-29133,MEDIUM,,"commons-configuration: StackOverflowError calling ListDelimiterHandler.flatten(Object, int) with a cyclical object tree","org.apache.commons:commons-configuration2","2.8.0","2.10.1",https://avd.aquasec.com/nvd/cve-2024-29133 +CVE-2024-29133 +## CVE-2024-25638,HIGH,,"dnsjava: Improper response validation allowing DNSSEC bypass","dnsjava:dnsjava","2.1.7","3.6.0",https://avd.aquasec.com/nvd/cve-2024-25638 +CVE-2024-25638 + +## Ignored hadoop 3.4.0 related CVE +## CVE-2024-47561,CRITICAL,,"apache-avro: Schema parsing may trigger Remote Code Execution (RCE)","org.apache.avro:avro","1.9.2","1.11.4",https://avd.aquasec.com/nvd/cve-2024-47561 +CVE-2024-47561 +## CVE-2023-33201,MEDIUM,5.3,"bouncycastle: potential blind LDAP injection attack using a self-signed certificate","org.bouncycastle:bcprov-jdk15on","1.70","",https://avd.aquasec.com/nvd/cve-2023-33201 +CVE-2023-33201 +## CVE-2024-29857,MEDIUM,,"org.bouncycastle: Importing an EC certificate with crafted F2m parameters may lead to Denial of Service","org.bouncycastle:bcprov-jdk15on","1.70","1.78",https://avd.aquasec.com/nvd/cve-2024-29857 +CVE-2024-29857 +## CVE-2024-30171,MEDIUM,,"bc-java: BouncyCastle vulnerable to a timing variant of Bleichenbacher (Marvin Attack)","org.bouncycastle:bcprov-jdk15on","1.70","1.78",https://avd.aquasec.com/nvd/cve-2024-30171 +CVE-2024-30171 +## CVE-2024-30172,MEDIUM,,"org.bouncycastle:bcprov-jdk18on: Infinite loop in ED25519 verification in the ScalarUtil class","org.bouncycastle:bcprov-jdk15on","1.70","1.78",https://avd.aquasec.com/nvd/cve-2024-30172 +CVE-2024-30172 diff --git a/utils/trivy/csv.tpl b/utils/trivy/csv.tpl new file mode 100644 index 00000000000..d3bcafcef5e --- /dev/null +++ b/utils/trivy/csv.tpl @@ -0,0 +1,29 @@ +{{ range . 
}} +Trivy Vulnerability Scan Results ({{- .Target -}}) +VulnerabilityID,Severity,CVSS Score,Title,Library,Vulnerable Version,Fixed Version,Information URL,Triage Information +{{ range .Vulnerabilities }} + {{- .VulnerabilityID }}, + {{- .Severity }}, + {{- range $key, $value := .CVSS }} + {{- if (eq $key "nvd") }} + {{- .V3Score -}} + {{- end }} + {{- end }}, + {{- quote .Title }}, + {{- quote .PkgName }}, + {{- quote .InstalledVersion }}, + {{- quote .FixedVersion }}, + {{- .PrimaryURL }} +{{ else -}} + No vulnerabilities found at this time. +{{ end }} +Trivy Dependency Scan Results ({{ .Target }}) +ID,Name,Version,Notes +{{ range .Packages -}} + {{- quote .ID }}, + {{- quote .Name }}, + {{- quote .Version }} +{{ else -}} + No dependencies found at this time. +{{ end }} +{{ end }} \ No newline at end of file diff --git a/utils/trivy/trivy.yaml b/utils/trivy/trivy.yaml new file mode 100644 index 00000000000..cfb13b5c40f --- /dev/null +++ b/utils/trivy/trivy.yaml @@ -0,0 +1,249 @@ +cache: + backend: fs + dir: + redis: + ca: "" + cert: "" + key: "" + tls: false + ttl: 0s +config: trivy.yaml +db: + download-java-only: false + download-only: false + java-repository: ghcr.io/aquasecurity/trivy-java-db + java-skip-update: false + no-progress: false + repository: ghcr.io/aquasecurity/trivy-db + skip-update: false +debug: false +dependency-tree: true +exit-code: 0 +generate-default-config: false +ignore-policy: "" +ignorefile: ./utils/trivy/.trivyignore +include-dev-deps: false +insecure: false +license: + confidencelevel: "0.9" + forbidden: + - AGPL-1.0 + - AGPL-3.0 + - CC-BY-NC-1.0 + - CC-BY-NC-2.0 + - CC-BY-NC-2.5 + - CC-BY-NC-3.0 + - CC-BY-NC-4.0 + - CC-BY-NC-ND-1.0 + - CC-BY-NC-ND-2.0 + - CC-BY-NC-ND-2.5 + - CC-BY-NC-ND-3.0 + - CC-BY-NC-ND-4.0 + - CC-BY-NC-SA-1.0 + - CC-BY-NC-SA-2.0 + - CC-BY-NC-SA-2.5 + - CC-BY-NC-SA-3.0 + - CC-BY-NC-SA-4.0 + - Commons-Clause + - Facebook-2-Clause + - Facebook-3-Clause + - Facebook-Examples + - WTFPL + full: false + ignored: [] + notice: + - AFL-1.1 + - AFL-1.2 + - AFL-2.0 + - AFL-2.1 + - AFL-3.0 + - Apache-1.0 + - Apache-1.1 + - Apache-2.0 + - Artistic-1.0-cl8 + - Artistic-1.0-Perl + - Artistic-1.0 + - Artistic-2.0 + - BSL-1.0 + - BSD-2-Clause-FreeBSD + - BSD-2-Clause-NetBSD + - BSD-2-Clause + - BSD-3-Clause-Attribution + - BSD-3-Clause-Clear + - BSD-3-Clause-LBNL + - BSD-3-Clause + - BSD-4-Clause + - BSD-4-Clause-UC + - BSD-Protection + - CC-BY-1.0 + - CC-BY-2.0 + - CC-BY-2.5 + - CC-BY-3.0 + - CC-BY-4.0 + - FTL + - ISC + - ImageMagick + - Libpng + - Lil-1.0 + - Linux-OpenIB + - LPL-1.02 + - LPL-1.0 + - MS-PL + - MIT + - NCSA + - OpenSSL + - PHP-3.01 + - PHP-3.0 + - PIL + - Python-2.0 + - Python-2.0-complete + - PostgreSQL + - SGI-B-1.0 + - SGI-B-1.1 + - SGI-B-2.0 + - Unicode-DFS-2015 + - Unicode-DFS-2016 + - Unicode-TOU + - UPL-1.0 + - W3C-19980720 + - W3C-20150513 + - W3C + - X11 + - Xnet + - Zend-2.0 + - zlib-acknowledgement + - Zlib + - ZPL-1.1 + - ZPL-2.0 + - ZPL-2.1 + permissive: [] + reciprocal: + - APSL-1.0 + - APSL-1.1 + - APSL-1.2 + - APSL-2.0 + - CDDL-1.0 + - CDDL-1.1 + - CPL-1.0 + - EPL-1.0 + - EPL-2.0 + - FreeImage + - IPL-1.0 + - MPL-1.0 + - MPL-1.1 + - MPL-2.0 + - Ruby + restricted: + - BCL + - CC-BY-ND-1.0 + - CC-BY-ND-2.0 + - CC-BY-ND-2.5 + - CC-BY-ND-3.0 + - CC-BY-ND-4.0 + - CC-BY-SA-1.0 + - CC-BY-SA-2.0 + - CC-BY-SA-2.5 + - CC-BY-SA-3.0 + - CC-BY-SA-4.0 + - GPL-1.0 + - GPL-2.0 + - GPL-2.0-with-autoconf-exception + - GPL-2.0-with-bison-exception + - GPL-2.0-with-classpath-exception + - GPL-2.0-with-font-exception + - 
GPL-2.0-with-GCC-exception + - GPL-3.0 + - GPL-3.0-with-autoconf-exception + - GPL-3.0-with-GCC-exception + - LGPL-2.0 + - LGPL-2.1 + - LGPL-3.0 + - NPL-1.0 + - NPL-1.1 + - OSL-1.0 + - OSL-1.1 + - OSL-2.0 + - OSL-2.1 + - OSL-3.0 + - QPL-1.0 + - Sleepycat + unencumbered: + - CC0-1.0 + - Unlicense + - 0BSD +list-all-pkgs: false +misconfiguration: + cloudformation: + params: [] + helm: + set: [] + set-file: [] + set-string: [] + values: [] + include-non-failures: false + check-bundle-repository: ghcr.io/aquasecurity/trivy-policies:0 + # scanners: + # - azure-arm + # - cloudformation + # - dockerfile + # - helm + # - kubernetes + # - terraform + # - terraformplan + terraform: + exclude-downloaded-modules: false + vars: [] +module: + dir: + enable-modules: [] +output: "trivy-report-daos.txt" +format: template +template: '@./utils/trivy/csv.tpl' +output-plugin-arg: "" +quiet: false +registry: + password: [] + token: "" + username: [] +rego: + data: [] + namespaces: [] + policy: [] + skip-policy-update: false + trace: false +report: all +scan: + compliance: "" + file-patterns: [] + offline: false + parallel: 1 + rekor-url: https://rekor.sigstore.dev + sbom-sources: [] + scanners: + - vuln + - secret + # ignore all hadoop dependencies + skip-dirs: + ./src/client/java/hadoop-daos + skip-files: [] + show-suppressed: true +secret: + config: trivy-secret.yaml +server: + addr: "" + custom-headers: [] + token: "" + token-header: Trivy-Token +severity: + - UNKNOWN + - MEDIUM + - HIGH + - CRITICAL +timeout: 5m0s +version: false +vulnerability: + ignore-status: [] + ignore-unfixed: false + type: + - os + - library
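+# To reproduce the CI scan locally (assuming a recent trivy binary is installed),
+# run from the repository root:
+#   trivy fs --config utils/trivy/trivy.yaml .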