From 76cfb41b852f94859a32caaa7b3b379f069d0881 Mon Sep 17 00:00:00 2001
From: wiliamhuang <lei.huang@intel.com>
Date: Tue, 15 Oct 2024 10:18:18 -0500
Subject: [PATCH 01/34] DAOS-16556 client: call fstat() before mmap() to update
 file status in kernel (#15304)

Signed-off-by: Lei Huang <lei.huang@intel.com>
---
 src/client/dfuse/pil4dfs/int_dfs.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/client/dfuse/pil4dfs/int_dfs.c b/src/client/dfuse/pil4dfs/int_dfs.c
index fff49d9f0ae..3087cb09c75 100644
--- a/src/client/dfuse/pil4dfs/int_dfs.c
+++ b/src/client/dfuse/pil4dfs/int_dfs.c
@@ -6224,6 +6224,14 @@ new_mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset)
 
 	atomic_fetch_add_relaxed(&num_mmap, 1);
 
+	if ((fd < FD_FILE_BASE) && (fd_directed >= FD_FILE_BASE) && d_compatible_mode) {
+		/* DAOS-14494: Force the kernel to update the size before mapping. */
+		rc = next_fxstat(1, fd, &stat_buf);
+		if (rc == -1)
+			return MAP_FAILED;
+		return next_mmap(addr, length, prot, flags, fd, offset);
+	}
+
 	addr_ret = next_mmap(addr, length, prot, flags | MAP_ANONYMOUS, -1, offset);
 	if (addr_ret == MAP_FAILED)
 		return MAP_FAILED;
@@ -6842,10 +6850,8 @@ init_myhook(void)
 
 	register_a_hook("libc", "fcntl", (void *)new_fcntl, (long int *)(&libc_fcntl));
 
-	if (d_compatible_mode == false) {
-		register_a_hook("libc", "mmap", (void *)new_mmap, (long int *)(&next_mmap));
-		register_a_hook("libc", "munmap", (void *)new_munmap, (long int *)(&next_munmap));
-	}
+	register_a_hook("libc", "mmap", (void *)new_mmap, (long int *)(&next_mmap));
+	register_a_hook("libc", "munmap", (void *)new_munmap, (long int *)(&next_munmap));
 
 	register_a_hook("libc", "exit", (void *)new_exit, (long int *)(&next_exit));
 	register_a_hook("libc", "dup3", (void *)new_dup3, (long int *)(&libc_dup3));

From b4eb68912b13345905efc98bdf2ed9b1c948113d Mon Sep 17 00:00:00 2001
From: Makito Kano <makito.kano@intel.com>
Date: Wed, 16 Oct 2024 09:40:29 +0900
Subject: [PATCH 02/34] =?UTF-8?q?DAOS-16446=20test:=20HDF5-VOL=20test=20-?=
 =?UTF-8?q?=20Set=20object=20class=20and=20container=20prope=E2=80=A6=20(#?=
 =?UTF-8?q?15004)=20(#15098)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In HDF5, DFS, MPIIO, or POSIX, object class and container properties are defined
during the container create. If it’s DFS, object class is also set to the IOR
parameter. However, in HDF5-VOL, object class and container properties are
defined with the following environment variables of mpirun.

HDF5_DAOS_OBJ_CLASS (Object class)
HDF5_DAOS_FILE_PROP (Container properties)

The infrastructure to set these variables are already there in run_ior_with_pool().
In file_count_test_base.py, pass in the env vars to run_ior_with_pool(env=env) as a
dictionary. Object class is the oclass variable. Container properties can be
obtained from self.container.properties.value.

This fix is discussed in PR #14964.

Signed-off-by: Makito Kano <makito.kano@intel.com>
---
 src/tests/ftest/util/file_count_test_base.py | 31 ++++++++++++++------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/src/tests/ftest/util/file_count_test_base.py b/src/tests/ftest/util/file_count_test_base.py
index 12c66d76b8c..f95e22bd4bd 100644
--- a/src/tests/ftest/util/file_count_test_base.py
+++ b/src/tests/ftest/util/file_count_test_base.py
@@ -17,15 +17,15 @@ class FileCountTestBase(IorTestBase, MdtestBase):
     :avocado: recursive
     """
 
-    def add_containers(self, file_oclass=None, dir_oclass=None):
-        """Create a list of containers that the various jobs use for storage.
+    def get_file_write_container(self, file_oclass=None, dir_oclass=None):
+        """Create a container, set oclass, dir_oclass, and add rd_fac property based on oclass.
 
         Args:
-            file_oclass (str, optional): file object class of container.
-                                         Defaults to None.
-            dir_oclass (str, optional): dir object class of container.
-                                        Defaults to None.
+            file_oclass (str, optional): file object class of container. Defaults to None.
+            dir_oclass (str, optional): dir object class of container. Defaults to None.
 
+        Returns:
+            TestContainer: Created container with oclass, dir_oclass, and rd_fac set.
 
         """
         # Create a container and add it to the overall list of containers
@@ -92,7 +92,7 @@ def run_file_count(self):
                     rd_fac = extract_redundancy_factor(oclass)
                     dir_oclass = self.get_diroclass(rd_fac)
                     self.mdtest_cmd.dfs_dir_oclass.update(dir_oclass)
-                    self.container = self.add_containers(oclass, dir_oclass)
+                    self.container = self.get_file_write_container(oclass, dir_oclass)
                     try:
                         self.processes = mdtest_np
                         self.ppn = mdtest_ppn
@@ -111,14 +111,27 @@ def run_file_count(self):
                 # run ior
                 self.log.info("=======>>>Starting IOR with %s and %s", api, oclass)
                 self.ior_cmd.dfs_oclass.update(oclass)
-                self.container = self.add_containers(oclass)
+                self.container = self.get_file_write_container(oclass)
                 self.update_ior_cmd_with_pool(False)
                 try:
                     self.processes = ior_np
                     self.ppn = ior_ppn
                     if api == 'HDF5-VOL':
+                        # Format the container properties so that it works with HDF5-VOL env var.
+                        # Each entry:value pair needs to be separated by a semicolon. Since we're
+                        # using this in the mpirun command, semicolon would indicate the end of the
+                        # command, so quote the whole thing.
+                        cont_props = self.container.properties.value
+                        cont_props_hdf5_vol = '"' + cont_props.replace(",", ";") + '"'
+                        self.log.info("cont_props_hdf5_vol = %s", cont_props_hdf5_vol)
+                        env = self.ior_cmd.env.copy()
+                        env.update({
+                            "HDF5_DAOS_OBJ_CLASS": oclass,
+                            "HDF5_DAOS_FILE_PROP": cont_props_hdf5_vol
+                        })
                         self.ior_cmd.api.update('HDF5')
-                        self.run_ior_with_pool(create_pool=False, plugin_path=hdf5_plugin_path)
+                        self.run_ior_with_pool(
+                            create_pool=False, plugin_path=hdf5_plugin_path, env=env)
                     elif self.ior_cmd.api.value == 'POSIX':
                         self.run_ior_with_pool(create_pool=False, intercept=intercept)
                     else:

From 6e16c8e21bcc1cb5b29c62aca45e309201c0d9ca Mon Sep 17 00:00:00 2001
From: Tomasz Gromadzki <tomasz.gromadzki@intel.com>
Date: Wed, 16 Oct 2024 02:43:26 +0200
Subject: [PATCH 03/34] DAOS-16673 common: ignore Hadoop 3.4.0 related CVE
 (#15320)

Hadoope 3.4.0 has resolved a few CVE issues but introduces new

+ enable Trivy scans on release branch
+ enable on demand scan and scan on final PR merge.

Signed-off-by: Tomasz Gromadzki <tomasz.gromadzki@intel.com>
---
 .github/workflows/trivy.yml |  70 ++++++++++
 utils/trivy/.trivyignore    |  27 ++++
 utils/trivy/csv.tpl         |  29 +++++
 utils/trivy/trivy.yaml      | 249 ++++++++++++++++++++++++++++++++++++
 4 files changed, 375 insertions(+)
 create mode 100644 .github/workflows/trivy.yml
 create mode 100644 utils/trivy/.trivyignore
 create mode 100644 utils/trivy/csv.tpl
 create mode 100644 utils/trivy/trivy.yaml

diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml
new file mode 100644
index 00000000000..8f5524d4513
--- /dev/null
+++ b/.github/workflows/trivy.yml
@@ -0,0 +1,70 @@
+name: Trivy scan
+
+on:
+  workflow_dispatch:
+  push:
+    branches: ["master", "release/**"]
+  pull_request:
+    branches: ["master", "release/**"]
+
+# Declare default permissions as nothing.
+permissions: {}
+
+jobs:
+  build:
+    name: Build
+    runs-on: ubuntu-20.04
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11  # v4.1.1
+
+      - name: Run Trivy vulnerability scanner in repo mode
+        uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8  # 0.24.0
+        with:
+          scan-type: 'fs'
+          scan-ref: '.'
+          trivy-config: 'utils/trivy/trivy.yaml'
+
+      - name: Prepare the report to be uploaded to the GitHub artifact store
+        run: |
+          mkdir report
+          cp trivy-report-daos.txt report
+          cp utils/trivy/.trivyignore report/trivyignore.txt
+
+      - name: Upload the report to the GitHub artifact store
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808  # v4.3.3
+        with:
+          path: report/*
+          name: trivy-report-daos
+
+      - name: Adjust config file to use sarif format
+        run: |
+          sed -i 's/output: "trivy-report-daos.txt"/output: "trivy-results.sarif"/g' \
+            utils/trivy/trivy.yaml
+          sed -i 's/format: template/format: sarif/g' utils/trivy/trivy.yaml
+
+      - name: Run Trivy vulnerability scanner in repo mode
+        uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8  # 0.24.0
+        with:
+          scan-type: 'fs'
+          scan-ref: '.'
+          trivy-config: 'utils/trivy/trivy.yaml'
+
+      - name: Upload Trivy scan results to GitHub Security tab
+        uses: github/codeql-action/upload-sarif@afb54ba388a7dca6ecae48f608c4ff05ff4cc77a
+        # 3.25.15 (v3)
+        with:
+          sarif_file: 'trivy-results.sarif'
+
+      - name: Adjust config file to show and validate scan results
+        run: |
+          sed -i 's/output: "trivy-results.sarif"//g' utils/trivy/trivy.yaml
+          sed -i 's/format: sarif/format: table/g' utils/trivy/trivy.yaml
+          sed -i 's/exit-code: 0/exit-code: 1/g' utils/trivy/trivy.yaml
+
+      - name: Run Trivy vulnerability scanner in repo mode
+        uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8  # 0.24.0
+        with:
+          scan-type: 'fs'
+          scan-ref: '.'
+          trivy-config: 'utils/trivy/trivy.yaml'
diff --git a/utils/trivy/.trivyignore b/utils/trivy/.trivyignore
new file mode 100644
index 00000000000..3a3b4cff1ce
--- /dev/null
+++ b/utils/trivy/.trivyignore
@@ -0,0 +1,27 @@
+## Ignored hadoop 3.3.6 related CVE
+## CVE-2023-52428,MEDIUM,,"Denial of Service in Connect2id Nimbus JOSE+JWT","com.nimbusds:nimbus-jose-jwt","9.8.1","9.37.2",https://avd.aquasec.com/nvd/cve-2023-52428
+CVE-2023-52428
+## CVE-2023-39410,HIGH,7.5,"apache-avro: Apache Avro Java SDK: Memory when deserializing untrusted data in Avro Java SDK","org.apache.avro:avro","1.7.7","1.11.3",https://avd.aquasec.com/nvd/cve-2023-39410
+CVE-2023-39410
+## CVE-2024-25710,HIGH,5.5,"commons-compress: Denial of service caused by an infinite loop for a corrupted DUMP file","org.apache.commons:commons-compress","1.21","1.26.0",https://avd.aquasec.com/nvd/cve-2024-25710
+CVE-2024-25710
+## CVE-2024-26308,HIGH,5.5,"commons-compress: OutOfMemoryError unpacking broken Pack200 file","org.apache.commons:commons-compress","1.21","1.26.0",https://avd.aquasec.com/nvd/cve-2024-26308
+CVE-2024-26308
+## CVE-2024-29131,MEDIUM,,"commons-configuration: StackOverflowError adding property in AbstractListDelimiterHandler.flattenIterator()","org.apache.commons:commons-configuration2","2.8.0","2.10.1",https://avd.aquasec.com/nvd/cve-2024-29131
+CVE-2024-29131
+## CVE-2024-29133,MEDIUM,,"commons-configuration: StackOverflowError calling ListDelimiterHandler.flatten(Object, int) with a cyclical object tree","org.apache.commons:commons-configuration2","2.8.0","2.10.1",https://avd.aquasec.com/nvd/cve-2024-29133
+CVE-2024-29133
+## CVE-2024-25638,HIGH,,"dnsjava: Improper response validation allowing DNSSEC bypass","dnsjava:dnsjava","2.1.7","3.6.0",https://avd.aquasec.com/nvd/cve-2024-25638
+CVE-2024-25638
+
+## Ignored hadoop 3.4.0 related CVE
+## CVE-2024-47561,CRITICAL,,"apache-avro: Schema parsing may trigger Remote Code Execution (RCE)","org.apache.avro:avro","1.9.2","1.11.4",https://avd.aquasec.com/nvd/cve-2024-47561
+CVE-2024-47561
+## CVE-2023-33201,MEDIUM,5.3,"bouncycastle: potential  blind LDAP injection attack using a self-signed certificate","org.bouncycastle:bcprov-jdk15on","1.70","",https://avd.aquasec.com/nvd/cve-2023-33201
+CVE-2023-33201
+## CVE-2024-29857,MEDIUM,,"org.bouncycastle: Importing an EC certificate with crafted F2m parameters may lead to Denial of Service","org.bouncycastle:bcprov-jdk15on","1.70","1.78",https://avd.aquasec.com/nvd/cve-2024-29857
+CVE-2024-29857
+## CVE-2024-30171,MEDIUM,,"bc-java: BouncyCastle vulnerable to a timing variant of Bleichenbacher (Marvin Attack)","org.bouncycastle:bcprov-jdk15on","1.70","1.78",https://avd.aquasec.com/nvd/cve-2024-30171
+CVE-2024-30171
+## CVE-2024-30172,MEDIUM,,"org.bouncycastle:bcprov-jdk18on: Infinite loop in ED25519 verification in the ScalarUtil class","org.bouncycastle:bcprov-jdk15on","1.70","1.78",https://avd.aquasec.com/nvd/cve-2024-30172
+CVE-2024-30172
diff --git a/utils/trivy/csv.tpl b/utils/trivy/csv.tpl
new file mode 100644
index 00000000000..d3bcafcef5e
--- /dev/null
+++ b/utils/trivy/csv.tpl
@@ -0,0 +1,29 @@
+{{ range . }}
+Trivy Vulnerability Scan Results ({{- .Target -}})
+VulnerabilityID,Severity,CVSS Score,Title,Library,Vulnerable Version,Fixed Version,Information URL,Triage Information
+{{ range .Vulnerabilities }}
+    {{- .VulnerabilityID }},
+    {{- .Severity }},
+    {{- range $key, $value := .CVSS }}
+        {{- if (eq $key "nvd") }}
+            {{- .V3Score -}}
+        {{- end }}
+    {{- end }},
+    {{- quote .Title }},
+    {{- quote .PkgName }},
+    {{- quote .InstalledVersion }},
+    {{- quote .FixedVersion }},
+    {{- .PrimaryURL }}
+{{ else -}}
+    No vulnerabilities found at this time.
+{{ end }}
+Trivy Dependency Scan Results ({{ .Target }})
+ID,Name,Version,Notes
+{{ range .Packages -}}
+    {{- quote .ID }},
+    {{- quote .Name }},
+    {{- quote .Version }}
+{{ else -}}
+    No dependencies found at this time.
+{{ end }}
+{{ end }}
\ No newline at end of file
diff --git a/utils/trivy/trivy.yaml b/utils/trivy/trivy.yaml
new file mode 100644
index 00000000000..cfb13b5c40f
--- /dev/null
+++ b/utils/trivy/trivy.yaml
@@ -0,0 +1,249 @@
+cache:
+  backend: fs
+  dir:
+  redis:
+    ca: ""
+    cert: ""
+    key: ""
+    tls: false
+    ttl: 0s
+config: trivy.yaml
+db:
+  download-java-only: false
+  download-only: false
+  java-repository: ghcr.io/aquasecurity/trivy-java-db
+  java-skip-update: false
+  no-progress: false
+  repository: ghcr.io/aquasecurity/trivy-db
+  skip-update: false
+debug: false
+dependency-tree: true
+exit-code: 0
+generate-default-config: false
+ignore-policy: ""
+ignorefile: ./utils/trivy/.trivyignore
+include-dev-deps: false
+insecure: false
+license:
+  confidencelevel: "0.9"
+  forbidden:
+    - AGPL-1.0
+    - AGPL-3.0
+    - CC-BY-NC-1.0
+    - CC-BY-NC-2.0
+    - CC-BY-NC-2.5
+    - CC-BY-NC-3.0
+    - CC-BY-NC-4.0
+    - CC-BY-NC-ND-1.0
+    - CC-BY-NC-ND-2.0
+    - CC-BY-NC-ND-2.5
+    - CC-BY-NC-ND-3.0
+    - CC-BY-NC-ND-4.0
+    - CC-BY-NC-SA-1.0
+    - CC-BY-NC-SA-2.0
+    - CC-BY-NC-SA-2.5
+    - CC-BY-NC-SA-3.0
+    - CC-BY-NC-SA-4.0
+    - Commons-Clause
+    - Facebook-2-Clause
+    - Facebook-3-Clause
+    - Facebook-Examples
+    - WTFPL
+  full: false
+  ignored: []
+  notice:
+    - AFL-1.1
+    - AFL-1.2
+    - AFL-2.0
+    - AFL-2.1
+    - AFL-3.0
+    - Apache-1.0
+    - Apache-1.1
+    - Apache-2.0
+    - Artistic-1.0-cl8
+    - Artistic-1.0-Perl
+    - Artistic-1.0
+    - Artistic-2.0
+    - BSL-1.0
+    - BSD-2-Clause-FreeBSD
+    - BSD-2-Clause-NetBSD
+    - BSD-2-Clause
+    - BSD-3-Clause-Attribution
+    - BSD-3-Clause-Clear
+    - BSD-3-Clause-LBNL
+    - BSD-3-Clause
+    - BSD-4-Clause
+    - BSD-4-Clause-UC
+    - BSD-Protection
+    - CC-BY-1.0
+    - CC-BY-2.0
+    - CC-BY-2.5
+    - CC-BY-3.0
+    - CC-BY-4.0
+    - FTL
+    - ISC
+    - ImageMagick
+    - Libpng
+    - Lil-1.0
+    - Linux-OpenIB
+    - LPL-1.02
+    - LPL-1.0
+    - MS-PL
+    - MIT
+    - NCSA
+    - OpenSSL
+    - PHP-3.01
+    - PHP-3.0
+    - PIL
+    - Python-2.0
+    - Python-2.0-complete
+    - PostgreSQL
+    - SGI-B-1.0
+    - SGI-B-1.1
+    - SGI-B-2.0
+    - Unicode-DFS-2015
+    - Unicode-DFS-2016
+    - Unicode-TOU
+    - UPL-1.0
+    - W3C-19980720
+    - W3C-20150513
+    - W3C
+    - X11
+    - Xnet
+    - Zend-2.0
+    - zlib-acknowledgement
+    - Zlib
+    - ZPL-1.1
+    - ZPL-2.0
+    - ZPL-2.1
+  permissive: []
+  reciprocal:
+    - APSL-1.0
+    - APSL-1.1
+    - APSL-1.2
+    - APSL-2.0
+    - CDDL-1.0
+    - CDDL-1.1
+    - CPL-1.0
+    - EPL-1.0
+    - EPL-2.0
+    - FreeImage
+    - IPL-1.0
+    - MPL-1.0
+    - MPL-1.1
+    - MPL-2.0
+    - Ruby
+  restricted:
+    - BCL
+    - CC-BY-ND-1.0
+    - CC-BY-ND-2.0
+    - CC-BY-ND-2.5
+    - CC-BY-ND-3.0
+    - CC-BY-ND-4.0
+    - CC-BY-SA-1.0
+    - CC-BY-SA-2.0
+    - CC-BY-SA-2.5
+    - CC-BY-SA-3.0
+    - CC-BY-SA-4.0
+    - GPL-1.0
+    - GPL-2.0
+    - GPL-2.0-with-autoconf-exception
+    - GPL-2.0-with-bison-exception
+    - GPL-2.0-with-classpath-exception
+    - GPL-2.0-with-font-exception
+    - GPL-2.0-with-GCC-exception
+    - GPL-3.0
+    - GPL-3.0-with-autoconf-exception
+    - GPL-3.0-with-GCC-exception
+    - LGPL-2.0
+    - LGPL-2.1
+    - LGPL-3.0
+    - NPL-1.0
+    - NPL-1.1
+    - OSL-1.0
+    - OSL-1.1
+    - OSL-2.0
+    - OSL-2.1
+    - OSL-3.0
+    - QPL-1.0
+    - Sleepycat
+  unencumbered:
+    - CC0-1.0
+    - Unlicense
+    - 0BSD
+list-all-pkgs: false
+misconfiguration:
+  cloudformation:
+    params: []
+  helm:
+    set: []
+    set-file: []
+    set-string: []
+    values: []
+  include-non-failures: false
+  check-bundle-repository: ghcr.io/aquasecurity/trivy-policies:0
+  # scanners:
+  #  - azure-arm
+  #  - cloudformation
+  #  - dockerfile
+  #  - helm
+  #  - kubernetes
+  #  - terraform
+  #  - terraformplan
+  terraform:
+    exclude-downloaded-modules: false
+    vars: []
+module:
+  dir:
+  enable-modules: []
+output: "trivy-report-daos.txt"
+format: template
+template: '@./utils/trivy/csv.tpl'
+output-plugin-arg: ""
+quiet: false
+registry:
+  password: []
+  token: ""
+  username: []
+rego:
+  data: []
+  namespaces: []
+  policy: []
+  skip-policy-update: false
+  trace: false
+report: all
+scan:
+  compliance: ""
+  file-patterns: []
+  offline: false
+  parallel: 1
+  rekor-url: https://rekor.sigstore.dev
+  sbom-sources: []
+  scanners:
+    - vuln
+    - secret
+  # ignore all hadoop dependencies
+  skip-dirs:
+    ./src/client/java/hadoop-daos
+  skip-files: []
+  show-suppressed: true
+secret:
+  config: trivy-secret.yaml
+server:
+  addr: ""
+  custom-headers: []
+  token: ""
+  token-header: Trivy-Token
+severity:
+  - UNKNOWN
+  - MEDIUM
+  - HIGH
+  - CRITICAL
+timeout: 5m0s
+version: false
+vulnerability:
+  ignore-status: []
+  ignore-unfixed: false
+  type:
+    - os
+    - library

From d9f16a1d6f7326ace19b1f7d4a6f182b4bde0020 Mon Sep 17 00:00:00 2001
From: Tomasz Gromadzki <tomasz.gromadzki@intel.com>
Date: Wed, 16 Oct 2024 02:48:45 +0200
Subject: [PATCH 04/34] DAOS-14408 common: ensure NDCTL not used for storage
 class `ram` (#15203)

* DAOS-14408 common: enable NDCTL for DCPM

This PR prepares DAOS to be used with NDCTL enabled in PMDK, which means:
- NDCTL must not be used when non-DCPM (simulate PMem) - `storage class: "ram"` is used:
`PMEMOBJ_CONF=sds.at_create=0` env variable disables NDCTL features in the PMDK
This change affects all tests run on simulated PMem (e.g. inside VMs).
Some DOAS utility applications may also require `PMEMOBJ_CONF=sds.at_create=0` to be set.

- The default ULT stack size must be at least 20KiB to avoid stack overuse by PMDK with NDCTL enabled and be aligned with Linux page size.
`ABT_THREAD_STACKSIZE=20480` env variable is used to increase the default ULT stack size.
This env variable is set by control/server module just before engine is started.
Much bigger stack is used for pmempool open/create-related tasks e.g. `tgt_vos_create_one` to avoid stack overusage.

This modification shall not affect md-on-ssd mode as long as `storage class: "ram"` is used for the first tier in the `storage` configuration.
This change does not require any configuration changes to existing systems.

The new PMDK package with NDCTL enabled (https://github.com/daos-stack/pmdk/pull/38) will land as soon as this PR is merged.

Signed-off-by: Jan Michalski <jan.michalski@intel.com>
---
 .github/workflows/landing-builds.yml     |   1 +
 debian/changelog                         |  16 ++
 debian/control                           |   6 +-
 site_scons/components/__init__.py        |   1 -
 src/control/server/engine/config.go      |  86 ++++++++++
 src/control/server/engine/config_test.go | 207 +++++++++++++++++++++++
 src/control/server/server.go             |   6 +
 src/mgmt/srv_target.c                    |   2 +-
 utils/build.config                       |   3 +-
 utils/rpms/daos.rpmlintrc                |  10 +-
 utils/rpms/daos.spec                     |  29 +++-
 utils/run_utest.py                       |   1 +
 utils/scripts/install-el8.sh             |   2 +
 utils/scripts/install-el9.sh             |   2 +
 utils/scripts/install-leap15.sh          |   1 +
 utils/scripts/install-ubuntu.sh          |   2 +
 16 files changed, 361 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/landing-builds.yml b/.github/workflows/landing-builds.yml
index 103f150915a..814c4f0e71f 100644
--- a/.github/workflows/landing-builds.yml
+++ b/.github/workflows/landing-builds.yml
@@ -16,6 +16,7 @@ on:
       - ci/**
       - requirements-build.txt
       - requirements-utest.txt
+      - utils/build.config
 
 permissions: {}
 
diff --git a/debian/changelog b/debian/changelog
index 6891cea4737..9790dcbee3e 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,19 @@
+daos (2.6.1-4) unstable; urgency=medium
+  [ Tomasz Gromadzki ]
+  * Add support of the PMDK package 2.1.0 with NDCTL enabled.
+    * Increase the default ULT stack size to 20KiB if the engine uses
+      the DCPM storage class.
+    * Prevent using the RAM storage class (simulated PMem) when
+      the shutdown state (SDS) is active.
+      * Automatically disable SDS for the RAM storage class on engine startup.
+      * Force explicitly setting the PMEMOBJ_CONF='sds.at_create=0'
+        environment variable to deactivate SDS for the DAOS tools
+        (ddb, daos_perf, vos_perf, etc.) when used WITHOUT DCPM.
+        Otherwise, a user is supposed to be stopped by an error
+        like: "Unsafe shutdown count is not supported for this source".
+
+ -- Tomasz Gromadzki <tomasz.gromadzki@intel.com>  Wed, 02 Oct 2024 12:00:00 +0200
+
 daos (2.6.1-3) unstable; urgency=medium
   [ Phillip Henderson ]
   * Third release candidate for 2.6.1
diff --git a/debian/control b/debian/control
index bdc377e6d94..32b97fd67d1 100644
--- a/debian/control
+++ b/debian/control
@@ -18,7 +18,7 @@ Build-Depends: debhelper (>= 10),
                python3-distro,
                libabt-dev,
                libucx-dev,
-               libpmemobj-dev (>= 2.0.0),
+               libpmemobj-dev (>= 2.1.0),
                libfuse3-dev,
                libprotobuf-c-dev,
                libjson-c-dev,
@@ -118,7 +118,9 @@ Depends: python (>=3.8), python3, python-yaml, python3-yaml,
          daos-client (= ${binary:Version}),
          daos-admin (= ${binary:Version}),
          golang-go (>=1.18),
-         libcapstone-dev
+         libcapstone-dev,
+         libndctl-dev,
+         libdaxctl-dev
 Description: The Distributed Asynchronous Object Storage (DAOS) is an open-source
  software-defined object store designed from the ground up for
  massively distributed Non Volatile Memory (NVM). DAOS takes advantage
diff --git a/site_scons/components/__init__.py b/site_scons/components/__init__.py
index ed50b80461b..1ae7bb7a2aa 100644
--- a/site_scons/components/__init__.py
+++ b/site_scons/components/__init__.py
@@ -266,7 +266,6 @@ def define_components(reqs):
                 retriever=GitRepoRetriever(),
                 commands=[['make',
                            'all',
-                           'NDCTL_ENABLE=n',
                            'BUILD_EXAMPLES=n',
                            'BUILD_BENCHMARKS=n',
                            'DOC=n',
diff --git a/src/control/server/engine/config.go b/src/control/server/engine/config.go
index 3e6ab0b64d7..38595fd8091 100644
--- a/src/control/server/engine/config.go
+++ b/src/control/server/engine/config.go
@@ -9,6 +9,7 @@ package engine
 import (
 	"fmt"
 	"os"
+	"strconv"
 	"strings"
 
 	"github.com/pkg/errors"
@@ -28,6 +29,8 @@ const (
 	envLogMasks      = "D_LOG_MASK"
 	envLogDbgStreams = "DD_MASK"
 	envLogSubsystems = "DD_SUBSYS"
+
+	minABTThreadStackSizeDCPM = 20480
 )
 
 // FabricConfig encapsulates networking fabric configuration.
@@ -342,7 +345,80 @@ func (c *Config) Validate() error {
 	if err := ValidateLogSubsystems(subsystems); err != nil {
 		return errors.Wrap(err, "validate engine log subsystems")
 	}
+	return nil
+}
+
+// Ensure at least 20KiB ABT stack size for an engine with DCPM storage class.
+func (c *Config) UpdatePMDKEnvarsStackSizeDCPM() error {
+	stackSizeStr, err := c.GetEnvVar("ABT_THREAD_STACKSIZE")
+	if err != nil {
+		c.EnvVars = append(c.EnvVars, fmt.Sprintf("ABT_THREAD_STACKSIZE=%d",
+			minABTThreadStackSizeDCPM))
+		return nil
+	}
+	// Ensure at least 20KiB ABT stack size for an engine with DCPM storage class.
+	stackSizeValue, err := strconv.Atoi(stackSizeStr)
+	if err != nil {
+		return errors.Errorf("env_var ABT_THREAD_STACKSIZE has invalid value: %s",
+			stackSizeStr)
+	}
+	if stackSizeValue < minABTThreadStackSizeDCPM {
+		return errors.Errorf("env_var ABT_THREAD_STACKSIZE should be >= %d "+
+			"for DCPM storage class, found %d", minABTThreadStackSizeDCPM,
+			stackSizeValue)
+	}
+	return nil
+}
+
+// Ensure proper configuration of shutdown (SDS) state
+func (c *Config) UpdatePMDKEnvarsPMemobjConf(isDCPM bool) error {
+	pmemobjConfStr, pmemobjConfErr := c.GetEnvVar("PMEMOBJ_CONF")
+	//also work for empty string
+	hasSdsAtCreate := strings.Contains(pmemobjConfStr, "sds.at_create")
+	if isDCPM {
+		if !hasSdsAtCreate {
+			return nil
+		}
+		// Confirm default handling of shutdown state (SDS) for DCPM storage class.
+		return errors.New("env_var PMEMOBJ_CONF should NOT contain 'sds.at_create=?' " +
+			"for DCPM storage class, found '" + pmemobjConfStr + "'")
+	}
+
+	// Disable shutdown state (SDS) (part of RAS) for RAM-based simulated SCM.
+	if pmemobjConfErr != nil {
+		c.EnvVars = append(c.EnvVars, "PMEMOBJ_CONF=sds.at_create=0")
+		return nil
+	}
+	if !hasSdsAtCreate {
+		envVars, _ := common.DeleteKeyValue(c.EnvVars, "PMEMOBJ_CONF")
+		c.EnvVars = append(envVars, "PMEMOBJ_CONF="+pmemobjConfStr+
+			";sds.at_create=0")
+		return nil
+	}
+	if strings.Contains(pmemobjConfStr, "sds.at_create=1") {
+		return errors.New("env_var PMEMOBJ_CONF should contain 'sds.at_create=0' " +
+			"for non-DCPM storage class, found '" + pmemobjConfStr + "'")
+	}
+	return nil
+}
+
+// Ensure proper environment variables for PMDK w/ NDCTL enabled based on
+// the actual configuration of the storage class.
+func (c *Config) UpdatePMDKEnvars() error {
+
+	if len(c.Storage.Tiers) == 0 {
+		return errors.New("Invalid config - no tier 0 defined")
+	}
+
+	isDCPM := c.Storage.Tiers[0].Class == storage.ClassDcpm
 
+	if err := c.UpdatePMDKEnvarsPMemobjConf(isDCPM); err != nil {
+		return err
+	}
+
+	if isDCPM {
+		return c.UpdatePMDKEnvarsStackSizeDCPM()
+	}
 	return nil
 }
 
@@ -690,3 +766,13 @@ func (c *Config) WithStorageIndex(i uint32) *Config {
 	c.Storage.EngineIdx = uint(i)
 	return c
 }
+
+// WithEnvVarAbtThreadStackSize sets environment variable ABT_THREAD_STACKSIZE.
+func (c *Config) WithEnvVarAbtThreadStackSize(stack_size uint16) *Config {
+	return c.WithEnvVars(fmt.Sprintf("ABT_THREAD_STACKSIZE=%d", stack_size))
+}
+
+// WithEnvVarPMemObjSdsAtCreate sets PMEMOBJ_CONF env. var. to sds.at_create=0/1 value
+func (c *Config) WithEnvVarPMemObjSdsAtCreate(value uint8) *Config {
+	return c.WithEnvVars(fmt.Sprintf("PMEMOBJ_CONF=sds.at_create=%d", value))
+}
diff --git a/src/control/server/engine/config_test.go b/src/control/server/engine/config_test.go
index 463e86567dc..d9ca2bfebbd 100644
--- a/src/control/server/engine/config_test.go
+++ b/src/control/server/engine/config_test.go
@@ -1104,3 +1104,210 @@ func TestFabricConfig_Update(t *testing.T) {
 		})
 	}
 }
+
+func TestConfig_UpdatePMDKEnvarsStackSizeDCPM(t *testing.T) {
+	validConfig := func() *Config {
+		return MockConfig().WithStorage(
+			storage.NewTierConfig().
+				WithStorageClass("dcpm"))
+	}
+
+	for name, tc := range map[string]struct {
+		cfg                   *Config
+		expErr                error
+		expABTthreadStackSize int
+	}{
+		"empty config should not fail": {
+			cfg:                   MockConfig(),
+			expABTthreadStackSize: minABTThreadStackSizeDCPM,
+		},
+		"valid config for DCPM should not fail": {
+			cfg:                   validConfig().WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM),
+			expABTthreadStackSize: minABTThreadStackSizeDCPM,
+		},
+		"config for DCPM without thread size should not fail": {
+			cfg:                   validConfig(),
+			expABTthreadStackSize: minABTThreadStackSizeDCPM,
+		},
+		"config for DCPM with stack size big enough should not fail": {
+			cfg: validConfig().
+				WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM + 1),
+			expABTthreadStackSize: minABTThreadStackSizeDCPM + 1,
+		},
+		"config for DCPM with stack size too small should fail": {
+			cfg: validConfig().
+				WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM - 1),
+			expErr: errors.New(fmt.Sprintf("env_var ABT_THREAD_STACKSIZE "+
+				"should be >= %d for DCPM storage class, found %d",
+				minABTThreadStackSizeDCPM, minABTThreadStackSizeDCPM-1)),
+		},
+		"config for DCPM with invalid ABT_THREAD_STACKSIZE value should fail": {
+			cfg:    validConfig().WithEnvVars("ABT_THREAD_STACKSIZE=foo_bar"),
+			expErr: errors.New("env_var ABT_THREAD_STACKSIZE has invalid value: foo_bar"),
+		},
+	} {
+		t.Run(name, func(t *testing.T) {
+			err := tc.cfg.UpdatePMDKEnvarsStackSizeDCPM()
+			test.CmpErr(t, tc.expErr, err)
+			if err == nil {
+				stackSizeStr, err := tc.cfg.GetEnvVar("ABT_THREAD_STACKSIZE")
+				test.AssertTrue(t, err == nil, "Missing env var ABT_THREAD_STACKSIZE")
+				stackSizeVal, err := strconv.Atoi(stackSizeStr)
+				test.AssertTrue(t, err == nil, "Invalid env var ABT_THREAD_STACKSIZE")
+				test.AssertEqual(t, tc.expABTthreadStackSize, stackSizeVal,
+					"Invalid ABT_THREAD_STACKSIZE value")
+			}
+		})
+	}
+}
+
+func TestConfig_UpdatePMDKEnvarsPMemobjConfDCPM(t *testing.T) {
+	validConfig := func() *Config {
+		return MockConfig().WithStorage(
+			storage.NewTierConfig().WithStorageClass("dcpm"))
+	}
+
+	for name, tc := range map[string]struct {
+		cfg    *Config
+		expErr error
+	}{
+		"empty config should not fail": {
+			cfg: MockConfig(),
+		},
+		"valid config for DCPM should not fail": {
+			cfg: validConfig(),
+		},
+		"config for DCPM with forced sds.at_create (1) should fail": {
+			cfg: validConfig().WithEnvVarPMemObjSdsAtCreate(1),
+			expErr: errors.New("env_var PMEMOBJ_CONF should NOT contain " +
+				"'sds.at_create=?' for DCPM storage class, found 'sds.at_create=1'"),
+		},
+		"config for DCPM with forced sds.at_create (0) should fail": {
+			cfg: validConfig().WithEnvVarPMemObjSdsAtCreate(0),
+			expErr: errors.New("env_var PMEMOBJ_CONF should NOT contain " +
+				"'sds.at_create=?' for DCPM storage class, found 'sds.at_create=0'"),
+		},
+	} {
+		t.Run(name, func(t *testing.T) {
+			test.CmpErr(t, tc.expErr, tc.cfg.UpdatePMDKEnvarsPMemobjConf(true))
+		})
+	}
+}
+
+func TestConfig_UpdatePMDKEnvarsPMemobjConfNRam(t *testing.T) {
+	validConfig := func() *Config {
+		return MockConfig().WithStorage(
+			storage.NewTierConfig().
+				WithStorageClass("dcpm"))
+	}
+
+	for name, tc := range map[string]struct {
+		cfg             *Config
+		expErr          error
+		expPMEMOBJ_CONF string
+	}{
+		"empty config should not fail": {
+			cfg:             validConfig(),
+			expPMEMOBJ_CONF: "sds.at_create=0",
+		},
+		"config for ram without PMEMOBJ_CONF should not fail": {
+			cfg:             MockConfig(),
+			expPMEMOBJ_CONF: "sds.at_create=0",
+		},
+		"valid config for should not fail": {
+			cfg:             validConfig().WithEnvVarPMemObjSdsAtCreate(0),
+			expPMEMOBJ_CONF: "sds.at_create=0",
+		},
+		"config for ram w/ PMEMOBJ_CONF w/o sds.at_create should should be updated": {
+			cfg:             validConfig().WithEnvVars("PMEMOBJ_CONF=foo_bar"),
+			expPMEMOBJ_CONF: "foo_bar;sds.at_create=0",
+		},
+		"config for ram with sds.at_create set to 1 should fail": {
+			cfg: validConfig().WithEnvVarPMemObjSdsAtCreate(1),
+			expErr: errors.New("env_var PMEMOBJ_CONF should contain " +
+				"'sds.at_create=0' for non-DCPM storage class" +
+				", found 'sds.at_create=1'"),
+		},
+		"config for ram w/ PMEMOBJ_CONF w/ sds.at_create=1 should fail": {
+			cfg: validConfig().
+				WithEnvVars("PMEMOBJ_CONF=sds.at_create=1;foo-bar"),
+			expErr: errors.New("env_var PMEMOBJ_CONF should contain " +
+				"'sds.at_create=0' for non-DCPM storage class" +
+				", found 'sds.at_create=1;foo-bar'"),
+		},
+	} {
+		t.Run(name, func(t *testing.T) {
+			test.CmpErr(t, tc.expErr, tc.cfg.UpdatePMDKEnvarsPMemobjConf(false))
+			if len(tc.expPMEMOBJ_CONF) > 0 {
+				sds_at_create, err := tc.cfg.GetEnvVar("PMEMOBJ_CONF")
+				test.AssertTrue(t, err == nil, "Missing env var PMEMOBJ_CONF")
+				test.AssertEqual(t, tc.expPMEMOBJ_CONF, sds_at_create,
+					"Invalid PMEMOBJ_CONF")
+			}
+
+		})
+	}
+}
+
+func TestConfig_UpdatePMDKEnvars(t *testing.T) {
+	validConfig := func(storageclas string) *Config {
+		return MockConfig().WithStorage(
+			storage.NewTierConfig().
+				WithStorageClass(storageclas))
+	}
+	for name, tc := range map[string]struct {
+		cfg                   *Config
+		expErr                error
+		expPMEMOBJ_CONF       string
+		expABTthreadStackSize int
+	}{
+		"empty config should fail": {
+			cfg:                   MockConfig(),
+			expErr:                errors.New("Invalid config - no tier 0 defined"),
+			expABTthreadStackSize: -1,
+		},
+		"valid config for RAM should not fail": {
+			cfg: validConfig("ram").
+				WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM - 1),
+			expPMEMOBJ_CONF:       "sds.at_create=0",
+			expABTthreadStackSize: minABTThreadStackSizeDCPM - 1,
+		},
+		"invalid config for RAM should fail": {
+			cfg: validConfig("ram").WithEnvVarPMemObjSdsAtCreate(1),
+			expErr: errors.New("env_var PMEMOBJ_CONF should contain " +
+				"'sds.at_create=0' for non-DCPM storage class, " +
+				"found 'sds.at_create=1'"),
+			expABTthreadStackSize: -1,
+		},
+		"valid config for DCPM should not fail": {
+			cfg:                   validConfig("dcpm"),
+			expABTthreadStackSize: minABTThreadStackSizeDCPM,
+		},
+		"invalid config for DCPM should not fail": {
+			cfg: validConfig("dcpm").
+				WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM - 1),
+			expErr: errors.New("env_var ABT_THREAD_STACKSIZE should be >= 20480 " +
+				"for DCPM storage class, found 20479"),
+			expABTthreadStackSize: minABTThreadStackSizeDCPM - 1,
+		},
+	} {
+		t.Run(name, func(t *testing.T) {
+			errTc := tc.cfg.UpdatePMDKEnvars()
+			test.CmpErr(t, tc.expErr, errTc)
+			if len(tc.expPMEMOBJ_CONF) > 0 {
+				sds_at_create, err := tc.cfg.GetEnvVar("PMEMOBJ_CONF")
+				test.AssertTrue(t, err == nil, "Missing env var PMEMOBJ_CONF")
+				test.AssertEqual(t, tc.expPMEMOBJ_CONF, sds_at_create,
+					"Invalid PMEMOBJ_CONF")
+			}
+			if tc.expABTthreadStackSize >= 0 {
+				stackSizeStr, err := tc.cfg.GetEnvVar("ABT_THREAD_STACKSIZE")
+				test.AssertTrue(t, err == nil, "Missing env var ABT_THREAD_STACKSIZE")
+				stackSizeVal, err := strconv.Atoi(stackSizeStr)
+				test.AssertTrue(t, err == nil, "Invalid env var ABT_THREAD_STACKSIZE")
+				test.AssertEqual(t, tc.expABTthreadStackSize, stackSizeVal,
+					"Invalid ABT_THREAD_STACKSIZE value")
+			}
+		})
+	}
+}
diff --git a/src/control/server/server.go b/src/control/server/server.go
index 4ac8ce5e04b..c9232dd1725 100644
--- a/src/control/server/server.go
+++ b/src/control/server/server.go
@@ -103,6 +103,12 @@ func processConfig(log logging.Logger, cfg *config.Server, fis *hardware.FabricI
 		return err
 	}
 
+	for _, ec := range cfg.Engines {
+		if err := ec.UpdatePMDKEnvars(); err != nil {
+			return err
+		}
+	}
+
 	return nil
 }
 
diff --git a/src/mgmt/srv_target.c b/src/mgmt/srv_target.c
index b03215d08aa..e1d6ee79bce 100644
--- a/src/mgmt/srv_target.c
+++ b/src/mgmt/srv_target.c
@@ -1180,7 +1180,7 @@ ds_mgmt_hdlr_tgt_create(crt_rpc_t *tc_req)
 		/* A zero size accommodates the existing file */
 		vpa.vpa_scm_size = 0;
 		vpa.vpa_nvme_size = tc_in->tc_nvme_size / dss_tgt_nr;
-		rc = dss_thread_collective(tgt_vos_create_one, &vpa, 0);
+		rc = dss_thread_collective(tgt_vos_create_one, &vpa, DSS_ULT_DEEP_STACK);
 		if (rc) {
 			D_ERROR(DF_UUID": thread collective tgt_vos_create_one failed, "DF_RC"\n",
 				DP_UUID(tc_in->tc_pool_uuid), DP_RC(rc));
diff --git a/utils/build.config b/utils/build.config
index 080ebae1283..52749b3f1ce 100644
--- a/utils/build.config
+++ b/utils/build.config
@@ -4,7 +4,7 @@ component=daos
 [commit_versions]
 argobots=v1.1
 fuse=fuse-3.16.2
-pmdk=2.0.0
+pmdk=2.1.0
 isal=v2.30.0
 isal_crypto=v2.23.0
 spdk=v22.01.2
@@ -30,3 +30,4 @@ spdk=https://github.com/spdk/spdk/commit/b0aba3fcd5aceceea530a702922153bc7566497
 ofi=https://github.com/ofiwg/libfabric/commit/d827c6484cc5bf67dfbe395890e258860c3f0979.diff
 fuse=https://github.com/libfuse/libfuse/commit/c9905341ea34ff9acbc11b3c53ba8bcea35eeed8.diff
 mercury=https://raw.githubusercontent.com/daos-stack/mercury/f3dc286fb40ec1a3a38a2e17c45497bc2aa6290d/na_ucx.patch
+pmdk=https://github.com/pmem/pmdk/commit/2abe15ac0b4eed894b6768cd82a3b0a7c4336284.diff
diff --git a/utils/rpms/daos.rpmlintrc b/utils/rpms/daos.rpmlintrc
index 09022a74e59..bbf0f9e2b92 100644
--- a/utils/rpms/daos.rpmlintrc
+++ b/utils/rpms/daos.rpmlintrc
@@ -47,10 +47,10 @@ addFilter("E: static-library-without-debuginfo \/usr\/lib64\/lib(dfuse|ioil)\.a"
 addFilter("W: no-soname \/usr\/lib64\/lib(ds3|daos_(common|cmd_hdlrs|tests|serialize|common_pmem)|dfs|dfuse|duns|ioil|pil4dfs|dpar(|_mpi)).so")
 
 # Tests rpm needs to be able to build daos from source so pulls in build deps and is expected.
-addFilter("daos-client-tests.x86_64: E: devel-dependency protobuf-c-devel")
+addFilter("daos-client-tests\.x86_64: E: devel-dependency protobuf-c-devel")
 
 # a functional test builds daos from source, so it needs the various *-devel packages for daos' build dependencies.
-addFilter("daos-client-tests.x86_64: E: devel-dependency capstone-devel")
-addFilter("daos-client-tests.x86_64: E: explicit-lib-dependency libcapstone-devel")
-addFilter("daos-client-tests.x86_64: E: devel-dependency libcapstone-devel")
-addFilter("daos-client-tests.x86_64: E: devel-dependency fuse3-devel")
+addFilter("daos-client-tests\.x86_64: E: devel-dependency capstone-devel")
+addFilter("daos-client-tests\.x86_64: E: explicit-lib-dependency lib(capstone|ndctl)-devel")
+addFilter("daos-client-tests\.x86_64: E: devel-dependency libcapstone-devel")
+addFilter("daos-client-tests\.x86_64: E: devel-dependency fuse3-devel")
diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec
index 14fa74776a2..29576bc71a1 100644
--- a/utils/rpms/daos.spec
+++ b/utils/rpms/daos.spec
@@ -15,7 +15,7 @@
 
 Name:          daos
 Version:       2.6.1
-Release:       3%{?relval}%{?dist}
+Release:       4%{?relval}%{?dist}
 Summary:       DAOS Storage Engine
 
 License:       BSD-2-Clause-Patent
@@ -49,7 +49,7 @@ BuildRequires: libabt-devel >= 1.0rc1
 BuildRequires: libjson-c-devel
 BuildRequires: boost-devel
 %endif
-BuildRequires: libpmemobj-devel >= 2.0.0
+BuildRequires: libpmemobj-devel >= 2.1.0
 %if (0%{?rhel} >= 8)
 BuildRequires: fuse3-devel >= 3
 %else
@@ -147,11 +147,11 @@ Requires: ndctl
 # needed to set PMem configuration goals in BIOS through control-plane
 %if (0%{?suse_version} >= 1500)
 Requires: ipmctl >= 03.00.00.0423
-Requires: libpmemobj1 >= 2.0.0-1.suse1500
+Requires: libpmemobj1 >= 2.1.0-1.suse1500
 Requires: libfabric1 >= %{libfabric_version}
 %else
 Requires: ipmctl >= 03.00.00.0468
-Requires: libpmemobj >= 2.0.0-1%{?dist}
+Requires: libpmemobj >= 2.1.0-1%{?dist}
 %endif
 Requires: libfabric >= %{libfabric_version}
 Requires: mercury >= %{mercury_version}
@@ -232,6 +232,13 @@ Requires: fuse3-devel >= 3
 Requires: fuse3-devel >= 3.4.2
 %endif
 Requires: pciutils-devel
+%if (0%{?suse_version} > 0)
+Requires: libndctl-devel
+%endif
+%if (0%{?rhel} >= 8)
+Requires: ndctl-devel
+Requires: daxctl-devel
+%endif
 
 %description client-tests
 This is the package needed to run the DAOS test suite (client tests)
@@ -591,6 +598,20 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent
 # No files in a shim package
 
 %changelog
+
+* Wed Oct 02 2024 Tomasz Gromadzki <tomasz.gromadzki@intel.com> 2.6.1-4
+- Add support of the PMDK package 2.1.0 with NDCTL enabled.
+  * Increase the default ULT stack size to 20KiB if the engine uses
+    the DCPM storage class.
+  * Prevent using the RAM storage class (simulated PMem) when
+    the shutdown state (SDS) is active.
+    * Automatically disable SDS for the RAM storage class on engine startup.
+    * Force explicitly setting the PMEMOBJ_CONF='sds.at_create=0'
+      environment variable to deactivate SDS for the DAOS tools
+      (ddb, daos_perf, vos_perf, etc.) when used WITHOUT DCPM.
+      Otherwise, a user is supposed to be stopped by an error
+      like: "Unsafe shutdown count is not supported for this source".
+
 * Tue Oct 01 2024 Phillip Henderson <phillip.henderson@intel.com> 2.6.1-3
 - Third release candidate for 2.6.1
 
diff --git a/utils/run_utest.py b/utils/run_utest.py
index c2f4ffd002d..1835f230e36 100755
--- a/utils/run_utest.py
+++ b/utils/run_utest.py
@@ -440,6 +440,7 @@ def run(self, base, memcheck, sudo):
             cmd = new_cmd
         self.last = cmd
 
+        self.env.update({"PMEMOBJ_CONF": "sds.at_create=0"})
         if self.suite.gha:
             retval = run_cmd(cmd, env=self.env)
         else:
diff --git a/utils/scripts/install-el8.sh b/utils/scripts/install-el8.sh
index 81a044bfddb..472f88c9925 100755
--- a/utils/scripts/install-el8.sh
+++ b/utils/scripts/install-el8.sh
@@ -20,6 +20,7 @@ dnf --nodocs install \
     clang-tools-extra \
     cmake \
     CUnit-devel \
+    daxctl-devel \
     diffutils \
     e2fsprogs \
     file \
@@ -48,6 +49,7 @@ dnf --nodocs install \
     lz4-devel \
     make \
     ndctl \
+    ndctl-devel \
     numactl \
     numactl-devel \
     openmpi-devel \
diff --git a/utils/scripts/install-el9.sh b/utils/scripts/install-el9.sh
index 9ddd8c257d6..092de8eba0f 100755
--- a/utils/scripts/install-el9.sh
+++ b/utils/scripts/install-el9.sh
@@ -18,6 +18,7 @@ dnf --nodocs install \
     clang-tools-extra \
     cmake \
     CUnit-devel \
+    daxctl-devel \
     diffutils \
     e2fsprogs \
     file \
@@ -47,6 +48,7 @@ dnf --nodocs install \
     lz4-devel \
     make \
     ndctl \
+    ndctl-devel \
     numactl \
     numactl-devel \
     openmpi-devel \
diff --git a/utils/scripts/install-leap15.sh b/utils/scripts/install-leap15.sh
index fc9826ce508..0eb8ef44fee 100755
--- a/utils/scripts/install-leap15.sh
+++ b/utils/scripts/install-leap15.sh
@@ -38,6 +38,7 @@ dnf --nodocs install \
     libjson-c-devel \
     libltdl7 \
     liblz4-devel \
+    libndctl-devel \
     libnuma-devel \
     libopenssl-devel \
     libprotobuf-c-devel \
diff --git a/utils/scripts/install-ubuntu.sh b/utils/scripts/install-ubuntu.sh
index ec21c92f474..a36641d5f10 100755
--- a/utils/scripts/install-ubuntu.sh
+++ b/utils/scripts/install-ubuntu.sh
@@ -29,11 +29,13 @@ apt-get install \
     libcapstone-dev \
     libcmocka-dev \
     libcunit1-dev \
+    libdaxctl-dev \
     libfuse3-dev \
     libhwloc-dev \
     libibverbs-dev \
     libjson-c-dev \
     liblz4-dev \
+    libndctl-dev \
     libnuma-dev \
     libopenmpi-dev \
     libpci-dev \

From 60d4b5dc682463eb61f829b415cc31ea0b993ebc Mon Sep 17 00:00:00 2001
From: Li Wei <wei.g.li@intel.com>
Date: Fri, 18 Oct 2024 17:39:03 +0900
Subject: [PATCH 05/34] DAOS-16653 pool: Batch crt events (#15230) (#15302)

* DAOS-16653 pool: Batch crt events

When multiple engines become unavailable around the same time, if a pool
cannot tolerate the unavailability of those engines, it is sometimes
desired that the pool would not exclude any of the engines. Hence, this
patch introduces a CaRT event delay, tunable via the server-side
environment variable, CRT_EVENT_DELAY, so that the events signaling the
unavailability of those engines will be handled in hopefully one batch,
giving pool_svc_update_map_internal a chance to reject the pool map
update based on the RF check.

When the RF check rejects a pool map change, we should revisit the
corresponding events later, rather than simply throwing them away. This
patch improves this case by returning the events back to the event
queue, and pause the queue handling until next new event or pool map
update.

  - Introduce event sets: pool_svc_event_set. Now the event queue can be
    simplified to just one event set.

  - Add the ability to pause and resume the event handling: pse_paused.

  - Track the time when the latest event was queued: pse_time.

Signed-off-by: Li Wei <wei.g.li@intel.com>
---
 docs/admin/env_variables.md                 |   1 +
 src/pool/srv_pool.c                         | 454 +++++++++++++++-----
 src/tests/ftest/util/server_utils_params.py |   1 +
 3 files changed, 346 insertions(+), 110 deletions(-)

diff --git a/docs/admin/env_variables.md b/docs/admin/env_variables.md
index 060c3790d57..31e0f3e8767 100644
--- a/docs/admin/env_variables.md
+++ b/docs/admin/env_variables.md
@@ -44,6 +44,7 @@ Environment variables in this section only apply to the server side.
 |DAOS\_MD\_CAP         |Size of a metadata pmem pool/file in MBs. INTEGER. Default to 128 MB.|
 |DAOS\_START\_POOL\_SVC|Determines whether to start existing pool services when starting a daos\_server. BOOL. Default to true.|
 |CRT\_DISABLE\_MEM\_PIN|Disable memory pinning workaround on a server side. BOOL. Default to 0.|
+|CRT\_EVENT\_DELAY|Delay in seconds before handling each CaRT event. INTEGER. Default to 10 s. A longer delay enables batching of successive CaRT events, leading to fewer pool map changes when multiple engines become unavailable at around the same time.|
 |DAOS\_SCHED\_PRIO\_DISABLED|Disable server ULT prioritizing. BOOL. Default to 0.|
 |DAOS\_SCHED\_RELAX\_MODE|The mode of CPU relaxing on idle. "disabled":disable relaxing; "net":wait on network request for INTVL; "sleep":sleep for INTVL. STRING. Default to "net"|
 |DAOS\_SCHED\_RELAX\_INTVL|CPU relax interval in milliseconds. INTEGER. Default to 1 ms.|
diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c
index 667e4bc6ed6..2d80088d4b6 100644
--- a/src/pool/srv_pool.c
+++ b/src/pool/srv_pool.c
@@ -62,7 +62,6 @@ ds_pool_get_vos_df_version(uint32_t pool_global_version)
 
 /* Pool service crt event */
 struct pool_svc_event {
-	d_list_t		psv_link;
 	d_rank_t		psv_rank;
 	uint64_t		psv_incarnation;
 	enum crt_event_source	psv_src;
@@ -72,15 +71,40 @@ struct pool_svc_event {
 #define DF_PS_EVENT	"rank=%u inc="DF_U64" src=%d type=%d"
 #define DP_PS_EVENT(e)	e->psv_rank, e->psv_incarnation, e->psv_src, e->psv_type
 
-#define RECHOOSE_SLEEP_MS 250
+/*
+ * Pool service crt event set
+ *
+ * This stores an unordered array of pool_svc_event objects. For all different
+ * i and j, we have pss_buf[i].psv_rank != pss_buf[j].psv_rank.
+ *
+ * An event set facilitates the merging of a sequence of events. For instance,
+ * sequence (in the format <rank, type>)
+ *   <3, D>, <5, D>, <1, D>, <5, A>, <1, A>, <1, D>
+ * will merge into set
+ *   <3, D>, <5, A>, <1, D>
+ * (that is, during the merge, an event overrides a previuos event of the same
+ * rank in the set).
+ */
+struct pool_svc_event_set {
+	struct pool_svc_event *pss_buf;
+	uint32_t               pss_len;
+	uint32_t               pss_cap;
+};
+
+#define DF_PS_EVENT_SET    "len=%u"
+#define DP_PS_EVENT_SET(s) s->pss_len
 
 /* Pool service crt-event-handling state */
 struct pool_svc_events {
-	ABT_mutex		pse_mutex;
-	ABT_cond		pse_cv;
-	d_list_t		pse_queue;
-	ABT_thread		pse_handler;
-	bool			pse_stop;
+	ABT_mutex                  pse_mutex;
+	ABT_cond                   pse_cv;
+	struct pool_svc_event_set *pse_pending;
+	uint64_t                   pse_timeout; /* s */
+	uint64_t                   pse_time;    /* s */
+	struct sched_request      *pse_timer;
+	ABT_thread                 pse_handler;
+	bool                       pse_stop;
+	bool                       pse_paused;
 };
 
 /* Pool service schedule state */
@@ -1162,6 +1186,15 @@ pool_svc_locate_cb(d_iov_t *id, char **path)
 	return 0;
 }
 
+static unsigned int
+get_crt_event_delay(void)
+{
+	unsigned int t = 10 /* s */;
+
+	d_getenv_uint("CRT_EVENT_DELAY", &t);
+	return t;
+}
+
 static int
 pool_svc_alloc_cb(d_iov_t *id, struct ds_rsvc **rsvc)
 {
@@ -1182,7 +1215,7 @@ pool_svc_alloc_cb(d_iov_t *id, struct ds_rsvc **rsvc)
 	d_iov_set(&svc->ps_rsvc.s_id, svc->ps_uuid, sizeof(uuid_t));
 
 	uuid_copy(svc->ps_uuid, id->iov_buf);
-	D_INIT_LIST_HEAD(&svc->ps_events.pse_queue);
+	svc->ps_events.pse_timeout = get_crt_event_delay();
 	svc->ps_events.pse_handler = ABT_THREAD_NULL;
 	svc->ps_svc_rf = -1;
 	svc->ps_force_notify = false;
@@ -1300,98 +1333,221 @@ ds_pool_enable_exclude(void)
 	pool_disable_exclude = false;
 }
 
+static int
+alloc_event_set(struct pool_svc_event_set **event_set)
+{
+	D_ALLOC_PTR(*event_set);
+	if (*event_set == NULL)
+		return -DER_NOMEM;
+	return 0;
+}
+
+static void
+free_event_set(struct pool_svc_event_set **event_set)
+{
+	D_FREE((*event_set)->pss_buf);
+	D_FREE(*event_set);
+}
+
+static int
+add_to_event_set(struct pool_svc_event_set *event_set, d_rank_t rank, uint64_t incarnation,
+		 enum crt_event_source src, enum crt_event_type type)
+{
+	int i;
+
+	/* Find rank in event_set. */
+	for (i = 0; i < event_set->pss_len; i++)
+		if (event_set->pss_buf[i].psv_rank == rank)
+			break;
+
+	/* If not found, prepare to add a new event. */
+	if (i == event_set->pss_len) {
+		if (event_set->pss_len == event_set->pss_cap) {
+			uint32_t               cap;
+			struct pool_svc_event *buf;
+
+			if (event_set->pss_cap == 0)
+				cap = 1;
+			else
+				cap = 2 * event_set->pss_cap;
+			D_REALLOC_ARRAY(buf, event_set->pss_buf, event_set->pss_cap, cap);
+			if (buf == NULL)
+				return -DER_NOMEM;
+			event_set->pss_buf = buf;
+			event_set->pss_cap = cap;
+		}
+		event_set->pss_len++;
+	}
+
+	event_set->pss_buf[i].psv_rank        = rank;
+	event_set->pss_buf[i].psv_incarnation = incarnation;
+	event_set->pss_buf[i].psv_src         = src;
+	event_set->pss_buf[i].psv_type        = type;
+	return 0;
+}
+
+/* Merge next into prev. */
+static int
+merge_event_sets(struct pool_svc_event_set *prev, struct pool_svc_event_set *next)
+{
+	int i;
+
+	for (i = 0; i < next->pss_len; i++) {
+		struct pool_svc_event *event = &next->pss_buf[i];
+		int                    rc;
+
+		rc = add_to_event_set(prev, event->psv_rank, event->psv_incarnation, event->psv_src,
+				      event->psv_type);
+		if (rc != 0)
+			return rc;
+	}
+	return 0;
+}
+
 static int
 queue_event(struct pool_svc *svc, d_rank_t rank, uint64_t incarnation, enum crt_event_source src,
 	    enum crt_event_type type)
 {
 	struct pool_svc_events *events = &svc->ps_events;
-	struct pool_svc_event  *event;
+	int                     rc;
+	bool                    allocated = false;
 
-	D_ALLOC_PTR(event);
-	if (event == NULL)
-		return -DER_NOMEM;
+	D_DEBUG(DB_MD, DF_UUID ": queuing event: " DF_PS_EVENT "\n", DP_UUID(svc->ps_uuid), rank,
+		incarnation, src, type);
 
-	event->psv_rank = rank;
-	event->psv_incarnation = incarnation;
-	event->psv_src = src;
-	event->psv_type = type;
+	ABT_mutex_lock(events->pse_mutex);
 
-	D_DEBUG(DB_MD, DF_UUID": queuing event: "DF_PS_EVENT"\n", DP_UUID(svc->ps_uuid),
-		DP_PS_EVENT(event));
+	if (events->pse_pending == NULL) {
+		rc = alloc_event_set(&events->pse_pending);
+		if (rc != 0)
+			goto out;
+		allocated = true;
+	}
+
+	rc = add_to_event_set(events->pse_pending, rank, incarnation, src, type);
+	if (rc != 0)
+		goto out;
+
+	events->pse_time = daos_gettime_coarse();
+
+	if (events->pse_paused) {
+		D_DEBUG(DB_MD, DF_UUID ": resuming event handling\n", DP_UUID(svc->ps_uuid));
+		events->pse_paused = false;
+	}
 
-	ABT_mutex_lock(events->pse_mutex);
-	d_list_add_tail(&event->psv_link, &events->pse_queue);
 	ABT_cond_broadcast(events->pse_cv);
+
+out:
+	if (rc != 0 && allocated)
+		free_event_set(&events->pse_pending);
 	ABT_mutex_unlock(events->pse_mutex);
-	return 0;
+	return rc;
 }
 
 static void
-discard_events(d_list_t *queue)
+resume_event_handling(struct pool_svc *svc)
 {
-	struct pool_svc_event  *event;
-	struct pool_svc_event  *tmp;
+	struct pool_svc_events *events = &svc->ps_events;
 
-	d_list_for_each_entry_safe(event, tmp, queue, psv_link) {
-		D_DEBUG(DB_MD, "discard event: "DF_PS_EVENT"\n", DP_PS_EVENT(event));
-		d_list_del_init(&event->psv_link);
-		D_FREE(event);
+	ABT_mutex_lock(events->pse_mutex);
+	if (events->pse_paused) {
+		D_DEBUG(DB_MD, DF_UUID ": resuming event handling\n", DP_UUID(svc->ps_uuid));
+		events->pse_paused = false;
+		ABT_cond_broadcast(events->pse_cv);
 	}
+	ABT_mutex_unlock(events->pse_mutex);
 }
 
-static int pool_svc_exclude_rank(struct pool_svc *svc, d_rank_t rank);
+static int pool_svc_exclude_ranks(struct pool_svc *svc, struct pool_svc_event_set *event_set);
 
-static void
-handle_event(struct pool_svc *svc, struct pool_svc_event *event)
+static int
+handle_event(struct pool_svc *svc, struct pool_svc_event_set *event_set)
 {
+	int i;
 	int rc;
 
-	if ((event->psv_src != CRT_EVS_GRPMOD && event->psv_src != CRT_EVS_SWIM) ||
-	    (event->psv_type == CRT_EVT_DEAD && pool_disable_exclude)) {
-		D_DEBUG(DB_MD, "ignore event: "DF_PS_EVENT" exclude=%d\n", DP_PS_EVENT(event),
-			pool_disable_exclude);
-		goto out;
+	D_INFO(DF_UUID ": handling event set: " DF_PS_EVENT_SET "\n", DP_UUID(svc->ps_uuid),
+	       DP_PS_EVENT_SET(event_set));
+
+	if (!pool_disable_exclude) {
+		rc = pool_svc_exclude_ranks(svc, event_set);
+		if (rc != 0) {
+			DL_ERROR(rc, DF_UUID ": failed to exclude ranks", DP_UUID(svc->ps_uuid));
+			return rc;
+		}
 	}
 
-	if (event->psv_rank == dss_self_rank() && event->psv_src == CRT_EVS_GRPMOD &&
-	    event->psv_type == CRT_EVT_DEAD) {
-		D_DEBUG(DB_MD, "ignore exclusion of self\n");
-		goto out;
+	/*
+	 * Check if the alive ranks are up in the pool map. If in the future we
+	 * add automatic reintegration below, for instance, we may need
+	 * to not only take svc->ps_lock, but also employ an RDB TX by
+	 * the book.
+	 */
+	ABT_rwlock_rdlock(svc->ps_pool->sp_lock);
+	for (i = 0; i < event_set->pss_len; i++) {
+		struct pool_svc_event *event = &event_set->pss_buf[i];
+
+		if (event->psv_src != CRT_EVS_SWIM || event->psv_type != CRT_EVT_ALIVE)
+			continue;
+		if (ds_pool_map_rank_up(svc->ps_pool->sp_map, event->psv_rank)) {
+			/*
+			 * The rank is up in the pool map. Request a pool map
+			 * distribution just in case the rank has recently
+			 * restarted and does not have a copy of the pool map.
+			 */
+			ds_rsvc_request_map_dist(&svc->ps_rsvc);
+			D_DEBUG(DB_MD, DF_UUID ": requested map dist for rank %u\n",
+				DP_UUID(svc->ps_uuid), event->psv_rank);
+			break;
+		}
 	}
+	ABT_rwlock_unlock(svc->ps_pool->sp_lock);
 
-	D_INFO(DF_UUID": handling event: "DF_PS_EVENT"\n", DP_UUID(svc->ps_uuid),
-		DP_PS_EVENT(event));
+	return 0;
+}
 
-	if (event->psv_src == CRT_EVS_SWIM && event->psv_type == CRT_EVT_ALIVE) {
-		/*
-		 * Check if the rank is up in the pool map. If in the future we
-		 * add automatic reintegration below, for instance, we may need
-		 * to not only take svc->ps_lock, but also employ an RDB TX by
-		 * the book.
-		 */
-		ABT_rwlock_rdlock(svc->ps_pool->sp_lock);
-		rc = ds_pool_map_rank_up(svc->ps_pool->sp_map, event->psv_rank);
-		ABT_rwlock_unlock(svc->ps_pool->sp_lock);
-		if (!rc)
-			goto out;
+struct event_timer_arg {
+	struct pool_svc_events *eta_events;
+	uint64_t                eta_deadline;
+};
 
-		/*
-		 * The rank is up in the pool map. Request a pool map
-		 * distribution just in case the rank has recently restarted
-		 * and does not have a copy of the pool map.
-		 */
-		ds_rsvc_request_map_dist(&svc->ps_rsvc);
-		D_DEBUG(DB_MD, DF_UUID": requested map dist for rank %u\n",
-			DP_UUID(svc->ps_uuid), event->psv_rank);
-	} else if (event->psv_type == CRT_EVT_DEAD) {
-		rc = pool_svc_exclude_rank(svc, event->psv_rank);
-		if (rc != 0)
-			D_ERROR(DF_UUID": failed to exclude rank %u: "DF_RC"\n",
-				DP_UUID(svc->ps_uuid), event->psv_rank, DP_RC(rc));
-	}
+static void
+event_timer(void *varg)
+{
+	struct event_timer_arg *arg = varg;
+	struct pool_svc_events *events = arg->eta_events;
+	int64_t                 time_left = arg->eta_deadline - daos_gettime_coarse();
 
-out:
-	return;
+	if (time_left > 0)
+		sched_req_sleep(events->pse_timer, time_left * 1000);
+	ABT_cond_broadcast(events->pse_cv);
+}
+
+static int
+start_event_timer(struct event_timer_arg *arg)
+{
+	struct pool_svc_events *events = arg->eta_events;
+	uuid_t                  uuid;
+	struct sched_req_attr   attr;
+
+	D_ASSERT(events->pse_timer == NULL);
+	uuid_clear(uuid);
+	sched_req_attr_init(&attr, SCHED_REQ_ANONYM, &uuid);
+	events->pse_timer = sched_create_ult(&attr, event_timer, arg, 0);
+	if (events->pse_timer == NULL)
+		return -DER_NOMEM;
+	return 0;
+}
+
+static void
+stop_event_timer(struct event_timer_arg *arg)
+{
+	struct pool_svc_events *events = arg->eta_events;
+
+	D_ASSERT(events->pse_timer != NULL);
+	sched_req_wait(events->pse_timer, true /* abort */);
+	sched_req_put(events->pse_timer);
+	events->pse_timer = NULL;
 }
 
 static void
@@ -1403,31 +1559,83 @@ events_handler(void *arg)
 	D_DEBUG(DB_MD, DF_UUID": starting\n", DP_UUID(svc->ps_uuid));
 
 	for (;;) {
-		struct pool_svc_event  *event = NULL;
-		bool			stop;
+		struct pool_svc_event_set *event_set = NULL;
+		bool                       stop;
+		int                        rc;
 
 		ABT_mutex_lock(events->pse_mutex);
 		for (;;) {
+			struct event_timer_arg timer_arg;
+			int64_t                time_left;
+
 			stop = events->pse_stop;
 			if (stop) {
-				discard_events(&events->pse_queue);
+				events->pse_paused = false;
+				if (events->pse_pending != NULL)
+					free_event_set(&events->pse_pending);
 				break;
 			}
-			if (!d_list_empty(&events->pse_queue)) {
-				event = d_list_entry(events->pse_queue.next, struct pool_svc_event,
-						     psv_link);
-				d_list_del_init(&event->psv_link);
+
+			timer_arg.eta_events   = events;
+			timer_arg.eta_deadline = events->pse_time + events->pse_timeout;
+
+			time_left = timer_arg.eta_deadline - daos_gettime_coarse();
+			if (events->pse_pending != NULL && !events->pse_paused && time_left <= 0) {
+				event_set = events->pse_pending;
+				events->pse_pending = NULL;
 				break;
 			}
+
+			/* A simple timed cond_wait without polling. */
+			if (time_left > 0) {
+				rc = start_event_timer(&timer_arg);
+				if (rc != 0) {
+					/* No delay then. */
+					DL_ERROR(rc, DF_UUID ": failed to start event timer",
+						 DP_UUID(svc->ps_uuid));
+					events->pse_time = 0;
+					continue;
+				}
+			}
 			sched_cond_wait(events->pse_cv, events->pse_mutex);
+			if (time_left > 0)
+				stop_event_timer(&timer_arg);
 		}
 		ABT_mutex_unlock(events->pse_mutex);
 		if (stop)
 			break;
 
-		handle_event(svc, event);
+		rc = handle_event(svc, event_set);
+		if (rc != 0) {
+			/* Put event_set back to events->pse_pending. */
+			D_DEBUG(DB_MD, DF_UUID ": returning event set\n", DP_UUID(svc->ps_uuid));
+			ABT_mutex_lock(events->pse_mutex);
+			if (events->pse_pending == NULL) {
+				/*
+				 * No pending events; pause the handling until
+				 * next event or pool map change.
+				 */
+				D_DEBUG(DB_MD, DF_UUID ": pausing event handling\n",
+					DP_UUID(svc->ps_uuid));
+				events->pse_paused = true;
+			} else {
+				/*
+				 * There are pending events; do not pause the
+				 * handling.
+				 */
+				rc = merge_event_sets(event_set, events->pse_pending);
+				if (rc != 0)
+					DL_ERROR(rc, DF_UUID ": failed to merge events",
+						 DP_UUID(svc->ps_uuid));
+				free_event_set(&events->pse_pending);
+			}
+			events->pse_pending = event_set;
+			event_set = NULL;
+			ABT_mutex_unlock(events->pse_mutex);
+		}
 
-		D_FREE(event);
+		if (event_set != NULL)
+			free_event_set(&event_set);
 		ABT_thread_yield();
 	}
 
@@ -1437,7 +1645,7 @@ events_handler(void *arg)
 static bool
 events_pending(struct pool_svc *svc)
 {
-	return !d_list_empty(&svc->ps_events.pse_queue);
+	return svc->ps_events.pse_pending != NULL;
 }
 
 static void
@@ -1461,9 +1669,11 @@ init_events(struct pool_svc *svc)
 	struct pool_svc_events *events = &svc->ps_events;
 	int			rc;
 
-	D_ASSERT(d_list_empty(&events->pse_queue));
+	D_ASSERT(events->pse_pending == NULL);
+	D_ASSERT(events->pse_timer == NULL);
 	D_ASSERT(events->pse_handler == ABT_THREAD_NULL);
-	D_ASSERT(events->pse_stop == false);
+	D_ASSERT(!events->pse_stop);
+	D_ASSERT(!events->pse_paused);
 
 	if (!ds_pool_skip_for_check(svc->ps_pool)) {
 		rc = crt_register_event_cb(ds_pool_crt_event_cb, svc);
@@ -1498,7 +1708,8 @@ init_events(struct pool_svc *svc)
 err_cb:
 	if (!ds_pool_skip_for_check(svc->ps_pool))
 		crt_unregister_event_cb(ds_pool_crt_event_cb, svc);
-	discard_events(&events->pse_queue);
+	if (events->pse_pending != NULL)
+		free_event_set(&events->pse_pending);
 err:
 	return rc;
 }
@@ -1507,7 +1718,6 @@ static void
 fini_events(struct pool_svc *svc)
 {
 	struct pool_svc_events *events = &svc->ps_events;
-	int			rc;
 
 	D_ASSERT(events->pse_handler != ABT_THREAD_NULL);
 
@@ -1519,8 +1729,6 @@ fini_events(struct pool_svc *svc)
 	ABT_cond_broadcast(events->pse_cv);
 	ABT_mutex_unlock(events->pse_mutex);
 
-	rc = ABT_thread_join(events->pse_handler);
-	D_ASSERTF(rc == 0, DF_RC"\n", DP_RC(rc));
 	ABT_thread_free(&events->pse_handler);
 	events->pse_handler = ABT_THREAD_NULL;
 	events->pse_stop = false;
@@ -4332,7 +4540,7 @@ ds_pool_svc_list_cont(uuid_t uuid, d_rank_list_t *ranks,
 		list_cont_bulk_destroy(bulk);
 		D_FREE(resp_cont);
 		crt_req_decref(rpc);
-		dss_sleep(RECHOOSE_SLEEP_MS);
+		dss_sleep(250);
 		D_GOTO(rechoose, rc);
 	}
 
@@ -6501,7 +6709,7 @@ pool_svc_schedule_reconf(struct pool_svc *svc, struct pool_map *map, uint32_t ma
 }
 
 static int
-pool_map_crit_prompt(struct pool_svc *svc, struct pool_map *map, d_rank_t rank)
+pool_map_crit_prompt(struct pool_svc *svc, struct pool_map *map)
 {
 	crt_group_t		*primary_grp;
 	struct pool_domain	*doms;
@@ -6517,13 +6725,10 @@ pool_map_crit_prompt(struct pool_svc *svc, struct pool_map *map, d_rank_t rank)
 
 	D_CRIT("!!! Please try to recover these engines in top priority -\n");
 	D_CRIT("!!! Please refer \"Pool-Wise Redundancy Factor\" section in pool_operations.md\n");
-	D_CRIT("!!! pool "DF_UUID": intolerable unavailability: engine rank %u\n",
-	       DP_UUID(svc->ps_uuid), rank);
 	for (i = 0; i < doms_cnt; i++) {
 		struct swim_member_state state;
 
-		if (!(doms[i].do_comp.co_status & PO_COMP_ST_UPIN) ||
-		    (doms[i].do_comp.co_rank == rank))
+		if (!(doms[i].do_comp.co_status & PO_COMP_ST_UPIN))
 			continue;
 
 		rc = crt_rank_state_get(primary_grp, doms[i].do_comp.co_rank, &state);
@@ -6701,8 +6906,7 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc,
 	 * with CRIT log message to ask administrator to bring back the engine.
 	 */
 	if (src == MUS_SWIM && opc == MAP_EXCLUDE) {
-		d_rank_t	rank;
-		int		failed_cnt;
+		int failed_cnt;
 
 		rc = pool_map_update_failed_cnt(map);
 		if (rc != 0) {
@@ -6711,15 +6915,19 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc,
 			goto out_map;
 		}
 
-		D_ASSERT(tgt_addrs->pta_number == 1);
-		rank = tgt_addrs->pta_addrs->pta_rank;
 		failed_cnt = pool_map_get_failed_cnt(map, PO_COMP_TP_NODE);
-		D_INFO(DF_UUID": SWIM exclude rank %d, failed NODE %d\n",
-		       DP_UUID(svc->ps_uuid), rank, failed_cnt);
+		D_INFO(DF_UUID": SWIM exclude %d ranks, failed NODE %d\n",
+		       DP_UUID(svc->ps_uuid), tgt_addrs->pta_number, failed_cnt);
 		if (failed_cnt > pw_rf) {
-			D_CRIT(DF_UUID": exclude rank %d will break pw_rf %d, failed_cnt %d\n",
-			       DP_UUID(svc->ps_uuid), rank, pw_rf, failed_cnt);
-			rc = pool_map_crit_prompt(svc, map, rank);
+			D_CRIT(DF_UUID": exclude %d ranks will break pool RF %d, failed_cnt %d\n",
+			       DP_UUID(svc->ps_uuid), tgt_addrs->pta_number, pw_rf, failed_cnt);
+			ABT_rwlock_rdlock(svc->ps_pool->sp_lock);
+			rc = pool_map_crit_prompt(svc, svc->ps_pool->sp_map);
+			ABT_rwlock_unlock(svc->ps_pool->sp_lock);
+			if (rc != 0)
+				DL_ERROR(rc, DF_UUID ": failed to log prompt",
+					 DP_UUID(svc->ps_uuid));
+			rc = -DER_RF;
 			goto out_map;
 		}
 	}
@@ -6768,6 +6976,9 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc,
 
 	ds_rsvc_request_map_dist(&svc->ps_rsvc);
 
+	/* See events_handler. */
+	resume_event_handling(svc);
+
 	rc = pool_svc_schedule_reconf(svc, NULL /* map */, map_version, false /* sync_remove */);
 	if (rc != 0) {
 		DL_INFO(rc, DF_UUID": failed to schedule pool service reconfiguration",
@@ -7167,28 +7378,51 @@ ds_pool_update_handler_v5(crt_rpc_t *rpc)
 }
 
 static int
-pool_svc_exclude_rank(struct pool_svc *svc, d_rank_t rank)
+pool_svc_exclude_ranks(struct pool_svc *svc, struct pool_svc_event_set *event_set)
 {
 	struct pool_target_addr_list	list;
 	struct pool_target_addr_list	inval_list_out = { 0 };
-	struct pool_target_addr		tgt_rank;
+	struct pool_target_addr	       *addrs;
+	d_rank_t			self_rank = dss_self_rank();
 	uint32_t			map_version = 0;
+	int				n = 0;
+	int				i;
 	int				rc;
 
-	tgt_rank.pta_rank = rank;
-	tgt_rank.pta_target = -1;
-	list.pta_number = 1;
-	list.pta_addrs = &tgt_rank;
+	D_ALLOC_ARRAY(addrs, event_set->pss_len);
+	if (addrs == NULL)
+		return -DER_NOMEM;
+	for (i = 0; i < event_set->pss_len; i++) {
+		struct pool_svc_event *event = &event_set->pss_buf[i];
+
+		if (event->psv_type != CRT_EVT_DEAD)
+			continue;
+		if (event->psv_src == CRT_EVS_GRPMOD && event->psv_rank == self_rank) {
+			D_DEBUG(DB_MD, DF_UUID ": ignore exclusion of self\n",
+				DP_UUID(svc->ps_uuid));
+			continue;
+		}
+		addrs[n].pta_rank   = event->psv_rank;
+		addrs[n].pta_target = -1;
+		n++;
+	}
+	if (n == 0) {
+		rc = 0;
+		goto out;
+	}
+	list.pta_number = n;
+	list.pta_addrs  = addrs;
 
 	rc = pool_svc_update_map(svc, pool_opc_2map_opc(POOL_EXCLUDE), true /* exclude_rank */,
 				 NULL, NULL, 0, &list, &inval_list_out, &map_version,
 				 NULL /* hint */, MUS_SWIM);
 
-	D_DEBUG(DB_MD, "Exclude pool "DF_UUID"/%u rank %u: rc %d\n",
-		DP_UUID(svc->ps_uuid), map_version, rank, rc);
+	D_DEBUG(DB_MD, "Exclude pool "DF_UUID"/%u ranks %u: rc %d\n",
+		DP_UUID(svc->ps_uuid), map_version, n, rc);
 
 	pool_target_addr_list_free(&inval_list_out);
-
+out:
+	D_FREE(addrs);
 	return rc;
 }
 
diff --git a/src/tests/ftest/util/server_utils_params.py b/src/tests/ftest/util/server_utils_params.py
index 440ffe68f82..7cda958d242 100644
--- a/src/tests/ftest/util/server_utils_params.py
+++ b/src/tests/ftest/util/server_utils_params.py
@@ -435,6 +435,7 @@ class EngineYamlParameters(YamlParameters):
         "common": [
             "D_LOG_FILE_APPEND_PID=1",
             "DAOS_POOL_RF=4",
+            "CRT_EVENT_DELAY=1",
             "COVFILE=/tmp/test.cov"],
         "ofi+tcp": [],
         "ofi+tcp;ofi_rxm": [],

From e0f58831ef4b7de01bc0abb4be85d0635a85f3cd Mon Sep 17 00:00:00 2001
From: Dalton Bohning <dalton.bohning@intel.com>
Date: Fri, 18 Oct 2024 06:48:22 -0700
Subject: [PATCH 06/34] DAOS-16720 cq: pin isort to v1.1.0 (#15338) (#15339)

Pin isort to v1.1.0 to avoid suprprise changes and because v1.1.1 is not
working for us.

Signed-off-by: Dalton Bohning <dalton.bohning@intel.com>
---
 .github/workflows/linting.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
index 5fce435168a..67c57bd3154 100644
--- a/.github/workflows/linting.yml
+++ b/.github/workflows/linting.yml
@@ -26,7 +26,7 @@ jobs:
       - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d  # v5.1.0
         with:
           python-version: '3'
-      - uses: isort/isort-action@master
+      - uses: isort/isort-action@f14e57e1d457956c45a19c05a89cccdf087846e5  # v1.1.0
         with:
           requirementsFiles: "requirements.txt"
       - name: Run on SConstruct file.

From cb9d278be5b6730f294d65b74db17e4a948a9a23 Mon Sep 17 00:00:00 2001
From: Ken Cain <kenneth.c.cain@intel.com>
Date: Sat, 19 Oct 2024 02:53:37 -0400
Subject: [PATCH 07/34] DAOS-15852 test: more timing samples for
 co_op_dup_timing() (#14497) (#15324)

Rarely, this test will produce timings that exceed the failure
threshold. Local and PR/CI experiments have shown that increasing
the test's NUM_OPS to more than 200 iterations greatly reduces
or may eliminate such intermittent timing failures, by "spreading out"
the magnitude of the time spent in the 3 main loops of the test
(0% loops perform fault injections, 33%, and 50%).


Signed-off-by: Kenneth Cain <kenneth.c.cain@intel.com>
---
 src/tests/suite/daos_container.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tests/suite/daos_container.c b/src/tests/suite/daos_container.c
index 958f9fd1d16..243a829c317 100644
--- a/src/tests/suite/daos_container.c
+++ b/src/tests/suite/daos_container.c
@@ -3835,7 +3835,7 @@ co_op_dup_timing(void **state)
 	size_t const       in_sizes[]        = {strlen(in_values[0]), strlen(in_values[1])};
 	int                n                 = (int)ARRAY_SIZE(names);
 	const unsigned int NUM_FP            = 3;
-	const uint32_t     NUM_OPS           = 200;
+	const uint32_t     NUM_OPS           = 500;
 	uint32_t           num_failures      = 0;
 	const uint32_t     SVC_OPS_ENABLED   = 1;
 	const uint32_t     SVC_OPS_ENTRY_AGE = 60;

From f8682fb56bc0d7928eb807f45599a5d9e9ada5de Mon Sep 17 00:00:00 2001
From: Nasf-Fan <fan.yong@intel.com>
Date: Sun, 20 Oct 2024 21:55:19 +0800
Subject: [PATCH 08/34] DAOS-16572 rebuild: properly assign
 global_dtx_resync_version in IV - b26 (#15186)

In rebuild_iv_ent_refresh() for refreshing DTX resync version, needs
to assign rt_global_dtx_resync_version firstly before wakeup related
rebuild_scan_leader.

Signed-off-by: Fan Yong <fan.yong@intel.com>
---
 src/rebuild/rebuild_iv.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/rebuild/rebuild_iv.c b/src/rebuild/rebuild_iv.c
index cc585e037e1..f4b9f75f407 100644
--- a/src/rebuild/rebuild_iv.c
+++ b/src/rebuild/rebuild_iv.c
@@ -167,6 +167,7 @@ rebuild_iv_ent_refresh(struct ds_iv_entry *entry, struct ds_iv_key *key,
 	struct rebuild_tgt_pool_tracker *rpt;
 	struct rebuild_iv *dst_iv = entry->iv_value.sg_iovs[0].iov_buf;
 	struct rebuild_iv *src_iv = src->sg_iovs[0].iov_buf;
+	uint32_t old_ver;
 	int rc = 0;
 
 	rpt = rpt_lookup(src_iv->riv_pool_uuid, -1, src_iv->riv_ver,
@@ -200,16 +201,18 @@ rebuild_iv_ent_refresh(struct ds_iv_entry *entry, struct ds_iv_key *key,
 			       dst_iv->riv_stable_epoch);
 		rpt->rt_global_done = dst_iv->riv_global_done;
 		rpt->rt_global_scan_done = dst_iv->riv_global_scan_done;
-		if (rpt->rt_global_dtx_resync_version < rpt->rt_rebuild_ver &&
+		old_ver = rpt->rt_global_dtx_resync_version;
+		if (rpt->rt_global_dtx_resync_version < dst_iv->riv_global_dtx_resyc_version)
+			rpt->rt_global_dtx_resync_version = dst_iv->riv_global_dtx_resyc_version;
+		if (old_ver < rpt->rt_rebuild_ver &&
 		    dst_iv->riv_global_dtx_resyc_version >= rpt->rt_rebuild_ver) {
 			D_INFO(DF_UUID " global/iv/rebuild_ver %u/%u/%u signal wait cond\n",
-			       DP_UUID(src_iv->riv_pool_uuid), rpt->rt_global_dtx_resync_version,
+			       DP_UUID(src_iv->riv_pool_uuid), old_ver,
 			       dst_iv->riv_global_dtx_resyc_version, rpt->rt_rebuild_ver);
 			ABT_mutex_lock(rpt->rt_lock);
 			ABT_cond_signal(rpt->rt_global_dtx_wait_cond);
 			ABT_mutex_unlock(rpt->rt_lock);
 		}
-		rpt->rt_global_dtx_resync_version = dst_iv->riv_global_dtx_resyc_version;
 	}
 
 out:

From 81e57d0ec818c1f117afa97418d1f126bc528952 Mon Sep 17 00:00:00 2001
From: Jeff Olivier <jeffolivier@google.com>
Date: Mon, 21 Oct 2024 10:10:03 -0600
Subject: [PATCH 09/34] DAOS-16716 ci: Set reference build for PRs (#15337)

Release branch PRs should use the release branch build
instead of master branch build for NLT reference

Signed-off-by: Jeff Olivier <jeffolivier@google.com>
---
 Jenkinsfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Jenkinsfile b/Jenkinsfile
index 63c2dc28ae1..13ccc512895 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -801,6 +801,7 @@ pipeline {
                             unitTestPost artifacts: ['nlt_logs/'],
                                          testResults: 'nlt-junit.xml',
                                          always_script: 'ci/unit/test_nlt_post.sh',
+					 referenceJobName: 'daos-stack/daos/release%252F2.6',
                                          valgrind_stash: 'el8-gcc-nlt-memcheck'
                             recordIssues enabledForFailure: true,
                                          failOnError: false,

From c8213799c4c14ad23ab6ef87c386b5e1d16ec089 Mon Sep 17 00:00:00 2001
From: Nasf-Fan <fan.yong@intel.com>
Date: Tue, 22 Oct 2024 02:14:44 +0800
Subject: [PATCH 10/34] DAOS-16329 chk: maintenance mode after checking pool
 with dryrun - b26 (#14985)

Sometimes, after system shutdown unexpectedly, the users may expect
to check their critical data under some kind of maintenance mode.
Under such mode, no user data can be modified or moved or aggregated.
That will guarantee no further potential (DAOS logic caused) damage
can happen during the check.

For such purpose, we will enhance current DAOS CR logic with --dryrun
option to allow the pool (after check) to be opened as immutable with
disabling some mechanism that may potentially cause data modification
or movement (such as rebuild or aggregation).

Under such mode, if client wants to connect to the pool, the read-only
option must be specified. Similarly for opening container in such pool.

Signed-off-by: Fan Yong <fan.yong@intel.com>
---
 src/chk/chk_common.c            |   2 +-
 src/chk/chk_engine.c            |  13 +--
 src/chk/chk_internal.h          |   1 +
 src/client/dfs/cont.c           |   4 +-
 src/container/srv_container.c   |  32 +++++--
 src/container/srv_target.c      |  10 ++-
 src/include/daos/container.h    |   4 +-
 src/include/daos_cont.h         |   3 +
 src/include/daos_srv/pool.h     |   7 +-
 src/include/daos_srv/security.h |  15 +++-
 src/mgmt/srv_target.c           |   2 +-
 src/pool/srv_internal.h         |   6 ++
 src/pool/srv_pool.c             |  82 +++++++++++++-----
 src/pool/srv_target.c           |  16 ++--
 src/rebuild/srv.c               |   2 +-
 src/security/srv_acl.c          |   8 +-
 src/tests/suite/daos_cr.c       | 144 ++++++++++++++++++++++++++++++++
 17 files changed, 297 insertions(+), 54 deletions(-)

diff --git a/src/chk/chk_common.c b/src/chk/chk_common.c
index ace1c736791..39821ec6328 100644
--- a/src/chk/chk_common.c
+++ b/src/chk/chk_common.c
@@ -403,7 +403,7 @@ chk_pool_restart_svc(struct chk_pool_rec *cpr)
 		if (cpr->cpr_started)
 			chk_pool_shutdown(cpr, true);
 
-		rc = ds_pool_start_after_check(cpr->cpr_uuid);
+		rc = ds_pool_start_after_check(cpr->cpr_uuid, cpr->cpr_immutable);
 		if (rc != 0) {
 			D_WARN("Cannot start full PS for "DF_UUIDF" after CR check: "DF_RC"\n",
 			       DP_UUID(cpr->cpr_uuid), DP_RC(rc));
diff --git a/src/chk/chk_engine.c b/src/chk/chk_engine.c
index 56e6da3ad9b..ad61af851ce 100644
--- a/src/chk/chk_engine.c
+++ b/src/chk/chk_engine.c
@@ -1797,10 +1797,8 @@ chk_engine_pool_ult(void *args)
 	}
 
 	rc = chk_engine_cont_cleanup(cpr, svc, &aggregator);
-	if (rc != 0)
-		goto out;
-
-	rc = ds_pool_svc_schedule_reconf(svc);
+	if (rc == 0 && !cpr->cpr_immutable)
+		rc = ds_pool_svc_schedule_reconf(svc);
 
 out:
 	chk_engine_cont_list_fini(&aggregator);
@@ -2113,6 +2111,11 @@ chk_engine_start_post(struct chk_instance *ins)
 		if (pool_cbk->cb_phase == CHK__CHECK_SCAN_PHASE__CSP_DONE)
 			continue;
 
+		if (ins->ci_prop.cp_flags & CHK__CHECK_FLAG__CF_DRYRUN)
+			cpr->cpr_immutable = 1;
+		else
+			cpr->cpr_immutable = 0;
+
 		if (phase > pool_cbk->cb_phase)
 			phase = pool_cbk->cb_phase;
 
@@ -2950,7 +2953,7 @@ chk_engine_pool_start(uint64_t gen, uuid_t uuid, uint32_t phase, uint32_t flags)
 	cbk = &cpr->cpr_bk;
 	chk_pool_get(cpr);
 
-	rc = ds_pool_start(uuid, false);
+	rc = ds_pool_start(uuid, false, cpr->cpr_immutable);
 	if (rc != 0)
 		D_GOTO(put, rc = (rc == -DER_NONEXIST ? 1 : rc));
 
diff --git a/src/chk/chk_internal.h b/src/chk/chk_internal.h
index 86868e305ee..9ab16b060b3 100644
--- a/src/chk/chk_internal.h
+++ b/src/chk/chk_internal.h
@@ -596,6 +596,7 @@ struct chk_pool_rec {
 				 cpr_stop:1,
 				 cpr_done:1,
 				 cpr_skip:1,
+				 cpr_immutable:1,
 				 cpr_dangling:1,
 				 cpr_for_orphan:1,
 				 cpr_notified_exit:1,
diff --git a/src/client/dfs/cont.c b/src/client/dfs/cont.c
index 910e1819aa3..27822914056 100644
--- a/src/client/dfs/cont.c
+++ b/src/client/dfs/cont.c
@@ -970,7 +970,9 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
 out_snap:
 	D_FREE(oit_args);
 	epr.epr_hi = epr.epr_lo = snap_epoch;
-	rc2                     = daos_cont_destroy_snap(coh, epr, NULL);
+	rc2 = daos_cont_destroy_snap(coh, epr, NULL);
+	if (rc2 != 0)
+		D_ERROR("Failed to destroy OID table: " DF_RC "\n", DP_RC(rc2));
 	if (rc == 0)
 		rc = daos_der2errno(rc2);
 out_dfs:
diff --git a/src/container/srv_container.c b/src/container/srv_container.c
index 372da43afe4..91d87d9b978 100644
--- a/src/container/srv_container.c
+++ b/src/container/srv_container.c
@@ -1555,9 +1555,9 @@ cont_destroy(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont,
 	 * - Users who can delete any container in the pool
 	 * - Users who have been given access to delete the specific container
 	 */
-	if (!ds_sec_pool_can_delete_cont(pool_hdl->sph_sec_capas) &&
-	    !ds_sec_cont_can_delete(pool_hdl->sph_flags, &pool_hdl->sph_cred,
-				    &owner, acl)) {
+	if (pool_hdl->sph_pool->sp_immutable ||
+	    (!ds_sec_pool_can_delete_cont(pool_hdl->sph_sec_capas) &&
+	     !ds_sec_cont_can_delete(pool_hdl->sph_flags, &pool_hdl->sph_cred, &owner, acl))) {
 		D_ERROR(DF_CONT": permission denied to delete cont\n",
 			DP_CONT(pool_hdl->sph_pool->sp_uuid, cont->c_uuid));
 		D_GOTO(out_prop, rc = -DER_NO_PERM);
@@ -2253,6 +2253,15 @@ cont_open(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, cr
 		goto out;
 	}
 
+	if (pool_hdl->sph_pool->sp_immutable && (flags & DAOS_COO_IO_BASE_MASK) != DAOS_COO_RO) {
+		rc = -DER_NO_PERM;
+		D_ERROR(DF_UUID "/" DF_UUID "/" DF_UUID ": failed to open the immutable "
+			"container with flags " DF_X64 ", sec_capas " DF_X64 ": " DF_RC "\n",
+			DP_UUID(cont->c_svc->cs_pool_uuid), DP_UUID(pool_hdl->sph_uuid),
+			DP_UUID(cont->c_uuid), flags, pool_hdl->sph_sec_capas, DP_RC(rc));
+		goto out;
+	}
+
 	/*
 	 * Need props to check for pool redundancy requirements and access
 	 * control.
@@ -2274,6 +2283,11 @@ cont_open(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, cr
 		D_GOTO(out, rc);
 	}
 
+	D_DEBUG(DB_MD, DF_UUID "/" DF_UUID "/" DF_UUID ": opening container with flags "
+		DF_X64", sec_capas " DF_X64 "/" DF_X64 "\n",
+		DP_UUID(cont->c_svc->cs_pool_uuid), DP_UUID(pool_hdl->sph_uuid),
+		DP_UUID(cont->c_uuid), flags, pool_hdl->sph_sec_capas, sec_capas);
+
 	if ((flags & DAOS_COO_EVICT_ALL) && !ds_sec_cont_can_evict_all(sec_capas)) {
 		D_ERROR(DF_CONT": permission denied evicting all handles\n",
 			DP_CONT(cont->c_svc->cs_pool_uuid, cont->c_uuid));
@@ -2282,11 +2296,15 @@ cont_open(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, cr
 		goto out;
 	}
 
-	if ((flags & DAOS_COO_EX) && !ds_sec_cont_can_open_ex(sec_capas)) {
-		D_ERROR(DF_CONT": permission denied opening exclusively\n",
-			DP_CONT(cont->c_svc->cs_pool_uuid, cont->c_uuid));
-		daos_prop_free(prop);
+	if (((flags & DAOS_COO_EX) && !ds_sec_cont_can_open_ex(sec_capas)) ||
+	    ((flags & DAOS_COO_RW) && !ds_sec_cont_can_modify(sec_capas))) {
 		rc = -DER_NO_PERM;
+		D_ERROR(DF_UUID "/" DF_UUID "/" DF_UUID ": failed to open the container "
+			"with flags " DF_X64 ", capas " DF_X64 "/" DF_X64 ": " DF_RC "\n",
+			DP_UUID(cont->c_svc->cs_pool_uuid), DP_UUID(pool_hdl->sph_uuid),
+			DP_UUID(cont->c_uuid), flags, pool_hdl->sph_sec_capas, sec_capas,
+			DP_RC(rc));
+		daos_prop_free(prop);
 		goto out;
 	}
 
diff --git a/src/container/srv_target.c b/src/container/srv_target.c
index b0f3b693580..a6d3d3c5286 100644
--- a/src/container/srv_target.c
+++ b/src/container/srv_target.c
@@ -935,7 +935,7 @@ cont_child_start(struct ds_pool_child *pool_child, const uuid_t co_uuid,
 			cont_child->sc_stopping, cont_child->sc_destroying);
 		rc = -DER_SHUTDOWN;
 	} else if (!cont_child_started(cont_child)) {
-		if (!ds_pool_skip_for_check(pool_child->spc_pool)) {
+		if (!ds_pool_restricted(pool_child->spc_pool, false)) {
 			rc = cont_start_agg(cont_child);
 			if (rc != 0)
 				goto out;
@@ -1591,11 +1591,15 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid,
 		 *     but for creating rebuild global container handle.
 		 */
 		D_ASSERT(hdl->sch_cont != NULL);
+		D_ASSERT(hdl->sch_cont->sc_pool != NULL);
 		hdl->sch_cont->sc_open++;
 
 		if (hdl->sch_cont->sc_open > 1)
 			goto opened;
 
+		if (ds_pool_restricted(hdl->sch_cont->sc_pool->spc_pool, false))
+			goto csum_init;
+
 		rc = dtx_cont_open(hdl->sch_cont);
 		if (rc != 0) {
 			D_ASSERTF(hdl->sch_cont->sc_open == 1, "Unexpected open count for cont "
@@ -1623,10 +1627,8 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid,
 			D_GOTO(err_dtx, rc);
 		}
 
-		D_ASSERT(hdl->sch_cont != NULL);
-		D_ASSERT(hdl->sch_cont->sc_pool != NULL);
+csum_init:
 		rc = ds_cont_csummer_init(hdl->sch_cont);
-
 		if (rc != 0)
 			D_GOTO(err_dtx, rc);
 	}
diff --git a/src/include/daos/container.h b/src/include/daos/container.h
index 282dfb8ec49..57ef06f63a2 100644
--- a/src/include/daos/container.h
+++ b/src/include/daos/container.h
@@ -1,5 +1,5 @@
 /**
- * (C) Copyright 2016-2023 Intel Corporation.
+ * (C) Copyright 2016-2024 Intel Corporation.
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -130,7 +130,7 @@ dc_cont_open_flags_valid(uint64_t flags)
 	f = flags;
 
 	/* One and only one of DAOS_COO_RO, DAOS_COO_RW, and DAOS_COO_EX. */
-	m = f & (DAOS_COO_RO | DAOS_COO_RW | DAOS_COO_EX);
+	m = f & DAOS_COO_IO_BASE_MASK;
 	if (m != DAOS_COO_RO && m != DAOS_COO_RW && m != DAOS_COO_EX)
 		return false;
 
diff --git a/src/include/daos_cont.h b/src/include/daos_cont.h
index c3bda9837d8..632eafe8130 100644
--- a/src/include/daos_cont.h
+++ b/src/include/daos_cont.h
@@ -79,6 +79,9 @@ extern "C" {
 /** Mask for all of the bits in the container open mode flag, DAOS_COO_ bits */
 #define DAOS_COO_MASK	((1U << DAOS_COO_NBITS) - 1)
 
+/** The basic IO mode: read-only, read-write or exclusively read-write. */
+#define DAOS_COO_IO_BASE_MASK   (DAOS_COO_RO | DAOS_COO_RW | DAOS_COO_EX)
+
 /** Maximum length for container hints */
 #define DAOS_CONT_HINT_MAX_LEN	128
 
diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h
index 6cbe3873f0a..de22d55ed5d 100644
--- a/src/include/daos_srv/pool.h
+++ b/src/include/daos_srv/pool.h
@@ -84,6 +84,7 @@ struct ds_pool {
 	uuid_t			sp_srv_pool_hdl;
 	uint32_t		sp_stopping:1,
 				sp_cr_checked:1,
+				sp_immutable:1,
 				sp_fetch_hdls:1,
 				sp_need_discard:1,
 				sp_disable_rebuild:1;
@@ -277,9 +278,9 @@ int ds_pool_tgt_finish_rebuild(uuid_t pool_uuid, struct pool_target_id_list *lis
 int ds_pool_tgt_map_update(struct ds_pool *pool, struct pool_buf *buf,
 			   unsigned int map_version);
 
-bool ds_pool_skip_for_check(struct ds_pool *pool);
-int ds_pool_start_after_check(uuid_t uuid);
-int ds_pool_start(uuid_t uuid, bool aft_chk);
+bool ds_pool_restricted(struct ds_pool *pool, bool immutable);
+int ds_pool_start_after_check(uuid_t uuid, bool immutable);
+int ds_pool_start(uuid_t uuid, bool aft_chk, bool immutable);
 int ds_pool_stop(uuid_t uuid);
 int dsc_pool_svc_extend(uuid_t pool_uuid, d_rank_list_t *svc_ranks, uint64_t deadline, int ntargets,
 			const d_rank_list_t *rank_list, int ndomains, const uint32_t *domains);
diff --git a/src/include/daos_srv/security.h b/src/include/daos_srv/security.h
index 203b9b548ee..28fe83852b3 100644
--- a/src/include/daos_srv/security.h
+++ b/src/include/daos_srv/security.h
@@ -1,5 +1,5 @@
 /*
- * (C) Copyright 2019-2023 Intel Corporation.
+ * (C) Copyright 2019-2024 Intel Corporation.
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -304,6 +304,19 @@ ds_sec_cont_can_open_ex(uint64_t cont_capas);
 bool
 ds_sec_cont_can_evict_all(uint64_t cont_capas);
 
+/**
+ * Determine if the container can be modified based on the container security
+ * capabilities.
+ *
+ * \param[in]	cont_capas	Capability bits acquired via
+ *				ds_sec_cont_get_capabilities
+ *
+ * \return	True		Access allowed
+ *		False		Access denied
+ */
+bool
+ds_sec_cont_can_modify(uint64_t cont_capas);
+
 /**
  * Get the security capabilities for a rebuild container handle created by the
  * DAOS server.
diff --git a/src/mgmt/srv_target.c b/src/mgmt/srv_target.c
index e1d6ee79bce..7975a2115d4 100644
--- a/src/mgmt/srv_target.c
+++ b/src/mgmt/srv_target.c
@@ -1212,7 +1212,7 @@ ds_mgmt_hdlr_tgt_create(crt_rpc_t *tc_req)
 	tc_out->tc_ranks.ca_arrays = rank;
 	tc_out->tc_ranks.ca_count  = 1;
 
-	rc = ds_pool_start(tc_in->tc_pool_uuid, false);
+	rc = ds_pool_start(tc_in->tc_pool_uuid, false, false);
 	if (rc) {
 		D_ERROR(DF_UUID": failed to start pool: "DF_RC"\n",
 			DP_UUID(tc_in->tc_pool_uuid), DP_RC(rc));
diff --git a/src/pool/srv_internal.h b/src/pool/srv_internal.h
index 8f864c8c11a..612f7760fd1 100644
--- a/src/pool/srv_internal.h
+++ b/src/pool/srv_internal.h
@@ -57,6 +57,12 @@ pool_tls_get()
 	return tls;
 }
 
+static inline bool
+ds_pool_skip_for_check(struct ds_pool *pool)
+{
+	return engine_in_check() && !pool->sp_cr_checked;
+}
+
 struct pool_iv_map {
 	d_rank_t	piv_master_rank;
 	uint32_t	piv_pool_map_ver;
diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c
index 2d80088d4b6..6e3a01379fa 100644
--- a/src/pool/srv_pool.c
+++ b/src/pool/srv_pool.c
@@ -1675,7 +1675,7 @@ init_events(struct pool_svc *svc)
 	D_ASSERT(!events->pse_stop);
 	D_ASSERT(!events->pse_paused);
 
-	if (!ds_pool_skip_for_check(svc->ps_pool)) {
+	if (!ds_pool_restricted(svc->ps_pool, false)) {
 		rc = crt_register_event_cb(ds_pool_crt_event_cb, svc);
 		if (rc != 0) {
 			D_ERROR(DF_UUID": failed to register event callback: "DF_RC"\n",
@@ -1706,7 +1706,7 @@ init_events(struct pool_svc *svc)
 	return 0;
 
 err_cb:
-	if (!ds_pool_skip_for_check(svc->ps_pool))
+	if (!ds_pool_restricted(svc->ps_pool, false))
 		crt_unregister_event_cb(ds_pool_crt_event_cb, svc);
 	if (events->pse_pending != NULL)
 		free_event_set(&events->pse_pending);
@@ -1721,7 +1721,7 @@ fini_events(struct pool_svc *svc)
 
 	D_ASSERT(events->pse_handler != ABT_THREAD_NULL);
 
-	if (!ds_pool_skip_for_check(svc->ps_pool))
+	if (!ds_pool_restricted(svc->ps_pool, false))
 		crt_unregister_event_cb(ds_pool_crt_event_cb, svc);
 
 	ABT_mutex_lock(events->pse_mutex);
@@ -2565,6 +2565,11 @@ int ds_pool_failed_lookup(uuid_t uuid)
 	return 0;
 }
 
+struct pool_start_args {
+	bool	psa_aft_chk;
+	bool	psa_immutable;
+};
+
 /*
  * Try to start the pool. Continue the iteration upon errors as other pools may
  * still be able to work.
@@ -2572,13 +2577,26 @@ int ds_pool_failed_lookup(uuid_t uuid)
 static int
 start_one(uuid_t uuid, void *varg)
 {
-	int rc;
+	struct pool_start_args	*psa = varg;
+	bool			 aft_chk;
+	bool			 immutable;
+	int			 rc;
+
+	if (psa != NULL) {
+		aft_chk = psa->psa_aft_chk;
+		immutable = psa->psa_immutable;
+	} else {
+		aft_chk = false;
+		immutable = false;
+	}
 
-	D_DEBUG(DB_MD, DF_UUID ": starting pool\n", DP_UUID(uuid));
+	D_DEBUG(DB_MD, DF_UUID ": starting pool, aft_chk %s, immutable %s\n",
+		DP_UUID(uuid), aft_chk ? "yes" : "no", immutable ? "yes" : "no");
 
-	rc = ds_pool_start(uuid, varg != NULL ? true : false);
+	rc = ds_pool_start(uuid, aft_chk, immutable);
 	if (rc != 0) {
-		DL_ERROR(rc, DF_UUID ": failed to start pool", DP_UUID(uuid));
+		DL_ERROR(rc, DF_UUID ": failed to start pool, aft_chk %s, immutable %s",
+			 DP_UUID(uuid), aft_chk ? "yes" : "no", immutable ? "yes" : "no");
 		ds_pool_failed_add(uuid, rc);
 	}
 
@@ -2597,12 +2615,27 @@ pool_start_all(void *arg)
 			DP_RC(rc));
 }
 
+bool
+ds_pool_restricted(struct ds_pool *pool, bool immutable)
+{
+	if (ds_pool_skip_for_check(pool))
+		return true;
+
+	if (pool->sp_immutable && !immutable)
+		return true;
+
+	return false;
+}
+
 int
-ds_pool_start_after_check(uuid_t uuid)
+ds_pool_start_after_check(uuid_t uuid, bool immutable)
 {
-	bool	aft_chk = true;
+	struct pool_start_args	psa;
 
-	return start_one(uuid, &aft_chk);
+	psa.psa_aft_chk = true;
+	psa.psa_immutable = immutable;
+
+	return start_one(uuid, &psa);
 }
 
 /* Note that this function is currently called from the main xstream. */
@@ -3820,7 +3853,6 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version)
 			   &ds_pool_prop_connectable, &value);
 	if (rc != 0)
 		goto out_lock;
-	D_DEBUG(DB_MD, DF_UUID ": connectable=%u\n", DP_UUID(in->pci_op.pi_uuid), connectable);
 	if (!connectable) {
 		D_ERROR(DF_UUID": being destroyed, not accepting connections\n",
 			DP_UUID(in->pci_op.pi_uuid));
@@ -3829,12 +3861,21 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version)
 
 	/*
 	 * NOTE: Under check mode, there is a small race window between ds_pool_mark_connectable()
-	 *	 the PS restart for full service. If some client tries to connect the pool during
+	 *	 and PS restart with full service. If some client tries to connect the pool during
 	 *	 such internal, it will get -DER_BUSY temporarily.
 	 */
 	if (unlikely(ds_pool_skip_for_check(svc->ps_pool))) {
-		D_ERROR(DF_UUID" is not ready for full pool service\n", DP_UUID(in->pci_op.pi_uuid));
-		D_GOTO(out_lock, rc = -DER_BUSY);
+		rc = -DER_BUSY;
+		D_ERROR(DF_UUID " is not ready for full pool service: " DF_RC "\n",
+			DP_UUID(in->pci_op.pi_uuid), DP_RC(rc));
+		goto out_lock;
+	}
+
+	if (svc->ps_pool->sp_immutable && flags != DAOS_PC_RO) {
+		rc = -DER_NO_PERM;
+		D_ERROR(DF_UUID " failed to connect immutable pool, flags " DF_X64 ": " DF_RC "\n",
+			DP_UUID(in->pci_op.pi_uuid), flags, DP_RC(rc));
+		goto out_lock;
 	}
 
 	/* Check existing pool handles. */
@@ -3997,6 +4038,11 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version)
 		}
 	}
 
+	D_DEBUG(DB_MD, DF_UUID "/" DF_UUID ": connecting to %s pool with flags "
+		DF_X64", sec_capas " DF_X64 "\n",
+		DP_UUID(in->pci_op.pi_uuid), DP_UUID(in->pci_op.pi_hdl),
+		svc->ps_pool->sp_immutable ? "immutable" : "regular", flags, sec_capas);
+
 	rc = pool_connect_iv_dist(svc, in->pci_op.pi_hdl, flags, sec_capas, credp, global_ver,
 				  obj_layout_ver);
 	if (rc == 0 && DAOS_FAIL_CHECK(DAOS_POOL_CONNECT_FAIL_CORPC)) {
@@ -6522,7 +6568,7 @@ pool_svc_schedule(struct pool_svc *svc, struct pool_svc_sched *sched, void (*fun
 
 	D_DEBUG(DB_MD, DF_UUID": begin\n", DP_UUID(svc->ps_uuid));
 
-	if (ds_pool_skip_for_check(svc->ps_pool)) {
+	if (ds_pool_restricted(svc->ps_pool, false)) {
 		D_DEBUG(DB_MD, DF_UUID": end: skip in check mode\n", DP_UUID(svc->ps_uuid));
 		return -DER_OP_CANCELED;
 	}
@@ -8761,9 +8807,3 @@ ds_pool_svc_upgrade_vos_pool(struct ds_pool *pool)
 	ds_rsvc_put(rsvc);
 	return rc;
 }
-
-bool
-ds_pool_skip_for_check(struct ds_pool *pool)
-{
-	return engine_in_check() && !pool->sp_cr_checked;
-}
diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c
index cfa837e8b2a..113fb757fd9 100644
--- a/src/pool/srv_target.c
+++ b/src/pool/srv_target.c
@@ -505,7 +505,7 @@ pool_child_start(struct ds_pool_child *child, bool recreate)
 		goto done;
 	}
 
-	if (!ds_pool_skip_for_check(child->spc_pool)) {
+	if (!ds_pool_restricted(child->spc_pool, false)) {
 		rc = start_gc_ult(child);
 		if (rc != 0)
 			goto out_close;
@@ -1128,7 +1128,7 @@ pool_fetch_hdls_ult_abort(struct ds_pool *pool)
  * till ds_pool_stop. Only for mgmt and pool modules.
  */
 int
-ds_pool_start(uuid_t uuid, bool aft_chk)
+ds_pool_start(uuid_t uuid, bool aft_chk, bool immutable)
 {
 	struct ds_pool			*pool;
 	struct daos_llink		*llink;
@@ -1185,6 +1185,11 @@ ds_pool_start(uuid_t uuid, bool aft_chk)
 	else
 		pool->sp_cr_checked = 0;
 
+	if (immutable)
+		pool->sp_immutable = 1;
+	else
+		pool->sp_immutable = 0;
+
 	rc = pool_child_add_all(pool);
 	if (rc != 0)
 		goto failure_pool;
@@ -1199,7 +1204,9 @@ ds_pool_start(uuid_t uuid, bool aft_chk)
 		}
 
 		pool->sp_fetch_hdls = 1;
+	}
 
+	if (!ds_pool_restricted(pool, false)) {
 		rc = ds_pool_start_ec_eph_query_ult(pool);
 		if (rc != 0) {
 			D_ERROR(DF_UUID": failed to start ec eph query ult: "DF_RC"\n",
@@ -1882,13 +1889,10 @@ ds_pool_tgt_map_update(struct ds_pool *pool, struct pool_buf *buf,
 		       DP_UUID(pool->sp_uuid), map_version_before, map_version);
 	}
 
-	if (map_updated) {
+	if (map_updated && !ds_pool_restricted(pool, false)) {
 		struct dtx_scan_args	*arg;
 		int ret;
 
-		if (ds_pool_skip_for_check(pool))
-			D_GOTO(out, rc = 0);
-
 		D_ALLOC_PTR(arg);
 		if (arg == NULL)
 			D_GOTO(out, rc = -DER_NOMEM);
diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c
index b1f722b8254..5c463af9e90 100644
--- a/src/rebuild/srv.c
+++ b/src/rebuild/srv.c
@@ -1941,7 +1941,7 @@ ds_rebuild_schedule(struct ds_pool *pool, uint32_t map_ver,
 		return 0;
 	}
 
-	if (ds_pool_skip_for_check(pool)) {
+	if (ds_pool_restricted(pool, false)) {
 		D_DEBUG(DB_REBUILD, DF_UUID" skip rebuild under check mode\n",
 			DP_UUID(pool->sp_uuid));
 		return 0;
diff --git a/src/security/srv_acl.c b/src/security/srv_acl.c
index ad3c0107e3b..8f05216e971 100644
--- a/src/security/srv_acl.c
+++ b/src/security/srv_acl.c
@@ -1,5 +1,5 @@
 /*
- * (C) Copyright 2019-2023 Intel Corporation.
+ * (C) Copyright 2019-2024 Intel Corporation.
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -754,6 +754,12 @@ ds_sec_cont_can_evict_all(uint64_t cont_capas)
 	return (cont_capas & CONT_CAPA_EVICT_ALL) != 0;
 }
 
+bool
+ds_sec_cont_can_modify(uint64_t cont_capas)
+{
+	return (cont_capas & CONT_CAPAS_W_MASK) != 0;
+}
+
 uint64_t
 ds_sec_get_rebuild_cont_capabilities(void)
 {
diff --git a/src/tests/suite/daos_cr.c b/src/tests/suite/daos_cr.c
index 3827a0eda6a..4c981f29645 100644
--- a/src/tests/suite/daos_cr.c
+++ b/src/tests/suite/daos_cr.c
@@ -18,6 +18,8 @@
 
 #include <daos_mgmt.h>
 
+#include "daos_iotest.h"
+
 /*
  * Will enable accurate query result verification after DAOS-13520 resolved.
  * #define CR_ACCURATE_QUERY_RESULT	1
@@ -3573,6 +3575,146 @@ cr_handle_fail_pool2(void **state)
 	cr_cleanup(arg, &pool, 1);
 }
 
+#define CR_IO_SIZE	16
+
+/*
+ * 1. Create pool and container without inconsistency.
+ * 2. Write something to the container.
+ * 3. Start checker with --dry-run option.
+ * 4. Query checker, it should be completed without any inconsistency reported.
+ * 5. Verify the pool only can be connected with read-only permission.
+ * 6. Verify the container only can be opened with read-only permission.
+ * 7. Verify the object only can be opened with read-only permission.
+ * 8. Verify the object cannot be written.
+ * 9. Read former wrote data and verify its correctness.
+ * 10. Switch to normal mode and cleanup.
+ */
+static void
+cr_maintenance_mode(void **state)
+{
+	test_arg_t		*arg = *state;
+	struct test_pool	 pool = { 0 };
+	struct test_cont	 cont = { 0 };
+	daos_handle_t		 coh;
+	daos_obj_id_t		 oid;
+	struct ioreq		 req;
+	const char		*dkey = "cr_dkey";
+	const char		*akey = "cr_akey";
+	char			 update_buf[CR_IO_SIZE];
+	char			 fetch_buf[CR_IO_SIZE];
+	struct daos_check_info	 dci = { 0 };
+	int			 rc;
+
+	print_message("CR28: maintenance mode after dry-run check\n");
+
+	rc = cr_pool_create(state, &pool, true, TCC_NONE);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_cont_create(state, &pool, &cont, 0);
+	assert_rc_equal(rc, 0);
+
+	rc = daos_cont_open(pool.poh, cont.label, DAOS_COO_RW, &coh, NULL, NULL);
+	assert_rc_equal(rc, 0);
+
+	oid = daos_test_oid_gen(coh, OC_SX, 0, 0, arg->myrank);
+	arg->fail_loc = DAOS_DTX_COMMIT_SYNC | DAOS_FAIL_ALWAYS;
+	arg->async = 0;
+	ioreq_init(&req, coh, oid, DAOS_IOD_SINGLE, arg);
+	dts_buf_render(update_buf, CR_IO_SIZE);
+
+	print_message("Generate some data.\n");
+
+	insert_single(dkey, akey, 0, update_buf, CR_IO_SIZE, DAOS_TX_NONE, &req);
+	daos_fail_loc_set(0);
+
+	rc = daos_obj_close(req.oh, NULL);
+	assert_rc_equal(rc, 0);
+
+	rc = daos_cont_close(coh, NULL);
+	assert_rc_equal(rc, 0);
+
+	rc = daos_pool_disconnect(pool.poh, NULL);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_system_stop(false);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_mode_switch(true);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_check_start(TCSF_DRYRUN, 0, NULL, NULL);
+	assert_rc_equal(rc, 0);
+
+	cr_ins_wait(1, &pool.pool_uuid, &dci);
+
+	rc = cr_ins_verify(&dci, TCIS_COMPLETED);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_CHECKED, 0, NULL, NULL, NULL);
+	assert_rc_equal(rc, 0);
+
+	print_message("Verify the pool only can be connected as RDONLY mode.\n");
+
+	rc = daos_pool_connect(pool.pool_str, arg->group, DAOS_PC_RW, &pool.poh, NULL, NULL);
+	assert_rc_equal(rc, -DER_NO_PERM);
+
+	rc = daos_pool_connect(pool.pool_str, arg->group, DAOS_PC_RO, &pool.poh, NULL, NULL);
+	assert_rc_equal(rc, 0);
+
+	print_message("Verify the container only can be opened as RDONLY mode.\n");
+
+	rc = daos_cont_open(pool.poh, cont.label, DAOS_COO_RW, &coh, NULL, NULL);
+	assert_rc_equal(rc, -DER_NO_PERM);
+
+	rc = daos_cont_open(pool.poh, cont.label, DAOS_COO_RO, &coh, NULL, NULL);
+	assert_rc_equal(rc, 0);
+
+#if 0
+	/*
+	 * NOTE: Currently, DAOS security check is server-side logic, but object open may not
+	 *	 talk with server. So related permission check will not be done until read or
+	 *	 write really happened. If not consider exclusively open and cache case, then
+	 *	 it seems fine; otherwise, need some enhancement in future.
+	 */
+
+	print_message("Verify the object only can be opened as RDONLY mode.\n");
+
+	rc = daos_obj_open(coh, oid, DAOS_OO_RW, &req.oh, NULL);
+	assert_rc_equal(rc, -DER_NO_PERM);
+#endif
+
+	rc = daos_obj_open(coh, oid, DAOS_OO_RO, &req.oh, NULL);
+	assert_rc_equal(rc, 0);
+
+	print_message("Verify the object cannot be modified.\n");
+
+	req.arg->expect_result = -DER_NO_PERM;
+	insert_single(dkey, akey, 100, update_buf, CR_IO_SIZE, DAOS_TX_NONE, &req);
+	daos_fail_loc_set(0);
+
+	print_message("Verify former wrote data.\n");
+
+	req.arg->expect_result = 0;
+	lookup_single(dkey, akey, 0, fetch_buf, CR_IO_SIZE, DAOS_TX_NONE, &req);
+	assert_int_equal(req.iod[0].iod_size, CR_IO_SIZE);
+	assert_memory_equal(update_buf, fetch_buf, CR_IO_SIZE);
+
+	rc = daos_cont_close(coh, NULL);
+	assert_rc_equal(rc, 0);
+
+	rc = daos_pool_disconnect(pool.poh, NULL);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_mode_switch(false);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_system_start();
+	assert_rc_equal(rc, 0);
+
+	cr_dci_fini(&dci);
+	cr_cleanup(arg, &pool, 1);
+}
+
 static const struct CMUnitTest cr_tests[] = {
 	{ "CR1: start checker for specified pools",
 	  cr_start_specified, async_disable, test_case_teardown},
@@ -3628,6 +3770,8 @@ static const struct CMUnitTest cr_tests[] = {
 	  cr_handle_fail_pool1, async_disable, test_case_teardown},
 	{ "CR27: handle the pool if some engine failed to report some pool service",
 	  cr_handle_fail_pool2, async_disable, test_case_teardown},
+	{ "CR28: maintenance mode after dry-run check",
+	  cr_maintenance_mode, async_disable, test_case_teardown},
 };
 
 static int

From b913d3e9685fe8b31b046d197f66477460053e68 Mon Sep 17 00:00:00 2001
From: Phil Henderson <phillip.henderson@intel.com>
Date: Mon, 21 Oct 2024 14:19:00 -0400
Subject: [PATCH 11/34] DAOS-16265 test: Fix erasurecode/rebuild_fio.py out of
 space (#15020) (#15340)

Prevent accumulating large server log files caused by temporarily
enabling the DEBUG log mask while creating or destroying pools.

Signed-off-by: Phil Henderson <phillip.henderson@intel.com>
---
 .../ftest/erasurecode/multiple_failure.yaml   |  1 +
 src/tests/ftest/erasurecode/rebuild_fio.yaml  |  1 +
 src/tests/ftest/util/apricot/apricot/test.py  | 22 +++++++++++++++----
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/src/tests/ftest/erasurecode/multiple_failure.yaml b/src/tests/ftest/erasurecode/multiple_failure.yaml
index 78f132474b5..95aab541329 100644
--- a/src/tests/ftest/erasurecode/multiple_failure.yaml
+++ b/src/tests/ftest/erasurecode/multiple_failure.yaml
@@ -25,6 +25,7 @@ server_config:
       storage: auto
 pool:
   size: 93%
+  set_logmasks: False
 container:
   type: POSIX
   control_method: daos
diff --git a/src/tests/ftest/erasurecode/rebuild_fio.yaml b/src/tests/ftest/erasurecode/rebuild_fio.yaml
index a895c356707..a3539d86579 100644
--- a/src/tests/ftest/erasurecode/rebuild_fio.yaml
+++ b/src/tests/ftest/erasurecode/rebuild_fio.yaml
@@ -39,6 +39,7 @@ pool:
   aggregation:
     threshold: 50000000
     aggr_timeout: 180
+  set_logmasks: False
 container:
   type: POSIX
   control_method: daos
diff --git a/src/tests/ftest/util/apricot/apricot/test.py b/src/tests/ftest/util/apricot/apricot/test.py
index 42e05937f37..563ff7adece 100644
--- a/src/tests/ftest/util/apricot/apricot/test.py
+++ b/src/tests/ftest/util/apricot/apricot/test.py
@@ -643,6 +643,7 @@ def __init__(self, *args, **kwargs):
         self.setup_start_agents = True
         self.slurm_exclude_servers = False
         self.slurm_exclude_nodes = NodeSet()
+        self.max_test_dir_usage_check = 90
         self.host_info = HostInfo()
         self.hostlist_servers = NodeSet()
         self.hostlist_clients = NodeSet()
@@ -693,6 +694,11 @@ def setUp(self):
         self.slurm_exclude_servers = self.params.get(
             "slurm_exclude_servers", "/run/setup/*", self.slurm_exclude_servers)
 
+        # Max test directory usage percentage - when exceeded will display sizes of files in the
+        # test directory
+        self.max_test_dir_usage_check = self.params.get(
+            "max_test_dir_usage_check", "/run/setup/*", self.max_test_dir_usage_check)
+
         # The server config name should be obtained from each ServerManager
         # object, but some tests still use this TestWithServers attribute.
         self.server_group = self.params.get("name", "/run/server_config/*", "daos_server")
@@ -765,12 +771,20 @@ def setUp(self):
 
         # List common test directory contents before running the test
         self.log.info("-" * 100)
-        self.log.debug("Common test directory (%s) contents:", os.path.dirname(self.test_dir))
+        self.log.debug(
+            "Common test directory (%s) contents (check > %s%%):",
+            os.path.dirname(self.test_dir), self.max_test_dir_usage_check)
         all_hosts = include_local_host(self.host_info.all_hosts)
         test_dir_parent = os.path.dirname(self.test_dir)
-        result = run_remote(self.log, all_hosts, f"df -h {test_dir_parent}")
-        if int(max(re.findall(r" ([\d+])% ", result.joined_stdout) + ["0"])) > 90:
-            run_remote(self.log, all_hosts, f"du -sh {test_dir_parent}/*")
+        _result = run_remote(self.log, all_hosts, f"df -h {test_dir_parent}")
+        _details = NodeSet()
+        for _host, _stdout in _result.all_stdout.items():
+            _test_dir_usage = re.findall(r"\s+([\d]+)%\s+", _stdout)
+            _test_dir_usage_int = int(max(_test_dir_usage + ["0"]))
+            if _test_dir_usage_int > self.max_test_dir_usage_check:
+                _details.add(_host)
+        if _details:
+            run_remote(self.log, _details, f"du -sh {test_dir_parent}/*")
         self.log.info("-" * 100)
 
         if not self.start_servers_once or self.name.uid == 1:

From ffa1c9d952fb62f72524abec119e7d829bce5633 Mon Sep 17 00:00:00 2001
From: Michael MacDonald <mjmac@google.com>
Date: Tue, 22 Oct 2024 11:38:08 -0400
Subject: [PATCH 12/34] DAOS-16693 telemetry: Avoid race between init/read
 (#15306) (#15322)

In rare cases, a reader may attempt to access a telemetry
node after it has been added to the tree, but before it
has been fully initialized. Use an atomic to prevent
reads before the initialization has completed. Unlucky
readers will get a -DER_AGAIN instead of crashing.

Signed-off-by: Michael MacDonald <mjmac@google.com>
---
 src/control/lib/telemetry/counter.go   | 20 ++++++++-----
 src/control/lib/telemetry/duration.go  | 20 ++++++++-----
 src/control/lib/telemetry/gauge.go     | 38 ++++++++++++++----------
 src/control/lib/telemetry/snapshot.go  | 20 ++++++++-----
 src/control/lib/telemetry/telemetry.go | 12 ++++++++
 src/control/lib/telemetry/timestamp.go | 22 +++++++++-----
 src/gurt/telemetry.c                   | 41 ++++++++++++++++++++++++++
 src/include/gurt/telemetry_common.h    |  3 ++
 8 files changed, 129 insertions(+), 47 deletions(-)

diff --git a/src/control/lib/telemetry/counter.go b/src/control/lib/telemetry/counter.go
index 81549a32daf..e6e59a8d0ea 100644
--- a/src/control/lib/telemetry/counter.go
+++ b/src/control/lib/telemetry/counter.go
@@ -1,5 +1,5 @@
 //
-// (C) Copyright 2021-2022 Intel Corporation.
+// (C) Copyright 2021-2024 Intel Corporation.
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -37,18 +37,22 @@ func (c *Counter) FloatValue() float64 {
 }
 
 func (c *Counter) Value() uint64 {
+	ctrVal := BadUintVal
 	if c.handle == nil || c.node == nil {
-		return BadUintVal
+		return ctrVal
 	}
 
-	var val C.uint64_t
-
-	res := C.d_tm_get_counter(c.handle.ctx, &val, c.node)
-	if res == C.DER_SUCCESS {
-		return uint64(val)
+	fetch := func() C.int {
+		var val C.uint64_t
+		res := C.d_tm_get_counter(c.handle.ctx, &val, c.node)
+		if res == C.DER_SUCCESS {
+			ctrVal = uint64(val)
+		}
+		return res
 	}
+	c.fetchValWithRetry(fetch)
 
-	return BadUintVal
+	return ctrVal
 }
 
 func newCounter(hdl *handle, path string, name *string, node *C.struct_d_tm_node_t) *Counter {
diff --git a/src/control/lib/telemetry/duration.go b/src/control/lib/telemetry/duration.go
index 1f32125bc90..3cfd240bde7 100644
--- a/src/control/lib/telemetry/duration.go
+++ b/src/control/lib/telemetry/duration.go
@@ -1,5 +1,5 @@
 //
-// (C) Copyright 2021-2022 Intel Corporation.
+// (C) Copyright 2021-2024 Intel Corporation.
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -34,18 +34,22 @@ func (d *Duration) Type() MetricType {
 }
 
 func (d *Duration) Value() time.Duration {
+	durValue := BadDuration
 	if d.handle == nil || d.node == nil {
-		return BadDuration
+		return durValue
 	}
 
-	var tms C.struct_timespec
-
-	res := C.d_tm_get_duration(d.handle.ctx, &tms, &d.stats, d.node)
-	if res == C.DER_SUCCESS {
-		return time.Duration(tms.tv_sec)*time.Second + time.Duration(tms.tv_nsec)*time.Nanosecond
+	fetch := func() C.int {
+		var tms C.struct_timespec
+		res := C.d_tm_get_duration(d.handle.ctx, &tms, &d.stats, d.node)
+		if res == C.DER_SUCCESS {
+			durValue = time.Duration(tms.tv_sec)*time.Second + time.Duration(tms.tv_nsec)*time.Nanosecond
+		}
+		return res
 	}
+	d.fetchValWithRetry(fetch)
 
-	return BadDuration
+	return durValue
 }
 
 func (d *Duration) FloatValue() float64 {
diff --git a/src/control/lib/telemetry/gauge.go b/src/control/lib/telemetry/gauge.go
index ea84ff90504..93db24ab9fc 100644
--- a/src/control/lib/telemetry/gauge.go
+++ b/src/control/lib/telemetry/gauge.go
@@ -1,5 +1,5 @@
 //
-// (C) Copyright 2021-2022 Intel Corporation.
+// (C) Copyright 2021-2024 Intel Corporation.
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -41,18 +41,22 @@ func (g *Gauge) FloatValue() float64 {
 
 // Value returns the value as an unsigned integer.
 func (g *Gauge) Value() uint64 {
+	gaugeVal := BadUintVal
 	if g.handle == nil || g.node == nil {
-		return BadUintVal
+		return gaugeVal
 	}
 
-	var val C.uint64_t
-
-	res := C.d_tm_get_gauge(g.handle.ctx, &val, nil, g.node)
-	if res == C.DER_SUCCESS {
-		return uint64(val)
+	fetch := func() C.int {
+		var val C.uint64_t
+		res := C.d_tm_get_gauge(g.handle.ctx, &val, nil, g.node)
+		if res == C.DER_SUCCESS {
+			gaugeVal = uint64(val)
+		}
+		return res
 	}
+	g.fetchValWithRetry(fetch)
 
-	return BadUintVal
+	return gaugeVal
 }
 
 func newGauge(hdl *handle, path string, name *string, node *C.struct_d_tm_node_t) *Gauge {
@@ -103,18 +107,22 @@ func (g *StatsGauge) FloatValue() float64 {
 
 // Value returns the gauge value as an unsigned integer.
 func (g *StatsGauge) Value() uint64 {
+	gaugeVal := BadUintVal
 	if g.handle == nil || g.node == nil {
-		return BadUintVal
+		return gaugeVal
 	}
 
-	var val C.uint64_t
-
-	res := C.d_tm_get_gauge(g.handle.ctx, &val, &g.stats, g.node)
-	if res == C.DER_SUCCESS {
-		return uint64(val)
+	fetch := func() C.int {
+		var val C.uint64_t
+		res := C.d_tm_get_gauge(g.handle.ctx, &val, &g.stats, g.node)
+		if res == C.DER_SUCCESS {
+			gaugeVal = uint64(val)
+		}
+		return res
 	}
+	g.fetchValWithRetry(fetch)
 
-	return BadUintVal
+	return gaugeVal
 }
 
 func newStatsGauge(hdl *handle, path string, name *string, node *C.struct_d_tm_node_t) *StatsGauge {
diff --git a/src/control/lib/telemetry/snapshot.go b/src/control/lib/telemetry/snapshot.go
index 2ffa23296c3..5b2af9f0747 100644
--- a/src/control/lib/telemetry/snapshot.go
+++ b/src/control/lib/telemetry/snapshot.go
@@ -1,5 +1,5 @@
 //
-// (C) Copyright 2021-2022 Intel Corporation.
+// (C) Copyright 2021-2024 Intel Corporation.
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -34,18 +34,22 @@ func (s *Snapshot) Type() MetricType {
 }
 
 func (s *Snapshot) Value() time.Time {
+	timeVal := time.Time{} // zero val
 	if s.handle == nil || s.node == nil {
-		return time.Time{}
+		return timeVal
 	}
 
-	var tms C.struct_timespec
-
-	res := C.d_tm_get_timer_snapshot(s.handle.ctx, &tms, s.node)
-	if res == C.DER_SUCCESS {
-		return time.Unix(int64(tms.tv_sec), int64(tms.tv_nsec))
+	fetch := func() C.int {
+		var tms C.struct_timespec
+		res := C.d_tm_get_timer_snapshot(s.handle.ctx, &tms, s.node)
+		if res == C.DER_SUCCESS {
+			timeVal = time.Unix(int64(tms.tv_sec), int64(tms.tv_nsec))
+		}
+		return res
 	}
+	s.fetchValWithRetry(fetch)
 
-	return time.Time{}
+	return timeVal
 }
 
 func (s *Snapshot) FloatValue() float64 {
diff --git a/src/control/lib/telemetry/telemetry.go b/src/control/lib/telemetry/telemetry.go
index bb0593240b6..479c41e2aab 100644
--- a/src/control/lib/telemetry/telemetry.go
+++ b/src/control/lib/telemetry/telemetry.go
@@ -84,6 +84,8 @@ const (
 	BadDuration = time.Duration(BadIntVal)
 
 	PathSep = filepath.Separator
+
+	maxFetchRetries = 1
 )
 
 type (
@@ -304,6 +306,16 @@ func (mb *metricBase) String() string {
 	return strings.TrimSpace(string(buf[:bytes.Index(buf, []byte{0})]))
 }
 
+func (mb *metricBase) fetchValWithRetry(fetchFn func() C.int) C.int {
+	var rc C.int
+	for i := 0; i < maxFetchRetries; i++ {
+		if rc = fetchFn(); rc == C.DER_SUCCESS {
+			return rc
+		}
+	}
+	return rc
+}
+
 func (sm *statsMetric) Min() uint64 {
 	return uint64(sm.stats.dtm_min)
 }
diff --git a/src/control/lib/telemetry/timestamp.go b/src/control/lib/telemetry/timestamp.go
index 97ef5bb1ed9..c787aed488d 100644
--- a/src/control/lib/telemetry/timestamp.go
+++ b/src/control/lib/telemetry/timestamp.go
@@ -1,5 +1,5 @@
 //
-// (C) Copyright 2021-2022 Intel Corporation.
+// (C) Copyright 2021-2024 Intel Corporation.
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -34,16 +34,22 @@ func (t *Timestamp) Type() MetricType {
 }
 
 func (t *Timestamp) Value() time.Time {
-	zero := time.Time{}
+	timeVal := time.Time{} // zero val
 	if t.handle == nil || t.node == nil {
-		return zero
+		return timeVal
 	}
-	var clk C.time_t
-	res := C.d_tm_get_timestamp(t.handle.ctx, &clk, t.node)
-	if res == C.DER_SUCCESS {
-		return time.Unix(int64(clk), 0)
+
+	fetch := func() C.int {
+		var clk C.time_t
+		res := C.d_tm_get_timestamp(t.handle.ctx, &clk, t.node)
+		if res == C.DER_SUCCESS {
+			timeVal = time.Unix(int64(clk), 0)
+		}
+		return res
 	}
-	return zero
+	t.fetchValWithRetry(fetch)
+
+	return timeVal
 }
 
 // FloatValue converts the timestamp to time in seconds since the UNIX epoch.
diff --git a/src/gurt/telemetry.c b/src/gurt/telemetry.c
index 8f08192d072..be7762184a9 100644
--- a/src/gurt/telemetry.c
+++ b/src/gurt/telemetry.c
@@ -13,6 +13,7 @@
 #include <float.h>
 #include <pthread.h>
 #include <malloc.h>
+#include <gurt/atomic.h>
 #include <gurt/common.h>
 #include <gurt/list.h>
 #include <sys/shm.h>
@@ -624,6 +625,7 @@ alloc_node(struct d_tm_shmem_hdr *shmem, struct d_tm_node_t **newnode,
 		goto out;
 	tmp->dtn_metric  = NULL;
 	tmp->dtn_sibling = NULL;
+	atomic_store_relaxed(&tmp->dtn_readable, false);
 
 	*newnode = node;
 out:
@@ -2409,6 +2411,9 @@ add_metric(struct d_tm_context *ctx, struct d_tm_node_t **node, int metric_type,
 		pthread_mutexattr_destroy(&mattr);
 		temp->dtn_protect = true;
 	}
+
+	atomic_store_relaxed(&temp->dtn_readable, true);
+
 	if (node != NULL)
 		*node = temp;
 
@@ -3090,6 +3095,15 @@ d_tm_try_del_ephemeral_dir(const char *fmt, ...)
 	return rc;
 }
 
+static bool
+node_is_readable(struct d_tm_node_t *node)
+{
+	if (node == NULL)
+		return false;
+
+	return atomic_load_relaxed(&node->dtn_readable);
+}
+
 /**
  * Creates histogram counters for the given node.  It calculates the
  * extents of each bucket and creates counters at the path specified that
@@ -3278,6 +3292,9 @@ d_tm_get_num_buckets(struct d_tm_context *ctx,
 	if (ctx == NULL || histogram == NULL || node == NULL)
 		return -DER_INVAL;
 
+	if (unlikely(!node_is_readable(node)))
+		return -DER_AGAIN;
+
 	rc = validate_node_ptr(ctx, node, &shmem);
 	if (rc != 0)
 		return rc;
@@ -3341,6 +3358,9 @@ d_tm_get_bucket_range(struct d_tm_context *ctx, struct d_tm_bucket_t *bucket,
 	if (rc != 0)
 		return rc;
 
+	if (unlikely(!node_is_readable(node)))
+		return -DER_AGAIN;
+
 	if (!has_stats(node))
 		return -DER_OP_NOT_PERMITTED;
 
@@ -3392,6 +3412,9 @@ d_tm_get_counter(struct d_tm_context *ctx, uint64_t *val,
 	if (node->dtn_type != D_TM_COUNTER)
 		return -DER_OP_NOT_PERMITTED;
 
+	if (unlikely(!node_is_readable(node)))
+		return -DER_AGAIN;
+
 	/* "ctx == NULL" is server side fast version to read the counter. */
 	if (ctx == NULL) {
 		metric_data = node->dtn_metric;
@@ -3441,6 +3464,9 @@ d_tm_get_timestamp(struct d_tm_context *ctx, time_t *val,
 	if (node->dtn_type != D_TM_TIMESTAMP)
 		return -DER_OP_NOT_PERMITTED;
 
+	if (unlikely(!node_is_readable(node)))
+		return -DER_AGAIN;
+
 	metric_data = conv_ptr(shmem, node->dtn_metric);
 	if (metric_data != NULL) {
 		d_tm_node_lock(node);
@@ -3470,6 +3496,9 @@ d_tm_get_meminfo(struct d_tm_context *ctx, struct d_tm_meminfo_t *meminfo,
 	if (node->dtn_type != D_TM_MEMINFO)
 		return -DER_OP_NOT_PERMITTED;
 
+	if (unlikely(!node_is_readable(node)))
+		return -DER_AGAIN;
+
 	metric_data = conv_ptr(shmem, node->dtn_metric);
 	if (metric_data != NULL) {
 		d_tm_node_lock(node);
@@ -3513,6 +3542,9 @@ d_tm_get_timer_snapshot(struct d_tm_context *ctx, struct timespec *tms,
 	if (!(node->dtn_type & D_TM_TIMER_SNAPSHOT))
 		return -DER_OP_NOT_PERMITTED;
 
+	if (unlikely(!node_is_readable(node)))
+		return -DER_AGAIN;
+
 	metric_data = conv_ptr(shmem, node->dtn_metric);
 	if (metric_data != NULL) {
 		d_tm_node_lock(node);
@@ -3563,6 +3595,9 @@ d_tm_get_duration(struct d_tm_context *ctx, struct timespec *tms,
 	if (!(node->dtn_type & D_TM_DURATION))
 		return -DER_OP_NOT_PERMITTED;
 
+	if (unlikely(!node_is_readable(node)))
+		return -DER_AGAIN;
+
 	metric_data = conv_ptr(shmem, node->dtn_metric);
 	if (metric_data == NULL)
 		return -DER_METRIC_NOT_FOUND;
@@ -3628,6 +3663,9 @@ d_tm_get_gauge(struct d_tm_context *ctx, uint64_t *val,
 	if (!is_gauge(node))
 		return -DER_OP_NOT_PERMITTED;
 
+	if (unlikely(!node_is_readable(node)))
+		return -DER_AGAIN;
+
 	metric_data = conv_ptr(shmem, node->dtn_metric);
 	if (metric_data != NULL) {
 		dtm_stats = conv_ptr(shmem, metric_data->dtm_stats);
@@ -3700,6 +3738,9 @@ int d_tm_get_metadata(struct d_tm_context *ctx, char **desc, char **units,
 	if (node->dtn_type == D_TM_DIRECTORY)
 		return -DER_OP_NOT_PERMITTED;
 
+	if (unlikely(!node_is_readable(node)))
+		return -DER_AGAIN;
+
 	metric_data = conv_ptr(shmem, node->dtn_metric);
 	if (metric_data != NULL) {
 		d_tm_node_lock(node);
diff --git a/src/include/gurt/telemetry_common.h b/src/include/gurt/telemetry_common.h
index 4e7d3b3a02d..2068c1c0cce 100644
--- a/src/include/gurt/telemetry_common.h
+++ b/src/include/gurt/telemetry_common.h
@@ -8,6 +8,8 @@
 
 #include <gurt/common.h>
 
+#include <stdatomic.h>
+
 #define D_TM_VERSION			1
 #define D_TM_MAX_NAME_LEN		256
 #define D_TM_MAX_DESC_LEN		128
@@ -236,6 +238,7 @@ struct d_tm_node_t {
 	pthread_mutex_t		dtn_lock; /** individual mutex */
 	struct d_tm_metric_t	*dtn_metric; /** values */
 	bool			dtn_protect; /** synchronized access */
+	_Atomic bool             dtn_readable; /** fully initialized and ready for reads */
 };
 
 struct d_tm_nodeList_t {

From 42a0d35d815505ba0435e192933160b52e55c7c2 Mon Sep 17 00:00:00 2001
From: Alexander Oganezov <alexander.a.oganezov@intel.com>
Date: Tue, 22 Oct 2024 09:10:51 -0700
Subject: [PATCH 13/34] DAOS-16696 cart: Fix rc in error path (#15313) (#15357)

- Fix rc in error path during ivo_on_update failure

Required-githooks: true

Signed-off-by: Alexander A Oganezov <alexander.a.oganezov@intel.com>
---
 src/cart/crt_iv.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/cart/crt_iv.c b/src/cart/crt_iv.c
index 603e565ac1f..bf8124a8a6a 100644
--- a/src/cart/crt_iv.c
+++ b/src/cart/crt_iv.c
@@ -1,5 +1,5 @@
 /*
- * (C) Copyright 2016-2023 Intel Corporation.
+ * (C) Copyright 2016-2024 Intel Corporation.
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -2911,8 +2911,12 @@ bulk_update_transfer_done_aux(const struct crt_bulk_cb_info *info)
 	return rc;
 
 send_error:
-	rc = crt_bulk_free(cb_info->buc_bulk_hdl);
+	/* send back whatever error got us here */
 	output->rc = rc;
+	rc         = crt_bulk_free(cb_info->buc_bulk_hdl);
+	if (rc != 0)
+		DL_ERROR(rc, "crt_bulk_free() failed");
+
 	iv_ops->ivo_on_put(ivns_internal, &cb_info->buc_iv_value,
 			   cb_info->buc_user_priv);
 

From 1ae3f29dcbebe9a4ae123302cc11b9dfac870c8f Mon Sep 17 00:00:00 2001
From: Nasf-Fan <fan.yong@intel.com>
Date: Wed, 23 Oct 2024 00:57:32 +0800
Subject: [PATCH 14/34] DAOS-16574 vos: shrink DTX table blob size - b26
 (#15220) (#15221)

Use 4KB blob for committed DTX table and 16KB for active DTX table.
It is more efficient for lower allocator and reduce the possibility
of space allocation failure when space pressure.

Simplify vos_dtx_commit logic and code cleanup.

Signed-off-by: Fan Yong <fan.yong@intel.com>
---
 src/vos/vos_dtx.c | 101 +++++++++++++++++++++-------------------------
 1 file changed, 47 insertions(+), 54 deletions(-)

diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c
index 1c60f781507..f1d69b54f4e 100644
--- a/src/vos/vos_dtx.c
+++ b/src/vos/vos_dtx.c
@@ -15,11 +15,10 @@
 #include "vos_layout.h"
 #include "vos_internal.h"
 
-/* 128 KB per SCM blob */
-#define DTX_BLOB_SIZE		(1 << 17)
-/** Ensure 16-bit signed int is sufficient to store record index */
-D_CASSERT((DTX_BLOB_SIZE / sizeof(struct vos_dtx_act_ent_df)) <  (1 << 15));
-D_CASSERT((DTX_BLOB_SIZE / sizeof(struct vos_dtx_cmt_ent_df)) <  (1 << 15));
+/* 16 KB blob for each active DTX blob */
+#define DTX_ACT_BLOB_SIZE	(1 << 14)
+/* 4 KB for committed DTX blob */
+#define DTX_CMT_BLOB_SIZE	(1 << 12)
 
 #define DTX_ACT_BLOB_MAGIC	0x14130a2b
 #define DTX_CMT_BLOB_MAGIC	0x2502191c
@@ -767,7 +766,7 @@ dtx_rec_release(struct vos_container *cont, struct vos_dtx_act_ent *dae,
 static int
 vos_dtx_commit_one(struct vos_container *cont, struct dtx_id *dti, daos_epoch_t epoch,
 		   daos_epoch_t cmt_time, struct vos_dtx_cmt_ent **dce_p,
-		   struct vos_dtx_act_ent **dae_p, bool *rm_cos, bool *fatal)
+		   struct vos_dtx_act_ent **dae_p, bool *rm_cos)
 {
 	struct vos_dtx_act_ent		*dae = NULL;
 	struct vos_dtx_cmt_ent		*dce = NULL;
@@ -866,10 +865,8 @@ vos_dtx_commit_one(struct vos_container *cont, struct dtx_id *dti, daos_epoch_t
 		goto out;
 
 	rc = dtx_rec_release(cont, dae, false);
-	if (rc != 0) {
-		*fatal = true;
+	if (rc != 0)
 		goto out;
-	}
 
 	D_ASSERT(dae_p != NULL);
 	*dae_p = dae;
@@ -915,7 +912,7 @@ vos_dtx_extend_act_table(struct vos_container *cont)
 	umem_off_t			 dbd_off;
 	int				 rc;
 
-	dbd_off = umem_zalloc(umm, DTX_BLOB_SIZE);
+	dbd_off = umem_zalloc(umm, DTX_ACT_BLOB_SIZE);
 	if (UMOFF_IS_NULL(dbd_off)) {
 		D_ERROR("No space when create active DTX table.\n");
 		return -DER_NOSPACE;
@@ -923,7 +920,7 @@ vos_dtx_extend_act_table(struct vos_container *cont)
 
 	dbd = umem_off2ptr(umm, dbd_off);
 	dbd->dbd_magic = DTX_ACT_BLOB_MAGIC;
-	dbd->dbd_cap = (DTX_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) /
+	dbd->dbd_cap = (DTX_ACT_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) /
 			sizeof(struct vos_dtx_act_ent_df);
 	dbd->dbd_count = 0;
 	dbd->dbd_index = 0;
@@ -1994,12 +1991,11 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[],
 	umem_off_t			 dbd_off;
 	uint64_t			 cmt_time = daos_gettime_coarse();
 	int				 committed = 0;
-	int				 cur = 0;
 	int				 rc = 0;
-	int				 rc1 = 0;
+	int				 p = 0;
 	int				 i = 0;
 	int				 j;
-	bool				 fatal = false;
+	int				 k;
 	bool				 allocated = false;
 
 	dbd = umem_off2ptr(umm, cont_df->cd_dtx_committed_tail);
@@ -2017,67 +2013,64 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[],
 		goto new_blob;
 
 again:
-	for (j = dbd->dbd_count; i < count && j < dbd->dbd_cap && rc1 == 0;
-	     i++, cur++) {
-		struct vos_dtx_cmt_ent	*dce = NULL;
-
-		rc = vos_dtx_commit_one(cont, &dtis[cur], epoch, cmt_time, &dce,
-					daes != NULL ? &daes[cur] : NULL,
-					rm_cos != NULL ? &rm_cos[cur] : NULL, &fatal);
-		if (dces != NULL)
-			dces[cur] = dce;
-
-		if (fatal)
-			goto out;
-
-		if (rc == 0 && (daes == NULL || daes[cur] != NULL))
+	for (j = dbd->dbd_count; j < dbd->dbd_cap && i < count; i++) {
+		rc = vos_dtx_commit_one(cont, &dtis[i], epoch, cmt_time, &dces[i],
+					daes != NULL ? &daes[i] : NULL,
+					rm_cos != NULL ? &rm_cos[i] : NULL);
+		if (rc == 0 && (daes == NULL || daes[i] != NULL))
 			committed++;
 
 		if (rc == -DER_ALREADY || rc == -DER_NONEXIST)
 			rc = 0;
 
-		if (rc1 == 0)
-			rc1 = rc;
+		if (rc != 0)
+			goto out;
+
+		if (dces[i] != NULL)
+			j++;
+	}
 
-		if (dce != NULL) {
-			rc = umem_tx_xadd_ptr(umm, &dbd->dbd_committed_data[j],
-					      sizeof(struct vos_dtx_cmt_ent_df),
-					      UMEM_XADD_NO_SNAPSHOT);
+	if (j > dbd->dbd_count) {
+		if (!allocated) {
+			rc = umem_tx_xadd_ptr(umm, &dbd->dbd_committed_data[dbd->dbd_count],
+					      sizeof(struct vos_dtx_cmt_ent_df) *
+					      (j - dbd->dbd_count), UMEM_XADD_NO_SNAPSHOT);
 			if (rc != 0)
-				D_GOTO(out, fatal = true);
+				goto out;
 
-			memcpy(&dbd->dbd_committed_data[j++], &dce->dce_base,
+			/* Only need to add range for the first partial blob. */
+			rc = umem_tx_add_ptr(umm, &dbd->dbd_count, sizeof(dbd->dbd_count));
+			if (rc != 0)
+				goto out;
+		}
+
+		for (k = dbd->dbd_count; k < j; k++, p++) {
+			while (dces[p] == NULL)
+				p++;
+
+			memcpy(&dbd->dbd_committed_data[k], &dces[p]->dce_base,
 			       sizeof(struct vos_dtx_cmt_ent_df));
 		}
-	}
 
-	if (!allocated) {
-		/* Only need to add range for the first partial blob. */
-		rc = umem_tx_add_ptr(umm, &dbd->dbd_count,
-				     sizeof(dbd->dbd_count));
-		if (rc != 0)
-			D_GOTO(out, fatal = true);
+		dbd->dbd_count = j;
 	}
 
-	dbd->dbd_count = j;
-
-	if (i == count || rc1 != 0)
+	if (i == count)
 		goto out;
 
 new_blob:
 	dbd_prev = dbd;
 	/* Need new @dbd */
-	dbd_off = umem_zalloc(umm, DTX_BLOB_SIZE);
+	dbd_off = umem_zalloc(umm, DTX_CMT_BLOB_SIZE);
 	if (UMOFF_IS_NULL(dbd_off)) {
 		D_ERROR("No space to store committed DTX %d "DF_DTI"\n",
-			count, DP_DTI(&dtis[cur]));
-		fatal = true;
+			count, DP_DTI(&dtis[i]));
 		D_GOTO(out, rc = -DER_NOSPACE);
 	}
 
 	dbd = umem_off2ptr(umm, dbd_off);
 	dbd->dbd_magic = DTX_CMT_BLOB_MAGIC;
-	dbd->dbd_cap = (DTX_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) /
+	dbd->dbd_cap = (DTX_CMT_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) /
 		       sizeof(struct vos_dtx_cmt_ent_df);
 	dbd->dbd_prev = umem_ptr2off(umm, dbd_prev);
 
@@ -2090,21 +2083,21 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[],
 				     sizeof(cont_df->cd_dtx_committed_head) +
 				     sizeof(cont_df->cd_dtx_committed_tail));
 		if (rc != 0)
-			D_GOTO(out, fatal = true);
+			goto out;
 
 		cont_df->cd_dtx_committed_head = dbd_off;
 	} else {
 		rc = umem_tx_add_ptr(umm, &dbd_prev->dbd_next,
 				     sizeof(dbd_prev->dbd_next));
 		if (rc != 0)
-			D_GOTO(out, fatal = true);
+			goto out;
 
 		dbd_prev->dbd_next = dbd_off;
 
 		rc = umem_tx_add_ptr(umm, &cont_df->cd_dtx_committed_tail,
 				     sizeof(cont_df->cd_dtx_committed_tail));
 		if (rc != 0)
-			D_GOTO(out, fatal = true);
+			goto out;
 	}
 
 	D_DEBUG(DB_IO, "Allocated DTX committed blob %p ("UMOFF_PF") for cont "DF_UUID"\n",
@@ -2115,7 +2108,7 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[],
 	goto again;
 
 out:
-	return fatal ? rc : (committed > 0 ? committed : rc1);
+	return rc < 0 ? rc : committed;
 }
 
 void

From dcf84195f80f2d0367fd03264b76d7a9674fdd19 Mon Sep 17 00:00:00 2001
From: Li Wei <wei.g.li@intel.com>
Date: Wed, 23 Oct 2024 23:20:15 +0900
Subject: [PATCH 15/34] DAOS-16653 doc: Fix CRT_EVENT_DELAY description
 (#15351) (#15371)

Fix the description of the CRT_EVENT_DELAY environment variable in
docs/admin/env_variables.md.

Signed-off-by: Li Wei <wei.g.li@intel.com>
---
 docs/admin/env_variables.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/admin/env_variables.md b/docs/admin/env_variables.md
index 31e0f3e8767..3de0a079203 100644
--- a/docs/admin/env_variables.md
+++ b/docs/admin/env_variables.md
@@ -44,7 +44,7 @@ Environment variables in this section only apply to the server side.
 |DAOS\_MD\_CAP         |Size of a metadata pmem pool/file in MBs. INTEGER. Default to 128 MB.|
 |DAOS\_START\_POOL\_SVC|Determines whether to start existing pool services when starting a daos\_server. BOOL. Default to true.|
 |CRT\_DISABLE\_MEM\_PIN|Disable memory pinning workaround on a server side. BOOL. Default to 0.|
-|CRT\_EVENT\_DELAY|Delay in seconds before handling each CaRT event. INTEGER. Default to 10 s. A longer delay enables batching of successive CaRT events, leading to fewer pool map changes when multiple engines become unavailable at around the same time.|
+|CRT\_EVENT\_DELAY|Delay in seconds before handling a set of CaRT events. INTEGER. Default to 10 s. A longer delay enables batching of successive CaRT events, leading to fewer pool map changes when multiple engines become unavailable at around the same time.|
 |DAOS\_SCHED\_PRIO\_DISABLED|Disable server ULT prioritizing. BOOL. Default to 0.|
 |DAOS\_SCHED\_RELAX\_MODE|The mode of CPU relaxing on idle. "disabled":disable relaxing; "net":wait on network request for INTVL; "sleep":sleep for INTVL. STRING. Default to "net"|
 |DAOS\_SCHED\_RELAX\_INTVL|CPU relax interval in milliseconds. INTEGER. Default to 1 ms.|

From 4c49f3626726e561dfab41c21a9345573828abc3 Mon Sep 17 00:00:00 2001
From: Ken Cain <kenneth.c.cain@intel.com>
Date: Wed, 23 Oct 2024 16:54:02 -0400
Subject: [PATCH 16/34] DAOS-16650 control: dmg system exclude, update group
 version (#15288) (#15349)

With this change, when a daos administrator runs dmg system exclude
for a given set of engines, the system map version / cart primary group
version will be updated. In turn, daos_engines will more immediately
detect the "loss" of the administratively excluded engines, update
pool maps and perform rebuild. This change supports a use case of
a proactive exclusion of ranks that are expected to be impacted by
planned maintenance that would cut off connectivity to certain
engines.

Signed-off-by: Kenneth Cain <kenneth.c.cain@intel.com>
---
 src/control/server/mgmt_system.go      |  2 ++
 src/control/server/mgmt_system_test.go | 21 +++++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/src/control/server/mgmt_system.go b/src/control/server/mgmt_system.go
index e85091b5529..a35753d95ba 100644
--- a/src/control/server/mgmt_system.go
+++ b/src/control/server/mgmt_system.go
@@ -1002,6 +1002,8 @@ func (svc *mgmtSvc) SystemExclude(ctx context.Context, req *mgmtpb.SystemExclude
 		})
 	}
 
+	svc.reqGroupUpdate(ctx, false)
+
 	return resp, nil
 }
 
diff --git a/src/control/server/mgmt_system_test.go b/src/control/server/mgmt_system_test.go
index f482b1943a1..0cffe7f63d8 100644
--- a/src/control/server/mgmt_system_test.go
+++ b/src/control/server/mgmt_system_test.go
@@ -1765,6 +1765,7 @@ func TestServer_MgmtSvc_SystemExclude(t *testing.T) {
 				mockMember(t, 3, 2, "joined"),
 			},
 		},
+
 		"unexclude hosts": {
 			req: &mgmtpb.SystemExcludeReq{Hosts: test.MockHostAddr(1).String(), Clear: true},
 			members: system.Members{
@@ -1795,12 +1796,32 @@ func TestServer_MgmtSvc_SystemExclude(t *testing.T) {
 			if tc.req != nil && tc.req.Sys == "" {
 				tc.req.Sys = build.DefaultSystemName
 			}
+
+			startMapVer, err := svc.sysdb.CurMapVersion()
+			if err != nil {
+				t.Fatalf("startMapVer CurMapVersion() failed\n")
+				return
+			}
 			gotResp, gotAPIErr := svc.SystemExclude(ctx, tc.req)
 			test.CmpErr(t, tc.expAPIErr, gotAPIErr)
 			if tc.expAPIErr != nil {
 				return
 			}
 
+			// Check for any system map version increase by the (asynchronous) update.
+			// Test will time out if it never happens, thus choice of an infinite loop here.
+			for {
+				curMapVer, err := svc.sysdb.CurMapVersion()
+				if err != nil {
+					t.Fatalf("CurMapVersion() failed\n")
+					return
+				}
+
+				if curMapVer > startMapVer {
+					break
+				}
+			}
+
 			checkRankResults(t, tc.expResults, gotResp.Results)
 			checkMembers(t, tc.expMembers, svc.membership)
 		})

From 2819d4532e00b2395396ba3294e0bdc2c6a0449c Mon Sep 17 00:00:00 2001
From: Nasf-Fan <fan.yong@intel.com>
Date: Thu, 24 Oct 2024 15:04:10 +0800
Subject: [PATCH 17/34] DAOS-16488 chk: take sd_lock before accessing VOS
 sys_db - b26 (#15269)

The VOS sys_db may have multuiple users, such as SMD and CHK.
It is caller's duty to take lock against the VOS sys_db before
accessing it to handle concurrent operations from multiple XS.

Signed-off-by: Fan Yong <fan.yong@intel.com>
---
 src/chk/chk_vos.c | 67 ++++++++++++++++++++++++++---------------------
 src/common/lru.c  |  4 ++-
 src/vos/sys_db.c  | 27 +++++++++++++++----
 3 files changed, 62 insertions(+), 36 deletions(-)

diff --git a/src/chk/chk_vos.c b/src/chk/chk_vos.c
index fdefc2995f7..af2904affd1 100644
--- a/src/chk/chk_vos.c
+++ b/src/chk/chk_vos.c
@@ -19,11 +19,20 @@ chk_db_fetch(char *key, int key_size, void *val, int val_size)
 {
 	d_iov_t	key_iov;
 	d_iov_t	val_iov;
+	int	rc;
 
 	d_iov_set(&key_iov, key, key_size);
 	d_iov_set(&val_iov, val, val_size);
 
-	return chk_db->sd_fetch(chk_db, CHK_DB_TABLE, &key_iov, &val_iov);
+	if (chk_db->sd_lock)
+		chk_db->sd_lock(chk_db);
+
+	rc = chk_db->sd_fetch(chk_db, CHK_DB_TABLE, &key_iov, &val_iov);
+
+	if (chk_db->sd_unlock)
+		chk_db->sd_unlock(chk_db);
+
+	return rc;
 }
 
 static int
@@ -33,21 +42,17 @@ chk_db_update(char *key, int key_size, void *val, int val_size)
 	d_iov_t	val_iov;
 	int	rc;
 
-	if (chk_db->sd_tx_begin) {
-		rc = chk_db->sd_tx_begin(chk_db);
-		if (rc != 0)
-			goto out;
-	}
-
 	d_iov_set(&key_iov, key, key_size);
 	d_iov_set(&val_iov, val, val_size);
 
+	if (chk_db->sd_lock)
+		chk_db->sd_lock(chk_db);
+
 	rc = chk_db->sd_upsert(chk_db, CHK_DB_TABLE, &key_iov, &val_iov);
 
-	if (chk_db->sd_tx_end)
-		rc = chk_db->sd_tx_end(chk_db, rc);
+	if (chk_db->sd_unlock)
+		chk_db->sd_unlock(chk_db);
 
-out:
 	return rc;
 }
 
@@ -57,27 +62,33 @@ chk_db_delete(char *key, int key_size)
 	d_iov_t	key_iov;
 	int	rc;
 
-	if (chk_db->sd_tx_begin) {
-		rc = chk_db->sd_tx_begin(chk_db);
-		if (rc != 0)
-			goto out;
-	}
-
 	d_iov_set(&key_iov, key, key_size);
 
+	if (chk_db->sd_lock)
+		chk_db->sd_lock(chk_db);
+
 	rc = chk_db->sd_delete(chk_db, CHK_DB_TABLE, &key_iov);
 
-	if (chk_db->sd_tx_end)
-		rc = chk_db->sd_tx_end(chk_db, rc);
+	if (chk_db->sd_unlock)
+		chk_db->sd_unlock(chk_db);
 
-out:
 	return rc;
 }
 
 static int
 chk_db_traverse(sys_db_trav_cb_t cb, void *args)
 {
-	return chk_db->sd_traverse(chk_db, CHK_DB_TABLE, cb, args);
+	int	rc;
+
+	if (chk_db->sd_lock)
+		chk_db->sd_lock(chk_db);
+
+	rc = chk_db->sd_traverse(chk_db, CHK_DB_TABLE, cb, args);
+
+	if (chk_db->sd_unlock)
+		chk_db->sd_unlock(chk_db);
+
+	return rc;
 }
 
 int
@@ -243,11 +254,8 @@ chk_prop_update(struct chk_property *cpp, d_rank_list_t *rank_list)
 	d_iov_t	val_iov;
 	int	rc;
 
-	if (chk_db->sd_tx_begin) {
-		rc = chk_db->sd_tx_begin(chk_db);
-		if (rc != 0)
-			goto out;
-	}
+	if (chk_db->sd_lock)
+		chk_db->sd_lock(chk_db);
 
 	if (cpp->cp_rank_nr != 0 && rank_list != NULL) {
 		D_ASSERTF(cpp->cp_rank_nr == rank_list->rl_nr, "Invalid rank nr %u/%u\n",
@@ -259,7 +267,7 @@ chk_prop_update(struct chk_property *cpp, d_rank_list_t *rank_list)
 
 		rc = chk_db->sd_upsert(chk_db, CHK_DB_TABLE, &key_iov, &val_iov);
 		if (rc != 0)
-			goto end;
+			goto out;
 	}
 
 	d_iov_set(&key_iov, CHK_PROPERTY, strlen(CHK_PROPERTY));
@@ -267,11 +275,10 @@ chk_prop_update(struct chk_property *cpp, d_rank_list_t *rank_list)
 
 	rc = chk_db->sd_upsert(chk_db, CHK_DB_TABLE, &key_iov, &val_iov);
 
-end:
-	if (chk_db->sd_tx_end)
-		rc = chk_db->sd_tx_end(chk_db, rc);
-
 out:
+	if (chk_db->sd_unlock)
+		chk_db->sd_unlock(chk_db);
+
 	if (rc != 0)
 		D_ERROR("Failed to update check property on rank %u: "DF_RC"\n",
 			dss_self_rank(), DP_RC(rc));
diff --git a/src/common/lru.c b/src/common/lru.c
index de86d367e0e..87dbdaddaa9 100644
--- a/src/common/lru.c
+++ b/src/common/lru.c
@@ -255,7 +255,9 @@ void
 daos_lru_ref_release(struct daos_lru_cache *lcache, struct daos_llink *llink)
 {
 	D_ASSERT(lcache != NULL && llink != NULL && llink->ll_ref > 1);
-	D_ASSERT(d_list_empty(&llink->ll_qlink));
+	D_ASSERTF(d_list_empty(&llink->ll_qlink),
+		  "May hit corrupted item in LRU cache %p: llink %p, refs %d, prev %p, next %p\n",
+		  lcache, llink, llink->ll_ref, llink->ll_qlink.prev, llink->ll_qlink.next);
 
 	lru_hop_rec_decref(&lcache->dlc_htable, &llink->ll_link);
 
diff --git a/src/vos/sys_db.c b/src/vos/sys_db.c
index e7b4a2baa20..d1f6d4bce98 100644
--- a/src/vos/sys_db.c
+++ b/src/vos/sys_db.c
@@ -349,8 +349,9 @@ vos_db_init(const char *db_path)
 int
 vos_db_init_ex(const char *db_path, const char *db_name, bool force_create, bool destroy_db_on_fini)
 {
-	int	create;
-	int	rc;
+	ABT_mutex_attr	attr = ABT_MUTEX_ATTR_NULL;
+	int		create;
+	int		rc;
 
 	D_ASSERT(db_path != NULL);
 
@@ -373,12 +374,26 @@ vos_db_init_ex(const char *db_path, const char *db_name, bool force_create, bool
 		goto failed;
 	}
 
-	rc = ABT_mutex_create(&vos_db.db_lock);
+	rc = ABT_mutex_attr_create(&attr);
 	if (rc != ABT_SUCCESS) {
-		rc = -DER_NOMEM;
-		goto failed;
+		D_ERROR("Failed to create mutex attr: %d\n", rc);
+		D_GOTO(failed, rc = dss_abterr2der(rc));
+	}
+
+	rc = ABT_mutex_attr_set_recursive(attr, ABT_TRUE);
+	if (rc != ABT_SUCCESS) {
+		D_ERROR("Failed to set mutex attr: %d\n", rc);
+		D_GOTO(failed, rc = dss_abterr2der(rc));
 	}
 
+	rc = ABT_mutex_create_with_attr(attr, &vos_db.db_lock);
+	if (rc != ABT_SUCCESS) {
+		D_ERROR("Failed to create mutex with attr: %d\n", rc);
+		D_GOTO(failed, rc = dss_abterr2der(rc));
+	}
+
+	ABT_mutex_attr_free(&attr);
+
 	vos_db.db_poh = DAOS_HDL_INVAL;
 	vos_db.db_coh = DAOS_HDL_INVAL;
 
@@ -416,6 +431,8 @@ vos_db_init_ex(const char *db_path, const char *db_name, bool force_create, bool
 	}
 	return 0;
 failed:
+	if (attr != ABT_MUTEX_ATTR_NULL)
+		ABT_mutex_attr_free(&attr);
 	vos_db_fini();
 	return rc;
 }

From ec3aa1c64d02d959289ec9417df96a8e945b5a0b Mon Sep 17 00:00:00 2001
From: Nasf-Fan <fan.yong@intel.com>
Date: Thu, 24 Oct 2024 20:34:28 +0800
Subject: [PATCH 18/34] DAOS-16469 dtx: optimize DTX CoS cache - b26 (#15085)

If there are a lot of committable DTX entries in DTX CoS cache,
then it may be inefficient to locate the DTX entry in CoS cache
with given oid + dkey_hash, that may happen under the case of
that DTX batched commit is blocked (such as because of network
trouble) as to trigger DTX refresh (for DTX cleanup) on other
related engines. If that happened, it will increase the system
load on such engine and slow down DTX commit further more. The
patch reduces unnecessary search operation inside CoS cache.

Other changes:

1. Metrics (io/dtx/async_cmt_lat/tgt_id) for DTX asynchronously
   commit latency (with unit ms).

2. Fix a bug in sched_ult2xs() with multiple numa sockets for
   DSS_XS_OFFLOAD case.

3. Delay commit (or abort) collective DTX on the leader target
   to handle resent race.

4. Avoid blocking dtx_req_wait() if chore failed to send out
   some DTX RPC.

5. Some cleanup for error handling.

Signed-off-by: Fan Yong <fan.yong@intel.com>
---
 src/container/srv_target.c              |   1 +
 src/dtx/dtx_common.c                    |  43 ++--
 src/dtx/dtx_cos.c                       | 308 ++++++++++++++++--------
 src/dtx/dtx_internal.h                  |   4 +
 src/dtx/dtx_resync.c                    |   4 +-
 src/dtx/dtx_rpc.c                       | 104 +++++---
 src/dtx/dtx_srv.c                       |  13 +-
 src/engine/ult.c                        |   2 +-
 src/include/daos_srv/container.h        |   2 +
 src/include/daos_srv/dtx_srv.h          |   9 +-
 src/tests/ftest/util/telemetry_utils.py |   6 +-
 11 files changed, 322 insertions(+), 174 deletions(-)

diff --git a/src/container/srv_target.c b/src/container/srv_target.c
index a6d3d3c5286..35ac3d4285c 100644
--- a/src/container/srv_target.c
+++ b/src/container/srv_target.c
@@ -669,6 +669,7 @@ cont_child_alloc_ref(void *co_uuid, unsigned int ksize, void *po_uuid,
 	cont->sc_dtx_committable_coll_count = 0;
 	D_INIT_LIST_HEAD(&cont->sc_dtx_cos_list);
 	D_INIT_LIST_HEAD(&cont->sc_dtx_coll_list);
+	D_INIT_LIST_HEAD(&cont->sc_dtx_batched_list);
 
 	*link = &cont->sc_list;
 	return 0;
diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c
index ff4f2dfe4ef..ecb156729ed 100644
--- a/src/dtx/dtx_common.c
+++ b/src/dtx/dtx_common.c
@@ -392,11 +392,11 @@ dtx_cleanup(void *arg)
 				if (rc == 0) {
 					D_ASSERT(dce != NULL);
 
-					rc = dtx_coll_commit(cont, dce, NULL);
+					rc = dtx_coll_commit(cont, dce, NULL, false);
 					dtx_coll_entry_put(dce);
 				}
 			} else {
-				rc = dtx_commit(cont, &dte, NULL, 1);
+				rc = dtx_commit(cont, &dte, NULL, 1, false);
 			}
 		}
 
@@ -620,17 +620,16 @@ dtx_batched_commit_one(void *arg)
 	tls->dt_batched_ult_cnt++;
 
 	/* dbca->dbca_reg_gen != cont->sc_dtx_batched_gen means someone reopen the container. */
-	while (!dss_ult_exiting(dbca->dbca_commit_req) &&
+	while (!dss_ult_exiting(dbca->dbca_commit_req) && dtx_cont_opened(cont) &&
 		dbca->dbca_reg_gen == cont->sc_dtx_batched_gen) {
 		struct dtx_entry	**dtes = NULL;
-		struct dtx_cos_key	 *dcks = NULL;
 		struct dtx_coll_entry	 *dce = NULL;
 		struct dtx_stat		  stat = { 0 };
 		int			  cnt;
 		int			  rc;
 
 		cnt = dtx_fetch_committable(cont, DTX_THRESHOLD_COUNT, NULL,
-					    DAOS_EPOCH_MAX, false, &dtes, &dcks, &dce);
+					    DAOS_EPOCH_MAX, false, &dtes, NULL, &dce);
 		if (cnt == 0)
 			break;
 
@@ -644,11 +643,11 @@ dtx_batched_commit_one(void *arg)
 			/* Currently, commit collective DTX one by one. */
 			D_ASSERT(cnt == 1);
 
-			rc = dtx_coll_commit(cont, dce, dcks);
+			rc = dtx_coll_commit(cont, dce, NULL, true);
 		} else {
-			rc = dtx_commit(cont, dtes, dcks, cnt);
+			rc = dtx_commit(cont, dtes, NULL, cnt, true);
 		}
-		dtx_free_committable(dtes, dcks, dce, cnt);
+		dtx_free_committable(dtes, NULL, dce, cnt);
 		if (rc != 0) {
 			D_WARN("Fail to batched commit %d entries for "DF_UUID": "DF_RC"\n",
 			       cnt, DP_UUID(cont->sc_uuid), DP_RC(rc));
@@ -1271,7 +1270,6 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul
 	uint32_t			 flags;
 	int				 status = -1;
 	int				 rc = 0;
-	int				 i;
 	bool				 aborted = false;
 	bool				 unpin = false;
 
@@ -1424,10 +1422,10 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul
 		vos_dtx_mark_committable(dth);
 
 		if (dlh->dlh_coll) {
-			rc = dtx_coll_commit(cont, dlh->dlh_coll_entry, NULL);
+			rc = dtx_coll_commit(cont, dlh->dlh_coll_entry, NULL, false);
 		} else {
 			dte = &dth->dth_dte;
-			rc = dtx_commit(cont, &dte, NULL, 1);
+			rc = dtx_commit(cont, &dte, NULL, 1, false);
 		}
 
 		if (rc != 0)
@@ -1487,15 +1485,9 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul
 	/* If piggyback DTX has been done everywhere, then need to handle CoS cache.
 	 * It is harmless to keep some partially committed DTX entries in CoS cache.
 	 */
-	if (result == 0 && dth->dth_cos_done) {
-		for (i = 0; i < dth->dth_dti_cos_count; i++)
-			dtx_cos_del(cont, &dth->dth_dti_cos[i],
-				    &dth->dth_leader_oid, dth->dth_dkey_hash);
-	} else {
-		for (i = 0; i < dth->dth_dti_cos_count; i++)
-			dtx_cos_put_piggyback(cont, &dth->dth_dti_cos[i],
-					      &dth->dth_leader_oid, dth->dth_dkey_hash);
-	}
+	dtx_cos_put_piggyback(cont, &dth->dth_leader_oid, dth->dth_dkey_hash, dth->dth_dti_cos,
+			      dth->dth_dti_cos_count,
+			      (result == 0 && dth->dth_cos_done) ? true : false);
 
 	D_DEBUG(DB_IO, "Stop the DTX "DF_DTI" ver %u, dkey %lu, %s, cos %d/%d: result "DF_RC"\n",
 		DP_DTI(&dth->dth_xid), dth->dth_ver, (unsigned long)dth->dth_dkey_hash,
@@ -1654,7 +1646,8 @@ dtx_flush_on_close(struct dss_module_info *dmi, struct dtx_batched_cont_args *db
 		struct dtx_coll_entry	 *dce = NULL;
 
 		cnt = dtx_fetch_committable(cont, DTX_THRESHOLD_COUNT,
-					    NULL, DAOS_EPOCH_MAX, true, &dtes, &dcks, &dce);
+					    NULL, DAOS_EPOCH_MAX, true, &dtes,
+					    dbca->dbca_commit_req != NULL ? &dcks : NULL, &dce);
 		if (cnt <= 0)
 			D_GOTO(out, rc = cnt);
 
@@ -1675,9 +1668,9 @@ dtx_flush_on_close(struct dss_module_info *dmi, struct dtx_batched_cont_args *db
 		if (dce != NULL) {
 			D_ASSERT(cnt == 1);
 
-			rc = dtx_coll_commit(cont, dce, dcks);
+			rc = dtx_coll_commit(cont, dce, dcks, true);
 		} else {
-			rc = dtx_commit(cont, dtes, dcks, cnt);
+			rc = dtx_commit(cont, dtes, dcks, cnt, true);
 		}
 		dtx_free_committable(dtes, dcks, dce, cnt);
 	}
@@ -2365,9 +2358,9 @@ dtx_obj_sync(struct ds_cont_child *cont, daos_unit_oid_t *oid,
 		if (dce != NULL) {
 			D_ASSERT(cnt == 1);
 
-			rc = dtx_coll_commit(cont, dce, dcks);
+			rc = dtx_coll_commit(cont, dce, dcks, true);
 		} else {
-			rc = dtx_commit(cont, dtes, dcks, cnt);
+			rc = dtx_commit(cont, dtes, dcks, cnt, true);
 		}
 		dtx_free_committable(dtes, dcks, dce, cnt);
 		if (rc < 0) {
diff --git a/src/dtx/dtx_cos.c b/src/dtx/dtx_cos.c
index 6e1d042b82b..4c165f94d0c 100644
--- a/src/dtx/dtx_cos.c
+++ b/src/dtx/dtx_cos.c
@@ -54,6 +54,8 @@ struct dtx_cos_rec_child {
 	d_list_t			 dcrc_gl_committable;
 	/* Link into related dcr_{reg,prio}_list. */
 	d_list_t			 dcrc_lo_link;
+	/* Link into container::sc_dtx_batched_list. */
+	d_list_t			 dcrc_batched_link;
 	union {
 		struct dtx_entry	*dcrc_dte;
 		struct dtx_coll_entry	*dcrc_dce;
@@ -61,8 +63,12 @@ struct dtx_cos_rec_child {
 	/* The DTX epoch. */
 	daos_epoch_t			 dcrc_epoch;
 	struct dtx_cos_rec		*dcrc_ptr;
+	uint64_t			 dcrc_ready_time;
 	uint32_t			 dcrc_piggyback_refs;
-	uint32_t			 dcrc_coll:1; /* For collective DTX. */
+	uint32_t			 dcrc_expcmt:1,
+					 dcrc_prio:1,
+					 dcrc_reg:1,
+					 dcrc_coll:1; /* For collective DTX. */
 };
 
 struct dtx_cos_rec_bundle {
@@ -129,6 +135,8 @@ dtx_cos_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov,
 		return -DER_NOMEM;
 	}
 
+	D_INIT_LIST_HEAD(&dcrc->dcrc_batched_link);
+	dcrc->dcrc_ready_time = daos_getmtime_coarse();
 	dcrc->dcrc_epoch = rbund->epoch;
 	dcrc->dcrc_ptr = dcr;
 	if (rbund->flags & DCF_COLL) {
@@ -144,12 +152,15 @@ dtx_cos_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov,
 	d_tm_inc_gauge(tls->dt_committable, 1);
 
 	if (rbund->flags & DCF_EXP_CMT) {
+		dcrc->dcrc_expcmt = 1;
 		d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_expcmt_list);
 		dcr->dcr_expcmt_count = 1;
 	} else if (rbund->flags & DCF_SHARED) {
+		dcrc->dcrc_prio = 1;
 		d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list);
 		dcr->dcr_prio_count = 1;
 	} else {
+		dcrc->dcrc_reg = 1;
 		d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_reg_list);
 		dcr->dcr_reg_count = 1;
 	}
@@ -177,6 +188,7 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args)
 				   dcrc_lo_link) {
 		d_list_del(&dcrc->dcrc_lo_link);
 		d_list_del(&dcrc->dcrc_gl_committable);
+		d_list_del(&dcrc->dcrc_batched_link);
 		if (dcrc->dcrc_coll) {
 			dtx_coll_entry_put(dcrc->dcrc_dce);
 			coll++;
@@ -190,6 +202,7 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args)
 				   dcrc_lo_link) {
 		d_list_del(&dcrc->dcrc_lo_link);
 		d_list_del(&dcrc->dcrc_gl_committable);
+		d_list_del(&dcrc->dcrc_batched_link);
 		if (dcrc->dcrc_coll) {
 			dtx_coll_entry_put(dcrc->dcrc_dce);
 			coll++;
@@ -203,6 +216,7 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args)
 				   dcrc_lo_link) {
 		d_list_del(&dcrc->dcrc_lo_link);
 		d_list_del(&dcrc->dcrc_gl_committable);
+		d_list_del(&dcrc->dcrc_batched_link);
 		if (dcrc->dcrc_coll) {
 			dtx_coll_entry_put(dcrc->dcrc_dce);
 			coll++;
@@ -256,6 +270,8 @@ dtx_cos_rec_update(struct btr_instance *tins, struct btr_record *rec,
 	if (dcrc == NULL)
 		return -DER_NOMEM;
 
+	D_INIT_LIST_HEAD(&dcrc->dcrc_batched_link);
+	dcrc->dcrc_ready_time = daos_getmtime_coarse();
 	dcrc->dcrc_epoch = rbund->epoch;
 	dcrc->dcrc_ptr = dcr;
 	if (rbund->flags & DCF_COLL) {
@@ -271,12 +287,15 @@ dtx_cos_rec_update(struct btr_instance *tins, struct btr_record *rec,
 	d_tm_inc_gauge(tls->dt_committable, 1);
 
 	if (rbund->flags & DCF_EXP_CMT) {
+		dcrc->dcrc_expcmt = 1;
 		d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_expcmt_list);
 		dcr->dcr_expcmt_count++;
 	} else if (rbund->flags & DCF_SHARED) {
+		dcrc->dcrc_prio = 1;
 		d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list);
 		dcr->dcr_prio_count++;
 	} else {
+		dcrc->dcrc_reg = 1;
 		d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_reg_list);
 		dcr->dcr_reg_count++;
 	}
@@ -294,6 +313,53 @@ btr_ops_t dtx_btr_cos_ops = {
 	.to_rec_update	= dtx_cos_rec_update,
 };
 
+static int
+dtx_cos_del_one(struct ds_cont_child *cont, struct dtx_cos_rec_child *dcrc)
+{
+	struct dtx_cos_key	 key;
+	d_iov_t			 kiov;
+	struct dtx_cos_rec	*dcr = dcrc->dcrc_ptr;
+	uint64_t		 time = daos_getmtime_coarse() - dcrc->dcrc_ready_time;
+	int			 rc = 0;
+
+	d_list_del(&dcrc->dcrc_gl_committable);
+	d_list_del(&dcrc->dcrc_lo_link);
+	if (!d_list_empty(&dcrc->dcrc_batched_link))
+		d_list_del_init(&dcrc->dcrc_batched_link);
+
+	if (dcrc->dcrc_expcmt)
+		dcr->dcr_expcmt_count--;
+	else if (dcrc->dcrc_prio)
+		dcr->dcr_prio_count--;
+	else
+		dcr->dcr_reg_count--;
+
+	if (dcrc->dcrc_coll)
+		cont->sc_dtx_committable_coll_count--;
+	cont->sc_dtx_committable_count--;
+
+	d_tm_set_gauge(dtx_tls_get()->dt_async_cmt_lat, time);
+
+	if (dcr->dcr_reg_count == 0 && dcr->dcr_prio_count == 0 && dcr->dcr_expcmt_count == 0) {
+		key.oid = dcr->dcr_oid;
+		key.dkey_hash = dcr->dcr_dkey_hash;
+		d_iov_set(&kiov, &key, sizeof(key));
+		rc = dbtree_delete(cont->sc_dtx_cos_hdl, BTR_PROBE_EQ, &kiov, NULL);
+	}
+
+	DL_CDEBUG(rc != 0, DLOG_ERR, DB_IO, rc,
+		  "Remove DTX "DF_DTI" from CoS cache", DP_DTI(&dcrc->dcrc_dte->dte_xid));
+
+	if (dcrc->dcrc_coll)
+		dtx_coll_entry_put(dcrc->dcrc_dce);
+	else
+		dtx_entry_put(dcrc->dcrc_dte);
+
+	D_FREE(dcrc);
+
+	return rc;
+}
+
 int
 dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt,
 		      daos_unit_oid_t *oid, daos_epoch_t epoch, bool force,
@@ -306,18 +372,45 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt,
 	uint32_t			  count;
 	uint32_t			  i = 0;
 
+	/* Last batched commit failed, let's re-commit them. */
+	if (dcks == NULL && !d_list_empty(&cont->sc_dtx_batched_list)) {
+		dcrc = d_list_entry(cont->sc_dtx_batched_list.next, struct dtx_cos_rec_child,
+				    dcrc_batched_link);
+		if (unlikely(dcrc->dcrc_coll)) {
+			*p_dce = dtx_coll_entry_get(dcrc->dcrc_dce);
+			return 1;
+		}
+
+		D_ALLOC_ARRAY(dte_buf, max_cnt);
+		if (dte_buf == NULL)
+			return -DER_NOMEM;
+
+		d_list_for_each_entry(dcrc, &cont->sc_dtx_batched_list, dcrc_batched_link) {
+			D_ASSERT(i < max_cnt);
+			dte_buf[i++] = dtx_entry_get(dcrc->dcrc_dte);
+		}
+
+		*dtes = dte_buf;
+		return i;
+	}
+
 	/* Process collective DXT with higher priority. */
 	if (!d_list_empty(&cont->sc_dtx_coll_list) && oid == NULL) {
 		d_list_for_each_entry(dcrc, &cont->sc_dtx_coll_list, dcrc_gl_committable) {
 			if (epoch >= dcrc->dcrc_epoch &&
 			    (dcrc->dcrc_piggyback_refs == 0 || force)) {
-				D_ALLOC_PTR(dck_buf);
-				if (dck_buf == NULL)
-					return -DER_NOMEM;
-
-				dck_buf->oid = dcrc->dcrc_ptr->dcr_oid;
-				dck_buf->dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash;
-				*dcks = dck_buf;
+				if (dcks != NULL) {
+					D_ALLOC_PTR(dck_buf);
+					if (dck_buf == NULL)
+						return -DER_NOMEM;
+
+					dck_buf->oid = dcrc->dcrc_ptr->dcr_oid;
+					dck_buf->dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash;
+					*dcks = dck_buf;
+				} else {
+					d_list_add_tail(&dcrc->dcrc_batched_link,
+							&cont->sc_dtx_batched_list);
+				}
 				*p_dce = dtx_coll_entry_get(dcrc->dcrc_dce);
 
 				return 1;
@@ -326,19 +419,19 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt,
 	}
 
 	count = min(cont->sc_dtx_committable_count, max_cnt);
-	if (count == 0) {
-		*dtes = NULL;
+	if (count == 0)
 		return 0;
-	}
 
 	D_ALLOC_ARRAY(dte_buf, count);
 	if (dte_buf == NULL)
 		return -DER_NOMEM;
 
-	D_ALLOC_ARRAY(dck_buf, count);
-	if (dck_buf == NULL) {
-		D_FREE(dte_buf);
-		return -DER_NOMEM;
+	if (dcks != NULL) {
+		D_ALLOC_ARRAY(dck_buf, count);
+		if (dck_buf == NULL) {
+			D_FREE(dte_buf);
+			return -DER_NOMEM;
+		}
 	}
 
 	d_list_for_each_entry(dcrc, &cont->sc_dtx_cos_list, dcrc_gl_committable) {
@@ -353,17 +446,26 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt,
 				continue;
 
 			D_FREE(dte_buf);
-			dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid;
-			dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash;
-			*dcks = dck_buf;
+			if (dcks != NULL) {
+				dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid;
+				dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash;
+				*dcks = dck_buf;
+			} else {
+				d_list_add_tail(&dcrc->dcrc_batched_link,
+						&cont->sc_dtx_batched_list);
+			}
 			*p_dce = dtx_coll_entry_get(dcrc->dcrc_dce);
 
 			return 1;
 		}
 
 		dte_buf[i] = dtx_entry_get(dcrc->dcrc_dte);
-		dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid;
-		dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash;
+		if (dcks != NULL) {
+			dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid;
+			dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash;
+		} else {
+			d_list_add_tail(&dcrc->dcrc_batched_link, &cont->sc_dtx_batched_list);
+		}
 
 		if (++i >= count)
 			break;
@@ -372,10 +474,10 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt,
 	if (i == 0) {
 		D_FREE(dte_buf);
 		D_FREE(dck_buf);
-		*dtes = NULL;
 	} else {
 		*dtes = dte_buf;
-		*dcks = dck_buf;
+		if (dcks != NULL)
+			*dcks = dck_buf;
 	}
 
 	return i;
@@ -436,32 +538,44 @@ dtx_cos_get_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid,
 }
 
 void
-dtx_cos_put_piggyback(struct ds_cont_child *cont, struct dtx_id *xid,
-		      daos_unit_oid_t *oid, uint64_t dkey_hash)
+dtx_cos_put_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid, uint64_t dkey_hash,
+		      struct dtx_id xid[], uint32_t count, bool rm)
 {
 	struct dtx_cos_key		 key;
 	d_iov_t				 kiov;
 	d_iov_t				 riov;
 	struct dtx_cos_rec		*dcr;
 	struct dtx_cos_rec_child	*dcrc;
+	int				 del = 0;
 	int				 rc;
+	int				 i;
 
 	key.oid = *oid;
 	key.dkey_hash = dkey_hash;
 	d_iov_set(&kiov, &key, sizeof(key));
 	d_iov_set(&riov, NULL, 0);
 
-	/* It is normal that the DTX entry (to be put) in CoS has already been removed by race. */
-
 	rc = dbtree_lookup(cont->sc_dtx_cos_hdl, &kiov, &riov);
 	if (rc == 0) {
 		dcr = (struct dtx_cos_rec *)riov.iov_buf;
-		d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) {
-			if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) {
-				dcrc->dcrc_piggyback_refs--;
-				return;
+		for (i = 0; i < count; i++) {
+			d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) {
+				if (memcmp(&dcrc->dcrc_dte->dte_xid, &xid[i],
+					   sizeof(struct dtx_id)) == 0) {
+					if (rm) {
+						rc = dtx_cos_del_one(cont, dcrc);
+						if (rc == 0)
+							del++;
+					} else {
+						dcrc->dcrc_piggyback_refs--;
+					}
+					break;
+				}
 			}
 		}
+
+		if (del > 0)
+			d_tm_dec_gauge(dtx_tls_get()->dt_committable, del);
 	}
 }
 
@@ -493,12 +607,12 @@ dtx_cos_add(struct ds_cont_child *cont, void *entry, daos_unit_oid_t *oid,
 			   DAOS_INTENT_UPDATE, &kiov, &riov, NULL);
 
 	if (flags & DCF_COLL)
-		D_CDEBUG(rc != 0, DLOG_ERR, DB_IO, "Insert coll DTX "DF_DTI" to CoS cache, "
+		D_CDEBUG(rc != 0, DLOG_ERR, DB_TRACE, "Insert coll DTX "DF_DTI" to CoS cache, "
 			 DF_UOID", key %lu, flags %x: "DF_RC"\n",
 			 DP_DTI(&((struct dtx_coll_entry *)entry)->dce_xid), DP_UOID(*oid),
 			 (unsigned long)dkey_hash, flags, DP_RC(rc));
 	else
-		D_CDEBUG(rc != 0, DLOG_ERR, DB_IO, "Insert reg DTX "DF_DTI" to CoS cache, "
+		D_CDEBUG(rc != 0, DLOG_ERR, DB_TRACE, "Insert reg DTX "DF_DTI" to CoS cache, "
 			 DF_UOID", key %lu, flags %x: "DF_RC"\n",
 			 DP_DTI(&((struct dtx_entry *)entry)->dte_xid), DP_UOID(*oid),
 			 (unsigned long)dkey_hash, flags, DP_RC(rc));
@@ -530,82 +644,36 @@ dtx_cos_del(struct ds_cont_child *cont, struct dtx_id *xid,
 	dcr = (struct dtx_cos_rec *)riov.iov_buf;
 
 	d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) {
-		if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) != 0)
-			continue;
-
-		d_list_del(&dcrc->dcrc_gl_committable);
-		d_list_del(&dcrc->dcrc_lo_link);
-		if (dcrc->dcrc_coll) {
-			dtx_coll_entry_put(dcrc->dcrc_dce);
-			cont->sc_dtx_committable_coll_count--;
-		} else {
-			dtx_entry_put(dcrc->dcrc_dte);
+		if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) {
+			rc = dtx_cos_del_one(cont, dcrc);
+			D_GOTO(out, found = 1);
 		}
-		D_FREE(dcrc);
-
-		cont->sc_dtx_committable_count--;
-		dcr->dcr_prio_count--;
-
-		D_GOTO(out, found = 1);
 	}
 
 	d_list_for_each_entry(dcrc, &dcr->dcr_reg_list, dcrc_lo_link) {
-		if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) != 0)
-			continue;
-
-		d_list_del(&dcrc->dcrc_gl_committable);
-		d_list_del(&dcrc->dcrc_lo_link);
-		if (dcrc->dcrc_coll) {
-			dtx_coll_entry_put(dcrc->dcrc_dce);
-			cont->sc_dtx_committable_coll_count--;
-		} else {
-			dtx_entry_put(dcrc->dcrc_dte);
+		if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) {
+			rc = dtx_cos_del_one(cont, dcrc);
+			D_GOTO(out, found = 2);
 		}
-		D_FREE(dcrc);
-
-		cont->sc_dtx_committable_count--;
-		dcr->dcr_reg_count--;
-
-		D_GOTO(out, found = 2);
 	}
 
 	d_list_for_each_entry(dcrc, &dcr->dcr_expcmt_list, dcrc_lo_link) {
-		if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) != 0)
-			continue;
-
-		d_list_del(&dcrc->dcrc_gl_committable);
-		d_list_del(&dcrc->dcrc_lo_link);
-		if (dcrc->dcrc_coll) {
-			dtx_coll_entry_put(dcrc->dcrc_dce);
-			cont->sc_dtx_committable_coll_count--;
-		} else {
-			dtx_entry_put(dcrc->dcrc_dte);
+		if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) {
+			rc = dtx_cos_del_one(cont, dcrc);
+			D_GOTO(out, found = 3);
 		}
-		D_FREE(dcrc);
-
-		cont->sc_dtx_committable_count--;
-		dcr->dcr_expcmt_count--;
-
-		D_GOTO(out, found = 3);
 	}
 
 out:
-	if (found > 0) {
+	if (found > 0)
 		d_tm_dec_gauge(dtx_tls_get()->dt_committable, 1);
 
-		if (dcr->dcr_reg_count == 0 && dcr->dcr_prio_count == 0 &&
-		    dcr->dcr_expcmt_count == 0)
-			rc = dbtree_delete(cont->sc_dtx_cos_hdl, BTR_PROBE_EQ, &kiov, NULL);
-	}
-
 	if (rc == 0 && found == 0)
 		rc = -DER_NONEXIST;
 
-	D_CDEBUG(rc != 0 && rc != -DER_NONEXIST, DLOG_ERR, DB_IO,
-		 "Remove DTX "DF_DTI" from CoS "
-		 "cache, "DF_UOID", key %lu, %s shared entry: rc = "DF_RC"\n",
-		 DP_DTI(xid), DP_UOID(*oid), (unsigned long)dkey_hash,
-		 found == 1 ? "has" : "has not", DP_RC(rc));
+	DL_CDEBUG(rc != 0 && rc != -DER_NONEXIST, DLOG_ERR, DB_TRACE, rc,
+		  "Remove DTX from CoS cache "DF_UOID", key %lu",
+		  DP_UOID(*oid), (unsigned long)dkey_hash);
 
 	return rc == -DER_NONEXIST ? 0 : rc;
 }
@@ -624,6 +692,12 @@ dtx_cos_oldest(struct ds_cont_child *cont)
 	return dcrc->dcrc_epoch;
 }
 
+/*
+ * It is inefficient to search some item on a very long list. So let's skip
+ * the search if the length exceeds DTX_COS_SEARCH_MAX. That is not fatal.
+ */
+#define DTX_COS_SEARCH_MAX	32
+
 void
 dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid,
 	     daos_unit_oid_t *oid, uint64_t dkey_hash)
@@ -647,8 +721,13 @@ dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid,
 
 	dcr = (struct dtx_cos_rec *)riov.iov_buf;
 
+	if (dcr->dcr_reg_count > DTX_COS_SEARCH_MAX)
+		goto expcmt;
+
 	d_list_for_each_entry(dcrc, &dcr->dcr_reg_list, dcrc_lo_link) {
 		if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) {
+			dcrc->dcrc_reg = 0;
+			dcrc->dcrc_prio = 1;
 			d_list_del(&dcrc->dcrc_lo_link);
 			d_list_add(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list);
 			dcr->dcr_reg_count--;
@@ -658,14 +737,9 @@ dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid,
 		}
 	}
 
-	d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) {
-		if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) {
-			d_list_del(&dcrc->dcrc_lo_link);
-			d_list_add(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list);
-
-			D_GOTO(out, found = true);
-		}
-	}
+expcmt:
+	if (dcr->dcr_expcmt_count > DTX_COS_SEARCH_MAX)
+		goto out;
 
 	d_list_for_each_entry(dcrc, &dcr->dcr_expcmt_list, dcrc_lo_link) {
 		if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0)
@@ -683,3 +757,39 @@ dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid,
 
 	/* It is normal that the DTX entry (for priority) in CoS has been committed by race. */
 }
+
+void
+dtx_cos_batched_del(struct ds_cont_child *cont, struct dtx_id xid[], bool rm[], uint32_t count)
+{
+	struct dtx_cos_rec_child	*dcrc;
+	int				 del = 0;
+	int				 rc;
+	int				 i = 0;
+	bool				 found;
+
+	while ((dcrc = d_list_pop_entry(&cont->sc_dtx_batched_list, struct dtx_cos_rec_child,
+					dcrc_batched_link)) != NULL) {
+		for (found = false; i < count && !found; i++) {
+			/*
+			 * Some entries in the sc_dtx_batched_list may have been committed by
+			 * others by race. Since the entries order in the sc_dtx_batched_list
+			 * will not be changed, let's compare with xid[i] via one cycle scan.
+			 */
+			if (memcmp(&dcrc->dcrc_dte->dte_xid, &xid[i], sizeof(struct dtx_id)) == 0) {
+				found = true;
+
+				if (rm[i]) {
+					rc = dtx_cos_del_one(cont, dcrc);
+					if (rc == 0)
+						del++;
+				}
+			}
+		}
+
+		/* There must be one in xid array that matches current dcrc. */
+		D_ASSERT(found);
+	}
+
+	if (del > 0)
+		d_tm_dec_gauge(dtx_tls_get()->dt_committable, del);
+}
diff --git a/src/dtx/dtx_internal.h b/src/dtx/dtx_internal.h
index a42bcc1d7d6..06d0333dd77 100644
--- a/src/dtx/dtx_internal.h
+++ b/src/dtx/dtx_internal.h
@@ -213,6 +213,7 @@ struct dtx_pool_metrics {
 struct dtx_tls {
 	struct d_tm_node_t	*dt_committable;
 	struct d_tm_node_t	*dt_dtx_leader_total;
+	struct d_tm_node_t	*dt_async_cmt_lat;
 	uint64_t		 dt_agg_gen;
 	uint32_t		 dt_batched_ult_cnt;
 };
@@ -263,6 +264,9 @@ uint64_t dtx_cos_oldest(struct ds_cont_child *cont);
 void dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid,
 		  daos_unit_oid_t *oid, uint64_t dkey_hash);
 
+void dtx_cos_batched_del(struct ds_cont_child *cont, struct dtx_id xid[], bool rm[],
+			 uint32_t count);
+
 /* dtx_rpc.c */
 int dtx_check(struct ds_cont_child *cont, struct dtx_entry *dte,
 	      daos_epoch_t epoch);
diff --git a/src/dtx/dtx_resync.c b/src/dtx/dtx_resync.c
index b98a6469954..756dfe0ca44 100644
--- a/src/dtx/dtx_resync.c
+++ b/src/dtx/dtx_resync.c
@@ -115,7 +115,7 @@ dtx_resync_commit(struct ds_cont_child *cont,
 	}
 
 	if (j > 0) {
-		rc = dtx_commit(cont, dtes, dcks, j);
+		rc = dtx_commit(cont, dtes, dcks, j, true);
 		if (rc < 0)
 			D_ERROR("Failed to commit the DTXs: rc = "DF_RC"\n",
 				DP_RC(rc));
@@ -359,7 +359,7 @@ dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, daos_un
 
 		dck.oid = oid;
 		dck.dkey_hash = dkey_hash;
-		rc = dtx_coll_commit(cont, dce, &dck);
+		rc = dtx_coll_commit(cont, dce, &dck, true);
 	}
 
 	dtx_coll_entry_put(dce);
diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c
index e654047a621..5423f6c0cc8 100644
--- a/src/dtx/dtx_rpc.c
+++ b/src/dtx/dtx_rpc.c
@@ -225,9 +225,10 @@ dtx_req_cb(const struct crt_cb_info *cb_info)
 	}
 
 out:
-	D_DEBUG(DB_TRACE, "DTX req for opc %x (req %p future %p) got reply from %d/%d: "
-		"epoch :"DF_X64", result %d\n", dra->dra_opc, req, dra->dra_future,
-		drr->drr_rank, drr->drr_tag, din != NULL ? din->di_epoch : 0, rc);
+	DL_CDEBUG(rc < 0 && rc != -DER_NONEXIST, DLOG_ERR, DB_TRACE, rc,
+		  "DTX req for opc %x (req %p future %p) got reply from %d/%d: "
+		  "epoch :"DF_X64, dra->dra_opc, req, dra->dra_future,
+		  drr->drr_rank, drr->drr_tag, din != NULL ? din->di_epoch : 0);
 
 	drr->drr_comp = 1;
 	drr->drr_result = rc;
@@ -397,19 +398,14 @@ dtx_req_list_send(struct dtx_common_args *dca, bool is_reentrance)
 
 		if (unlikely(dra->dra_opc == DTX_COMMIT && dca->dca_i == 0 &&
 			     DAOS_FAIL_CHECK(DAOS_DTX_FAIL_COMMIT)))
-			rc = dtx_req_send(dca->dca_drr, 1);
+			dtx_req_send(dca->dca_drr, 1);
 		else
-			rc = dtx_req_send(dca->dca_drr, dca->dca_epoch);
-		if (rc != 0) {
-			/* If the first sub-RPC failed, then break, otherwise
-			 * other remote replicas may have already received the
-			 * RPC and executed it, so have to go ahead.
-			 */
-			if (dca->dca_i == 0) {
-				ABT_future_free(&dra->dra_future);
-				return rc;
-			}
-		}
+			dtx_req_send(dca->dca_drr, dca->dca_epoch);
+		/*
+		 * Do not care dtx_req_send result, itself or its cb func will set dra->dra_future.
+		 * Each RPC is independent from the others, let's go head to handle the other RPCs
+		 * and set dra->dra_future that will avoid blocking the RPC sponsor - dtx_req_wait.
+		 */
 
 		/* dca->dca_drr maybe not points to a real entry if all RPCs have been sent. */
 		dca->dca_drr = d_list_entry(dca->dca_drr->drr_link.next,
@@ -616,12 +612,8 @@ dtx_rpc_helper(struct dss_chore *chore, bool is_reentrance)
 	rc = dtx_req_list_send(dca, is_reentrance);
 	if (rc == DSS_CHORE_YIELD)
 		return DSS_CHORE_YIELD;
-	if (rc == DSS_CHORE_DONE)
-		rc = 0;
-	if (rc != 0)
-		dca->dca_dra.dra_result = rc;
-	D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE, "%p: DTX RPC chore for %u done: %d\n", chore,
-		 dca->dca_dra.dra_opc, rc);
+	D_ASSERTF(rc == DSS_CHORE_DONE, "Unexpected helper return value for RPC %u: %d\n",
+		  dca->dca_dra.dra_opc, rc);
 	if (dca->dca_chore_eventual != ABT_EVENTUAL_NULL) {
 		rc = ABT_eventual_set(dca->dca_chore_eventual, NULL, 0);
 		D_ASSERTF(rc == ABT_SUCCESS, "ABT_eventual_set: %d\n", rc);
@@ -737,8 +729,10 @@ dtx_rpc(struct ds_cont_child *cont,d_list_t *dti_list,  struct dtx_entry **dtes,
 			}
 
 			rc = dss_chore_delegate(&dca->dca_chore, dtx_rpc_helper);
-			if (rc != 0)
+			if (rc != 0) {
+				ABT_eventual_free(&dca->dca_chore_eventual);
 				goto out;
+			}
 
 			rc = ABT_eventual_wait(dca->dca_chore_eventual, NULL);
 			D_ASSERTF(rc == ABT_SUCCESS, "ABT_eventual_wait: %d\n", rc);
@@ -809,7 +803,7 @@ dtx_rpc(struct ds_cont_child *cont,d_list_t *dti_list,  struct dtx_entry **dtes,
  */
 int
 dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes,
-	   struct dtx_cos_key *dcks, int count)
+	   struct dtx_cos_key *dcks, int count, bool has_cos)
 {
 	struct dtx_common_args	 dca;
 	struct dtx_req_args	*dra = &dca.dca_dra;
@@ -842,7 +836,7 @@ dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes,
 			rc1 = vos_dtx_set_flags(cont->sc_hdl, dca.dca_dtis, count,
 						DTE_PARTIAL_COMMITTED);
 	} else {
-		if (dcks != NULL) {
+		if (has_cos) {
 			if (count > 1) {
 				D_ALLOC_ARRAY(rm_cos, count);
 				if (rm_cos == NULL)
@@ -862,12 +856,16 @@ dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes,
 		}
 
 		if (rc1 == 0 && rm_cos != NULL) {
-			for (i = 0; i < count; i++) {
-				if (rm_cos[i]) {
-					D_ASSERT(!daos_oid_is_null(dcks[i].oid.id_pub));
-					dtx_cos_del(cont, &dca.dca_dtis[i], &dcks[i].oid,
-						    dcks[i].dkey_hash);
+			if (dcks != NULL) {
+				for (i = 0; i < count; i++) {
+					if (rm_cos[i]) {
+						D_ASSERT(!daos_oid_is_null(dcks[i].oid.id_pub));
+						dtx_cos_del(cont, &dca.dca_dtis[i], &dcks[i].oid,
+							    dcks[i].dkey_hash);
+					}
 				}
+			} else {
+				dtx_cos_batched_del(cont, dca.dca_dtis, rm_cos, count);
 			}
 		}
 
@@ -1237,7 +1235,7 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che
 			case DTX_ST_COMMITTABLE:
 				dck.oid = dsp->dsp_oid;
 				dck.dkey_hash = dsp->dsp_dkey_hash;
-				rc = dtx_commit(cont, &pdte, &dck, 1);
+				rc = dtx_commit(cont, &pdte, &dck, 1, true);
 				if (rc < 0 && rc != -DER_NONEXIST && for_io)
 					d_list_add_tail(&dsp->dsp_link, cmt_list);
 				else
@@ -1258,7 +1256,7 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che
 		case DSHR_NEED_COMMIT: {
 			dck.oid = dsp->dsp_oid;
 			dck.dkey_hash = dsp->dsp_dkey_hash;
-			rc = dtx_commit(cont, &pdte, &dck, 1);
+			rc = dtx_commit(cont, &pdte, &dck, 1, true);
 			if (rc < 0 && rc != -DER_NONEXIST && for_io)
 				d_list_add_tail(&dsp->dsp_link, cmt_list);
 			else
@@ -1571,17 +1569,20 @@ dtx_coll_rpc_post(struct dtx_coll_rpc_args *dcra, int ret)
 {
 	int	rc;
 
-	rc = ABT_future_wait(dcra->dcra_future);
-	D_CDEBUG(rc != ABT_SUCCESS, DLOG_ERR, DB_TRACE,
-		 "Collective DTX wait req for opc %u, future %p done, rc %d, result %d\n",
-		 dcra->dcra_opc, dcra->dcra_future, rc, dcra->dcra_result);
-	ABT_future_free(&dcra->dcra_future);
+	if (dcra->dcra_future != ABT_FUTURE_NULL) {
+		rc = ABT_future_wait(dcra->dcra_future);
+		D_CDEBUG(rc != ABT_SUCCESS, DLOG_ERR, DB_TRACE,
+			 "Collective DTX wait req for opc %u, future %p done, rc %d, result %d\n",
+			 dcra->dcra_opc, dcra->dcra_future, rc, dcra->dcra_result);
+		ABT_future_free(&dcra->dcra_future);
+	}
 
 	return ret != 0 ? ret : dcra->dcra_result;
 }
 
 int
-dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck)
+dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck,
+		bool has_cos)
 {
 	struct dtx_coll_rpc_args	 dcra = { 0 };
 	int				*results = NULL;
@@ -1591,11 +1592,22 @@ dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct d
 	int				 rc1 = 0;
 	int				 rc2 = 0;
 	int				 i;
+	bool				 cos = true;
 
 	if (dce->dce_ranks != NULL)
 		rc = dtx_coll_rpc_prep(cont, dce, DTX_COLL_COMMIT, 0, &dcra);
 
+	/*
+	 * NOTE: Before committing the DTX on remote participants, we cannot remove the active
+	 *	 DTX locally; otherwise, the local committed DTX entry may be removed via DTX
+	 *	 aggregation before remote participants commit done. Under such case, if some
+	 *	 remote DTX participant triggere DTX_REFRESH for such DTX during the interval,
+	 *	 then it will get -DER_TX_UNCERTAIN, that may cause related application to be
+	 *	 failed. So here, we let remote participants to commit firstly, if failed, we
+	 *	 will ask the leader to retry the commit until all participants got committed.
+	 */
 	if (dce->dce_bitmap != NULL) {
+		clrbit(dce->dce_bitmap, dss_get_module_info()->dmi_tgt_id);
 		len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, 0,
 					  DTX_COLL_COMMIT, dce->dce_bitmap_sz, dce->dce_bitmap,
 					  &results);
@@ -1634,8 +1646,12 @@ dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct d
 	 *	 to remove the collective DTX entry from the CoS even if the commit failed remotely.
 	 *	 Otherwise, the batched commit ULT may be blocked by such "bad" entry.
 	 */
-	if (rc2 == 0 && dck != NULL)
-		dtx_cos_del(cont, &dce->dce_xid, &dck->oid, dck->dkey_hash);
+	if (rc2 == 0 && has_cos) {
+		if (dck != NULL)
+			dtx_cos_del(cont, &dce->dce_xid, &dck->oid, dck->dkey_hash);
+		else
+			dtx_cos_batched_del(cont, &dce->dce_xid, &cos, 1);
+	}
 
 	D_CDEBUG(rc != 0 || rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE,
 		 "Collectively commit DTX "DF_DTI": %d/%d/%d\n",
@@ -1658,7 +1674,17 @@ dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc
 	if (dce->dce_ranks != NULL)
 		rc = dtx_coll_rpc_prep(cont, dce, DTX_COLL_ABORT, epoch, &dcra);
 
+	/*
+	 * NOTE: The DTX abort maybe triggered by dtx_leader_end() for timeout on some DTX
+	 *	 participant(s). Under such case, the client side RPC sponsor may also hit
+	 *	 the RPC timeout and resends related RPC to the leader. Here, to avoid DTX
+	 *	 abort and resend RPC forwarding being executed in parallel, we will abort
+	 *	 local DTX after remote done, before that the logic of handling resent RPC
+	 *	 on server will find the local pinned DTX entry then notify related client
+	 *	 to resend sometime later.
+	 */
 	if (dce->dce_bitmap != NULL) {
+		clrbit(dce->dce_bitmap, dss_get_module_info()->dmi_tgt_id);
 		len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, epoch,
 					  DTX_COLL_ABORT, dce->dce_bitmap_sz, dce->dce_bitmap,
 					  &results);
diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c
index 93cc6744a2d..9e78100c451 100644
--- a/src/dtx/dtx_srv.c
+++ b/src/dtx/dtx_srv.c
@@ -34,8 +34,8 @@ dtx_tls_init(int tags, int xs_id, int tgt_id)
 
 	tls->dt_agg_gen = 1;
 	rc = d_tm_add_metric(&tls->dt_committable, D_TM_STATS_GAUGE,
-			     "total number of committable DTX entries",
-			     "entries", "io/dtx/committable/tgt_%u", tgt_id);
+			     "total number of committable DTX entries", "entry",
+			     "io/dtx/committable/tgt_%u", tgt_id);
 	if (rc != DER_SUCCESS)
 		D_WARN("Failed to create DTX committable metric: " DF_RC"\n",
 		       DP_RC(rc));
@@ -48,6 +48,13 @@ dtx_tls_init(int tags, int xs_id, int tgt_id)
 		D_WARN("Failed to create DTX leader metric: " DF_RC"\n",
 		       DP_RC(rc));
 
+	rc = d_tm_add_metric(&tls->dt_async_cmt_lat, D_TM_STATS_GAUGE,
+			     "DTX async commit latency", "ms",
+			     "io/dtx/async_cmt_lat/tgt_%u", tgt_id);
+	if (rc != DER_SUCCESS)
+		D_WARN("Failed to create DTX async commit latency metric: " DF_RC"\n",
+		       DP_RC(rc));
+
 	return tls;
 }
 
@@ -117,7 +124,7 @@ dtx_metrics_alloc(const char *path, int tgt_id)
 
 	rc = d_tm_add_metric(&metrics->dpm_total[DTX_PROTO_SRV_RPC_COUNT], D_TM_COUNTER,
 			     "total number of processed sync DTX_COMMIT", "ops",
-			     "%s/ops/sync_dtx_commit/tgt_%u", path, tgt_id);
+			     "%s/ops/dtx_sync_commit/tgt_%u", path, tgt_id);
 	if (rc != DER_SUCCESS)
 		D_WARN("Failed to create sync DTX_COMMIT RPC cnt metric: "DF_RC"\n", DP_RC(rc));
 
diff --git a/src/engine/ult.c b/src/engine/ult.c
index 94a2a0f9390..fbeb3f538fa 100644
--- a/src/engine/ult.c
+++ b/src/engine/ult.c
@@ -458,7 +458,7 @@ sched_ult2xs(int xs_type, int tgt_id)
 		break;
 	case DSS_XS_OFFLOAD:
 		if (dss_numa_nr > 1)
-			xs_id = sched_ult2xs_multisocket(xs_type, tgt_id);
+			return sched_ult2xs_multisocket(xs_type, tgt_id);
 		if (!dss_helper_pool) {
 			if (dss_tgt_offload_xs_nr > 0)
 				xs_id = DSS_MAIN_XS_ID(tgt_id) + dss_tgt_offload_xs_nr / dss_tgt_nr;
diff --git a/src/include/daos_srv/container.h b/src/include/daos_srv/container.h
index 793d585c07a..f99f4df14e3 100644
--- a/src/include/daos_srv/container.h
+++ b/src/include/daos_srv/container.h
@@ -134,6 +134,8 @@ struct ds_cont_child {
 	d_list_t		 sc_dtx_cos_list;
 	/* The global list for committable collective DTXs. */
 	d_list_t		 sc_dtx_coll_list;
+	/* The list for current DTX batched commit. */
+	d_list_t		 sc_dtx_batched_list;
 	/* the pool map version of updating DAOS_PROP_CO_STATUS prop */
 	uint32_t		 sc_status_pm_ver;
 };
diff --git a/src/include/daos_srv/dtx_srv.h b/src/include/daos_srv/dtx_srv.h
index 7b3f7e9ee57..7c60d2deaa0 100644
--- a/src/include/daos_srv/dtx_srv.h
+++ b/src/include/daos_srv/dtx_srv.h
@@ -318,8 +318,8 @@ int
 dtx_cos_get_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid, uint64_t dkey_hash,
 		      int max, struct dtx_id **dtis);
 void
-dtx_cos_put_piggyback(struct ds_cont_child *cont, struct dtx_id *xid,
-		      daos_unit_oid_t *oid, uint64_t dkey_hash);
+dtx_cos_put_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid, uint64_t dkey_hash,
+		      struct dtx_id xid[], uint32_t count, bool rm);
 int
 dtx_leader_exec_ops(struct dtx_leader_handle *dlh, dtx_sub_func_t func,
 		    dtx_agg_cb_t agg_cb, int allow_failure, void *func_arg);
@@ -338,14 +338,15 @@ int dtx_obj_sync(struct ds_cont_child *cont, daos_unit_oid_t *oid,
 		 daos_epoch_t epoch);
 
 int dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes,
-	       struct dtx_cos_key *dcks, int count);
+	       struct dtx_cos_key *dcks, int count, bool has_cos);
 
 int dtx_abort(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch);
 
 int dtx_refresh(struct dtx_handle *dth, struct ds_cont_child *cont);
 
 int
-dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck);
+dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck,
+		bool has_cos);
 
 int
 dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoch_t epoch);
diff --git a/src/tests/ftest/util/telemetry_utils.py b/src/tests/ftest/util/telemetry_utils.py
index db424b6de68..17f1753b100 100644
--- a/src/tests/ftest/util/telemetry_utils.py
+++ b/src/tests/ftest/util/telemetry_utils.py
@@ -90,6 +90,7 @@ class TelemetryUtils():
         "engine_pool_ops_dtx_coll_commit",
         "engine_pool_ops_dtx_commit",
         "engine_pool_ops_dtx_refresh",
+        "engine_pool_ops_dtx_sync_commit",
         "engine_pool_ops_ec_agg",
         "engine_pool_ops_ec_rep",
         "engine_pool_ops_fetch",
@@ -200,6 +201,8 @@ class TelemetryUtils():
         "engine_dmabuff_queued_reqs",
         "engine_dmabuff_grab_errs",
         *_gen_stats_metrics("engine_dmabuff_grab_retries")]
+    ENGINE_IO_DTX_ASYNC_CMT_LAT_METRICS = \
+        _gen_stats_metrics("engine_io_dtx_async_cmt_lat")
     ENGINE_IO_DTX_COMMITTABLE_METRICS = \
         _gen_stats_metrics("engine_io_dtx_committable")
     ENGINE_IO_DTX_COMMITTED_METRICS = \
@@ -304,7 +307,8 @@ class TelemetryUtils():
         _gen_stats_metrics("engine_io_ops_tgt_update_active")
     ENGINE_IO_OPS_UPDATE_ACTIVE_METRICS = \
         _gen_stats_metrics("engine_io_ops_update_active")
-    ENGINE_IO_METRICS = ENGINE_IO_DTX_COMMITTABLE_METRICS +\
+    ENGINE_IO_METRICS = ENGINE_IO_DTX_ASYNC_CMT_LAT_METRICS +\
+        ENGINE_IO_DTX_COMMITTABLE_METRICS +\
         ENGINE_IO_DTX_COMMITTED_METRICS +\
         ENGINE_IO_LATENCY_FETCH_METRICS +\
         ENGINE_IO_LATENCY_BULK_FETCH_METRICS +\

From 2b5620b230df6a7c3700f10128998cfb5360b2fc Mon Sep 17 00:00:00 2001
From: Jerome Soumagne <jerome.soumagne@intel.com>
Date: Thu, 24 Oct 2024 10:47:16 -0500
Subject: [PATCH 19/34] DAOS-14262 cart: add ability to select traffic class
 for SWIM context  (#14893) (#14917)

Add SWIM_TRAFFIC_CLASS env var (default is unspec)

Signed-off-by: Jerome Soumagne <jerome.soumagne@intel.com>
---
 src/cart/README.env           | 12 +++++++-----
 src/cart/crt_hg.c             |  3 +++
 src/cart/crt_init.c           | 36 ++++++++++++++++++++++++++---------
 src/cart/crt_internal_types.h | 15 +++++++++++++++
 4 files changed, 52 insertions(+), 14 deletions(-)

diff --git a/src/cart/README.env b/src/cart/README.env
index 00f270d7a41..b90939c8c72 100644
--- a/src/cart/README.env
+++ b/src/cart/README.env
@@ -1,13 +1,10 @@
 This file lists the environment variables used in CaRT.
 
  . D_PROVIDER (Deprecated: CRT_PHY_ADDR_STR)
-   It determines which mercury NA plugin to be used:
+   It determines which mercury NA plugin and transport to be used:
    - set it as "ofi+verbs;ofi_rxm" to use OFI verbs;ofi_rxm provider
-   - set it as "ofi+gni" to use OFI gni provider
    - set it as "sm" to use SM plugin which only works within single node
-   - set it as "ofi+tcp;ofi_rxm" to use OFI tcp;ofi_rxm provider.
-   - set it as "ofi+sockets" to use OFI sockets provider
-      NOTE: This provider is deprecated in favor of "ofi+tcp;ofi_rxm"
+   - set it as "ofi+tcp" to use OFI tcp provider.
    - by default (not set or set as any other value) it will use ofi tcp
       provider.
 
@@ -205,3 +202,8 @@ This file lists the environment variables used in CaRT.
    start copying data in an effort to release multi-recv buffers. Copy will occur when at
    most D_MRECV_BUF_COPY buffers remain.
 
+   SWIM_TRAFFIC_CLASS
+   (server only) Select a traffic class for the SWIM protocol to use and prevent potential
+   traffic congestion. Available options are: "unspec" (default), "best_effort",
+   "low_latency", "bulk_data".
+
diff --git a/src/cart/crt_hg.c b/src/cart/crt_hg.c
index 789d75ceb31..244373c9228 100644
--- a/src/cart/crt_hg.c
+++ b/src/cart/crt_hg.c
@@ -863,6 +863,9 @@ crt_hg_class_init(crt_provider_t provider, int ctx_idx, bool primary, int iface_
 	init_info.request_post_incr         = crt_gdata.cg_post_incr;
 	init_info.multi_recv_op_max         = crt_gdata.cg_mrecv_buf;
 	init_info.multi_recv_copy_threshold = crt_gdata.cg_mrecv_buf_copy;
+	/* Separate SWIM traffic in an effort to prevent potential congestion. */
+	if (crt_is_service() && ctx_idx == crt_gdata.cg_swim_crt_idx)
+		init_info.traffic_class = (enum na_traffic_class)crt_gdata.cg_swim_tc;
 
 	hg_class = HG_Init_opt2(info_string, crt_is_service(), HG_VERSION(2, 4), &init_info);
 	if (hg_class == NULL) {
diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c
index 21fc184d446..48d2090a5b3 100644
--- a/src/cart/crt_init.c
+++ b/src/cart/crt_init.c
@@ -18,6 +18,10 @@ static volatile int	gdata_init_flag;
 struct crt_plugin_gdata crt_plugin_gdata;
 static bool		g_prov_settings_applied[CRT_PROV_COUNT];
 
+#define X(a, b) b,
+static const char *const crt_tc_name[] = {CRT_TRAFFIC_CLASSES};
+#undef X
+
 static void
 crt_lib_init(void) __attribute__((__constructor__));
 
@@ -237,18 +241,30 @@ crt_gdata_dump(void)
 	DUMP_GDATA_FIELD("%d", cg_rpc_quota);
 }
 
+static enum crt_traffic_class
+crt_str_to_tc(const char *str)
+{
+	enum crt_traffic_class i = 0;
+
+	while (str != NULL && strcmp(crt_tc_name[i], str) != 0 && i < CRT_TC_UNKNOWN)
+		i++;
+
+	return i == CRT_TC_UNKNOWN ? CRT_TC_UNSPEC : i;
+}
+
 /* first step init - for initializing crt_gdata */
 static int data_init(int server, crt_init_options_t *opt)
 {
-	uint32_t        timeout = 0;
-	uint32_t	credits;
-	uint32_t	fi_univ_size = 0;
-	uint32_t	mem_pin_enable = 0;
-	uint32_t        is_secondary;
-	uint32_t        post_init = CRT_HG_POST_INIT, post_incr = CRT_HG_POST_INCR;
-	unsigned int    mrecv_buf      = CRT_HG_MRECV_BUF;
-	unsigned int    mrecv_buf_copy = 0; /* buf copy disabled by default */
-	int             rc             = 0;
+	uint32_t     timeout = 0;
+	uint32_t     credits;
+	uint32_t     fi_univ_size   = 0;
+	uint32_t     mem_pin_enable = 0;
+	uint32_t     is_secondary;
+	uint32_t     post_init = CRT_HG_POST_INIT, post_incr = CRT_HG_POST_INCR;
+	unsigned int mrecv_buf          = CRT_HG_MRECV_BUF;
+	unsigned int mrecv_buf_copy     = 0; /* buf copy disabled by default */
+	char        *swim_traffic_class = NULL;
+	int          rc                 = 0;
 
 	crt_env_dump();
 
@@ -261,6 +277,8 @@ static int data_init(int server, crt_init_options_t *opt)
 	crt_gdata.cg_mrecv_buf = mrecv_buf;
 	crt_env_get(D_MRECV_BUF_COPY, &mrecv_buf_copy);
 	crt_gdata.cg_mrecv_buf_copy = mrecv_buf_copy;
+	crt_env_get(SWIM_TRAFFIC_CLASS, &swim_traffic_class);
+	crt_gdata.cg_swim_tc = crt_str_to_tc(swim_traffic_class);
 
 	is_secondary = 0;
 	/* Apply CART-890 workaround for server side only */
diff --git a/src/cart/crt_internal_types.h b/src/cart/crt_internal_types.h
index 38133e0cf17..c3eb9cae12c 100644
--- a/src/cart/crt_internal_types.h
+++ b/src/cart/crt_internal_types.h
@@ -42,6 +42,17 @@ struct crt_na_config {
 	char            **noc_domain_str; /* Array of domains */
 };
 
+#define CRT_TRAFFIC_CLASSES                                                                        \
+	X(CRT_TC_UNSPEC, "unspec")           /* Leave it upon plugin to choose */                  \
+	X(CRT_TC_BEST_EFFORT, "best_effort") /* Best effort */                                     \
+	X(CRT_TC_LOW_LATENCY, "low_latency") /* Low latency */                                     \
+	X(CRT_TC_BULK_DATA, "bulk_data")     /* Bulk data */                                       \
+	X(CRT_TC_UNKNOWN, "unknown")         /* Unknown */
+
+#define X(a, b) a,
+enum crt_traffic_class { CRT_TRAFFIC_CLASSES };
+#undef X
+
 struct crt_prov_gdata {
 	/** NA plugin type */
 	int			cpg_provider;
@@ -105,6 +116,9 @@ struct crt_gdata {
 	/** global swim index for all servers */
 	int32_t			cg_swim_crt_idx;
 
+	/** traffic class used by SWIM */
+	enum crt_traffic_class   cg_swim_tc;
+
 	/** credits limitation for #in-flight RPCs per target EP CTX */
 	uint32_t		cg_credit_ep_ctx;
 
@@ -220,6 +234,7 @@ struct crt_event_cb_priv {
 	ENV(SWIM_PING_TIMEOUT)                                                                     \
 	ENV(SWIM_PROTOCOL_PERIOD_LEN)                                                              \
 	ENV(SWIM_SUSPECT_TIMEOUT)                                                                  \
+	ENV_STR(SWIM_TRAFFIC_CLASS)                                                                \
 	ENV_STR(UCX_IB_FORK_INIT)
 
 /* uint env */

From 70b12e3331fea2c44beeba6792b316f1157eec51 Mon Sep 17 00:00:00 2001
From: Nasf-Fan <fan.yong@intel.com>
Date: Fri, 25 Oct 2024 00:35:58 +0800
Subject: [PATCH 20/34] DAOS-16469 container: Lower log level for
 cont_aggregate_interval (#15283)

To reduce the side-effect caused by frequent log with -DER_INPROGRESS.

Signed-off-by: Fan Yong <fan.yong@intel.com>
---
 src/container/srv_target.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/container/srv_target.c b/src/container/srv_target.c
index 35ac3d4285c..caf2a4b2ee7 100644
--- a/src/container/srv_target.c
+++ b/src/container/srv_target.c
@@ -475,7 +475,7 @@ cont_aggregate_interval(struct ds_cont_child *cont, cont_aggregate_cb_t cb,
 		if (rc == -DER_SHUTDOWN) {
 			break;	/* pool destroyed */
 		} else if (rc < 0) {
-			DL_CDEBUG(rc == -DER_BUSY, DB_EPC, DLOG_ERR, rc,
+			DL_CDEBUG(rc == -DER_BUSY || rc == -DER_INPROGRESS, DB_EPC, DLOG_ERR, rc,
 				  DF_CONT ": %s aggregate failed",
 				  DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid),
 				  param->ap_vos_agg ? "VOS" : "EC");

From 2a1892f02c08135e9d7d90d23c589df30068b330 Mon Sep 17 00:00:00 2001
From: Jeff Olivier <jeffolivier@google.com>
Date: Thu, 24 Oct 2024 12:33:26 -0600
Subject: [PATCH 21/34] DAOS-16716 ci: Set reference build for PRs (#15379)

Release branch PRs should use the release branch build
instead of master branch build for Fault Injection reference

Signed-off-by: Jeff Olivier <jeffolivier@google.com>
---
 Jenkinsfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 13ccc512895..8b2c027b15f 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1038,7 +1038,7 @@ pipeline {
                     }
                     post {
                         always {
-                            discoverGitReferenceBuild referenceJob: 'daos-stack/daos/master',
+                            discoverGitReferenceBuild referenceJob: 'daos-stack/daos/release%252F2.6',
                                                       scm: 'daos-stack/daos',
                                                       requiredResult: hudson.model.Result.UNSTABLE
                             recordIssues enabledForFailure: true,

From 23f0787ab188704aeb0c0d000e2e181777831690 Mon Sep 17 00:00:00 2001
From: Alexander Oganezov <alexander.a.oganezov@intel.com>
Date: Fri, 25 Oct 2024 08:51:07 -0700
Subject: [PATCH 22/34] DAOS-15914: crt_reply_send_input_free() (#14817)

- New crt_reply_send_input_free() API added which releases input buffer right
  after HG_Respond() instead of waiting until the handle is destroyed.
- srv_obj.c calls changed to use new crt_reply_send_input_free()
- I/O context takes refcount on RPC
- only release input buffer for target update

Signed-off-by: Alexander A Oganezov <alexander.a.oganezov@intel.com>
Signed-off-by: Liang Zhen <liang.zhen@intel.com>
Co-authored-by: Liang Zhen <liang.zhen@intel.com>
---
 src/cart/crt_hg.c      | 10 +++++++++
 src/cart/crt_rpc.c     | 20 ++++++++++++++++++
 src/cart/crt_rpc.h     | 47 +++++++++++++++++++++---------------------
 src/dtx/dtx_srv.c      |  2 +-
 src/include/cart/api.h | 15 ++++++++++++++
 src/object/srv_obj.c   | 22 ++++++++++++++------
 6 files changed, 86 insertions(+), 30 deletions(-)

diff --git a/src/cart/crt_hg.c b/src/cart/crt_hg.c
index 244373c9228..26b54b52ec3 100644
--- a/src/cart/crt_hg.c
+++ b/src/cart/crt_hg.c
@@ -1482,6 +1482,16 @@ crt_hg_reply_send(struct crt_rpc_priv *rpc_priv)
 		rc = crt_hgret_2_der(hg_ret);
 	}
 
+	/* Release input buffer */
+	if (rpc_priv->crp_release_input_early && !rpc_priv->crp_forward) {
+		hg_ret = HG_Release_input_buf(rpc_priv->crp_hg_hdl);
+		if (hg_ret != HG_SUCCESS) {
+			RPC_ERROR(rpc_priv, "HG_Release_input_buf failed, hg_ret: " DF_HG_RC "\n",
+				  DP_HG_RC(hg_ret));
+			/* Fall through */
+		}
+	}
+
 	return rc;
 }
 
diff --git a/src/cart/crt_rpc.c b/src/cart/crt_rpc.c
index 72d8214aa09..ead03d1bf29 100644
--- a/src/cart/crt_rpc.c
+++ b/src/cart/crt_rpc.c
@@ -1550,6 +1550,26 @@ crt_req_send(crt_rpc_t *req, crt_cb_t complete_cb, void *arg)
 	return rc;
 }
 
+int
+crt_reply_send_input_free(crt_rpc_t *req)
+{
+	struct crt_rpc_priv *rpc_priv = NULL;
+	int                  rc       = 0;
+
+	if (req == NULL) {
+		D_ERROR("invalid parameter (NULL req).\n");
+		D_GOTO(out, rc = -DER_INVAL);
+	}
+
+	rpc_priv                          = container_of(req, struct crt_rpc_priv, crp_pub);
+	rpc_priv->crp_release_input_early = 1;
+
+	return crt_reply_send(req);
+
+out:
+	return rc;
+}
+
 int
 crt_reply_send(crt_rpc_t *req)
 {
diff --git a/src/cart/crt_rpc.h b/src/cart/crt_rpc.h
index a5e6afee780..3a590933103 100644
--- a/src/cart/crt_rpc.h
+++ b/src/cart/crt_rpc.h
@@ -166,29 +166,30 @@ struct crt_rpc_priv {
 	 * match with crp_req_hdr.cch_flags.
 	 */
 	uint32_t		crp_flags;
-	uint32_t		crp_srv:1, /* flag of server received request */
-				crp_output_got:1,
-				crp_input_got:1,
-				/* flag of collective RPC request */
-				crp_coll:1,
-				/* flag of crp_tgt_uri need to be freed */
-				crp_uri_free:1,
-				/* flag of forwarded rpc for corpc */
-				crp_forward:1,
-				/* flag of in timeout binheap */
-				crp_in_binheap:1,
-				/* set if a call to crt_req_reply pending */
-				crp_reply_pending:1,
-				/* set to 1 if target ep is set */
-				crp_have_ep:1,
-				/* RPC is tracked by the context */
-				crp_ctx_tracked:1,
-				/* 1 if RPC fails HLC epsilon check */
-				crp_fail_hlc:1,
-				/* RPC completed flag */
-				crp_completed:1,
-				/* RPC originated from a primary provider */
-				crp_src_is_primary:1;
+	uint32_t                 crp_srv : 1, /* flag of server received request */
+	    crp_output_got : 1, crp_input_got : 1,
+	    /* flag of collective RPC request */
+	    crp_coll                : 1,
+	    /* flag of crp_tgt_uri need to be freed */
+	    crp_uri_free            : 1,
+	    /* flag of forwarded rpc for corpc */
+	    crp_forward             : 1,
+	    /* flag of in timeout binheap */
+	    crp_in_binheap          : 1,
+	    /* set if a call to crt_req_reply pending */
+	    crp_reply_pending       : 1,
+	    /* set to 1 if target ep is set */
+	    crp_have_ep             : 1,
+	    /* RPC is tracked by the context */
+	    crp_ctx_tracked         : 1,
+	    /* 1 if RPC fails HLC epsilon check */
+	    crp_fail_hlc            : 1,
+	    /* RPC completed flag */
+	    crp_completed           : 1,
+	    /* RPC originated from a primary provider */
+	    crp_src_is_primary      : 1,
+	    /* release input buffer early */
+	    crp_release_input_early : 1;
 
 	struct crt_opc_info	*crp_opc_info;
 	/* corpc info, only valid when (crp_coll == 1) */
diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c
index 9e78100c451..0fd4e7c513f 100644
--- a/src/dtx/dtx_srv.c
+++ b/src/dtx/dtx_srv.c
@@ -337,7 +337,7 @@ dtx_handler(crt_rpc_t *rpc)
 	dout->do_status = rc;
 	/* For DTX_COMMIT, it is the count of real committed DTX entries. */
 	dout->do_misc = committed;
-	rc = crt_reply_send(rpc);
+	rc = crt_reply_send_input_free(rpc);
 	if (rc != 0)
 		D_ERROR("send reply failed for DTX rpc %u: rc = "DF_RC"\n", opc,
 			DP_RC(rc));
diff --git a/src/include/cart/api.h b/src/include/cart/api.h
index 21941d7cb98..16fc2091d4b 100644
--- a/src/include/cart/api.h
+++ b/src/include/cart/api.h
@@ -474,6 +474,21 @@ crt_req_send(crt_rpc_t *req, crt_cb_t complete_cb, void *arg);
 int
 crt_reply_send(crt_rpc_t *req);
 
+/**
+ * Send an RPC reply and free the input buffer immediately.
+ * Only to be called on the server side.
+ *
+ * \param[in] req              pointer to RPC request
+ *
+ * \return                     DER_SUCCESS on success, negative value if error
+ *
+ * \note the crt_rpc_t is exported to user, caller should fill the
+ *        crt_rpc_t::cr_output before sending the RPC reply.
+ *        See \ref crt_req_create.
+ */
+int
+crt_reply_send_input_free(crt_rpc_t *req);
+
 /**
  * Return request buffer
  *
diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c
index a51682b4785..f220149a532 100644
--- a/src/object/srv_obj.c
+++ b/src/object/srv_obj.c
@@ -162,7 +162,7 @@ obj_rw_complete(crt_rpc_t *rpc, struct obj_io_context *ioc,
 }
 
 static void
-obj_rw_reply(crt_rpc_t *rpc, int status, uint64_t epoch,
+obj_rw_reply(crt_rpc_t *rpc, int status, uint64_t epoch, bool release_input,
 	     struct obj_io_context *ioc)
 {
 	struct obj_rw_out	*orwo = crt_reply_get(rpc);
@@ -187,7 +187,11 @@ obj_rw_reply(crt_rpc_t *rpc, int status, uint64_t epoch,
 		ioc->ioc_map_ver, orwo->orw_epoch, status);
 
 	if (!ioc->ioc_lost_reply) {
-		rc = crt_reply_send(rpc);
+		if (release_input)
+			rc = crt_reply_send_input_free(rpc);
+		else
+			rc = crt_reply_send(rpc);
+
 		if (rc != 0)
 			D_ERROR("send reply failed: "DF_RC"\n", DP_RC(rc));
 	} else {
@@ -2079,6 +2083,8 @@ obj_ioc_init(uuid_t pool_uuid, uuid_t coh_uuid, uuid_t cont_uuid, crt_rpc_t *rpc
 
 	D_ASSERT(ioc != NULL);
 	memset(ioc, 0, sizeof(*ioc));
+
+	crt_req_addref(rpc);
 	ioc->ioc_rpc = rpc;
 	ioc->ioc_opc = opc_get(rpc->cr_opc);
 	rc = ds_cont_find_hdl(pool_uuid, coh_uuid, &coh);
@@ -2154,6 +2160,10 @@ obj_ioc_fini(struct obj_io_context *ioc, int err)
 		ds_cont_child_put(ioc->ioc_coc);
 		ioc->ioc_coc = NULL;
 	}
+	if (ioc->ioc_rpc) {
+		crt_req_decref(ioc->ioc_rpc);
+		ioc->ioc_rpc = NULL;
+	}
 }
 
 /* Setup lite IO context, it is only for compound RPC so far:
@@ -2508,7 +2518,7 @@ ds_obj_ec_rep_handler(crt_rpc_t *rpc)
 	rc = vos_obj_array_remove(ioc.ioc_coc->sc_hdl, oer->er_oid, &oer->er_epoch_range, dkey,
 				  &iod->iod_name, &recx);
 out:
-	obj_rw_reply(rpc, rc, 0, &ioc);
+	obj_rw_reply(rpc, rc, 0, false, &ioc);
 	obj_ioc_end(&ioc, rc);
 }
 
@@ -2604,7 +2614,7 @@ ds_obj_ec_agg_handler(crt_rpc_t *rpc)
 		D_ERROR(DF_UOID ": array_remove failed: " DF_RC "\n", DP_UOID(oea->ea_oid),
 			DP_RC(rc1));
 out:
-	obj_rw_reply(rpc, rc, 0, &ioc);
+	obj_rw_reply(rpc, rc, 0, false, &ioc);
 	obj_ioc_end(&ioc, rc);
 }
 
@@ -2733,7 +2743,7 @@ ds_obj_tgt_update_handler(crt_rpc_t *rpc)
 out:
 	if (dth != NULL)
 		rc = dtx_end(dth, ioc.ioc_coc, rc);
-	obj_rw_reply(rpc, rc, 0, &ioc);
+	obj_rw_reply(rpc, rc, 0, true, &ioc);
 	D_FREE(mbs);
 	obj_ioc_end(&ioc, rc);
 }
@@ -3073,7 +3083,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc)
 	if (ioc.ioc_map_ver < max_ver)
 		ioc.ioc_map_ver = max_ver;
 
-	obj_rw_reply(rpc, rc, epoch.oe_value, &ioc);
+	obj_rw_reply(rpc, rc, epoch.oe_value, false, &ioc);
 	D_FREE(mbs);
 	D_FREE(dti_cos);
 	obj_ioc_end(&ioc, rc);

From 67da3b9b924c16b7b240fcac94e6cdd3044d6415 Mon Sep 17 00:00:00 2001
From: Nasf-Fan <fan.yong@intel.com>
Date: Fri, 25 Oct 2024 23:52:33 +0800
Subject: [PATCH 23/34] DAOS-16721 object: fix coll RPC for obj with sparse
 layout - b26 (#15376)

The old implementation did not correctly calculate some collective
object RPC size, and may cause trouble when need bulk data transfer
for large collective object RPC. It also potentially affects how to
dispatch collective RPCs from leader to other engines.

The patch also addes more sanity check for coll-punch RPC to detect
potential DRAM corruption.

Signed-off-by: Fan Yong <fan.yong@intel.com>
---
 src/dtx/dtx_coll.c        |   6 ++-
 src/dtx/dtx_rpc.c         |   5 +-
 src/object/cli_coll.c     | 111 +++++++++++++++++++++++++-------------
 src/object/cli_shard.c    |   7 +--
 src/object/obj_internal.h |   8 ++-
 src/object/obj_utils.c    |   2 +-
 src/object/srv_coll.c     |  12 +++--
 src/object/srv_obj.c      |   8 +--
 src/vos/vos_dtx.c         |   3 --
 9 files changed, 107 insertions(+), 55 deletions(-)

diff --git a/src/dtx/dtx_coll.c b/src/dtx/dtx_coll.c
index 863307e9a7f..7e7c8199155 100644
--- a/src/dtx/dtx_coll.c
+++ b/src/dtx/dtx_coll.c
@@ -80,8 +80,12 @@ dtx_coll_prep_ult(void *arg)
 				DP_UUID(cont->sc_uuid), DP_RC(rc));
 	}
 
-	if (dcpa->dcpa_result != 0)
+	if (dcpa->dcpa_result != 0) {
+		if (dcpa->dcpa_result != -DER_INPROGRESS && dcpa->dcpa_result != -DER_NONEXIST)
+			D_ERROR("Failed to load mbs for "DF_DTI", opc %u: "DF_RC"\n",
+				DP_DTI(&dci->dci_xid), opc, DP_RC(rc));
 		goto out;
+	}
 
 	dcpa->dcpa_result = dtx_coll_prep(dci->dci_po_uuid, dcpa->dcpa_oid, &dci->dci_xid, mbs, -1,
 					  dci->dci_version, cont->sc_pool->spc_map_version,
diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c
index 5423f6c0cc8..2ccbfec2734 100644
--- a/src/dtx/dtx_rpc.c
+++ b/src/dtx/dtx_rpc.c
@@ -974,8 +974,11 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che
 		if (dsp->dsp_mbs == NULL) {
 			rc = vos_dtx_load_mbs(cont->sc_hdl, &dsp->dsp_xid, NULL, &dsp->dsp_mbs);
 			if (rc != 0) {
-				if (rc < 0 && rc != -DER_NONEXIST && for_io)
+				if (rc < 0 && rc != -DER_NONEXIST && for_io) {
+					D_ERROR("Failed to load mbs for "DF_DTI": "DF_RC"\n",
+						DP_DTI(&dsp->dsp_xid), DP_RC(rc));
 					goto out;
+				}
 
 				drop = true;
 				goto next;
diff --git a/src/object/cli_coll.c b/src/object/cli_coll.c
index e05abadf3cf..3285bec58b3 100644
--- a/src/object/cli_coll.c
+++ b/src/object/cli_coll.c
@@ -178,6 +178,10 @@ obj_coll_oper_args_init(struct coll_oper_args *coa, struct dc_object *obj, bool
 		if (coa->coa_sparse == 0)
 			coa->coa_dct_cap = obj_ranks;
 	}
+
+	/* Save obj->cob_min_rank for verification during subsequent obj_coll_prep_one(). */
+	coa->coa_min_rank = obj->cob_min_rank;
+
 	D_RWLOCK_UNLOCK(&obj->cob_lock);
 
 	if (coa->coa_sparse) {
@@ -208,7 +212,6 @@ obj_coll_oper_args_init(struct coll_oper_args *coa, struct dc_object *obj, bool
 		coa->coa_dct_nr = -1;
 	}
 
-	coa->coa_max_dct_sz = 0;
 	coa->coa_max_shard_nr = 0;
 	coa->coa_max_bitmap_sz = 0;
 	coa->coa_target_nr = 0;
@@ -236,17 +239,55 @@ obj_coll_oper_args_fini(struct coll_oper_args *coa)
 	coa->coa_dct_nr = 0;
 }
 
+static void
+obj_coll_collapse_one(struct coll_oper_args *coa, struct daos_coll_target *dct,
+		      uint32_t *size, bool copy)
+{
+	struct daos_coll_shard	*dcs;
+	uint32_t		 dct_size;
+	int			 i;
+
+	/* The size may be over estimated, no matter. */
+	dct_size = sizeof(*dct) + dct->dct_bitmap_sz +
+		   sizeof(dct->dct_shards[0]) * (dct->dct_max_shard + 1);
+
+	for (i = 0; i <= dct->dct_max_shard; i++) {
+		dcs = &dct->dct_shards[i];
+		if (dcs->dcs_nr > 1)
+			dct_size += sizeof(dcs->dcs_buf[0]) * dcs->dcs_nr;
+	}
+
+	if (coa->coa_for_modify)
+		dct_size += sizeof(dct->dct_tgt_ids[0]) * dct->dct_tgt_nr;
+
+	if (coa->coa_max_dct_sz < dct_size)
+		coa->coa_max_dct_sz = dct_size;
+
+	if (copy)
+		memcpy(&coa->coa_dcts[coa->coa_dct_nr], dct, sizeof(*dct));
+
+	coa->coa_dct_nr++;
+	*size += dct_size;
+}
+
+struct obj_coll_tree_args {
+	struct coll_oper_args	*coa;
+	uint32_t		*size;
+};
+
 static int
 obj_coll_tree_cb(daos_handle_t ih, d_iov_t *key, d_iov_t *val, void *arg)
 {
-	struct coll_oper_args	*coa = arg;
-	struct daos_coll_target	*dct = val->iov_buf;
+	struct obj_coll_tree_args	*octa = arg;
+	struct coll_oper_args		*coa = octa->coa;
+	struct daos_coll_target		*dct = val->iov_buf;
 
 	D_ASSERTF(coa->coa_dct_nr < coa->coa_dct_cap,
 		  "Too short pre-allcoated dct_array: %u vs %u\n",
 		  coa->coa_dct_nr, coa->coa_dct_cap);
+	D_ASSERT(dct->dct_bitmap != NULL);
 
-	memcpy(&coa->coa_dcts[coa->coa_dct_nr++], dct, sizeof(*dct));
+	obj_coll_collapse_one(coa, dct, octa->size, true);
 
 	/* The following members have been migrated into coa->coa_dcts. */
 	dct->dct_bitmap = NULL;
@@ -259,6 +300,7 @@ obj_coll_tree_cb(daos_handle_t ih, d_iov_t *key, d_iov_t *val, void *arg)
 static int
 obj_coll_collapse_tree(struct coll_oper_args *coa, uint32_t *size)
 {
+	struct obj_coll_tree_args	 octa;
 	struct coll_sparse_targets	*tree = coa->coa_tree;
 	int				 rc = 0;
 
@@ -270,7 +312,14 @@ obj_coll_collapse_tree(struct coll_oper_args *coa, uint32_t *size)
 		D_GOTO(out, rc = -DER_NOMEM);
 
 	coa->coa_sparse = 0;
-	rc = dbtree_iterate(tree->cst_tree_hdl, DAOS_INTENT_DEFAULT, false, obj_coll_tree_cb, coa);
+	coa->coa_raw_sparse = 1;
+	coa->coa_dct_nr = 0;
+	coa->coa_max_dct_sz = 0;
+
+	octa.coa = coa;
+	octa.size = size;
+	rc = dbtree_iterate(tree->cst_tree_hdl, DAOS_INTENT_DEFAULT, false,
+			    obj_coll_tree_cb, &octa);
 	if (rc == 0)
 		D_ASSERTF(coa->coa_dct_nr == coa->coa_dct_cap,
 			  "Something is wrong when prepare coll target array: %u vs %u\n",
@@ -287,36 +336,13 @@ static int
 obj_coll_collapse_array(struct coll_oper_args *coa, uint32_t *size)
 {
 	struct daos_coll_target	*dct;
-	struct daos_coll_shard	*dcs;
-	uint32_t		 dct_size;
 	int			 i;
-	int			 j;
 
-	for (i = 0, *size = 0, coa->coa_dct_nr = 0; i < coa->coa_dct_cap; i++) {
+	for (i = 0, *size = 0, coa->coa_dct_nr = 0, coa->coa_max_dct_sz = 0;
+	     i < coa->coa_dct_cap; i++) {
 		dct = &coa->coa_dcts[i];
-		if (dct->dct_bitmap != NULL) {
-			/* The size may be over estimated, no matter. */
-			dct_size = sizeof(*dct) + dct->dct_bitmap_sz +
-				   sizeof(dct->dct_shards[0]) * (dct->dct_max_shard + 1);
-
-			for (j = 0; j <= dct->dct_max_shard; j++) {
-				dcs = &dct->dct_shards[j];
-				if (dcs->dcs_nr > 1)
-					dct_size += sizeof(dcs->dcs_buf[0]) * dcs->dcs_nr;
-			}
-
-			if (coa->coa_for_modify)
-				dct_size += sizeof(dct->dct_tgt_ids[0]) * dct->dct_tgt_nr;
-
-			if (coa->coa_max_dct_sz < dct_size)
-				coa->coa_max_dct_sz = dct_size;
-
-			if (coa->coa_dct_nr < i)
-				memcpy(&coa->coa_dcts[coa->coa_dct_nr], dct, sizeof(*dct));
-
-			coa->coa_dct_nr++;
-			*size += dct_size;
-		}
+		if (dct->dct_bitmap != NULL)
+			obj_coll_collapse_one(coa, dct, size, coa->coa_dct_nr < i);
 	}
 
 	/* Reset the other dct slots to avoid double free during cleanup. */
@@ -373,8 +399,9 @@ obj_coll_prep_one(struct coll_oper_args *coa, struct dc_object *obj,
 
 	D_RWLOCK_RDLOCK(&obj->cob_lock);
 
-	D_ASSERTF(shard->do_target_rank <= obj->cob_max_rank,
-		  "Unexpected shard with rank %u > %u\n", shard->do_target_rank, obj->cob_max_rank);
+	D_ASSERTF(coa->coa_min_rank == obj->cob_min_rank,
+		  "Object "DF_OID" layout has been changed unexpectedly %u => %u, idx %u, ver %u\n",
+		  DP_OID(obj->cob_md.omd_id), coa->coa_min_rank, obj->cob_min_rank, idx, map_ver);
 	D_ASSERTF(shard->do_target_rank >= obj->cob_min_rank,
 		  "Unexpected shard with rank %u < %u\n", shard->do_target_rank, obj->cob_min_rank);
 
@@ -669,7 +696,6 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo
 	uint32_t			 tgt_size = 0;
 	uint32_t			 mbs_max_size;
 	uint32_t			 inline_size;
-	uint32_t			 flags = ORF_LEADER;
 	uint32_t			 leader = -1;
 	uint32_t			 len;
 	int				 rc;
@@ -746,6 +772,12 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo
 		memcpy(dct, &tmp_tgt, sizeof(tmp_tgt));
 	}
 
+	/* 'shard' is on the leader target that is must be the coa->coa_dcts[0]. */
+	D_ASSERTF(shard->do_target_rank == coa->coa_dcts[0].dct_rank,
+		  "Object "DF_OID" target array corrupted: rank %u vs %ur, nr %u\n",
+		  DP_OID(obj->cob_md.omd_id), shard->do_target_rank,
+		  coa->coa_dcts[0].dct_rank, coa->coa_dct_nr);
+
 	rc = dc_obj_coll_punch_mbs(coa, obj, shard->do_target_id, &mbs);
 	if (rc < 0)
 		goto out;
@@ -767,12 +799,14 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo
 	if (rc != 0)
 		goto out;
 
+	auxi->flags = ORF_LEADER;
 	if (auxi->io_retry) {
-		flags |= ORF_RESEND;
+		auxi->flags |= ORF_RESEND;
 		/* Reset @enqueue_id if resend to new leader. */
 		if (spa->pa_auxi.target != shard->do_target_id)
 			spa->pa_auxi.enqueue_id = 0;
 	} else {
+		auxi->flags &= ~ORF_RESEND;
 		spa->pa_auxi.obj_auxi = auxi;
 		daos_dti_gen(&spa->pa_dti, false);
 	}
@@ -781,14 +815,15 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo
 	spa->pa_auxi.shard = shard->do_shard_idx;
 
 	if (obj_is_ec(obj))
-		flags |= ORF_EC;
+		auxi->flags |= ORF_EC;
 
 	mbs_max_size = sizeof(*mbs) + mbs->dm_data_size +
 		       sizeof(coa->coa_targets[0]) * coa->coa_max_shard_nr + coa->coa_max_bitmap_sz;
 
 	return dc_obj_shard_coll_punch(shard, spa, mbs, mbs_max_size, cpca.cpca_bulks, tgt_size,
 				       coa->coa_dcts, coa->coa_dct_nr, coa->coa_max_dct_sz, epoch,
-				       args->flags, flags, map_ver, &auxi->map_ver_reply, task);
+				       args->flags, auxi->flags, map_ver,
+				       &auxi->map_ver_reply, task);
 
 out:
 	if (rc > 0)
diff --git a/src/object/cli_shard.c b/src/object/cli_shard.c
index 36c5c5f1e0c..0c9dfc1418e 100644
--- a/src/object/cli_shard.c
+++ b/src/object/cli_shard.c
@@ -1453,9 +1453,10 @@ obj_shard_coll_punch_cb(tse_task_t *task, void *data)
 
 	DL_CDEBUG(task->dt_result < 0, DLOG_ERR, DB_IO, task->dt_result,
 		  "DAOS_OBJ_RPC_COLL_PUNCH RPC %p for "DF_UOID" with DTX "
-		  DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x", rpc, DP_UOID(ocpi->ocpi_oid),
-		  DP_DTI(&ocpi->ocpi_xid), task, ocpi->ocpi_map_ver, *cb_args->cpca_ver,
-		  (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags);
+		  DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x, %s layout",
+		  rpc, DP_UOID(ocpi->ocpi_oid), DP_DTI(&ocpi->ocpi_xid), task, ocpi->ocpi_map_ver,
+		  *cb_args->cpca_ver, (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags,
+		  cb_args->cpca_shard_args->pa_coa.coa_raw_sparse ? "sparse" : "continuous");
 
 	crt_req_decref(rpc);
 
diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h
index ec6aa3b817e..cfe3385e959 100644
--- a/src/object/obj_internal.h
+++ b/src/object/obj_internal.h
@@ -292,10 +292,16 @@ struct coll_oper_args {
 	struct shard_auxi_args	 coa_auxi;
 	int			 coa_dct_nr;
 	uint32_t		 coa_dct_cap;
-	uint32_t		 coa_max_dct_sz;
+	union {
+		/* Save obj->cob_min_rank for verification during obj_coll_prep_one(). */
+		uint32_t	 coa_min_rank;
+		/* Only can be used since obj_coll_oper_args_collapse() after object layout scan. */
+		uint32_t	 coa_max_dct_sz;
+	};
 	uint8_t			 coa_max_shard_nr;
 	uint8_t			 coa_max_bitmap_sz;
 	uint8_t			 coa_for_modify:1,
+				 coa_raw_sparse:1,
 				 coa_sparse:1;
 	uint8_t			 coa_target_nr;
 	/*
diff --git a/src/object/obj_utils.c b/src/object/obj_utils.c
index 7bf0ef4aaf9..82d91c966ac 100644
--- a/src/object/obj_utils.c
+++ b/src/object/obj_utils.c
@@ -633,7 +633,7 @@ obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *t
 		 * use the "cur_pos" as the relay engine.
 		 */
 		pos = rand % (ocdc->tgt_nr - ocdc->cur_pos) + ocdc->cur_pos;
-		if (pos != ocdc->cur_pos && tgts[pos].dct_rank > dct->dct_rank) {
+		if (pos > ocdc->cur_pos && tgts[pos].dct_rank > dct->dct_rank) {
 			memcpy(&tmp, &tgts[pos], sizeof(tmp));
 			memcpy(&tgts[pos], dct, sizeof(tmp));
 			memcpy(dct, &tmp, sizeof(tmp));
diff --git a/src/object/srv_coll.c b/src/object/srv_coll.c
index 2a152b47bd6..5b59f954f86 100644
--- a/src/object/srv_coll.c
+++ b/src/object/srv_coll.c
@@ -239,9 +239,15 @@ obj_coll_punch_prep(struct obj_coll_punch_in *ocpi, struct daos_coll_target *dct
 	int			 i;
 	int			 j;
 
-	/* dcts[0] is for current engine. */
-	if (dcts[0].dct_bitmap == NULL || dcts[0].dct_bitmap_sz == 0 ||
-	    dcts[0].dct_shards == NULL) {
+	/* dcts[0] must be for current engine. */
+	if (unlikely(dcts[0].dct_rank != dss_self_rank())) {
+		D_ERROR("Invalid targets array: rank %u vs %u, nr %u, flags %x\n",
+			dcts[0].dct_rank, dss_self_rank(), dct_nr, ocpi->ocpi_flags);
+		D_GOTO(out, rc = -DER_INVAL);
+	}
+
+	if (unlikely(dcts[0].dct_bitmap == NULL || dcts[0].dct_bitmap_sz == 0 ||
+		     dcts[0].dct_shards == NULL)) {
 		D_ERROR("Invalid input for current engine: bitmap %s, bitmap_sz %u, shards %s\n",
 			dcts[0].dct_bitmap == NULL ? "empty" : "non-empty", dcts[0].dct_bitmap_sz,
 			dcts[0].dct_shards == NULL ? "empty" : "non-empty");
diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c
index f220149a532..041ea903c4f 100644
--- a/src/object/srv_obj.c
+++ b/src/object/srv_obj.c
@@ -5615,12 +5615,12 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc)
 
 	D_DEBUG(DB_IO, "(%s) handling collective punch RPC %p for obj "
 		DF_UOID" on XS %u/%u epc "DF_X64" pmv %u, with dti "
-		DF_DTI", forward width %u, forward depth %u\n",
+		DF_DTI", forward width %u, forward depth %u, flags %x\n",
 		(ocpi->ocpi_flags & ORF_LEADER) ? "leader" :
 		(ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"),
 		rpc, DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id,
 		ocpi->ocpi_epoch, ocpi->ocpi_map_ver, DP_DTI(&ocpi->ocpi_xid),
-		ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth);
+		ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth, ocpi->ocpi_flags);
 
 	D_ASSERT(dmi->dmi_xs_id != 0);
 
@@ -5757,13 +5757,13 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc)
 	DL_CDEBUG(rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART, DLOG_ERR, DB_IO, rc,
 		  "(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u epc "
 		  DF_X64" pmv %u/%u, with dti "DF_DTI", bulk_tgt_sz %u, bulk_tgt_nr %u, "
-		  "tgt_nr %u, forward width %u, forward depth %u",
+		  "tgt_nr %u, forward width %u, forward depth %u, flags %x",
 		  (ocpi->ocpi_flags & ORF_LEADER) ? "leader" :
 		  (ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"), rpc,
 		  DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, ocpi->ocpi_epoch,
 		  ocpi->ocpi_map_ver, max_ver, DP_DTI(&ocpi->ocpi_xid), ocpi->ocpi_bulk_tgt_sz,
 		  ocpi->ocpi_bulk_tgt_nr, (unsigned int)ocpi->ocpi_tgts.ca_count,
-		  ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth);
+		  ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth, ocpi->ocpi_flags);
 
 	obj_punch_complete(rpc, rc, max_ver);
 
diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c
index f1d69b54f4e..0f87bfd4d0e 100644
--- a/src/vos/vos_dtx.c
+++ b/src/vos/vos_dtx.c
@@ -1973,9 +1973,6 @@ vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, daos_unit_oid_t *oid,
 			rc = -DER_INPROGRESS;
 	}
 
-	if (rc < 0)
-		D_ERROR("Failed to load mbs for "DF_DTI": "DF_RC"\n", DP_DTI(dti), DP_RC(rc));
-
 	return rc;
 }
 

From c4cf4f7b58f9315c2a294c899d807edb6b8917b0 Mon Sep 17 00:00:00 2001
From: Tom Nabarro <tom.nabarro@intel.com>
Date: Mon, 28 Oct 2024 13:49:15 +0000
Subject: [PATCH 24/34] DAOS-16687 control: Handle missing PCIe caps in storage
 query usage (#15296) (#15392)

Missing PCIe capabilities when querying a NVMe SSD's configuration
space is unusual but should be handled gracefully by the control-plane
and shouldn't cause a failure to return usage statistics when calling
dmg storage query usage.

Update so that pciutils lib is only called when attempting to display
health stats via dmg and not when fetching usage info. Improve clarity
of workflow to ease maintenance and add test coverage for updates.
Enable continued functionality when NVMe device doesn't return any
extended capabilities in PCIe configuration space data by adding
sentinel error to library for such a case.

Signed-off-by: Tom Nabarro <tom.nabarro@intel.com>
---
 .../common/proto/ctl/storage_nvme.pb.go       |  46 +++--
 src/control/lib/control/storage.go            |   4 +-
 src/control/lib/hardware/pciutils/bindings.go |   3 +-
 src/control/server/ctl_smd_rpc_test.go        |  49 +++++
 src/control/server/instance_storage_rpc.go    | 134 ++++++++++----
 .../server/instance_storage_rpc_test.go       | 172 ++++++++++++------
 src/proto/ctl/storage_nvme.proto              |   3 +-
 7 files changed, 308 insertions(+), 103 deletions(-)

diff --git a/src/control/common/proto/ctl/storage_nvme.pb.go b/src/control/common/proto/ctl/storage_nvme.pb.go
index 62fede43ed4..cb2dc5099d4 100644
--- a/src/control/common/proto/ctl/storage_nvme.pb.go
+++ b/src/control/common/proto/ctl/storage_nvme.pb.go
@@ -95,11 +95,12 @@ type ScanNvmeReq struct {
 	sizeCache     protoimpl.SizeCache
 	unknownFields protoimpl.UnknownFields
 
-	Health   bool   `protobuf:"varint,1,opt,name=Health,proto3" json:"Health,omitempty"`     // Retrieve NVMe device health statistics
-	Meta     bool   `protobuf:"varint,2,opt,name=Meta,proto3" json:"Meta,omitempty"`         // Retrieve metadata relating to NVMe device
-	Basic    bool   `protobuf:"varint,3,opt,name=Basic,proto3" json:"Basic,omitempty"`       // Strip NVMe device details to only basic
-	MetaSize uint64 `protobuf:"varint,4,opt,name=MetaSize,proto3" json:"MetaSize,omitempty"` // Size of the metadata blob
-	RdbSize  uint64 `protobuf:"varint,5,opt,name=RdbSize,proto3" json:"RdbSize,omitempty"`   // Size of the RDB blob
+	Health    bool   `protobuf:"varint,1,opt,name=Health,proto3" json:"Health,omitempty"`       // Retrieve NVMe device health statistics
+	Meta      bool   `protobuf:"varint,2,opt,name=Meta,proto3" json:"Meta,omitempty"`           // Retrieve metadata relating to NVMe device
+	Basic     bool   `protobuf:"varint,3,opt,name=Basic,proto3" json:"Basic,omitempty"`         // Strip NVMe device details to only basic
+	MetaSize  uint64 `protobuf:"varint,4,opt,name=MetaSize,proto3" json:"MetaSize,omitempty"`   // Size of the metadata blob
+	RdbSize   uint64 `protobuf:"varint,5,opt,name=RdbSize,proto3" json:"RdbSize,omitempty"`     // Size of the RDB blob
+	LinkStats bool   `protobuf:"varint,6,opt,name=LinkStats,proto3" json:"LinkStats,omitempty"` // Populate PCIe link info in health statistics
 }
 
 func (x *ScanNvmeReq) Reset() {
@@ -169,6 +170,13 @@ func (x *ScanNvmeReq) GetRdbSize() uint64 {
 	return 0
 }
 
+func (x *ScanNvmeReq) GetLinkStats() bool {
+	if x != nil {
+		return x.LinkStats
+	}
+	return false
+}
+
 type ScanNvmeResp struct {
 	state         protoimpl.MessageState
 	sizeCache     protoimpl.SizeCache
@@ -276,7 +284,7 @@ var file_ctl_storage_nvme_proto_rawDesc = []byte{
 	0x32, 0x12, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53,
 	0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x1b, 0x0a, 0x09, 0x72,
 	0x6f, 0x6c, 0x65, 0x5f, 0x62, 0x69, 0x74, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x08,
-	0x72, 0x6f, 0x6c, 0x65, 0x42, 0x69, 0x74, 0x73, 0x22, 0x85, 0x01, 0x0a, 0x0b, 0x53, 0x63, 0x61,
+	0x72, 0x6f, 0x6c, 0x65, 0x42, 0x69, 0x74, 0x73, 0x22, 0xa3, 0x01, 0x0a, 0x0b, 0x53, 0x63, 0x61,
 	0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x12, 0x16, 0x0a, 0x06, 0x48, 0x65, 0x61, 0x6c,
 	0x74, 0x68, 0x18, 0x01, 0x20, 0x01, 0x28, 0x08, 0x52, 0x06, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68,
 	0x12, 0x12, 0x0a, 0x04, 0x4d, 0x65, 0x74, 0x61, 0x18, 0x02, 0x20, 0x01, 0x28, 0x08, 0x52, 0x04,
@@ -285,18 +293,20 @@ var file_ctl_storage_nvme_proto_rawDesc = []byte{
 	0x74, 0x61, 0x53, 0x69, 0x7a, 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x08, 0x4d, 0x65,
 	0x74, 0x61, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x52, 0x64, 0x62, 0x53, 0x69, 0x7a,
 	0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x52, 0x64, 0x62, 0x53, 0x69, 0x7a, 0x65,
-	0x22, 0x65, 0x0a, 0x0c, 0x53, 0x63, 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x73, 0x70,
-	0x12, 0x2b, 0x0a, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b,
-	0x32, 0x13, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x4e, 0x76, 0x6d, 0x65, 0x43, 0x6f, 0x6e, 0x74, 0x72,
-	0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x52, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x12, 0x28, 0x0a,
-	0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x63,
-	0x74, 0x6c, 0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65,
-	0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x22, 0x0f, 0x0a, 0x0d, 0x46, 0x6f, 0x72, 0x6d, 0x61,
-	0x74, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x42, 0x39, 0x5a, 0x37, 0x67, 0x69, 0x74, 0x68,
-	0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63,
-	0x6b, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72,
-	0x6f, 0x6c, 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f,
-	0x63, 0x74, 0x6c, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33,
+	0x12, 0x1c, 0x0a, 0x09, 0x4c, 0x69, 0x6e, 0x6b, 0x53, 0x74, 0x61, 0x74, 0x73, 0x18, 0x06, 0x20,
+	0x01, 0x28, 0x08, 0x52, 0x09, 0x4c, 0x69, 0x6e, 0x6b, 0x53, 0x74, 0x61, 0x74, 0x73, 0x22, 0x65,
+	0x0a, 0x0c, 0x53, 0x63, 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x2b,
+	0x0a, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x13,
+	0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x4e, 0x76, 0x6d, 0x65, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c,
+	0x6c, 0x65, 0x72, 0x52, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x12, 0x28, 0x0a, 0x05, 0x73,
+	0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x63, 0x74, 0x6c,
+	0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05,
+	0x73, 0x74, 0x61, 0x74, 0x65, 0x22, 0x0f, 0x0a, 0x0d, 0x46, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4e,
+	0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x42, 0x39, 0x5a, 0x37, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62,
+	0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x2f,
+	0x64, 0x61, 0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c,
+	0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x63, 0x74,
+	0x6c, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33,
 }
 
 var (
diff --git a/src/control/lib/control/storage.go b/src/control/lib/control/storage.go
index 65942a6c12d..9d5fe470de6 100644
--- a/src/control/lib/control/storage.go
+++ b/src/control/lib/control/storage.go
@@ -1,5 +1,5 @@
 //
-// (C) Copyright 2020-2023 Intel Corporation.
+// (C) Copyright 2020-2024 Intel Corporation.
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -258,6 +258,8 @@ func StorageScan(ctx context.Context, rpcClient UnaryInvoker, req *StorageScanRe
 				// Health and meta details required to populate usage statistics.
 				Health: req.NvmeHealth || req.Usage,
 				Meta:   req.Usage,
+				// Only request link stats if health explicitly requested.
+				LinkStats: req.NvmeHealth,
 			},
 		})
 	})
diff --git a/src/control/lib/hardware/pciutils/bindings.go b/src/control/lib/hardware/pciutils/bindings.go
index 86686896c80..f4d43c50fa3 100644
--- a/src/control/lib/hardware/pciutils/bindings.go
+++ b/src/control/lib/hardware/pciutils/bindings.go
@@ -40,6 +40,7 @@ var (
 	ErrMultiDevices     = errors.New("want single device config got multiple")
 	ErrCfgNotTerminated = errors.New("device config content not new-line terminated")
 	ErrCfgMissing       = errors.New("incomplete device config")
+	ErrNoPCIeCaps       = errors.New("no pci-express capabilities found")
 )
 
 // api provides the PCIeLinkStatsProvider interface by exposing a concrete implementation of
@@ -150,7 +151,7 @@ func (ap *api) PCIeCapsFromConfig(cfgBytes []byte, dev *hardware.PCIDevice) erro
 	var cp *C.struct_pci_cap = C.pci_find_cap(pciDev, C.PCI_CAP_ID_EXP, C.PCI_CAP_NORMAL)
 
 	if cp == nil {
-		return errors.New("no pci-express capabilities found")
+		return ErrNoPCIeCaps
 	}
 
 	cpAddr := uint32(cp.addr)
diff --git a/src/control/server/ctl_smd_rpc_test.go b/src/control/server/ctl_smd_rpc_test.go
index 6378e81d23c..06f1276fa25 100644
--- a/src/control/server/ctl_smd_rpc_test.go
+++ b/src/control/server/ctl_smd_rpc_test.go
@@ -17,6 +17,7 @@ import (
 	"github.com/daos-stack/daos/src/control/common/test"
 	"github.com/daos-stack/daos/src/control/drpc"
 	"github.com/daos-stack/daos/src/control/lib/daos"
+	"github.com/daos-stack/daos/src/control/lib/hardware/pciutils"
 	"github.com/daos-stack/daos/src/control/lib/ranklist"
 	"github.com/daos-stack/daos/src/control/logging"
 	"github.com/daos-stack/daos/src/control/server/config"
@@ -88,6 +89,7 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) {
 		drpcResps      map[int][]*mockDrpcResponse
 		harnessStopped bool
 		ioStopped      bool
+		pciDevErr      error
 		expResp        *ctlpb.SmdQueryResp
 		expErr         error
 	}{
@@ -658,6 +660,46 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) {
 			},
 			expErr: daos.FreeMemError,
 		},
+		"list-devices; with health; update link stats": {
+			req: &ctlpb.SmdQueryReq{
+				OmitPools:        true,
+				Rank:             uint32(ranklist.NilRank),
+				Uuid:             test.MockUUID(1),
+				IncludeBioHealth: true,
+			},
+			drpcResps: map[int][]*mockDrpcResponse{
+				0: {
+					{
+						Message: &ctlpb.SmdDevResp{
+							Devices: []*ctlpb.SmdDevice{pbNormDev(0)},
+						},
+					},
+				},
+				1: {
+					{
+						Message: &ctlpb.SmdDevResp{
+							Devices: []*ctlpb.SmdDevice{
+								func() *ctlpb.SmdDevice {
+									sd := pbFaultDev(1)
+									sd.Ctrlr.PciCfg = "ABCD"
+									return sd
+								}(),
+							},
+						},
+					},
+					{
+						Message: &ctlpb.BioHealthResp{
+							Temperature: 1000000,
+							TempWarn:    true,
+						},
+					},
+				},
+			},
+			// Prove mock link stats provider gets called when IncludeBioHealth
+			// flag is set and Ctrlr.PciCfg string is not empty.
+			pciDevErr: errors.New("link stats provider fail"),
+			expErr:    errors.New("link stats provider fail"),
+		},
 		"ambiguous UUID": {
 			req: &ctlpb.SmdQueryReq{
 				Rank: uint32(ranklist.NilRank),
@@ -680,6 +722,13 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) {
 			log, buf := logging.NewTestLogger(t.Name())
 			defer test.ShowBufferOnFailure(t, buf)
 
+			linkStatsProv = &mockPCIeLinkStatsProvider{
+				pciDevErr: tc.pciDevErr,
+			}
+			defer func() {
+				linkStatsProv = pciutils.NewPCIeLinkStatsProvider()
+			}()
+
 			engineCount := len(tc.drpcResps)
 			if engineCount == 0 {
 				engineCount = 1
diff --git a/src/control/server/instance_storage_rpc.go b/src/control/server/instance_storage_rpc.go
index fbca1e534a0..3097440f500 100644
--- a/src/control/server/instance_storage_rpc.go
+++ b/src/control/server/instance_storage_rpc.go
@@ -23,14 +23,19 @@ import (
 	"github.com/daos-stack/daos/src/control/events"
 	"github.com/daos-stack/daos/src/control/fault"
 	"github.com/daos-stack/daos/src/control/lib/hardware"
-	"github.com/daos-stack/daos/src/control/lib/hardware/hwprov"
+	"github.com/daos-stack/daos/src/control/lib/hardware/pciutils"
 	"github.com/daos-stack/daos/src/control/server/storage"
 )
 
 var (
-	scanSmd                       = listSmdDevices
-	getCtrlrHealth                = getBioHealth
+	// Function pointers to enable mocking.
+	scanSmd       = listSmdDevices
+	scanHealth    = getBioHealth
+	linkStatsProv = pciutils.NewPCIeLinkStatsProvider()
+
+	// Sentinel errors to enable comparison.
 	errEngineBdevScanEmptyDevList = errors.New("empty device list for engine instance")
+	errCtrlrHealthSkipped         = errors.New("controller health update was skipped")
 )
 
 // newMntRet creates and populates SCM mount result.
@@ -168,13 +173,17 @@ func (ei *EngineInstance) StorageFormatSCM(ctx context.Context, force bool) (mRe
 }
 
 func addLinkInfoToHealthStats(prov hardware.PCIeLinkStatsProvider, pciCfg string, health *ctlpb.BioHealthResp) error {
+	if health == nil {
+		return errors.New("nil BioHealthResp")
+	}
+
 	// Convert byte-string to lspci-format.
 	sb := new(strings.Builder)
 	formatBytestring(pciCfg, sb)
 
 	pciDev := &hardware.PCIDevice{}
 	if err := prov.PCIeCapsFromConfig([]byte(sb.String()), pciDev); err != nil {
-		return err
+		return errors.Wrap(err, "pciutils lib")
 	}
 
 	// Copy link details from PCIDevice to health stats.
@@ -243,31 +252,74 @@ func publishLinkStatEvents(engine Engine, pciAddr string, stats *ctlpb.BioHealth
 		lastMaxWidthStr, lastWidthStr, stats.LinkPortId)
 }
 
-func populateCtrlrHealth(ctx context.Context, engine Engine, req *ctlpb.BioHealthReq, ctrlr *ctlpb.NvmeController, prov hardware.PCIeLinkStatsProvider) (bool, error) {
-	stateName := ctlpb.NvmeDevState_name[int32(ctrlr.DevState)]
-	if !ctrlr.CanSupplyHealthStats() {
-		engine.Debugf("skip fetching health stats on device %q in %q state",
-			ctrlr.PciAddr, stateName)
-		return false, nil
+type ctrlrHealthReq struct {
+	meta          bool
+	engine        Engine
+	bhReq         *ctlpb.BioHealthReq
+	ctrlr         *ctlpb.NvmeController
+	linkStatsProv hardware.PCIeLinkStatsProvider
+}
+
+// Retrieve NVMe controller health statistics for those in an acceptable state. Return nil health
+// resp if in a bad state.
+func getCtrlrHealth(ctx context.Context, req ctrlrHealthReq) (*ctlpb.BioHealthResp, error) {
+	stateName := ctlpb.NvmeDevState_name[int32(req.ctrlr.DevState)]
+	if !req.ctrlr.CanSupplyHealthStats() {
+		req.engine.Debugf("skip fetching health stats on device %q in %q state",
+			req.ctrlr.PciAddr, stateName)
+		return nil, errCtrlrHealthSkipped
 	}
 
-	health, err := getCtrlrHealth(ctx, engine, req)
+	health, err := scanHealth(ctx, req.engine, req.bhReq)
 	if err != nil {
-		return false, errors.Wrapf(err, "retrieve health stats for %q (state %q)", ctrlr,
+		return nil, errors.Wrapf(err, "retrieve health stats for %q (state %q)", req.ctrlr,
 			stateName)
 	}
 
-	if ctrlr.PciCfg != "" {
-		if err := addLinkInfoToHealthStats(prov, ctrlr.PciCfg, health); err != nil {
-			return false, errors.Wrapf(err, "add link stats for %q", ctrlr)
+	return health, nil
+}
+
+// Add link state and capability information to input health statistics for the given controller
+// then if successful publish events based on link statistic changes. Link updated health stats to
+// controller.
+func setCtrlrHealthWithLinkInfo(req ctrlrHealthReq, health *ctlpb.BioHealthResp) error {
+	err := addLinkInfoToHealthStats(req.linkStatsProv, req.ctrlr.PciCfg, health)
+	if err == nil {
+		publishLinkStatEvents(req.engine, req.ctrlr.PciAddr, health)
+	} else {
+		if errors.Cause(err) != pciutils.ErrNoPCIeCaps {
+			return errors.Wrapf(err, "add link stats for %q", req.ctrlr)
+		}
+		req.engine.Debugf("device %q not reporting PCIe capabilities", req.ctrlr.PciAddr)
+	}
+
+	return nil
+}
+
+// Update controller health statistics and include link info if required and available.
+func populateCtrlrHealth(ctx context.Context, req ctrlrHealthReq) (bool, error) {
+	health, err := getCtrlrHealth(ctx, req)
+	if err != nil {
+		if err == errCtrlrHealthSkipped {
+			// Nothing to do.
+			return false, nil
 		}
-		publishLinkStatEvents(engine, ctrlr.PciAddr, health)
+		return false, errors.Wrap(err, "get ctrlr health")
+	}
+
+	if req.linkStatsProv == nil {
+		req.engine.Debugf("device %q skip adding link stats; nil provider",
+			req.ctrlr.PciAddr)
+	} else if req.ctrlr.PciCfg == "" {
+		req.engine.Debugf("device %q skip adding link stats; empty pci cfg",
+			req.ctrlr.PciAddr)
 	} else {
-		engine.Debugf("no pcie config space received for %q, skip add link stats", ctrlr)
+		if err = setCtrlrHealthWithLinkInfo(req, health); err != nil {
+			return false, errors.Wrap(err, "set ctrlr health")
+		}
 	}
 
-	ctrlr.HealthStats = health
-	ctrlr.PciCfg = ""
+	req.ctrlr.HealthStats = health
 	return true, nil
 }
 
@@ -305,12 +357,13 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc
 
 		c := seenCtrlrs[addr]
 
-		// Only minimal info provided in standard scan to enable result aggregation across
-		// homogeneous hosts.
 		engineRank, err := engine.GetRank()
 		if err != nil {
 			return nil, errors.Wrapf(err, "instance %d GetRank", engine.Index())
 		}
+
+		// Only provide minimal info in standard scan to enable result aggregation across
+		// homogeneous hosts.
 		nsd := &ctlpb.SmdDevice{
 			RoleBits:         sd.RoleBits,
 			CtrlrNamespaceId: sd.CtrlrNamespaceId,
@@ -326,18 +379,29 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc
 		// Populate health if requested.
 		healthUpdated := false
 		if pbReq.Health && c.HealthStats == nil {
-			bhReq := &ctlpb.BioHealthReq{
-				DevUuid:  sd.Uuid,
-				MetaSize: pbReq.MetaSize,
-				RdbSize:  pbReq.RdbSize,
+			bhReq := &ctlpb.BioHealthReq{DevUuid: sd.Uuid}
+			if pbReq.Meta {
+				bhReq.MetaSize = pbReq.MetaSize
+				bhReq.RdbSize = pbReq.RdbSize
 			}
-			upd, err := populateCtrlrHealth(ctx, engine, bhReq, c,
-				hwprov.DefaultPCIeLinkStatsProvider())
+
+			chReq := ctrlrHealthReq{
+				engine: engine,
+				bhReq:  bhReq,
+				ctrlr:  c,
+			}
+			if pbReq.LinkStats {
+				// Add link stats to health if flag set.
+				chReq.linkStatsProv = linkStatsProv
+			}
+
+			healthUpdated, err = populateCtrlrHealth(ctx, chReq)
 			if err != nil {
 				return nil, err
 			}
-			healthUpdated = upd
 		}
+		// Used to update health with link stats, now redundant.
+		c.PciCfg = ""
 
 		// Populate usage data if requested.
 		if pbReq.Meta {
@@ -510,12 +574,20 @@ func smdQueryEngine(ctx context.Context, engine Engine, pbReq *ctlpb.SmdQueryReq
 			continue // Skip health query if UUID doesn't match requested.
 		}
 		if pbReq.IncludeBioHealth {
-			bhReq := &ctlpb.BioHealthReq{DevUuid: dev.Uuid}
-			if _, err := populateCtrlrHealth(ctx, engine, bhReq, dev.Ctrlr,
-				hwprov.DefaultPCIeLinkStatsProvider()); err != nil {
+			chReq := ctrlrHealthReq{
+				engine:        engine,
+				bhReq:         &ctlpb.BioHealthReq{DevUuid: dev.Uuid},
+				ctrlr:         dev.Ctrlr,
+				linkStatsProv: linkStatsProv,
+			}
+
+			if _, err = populateCtrlrHealth(ctx, chReq); err != nil {
 				return nil, err
 			}
 		}
+		// Used to update health with link stats, now redundant.
+		dev.Ctrlr.PciCfg = ""
+
 		if pbReq.Uuid != "" && dev.Uuid == pbReq.Uuid {
 			rResp.Devices = []*ctlpb.SmdDevice{dev}
 			found = true
diff --git a/src/control/server/instance_storage_rpc_test.go b/src/control/server/instance_storage_rpc_test.go
index b199adb6b8d..0f72f739970 100644
--- a/src/control/server/instance_storage_rpc_test.go
+++ b/src/control/server/instance_storage_rpc_test.go
@@ -21,6 +21,7 @@ import (
 	"github.com/daos-stack/daos/src/control/common/test"
 	"github.com/daos-stack/daos/src/control/events"
 	"github.com/daos-stack/daos/src/control/lib/hardware"
+	"github.com/daos-stack/daos/src/control/lib/hardware/pciutils"
 	"github.com/daos-stack/daos/src/control/lib/ranklist"
 	"github.com/daos-stack/daos/src/control/logging"
 	"github.com/daos-stack/daos/src/control/server/config"
@@ -59,24 +60,27 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 	}
 
 	for name, tc := range map[string]struct {
-		badDevState    bool
-		noPciCfgSpc    bool
-		pciDev         *hardware.PCIDevice
-		pciDevErr      error
-		emptyHealthRes bool
-		healthErr      error
-		lastStats      map[string]*ctlpb.BioHealthResp
-		expCtrlr       *ctlpb.NvmeController
-		expNotUpdated  bool
-		expErr         error
-		expDispatched  []*events.RASEvent
-		expLastStats   map[string]*ctlpb.BioHealthResp
+		badDevState      bool
+		nilLinkStatsProv bool
+		noPciCfgSpc      bool
+		pciDev           *hardware.PCIDevice
+		pciDevErr        error
+		healthReq        *ctlpb.BioHealthReq
+		healthRes        *ctlpb.BioHealthResp
+		nilHealthRes     bool
+		healthErr        error
+		lastStats        map[string]*ctlpb.BioHealthResp
+		expCtrlr         *ctlpb.NvmeController
+		expNotUpdated    bool
+		expErr           error
+		expDispatched    []*events.RASEvent
+		expLastStats     map[string]*ctlpb.BioHealthResp
 	}{
 		"bad state; skip health": {
 			badDevState: true,
-			noPciCfgSpc: true,
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr: pciAddr,
+				PciCfg:  "ABCD",
 			},
 			expNotUpdated: true,
 		},
@@ -88,11 +92,16 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 				HealthStats: healthWithLinkStats(0, 0, 0, 0),
 			},
 		},
+		"nil bio health response": {
+			nilHealthRes: true,
+			expErr:       errors.New("nil BioHealthResp"),
+		},
 		"empty bio health response; empty link stats": {
-			emptyHealthRes: true,
-			pciDev:         new(hardware.PCIDevice),
+			healthRes: new(ctlpb.BioHealthResp),
+			pciDev:    new(hardware.PCIDevice),
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     pciAddr,
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: new(ctlpb.BioHealthResp),
 			},
@@ -106,9 +115,19 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			pciDevErr: errors.New("fail"),
 			expErr:    errors.New("fail"),
 		},
+		"update health; add link stats; pciutils lib error; missing pcie caps": {
+			pciDevErr: pciutils.ErrNoPCIeCaps,
+			expCtrlr: &ctlpb.NvmeController{
+				PciAddr:     pciAddr,
+				PciCfg:      "ABCD",
+				DevState:    ctlpb.NvmeDevState_NORMAL,
+				HealthStats: healthWithLinkStats(0, 0, 0, 0),
+			},
+		},
 		"update health; add link stats; normal link state; no event published": {
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     pciAddr,
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: proto.MockNvmeHealth(),
 			},
@@ -128,6 +147,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			},
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     test.MockPCIAddr(1),
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: healthWithLinkStats(2.5e+9, 1e+9, 4, 4),
 			},
@@ -152,6 +172,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			},
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     test.MockPCIAddr(1),
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: healthWithLinkStats(1e+9, 1e+9, 8, 4),
 			},
@@ -167,6 +188,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			lastStats: lastStatsMap(proto.MockNvmeHealth()),
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     test.MockPCIAddr(1),
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: proto.MockNvmeHealth(),
 			},
@@ -176,6 +198,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			lastStats: lastStatsMap(healthWithLinkStats(1e+9, 0.5e+9, 4, 4)),
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     pciAddr,
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: proto.MockNvmeHealth(),
 			},
@@ -191,6 +214,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			lastStats: lastStatsMap(healthWithLinkStats(1e+9, 1e+9, 4, 1)),
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     pciAddr,
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: proto.MockNvmeHealth(),
 			},
@@ -213,6 +237,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			lastStats: lastStatsMap(healthWithLinkStats(2.5e+9, 1e+9, 4, 4)),
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     test.MockPCIAddr(1),
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: healthWithLinkStats(2.5e+9, 1e+9, 4, 4),
 			},
@@ -229,6 +254,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			lastStats: lastStatsMap(healthWithLinkStats(1e+9, 1e+9, 8, 4)),
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     test.MockPCIAddr(1),
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: healthWithLinkStats(1e+9, 1e+9, 8, 4),
 			},
@@ -245,6 +271,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			lastStats: lastStatsMap(healthWithLinkStats(2.5e+9, 2.5e+9, 8, 4)),
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     test.MockPCIAddr(1),
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: healthWithLinkStats(2.5e+9, 1e+9, 8, 8),
 			},
@@ -271,6 +298,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			lastStats: lastStatsMap(healthWithLinkStats(2.5e+9, 1e+9, 8, 8)),
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     test.MockPCIAddr(1),
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: healthWithLinkStats(2.5e+9, 2.5e+9, 8, 4),
 			},
@@ -297,6 +325,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			lastStats: lastStatsMap(healthWithLinkStats(8e+9, 2.5e+9, 4, 4)),
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     test.MockPCIAddr(1),
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: healthWithLinkStats(8e+9, 1e+9, 4, 4),
 			},
@@ -319,6 +348,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			lastStats: lastStatsMap(healthWithLinkStats(1e+9, 1e+9, 16, 8)),
 			expCtrlr: &ctlpb.NvmeController{
 				PciAddr:     test.MockPCIAddr(1),
+				PciCfg:      "ABCD",
 				DevState:    ctlpb.NvmeDevState_NORMAL,
 				HealthStats: healthWithLinkStats(1e+9, 1e+9, 16, 4),
 			},
@@ -335,15 +365,11 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			log, buf := logging.NewTestLogger(t.Name())
 			defer test.ShowBufferOnFailure(t, buf)
 
-			healthRes := healthWithLinkStats(0, 0, 0, 0)
-			if tc.emptyHealthRes {
-				healthRes = new(ctlpb.BioHealthResp)
-			}
-			getCtrlrHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) {
-				return healthRes, tc.healthErr
+			scanHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) {
+				return tc.healthRes, tc.healthErr
 			}
 			defer func() {
-				getCtrlrHealth = getBioHealth
+				scanHealth = getBioHealth
 			}()
 
 			var devState ctlpb.NvmeDevState
@@ -363,10 +389,16 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 					LinkMaxWidth: 4,
 				}
 			}
+			if tc.healthRes == nil && !tc.nilHealthRes {
+				tc.healthRes = healthWithLinkStats(0, 0, 0, 0)
+			}
 
-			mockProv := &mockPCIeLinkStatsProvider{
-				pciDev:    tc.pciDev,
-				pciDevErr: tc.pciDevErr,
+			var mockProv *mockPCIeLinkStatsProvider
+			if !tc.nilLinkStatsProv {
+				mockProv = &mockPCIeLinkStatsProvider{
+					pciDev:    tc.pciDev,
+					pciDevErr: tc.pciDevErr,
+				}
 			}
 
 			ctrlr := &ctlpb.NvmeController{
@@ -387,8 +419,14 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
 			subscriber := newMockSubscriber(2)
 			ps.Subscribe(events.RASTypeInfoOnly, subscriber)
 
-			upd, err := populateCtrlrHealth(test.Context(t), ei,
-				&ctlpb.BioHealthReq{}, ctrlr, mockProv)
+			chReq := ctrlrHealthReq{
+				engine:        ei,
+				bhReq:         tc.healthReq,
+				ctrlr:         ctrlr,
+				linkStatsProv: mockProv,
+			}
+
+			upd, err := populateCtrlrHealth(test.Context(t), chReq)
 			test.CmpErr(t, tc.expErr, err)
 			if err != nil {
 				return
@@ -436,6 +474,7 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) {
 	}
 	defSmdScanRes := func() *ctlpb.SmdDevResp {
 		sd := proto.MockSmdDevice(c, 2)
+		sd.Rank = 2
 		return &ctlpb.SmdDevResp{Devices: []*ctlpb.SmdDevice{sd}}
 	}
 	healthRespWithUsage := func() *ctlpb.BioHealthResp {
@@ -444,6 +483,19 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) {
 		mh.MetaWalSize, mh.RdbWalSize = 4, 5
 		return mh
 	}
+	ctrlrWithUsageAndMeta := func() *ctlpb.NvmeController {
+		c := proto.MockNvmeController(2)
+		c.HealthStats = healthRespWithUsage()
+		sd := proto.MockSmdDevice(nil, 2)
+		sd.Rank = 1
+		sd.TotalBytes = c.HealthStats.TotalBytes
+		sd.AvailBytes = c.HealthStats.AvailBytes
+		sd.ClusterSize = c.HealthStats.ClusterSize
+		sd.MetaWalSize = c.HealthStats.MetaWalSize
+		sd.RdbWalSize = c.HealthStats.RdbWalSize
+		c.SmdDevices = []*ctlpb.SmdDevice{sd}
+		return c
+	}
 
 	for name, tc := range map[string]struct {
 		req                 ctlpb.ScanNvmeReq
@@ -640,28 +692,14 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) {
 				State: new(ctlpb.ResponseState),
 			},
 		},
-		"scan over drpc; with smd and health; usage and wal size reported": {
+		"scan over drpc; with meta and health; usage and wal size reported": {
 			req:       ctlpb.ScanNvmeReq{Meta: true, Health: true},
 			rank:      1,
 			smdRes:    defSmdScanRes(),
 			healthRes: healthRespWithUsage(),
 			expResp: &ctlpb.ScanNvmeResp{
-				Ctrlrs: proto.NvmeControllers{
-					func() *ctlpb.NvmeController {
-						c := proto.MockNvmeController(2)
-						c.HealthStats = healthRespWithUsage()
-						sd := proto.MockSmdDevice(nil, 2)
-						sd.Rank = 1
-						sd.TotalBytes = c.HealthStats.TotalBytes
-						sd.AvailBytes = c.HealthStats.AvailBytes
-						sd.ClusterSize = c.HealthStats.ClusterSize
-						sd.MetaWalSize = c.HealthStats.MetaWalSize
-						sd.RdbWalSize = c.HealthStats.RdbWalSize
-						c.SmdDevices = []*ctlpb.SmdDevice{sd}
-						return c
-					}(),
-				},
-				State: new(ctlpb.ResponseState),
+				Ctrlrs: proto.NvmeControllers{ctrlrWithUsageAndMeta()},
+				State:  new(ctlpb.ResponseState),
 			},
 		},
 		"scan over drpc; only ctrlrs with valid states shown": {
@@ -703,7 +741,7 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) {
 				State: new(ctlpb.ResponseState),
 			},
 		},
-		"scan over drpc; with smd and health; missing ctrlr in smd": {
+		"scan over drpc; with meta and health; missing ctrlr in smd": {
 			req: ctlpb.ScanNvmeReq{Meta: true, Health: true},
 			smdRes: func() *ctlpb.SmdDevResp {
 				ssr := defSmdScanRes()
@@ -713,23 +751,48 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) {
 			healthRes: healthRespWithUsage(),
 			expErr:    errors.New("no ctrlr ref"),
 		},
-		"scan over drpc; with smd and health; health scan fails": {
+		"scan over drpc; with meta and health; health scan fails": {
 			req:       ctlpb.ScanNvmeReq{Meta: true, Health: true},
 			smdRes:    defSmdScanRes(),
 			healthErr: errors.New("health scan failed"),
 			expErr:    errors.New("health scan failed"),
 		},
-		"scan over drpc; with smd and health; smd list fails": {
+		"scan over drpc; with meta and health; smd list fails": {
 			req:       ctlpb.ScanNvmeReq{Meta: true, Health: true},
 			smdErr:    errors.New("smd scan failed"),
 			healthRes: healthRespWithUsage(),
 			expErr:    errors.New("smd scan failed"),
 		},
-		"scan over drpc; with smd and health; nil smd list returned": {
+		"scan over drpc; with meta and health; nil smd list returned": {
 			req:       ctlpb.ScanNvmeReq{Meta: true, Health: true},
 			healthRes: healthRespWithUsage(),
 			expErr:    errors.New("nil smd scan resp"),
 		},
+		"scan over drpc; with meta and health; link info update skipped": {
+			req:  ctlpb.ScanNvmeReq{Meta: true, Health: true},
+			rank: 1,
+			smdRes: func() *ctlpb.SmdDevResp {
+				ssr := defSmdScanRes()
+				ssr.Devices[0].Ctrlr.PciCfg = "ABCD"
+				return ssr
+			}(),
+			healthRes: healthRespWithUsage(),
+			expResp: &ctlpb.ScanNvmeResp{
+				Ctrlrs: proto.NvmeControllers{ctrlrWithUsageAndMeta()},
+				State:  new(ctlpb.ResponseState),
+			},
+		},
+		"scan over drpc; with health; link info update run but failed": {
+			req: ctlpb.ScanNvmeReq{Health: true, LinkStats: true},
+			smdRes: func() *ctlpb.SmdDevResp {
+				ssr := defSmdScanRes()
+				ssr.Devices[0].Ctrlr.PciCfg = "ABCD"
+				return ssr
+			}(),
+			healthRes: healthRespWithUsage(),
+			// Prove link stat provider gets called when LinkStats flag set.
+			expErr: errors.New("link stats provider fail"),
+		},
 	} {
 		t.Run(name, func(t *testing.T) {
 			log, buf := logging.NewTestLogger(t.Name())
@@ -741,11 +804,17 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) {
 			defer func() {
 				scanSmd = listSmdDevices
 			}()
-			getCtrlrHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) {
+			scanHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) {
 				return tc.healthRes, tc.healthErr
 			}
 			defer func() {
-				getCtrlrHealth = getBioHealth
+				scanHealth = getBioHealth
+			}()
+			linkStatsProv = &mockPCIeLinkStatsProvider{
+				pciDevErr: errors.New("link stats provider fail"),
+			}
+			defer func() {
+				linkStatsProv = pciutils.NewPCIeLinkStatsProvider()
 			}()
 
 			if tc.provRes == nil {
@@ -776,7 +845,8 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) {
 				ei.setSuperblock(nil)
 			} else {
 				ei.setSuperblock(&Superblock{
-					Rank: ranklist.NewRankPtr(uint32(tc.rank)), ValidRank: true,
+					Rank:      ranklist.NewRankPtr(uint32(tc.rank)),
+					ValidRank: true,
 				})
 			}
 
diff --git a/src/proto/ctl/storage_nvme.proto b/src/proto/ctl/storage_nvme.proto
index 944d8e943ba..be068e5274d 100644
--- a/src/proto/ctl/storage_nvme.proto
+++ b/src/proto/ctl/storage_nvme.proto
@@ -1,5 +1,5 @@
 //
-// (C) Copyright 2019-2023 Intel Corporation.
+// (C) Copyright 2019-2024 Intel Corporation.
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -28,6 +28,7 @@ message ScanNvmeReq {
 	bool Basic = 3;		// Strip NVMe device details to only basic
 	uint64 MetaSize = 4;	// Size of the metadata blob
 	uint64 RdbSize = 5;	// Size of the RDB blob
+	bool   LinkStats = 6;     // Populate PCIe link info in health statistics
 }
 
 message ScanNvmeResp {

From eb95b5555016ccb40c6915ededd17943f652f1e0 Mon Sep 17 00:00:00 2001
From: wiliamhuang <lei.huang@intel.com>
Date: Mon, 28 Oct 2024 09:00:18 -0500
Subject: [PATCH 25/34] DAOS-16722 client: to intercept PMPI_Init() in
 libpil4dfs (#15387)

Intercept PMPI_Init() to avoid calling daos_init() if MPI_Init() is intercepted by other library (like darshan and mpip).

Signed-off-by: Lei Huang <lei.huang@intel.com>
---
 src/client/dfuse/pil4dfs/int_dfs.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/src/client/dfuse/pil4dfs/int_dfs.c b/src/client/dfuse/pil4dfs/int_dfs.c
index 3087cb09c75..b69449f1e3d 100644
--- a/src/client/dfuse/pil4dfs/int_dfs.c
+++ b/src/client/dfuse/pil4dfs/int_dfs.c
@@ -470,6 +470,7 @@ static int (*next_tcgetattr)(int fd, void *termios_p);
 /* end NOT supported by DAOS */
 
 static int (*next_mpi_init)(int *argc, char ***argv);
+static int (*next_pmpi_init)(int *argc, char ***argv);
 
 /* to do!! */
 /**
@@ -1041,6 +1042,22 @@ MPI_Init(int *argc, char ***argv)
 	return rc;
 }
 
+int
+PMPI_Init(int *argc, char ***argv)
+{
+	int rc;
+
+	if (next_pmpi_init == NULL) {
+		next_pmpi_init = dlsym(RTLD_NEXT, "PMPI_Init");
+		D_ASSERT(next_pmpi_init != NULL);
+	}
+
+	atomic_fetch_add_relaxed(&mpi_init_count, 1);
+	rc = next_pmpi_init(argc, argv);
+	atomic_fetch_add_relaxed(&mpi_init_count, -1);
+	return rc;
+}
+
 /** determine whether a path (both relative and absolute) is on DAOS or not. If yes,
  *  returns parent object, item name, full path of parent dir, full absolute path, and
  *  the pointer to struct dfs_mt.

From bde13c310ef66e302d3bfb55af2b252a67386cb7 Mon Sep 17 00:00:00 2001
From: mjean308 <48688872+mjean308@users.noreply.github.com>
Date: Mon, 28 Oct 2024 11:25:47 -0400
Subject: [PATCH 26/34] DAOS-15943 test: Remove server logging from
 pre-teardown (#15282) (#15386)

Signed-off-by: Maureen Jean <maureen.jean@intel.com>
---
 src/tests/ftest/util/soak_test_base.py | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/src/tests/ftest/util/soak_test_base.py b/src/tests/ftest/util/soak_test_base.py
index f32e068cb16..2d1c11e722e 100644
--- a/src/tests/ftest/util/soak_test_base.py
+++ b/src/tests/ftest/util/soak_test_base.py
@@ -25,8 +25,8 @@
 from soak_utils import (SoakTestError, add_pools, build_job_script, cleanup_dfuse,
                         create_app_cmdline, create_dm_cmdline, create_fio_cmdline,
                         create_ior_cmdline, create_macsio_cmdline, create_mdtest_cmdline,
-                        create_racer_cmdline, ddhhmmss_format, get_daos_server_logs, get_harassers,
-                        get_journalctl, launch_exclude_reintegrate, launch_extend, launch_reboot,
+                        create_racer_cmdline, ddhhmmss_format, get_harassers,
+                        launch_exclude_reintegrate, launch_extend, launch_reboot,
                         launch_server_stop_start, launch_snapshot, launch_vmd_identify_check,
                         reserved_file_copy, run_event_check, run_metrics_check, run_monitor_check)
 
@@ -164,17 +164,6 @@ def pre_tear_down(self):
 
         # display final metrics
         run_metrics_check(self, prefix="final")
-        # Gather server logs
-        try:
-            get_daos_server_logs(self)
-        except SoakTestError as error:
-            errors.append(f"<<FAILED: Failed to gather server logs {error}>>")
-        # Gather journalctl logs
-        hosts = list(set(self.hostlist_servers))
-        since = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.start_time))
-        until = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.end_time))
-        for journalctl_type in ["kernel", "daos_server"]:
-            get_journalctl(self, hosts, since, until, journalctl_type, logging=True)
 
         if self.all_failed_harassers:
             errors.extend(self.all_failed_harassers)

From 4637c0058cceb07a6918c3c426d2ce45ab0750c8 Mon Sep 17 00:00:00 2001
From: Joseph Moore <26410038+jgmoore-or@users.noreply.github.com>
Date: Thu, 31 Oct 2024 10:11:09 -0600
Subject: [PATCH 27/34] DAOS-16685 dfuse: Change eq poll to use NOWAIT.
 (#15425)

Signed-off-by: Joseph Moore <joseph.moore@intel.com>
---
 src/client/dfuse/dfuse_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/client/dfuse/dfuse_core.c b/src/client/dfuse/dfuse_core.c
index 4f654fa3209..6397b283e97 100644
--- a/src/client/dfuse/dfuse_core.c
+++ b/src/client/dfuse/dfuse_core.c
@@ -53,7 +53,7 @@ dfuse_progress_thread(void *arg)
 				return NULL;
 		}
 
-		rc = daos_eq_poll(eqt->de_eq, 1, DAOS_EQ_WAIT, 128, &dev[0]);
+		rc = daos_eq_poll(eqt->de_eq, 1, DAOS_EQ_NOWAIT, 128, &dev[0]);
 		if (rc >= 1) {
 			for (i = 0; i < rc; i++) {
 				struct dfuse_event *ev;

From c10428a132d072c9187fab6dd02bdb0208e6b570 Mon Sep 17 00:00:00 2001
From: Johann Lombardi <johann.lombardi@gmail.com>
Date: Tue, 5 Nov 2024 05:40:16 +0100
Subject: [PATCH 28/34] DAOS-16508 csum: retry a few times on checksum mismatch
 on update (#15069) (#15111)

Unlike fetch, we return DER_CSUM on update (turned into EIO by dfs) without any retry.
We should retry a few times in case it is a transient error.

The patch also prints more information about the actual checksum mismatch.

Signed-off-by: Johann Lombardi <johann.lombardi@gmail.com>
Co-authored-by: Dalton Bohning <dalton.bohning@intel.com>
---
 src/common/checksum.c     |  4 ++++
 src/object/cli_csum.h     |  3 +++
 src/object/cli_obj.c      | 38 ++++++++++++++++++++++++++++++--------
 src/object/obj_internal.h |  3 ++-
 4 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/src/common/checksum.c b/src/common/checksum.c
index 69cf89ab07a..c36f14e3c6d 100644
--- a/src/common/checksum.c
+++ b/src/common/checksum.c
@@ -278,6 +278,10 @@ daos_csummer_compare_csum_info(struct daos_csummer *obj,
 		match = daos_csummer_csum_compare(obj, ci_idx2csum(a, i),
 						  ci_idx2csum(b, i),
 						  a->cs_len);
+		if (unlikely(!match))
+			D_ERROR("Checksum mismatch at index %d/%d "DF_CI_BUF" != "DF_CI_BUF"\n", i,
+				a->cs_nr, DP_CI_BUF(ci_idx2csum(a, i), a->cs_len),
+				DP_CI_BUF(ci_idx2csum(b, i), b->cs_len));
 	}
 
 	return match;
diff --git a/src/object/cli_csum.h b/src/object/cli_csum.h
index 3b1431b8c1f..566c707054d 100644
--- a/src/object/cli_csum.h
+++ b/src/object/cli_csum.h
@@ -11,6 +11,9 @@
 #include <daos/cont_props.h>
 #include "obj_internal.h"
 
+/** How many times to retry UPDATE RPCs on checksum error */
+#define MAX_CSUM_RETRY 10
+
 int dc_obj_csum_update(struct daos_csummer *csummer, struct cont_props props, daos_obj_id_t param,
 		       daos_key_t *dkey, daos_iod_t *iods, d_sg_list_t *sgls, const uint32_t iod_nr,
 		       struct dcs_layout *layout, struct dcs_csum_info **dkey_csum,
diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c
index 2c0c0a952ea..318ba0cb51e 100644
--- a/src/object/cli_obj.c
+++ b/src/object/cli_obj.c
@@ -4684,12 +4684,15 @@ obj_comp_cb(tse_task_t *task, void *data)
 	int			rc;
 
 	obj_auxi = tse_task_stack_pop(task, sizeof(*obj_auxi));
-	obj_auxi->io_retry = 0;
-	obj_auxi->result = 0;
-	obj_auxi->csum_retry = 0;
-	obj_auxi->tx_uncertain = 0;
-	obj_auxi->nvme_io_err = 0;
 	obj = obj_auxi->obj;
+
+	/** Clear various bits for a new attempt */
+	obj_auxi->io_retry	= 0;
+	obj_auxi->result	= 0;
+	obj_auxi->csum_retry	= 0;
+	obj_auxi->tx_uncertain	= 0;
+	obj_auxi->nvme_io_err	= 0;
+
 	rc = obj_comp_cb_internal(obj_auxi);
 	if (rc != 0 || obj_auxi->result) {
 		if (task->dt_result == 0)
@@ -4760,9 +4763,28 @@ obj_comp_cb(tse_task_t *task, void *data)
 						obj_auxi->tx_uncertain = 1;
 					else
 						obj_auxi->nvme_io_err = 1;
-				} else if (task->dt_result != -DER_NVME_IO) {
-					/* Don't retry update for CSUM & UNCERTAIN errors */
-					obj_auxi->io_retry = 0;
+				} else {
+					if (obj_auxi->opc == DAOS_OBJ_RPC_UPDATE &&
+					    task->dt_result == -DER_CSUM) {
+						struct shard_rw_args *rw_arg = &obj_auxi->rw_args;
+
+						/** Retry a few times on checksum error on update */
+						if (rw_arg->csum_retry_cnt < MAX_CSUM_RETRY) {
+							obj_auxi->csum_retry = 1;
+							rw_arg->csum_retry_cnt++;
+							D_DEBUG(DB_IO, DF_OID" checksum error on "
+								"update, retrying\n",
+								DP_OID(obj->cob_md.omd_id));
+						} else {
+							D_ERROR(DF_OID" checksum error on update, "
+								"too many retries. Failing I/O\n",
+								DP_OID(obj->cob_md.omd_id));
+							obj_auxi->io_retry = 0;
+						}
+					} else if (task->dt_result != -DER_NVME_IO) {
+						/* Don't retry update for UNCERTAIN errors */
+						obj_auxi->io_retry = 0;
+					}
 				}
 			} else {
 				obj_auxi->io_retry = 0;
diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h
index cfe3385e959..06cfdb5b195 100644
--- a/src/object/obj_internal.h
+++ b/src/object/obj_internal.h
@@ -281,6 +281,7 @@ struct shard_rw_args {
 	struct dcs_csum_info	*dkey_csum;
 	struct dcs_iod_csums	*iod_csums;
 	struct obj_reasb_req	*reasb_req;
+	uint16_t		 csum_retry_cnt;
 };
 
 struct coll_sparse_targets {
@@ -480,8 +481,8 @@ struct obj_auxi_args {
 					 rebuilding:1,
 					 for_migrate:1;
 	/* request flags. currently only: ORF_RESEND */
-	uint32_t			 flags;
 	uint32_t			 specified_shard;
+	uint32_t			 flags;
 	uint16_t			 retry_cnt;
 	uint16_t			 inprogress_cnt;
 	struct obj_req_tgts		 req_tgts;

From d17d3d6f14508cad773a4e72eefacd3bf4fdc3a3 Mon Sep 17 00:00:00 2001
From: Jeff Olivier <jeffolivier@google.com>
Date: Tue, 5 Nov 2024 01:55:46 -0700
Subject: [PATCH 29/34] DAOS-16211 vos: Avoid race condition with discard
 (#15370) (#15432)

* DAOS-16211 vos: Avoid race condition with discard (#15370)

There is a possible race between aggregation deleting
the object tree and discard working on the same
object tree.  Add a check to avoid this race

Signed-off-by: Jeff Olivier <jeffolivier@google.com>
---
 src/vos/vos_obj_index.c | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/src/vos/vos_obj_index.c b/src/vos/vos_obj_index.c
index ea47cf4454c..58699453f40 100644
--- a/src/vos/vos_obj_index.c
+++ b/src/vos/vos_obj_index.c
@@ -791,13 +791,14 @@ oi_iter_process(struct vos_iterator *iter, vos_iter_proc_op_t op, void *args)
 int
 oi_iter_check_punch(daos_handle_t ih)
 {
-	struct vos_iterator	*iter = vos_hdl2iter(ih);
-	struct vos_oi_iter	*oiter = iter2oiter(iter);
-	struct vos_obj_df	*obj;
-	struct oi_delete_arg	del_arg;
-	daos_unit_oid_t		 oid;
-	d_iov_t			 rec_iov;
-	int			 rc;
+	struct vos_iterator  *iter  = vos_hdl2iter(ih);
+	struct vos_oi_iter   *oiter = iter2oiter(iter);
+	struct vos_container *cont  = oiter->oit_cont;
+	struct vos_obj_df    *obj;
+	struct oi_delete_arg  del_arg;
+	daos_unit_oid_t       oid;
+	d_iov_t               rec_iov;
+	int                   rc;
 
 	D_ASSERT(iter->it_type == VOS_ITER_OBJ);
 
@@ -811,10 +812,22 @@ oi_iter_check_punch(daos_handle_t ih)
 	obj = (struct vos_obj_df *)rec_iov.iov_buf;
 	oid = obj->vo_id;
 
-	if (!vos_ilog_is_punched(vos_cont2hdl(oiter->oit_cont), &obj->vo_ilog, &oiter->oit_epr,
-				 NULL, &oiter->oit_ilog_info))
+	if (!vos_ilog_is_punched(vos_cont2hdl(cont), &obj->vo_ilog, &oiter->oit_epr, NULL,
+				 &oiter->oit_ilog_info))
 		return 0;
 
+	rc = vos_obj_hold(vos_obj_cache_current(cont->vc_pool->vp_sysdb), cont, oid,
+			  &oiter->oit_epr, iter->it_bound, VOS_OBJ_AGGREGATE | VOS_OBJ_NO_HOLD,
+			  DAOS_INTENT_PURGE, NULL, NULL);
+	if (rc != 0) {
+		/** -DER_BUSY means the object is in-use already.  We will after a yield in this
+		 * case.
+		 */
+		D_CDEBUG(rc == -DER_BUSY, DB_EPC, DLOG_ERR, "Hold check failed for " DF_UOID "\n",
+			 DP_UOID(oid));
+		return rc;
+	}
+
 	/** Ok, ilog is fully punched, so we can move it to gc heap */
 	rc = umem_tx_begin(vos_cont2umm(oiter->oit_cont), NULL);
 	if (rc != 0)

From 9ab3200f94dcb980cc812d4c234b1003982bb4b0 Mon Sep 17 00:00:00 2001
From: Jerome Soumagne <jerome.soumagne@intel.com>
Date: Tue, 5 Nov 2024 07:37:34 -0600
Subject: [PATCH 30/34] DAOS-15162 build: update to libfabric 1.22.0 (#15441)

Signed-off-by: Jerome Soumagne <jerome.soumagne@intel.com>
---
 utils/build.config | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/utils/build.config b/utils/build.config
index 52749b3f1ce..b7f86add782 100644
--- a/utils/build.config
+++ b/utils/build.config
@@ -8,7 +8,7 @@ pmdk=2.1.0
 isal=v2.30.0
 isal_crypto=v2.23.0
 spdk=v22.01.2
-ofi=v1.19.1
+ofi=v1.22.0
 mercury=v2.4.0rc5
 protobufc=v1.3.3
 ucx=v1.14.1
@@ -27,7 +27,6 @@ ucx=https://github.com/openucx/ucx.git
 
 [patch_versions]
 spdk=https://github.com/spdk/spdk/commit/b0aba3fcd5aceceea530a702922153bc75664978.diff,https://github.com/spdk/spdk/commit/445a4c808badbad3942696ecf16fa60e8129a747.diff
-ofi=https://github.com/ofiwg/libfabric/commit/d827c6484cc5bf67dfbe395890e258860c3f0979.diff
 fuse=https://github.com/libfuse/libfuse/commit/c9905341ea34ff9acbc11b3c53ba8bcea35eeed8.diff
 mercury=https://raw.githubusercontent.com/daos-stack/mercury/f3dc286fb40ec1a3a38a2e17c45497bc2aa6290d/na_ucx.patch
 pmdk=https://github.com/pmem/pmdk/commit/2abe15ac0b4eed894b6768cd82a3b0a7c4336284.diff

From f7d12a48846a6121489ca8026441c88f33971806 Mon Sep 17 00:00:00 2001
From: Nasf-Fan <fan.yong@intel.com>
Date: Wed, 6 Nov 2024 06:49:31 +0800
Subject: [PATCH 31/34] DAOS-16721 dtx: handle potential DTX ID reusing trouble
 - b26 (#15409)

The patch contains the following improvements:

1. When VOS level logic returns -DER_TX_RESATRT, the object level RPC
   handler should set 'RESEND' flag then restart the transaction with
   newer epoch. Because dtx_abort() logic cannot guarantee all former
   prepared DTX entries (on all related participants) can be aborted,
   especially if the former one failed for some network trouble, that
   may cause restarted transaction hit -DER_TX_ID_REUSED unexpectedly.

2. Compare the epoch for DTX entries with the same transaction ID for
   distinguishing potential reused TX ID more accurately.

3. Add DTX entry into DTX CoS cache if cannot commit it synchronously.
   Then subsequent batched commit logic can handle it.

4. If server complains suspected TX ID reusing, then reports -EIO to
   related application instead of assertion on client.

5. Control DTX related warning message frequency to avoid log flood.

6. Collect more information when generate some error/warning message.

Signed-off-by: Fan Yong <fan.yong@intel.com>
---
 src/dtx/dtx_coll.c               |   8 ++-
 src/dtx/dtx_common.c             |  83 ++++++++++++----------
 src/dtx/dtx_rpc.c                |  15 ++--
 src/dtx/dtx_srv.c                |   5 +-
 src/include/daos_srv/container.h |   3 +
 src/object/cli_coll.c            |   2 +-
 src/object/cli_obj.c             |  23 ++++--
 src/object/cli_shard.c           |  13 ++--
 src/object/obj_internal.h        |   2 +-
 src/object/obj_utils.c           |  17 +++--
 src/object/srv_ec_aggregate.c    |  33 ++++++++-
 src/object/srv_obj.c             | 116 ++++++++++++++++++-------------
 src/object/srv_obj_remote.c      |  12 ++--
 src/vos/vos_dtx.c                |  63 ++++++++++++-----
 src/vos/vos_internal.h           |   2 +
 15 files changed, 254 insertions(+), 143 deletions(-)

diff --git a/src/dtx/dtx_coll.c b/src/dtx/dtx_coll.c
index 7e7c8199155..ba45aaa2616 100644
--- a/src/dtx/dtx_coll.c
+++ b/src/dtx/dtx_coll.c
@@ -81,9 +81,11 @@ dtx_coll_prep_ult(void *arg)
 	}
 
 	if (dcpa->dcpa_result != 0) {
-		if (dcpa->dcpa_result != -DER_INPROGRESS && dcpa->dcpa_result != -DER_NONEXIST)
-			D_ERROR("Failed to load mbs for "DF_DTI", opc %u: "DF_RC"\n",
-				DP_DTI(&dci->dci_xid), opc, DP_RC(rc));
+		if (dcpa->dcpa_result < 0 &&
+		    dcpa->dcpa_result != -DER_INPROGRESS && dcpa->dcpa_result != -DER_NONEXIST)
+			D_ERROR("Failed to load mbs for "DF_DTI" in "DF_UUID"/"DF_UUID", opc %u: "
+				DF_RC"\n", DP_DTI(&dci->dci_xid), DP_UUID(dci->dci_po_uuid),
+				DP_UUID(dci->dci_co_uuid), opc, DP_RC(dcpa->dcpa_result));
 		goto out;
 	}
 
diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c
index ecb156729ed..1ee74ae11a4 100644
--- a/src/dtx/dtx_common.c
+++ b/src/dtx/dtx_common.c
@@ -1271,7 +1271,6 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul
 	int				 status = -1;
 	int				 rc = 0;
 	bool				 aborted = false;
-	bool				 unpin = false;
 
 	D_ASSERT(cont != NULL);
 
@@ -1339,7 +1338,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul
 	 * it persistently. Otherwise, the subsequent DTX resync may not find it as
 	 * to regard it as failed transaction and abort it.
 	 */
-	if (result == 0 && !dth->dth_active && !dth->dth_prepared && !dth->dth_solo &&
+	if (!dth->dth_active && !dth->dth_prepared &&
 	    (dth->dth_dist || dth->dth_modification_cnt > 0)) {
 		result = vos_dtx_attach(dth, true, dth->dth_ent != NULL ? true : false);
 		if (unlikely(result < 0)) {
@@ -1349,7 +1348,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul
 		}
 	}
 
-	if (dth->dth_prepared || dtx_batched_ult_max == 0) {
+	if ((dth->dth_prepared && !dlh->dlh_coll) || dtx_batched_ult_max == 0) {
 		dth->dth_sync = 1;
 		goto sync;
 	}
@@ -1363,14 +1362,12 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul
 	if (DAOS_FAIL_CHECK(DAOS_DTX_MISS_COMMIT))
 		dth->dth_sync = 1;
 
-	/* For synchronous DTX, do not add it into CoS cache, otherwise,
-	 * we may have no way to remove it from the cache.
-	 */
 	if (dth->dth_sync)
 		goto sync;
 
 	D_ASSERT(dth->dth_mbs != NULL);
 
+cache:
 	if (dlh->dlh_coll) {
 		rc = dtx_cos_add(cont, dlh->dlh_coll_entry, &dth->dth_leader_oid,
 				 dth->dth_dkey_hash, dth->dth_epoch, DCF_EXP_CMT | DCF_COLL);
@@ -1378,38 +1375,47 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul
 		size = sizeof(*dte) + sizeof(*mbs) + dth->dth_mbs->dm_data_size;
 		D_ALLOC(dte, size);
 		if (dte == NULL) {
-			dth->dth_sync = 1;
-			goto sync;
-		}
-
-		mbs = (struct dtx_memberships *)(dte + 1);
-		memcpy(mbs, dth->dth_mbs, size - sizeof(*dte));
-
-		dte->dte_xid = dth->dth_xid;
-		dte->dte_ver = dth->dth_ver;
-		dte->dte_refs = 1;
-		dte->dte_mbs = mbs;
+			rc = -DER_NOMEM;
+		} else {
+			mbs = (struct dtx_memberships *)(dte + 1);
+			memcpy(mbs, dth->dth_mbs, size - sizeof(*dte));
+
+			dte->dte_xid = dth->dth_xid;
+			dte->dte_ver = dth->dth_ver;
+			dte->dte_refs = 1;
+			dte->dte_mbs = mbs;
+
+			if (!(mbs->dm_flags & DMF_SRDG_REP))
+				flags = DCF_EXP_CMT;
+			else if (dth->dth_modify_shared)
+				flags = DCF_SHARED;
+			else
+				flags = 0;
 
-		if (!(mbs->dm_flags & DMF_SRDG_REP))
-			flags = DCF_EXP_CMT;
-		else if (dth->dth_modify_shared)
-			flags = DCF_SHARED;
-		else
-			flags = 0;
+			rc = dtx_cos_add(cont, dte, &dth->dth_leader_oid, dth->dth_dkey_hash,
+					 dth->dth_epoch, flags);
+			dtx_entry_put(dte);
+		}
+	}
 
-		rc = dtx_cos_add(cont, dte, &dth->dth_leader_oid, dth->dth_dkey_hash,
-				 dth->dth_epoch, flags);
-		dtx_entry_put(dte);
+	/*
+	 * NOTE: If we failed to add the committable DTX into CoS cache, then we also have no way
+	 *	 to commit (or abort) the DTX because of out of memory. Such DTX will be finally
+	 *	 committed via next DTX resync (after recovered from OOM).
+	 *
+	 *	 Here, we only warning to notify the trouble, but not failed the transaction.
+	 */
+	if (rc != 0) {
+		D_WARN(DF_UUID": Fail to cache %s DTX "DF_DTI": "DF_RC"\n",
+		       DP_UUID(cont->sc_uuid), dlh->dlh_coll ? "collective" : "regular",
+		       DP_DTI(&dth->dth_xid), DP_RC(rc));
+		D_GOTO(out, result = 0);
 	}
 
-	if (rc == 0) {
-		if (!DAOS_FAIL_CHECK(DAOS_DTX_NO_COMMITTABLE)) {
-			vos_dtx_mark_committable(dth);
-			if (cont->sc_dtx_committable_count > DTX_THRESHOLD_COUNT || dlh->dlh_coll)
-				sched_req_wakeup(dss_get_module_info()->dmi_dtx_cmt_req);
-		}
-	} else {
-		dth->dth_sync = 1;
+	if (!DAOS_FAIL_CHECK(DAOS_DTX_NO_COMMITTABLE)) {
+		vos_dtx_mark_committable(dth);
+		if (cont->sc_dtx_committable_count > DTX_THRESHOLD_COUNT || dlh->dlh_coll)
+			sched_req_wakeup(dss_get_module_info()->dmi_dtx_cmt_req);
 	}
 
 sync:
@@ -1428,10 +1434,15 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul
 			rc = dtx_commit(cont, &dte, NULL, 1, false);
 		}
 
-		if (rc != 0)
+		if (rc != 0) {
 			D_WARN(DF_UUID": Fail to sync %s commit DTX "DF_DTI": "DF_RC"\n",
 			       DP_UUID(cont->sc_uuid), dlh->dlh_coll ? "collective" : "regular",
 			       DP_DTI(&dth->dth_xid), DP_RC(rc));
+			if (likely(dtx_batched_ult_max != 0)) {
+				dth->dth_sync = 0;
+				goto cache;
+			}
+		}
 
 		/*
 		 * NOTE: The semantics of 'sync' commit does not guarantee that all
@@ -1451,7 +1462,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul
 	 * to locally retry for avoiding related forwarded RPC timeout, instead,
 	 * The leader will trigger retry globally without abort 'prepared' ones.
 	 */
-	if (unpin || (result < 0 && result != -DER_AGAIN && !dth->dth_solo)) {
+	if (result < 0 && result != -DER_AGAIN && !dth->dth_solo) {
 		/* 1. Drop partial modification for distributed transaction.
 		 * 2. Remove the pinned DTX entry.
 		 */
diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c
index 2ccbfec2734..6d34e871269 100644
--- a/src/dtx/dtx_rpc.c
+++ b/src/dtx/dtx_rpc.c
@@ -1657,8 +1657,9 @@ dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct d
 	}
 
 	D_CDEBUG(rc != 0 || rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE,
-		 "Collectively commit DTX "DF_DTI": %d/%d/%d\n",
-		 DP_DTI(&dce->dce_xid), rc, rc1, rc2);
+		 "Collectively commit DTX "DF_DTI" in "DF_UUID"/"DF_UUID": %d/%d/%d\n",
+		 DP_DTI(&dce->dce_xid), DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid),
+		 rc, rc1, rc2);
 
 	return rc != 0 ? rc : rc1 != 0 ? rc1 : rc2;
 }
@@ -1717,8 +1718,9 @@ dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc
 		rc2 = 0;
 
 	D_CDEBUG(rc != 0 || rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE,
-		 "Collectively abort DTX "DF_DTI": %d/%d/%d\n",
-		 DP_DTI(&dce->dce_xid), rc, rc1, rc2);
+		 "Collectively abort DTX "DF_DTI" with epoch "DF_X64" in "
+		 DF_UUID"/"DF_UUID": %d/%d/%d\n", DP_DTI(&dce->dce_xid), epoch,
+		 DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid), rc, rc1, rc2);
 
 	return rc != 0 ? rc : rc1 != 0 ? rc1 : rc2;
 }
@@ -1766,8 +1768,9 @@ dtx_coll_check(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc
 	}
 
 	D_CDEBUG((rc < 0 && rc != -DER_NONEXIST) || (rc1 < 0 && rc1 != -DER_NONEXIST), DLOG_ERR,
-		 DB_TRACE, "Collectively check DTX "DF_DTI": %d/%d/\n",
-		 DP_DTI(&dce->dce_xid), rc, rc1);
+		 DB_TRACE, "Collectively check DTX "DF_DTI" in "DF_UUID"/"DF_UUID": %d/%d/\n",
+		 DP_DTI(&dce->dce_xid), DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid),
+		 rc, rc1);
 
 	return dce->dce_ranks != NULL  ? rc : rc1;
 }
diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c
index 0fd4e7c513f..084f804ec81 100644
--- a/src/dtx/dtx_srv.c
+++ b/src/dtx/dtx_srv.c
@@ -474,8 +474,9 @@ dtx_coll_handler(crt_rpc_t *rpc)
 
 out:
 	D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE,
-		 "Handled collective DTX PRC %u on rank %u for "DF_DTI": "DF_RC"\n",
-		 opc, myrank, DP_DTI(&dci->dci_xid), DP_RC(rc));
+		 "Handled collective DTX PRC %u on rank %u for "DF_DTI" in "
+		 DF_UUID"/"DF_UUID": "DF_RC"\n", opc, myrank, DP_DTI(&dci->dci_xid),
+		 DP_UUID(dci->dci_po_uuid), DP_UUID(dci->dci_co_uuid), DP_RC(rc));
 
 	dco->dco_status = rc;
 	rc = crt_reply_send(rpc);
diff --git a/src/include/daos_srv/container.h b/src/include/daos_srv/container.h
index f99f4df14e3..9fc615c2a8b 100644
--- a/src/include/daos_srv/container.h
+++ b/src/include/daos_srv/container.h
@@ -108,6 +108,9 @@ struct ds_cont_child {
 	uint32_t		 sc_dtx_committable_count;
 	uint32_t		 sc_dtx_committable_coll_count;
 
+	/* Last timestamp when EC aggregation reports -DER_INPROGRESS. */
+	uint64_t		 sc_ec_agg_busy_ts;
+
 	/* The global minimum EC aggregation epoch, which will be upper
 	 * limit for VOS aggregation, i.e. EC object VOS aggregation can
 	 * not cross this limit. For simplification purpose, all objects
diff --git a/src/object/cli_coll.c b/src/object/cli_coll.c
index 3285bec58b3..d517e3269d6 100644
--- a/src/object/cli_coll.c
+++ b/src/object/cli_coll.c
@@ -873,7 +873,7 @@ queue_coll_query_task(tse_task_t *api_task, struct obj_auxi_args *obj_auxi, stru
 			   0, 0, ocdc);
 
 	for (i = 0; i < ocdc->grp_nr; i++) {
-		obj_coll_disp_dest(ocdc, coa->coa_dcts, &tgt_ep);
+		obj_coll_disp_dest(ocdc, coa->coa_dcts, &tgt_ep, obj->cob_md.omd_id);
 
 		tmp = coa->coa_dcts[ocdc->cur_pos].dct_shards[tgt_ep.ep_tag].dcs_idx;
 		rc = queue_shard_query_key_task(api_task, obj_auxi, epoch, tmp, map_ver,
diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c
index 318ba0cb51e..46161bf7a2d 100644
--- a/src/object/cli_obj.c
+++ b/src/object/cli_obj.c
@@ -1718,15 +1718,20 @@ dc_obj_retry_delay(tse_task_t *task, int err, uint16_t *retry_cnt, uint16_t *inp
 		   uint32_t timeout_sec)
 {
 	uint32_t	delay = 0;
+	uint32_t	limit = 4;
 
 	/*
-	 * Randomly delay 5 - 68 us if it is not the first retry for
+	 * Randomly delay 5 ~ 1028 us if it is not the first retry for
 	 * -DER_INPROGRESS || -DER_UPDATE_AGAIN cases.
 	 */
 	++(*retry_cnt);
 	if (err == -DER_INPROGRESS || err == -DER_UPDATE_AGAIN) {
 		if (++(*inprogress_cnt) > 1) {
-			delay = (d_rand() & ((1 << 6) - 1)) + 5;
+			limit += *inprogress_cnt;
+			if (limit > 10)
+				limit = 10;
+
+			delay = (d_rand() & ((1 << limit) - 1)) + 5;
 			/* Rebuild is being established on the server side, wait a bit longer */
 			if (err == -DER_UPDATE_AGAIN)
 				delay <<= 10;
@@ -4856,11 +4861,14 @@ obj_comp_cb(tse_task_t *task, void *data)
 		D_ASSERT(daos_handle_is_inval(obj_auxi->th));
 		D_ASSERT(obj_is_modification_opc(obj_auxi->opc));
 
-		if (task->dt_result == -DER_TX_ID_REUSED && obj_auxi->retry_cnt != 0)
-			/* XXX: it is must because miss to set "RESEND" flag, that is bug. */
-			D_ASSERTF(0,
-				  "Miss 'RESEND' flag (%x) when resend the RPC for task %p: %u\n",
-				  obj_auxi->flags, task, obj_auxi->retry_cnt);
+		if (task->dt_result == -DER_TX_ID_REUSED && obj_auxi->retry_cnt != 0) {
+			D_ERROR("TX ID maybe reused for unknown reason, "
+				"task %p, opc %u, flags %x, retry_cnt %u\n",
+				task, obj_auxi->opc, obj_auxi->flags, obj_auxi->retry_cnt);
+			task->dt_result = -DER_IO;
+			obj_auxi->io_retry = 0;
+			goto args_fini;
+		}
 
 		if (obj_auxi->opc == DAOS_OBJ_RPC_UPDATE) {
 			daos_obj_rw_t		*api_args = dc_task_get_args(obj_auxi->obj_task);
@@ -4886,6 +4894,7 @@ obj_comp_cb(tse_task_t *task, void *data)
 		}
 	}
 
+args_fini:
 	if (obj_auxi->opc == DAOS_OBJ_RPC_COLL_PUNCH)
 		obj_coll_oper_args_fini(&obj_auxi->p_args.pa_coa);
 
diff --git a/src/object/cli_shard.c b/src/object/cli_shard.c
index 0c9dfc1418e..9f084140f80 100644
--- a/src/object/cli_shard.c
+++ b/src/object/cli_shard.c
@@ -1451,11 +1451,14 @@ obj_shard_coll_punch_cb(tse_task_t *task, void *data)
 			shard_args->pa_auxi.obj_auxi->max_delay = timeout;
 	}
 
-	DL_CDEBUG(task->dt_result < 0, DLOG_ERR, DB_IO, task->dt_result,
-		  "DAOS_OBJ_RPC_COLL_PUNCH RPC %p for "DF_UOID" with DTX "
-		  DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x, %s layout",
-		  rpc, DP_UOID(ocpi->ocpi_oid), DP_DTI(&ocpi->ocpi_xid), task, ocpi->ocpi_map_ver,
-		  *cb_args->cpca_ver, (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags,
+	DL_CDEBUG(task->dt_result < 0 && task->dt_result != -DER_INPROGRESS,
+		  DLOG_ERR, DB_IO, task->dt_result,
+		  "DAOS_OBJ_RPC_COLL_PUNCH RPC %p for "DF_UOID" in "DF_UUID"/"DF_UUID"/"
+		  DF_UUID" with DTX "DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x, %s layout",
+		  rpc, DP_UOID(ocpi->ocpi_oid), DP_UUID(ocpi->ocpi_po_uuid),
+		  DP_UUID(ocpi->ocpi_co_hdl), DP_UUID(ocpi->ocpi_co_uuid), DP_DTI(&ocpi->ocpi_xid),
+		  task, ocpi->ocpi_map_ver, *cb_args->cpca_ver,
+		  (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags,
 		  cb_args->cpca_shard_args->pa_coa.coa_raw_sparse ? "sparse" : "continuous");
 
 	crt_req_decref(rpc);
diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h
index 06cfdb5b195..c0df21dd009 100644
--- a/src/object/obj_internal.h
+++ b/src/object/obj_internal.h
@@ -1100,7 +1100,7 @@ int daos_obj_query_merge(struct obj_query_merge_args *oqma);
 void obj_coll_disp_init(uint32_t tgt_nr, uint32_t max_tgt_size, uint32_t inline_size,
 			uint32_t start, uint32_t max_width, struct obj_coll_disp_cursor *ocdc);
 void obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *tgts,
-			crt_endpoint_t *tgt_ep);
+			crt_endpoint_t *tgt_ep, daos_obj_id_t oid);
 void obj_coll_disp_move(struct obj_coll_disp_cursor *ocdc);
 int obj_utils_init(void);
 void obj_utils_fini(void);
diff --git a/src/object/obj_utils.c b/src/object/obj_utils.c
index 82d91c966ac..c01947a05a1 100644
--- a/src/object/obj_utils.c
+++ b/src/object/obj_utils.c
@@ -616,23 +616,22 @@ obj_coll_disp_init(uint32_t tgt_nr, uint32_t max_tgt_size, uint32_t inline_size,
 
 void
 obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *tgts,
-		   crt_endpoint_t *tgt_ep)
+		   crt_endpoint_t *tgt_ep, daos_obj_id_t oid)
 {
 	struct daos_coll_target		*dct = &tgts[ocdc->cur_pos];
 	struct daos_coll_target		 tmp;
-	unsigned long			 rand = 0;
 	uint32_t			 size;
 	int				 pos;
 	int				 i;
 
 	if (ocdc->cur_step > 2) {
-		rand = d_rand();
 		/*
-		 * Randomly choose an engine as the relay one for load balance.
-		 * If the one corresponding to "pos" is former moved one, then
-		 * use the "cur_pos" as the relay engine.
+		 * Choose an engine (according to the given oid) as the relay one for load balance.
+		 * If the one corresponding to "pos" is former moved one, then use the "cur_pos" as
+		 * the relay engine. Then even if related RPC was resent without changing pool map,
+		 * then the relay one will be the same as the original case.
 		 */
-		pos = rand % (ocdc->tgt_nr - ocdc->cur_pos) + ocdc->cur_pos;
+		pos = oid.lo % (ocdc->tgt_nr - ocdc->cur_pos) + ocdc->cur_pos;
 		if (pos > ocdc->cur_pos && tgts[pos].dct_rank > dct->dct_rank) {
 			memcpy(&tmp, &tgts[pos], sizeof(tmp));
 			memcpy(&tgts[pos], dct, sizeof(tmp));
@@ -642,8 +641,8 @@ obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *t
 
 	size = dct->dct_bitmap_sz << 3;
 
-	/* Randomly choose a XS as the local leader on target engine for load balance. */
-	for (i = 0, pos = (rand != 0 ? rand : d_rand()) % dct->dct_tgt_nr; i < size; i++) {
+	/* Choose a target as the local agent on the engine for load balance. */
+	for (i = 0, pos = oid.lo % dct->dct_tgt_nr; i < size; i++) {
 		if (isset(dct->dct_bitmap, i)) {
 			pos -= dct->dct_shards[i].dcs_nr;
 			if (pos < 0)
diff --git a/src/object/srv_ec_aggregate.c b/src/object/srv_ec_aggregate.c
index 71c630fa947..50b513d1612 100644
--- a/src/object/srv_ec_aggregate.c
+++ b/src/object/srv_ec_aggregate.c
@@ -2667,8 +2667,13 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr,
 	struct ec_agg_param	 *ec_agg_param = agg_param->ap_data;
 	vos_iter_param_t	 iter_param = { 0 };
 	struct vos_iter_anchors  anchors = { 0 };
+	struct dtx_handle	*dth = NULL;
+	struct dtx_share_peer	*dsp;
+	struct dtx_id		 dti = { 0 };
+	struct dtx_epoch	 epoch = { 0 };
+	daos_unit_oid_t		 oid = { 0 };
+	int			 blocks = 0;
 	int			 rc = 0;
-	int                       blocks       = 0;
 
 	/*
 	 * Avoid calling into vos_aggregate() when aborting aggregation
@@ -2715,8 +2720,32 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr,
 	agg_reset_entry(&ec_agg_param->ap_agg_entry, NULL, NULL);
 
 retry:
+	epoch.oe_value = epr->epr_hi;
+	rc = dtx_begin(cont->sc_hdl, &dti, &epoch, 0, cont->sc_pool->spc_map_version, &oid,
+		       NULL, 0, 0, NULL, &dth);
+	if (rc != 0)
+		goto update_hae;
+
 	rc = vos_iterate(&iter_param, VOS_ITER_OBJ, true, &anchors, agg_iterate_pre_cb,
-			 agg_iterate_post_cb, ec_agg_param, NULL);
+			 agg_iterate_post_cb, ec_agg_param, dth);
+	if (rc == -DER_INPROGRESS && !d_list_empty(&dth->dth_share_tbd_list)) {
+		uint64_t	now = daos_gettime_coarse();
+
+		/* Report warning per each 10 seconds to avoid log flood. */
+		if (now - cont->sc_ec_agg_busy_ts > 10) {
+			while ((dsp = d_list_pop_entry(&dth->dth_share_tbd_list,
+						       struct dtx_share_peer, dsp_link)) != NULL) {
+				D_WARN(DF_CONT ": EC aggregate hit non-committed DTX " DF_DTI "\n",
+				       DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid),
+				       DP_DTI(&dsp->dsp_xid));
+				dtx_dsp_free(dsp);
+			}
+
+			cont->sc_ec_agg_busy_ts = now;
+		}
+	}
+
+	dtx_end(dth, cont, rc);
 
 	/* Post_cb may not being executed in some cases */
 	agg_clear_extents(&ec_agg_param->ap_agg_entry);
diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c
index 041ea903c4f..4762e04b898 100644
--- a/src/object/srv_obj.c
+++ b/src/object/srv_obj.c
@@ -2953,8 +2953,11 @@ ds_obj_rw_handler(crt_rpc_t *rpc)
 
 		d_tm_inc_counter(opm->opm_update_resent, 1);
 
-again1:
-		e = 0;
+again:
+		if (flags & ORF_RESEND)
+			e = orw->orw_epoch;
+		else
+			e = 0;
 		rc = dtx_handle_resend(ioc.ioc_vos_coh, &orw->orw_dti,
 				       &e, &version);
 		switch (rc) {
@@ -2965,8 +2968,13 @@ ds_obj_rw_handler(crt_rpc_t *rpc)
 			orw->orw_epoch = e;
 			/* TODO: Also recover the epoch uncertainty. */
 			break;
+		case -DER_MISMATCH:
+			rc = vos_dtx_abort(ioc.ioc_vos_coh, &orw->orw_dti, e);
+			if (rc < 0 && rc != -DER_NONEXIST)
+				D_GOTO(out, rc);
+			/* Fall through */
 		case -DER_NONEXIST:
-			rc = 0;
+			flags = 0;
 			break;
 		default:
 			D_GOTO(out, rc);
@@ -2976,7 +2984,6 @@ ds_obj_rw_handler(crt_rpc_t *rpc)
 		D_GOTO(out, rc);
 	}
 
-again2:
 	/* For leader case, we need to find out the potential conflict
 	 * (or share the same non-committed object/dkey) DTX(s) in the
 	 * CoS (committable) cache, piggyback them via the dispdatched
@@ -3021,7 +3028,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc)
 
 	exec_arg.rpc = rpc;
 	exec_arg.ioc = &ioc;
-	exec_arg.flags = flags;
+	exec_arg.flags |= flags;
 	exec_arg.start = orw->orw_start_shard;
 
 	/* Execute the operation on all targets */
@@ -3036,28 +3043,25 @@ ds_obj_rw_handler(crt_rpc_t *rpc)
 	case -DER_TX_RESTART:
 		/*
 		 * If this is a standalone operation, we can restart the
-		 * internal transaction right here. Otherwise, we have to defer
-		 * the restart to the RPC client.
+		 * internal transaction right here. Otherwise we have to
+		 * defer the restart to the RPC sponsor.
 		 */
-		if (opc == DAOS_OBJ_RPC_UPDATE) {
-			/*
-			 * Only standalone updates use this RPC. Retry with
-			 * newer epoch.
-			 */
-			orw->orw_epoch = d_hlc_get();
-			orw->orw_flags &= ~ORF_RESEND;
-			flags = 0;
-			d_tm_inc_counter(opm->opm_update_restart, 1);
-			goto again2;
-		}
+		if (opc != DAOS_OBJ_RPC_UPDATE)
+			break;
 
-		break;
+		/* Only standalone updates use this RPC. Retry with newer epoch. */
+		orw->orw_epoch = d_hlc_get();
+		exec_arg.flags |= ORF_RESEND;
+		flags = ORF_RESEND;
+		d_tm_inc_counter(opm->opm_update_restart, 1);
+		goto again;
 	case -DER_AGAIN:
-		orw->orw_flags |= ORF_RESEND;
 		need_abort = true;
+		exec_arg.flags |= ORF_RESEND;
+		flags = ORF_RESEND;
 		d_tm_inc_counter(opm->opm_update_retry, 1);
 		ABT_thread_yield();
-		goto again1;
+		goto again;
 	default:
 		break;
 	}
@@ -3875,8 +3879,11 @@ ds_obj_punch_handler(crt_rpc_t *rpc)
 	if (opi->opi_flags & ORF_RESEND) {
 		daos_epoch_t	e;
 
-again1:
-		e = 0;
+again:
+		if (flags & ORF_RESEND)
+			e = opi->opi_epoch;
+		else
+			e = 0;
 		rc = dtx_handle_resend(ioc.ioc_vos_coh, &opi->opi_dti,
 				       &e, &version);
 		switch (rc) {
@@ -3887,8 +3894,13 @@ ds_obj_punch_handler(crt_rpc_t *rpc)
 			flags |= ORF_RESEND;
 			/* TODO: Also recovery the epoch uncertainty. */
 			break;
+		case -DER_MISMATCH:
+			rc = vos_dtx_abort(ioc.ioc_vos_coh, &opi->opi_dti, e);
+			if (rc < 0 && rc != -DER_NONEXIST)
+				D_GOTO(out, rc);
+			/* Fall through */
 		case -DER_NONEXIST:
-			rc = 0;
+			flags = 0;
 			break;
 		default:
 			D_GOTO(out, rc);
@@ -3898,7 +3910,6 @@ ds_obj_punch_handler(crt_rpc_t *rpc)
 		goto cleanup;
 	}
 
-again2:
 	/* For leader case, we need to find out the potential conflict
 	 * (or share the same non-committed object/dkey) DTX(s) in the
 	 * CoS (committable) cache, piggyback them via the dispdatched
@@ -3943,7 +3954,7 @@ ds_obj_punch_handler(crt_rpc_t *rpc)
 
 	exec_arg.rpc = rpc;
 	exec_arg.ioc = &ioc;
-	exec_arg.flags = flags;
+	exec_arg.flags |= flags;
 
 	/* Execute the operation on all shards */
 	if (opi->opi_api_flags & DAOS_COND_PUNCH)
@@ -3959,19 +3970,17 @@ ds_obj_punch_handler(crt_rpc_t *rpc)
 	rc = dtx_leader_end(dlh, ioc.ioc_coh, rc);
 	switch (rc) {
 	case -DER_TX_RESTART:
-		/*
-		 * Only standalone punches use this RPC. Retry with newer
-		 * epoch.
-		 */
+		/* Only standalone punches use this RPC. Retry with newer epoch. */
 		opi->opi_epoch = d_hlc_get();
-		opi->opi_flags &= ~ORF_RESEND;
-		flags = 0;
-		goto again2;
+		exec_arg.flags |= ORF_RESEND;
+		flags = ORF_RESEND;
+		goto again;
 	case -DER_AGAIN:
-		opi->opi_flags |= ORF_RESEND;
 		need_abort = true;
+		exec_arg.flags |= ORF_RESEND;
+		flags = ORF_RESEND;
 		ABT_thread_yield();
-		goto again1;
+		goto again;
 	default:
 		break;
 	}
@@ -5663,8 +5672,11 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc)
 
 	if (ocpi->ocpi_flags & ORF_RESEND) {
 
-again1:
-		tmp = 0;
+again:
+		if (!(ocpi->ocpi_flags & ORF_LEADER) || (flags & ORF_RESEND))
+			tmp = ocpi->ocpi_epoch;
+		else
+			tmp = 0;
 		rc = dtx_handle_resend(ioc.ioc_vos_coh, &ocpi->ocpi_xid, &tmp, &version);
 		switch (rc) {
 		case -DER_ALREADY:
@@ -5674,7 +5686,13 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc)
 			flags |= ORF_RESEND;
 			/* TODO: Also recovery the epoch uncertainty. */
 			break;
+		case -DER_MISMATCH:
+			rc = vos_dtx_abort(ioc.ioc_vos_coh, &ocpi->ocpi_xid, tmp);
+			if (rc < 0 && rc != -DER_NONEXIST)
+				D_GOTO(out, rc);
+			/* Fall through */
 		case -DER_NONEXIST:
+			flags = 0;
 			break;
 		default:
 			D_GOTO(out, rc);
@@ -5683,7 +5701,6 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc)
 		dce->dce_ver = version;
 	}
 
-again2:
 	epoch.oe_value = ocpi->ocpi_epoch;
 	epoch.oe_first = epoch.oe_value;
 	epoch.oe_flags = orf_to_dtx_epoch_flags(ocpi->ocpi_flags);
@@ -5695,7 +5712,7 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc)
 
 	exec_arg.rpc = rpc;
 	exec_arg.ioc = &ioc;
-	exec_arg.flags = flags;
+	exec_arg.flags |= flags;
 	exec_arg.coll_shards = dcts[0].dct_shards;
 	exec_arg.coll_tgts = dcts;
 	obj_coll_disp_init(dct_nr, ocpi->ocpi_max_tgt_sz,
@@ -5728,14 +5745,15 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc)
 	switch (rc) {
 	case -DER_TX_RESTART:
 		ocpi->ocpi_epoch = d_hlc_get();
-		ocpi->ocpi_flags &= ~ORF_RESEND;
-		flags = 0;
-		goto again2;
+		exec_arg.flags |= ORF_RESEND;
+		flags = ORF_RESEND;
+		goto again;
 	case -DER_AGAIN:
-		ocpi->ocpi_flags |= ORF_RESEND;
 		need_abort = true;
+		exec_arg.flags |= ORF_RESEND;
+		flags = ORF_RESEND;
 		ABT_thread_yield();
-		goto again1;
+		goto again;
 	default:
 		break;
 	}
@@ -5755,12 +5773,14 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc)
 		max_ver = version;
 
 	DL_CDEBUG(rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART, DLOG_ERR, DB_IO, rc,
-		  "(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u epc "
-		  DF_X64" pmv %u/%u, with dti "DF_DTI", bulk_tgt_sz %u, bulk_tgt_nr %u, "
-		  "tgt_nr %u, forward width %u, forward depth %u, flags %x",
+		  "(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u in "DF_UUID"/"
+		  DF_UUID"/"DF_UUID" with epc "DF_X64", pmv %u/%u, dti "DF_DTI", bulk_tgt_sz %u, "
+		  "bulk_tgt_nr %u, tgt_nr %u, forward width %u, forward depth %u, flags %x",
 		  (ocpi->ocpi_flags & ORF_LEADER) ? "leader" :
 		  (ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"), rpc,
-		  DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, ocpi->ocpi_epoch,
+		  DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id,
+		  DP_UUID(ocpi->ocpi_po_uuid), DP_UUID(ocpi->ocpi_co_hdl),
+		  DP_UUID(ocpi->ocpi_co_uuid), ocpi->ocpi_epoch,
 		  ocpi->ocpi_map_ver, max_ver, DP_DTI(&ocpi->ocpi_xid), ocpi->ocpi_bulk_tgt_sz,
 		  ocpi->ocpi_bulk_tgt_nr, (unsigned int)ocpi->ocpi_tgts.ca_count,
 		  ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth, ocpi->ocpi_flags);
diff --git a/src/object/srv_obj_remote.c b/src/object/srv_obj_remote.c
index ce06723621b..f64d851e5b4 100644
--- a/src/object/srv_obj_remote.c
+++ b/src/object/srv_obj_remote.c
@@ -136,7 +136,7 @@ ds_obj_remote_update(struct dtx_leader_handle *dlh, void *data, int idx,
 	*orw = *orw_parent;
 
 	orw->orw_oid.id_shard = shard_tgt->st_shard_id;
-	orw->orw_flags |= ORF_BULK_BIND | obj_exec_arg->flags;
+	orw->orw_flags |= (ORF_BULK_BIND | obj_exec_arg->flags) & ~ORF_LEADER;
 	if (shard_tgt->st_flags & DTF_DELAY_FORWARD && dlh->dlh_drop_cond)
 		orw->orw_api_flags &= ~DAOS_COND_MASK;
 	orw->orw_dti_cos.ca_count = dth->dth_dti_cos_count;
@@ -247,7 +247,7 @@ ds_obj_remote_punch(struct dtx_leader_handle *dlh, void *data, int idx,
 	*opi = *opi_parent;
 
 	opi->opi_oid.id_shard = shard_tgt->st_shard_id;
-	opi->opi_flags |= obj_exec_arg->flags;
+	opi->opi_flags |= obj_exec_arg->flags & ~ORF_LEADER;
 	if (shard_tgt->st_flags & DTF_DELAY_FORWARD && dlh->dlh_drop_cond)
 		opi->opi_api_flags &= ~DAOS_COND_PUNCH;
 	opi->opi_dti_cos.ca_count = dth->dth_dti_cos_count;
@@ -495,7 +495,7 @@ ds_obj_coll_punch_remote(struct dtx_leader_handle *dlh, void *data, int idx,
 	crt_endpoint_t			 tgt_ep = { 0 };
 	crt_rpc_t			*parent_req = exec_arg->rpc;
 	crt_rpc_t			*req;
-	struct obj_coll_punch_in	*ocpi_parent;
+	struct obj_coll_punch_in	*ocpi_parent = crt_req_get(parent_req);
 	struct obj_coll_punch_in	*ocpi;
 	int				 tag;
 	int				 rc = 0;
@@ -509,7 +509,7 @@ ds_obj_coll_punch_remote(struct dtx_leader_handle *dlh, void *data, int idx,
 	if (remote_arg == NULL)
 		D_GOTO(out, rc = -DER_NOMEM);
 
-	obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep);
+	obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep, ocpi_parent->ocpi_oid.id_pub);
 	tag = tgt_ep.ep_tag;
 
 	crt_req_addref(parent_req);
@@ -524,9 +524,7 @@ ds_obj_coll_punch_remote(struct dtx_leader_handle *dlh, void *data, int idx,
 		D_GOTO(out, rc);
 	}
 
-	ocpi_parent = crt_req_get(parent_req);
 	ocpi = crt_req_get(req);
-
 	ocpi->ocpi_odm = ocpi_parent->ocpi_odm;
 	uuid_copy(ocpi->ocpi_po_uuid, ocpi_parent->ocpi_po_uuid);
 	uuid_copy(ocpi->ocpi_co_hdl, ocpi_parent->ocpi_co_hdl);
@@ -634,7 +632,7 @@ ds_obj_coll_query_remote(struct dtx_leader_handle *dlh, void *data, int idx,
 	if (remote_arg == NULL)
 		D_GOTO(out, rc = -DER_NOMEM);
 
-	obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep);
+	obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep, ocqi_parent->ocqi_oid.id_pub);
 	tag = tgt_ep.ep_tag;
 
 	remote_arg->dlh = dlh;
diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c
index 0f87bfd4d0e..47e410fc1e7 100644
--- a/src/vos/vos_dtx.c
+++ b/src/vos/vos_dtx.c
@@ -312,16 +312,38 @@ dtx_act_ent_update(struct btr_instance *tins, struct btr_record *rec,
 
 	if (unlikely(!dae_old->dae_aborted)) {
 		/*
-		 * XXX: There are two possible reasons for that:
-		 *
-		 *	1. Client resent the RPC but without set 'RESEND' flag.
-		 *	2. Client reused the DTX ID for different modifications.
-		 *
-		 *	Currently, the 1st case is more suspected.
+		 * If the new entry and the old entry are for the same transaction, then the RPC
+		 * for the new one will take 'RESEND' flag, that will cause the old one has been
+		 * aborted before arriving at here. So it is quite possible that the new one and
+		 * the old one are for different transactions.
 		 */
-		D_ERROR("The TX ID "DF_DTI" may be reused for epoch "DF_X64" vs "DF_X64"\n",
-			DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old), DAE_EPOCH(dae_new));
-		return -DER_TX_ID_REUSED;
+		if (DAE_EPOCH(dae_old) < DAE_EPOCH(dae_new)) {
+			D_ERROR("The TX ID "DF_DTI" may be reused for epoch "DF_X64" vs "DF_X64"\n",
+				DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old), DAE_EPOCH(dae_new));
+			return -DER_TX_ID_REUSED;
+		}
+
+		/*
+		 * If the old entry has higher epoch, it is quite possible that the resent RPC
+		 * was handled before the original RPC (corresponding to 'dae_new'). Returning
+		 * -DER_INPROGRESS to make the RPC sponsor to retry the RPC with 'RESEND' flag,
+		 *  then related RPC handler logic will handle such case.
+		 */
+		if (DAE_EPOCH(dae_old) > DAE_EPOCH(dae_new)) {
+			D_ERROR("Resent RPC may be handled before original one for DTX "DF_DTI
+				" with epoch "DF_X64" vs "DF_X64"\n",
+				DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old), DAE_EPOCH(dae_new));
+			return -DER_INPROGRESS;
+		}
+
+		/*
+		 * The two entries uses the same epoch, then it may be caused by repeated RPCs
+		 * from different sources, such as multiple relay engines forward the same RPC
+		 * to current target. We need to notify related caller for such buggy case.
+		 */
+		D_ERROR("Receive repeated DTX "DF_DTI" with epoch "DF_X64"\n",
+			DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old));
+		return -DER_MISC;
 	}
 
 	rec->rec_off = umem_ptr2off(&tins->ti_umm, dae_new);
@@ -1172,16 +1194,20 @@ vos_dtx_check_availability(daos_handle_t coh, uint32_t entry,
 	}
 
 	if (intent == DAOS_INTENT_PURGE) {
-		uint32_t	age = d_hlc_age2sec(DAE_XID(dae).dti_hlc);
+		uint64_t	now = daos_gettime_coarse();
 
 		/*
 		 * The DTX entry still references related data record,
 		 * then we cannot (vos) aggregate related data record.
+		 * Report warning per each 10 seconds to avoid log flood.
 		 */
-		if (age >= DAOS_AGG_THRESHOLD)
-			D_WARN("DTX "DF_DTI" (state:%u, age:%u) still references the data, "
-			       "cannot be (VOS) aggregated\n",
-			       DP_DTI(&DAE_XID(dae)), vos_dtx_status(dae), age);
+		if (now - cont->vc_agg_busy_ts > 10) {
+			D_WARN("DTX "DF_DTI" (state:%u, flags:%x, age:%u) still references "
+			       "the modification, cannot be (VOS) aggregated\n",
+			       DP_DTI(&DAE_XID(dae)), vos_dtx_status(dae), DAE_FLAGS(dae),
+			       (unsigned int)d_hlc_age2sec(DAE_XID(dae).dti_hlc));
+			cont->vc_agg_busy_ts = now;
+		}
 
 		return ALB_AVAILABLE_DIRTY;
 	}
@@ -1909,8 +1935,13 @@ vos_dtx_check(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch,
 			daos_epoch_t	e = *epoch;
 
 			*epoch = DAE_EPOCH(dae);
-			if (e != 0 && e != DAE_EPOCH(dae))
-				return -DER_MISMATCH;
+			if (e != 0) {
+				if (e > DAE_EPOCH(dae))
+					return -DER_MISMATCH;
+
+				if (e < DAE_EPOCH(dae))
+					return -DER_TX_RESTART;
+			}
 		}
 
 		return vos_dae_is_prepare(dae) ? DTX_ST_PREPARED : DTX_ST_INITED;
diff --git a/src/vos/vos_internal.h b/src/vos/vos_internal.h
index 7d4dd3ac166..5aa0a1fadbb 100644
--- a/src/vos/vos_internal.h
+++ b/src/vos/vos_internal.h
@@ -353,6 +353,8 @@ struct vos_container {
 	daos_epoch_range_t	vc_epr_aggregation;
 	/* Current ongoing discard EPR */
 	daos_epoch_range_t	vc_epr_discard;
+	/* Last timestamp when VOS aggregation reports -DER_TX_BUSY */
+	uint64_t		vc_agg_busy_ts;
 	/* Last timestamp when VOS aggregation reporting ENOSPACE */
 	uint64_t		vc_agg_nospc_ts;
 	/* Last timestamp when IO reporting ENOSPACE */

From cf2a8b9571ea02cccfed6f96112d1582bd44e8c9 Mon Sep 17 00:00:00 2001
From: wiliamhuang <lei.huang@intel.com>
Date: Tue, 5 Nov 2024 20:12:12 -0600
Subject: [PATCH 32/34] DAOS-16365 client: intercept dlsym() and zeInit() to
 avoid nested call (#14932) (#15446)

libfabric loads libze_loader.so which calls zeInit(). We observed deadlock due to nested calls when daos_init() is called inside zeInit(). We intercept dlsym() and zeInit() to avoid calling daos_init() inside zeInit(). dlsym(RTLD_NEXT, ) checks returning address to determine caller's module. To maintain expected behavior of dlsym(RTLD_NEXT, ) with our interception, new_dlsym() is implemented with assembly code to use jmp instruction instead of call. dlsym() has been moved from libdl.so to libc.so since version 2.34.

Signed-off-by: Lei Huang <lei.huang@intel.com>
---
 src/client/dfuse/pil4dfs/hook.c        |  49 +++++--
 src/client/dfuse/pil4dfs/hook.h        |   6 +
 src/client/dfuse/pil4dfs/int_dfs.c     | 189 ++++++++++++++++++++++++-
 src/client/dfuse/pil4dfs/pil4dfs_int.h |   2 +-
 4 files changed, 226 insertions(+), 20 deletions(-)

diff --git a/src/client/dfuse/pil4dfs/hook.c b/src/client/dfuse/pil4dfs/hook.c
index 0ec1a0b5374..c30061387c1 100644
--- a/src/client/dfuse/pil4dfs/hook.c
+++ b/src/client/dfuse/pil4dfs/hook.c
@@ -42,6 +42,7 @@
 #include <linux/limits.h>
 #include <capstone/capstone.h>
 #include <gurt/common.h>
+#include <gnu/libc-version.h>
 
 #include "hook.h"
 #include "hook_int.h"
@@ -89,10 +90,15 @@ static uint64_t lib_base_addr[MAX_NUM_LIB];
 /* List of names of loaded libraries */
 static char   **lib_name_list;
 
+/* libc version number in current process. e.g., 2.28 */
+static float    libc_version;
+static char    *libc_version_str;
+
 /* end   to compile list of memory blocks in /proc/pid/maps */
 
 static char    *path_ld;
 static char    *path_libc;
+static char    *path_libdl;
 static char    *path_libpthread;
 /* This holds the path of libpil4dfs.so. It is needed when we want to
  * force child processes append libpil4dfs.so to env LD_PRELOAD. */
@@ -212,7 +218,7 @@ determine_lib_path(void)
 {
 	int   path_offset   = 0, read_size, i, rc;
 	char *read_buff_map = NULL;
-	char *pos, *start, *end, lib_ver_str[32] = "", *lib_dir_str = NULL;
+	char *pos, *start, *end, *lib_dir_str = NULL;
 
 	read_size = read_map_file(&read_buff_map);
 
@@ -289,19 +295,17 @@ determine_lib_path(void)
 		goto err;
 	path_libc[end - start] = 0;
 
-	pos = strstr(path_libc, "libc-2.");
-	if (pos) {
-		/* containing version in name. example, 2.17 */
-		memcpy(lib_ver_str, pos + 5, 4);
-		lib_ver_str[4] = 0;
+	if (libc_version_str == NULL) {
+		libc_version_str = (char *)gnu_get_libc_version();
+		if (libc_version_str == NULL) {
+			DS_ERROR(errno, "Failed to determine libc version");
+			goto err;
+		}
+		libc_version = atof(libc_version_str);
 	}
 
-	if (lib_ver_str[0]) {
-		/* with version in name */
-		rc = asprintf(&path_libpthread, "%s/libpthread-%s.so", lib_dir_str, lib_ver_str);
-	} else {
-		rc = asprintf(&path_libpthread, "%s/libpthread.so.0", lib_dir_str);
-	}
+	/* with version in name */
+	rc = asprintf(&path_libpthread, "%s/libpthread-%s.so", lib_dir_str, libc_version_str);
 	if (rc < 0) {
 		DS_ERROR(ENOMEM, "Failed to allocate memory for path_libpthread");
 		goto err_1;
@@ -311,7 +315,18 @@ determine_lib_path(void)
 		path_libpthread = NULL;
 		DS_ERROR(ENAMETOOLONG, "path_libpthread is too long");
 		goto err_1;
-	}	
+	}
+	rc = asprintf(&path_libdl, "%s/libdl-%s.so", lib_dir_str, libc_version_str);
+	if (rc < 0) {
+		DS_ERROR(ENOMEM, "Failed to allocate memory for path_libdl");
+		goto err_1;
+	}
+	if (rc >= PATH_MAX) {
+		free(path_libdl);
+		path_libdl = NULL;
+		DS_ERROR(ENAMETOOLONG, "path_libdl is too long");
+		goto err_1;
+	}
 	D_FREE(lib_dir_str);
 
 	pos = strstr(read_buff_map, "libpil4dfs.so");
@@ -348,6 +363,11 @@ query_pil4dfs_path(void)
 	return path_libpil4dfs;
 }
 
+float
+query_libc_version(void)
+{
+	return libc_version;
+}
 
 /*
  * query_func_addr - Determine the addresses and code sizes of functions in func_name_list[].
@@ -754,6 +774,7 @@ free_memory_in_hook(void)
 	D_FREE(path_ld);
 	D_FREE(path_libc);
 	D_FREE(module_list);
+	free(path_libdl);
 	free(path_libpthread);
 
 	if (lib_name_list) {
@@ -1034,6 +1055,8 @@ register_a_hook(const char *module_name, const char *func_name, const void *new_
 		module_name_local = path_ld;
 	else if (strncmp(module_name, "libc", 5) == 0)
 		module_name_local = path_libc;
+	else if (strncmp(module_name, "libdl", 6) == 0)
+		module_name_local = path_libdl;
 	else if (strncmp(module_name, "libpthread", 11) == 0)
 		module_name_local = path_libpthread;
 	else
diff --git a/src/client/dfuse/pil4dfs/hook.h b/src/client/dfuse/pil4dfs/hook.h
index 7742faaff53..b686d99ce4e 100644
--- a/src/client/dfuse/pil4dfs/hook.h
+++ b/src/client/dfuse/pil4dfs/hook.h
@@ -60,4 +60,10 @@ free_memory_in_hook(void);
 char *
 query_pil4dfs_path(void);
 
+/**
+ * return glibc version in current process
+ */
+float
+query_libc_version(void);
+
 #endif
diff --git a/src/client/dfuse/pil4dfs/int_dfs.c b/src/client/dfuse/pil4dfs/int_dfs.c
index b69449f1e3d..84567ac5f2d 100644
--- a/src/client/dfuse/pil4dfs/int_dfs.c
+++ b/src/client/dfuse/pil4dfs/int_dfs.c
@@ -161,6 +161,7 @@ static long int         page_size;
 #define DAOS_INIT_RUNNING     1
 
 static _Atomic uint64_t mpi_init_count;
+static _Atomic int64_t  zeInit_count;
 
 static long int         daos_initing;
 _Atomic bool            d_daos_inited;
@@ -471,6 +472,9 @@ static int (*next_tcgetattr)(int fd, void *termios_p);
 
 static int (*next_mpi_init)(int *argc, char ***argv);
 static int (*next_pmpi_init)(int *argc, char ***argv);
+static int (*next_ze_init)(int flags);
+static void *(*next_dlsym)(void *handle, const char *symbol);
+static void *(*new_dlsym)(void *handle, const char *symbol);
 
 /* to do!! */
 /**
@@ -1058,6 +1062,143 @@ PMPI_Init(int *argc, char ***argv)
 	return rc;
 }
 
+int
+zeInit(int flags)
+{
+	int rc;
+
+	if (next_ze_init == NULL) {
+		if (d_hook_enabled)
+			next_ze_init = next_dlsym(RTLD_NEXT, "zeInit");
+		else
+			next_ze_init = dlsym(RTLD_NEXT, "zeInit");
+	}
+	D_ASSERT(next_ze_init != NULL);
+	atomic_fetch_add_relaxed(&zeInit_count, 1);
+	rc = next_ze_init(flags);
+	atomic_fetch_add_relaxed(&zeInit_count, -1);
+	return rc;
+}
+
+#if defined(__x86_64__)
+/* This is used to work around compiling warning and limitations of using asm function. */
+static void *
+query_new_dlsym_addr(void *addr)
+{
+	int i;
+
+	/* assume little endian */
+	for (i = 0; i < 64; i++) {
+		/* 0x56579090 is corresponding to the first four instructions at new_dlsym_asm.
+		 * 0x90 - nop, 0x90 - nop, 0x57 - push %rdi, 0x56 - push %rsi
+		 */
+		if (*((int *)(addr + i)) == 0x56579090) {
+			/* two nop are added for easier positioning. offset +2 here to skip two
+			 * nop and start from the real entry.
+			 */
+			return ((void *)(addr + i + 2));
+		}
+	}
+	return NULL;
+}
+
+_Pragma("GCC diagnostic push")
+_Pragma("GCC diagnostic ignored \"-Wunused-function\"")
+_Pragma("GCC diagnostic ignored \"-Wunused-variable\"")
+
+_Pragma("GCC push_options")
+_Pragma("GCC optimize(\"-O0\")")
+static char str_zeinit[] = "zeInit";
+
+static int
+is_hook_enabled(void)
+{
+	return (d_hook_enabled ? (1) : (0));
+}
+
+/* This wrapper function is introduced to avoid compiling issue with Intel-C on Leap 15.5 */
+static int
+my_strcmp(const char *s1, const char *s2)
+{
+	return strcmp(s1, s2);
+}
+
+static void *
+get_zeinit_addr(void)
+{
+	return (void *)zeInit;
+}
+
+__attribute__((aligned(16))) static void
+new_dlsym_marker(void)
+{
+}
+
+__asm__(
+	"new_dlsym_asm:\n"
+	"nop\n"
+	"nop\n"
+	"push %rdi\n"
+	"push %rsi\n"
+
+	"call is_hook_enabled\n"
+	"test %eax,%eax\n"
+	"je org_dlsym\n"
+
+	"mov %rsi, %rdi\n"
+	"lea str_zeinit(%rip), %rsi\n"
+	"call my_strcmp\n"
+	"test %eax,%eax\n"
+	"jne org_dlsym\n"
+
+	"pop %rsi\n"
+	"pop %rdi\n"
+	"call *next_dlsym(%rip)\n"
+	"mov %rax, next_ze_init(%rip)\n"
+
+	"test %eax,%eax\n"
+	"jne found\n"
+	"ret\n"
+
+	"found:\n"
+	"call get_zeinit_addr\n"
+	"ret\n"
+
+	"org_dlsym:\n"
+	"pop %rsi\n"
+	"pop %rdi\n"
+	"jmp *next_dlsym(%rip)\n"
+);
+_Pragma("GCC pop_options")
+_Pragma("GCC diagnostic pop")
+
+#else
+/* c code for other architecture. caller info could be wrong inside libc dlsym() when handle is set
+ * RTLD_NEXT. Assembly version implementation similar to above is needed to fix the issue by using
+ * jump instead of call instruction. 
+ */
+static void *
+new_dlsym_c(void *handle, const char *symbol)
+{
+	if (!d_hook_enabled)
+		goto org_dlsym;
+	printf("Inside my dlsym().\n");
+	if (strcmp(symbol, "zeInit") != 0)
+		goto org_dlsym;
+
+	next_ze_init = next_dlsym(handle, symbol);
+	if (next_ze_init)
+		/* dlsym() finished successfully, then intercept zeInit() */
+		return zeInit;
+	else
+		return next_ze_init;
+
+org_dlsym:
+	/* Ideally we need to adjust stack and jump to next_dlsym(). */
+	return next_dlsym(handle, symbol);
+}
+#endif
+
 /** determine whether a path (both relative and absolute) is on DAOS or not. If yes,
  *  returns parent object, item name, full path of parent dir, full absolute path, and
  *  the pointer to struct dfs_mt.
@@ -1164,6 +1305,15 @@ query_path(const char *szInput, int *is_target_path, struct dcache_rec **parent,
 				goto out_normal;
 			}
 
+			/* Check whether zeInit() is running. If yes, pass to the original
+			 * libc functions. Avoid possible zeInit reentrancy/nested call.
+			 */
+
+			if (atomic_load_relaxed(&zeInit_count) > 0) {
+				*is_target_path = 0;
+				goto out_normal;
+			}
+
 			/* daos_init() is expensive to call. We call it only when necessary. */
 
 			/* Check whether daos_init() is running. If yes, pass to the original
@@ -2034,6 +2184,7 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char
 
 	if (!is_target_path)
 		goto org_func;
+	atomic_fetch_add_relaxed(&num_open, 1);
 	if (oflags & O_CREAT && (oflags & O_DIRECTORY || oflags & O_PATH)) {
 		/* Create a dir is not supported. */
 		errno = ENOENT;
@@ -2061,7 +2212,6 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char
 		}
 
 		/* Need to create a fake fd and associate with fd_kernel */
-		atomic_fetch_add_relaxed(&num_open, 1);
 		dfs_get_mode(dfs_obj, &mode_query);
 
 		/* regular file */
@@ -2237,7 +2387,6 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char
 
 		return (idx_dirfd + FD_DIR_BASE);
 	}
-	atomic_fetch_add_relaxed(&num_open, 1);
 
 	rc = find_next_available_fd(NULL, &idx_fd);
 	if (rc)
@@ -6014,7 +6163,7 @@ ioctl(int fd, unsigned long request, ...)
 	va_list                         arg;
 	void                           *param;
 	struct dfuse_user_reply        *reply;
-	int                             fd_directed;
+	int                             fd_directed = fd;
 
 	va_start(arg, request);
 	param = va_arg(arg, void *);
@@ -6040,12 +6189,11 @@ ioctl(int fd, unsigned long request, ...)
 		return next_ioctl(fd, request, param);
 
 	fd_directed = d_get_fd_redirected(fd);
-	if (fd_directed < FD_FILE_BASE)
+	if ((fd_directed < FD_FILE_BASE) || (fd_directed >= (FD_DIR_BASE + MAX_OPENED_DIR)))
 		return next_ioctl(fd, request, param);
 
 	errno = ENOTSUP;
-
-	return -1;
+	return (-1);
 }
 
 int
@@ -6718,6 +6866,18 @@ check_exe_sh_bash(void)
 	return;
 }
 
+#define SMALL_DIFF (0.0001)
+static int
+libc_ver_cmp(float ver_a, float ver_b)
+{
+	if ((ver_a + SMALL_DIFF) < ver_b)
+		return (-1);
+	else if (ver_a > (ver_b + SMALL_DIFF))
+		return (1);
+	else
+		return (0);
+}
+
 static __attribute__((constructor)) void
 init_myhook(void)
 {
@@ -6725,6 +6885,7 @@ init_myhook(void)
 	char    *env_log;
 	int      rc;
 	uint64_t eq_count_loc = 0;
+	float    libc_version;
 
 	umask_old = umask(0);
 	umask(umask_old);
@@ -6873,6 +7034,18 @@ init_myhook(void)
 	register_a_hook("libc", "exit", (void *)new_exit, (long int *)(&next_exit));
 	register_a_hook("libc", "dup3", (void *)new_dup3, (long int *)(&libc_dup3));
 
+#if defined(__x86_64__)
+	new_dlsym = query_new_dlsym_addr(new_dlsym_marker);
+#else
+	new_dlsym = new_dlsym_c;
+#endif
+	D_ASSERT(new_dlsym != NULL);
+	libc_version = query_libc_version();
+	if (libc_ver_cmp(libc_version, 2.34) < 0)
+		register_a_hook("libdl", "dlsym", (void *)new_dlsym, (long int *)(&next_dlsym));
+	else
+		register_a_hook("libc", "dlsym", (void *)new_dlsym, (long int *)(&next_dlsym));
+
 	init_fd_dup2_list();
 
 	/* Need to check whether current process is bash or not under regular & compatible modes.*/
@@ -6884,6 +7057,10 @@ init_myhook(void)
 		dcache_rec_timeout = 0;
 
 	install_hook();
+
+	/* Check it here to minimize the work in function new_dlsym() written in assembly */
+	D_ASSERT(next_dlsym != NULL);
+
 	d_hook_enabled   = 1;
 	hook_enabled_bak = d_hook_enabled;
 }
diff --git a/src/client/dfuse/pil4dfs/pil4dfs_int.h b/src/client/dfuse/pil4dfs/pil4dfs_int.h
index a9c54b55555..0693123b51f 100644
--- a/src/client/dfuse/pil4dfs/pil4dfs_int.h
+++ b/src/client/dfuse/pil4dfs/pil4dfs_int.h
@@ -30,7 +30,7 @@
 /* FD_FILE_BASE - The base number of the file descriptor for a directory.
  * The fd allocate from this lib is always larger than FD_FILE_BASE.
  */
-#define FD_DIR_BASE     (0x40000000)
+#define FD_DIR_BASE     (FD_FILE_BASE + MAX_OPENED_FILE)
 
 /* structure allocated for a FD for a file */
 struct file_obj {

From d1af13a52492f72474d5249a45cbd123b3d05da9 Mon Sep 17 00:00:00 2001
From: Jerome Soumagne <jerome.soumagne@intel.com>
Date: Tue, 5 Nov 2024 22:18:28 -0600
Subject: [PATCH 33/34] DAOS-16752 build: update mercury to 2.4.0 (#15443)

Signed-off-by: Jerome Soumagne <jerome.soumagne@intel.com>
---
 utils/build.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/build.config b/utils/build.config
index b7f86add782..a14ad039c15 100644
--- a/utils/build.config
+++ b/utils/build.config
@@ -9,7 +9,7 @@ isal=v2.30.0
 isal_crypto=v2.23.0
 spdk=v22.01.2
 ofi=v1.22.0
-mercury=v2.4.0rc5
+mercury=v2.4.0
 protobufc=v1.3.3
 ucx=v1.14.1
 

From 12bd641a222537cc4baa7c4d0c76028a7e9f4942 Mon Sep 17 00:00:00 2001
From: Phil Henderson <phillip.henderson@intel.com>
Date: Wed, 6 Nov 2024 00:54:21 -0500
Subject: [PATCH 34/34] DAOS-16784 build: Tag 2.6.2 tb1 (#15455)

Tag first test build for 2.6.2.

Signed-off-by: Phil Henderson <phillip.henderson@intel.com>
---
 TAG                  | 2 +-
 VERSION              | 2 +-
 debian/changelog     | 6 ++++++
 utils/rpms/daos.spec | 7 +++++--
 4 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/TAG b/TAG
index 47d92ef13d9..045cac3ed40 100644
--- a/TAG
+++ b/TAG
@@ -1 +1 @@
-2.6.1-rc3
+2.6.2-tb1
diff --git a/VERSION b/VERSION
index 6a6a3d8e35c..097a15a2af3 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.6.1
+2.6.2
diff --git a/debian/changelog b/debian/changelog
index 9790dcbee3e..4a48f35bab2 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+daos (2.6.2-1) unstable; urgency=medium
+  [ Phillip Henderson ]
+  * First test build for 2.6.2
+
+ -- Phillip Henderson <phillip.henderson@intel.com>  Tue, 05 Nov 2024 23:25:00 -0500
+
 daos (2.6.1-4) unstable; urgency=medium
   [ Tomasz Gromadzki ]
   * Add support of the PMDK package 2.1.0 with NDCTL enabled.
diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec
index 29576bc71a1..3d9ffa6b9fc 100644
--- a/utils/rpms/daos.spec
+++ b/utils/rpms/daos.spec
@@ -14,8 +14,8 @@
 %endif
 
 Name:          daos
-Version:       2.6.1
-Release:       4%{?relval}%{?dist}
+Version:       2.6.2
+Release:       1%{?relval}%{?dist}
 Summary:       DAOS Storage Engine
 
 License:       BSD-2-Clause-Patent
@@ -599,6 +599,9 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent
 
 %changelog
 
+* Tue Nov 05 2024 Phillip Henderson <phillip.henderson@intel.com> 2.6.2-1
+- First test build for 2.6.2
+
 * Wed Oct 02 2024 Tomasz Gromadzki <tomasz.gromadzki@intel.com> 2.6.1-4
 - Add support of the PMDK package 2.1.0 with NDCTL enabled.
   * Increase the default ULT stack size to 20KiB if the engine uses