From 76cfb41b852f94859a32caaa7b3b379f069d0881 Mon Sep 17 00:00:00 2001 From: wiliamhuang Date: Tue, 15 Oct 2024 10:18:18 -0500 Subject: [PATCH 01/34] DAOS-16556 client: call fstat() before mmap() to update file status in kernel (#15304) Signed-off-by: Lei Huang --- src/client/dfuse/pil4dfs/int_dfs.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/client/dfuse/pil4dfs/int_dfs.c b/src/client/dfuse/pil4dfs/int_dfs.c index fff49d9f0ae..3087cb09c75 100644 --- a/src/client/dfuse/pil4dfs/int_dfs.c +++ b/src/client/dfuse/pil4dfs/int_dfs.c @@ -6224,6 +6224,14 @@ new_mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) atomic_fetch_add_relaxed(&num_mmap, 1); + if ((fd < FD_FILE_BASE) && (fd_directed >= FD_FILE_BASE) && d_compatible_mode) { + /* DAOS-14494: Force the kernel to update the size before mapping. */ + rc = next_fxstat(1, fd, &stat_buf); + if (rc == -1) + return MAP_FAILED; + return next_mmap(addr, length, prot, flags, fd, offset); + } + addr_ret = next_mmap(addr, length, prot, flags | MAP_ANONYMOUS, -1, offset); if (addr_ret == MAP_FAILED) return MAP_FAILED; @@ -6842,10 +6850,8 @@ init_myhook(void) register_a_hook("libc", "fcntl", (void *)new_fcntl, (long int *)(&libc_fcntl)); - if (d_compatible_mode == false) { - register_a_hook("libc", "mmap", (void *)new_mmap, (long int *)(&next_mmap)); - register_a_hook("libc", "munmap", (void *)new_munmap, (long int *)(&next_munmap)); - } + register_a_hook("libc", "mmap", (void *)new_mmap, (long int *)(&next_mmap)); + register_a_hook("libc", "munmap", (void *)new_munmap, (long int *)(&next_munmap)); register_a_hook("libc", "exit", (void *)new_exit, (long int *)(&next_exit)); register_a_hook("libc", "dup3", (void *)new_dup3, (long int *)(&libc_dup3)); From b4eb68912b13345905efc98bdf2ed9b1c948113d Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Wed, 16 Oct 2024 09:40:29 +0900 Subject: [PATCH 02/34] =?UTF-8?q?DAOS-16446=20test:=20HDF5-VOL=20test=20-?= =?UTF-8?q?=20Set=20object=20class=20and=20container=20prope=E2=80=A6=20(#?= =?UTF-8?q?15004)=20(#15098)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In HDF5, DFS, MPIIO, or POSIX, object class and container properties are defined during the container create. If it’s DFS, object class is also set to the IOR parameter. However, in HDF5-VOL, object class and container properties are defined with the following environment variables of mpirun. HDF5_DAOS_OBJ_CLASS (Object class) HDF5_DAOS_FILE_PROP (Container properties) The infrastructure to set these variables are already there in run_ior_with_pool(). In file_count_test_base.py, pass in the env vars to run_ior_with_pool(env=env) as a dictionary. Object class is the oclass variable. Container properties can be obtained from self.container.properties.value. This fix is discussed in PR #14964. Signed-off-by: Makito Kano --- src/tests/ftest/util/file_count_test_base.py | 31 ++++++++++++++------ 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/src/tests/ftest/util/file_count_test_base.py b/src/tests/ftest/util/file_count_test_base.py index 12c66d76b8c..f95e22bd4bd 100644 --- a/src/tests/ftest/util/file_count_test_base.py +++ b/src/tests/ftest/util/file_count_test_base.py @@ -17,15 +17,15 @@ class FileCountTestBase(IorTestBase, MdtestBase): :avocado: recursive """ - def add_containers(self, file_oclass=None, dir_oclass=None): - """Create a list of containers that the various jobs use for storage. + def get_file_write_container(self, file_oclass=None, dir_oclass=None): + """Create a container, set oclass, dir_oclass, and add rd_fac property based on oclass. Args: - file_oclass (str, optional): file object class of container. - Defaults to None. - dir_oclass (str, optional): dir object class of container. - Defaults to None. + file_oclass (str, optional): file object class of container. Defaults to None. + dir_oclass (str, optional): dir object class of container. Defaults to None. + Returns: + TestContainer: Created container with oclass, dir_oclass, and rd_fac set. """ # Create a container and add it to the overall list of containers @@ -92,7 +92,7 @@ def run_file_count(self): rd_fac = extract_redundancy_factor(oclass) dir_oclass = self.get_diroclass(rd_fac) self.mdtest_cmd.dfs_dir_oclass.update(dir_oclass) - self.container = self.add_containers(oclass, dir_oclass) + self.container = self.get_file_write_container(oclass, dir_oclass) try: self.processes = mdtest_np self.ppn = mdtest_ppn @@ -111,14 +111,27 @@ def run_file_count(self): # run ior self.log.info("=======>>>Starting IOR with %s and %s", api, oclass) self.ior_cmd.dfs_oclass.update(oclass) - self.container = self.add_containers(oclass) + self.container = self.get_file_write_container(oclass) self.update_ior_cmd_with_pool(False) try: self.processes = ior_np self.ppn = ior_ppn if api == 'HDF5-VOL': + # Format the container properties so that it works with HDF5-VOL env var. + # Each entry:value pair needs to be separated by a semicolon. Since we're + # using this in the mpirun command, semicolon would indicate the end of the + # command, so quote the whole thing. + cont_props = self.container.properties.value + cont_props_hdf5_vol = '"' + cont_props.replace(",", ";") + '"' + self.log.info("cont_props_hdf5_vol = %s", cont_props_hdf5_vol) + env = self.ior_cmd.env.copy() + env.update({ + "HDF5_DAOS_OBJ_CLASS": oclass, + "HDF5_DAOS_FILE_PROP": cont_props_hdf5_vol + }) self.ior_cmd.api.update('HDF5') - self.run_ior_with_pool(create_pool=False, plugin_path=hdf5_plugin_path) + self.run_ior_with_pool( + create_pool=False, plugin_path=hdf5_plugin_path, env=env) elif self.ior_cmd.api.value == 'POSIX': self.run_ior_with_pool(create_pool=False, intercept=intercept) else: From 6e16c8e21bcc1cb5b29c62aca45e309201c0d9ca Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Wed, 16 Oct 2024 02:43:26 +0200 Subject: [PATCH 03/34] DAOS-16673 common: ignore Hadoop 3.4.0 related CVE (#15320) Hadoope 3.4.0 has resolved a few CVE issues but introduces new + enable Trivy scans on release branch + enable on demand scan and scan on final PR merge. Signed-off-by: Tomasz Gromadzki --- .github/workflows/trivy.yml | 70 ++++++++++ utils/trivy/.trivyignore | 27 ++++ utils/trivy/csv.tpl | 29 +++++ utils/trivy/trivy.yaml | 249 ++++++++++++++++++++++++++++++++++++ 4 files changed, 375 insertions(+) create mode 100644 .github/workflows/trivy.yml create mode 100644 utils/trivy/.trivyignore create mode 100644 utils/trivy/csv.tpl create mode 100644 utils/trivy/trivy.yaml diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml new file mode 100644 index 00000000000..8f5524d4513 --- /dev/null +++ b/.github/workflows/trivy.yml @@ -0,0 +1,70 @@ +name: Trivy scan + +on: + workflow_dispatch: + push: + branches: ["master", "release/**"] + pull_request: + branches: ["master", "release/**"] + +# Declare default permissions as nothing. +permissions: {} + +jobs: + build: + name: Build + runs-on: ubuntu-20.04 + steps: + - name: Checkout code + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + + - name: Run Trivy vulnerability scanner in repo mode + uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8 # 0.24.0 + with: + scan-type: 'fs' + scan-ref: '.' + trivy-config: 'utils/trivy/trivy.yaml' + + - name: Prepare the report to be uploaded to the GitHub artifact store + run: | + mkdir report + cp trivy-report-daos.txt report + cp utils/trivy/.trivyignore report/trivyignore.txt + + - name: Upload the report to the GitHub artifact store + uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 + with: + path: report/* + name: trivy-report-daos + + - name: Adjust config file to use sarif format + run: | + sed -i 's/output: "trivy-report-daos.txt"/output: "trivy-results.sarif"/g' \ + utils/trivy/trivy.yaml + sed -i 's/format: template/format: sarif/g' utils/trivy/trivy.yaml + + - name: Run Trivy vulnerability scanner in repo mode + uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8 # 0.24.0 + with: + scan-type: 'fs' + scan-ref: '.' + trivy-config: 'utils/trivy/trivy.yaml' + + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@afb54ba388a7dca6ecae48f608c4ff05ff4cc77a + # 3.25.15 (v3) + with: + sarif_file: 'trivy-results.sarif' + + - name: Adjust config file to show and validate scan results + run: | + sed -i 's/output: "trivy-results.sarif"//g' utils/trivy/trivy.yaml + sed -i 's/format: sarif/format: table/g' utils/trivy/trivy.yaml + sed -i 's/exit-code: 0/exit-code: 1/g' utils/trivy/trivy.yaml + + - name: Run Trivy vulnerability scanner in repo mode + uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8 # 0.24.0 + with: + scan-type: 'fs' + scan-ref: '.' + trivy-config: 'utils/trivy/trivy.yaml' diff --git a/utils/trivy/.trivyignore b/utils/trivy/.trivyignore new file mode 100644 index 00000000000..3a3b4cff1ce --- /dev/null +++ b/utils/trivy/.trivyignore @@ -0,0 +1,27 @@ +## Ignored hadoop 3.3.6 related CVE +## CVE-2023-52428,MEDIUM,,"Denial of Service in Connect2id Nimbus JOSE+JWT","com.nimbusds:nimbus-jose-jwt","9.8.1","9.37.2",https://avd.aquasec.com/nvd/cve-2023-52428 +CVE-2023-52428 +## CVE-2023-39410,HIGH,7.5,"apache-avro: Apache Avro Java SDK: Memory when deserializing untrusted data in Avro Java SDK","org.apache.avro:avro","1.7.7","1.11.3",https://avd.aquasec.com/nvd/cve-2023-39410 +CVE-2023-39410 +## CVE-2024-25710,HIGH,5.5,"commons-compress: Denial of service caused by an infinite loop for a corrupted DUMP file","org.apache.commons:commons-compress","1.21","1.26.0",https://avd.aquasec.com/nvd/cve-2024-25710 +CVE-2024-25710 +## CVE-2024-26308,HIGH,5.5,"commons-compress: OutOfMemoryError unpacking broken Pack200 file","org.apache.commons:commons-compress","1.21","1.26.0",https://avd.aquasec.com/nvd/cve-2024-26308 +CVE-2024-26308 +## CVE-2024-29131,MEDIUM,,"commons-configuration: StackOverflowError adding property in AbstractListDelimiterHandler.flattenIterator()","org.apache.commons:commons-configuration2","2.8.0","2.10.1",https://avd.aquasec.com/nvd/cve-2024-29131 +CVE-2024-29131 +## CVE-2024-29133,MEDIUM,,"commons-configuration: StackOverflowError calling ListDelimiterHandler.flatten(Object, int) with a cyclical object tree","org.apache.commons:commons-configuration2","2.8.0","2.10.1",https://avd.aquasec.com/nvd/cve-2024-29133 +CVE-2024-29133 +## CVE-2024-25638,HIGH,,"dnsjava: Improper response validation allowing DNSSEC bypass","dnsjava:dnsjava","2.1.7","3.6.0",https://avd.aquasec.com/nvd/cve-2024-25638 +CVE-2024-25638 + +## Ignored hadoop 3.4.0 related CVE +## CVE-2024-47561,CRITICAL,,"apache-avro: Schema parsing may trigger Remote Code Execution (RCE)","org.apache.avro:avro","1.9.2","1.11.4",https://avd.aquasec.com/nvd/cve-2024-47561 +CVE-2024-47561 +## CVE-2023-33201,MEDIUM,5.3,"bouncycastle: potential blind LDAP injection attack using a self-signed certificate","org.bouncycastle:bcprov-jdk15on","1.70","",https://avd.aquasec.com/nvd/cve-2023-33201 +CVE-2023-33201 +## CVE-2024-29857,MEDIUM,,"org.bouncycastle: Importing an EC certificate with crafted F2m parameters may lead to Denial of Service","org.bouncycastle:bcprov-jdk15on","1.70","1.78",https://avd.aquasec.com/nvd/cve-2024-29857 +CVE-2024-29857 +## CVE-2024-30171,MEDIUM,,"bc-java: BouncyCastle vulnerable to a timing variant of Bleichenbacher (Marvin Attack)","org.bouncycastle:bcprov-jdk15on","1.70","1.78",https://avd.aquasec.com/nvd/cve-2024-30171 +CVE-2024-30171 +## CVE-2024-30172,MEDIUM,,"org.bouncycastle:bcprov-jdk18on: Infinite loop in ED25519 verification in the ScalarUtil class","org.bouncycastle:bcprov-jdk15on","1.70","1.78",https://avd.aquasec.com/nvd/cve-2024-30172 +CVE-2024-30172 diff --git a/utils/trivy/csv.tpl b/utils/trivy/csv.tpl new file mode 100644 index 00000000000..d3bcafcef5e --- /dev/null +++ b/utils/trivy/csv.tpl @@ -0,0 +1,29 @@ +{{ range . }} +Trivy Vulnerability Scan Results ({{- .Target -}}) +VulnerabilityID,Severity,CVSS Score,Title,Library,Vulnerable Version,Fixed Version,Information URL,Triage Information +{{ range .Vulnerabilities }} + {{- .VulnerabilityID }}, + {{- .Severity }}, + {{- range $key, $value := .CVSS }} + {{- if (eq $key "nvd") }} + {{- .V3Score -}} + {{- end }} + {{- end }}, + {{- quote .Title }}, + {{- quote .PkgName }}, + {{- quote .InstalledVersion }}, + {{- quote .FixedVersion }}, + {{- .PrimaryURL }} +{{ else -}} + No vulnerabilities found at this time. +{{ end }} +Trivy Dependency Scan Results ({{ .Target }}) +ID,Name,Version,Notes +{{ range .Packages -}} + {{- quote .ID }}, + {{- quote .Name }}, + {{- quote .Version }} +{{ else -}} + No dependencies found at this time. +{{ end }} +{{ end }} \ No newline at end of file diff --git a/utils/trivy/trivy.yaml b/utils/trivy/trivy.yaml new file mode 100644 index 00000000000..cfb13b5c40f --- /dev/null +++ b/utils/trivy/trivy.yaml @@ -0,0 +1,249 @@ +cache: + backend: fs + dir: + redis: + ca: "" + cert: "" + key: "" + tls: false + ttl: 0s +config: trivy.yaml +db: + download-java-only: false + download-only: false + java-repository: ghcr.io/aquasecurity/trivy-java-db + java-skip-update: false + no-progress: false + repository: ghcr.io/aquasecurity/trivy-db + skip-update: false +debug: false +dependency-tree: true +exit-code: 0 +generate-default-config: false +ignore-policy: "" +ignorefile: ./utils/trivy/.trivyignore +include-dev-deps: false +insecure: false +license: + confidencelevel: "0.9" + forbidden: + - AGPL-1.0 + - AGPL-3.0 + - CC-BY-NC-1.0 + - CC-BY-NC-2.0 + - CC-BY-NC-2.5 + - CC-BY-NC-3.0 + - CC-BY-NC-4.0 + - CC-BY-NC-ND-1.0 + - CC-BY-NC-ND-2.0 + - CC-BY-NC-ND-2.5 + - CC-BY-NC-ND-3.0 + - CC-BY-NC-ND-4.0 + - CC-BY-NC-SA-1.0 + - CC-BY-NC-SA-2.0 + - CC-BY-NC-SA-2.5 + - CC-BY-NC-SA-3.0 + - CC-BY-NC-SA-4.0 + - Commons-Clause + - Facebook-2-Clause + - Facebook-3-Clause + - Facebook-Examples + - WTFPL + full: false + ignored: [] + notice: + - AFL-1.1 + - AFL-1.2 + - AFL-2.0 + - AFL-2.1 + - AFL-3.0 + - Apache-1.0 + - Apache-1.1 + - Apache-2.0 + - Artistic-1.0-cl8 + - Artistic-1.0-Perl + - Artistic-1.0 + - Artistic-2.0 + - BSL-1.0 + - BSD-2-Clause-FreeBSD + - BSD-2-Clause-NetBSD + - BSD-2-Clause + - BSD-3-Clause-Attribution + - BSD-3-Clause-Clear + - BSD-3-Clause-LBNL + - BSD-3-Clause + - BSD-4-Clause + - BSD-4-Clause-UC + - BSD-Protection + - CC-BY-1.0 + - CC-BY-2.0 + - CC-BY-2.5 + - CC-BY-3.0 + - CC-BY-4.0 + - FTL + - ISC + - ImageMagick + - Libpng + - Lil-1.0 + - Linux-OpenIB + - LPL-1.02 + - LPL-1.0 + - MS-PL + - MIT + - NCSA + - OpenSSL + - PHP-3.01 + - PHP-3.0 + - PIL + - Python-2.0 + - Python-2.0-complete + - PostgreSQL + - SGI-B-1.0 + - SGI-B-1.1 + - SGI-B-2.0 + - Unicode-DFS-2015 + - Unicode-DFS-2016 + - Unicode-TOU + - UPL-1.0 + - W3C-19980720 + - W3C-20150513 + - W3C + - X11 + - Xnet + - Zend-2.0 + - zlib-acknowledgement + - Zlib + - ZPL-1.1 + - ZPL-2.0 + - ZPL-2.1 + permissive: [] + reciprocal: + - APSL-1.0 + - APSL-1.1 + - APSL-1.2 + - APSL-2.0 + - CDDL-1.0 + - CDDL-1.1 + - CPL-1.0 + - EPL-1.0 + - EPL-2.0 + - FreeImage + - IPL-1.0 + - MPL-1.0 + - MPL-1.1 + - MPL-2.0 + - Ruby + restricted: + - BCL + - CC-BY-ND-1.0 + - CC-BY-ND-2.0 + - CC-BY-ND-2.5 + - CC-BY-ND-3.0 + - CC-BY-ND-4.0 + - CC-BY-SA-1.0 + - CC-BY-SA-2.0 + - CC-BY-SA-2.5 + - CC-BY-SA-3.0 + - CC-BY-SA-4.0 + - GPL-1.0 + - GPL-2.0 + - GPL-2.0-with-autoconf-exception + - GPL-2.0-with-bison-exception + - GPL-2.0-with-classpath-exception + - GPL-2.0-with-font-exception + - GPL-2.0-with-GCC-exception + - GPL-3.0 + - GPL-3.0-with-autoconf-exception + - GPL-3.0-with-GCC-exception + - LGPL-2.0 + - LGPL-2.1 + - LGPL-3.0 + - NPL-1.0 + - NPL-1.1 + - OSL-1.0 + - OSL-1.1 + - OSL-2.0 + - OSL-2.1 + - OSL-3.0 + - QPL-1.0 + - Sleepycat + unencumbered: + - CC0-1.0 + - Unlicense + - 0BSD +list-all-pkgs: false +misconfiguration: + cloudformation: + params: [] + helm: + set: [] + set-file: [] + set-string: [] + values: [] + include-non-failures: false + check-bundle-repository: ghcr.io/aquasecurity/trivy-policies:0 + # scanners: + # - azure-arm + # - cloudformation + # - dockerfile + # - helm + # - kubernetes + # - terraform + # - terraformplan + terraform: + exclude-downloaded-modules: false + vars: [] +module: + dir: + enable-modules: [] +output: "trivy-report-daos.txt" +format: template +template: '@./utils/trivy/csv.tpl' +output-plugin-arg: "" +quiet: false +registry: + password: [] + token: "" + username: [] +rego: + data: [] + namespaces: [] + policy: [] + skip-policy-update: false + trace: false +report: all +scan: + compliance: "" + file-patterns: [] + offline: false + parallel: 1 + rekor-url: https://rekor.sigstore.dev + sbom-sources: [] + scanners: + - vuln + - secret + # ignore all hadoop dependencies + skip-dirs: + ./src/client/java/hadoop-daos + skip-files: [] + show-suppressed: true +secret: + config: trivy-secret.yaml +server: + addr: "" + custom-headers: [] + token: "" + token-header: Trivy-Token +severity: + - UNKNOWN + - MEDIUM + - HIGH + - CRITICAL +timeout: 5m0s +version: false +vulnerability: + ignore-status: [] + ignore-unfixed: false + type: + - os + - library From d9f16a1d6f7326ace19b1f7d4a6f182b4bde0020 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Wed, 16 Oct 2024 02:48:45 +0200 Subject: [PATCH 04/34] DAOS-14408 common: ensure NDCTL not used for storage class `ram` (#15203) * DAOS-14408 common: enable NDCTL for DCPM This PR prepares DAOS to be used with NDCTL enabled in PMDK, which means: - NDCTL must not be used when non-DCPM (simulate PMem) - `storage class: "ram"` is used: `PMEMOBJ_CONF=sds.at_create=0` env variable disables NDCTL features in the PMDK This change affects all tests run on simulated PMem (e.g. inside VMs). Some DOAS utility applications may also require `PMEMOBJ_CONF=sds.at_create=0` to be set. - The default ULT stack size must be at least 20KiB to avoid stack overuse by PMDK with NDCTL enabled and be aligned with Linux page size. `ABT_THREAD_STACKSIZE=20480` env variable is used to increase the default ULT stack size. This env variable is set by control/server module just before engine is started. Much bigger stack is used for pmempool open/create-related tasks e.g. `tgt_vos_create_one` to avoid stack overusage. This modification shall not affect md-on-ssd mode as long as `storage class: "ram"` is used for the first tier in the `storage` configuration. This change does not require any configuration changes to existing systems. The new PMDK package with NDCTL enabled (https://github.com/daos-stack/pmdk/pull/38) will land as soon as this PR is merged. Signed-off-by: Jan Michalski --- .github/workflows/landing-builds.yml | 1 + debian/changelog | 16 ++ debian/control | 6 +- site_scons/components/__init__.py | 1 - src/control/server/engine/config.go | 86 ++++++++++ src/control/server/engine/config_test.go | 207 +++++++++++++++++++++++ src/control/server/server.go | 6 + src/mgmt/srv_target.c | 2 +- utils/build.config | 3 +- utils/rpms/daos.rpmlintrc | 10 +- utils/rpms/daos.spec | 29 +++- utils/run_utest.py | 1 + utils/scripts/install-el8.sh | 2 + utils/scripts/install-el9.sh | 2 + utils/scripts/install-leap15.sh | 1 + utils/scripts/install-ubuntu.sh | 2 + 16 files changed, 361 insertions(+), 14 deletions(-) diff --git a/.github/workflows/landing-builds.yml b/.github/workflows/landing-builds.yml index 103f150915a..814c4f0e71f 100644 --- a/.github/workflows/landing-builds.yml +++ b/.github/workflows/landing-builds.yml @@ -16,6 +16,7 @@ on: - ci/** - requirements-build.txt - requirements-utest.txt + - utils/build.config permissions: {} diff --git a/debian/changelog b/debian/changelog index 6891cea4737..9790dcbee3e 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,19 @@ +daos (2.6.1-4) unstable; urgency=medium + [ Tomasz Gromadzki ] + * Add support of the PMDK package 2.1.0 with NDCTL enabled. + * Increase the default ULT stack size to 20KiB if the engine uses + the DCPM storage class. + * Prevent using the RAM storage class (simulated PMem) when + the shutdown state (SDS) is active. + * Automatically disable SDS for the RAM storage class on engine startup. + * Force explicitly setting the PMEMOBJ_CONF='sds.at_create=0' + environment variable to deactivate SDS for the DAOS tools + (ddb, daos_perf, vos_perf, etc.) when used WITHOUT DCPM. + Otherwise, a user is supposed to be stopped by an error + like: "Unsafe shutdown count is not supported for this source". + + -- Tomasz Gromadzki Wed, 02 Oct 2024 12:00:00 +0200 + daos (2.6.1-3) unstable; urgency=medium [ Phillip Henderson ] * Third release candidate for 2.6.1 diff --git a/debian/control b/debian/control index bdc377e6d94..32b97fd67d1 100644 --- a/debian/control +++ b/debian/control @@ -18,7 +18,7 @@ Build-Depends: debhelper (>= 10), python3-distro, libabt-dev, libucx-dev, - libpmemobj-dev (>= 2.0.0), + libpmemobj-dev (>= 2.1.0), libfuse3-dev, libprotobuf-c-dev, libjson-c-dev, @@ -118,7 +118,9 @@ Depends: python (>=3.8), python3, python-yaml, python3-yaml, daos-client (= ${binary:Version}), daos-admin (= ${binary:Version}), golang-go (>=1.18), - libcapstone-dev + libcapstone-dev, + libndctl-dev, + libdaxctl-dev Description: The Distributed Asynchronous Object Storage (DAOS) is an open-source software-defined object store designed from the ground up for massively distributed Non Volatile Memory (NVM). DAOS takes advantage diff --git a/site_scons/components/__init__.py b/site_scons/components/__init__.py index ed50b80461b..1ae7bb7a2aa 100644 --- a/site_scons/components/__init__.py +++ b/site_scons/components/__init__.py @@ -266,7 +266,6 @@ def define_components(reqs): retriever=GitRepoRetriever(), commands=[['make', 'all', - 'NDCTL_ENABLE=n', 'BUILD_EXAMPLES=n', 'BUILD_BENCHMARKS=n', 'DOC=n', diff --git a/src/control/server/engine/config.go b/src/control/server/engine/config.go index 3e6ab0b64d7..38595fd8091 100644 --- a/src/control/server/engine/config.go +++ b/src/control/server/engine/config.go @@ -9,6 +9,7 @@ package engine import ( "fmt" "os" + "strconv" "strings" "github.com/pkg/errors" @@ -28,6 +29,8 @@ const ( envLogMasks = "D_LOG_MASK" envLogDbgStreams = "DD_MASK" envLogSubsystems = "DD_SUBSYS" + + minABTThreadStackSizeDCPM = 20480 ) // FabricConfig encapsulates networking fabric configuration. @@ -342,7 +345,80 @@ func (c *Config) Validate() error { if err := ValidateLogSubsystems(subsystems); err != nil { return errors.Wrap(err, "validate engine log subsystems") } + return nil +} + +// Ensure at least 20KiB ABT stack size for an engine with DCPM storage class. +func (c *Config) UpdatePMDKEnvarsStackSizeDCPM() error { + stackSizeStr, err := c.GetEnvVar("ABT_THREAD_STACKSIZE") + if err != nil { + c.EnvVars = append(c.EnvVars, fmt.Sprintf("ABT_THREAD_STACKSIZE=%d", + minABTThreadStackSizeDCPM)) + return nil + } + // Ensure at least 20KiB ABT stack size for an engine with DCPM storage class. + stackSizeValue, err := strconv.Atoi(stackSizeStr) + if err != nil { + return errors.Errorf("env_var ABT_THREAD_STACKSIZE has invalid value: %s", + stackSizeStr) + } + if stackSizeValue < minABTThreadStackSizeDCPM { + return errors.Errorf("env_var ABT_THREAD_STACKSIZE should be >= %d "+ + "for DCPM storage class, found %d", minABTThreadStackSizeDCPM, + stackSizeValue) + } + return nil +} + +// Ensure proper configuration of shutdown (SDS) state +func (c *Config) UpdatePMDKEnvarsPMemobjConf(isDCPM bool) error { + pmemobjConfStr, pmemobjConfErr := c.GetEnvVar("PMEMOBJ_CONF") + //also work for empty string + hasSdsAtCreate := strings.Contains(pmemobjConfStr, "sds.at_create") + if isDCPM { + if !hasSdsAtCreate { + return nil + } + // Confirm default handling of shutdown state (SDS) for DCPM storage class. + return errors.New("env_var PMEMOBJ_CONF should NOT contain 'sds.at_create=?' " + + "for DCPM storage class, found '" + pmemobjConfStr + "'") + } + + // Disable shutdown state (SDS) (part of RAS) for RAM-based simulated SCM. + if pmemobjConfErr != nil { + c.EnvVars = append(c.EnvVars, "PMEMOBJ_CONF=sds.at_create=0") + return nil + } + if !hasSdsAtCreate { + envVars, _ := common.DeleteKeyValue(c.EnvVars, "PMEMOBJ_CONF") + c.EnvVars = append(envVars, "PMEMOBJ_CONF="+pmemobjConfStr+ + ";sds.at_create=0") + return nil + } + if strings.Contains(pmemobjConfStr, "sds.at_create=1") { + return errors.New("env_var PMEMOBJ_CONF should contain 'sds.at_create=0' " + + "for non-DCPM storage class, found '" + pmemobjConfStr + "'") + } + return nil +} + +// Ensure proper environment variables for PMDK w/ NDCTL enabled based on +// the actual configuration of the storage class. +func (c *Config) UpdatePMDKEnvars() error { + + if len(c.Storage.Tiers) == 0 { + return errors.New("Invalid config - no tier 0 defined") + } + + isDCPM := c.Storage.Tiers[0].Class == storage.ClassDcpm + if err := c.UpdatePMDKEnvarsPMemobjConf(isDCPM); err != nil { + return err + } + + if isDCPM { + return c.UpdatePMDKEnvarsStackSizeDCPM() + } return nil } @@ -690,3 +766,13 @@ func (c *Config) WithStorageIndex(i uint32) *Config { c.Storage.EngineIdx = uint(i) return c } + +// WithEnvVarAbtThreadStackSize sets environment variable ABT_THREAD_STACKSIZE. +func (c *Config) WithEnvVarAbtThreadStackSize(stack_size uint16) *Config { + return c.WithEnvVars(fmt.Sprintf("ABT_THREAD_STACKSIZE=%d", stack_size)) +} + +// WithEnvVarPMemObjSdsAtCreate sets PMEMOBJ_CONF env. var. to sds.at_create=0/1 value +func (c *Config) WithEnvVarPMemObjSdsAtCreate(value uint8) *Config { + return c.WithEnvVars(fmt.Sprintf("PMEMOBJ_CONF=sds.at_create=%d", value)) +} diff --git a/src/control/server/engine/config_test.go b/src/control/server/engine/config_test.go index 463e86567dc..d9ca2bfebbd 100644 --- a/src/control/server/engine/config_test.go +++ b/src/control/server/engine/config_test.go @@ -1104,3 +1104,210 @@ func TestFabricConfig_Update(t *testing.T) { }) } } + +func TestConfig_UpdatePMDKEnvarsStackSizeDCPM(t *testing.T) { + validConfig := func() *Config { + return MockConfig().WithStorage( + storage.NewTierConfig(). + WithStorageClass("dcpm")) + } + + for name, tc := range map[string]struct { + cfg *Config + expErr error + expABTthreadStackSize int + }{ + "empty config should not fail": { + cfg: MockConfig(), + expABTthreadStackSize: minABTThreadStackSizeDCPM, + }, + "valid config for DCPM should not fail": { + cfg: validConfig().WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM), + expABTthreadStackSize: minABTThreadStackSizeDCPM, + }, + "config for DCPM without thread size should not fail": { + cfg: validConfig(), + expABTthreadStackSize: minABTThreadStackSizeDCPM, + }, + "config for DCPM with stack size big enough should not fail": { + cfg: validConfig(). + WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM + 1), + expABTthreadStackSize: minABTThreadStackSizeDCPM + 1, + }, + "config for DCPM with stack size too small should fail": { + cfg: validConfig(). + WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM - 1), + expErr: errors.New(fmt.Sprintf("env_var ABT_THREAD_STACKSIZE "+ + "should be >= %d for DCPM storage class, found %d", + minABTThreadStackSizeDCPM, minABTThreadStackSizeDCPM-1)), + }, + "config for DCPM with invalid ABT_THREAD_STACKSIZE value should fail": { + cfg: validConfig().WithEnvVars("ABT_THREAD_STACKSIZE=foo_bar"), + expErr: errors.New("env_var ABT_THREAD_STACKSIZE has invalid value: foo_bar"), + }, + } { + t.Run(name, func(t *testing.T) { + err := tc.cfg.UpdatePMDKEnvarsStackSizeDCPM() + test.CmpErr(t, tc.expErr, err) + if err == nil { + stackSizeStr, err := tc.cfg.GetEnvVar("ABT_THREAD_STACKSIZE") + test.AssertTrue(t, err == nil, "Missing env var ABT_THREAD_STACKSIZE") + stackSizeVal, err := strconv.Atoi(stackSizeStr) + test.AssertTrue(t, err == nil, "Invalid env var ABT_THREAD_STACKSIZE") + test.AssertEqual(t, tc.expABTthreadStackSize, stackSizeVal, + "Invalid ABT_THREAD_STACKSIZE value") + } + }) + } +} + +func TestConfig_UpdatePMDKEnvarsPMemobjConfDCPM(t *testing.T) { + validConfig := func() *Config { + return MockConfig().WithStorage( + storage.NewTierConfig().WithStorageClass("dcpm")) + } + + for name, tc := range map[string]struct { + cfg *Config + expErr error + }{ + "empty config should not fail": { + cfg: MockConfig(), + }, + "valid config for DCPM should not fail": { + cfg: validConfig(), + }, + "config for DCPM with forced sds.at_create (1) should fail": { + cfg: validConfig().WithEnvVarPMemObjSdsAtCreate(1), + expErr: errors.New("env_var PMEMOBJ_CONF should NOT contain " + + "'sds.at_create=?' for DCPM storage class, found 'sds.at_create=1'"), + }, + "config for DCPM with forced sds.at_create (0) should fail": { + cfg: validConfig().WithEnvVarPMemObjSdsAtCreate(0), + expErr: errors.New("env_var PMEMOBJ_CONF should NOT contain " + + "'sds.at_create=?' for DCPM storage class, found 'sds.at_create=0'"), + }, + } { + t.Run(name, func(t *testing.T) { + test.CmpErr(t, tc.expErr, tc.cfg.UpdatePMDKEnvarsPMemobjConf(true)) + }) + } +} + +func TestConfig_UpdatePMDKEnvarsPMemobjConfNRam(t *testing.T) { + validConfig := func() *Config { + return MockConfig().WithStorage( + storage.NewTierConfig(). + WithStorageClass("dcpm")) + } + + for name, tc := range map[string]struct { + cfg *Config + expErr error + expPMEMOBJ_CONF string + }{ + "empty config should not fail": { + cfg: validConfig(), + expPMEMOBJ_CONF: "sds.at_create=0", + }, + "config for ram without PMEMOBJ_CONF should not fail": { + cfg: MockConfig(), + expPMEMOBJ_CONF: "sds.at_create=0", + }, + "valid config for should not fail": { + cfg: validConfig().WithEnvVarPMemObjSdsAtCreate(0), + expPMEMOBJ_CONF: "sds.at_create=0", + }, + "config for ram w/ PMEMOBJ_CONF w/o sds.at_create should should be updated": { + cfg: validConfig().WithEnvVars("PMEMOBJ_CONF=foo_bar"), + expPMEMOBJ_CONF: "foo_bar;sds.at_create=0", + }, + "config for ram with sds.at_create set to 1 should fail": { + cfg: validConfig().WithEnvVarPMemObjSdsAtCreate(1), + expErr: errors.New("env_var PMEMOBJ_CONF should contain " + + "'sds.at_create=0' for non-DCPM storage class" + + ", found 'sds.at_create=1'"), + }, + "config for ram w/ PMEMOBJ_CONF w/ sds.at_create=1 should fail": { + cfg: validConfig(). + WithEnvVars("PMEMOBJ_CONF=sds.at_create=1;foo-bar"), + expErr: errors.New("env_var PMEMOBJ_CONF should contain " + + "'sds.at_create=0' for non-DCPM storage class" + + ", found 'sds.at_create=1;foo-bar'"), + }, + } { + t.Run(name, func(t *testing.T) { + test.CmpErr(t, tc.expErr, tc.cfg.UpdatePMDKEnvarsPMemobjConf(false)) + if len(tc.expPMEMOBJ_CONF) > 0 { + sds_at_create, err := tc.cfg.GetEnvVar("PMEMOBJ_CONF") + test.AssertTrue(t, err == nil, "Missing env var PMEMOBJ_CONF") + test.AssertEqual(t, tc.expPMEMOBJ_CONF, sds_at_create, + "Invalid PMEMOBJ_CONF") + } + + }) + } +} + +func TestConfig_UpdatePMDKEnvars(t *testing.T) { + validConfig := func(storageclas string) *Config { + return MockConfig().WithStorage( + storage.NewTierConfig(). + WithStorageClass(storageclas)) + } + for name, tc := range map[string]struct { + cfg *Config + expErr error + expPMEMOBJ_CONF string + expABTthreadStackSize int + }{ + "empty config should fail": { + cfg: MockConfig(), + expErr: errors.New("Invalid config - no tier 0 defined"), + expABTthreadStackSize: -1, + }, + "valid config for RAM should not fail": { + cfg: validConfig("ram"). + WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM - 1), + expPMEMOBJ_CONF: "sds.at_create=0", + expABTthreadStackSize: minABTThreadStackSizeDCPM - 1, + }, + "invalid config for RAM should fail": { + cfg: validConfig("ram").WithEnvVarPMemObjSdsAtCreate(1), + expErr: errors.New("env_var PMEMOBJ_CONF should contain " + + "'sds.at_create=0' for non-DCPM storage class, " + + "found 'sds.at_create=1'"), + expABTthreadStackSize: -1, + }, + "valid config for DCPM should not fail": { + cfg: validConfig("dcpm"), + expABTthreadStackSize: minABTThreadStackSizeDCPM, + }, + "invalid config for DCPM should not fail": { + cfg: validConfig("dcpm"). + WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM - 1), + expErr: errors.New("env_var ABT_THREAD_STACKSIZE should be >= 20480 " + + "for DCPM storage class, found 20479"), + expABTthreadStackSize: minABTThreadStackSizeDCPM - 1, + }, + } { + t.Run(name, func(t *testing.T) { + errTc := tc.cfg.UpdatePMDKEnvars() + test.CmpErr(t, tc.expErr, errTc) + if len(tc.expPMEMOBJ_CONF) > 0 { + sds_at_create, err := tc.cfg.GetEnvVar("PMEMOBJ_CONF") + test.AssertTrue(t, err == nil, "Missing env var PMEMOBJ_CONF") + test.AssertEqual(t, tc.expPMEMOBJ_CONF, sds_at_create, + "Invalid PMEMOBJ_CONF") + } + if tc.expABTthreadStackSize >= 0 { + stackSizeStr, err := tc.cfg.GetEnvVar("ABT_THREAD_STACKSIZE") + test.AssertTrue(t, err == nil, "Missing env var ABT_THREAD_STACKSIZE") + stackSizeVal, err := strconv.Atoi(stackSizeStr) + test.AssertTrue(t, err == nil, "Invalid env var ABT_THREAD_STACKSIZE") + test.AssertEqual(t, tc.expABTthreadStackSize, stackSizeVal, + "Invalid ABT_THREAD_STACKSIZE value") + } + }) + } +} diff --git a/src/control/server/server.go b/src/control/server/server.go index 4ac8ce5e04b..c9232dd1725 100644 --- a/src/control/server/server.go +++ b/src/control/server/server.go @@ -103,6 +103,12 @@ func processConfig(log logging.Logger, cfg *config.Server, fis *hardware.FabricI return err } + for _, ec := range cfg.Engines { + if err := ec.UpdatePMDKEnvars(); err != nil { + return err + } + } + return nil } diff --git a/src/mgmt/srv_target.c b/src/mgmt/srv_target.c index b03215d08aa..e1d6ee79bce 100644 --- a/src/mgmt/srv_target.c +++ b/src/mgmt/srv_target.c @@ -1180,7 +1180,7 @@ ds_mgmt_hdlr_tgt_create(crt_rpc_t *tc_req) /* A zero size accommodates the existing file */ vpa.vpa_scm_size = 0; vpa.vpa_nvme_size = tc_in->tc_nvme_size / dss_tgt_nr; - rc = dss_thread_collective(tgt_vos_create_one, &vpa, 0); + rc = dss_thread_collective(tgt_vos_create_one, &vpa, DSS_ULT_DEEP_STACK); if (rc) { D_ERROR(DF_UUID": thread collective tgt_vos_create_one failed, "DF_RC"\n", DP_UUID(tc_in->tc_pool_uuid), DP_RC(rc)); diff --git a/utils/build.config b/utils/build.config index 080ebae1283..52749b3f1ce 100644 --- a/utils/build.config +++ b/utils/build.config @@ -4,7 +4,7 @@ component=daos [commit_versions] argobots=v1.1 fuse=fuse-3.16.2 -pmdk=2.0.0 +pmdk=2.1.0 isal=v2.30.0 isal_crypto=v2.23.0 spdk=v22.01.2 @@ -30,3 +30,4 @@ spdk=https://github.com/spdk/spdk/commit/b0aba3fcd5aceceea530a702922153bc7566497 ofi=https://github.com/ofiwg/libfabric/commit/d827c6484cc5bf67dfbe395890e258860c3f0979.diff fuse=https://github.com/libfuse/libfuse/commit/c9905341ea34ff9acbc11b3c53ba8bcea35eeed8.diff mercury=https://raw.githubusercontent.com/daos-stack/mercury/f3dc286fb40ec1a3a38a2e17c45497bc2aa6290d/na_ucx.patch +pmdk=https://github.com/pmem/pmdk/commit/2abe15ac0b4eed894b6768cd82a3b0a7c4336284.diff diff --git a/utils/rpms/daos.rpmlintrc b/utils/rpms/daos.rpmlintrc index 09022a74e59..bbf0f9e2b92 100644 --- a/utils/rpms/daos.rpmlintrc +++ b/utils/rpms/daos.rpmlintrc @@ -47,10 +47,10 @@ addFilter("E: static-library-without-debuginfo \/usr\/lib64\/lib(dfuse|ioil)\.a" addFilter("W: no-soname \/usr\/lib64\/lib(ds3|daos_(common|cmd_hdlrs|tests|serialize|common_pmem)|dfs|dfuse|duns|ioil|pil4dfs|dpar(|_mpi)).so") # Tests rpm needs to be able to build daos from source so pulls in build deps and is expected. -addFilter("daos-client-tests.x86_64: E: devel-dependency protobuf-c-devel") +addFilter("daos-client-tests\.x86_64: E: devel-dependency protobuf-c-devel") # a functional test builds daos from source, so it needs the various *-devel packages for daos' build dependencies. -addFilter("daos-client-tests.x86_64: E: devel-dependency capstone-devel") -addFilter("daos-client-tests.x86_64: E: explicit-lib-dependency libcapstone-devel") -addFilter("daos-client-tests.x86_64: E: devel-dependency libcapstone-devel") -addFilter("daos-client-tests.x86_64: E: devel-dependency fuse3-devel") +addFilter("daos-client-tests\.x86_64: E: devel-dependency capstone-devel") +addFilter("daos-client-tests\.x86_64: E: explicit-lib-dependency lib(capstone|ndctl)-devel") +addFilter("daos-client-tests\.x86_64: E: devel-dependency libcapstone-devel") +addFilter("daos-client-tests\.x86_64: E: devel-dependency fuse3-devel") diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec index 14fa74776a2..29576bc71a1 100644 --- a/utils/rpms/daos.spec +++ b/utils/rpms/daos.spec @@ -15,7 +15,7 @@ Name: daos Version: 2.6.1 -Release: 3%{?relval}%{?dist} +Release: 4%{?relval}%{?dist} Summary: DAOS Storage Engine License: BSD-2-Clause-Patent @@ -49,7 +49,7 @@ BuildRequires: libabt-devel >= 1.0rc1 BuildRequires: libjson-c-devel BuildRequires: boost-devel %endif -BuildRequires: libpmemobj-devel >= 2.0.0 +BuildRequires: libpmemobj-devel >= 2.1.0 %if (0%{?rhel} >= 8) BuildRequires: fuse3-devel >= 3 %else @@ -147,11 +147,11 @@ Requires: ndctl # needed to set PMem configuration goals in BIOS through control-plane %if (0%{?suse_version} >= 1500) Requires: ipmctl >= 03.00.00.0423 -Requires: libpmemobj1 >= 2.0.0-1.suse1500 +Requires: libpmemobj1 >= 2.1.0-1.suse1500 Requires: libfabric1 >= %{libfabric_version} %else Requires: ipmctl >= 03.00.00.0468 -Requires: libpmemobj >= 2.0.0-1%{?dist} +Requires: libpmemobj >= 2.1.0-1%{?dist} %endif Requires: libfabric >= %{libfabric_version} Requires: mercury >= %{mercury_version} @@ -232,6 +232,13 @@ Requires: fuse3-devel >= 3 Requires: fuse3-devel >= 3.4.2 %endif Requires: pciutils-devel +%if (0%{?suse_version} > 0) +Requires: libndctl-devel +%endif +%if (0%{?rhel} >= 8) +Requires: ndctl-devel +Requires: daxctl-devel +%endif %description client-tests This is the package needed to run the DAOS test suite (client tests) @@ -591,6 +598,20 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent # No files in a shim package %changelog + +* Wed Oct 02 2024 Tomasz Gromadzki 2.6.1-4 +- Add support of the PMDK package 2.1.0 with NDCTL enabled. + * Increase the default ULT stack size to 20KiB if the engine uses + the DCPM storage class. + * Prevent using the RAM storage class (simulated PMem) when + the shutdown state (SDS) is active. + * Automatically disable SDS for the RAM storage class on engine startup. + * Force explicitly setting the PMEMOBJ_CONF='sds.at_create=0' + environment variable to deactivate SDS for the DAOS tools + (ddb, daos_perf, vos_perf, etc.) when used WITHOUT DCPM. + Otherwise, a user is supposed to be stopped by an error + like: "Unsafe shutdown count is not supported for this source". + * Tue Oct 01 2024 Phillip Henderson 2.6.1-3 - Third release candidate for 2.6.1 diff --git a/utils/run_utest.py b/utils/run_utest.py index c2f4ffd002d..1835f230e36 100755 --- a/utils/run_utest.py +++ b/utils/run_utest.py @@ -440,6 +440,7 @@ def run(self, base, memcheck, sudo): cmd = new_cmd self.last = cmd + self.env.update({"PMEMOBJ_CONF": "sds.at_create=0"}) if self.suite.gha: retval = run_cmd(cmd, env=self.env) else: diff --git a/utils/scripts/install-el8.sh b/utils/scripts/install-el8.sh index 81a044bfddb..472f88c9925 100755 --- a/utils/scripts/install-el8.sh +++ b/utils/scripts/install-el8.sh @@ -20,6 +20,7 @@ dnf --nodocs install \ clang-tools-extra \ cmake \ CUnit-devel \ + daxctl-devel \ diffutils \ e2fsprogs \ file \ @@ -48,6 +49,7 @@ dnf --nodocs install \ lz4-devel \ make \ ndctl \ + ndctl-devel \ numactl \ numactl-devel \ openmpi-devel \ diff --git a/utils/scripts/install-el9.sh b/utils/scripts/install-el9.sh index 9ddd8c257d6..092de8eba0f 100755 --- a/utils/scripts/install-el9.sh +++ b/utils/scripts/install-el9.sh @@ -18,6 +18,7 @@ dnf --nodocs install \ clang-tools-extra \ cmake \ CUnit-devel \ + daxctl-devel \ diffutils \ e2fsprogs \ file \ @@ -47,6 +48,7 @@ dnf --nodocs install \ lz4-devel \ make \ ndctl \ + ndctl-devel \ numactl \ numactl-devel \ openmpi-devel \ diff --git a/utils/scripts/install-leap15.sh b/utils/scripts/install-leap15.sh index fc9826ce508..0eb8ef44fee 100755 --- a/utils/scripts/install-leap15.sh +++ b/utils/scripts/install-leap15.sh @@ -38,6 +38,7 @@ dnf --nodocs install \ libjson-c-devel \ libltdl7 \ liblz4-devel \ + libndctl-devel \ libnuma-devel \ libopenssl-devel \ libprotobuf-c-devel \ diff --git a/utils/scripts/install-ubuntu.sh b/utils/scripts/install-ubuntu.sh index ec21c92f474..a36641d5f10 100755 --- a/utils/scripts/install-ubuntu.sh +++ b/utils/scripts/install-ubuntu.sh @@ -29,11 +29,13 @@ apt-get install \ libcapstone-dev \ libcmocka-dev \ libcunit1-dev \ + libdaxctl-dev \ libfuse3-dev \ libhwloc-dev \ libibverbs-dev \ libjson-c-dev \ liblz4-dev \ + libndctl-dev \ libnuma-dev \ libopenmpi-dev \ libpci-dev \ From 60d4b5dc682463eb61f829b415cc31ea0b993ebc Mon Sep 17 00:00:00 2001 From: Li Wei Date: Fri, 18 Oct 2024 17:39:03 +0900 Subject: [PATCH 05/34] DAOS-16653 pool: Batch crt events (#15230) (#15302) * DAOS-16653 pool: Batch crt events When multiple engines become unavailable around the same time, if a pool cannot tolerate the unavailability of those engines, it is sometimes desired that the pool would not exclude any of the engines. Hence, this patch introduces a CaRT event delay, tunable via the server-side environment variable, CRT_EVENT_DELAY, so that the events signaling the unavailability of those engines will be handled in hopefully one batch, giving pool_svc_update_map_internal a chance to reject the pool map update based on the RF check. When the RF check rejects a pool map change, we should revisit the corresponding events later, rather than simply throwing them away. This patch improves this case by returning the events back to the event queue, and pause the queue handling until next new event or pool map update. - Introduce event sets: pool_svc_event_set. Now the event queue can be simplified to just one event set. - Add the ability to pause and resume the event handling: pse_paused. - Track the time when the latest event was queued: pse_time. Signed-off-by: Li Wei --- docs/admin/env_variables.md | 1 + src/pool/srv_pool.c | 454 +++++++++++++++----- src/tests/ftest/util/server_utils_params.py | 1 + 3 files changed, 346 insertions(+), 110 deletions(-) diff --git a/docs/admin/env_variables.md b/docs/admin/env_variables.md index 060c3790d57..31e0f3e8767 100644 --- a/docs/admin/env_variables.md +++ b/docs/admin/env_variables.md @@ -44,6 +44,7 @@ Environment variables in this section only apply to the server side. |DAOS\_MD\_CAP |Size of a metadata pmem pool/file in MBs. INTEGER. Default to 128 MB.| |DAOS\_START\_POOL\_SVC|Determines whether to start existing pool services when starting a daos\_server. BOOL. Default to true.| |CRT\_DISABLE\_MEM\_PIN|Disable memory pinning workaround on a server side. BOOL. Default to 0.| +|CRT\_EVENT\_DELAY|Delay in seconds before handling each CaRT event. INTEGER. Default to 10 s. A longer delay enables batching of successive CaRT events, leading to fewer pool map changes when multiple engines become unavailable at around the same time.| |DAOS\_SCHED\_PRIO\_DISABLED|Disable server ULT prioritizing. BOOL. Default to 0.| |DAOS\_SCHED\_RELAX\_MODE|The mode of CPU relaxing on idle. "disabled":disable relaxing; "net":wait on network request for INTVL; "sleep":sleep for INTVL. STRING. Default to "net"| |DAOS\_SCHED\_RELAX\_INTVL|CPU relax interval in milliseconds. INTEGER. Default to 1 ms.| diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c index 667e4bc6ed6..2d80088d4b6 100644 --- a/src/pool/srv_pool.c +++ b/src/pool/srv_pool.c @@ -62,7 +62,6 @@ ds_pool_get_vos_df_version(uint32_t pool_global_version) /* Pool service crt event */ struct pool_svc_event { - d_list_t psv_link; d_rank_t psv_rank; uint64_t psv_incarnation; enum crt_event_source psv_src; @@ -72,15 +71,40 @@ struct pool_svc_event { #define DF_PS_EVENT "rank=%u inc="DF_U64" src=%d type=%d" #define DP_PS_EVENT(e) e->psv_rank, e->psv_incarnation, e->psv_src, e->psv_type -#define RECHOOSE_SLEEP_MS 250 +/* + * Pool service crt event set + * + * This stores an unordered array of pool_svc_event objects. For all different + * i and j, we have pss_buf[i].psv_rank != pss_buf[j].psv_rank. + * + * An event set facilitates the merging of a sequence of events. For instance, + * sequence (in the format ) + * <3, D>, <5, D>, <1, D>, <5, A>, <1, A>, <1, D> + * will merge into set + * <3, D>, <5, A>, <1, D> + * (that is, during the merge, an event overrides a previuos event of the same + * rank in the set). + */ +struct pool_svc_event_set { + struct pool_svc_event *pss_buf; + uint32_t pss_len; + uint32_t pss_cap; +}; + +#define DF_PS_EVENT_SET "len=%u" +#define DP_PS_EVENT_SET(s) s->pss_len /* Pool service crt-event-handling state */ struct pool_svc_events { - ABT_mutex pse_mutex; - ABT_cond pse_cv; - d_list_t pse_queue; - ABT_thread pse_handler; - bool pse_stop; + ABT_mutex pse_mutex; + ABT_cond pse_cv; + struct pool_svc_event_set *pse_pending; + uint64_t pse_timeout; /* s */ + uint64_t pse_time; /* s */ + struct sched_request *pse_timer; + ABT_thread pse_handler; + bool pse_stop; + bool pse_paused; }; /* Pool service schedule state */ @@ -1162,6 +1186,15 @@ pool_svc_locate_cb(d_iov_t *id, char **path) return 0; } +static unsigned int +get_crt_event_delay(void) +{ + unsigned int t = 10 /* s */; + + d_getenv_uint("CRT_EVENT_DELAY", &t); + return t; +} + static int pool_svc_alloc_cb(d_iov_t *id, struct ds_rsvc **rsvc) { @@ -1182,7 +1215,7 @@ pool_svc_alloc_cb(d_iov_t *id, struct ds_rsvc **rsvc) d_iov_set(&svc->ps_rsvc.s_id, svc->ps_uuid, sizeof(uuid_t)); uuid_copy(svc->ps_uuid, id->iov_buf); - D_INIT_LIST_HEAD(&svc->ps_events.pse_queue); + svc->ps_events.pse_timeout = get_crt_event_delay(); svc->ps_events.pse_handler = ABT_THREAD_NULL; svc->ps_svc_rf = -1; svc->ps_force_notify = false; @@ -1300,98 +1333,221 @@ ds_pool_enable_exclude(void) pool_disable_exclude = false; } +static int +alloc_event_set(struct pool_svc_event_set **event_set) +{ + D_ALLOC_PTR(*event_set); + if (*event_set == NULL) + return -DER_NOMEM; + return 0; +} + +static void +free_event_set(struct pool_svc_event_set **event_set) +{ + D_FREE((*event_set)->pss_buf); + D_FREE(*event_set); +} + +static int +add_to_event_set(struct pool_svc_event_set *event_set, d_rank_t rank, uint64_t incarnation, + enum crt_event_source src, enum crt_event_type type) +{ + int i; + + /* Find rank in event_set. */ + for (i = 0; i < event_set->pss_len; i++) + if (event_set->pss_buf[i].psv_rank == rank) + break; + + /* If not found, prepare to add a new event. */ + if (i == event_set->pss_len) { + if (event_set->pss_len == event_set->pss_cap) { + uint32_t cap; + struct pool_svc_event *buf; + + if (event_set->pss_cap == 0) + cap = 1; + else + cap = 2 * event_set->pss_cap; + D_REALLOC_ARRAY(buf, event_set->pss_buf, event_set->pss_cap, cap); + if (buf == NULL) + return -DER_NOMEM; + event_set->pss_buf = buf; + event_set->pss_cap = cap; + } + event_set->pss_len++; + } + + event_set->pss_buf[i].psv_rank = rank; + event_set->pss_buf[i].psv_incarnation = incarnation; + event_set->pss_buf[i].psv_src = src; + event_set->pss_buf[i].psv_type = type; + return 0; +} + +/* Merge next into prev. */ +static int +merge_event_sets(struct pool_svc_event_set *prev, struct pool_svc_event_set *next) +{ + int i; + + for (i = 0; i < next->pss_len; i++) { + struct pool_svc_event *event = &next->pss_buf[i]; + int rc; + + rc = add_to_event_set(prev, event->psv_rank, event->psv_incarnation, event->psv_src, + event->psv_type); + if (rc != 0) + return rc; + } + return 0; +} + static int queue_event(struct pool_svc *svc, d_rank_t rank, uint64_t incarnation, enum crt_event_source src, enum crt_event_type type) { struct pool_svc_events *events = &svc->ps_events; - struct pool_svc_event *event; + int rc; + bool allocated = false; - D_ALLOC_PTR(event); - if (event == NULL) - return -DER_NOMEM; + D_DEBUG(DB_MD, DF_UUID ": queuing event: " DF_PS_EVENT "\n", DP_UUID(svc->ps_uuid), rank, + incarnation, src, type); - event->psv_rank = rank; - event->psv_incarnation = incarnation; - event->psv_src = src; - event->psv_type = type; + ABT_mutex_lock(events->pse_mutex); - D_DEBUG(DB_MD, DF_UUID": queuing event: "DF_PS_EVENT"\n", DP_UUID(svc->ps_uuid), - DP_PS_EVENT(event)); + if (events->pse_pending == NULL) { + rc = alloc_event_set(&events->pse_pending); + if (rc != 0) + goto out; + allocated = true; + } + + rc = add_to_event_set(events->pse_pending, rank, incarnation, src, type); + if (rc != 0) + goto out; + + events->pse_time = daos_gettime_coarse(); + + if (events->pse_paused) { + D_DEBUG(DB_MD, DF_UUID ": resuming event handling\n", DP_UUID(svc->ps_uuid)); + events->pse_paused = false; + } - ABT_mutex_lock(events->pse_mutex); - d_list_add_tail(&event->psv_link, &events->pse_queue); ABT_cond_broadcast(events->pse_cv); + +out: + if (rc != 0 && allocated) + free_event_set(&events->pse_pending); ABT_mutex_unlock(events->pse_mutex); - return 0; + return rc; } static void -discard_events(d_list_t *queue) +resume_event_handling(struct pool_svc *svc) { - struct pool_svc_event *event; - struct pool_svc_event *tmp; + struct pool_svc_events *events = &svc->ps_events; - d_list_for_each_entry_safe(event, tmp, queue, psv_link) { - D_DEBUG(DB_MD, "discard event: "DF_PS_EVENT"\n", DP_PS_EVENT(event)); - d_list_del_init(&event->psv_link); - D_FREE(event); + ABT_mutex_lock(events->pse_mutex); + if (events->pse_paused) { + D_DEBUG(DB_MD, DF_UUID ": resuming event handling\n", DP_UUID(svc->ps_uuid)); + events->pse_paused = false; + ABT_cond_broadcast(events->pse_cv); } + ABT_mutex_unlock(events->pse_mutex); } -static int pool_svc_exclude_rank(struct pool_svc *svc, d_rank_t rank); +static int pool_svc_exclude_ranks(struct pool_svc *svc, struct pool_svc_event_set *event_set); -static void -handle_event(struct pool_svc *svc, struct pool_svc_event *event) +static int +handle_event(struct pool_svc *svc, struct pool_svc_event_set *event_set) { + int i; int rc; - if ((event->psv_src != CRT_EVS_GRPMOD && event->psv_src != CRT_EVS_SWIM) || - (event->psv_type == CRT_EVT_DEAD && pool_disable_exclude)) { - D_DEBUG(DB_MD, "ignore event: "DF_PS_EVENT" exclude=%d\n", DP_PS_EVENT(event), - pool_disable_exclude); - goto out; + D_INFO(DF_UUID ": handling event set: " DF_PS_EVENT_SET "\n", DP_UUID(svc->ps_uuid), + DP_PS_EVENT_SET(event_set)); + + if (!pool_disable_exclude) { + rc = pool_svc_exclude_ranks(svc, event_set); + if (rc != 0) { + DL_ERROR(rc, DF_UUID ": failed to exclude ranks", DP_UUID(svc->ps_uuid)); + return rc; + } } - if (event->psv_rank == dss_self_rank() && event->psv_src == CRT_EVS_GRPMOD && - event->psv_type == CRT_EVT_DEAD) { - D_DEBUG(DB_MD, "ignore exclusion of self\n"); - goto out; + /* + * Check if the alive ranks are up in the pool map. If in the future we + * add automatic reintegration below, for instance, we may need + * to not only take svc->ps_lock, but also employ an RDB TX by + * the book. + */ + ABT_rwlock_rdlock(svc->ps_pool->sp_lock); + for (i = 0; i < event_set->pss_len; i++) { + struct pool_svc_event *event = &event_set->pss_buf[i]; + + if (event->psv_src != CRT_EVS_SWIM || event->psv_type != CRT_EVT_ALIVE) + continue; + if (ds_pool_map_rank_up(svc->ps_pool->sp_map, event->psv_rank)) { + /* + * The rank is up in the pool map. Request a pool map + * distribution just in case the rank has recently + * restarted and does not have a copy of the pool map. + */ + ds_rsvc_request_map_dist(&svc->ps_rsvc); + D_DEBUG(DB_MD, DF_UUID ": requested map dist for rank %u\n", + DP_UUID(svc->ps_uuid), event->psv_rank); + break; + } } + ABT_rwlock_unlock(svc->ps_pool->sp_lock); - D_INFO(DF_UUID": handling event: "DF_PS_EVENT"\n", DP_UUID(svc->ps_uuid), - DP_PS_EVENT(event)); + return 0; +} - if (event->psv_src == CRT_EVS_SWIM && event->psv_type == CRT_EVT_ALIVE) { - /* - * Check if the rank is up in the pool map. If in the future we - * add automatic reintegration below, for instance, we may need - * to not only take svc->ps_lock, but also employ an RDB TX by - * the book. - */ - ABT_rwlock_rdlock(svc->ps_pool->sp_lock); - rc = ds_pool_map_rank_up(svc->ps_pool->sp_map, event->psv_rank); - ABT_rwlock_unlock(svc->ps_pool->sp_lock); - if (!rc) - goto out; +struct event_timer_arg { + struct pool_svc_events *eta_events; + uint64_t eta_deadline; +}; - /* - * The rank is up in the pool map. Request a pool map - * distribution just in case the rank has recently restarted - * and does not have a copy of the pool map. - */ - ds_rsvc_request_map_dist(&svc->ps_rsvc); - D_DEBUG(DB_MD, DF_UUID": requested map dist for rank %u\n", - DP_UUID(svc->ps_uuid), event->psv_rank); - } else if (event->psv_type == CRT_EVT_DEAD) { - rc = pool_svc_exclude_rank(svc, event->psv_rank); - if (rc != 0) - D_ERROR(DF_UUID": failed to exclude rank %u: "DF_RC"\n", - DP_UUID(svc->ps_uuid), event->psv_rank, DP_RC(rc)); - } +static void +event_timer(void *varg) +{ + struct event_timer_arg *arg = varg; + struct pool_svc_events *events = arg->eta_events; + int64_t time_left = arg->eta_deadline - daos_gettime_coarse(); -out: - return; + if (time_left > 0) + sched_req_sleep(events->pse_timer, time_left * 1000); + ABT_cond_broadcast(events->pse_cv); +} + +static int +start_event_timer(struct event_timer_arg *arg) +{ + struct pool_svc_events *events = arg->eta_events; + uuid_t uuid; + struct sched_req_attr attr; + + D_ASSERT(events->pse_timer == NULL); + uuid_clear(uuid); + sched_req_attr_init(&attr, SCHED_REQ_ANONYM, &uuid); + events->pse_timer = sched_create_ult(&attr, event_timer, arg, 0); + if (events->pse_timer == NULL) + return -DER_NOMEM; + return 0; +} + +static void +stop_event_timer(struct event_timer_arg *arg) +{ + struct pool_svc_events *events = arg->eta_events; + + D_ASSERT(events->pse_timer != NULL); + sched_req_wait(events->pse_timer, true /* abort */); + sched_req_put(events->pse_timer); + events->pse_timer = NULL; } static void @@ -1403,31 +1559,83 @@ events_handler(void *arg) D_DEBUG(DB_MD, DF_UUID": starting\n", DP_UUID(svc->ps_uuid)); for (;;) { - struct pool_svc_event *event = NULL; - bool stop; + struct pool_svc_event_set *event_set = NULL; + bool stop; + int rc; ABT_mutex_lock(events->pse_mutex); for (;;) { + struct event_timer_arg timer_arg; + int64_t time_left; + stop = events->pse_stop; if (stop) { - discard_events(&events->pse_queue); + events->pse_paused = false; + if (events->pse_pending != NULL) + free_event_set(&events->pse_pending); break; } - if (!d_list_empty(&events->pse_queue)) { - event = d_list_entry(events->pse_queue.next, struct pool_svc_event, - psv_link); - d_list_del_init(&event->psv_link); + + timer_arg.eta_events = events; + timer_arg.eta_deadline = events->pse_time + events->pse_timeout; + + time_left = timer_arg.eta_deadline - daos_gettime_coarse(); + if (events->pse_pending != NULL && !events->pse_paused && time_left <= 0) { + event_set = events->pse_pending; + events->pse_pending = NULL; break; } + + /* A simple timed cond_wait without polling. */ + if (time_left > 0) { + rc = start_event_timer(&timer_arg); + if (rc != 0) { + /* No delay then. */ + DL_ERROR(rc, DF_UUID ": failed to start event timer", + DP_UUID(svc->ps_uuid)); + events->pse_time = 0; + continue; + } + } sched_cond_wait(events->pse_cv, events->pse_mutex); + if (time_left > 0) + stop_event_timer(&timer_arg); } ABT_mutex_unlock(events->pse_mutex); if (stop) break; - handle_event(svc, event); + rc = handle_event(svc, event_set); + if (rc != 0) { + /* Put event_set back to events->pse_pending. */ + D_DEBUG(DB_MD, DF_UUID ": returning event set\n", DP_UUID(svc->ps_uuid)); + ABT_mutex_lock(events->pse_mutex); + if (events->pse_pending == NULL) { + /* + * No pending events; pause the handling until + * next event or pool map change. + */ + D_DEBUG(DB_MD, DF_UUID ": pausing event handling\n", + DP_UUID(svc->ps_uuid)); + events->pse_paused = true; + } else { + /* + * There are pending events; do not pause the + * handling. + */ + rc = merge_event_sets(event_set, events->pse_pending); + if (rc != 0) + DL_ERROR(rc, DF_UUID ": failed to merge events", + DP_UUID(svc->ps_uuid)); + free_event_set(&events->pse_pending); + } + events->pse_pending = event_set; + event_set = NULL; + ABT_mutex_unlock(events->pse_mutex); + } - D_FREE(event); + if (event_set != NULL) + free_event_set(&event_set); ABT_thread_yield(); } @@ -1437,7 +1645,7 @@ events_handler(void *arg) static bool events_pending(struct pool_svc *svc) { - return !d_list_empty(&svc->ps_events.pse_queue); + return svc->ps_events.pse_pending != NULL; } static void @@ -1461,9 +1669,11 @@ init_events(struct pool_svc *svc) struct pool_svc_events *events = &svc->ps_events; int rc; - D_ASSERT(d_list_empty(&events->pse_queue)); + D_ASSERT(events->pse_pending == NULL); + D_ASSERT(events->pse_timer == NULL); D_ASSERT(events->pse_handler == ABT_THREAD_NULL); - D_ASSERT(events->pse_stop == false); + D_ASSERT(!events->pse_stop); + D_ASSERT(!events->pse_paused); if (!ds_pool_skip_for_check(svc->ps_pool)) { rc = crt_register_event_cb(ds_pool_crt_event_cb, svc); @@ -1498,7 +1708,8 @@ init_events(struct pool_svc *svc) err_cb: if (!ds_pool_skip_for_check(svc->ps_pool)) crt_unregister_event_cb(ds_pool_crt_event_cb, svc); - discard_events(&events->pse_queue); + if (events->pse_pending != NULL) + free_event_set(&events->pse_pending); err: return rc; } @@ -1507,7 +1718,6 @@ static void fini_events(struct pool_svc *svc) { struct pool_svc_events *events = &svc->ps_events; - int rc; D_ASSERT(events->pse_handler != ABT_THREAD_NULL); @@ -1519,8 +1729,6 @@ fini_events(struct pool_svc *svc) ABT_cond_broadcast(events->pse_cv); ABT_mutex_unlock(events->pse_mutex); - rc = ABT_thread_join(events->pse_handler); - D_ASSERTF(rc == 0, DF_RC"\n", DP_RC(rc)); ABT_thread_free(&events->pse_handler); events->pse_handler = ABT_THREAD_NULL; events->pse_stop = false; @@ -4332,7 +4540,7 @@ ds_pool_svc_list_cont(uuid_t uuid, d_rank_list_t *ranks, list_cont_bulk_destroy(bulk); D_FREE(resp_cont); crt_req_decref(rpc); - dss_sleep(RECHOOSE_SLEEP_MS); + dss_sleep(250); D_GOTO(rechoose, rc); } @@ -6501,7 +6709,7 @@ pool_svc_schedule_reconf(struct pool_svc *svc, struct pool_map *map, uint32_t ma } static int -pool_map_crit_prompt(struct pool_svc *svc, struct pool_map *map, d_rank_t rank) +pool_map_crit_prompt(struct pool_svc *svc, struct pool_map *map) { crt_group_t *primary_grp; struct pool_domain *doms; @@ -6517,13 +6725,10 @@ pool_map_crit_prompt(struct pool_svc *svc, struct pool_map *map, d_rank_t rank) D_CRIT("!!! Please try to recover these engines in top priority -\n"); D_CRIT("!!! Please refer \"Pool-Wise Redundancy Factor\" section in pool_operations.md\n"); - D_CRIT("!!! pool "DF_UUID": intolerable unavailability: engine rank %u\n", - DP_UUID(svc->ps_uuid), rank); for (i = 0; i < doms_cnt; i++) { struct swim_member_state state; - if (!(doms[i].do_comp.co_status & PO_COMP_ST_UPIN) || - (doms[i].do_comp.co_rank == rank)) + if (!(doms[i].do_comp.co_status & PO_COMP_ST_UPIN)) continue; rc = crt_rank_state_get(primary_grp, doms[i].do_comp.co_rank, &state); @@ -6701,8 +6906,7 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc, * with CRIT log message to ask administrator to bring back the engine. */ if (src == MUS_SWIM && opc == MAP_EXCLUDE) { - d_rank_t rank; - int failed_cnt; + int failed_cnt; rc = pool_map_update_failed_cnt(map); if (rc != 0) { @@ -6711,15 +6915,19 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc, goto out_map; } - D_ASSERT(tgt_addrs->pta_number == 1); - rank = tgt_addrs->pta_addrs->pta_rank; failed_cnt = pool_map_get_failed_cnt(map, PO_COMP_TP_NODE); - D_INFO(DF_UUID": SWIM exclude rank %d, failed NODE %d\n", - DP_UUID(svc->ps_uuid), rank, failed_cnt); + D_INFO(DF_UUID": SWIM exclude %d ranks, failed NODE %d\n", + DP_UUID(svc->ps_uuid), tgt_addrs->pta_number, failed_cnt); if (failed_cnt > pw_rf) { - D_CRIT(DF_UUID": exclude rank %d will break pw_rf %d, failed_cnt %d\n", - DP_UUID(svc->ps_uuid), rank, pw_rf, failed_cnt); - rc = pool_map_crit_prompt(svc, map, rank); + D_CRIT(DF_UUID": exclude %d ranks will break pool RF %d, failed_cnt %d\n", + DP_UUID(svc->ps_uuid), tgt_addrs->pta_number, pw_rf, failed_cnt); + ABT_rwlock_rdlock(svc->ps_pool->sp_lock); + rc = pool_map_crit_prompt(svc, svc->ps_pool->sp_map); + ABT_rwlock_unlock(svc->ps_pool->sp_lock); + if (rc != 0) + DL_ERROR(rc, DF_UUID ": failed to log prompt", + DP_UUID(svc->ps_uuid)); + rc = -DER_RF; goto out_map; } } @@ -6768,6 +6976,9 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc, ds_rsvc_request_map_dist(&svc->ps_rsvc); + /* See events_handler. */ + resume_event_handling(svc); + rc = pool_svc_schedule_reconf(svc, NULL /* map */, map_version, false /* sync_remove */); if (rc != 0) { DL_INFO(rc, DF_UUID": failed to schedule pool service reconfiguration", @@ -7167,28 +7378,51 @@ ds_pool_update_handler_v5(crt_rpc_t *rpc) } static int -pool_svc_exclude_rank(struct pool_svc *svc, d_rank_t rank) +pool_svc_exclude_ranks(struct pool_svc *svc, struct pool_svc_event_set *event_set) { struct pool_target_addr_list list; struct pool_target_addr_list inval_list_out = { 0 }; - struct pool_target_addr tgt_rank; + struct pool_target_addr *addrs; + d_rank_t self_rank = dss_self_rank(); uint32_t map_version = 0; + int n = 0; + int i; int rc; - tgt_rank.pta_rank = rank; - tgt_rank.pta_target = -1; - list.pta_number = 1; - list.pta_addrs = &tgt_rank; + D_ALLOC_ARRAY(addrs, event_set->pss_len); + if (addrs == NULL) + return -DER_NOMEM; + for (i = 0; i < event_set->pss_len; i++) { + struct pool_svc_event *event = &event_set->pss_buf[i]; + + if (event->psv_type != CRT_EVT_DEAD) + continue; + if (event->psv_src == CRT_EVS_GRPMOD && event->psv_rank == self_rank) { + D_DEBUG(DB_MD, DF_UUID ": ignore exclusion of self\n", + DP_UUID(svc->ps_uuid)); + continue; + } + addrs[n].pta_rank = event->psv_rank; + addrs[n].pta_target = -1; + n++; + } + if (n == 0) { + rc = 0; + goto out; + } + list.pta_number = n; + list.pta_addrs = addrs; rc = pool_svc_update_map(svc, pool_opc_2map_opc(POOL_EXCLUDE), true /* exclude_rank */, NULL, NULL, 0, &list, &inval_list_out, &map_version, NULL /* hint */, MUS_SWIM); - D_DEBUG(DB_MD, "Exclude pool "DF_UUID"/%u rank %u: rc %d\n", - DP_UUID(svc->ps_uuid), map_version, rank, rc); + D_DEBUG(DB_MD, "Exclude pool "DF_UUID"/%u ranks %u: rc %d\n", + DP_UUID(svc->ps_uuid), map_version, n, rc); pool_target_addr_list_free(&inval_list_out); - +out: + D_FREE(addrs); return rc; } diff --git a/src/tests/ftest/util/server_utils_params.py b/src/tests/ftest/util/server_utils_params.py index 440ffe68f82..7cda958d242 100644 --- a/src/tests/ftest/util/server_utils_params.py +++ b/src/tests/ftest/util/server_utils_params.py @@ -435,6 +435,7 @@ class EngineYamlParameters(YamlParameters): "common": [ "D_LOG_FILE_APPEND_PID=1", "DAOS_POOL_RF=4", + "CRT_EVENT_DELAY=1", "COVFILE=/tmp/test.cov"], "ofi+tcp": [], "ofi+tcp;ofi_rxm": [], From e0f58831ef4b7de01bc0abb4be85d0635a85f3cd Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Fri, 18 Oct 2024 06:48:22 -0700 Subject: [PATCH 06/34] DAOS-16720 cq: pin isort to v1.1.0 (#15338) (#15339) Pin isort to v1.1.0 to avoid suprprise changes and because v1.1.1 is not working for us. Signed-off-by: Dalton Bohning --- .github/workflows/linting.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 5fce435168a..67c57bd3154 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -26,7 +26,7 @@ jobs: - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: '3' - - uses: isort/isort-action@master + - uses: isort/isort-action@f14e57e1d457956c45a19c05a89cccdf087846e5 # v1.1.0 with: requirementsFiles: "requirements.txt" - name: Run on SConstruct file. From cb9d278be5b6730f294d65b74db17e4a948a9a23 Mon Sep 17 00:00:00 2001 From: Ken Cain Date: Sat, 19 Oct 2024 02:53:37 -0400 Subject: [PATCH 07/34] DAOS-15852 test: more timing samples for co_op_dup_timing() (#14497) (#15324) Rarely, this test will produce timings that exceed the failure threshold. Local and PR/CI experiments have shown that increasing the test's NUM_OPS to more than 200 iterations greatly reduces or may eliminate such intermittent timing failures, by "spreading out" the magnitude of the time spent in the 3 main loops of the test (0% loops perform fault injections, 33%, and 50%). Signed-off-by: Kenneth Cain --- src/tests/suite/daos_container.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/suite/daos_container.c b/src/tests/suite/daos_container.c index 958f9fd1d16..243a829c317 100644 --- a/src/tests/suite/daos_container.c +++ b/src/tests/suite/daos_container.c @@ -3835,7 +3835,7 @@ co_op_dup_timing(void **state) size_t const in_sizes[] = {strlen(in_values[0]), strlen(in_values[1])}; int n = (int)ARRAY_SIZE(names); const unsigned int NUM_FP = 3; - const uint32_t NUM_OPS = 200; + const uint32_t NUM_OPS = 500; uint32_t num_failures = 0; const uint32_t SVC_OPS_ENABLED = 1; const uint32_t SVC_OPS_ENTRY_AGE = 60; From f8682fb56bc0d7928eb807f45599a5d9e9ada5de Mon Sep 17 00:00:00 2001 From: Nasf-Fan Date: Sun, 20 Oct 2024 21:55:19 +0800 Subject: [PATCH 08/34] DAOS-16572 rebuild: properly assign global_dtx_resync_version in IV - b26 (#15186) In rebuild_iv_ent_refresh() for refreshing DTX resync version, needs to assign rt_global_dtx_resync_version firstly before wakeup related rebuild_scan_leader. Signed-off-by: Fan Yong --- src/rebuild/rebuild_iv.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/rebuild/rebuild_iv.c b/src/rebuild/rebuild_iv.c index cc585e037e1..f4b9f75f407 100644 --- a/src/rebuild/rebuild_iv.c +++ b/src/rebuild/rebuild_iv.c @@ -167,6 +167,7 @@ rebuild_iv_ent_refresh(struct ds_iv_entry *entry, struct ds_iv_key *key, struct rebuild_tgt_pool_tracker *rpt; struct rebuild_iv *dst_iv = entry->iv_value.sg_iovs[0].iov_buf; struct rebuild_iv *src_iv = src->sg_iovs[0].iov_buf; + uint32_t old_ver; int rc = 0; rpt = rpt_lookup(src_iv->riv_pool_uuid, -1, src_iv->riv_ver, @@ -200,16 +201,18 @@ rebuild_iv_ent_refresh(struct ds_iv_entry *entry, struct ds_iv_key *key, dst_iv->riv_stable_epoch); rpt->rt_global_done = dst_iv->riv_global_done; rpt->rt_global_scan_done = dst_iv->riv_global_scan_done; - if (rpt->rt_global_dtx_resync_version < rpt->rt_rebuild_ver && + old_ver = rpt->rt_global_dtx_resync_version; + if (rpt->rt_global_dtx_resync_version < dst_iv->riv_global_dtx_resyc_version) + rpt->rt_global_dtx_resync_version = dst_iv->riv_global_dtx_resyc_version; + if (old_ver < rpt->rt_rebuild_ver && dst_iv->riv_global_dtx_resyc_version >= rpt->rt_rebuild_ver) { D_INFO(DF_UUID " global/iv/rebuild_ver %u/%u/%u signal wait cond\n", - DP_UUID(src_iv->riv_pool_uuid), rpt->rt_global_dtx_resync_version, + DP_UUID(src_iv->riv_pool_uuid), old_ver, dst_iv->riv_global_dtx_resyc_version, rpt->rt_rebuild_ver); ABT_mutex_lock(rpt->rt_lock); ABT_cond_signal(rpt->rt_global_dtx_wait_cond); ABT_mutex_unlock(rpt->rt_lock); } - rpt->rt_global_dtx_resync_version = dst_iv->riv_global_dtx_resyc_version; } out: From 81e57d0ec818c1f117afa97418d1f126bc528952 Mon Sep 17 00:00:00 2001 From: Jeff Olivier Date: Mon, 21 Oct 2024 10:10:03 -0600 Subject: [PATCH 09/34] DAOS-16716 ci: Set reference build for PRs (#15337) Release branch PRs should use the release branch build instead of master branch build for NLT reference Signed-off-by: Jeff Olivier --- Jenkinsfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Jenkinsfile b/Jenkinsfile index 63c2dc28ae1..13ccc512895 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -801,6 +801,7 @@ pipeline { unitTestPost artifacts: ['nlt_logs/'], testResults: 'nlt-junit.xml', always_script: 'ci/unit/test_nlt_post.sh', + referenceJobName: 'daos-stack/daos/release%252F2.6', valgrind_stash: 'el8-gcc-nlt-memcheck' recordIssues enabledForFailure: true, failOnError: false, From c8213799c4c14ad23ab6ef87c386b5e1d16ec089 Mon Sep 17 00:00:00 2001 From: Nasf-Fan Date: Tue, 22 Oct 2024 02:14:44 +0800 Subject: [PATCH 10/34] DAOS-16329 chk: maintenance mode after checking pool with dryrun - b26 (#14985) Sometimes, after system shutdown unexpectedly, the users may expect to check their critical data under some kind of maintenance mode. Under such mode, no user data can be modified or moved or aggregated. That will guarantee no further potential (DAOS logic caused) damage can happen during the check. For such purpose, we will enhance current DAOS CR logic with --dryrun option to allow the pool (after check) to be opened as immutable with disabling some mechanism that may potentially cause data modification or movement (such as rebuild or aggregation). Under such mode, if client wants to connect to the pool, the read-only option must be specified. Similarly for opening container in such pool. Signed-off-by: Fan Yong --- src/chk/chk_common.c | 2 +- src/chk/chk_engine.c | 13 +-- src/chk/chk_internal.h | 1 + src/client/dfs/cont.c | 4 +- src/container/srv_container.c | 32 +++++-- src/container/srv_target.c | 10 ++- src/include/daos/container.h | 4 +- src/include/daos_cont.h | 3 + src/include/daos_srv/pool.h | 7 +- src/include/daos_srv/security.h | 15 +++- src/mgmt/srv_target.c | 2 +- src/pool/srv_internal.h | 6 ++ src/pool/srv_pool.c | 82 +++++++++++++----- src/pool/srv_target.c | 16 ++-- src/rebuild/srv.c | 2 +- src/security/srv_acl.c | 8 +- src/tests/suite/daos_cr.c | 144 ++++++++++++++++++++++++++++++++ 17 files changed, 297 insertions(+), 54 deletions(-) diff --git a/src/chk/chk_common.c b/src/chk/chk_common.c index ace1c736791..39821ec6328 100644 --- a/src/chk/chk_common.c +++ b/src/chk/chk_common.c @@ -403,7 +403,7 @@ chk_pool_restart_svc(struct chk_pool_rec *cpr) if (cpr->cpr_started) chk_pool_shutdown(cpr, true); - rc = ds_pool_start_after_check(cpr->cpr_uuid); + rc = ds_pool_start_after_check(cpr->cpr_uuid, cpr->cpr_immutable); if (rc != 0) { D_WARN("Cannot start full PS for "DF_UUIDF" after CR check: "DF_RC"\n", DP_UUID(cpr->cpr_uuid), DP_RC(rc)); diff --git a/src/chk/chk_engine.c b/src/chk/chk_engine.c index 56e6da3ad9b..ad61af851ce 100644 --- a/src/chk/chk_engine.c +++ b/src/chk/chk_engine.c @@ -1797,10 +1797,8 @@ chk_engine_pool_ult(void *args) } rc = chk_engine_cont_cleanup(cpr, svc, &aggregator); - if (rc != 0) - goto out; - - rc = ds_pool_svc_schedule_reconf(svc); + if (rc == 0 && !cpr->cpr_immutable) + rc = ds_pool_svc_schedule_reconf(svc); out: chk_engine_cont_list_fini(&aggregator); @@ -2113,6 +2111,11 @@ chk_engine_start_post(struct chk_instance *ins) if (pool_cbk->cb_phase == CHK__CHECK_SCAN_PHASE__CSP_DONE) continue; + if (ins->ci_prop.cp_flags & CHK__CHECK_FLAG__CF_DRYRUN) + cpr->cpr_immutable = 1; + else + cpr->cpr_immutable = 0; + if (phase > pool_cbk->cb_phase) phase = pool_cbk->cb_phase; @@ -2950,7 +2953,7 @@ chk_engine_pool_start(uint64_t gen, uuid_t uuid, uint32_t phase, uint32_t flags) cbk = &cpr->cpr_bk; chk_pool_get(cpr); - rc = ds_pool_start(uuid, false); + rc = ds_pool_start(uuid, false, cpr->cpr_immutable); if (rc != 0) D_GOTO(put, rc = (rc == -DER_NONEXIST ? 1 : rc)); diff --git a/src/chk/chk_internal.h b/src/chk/chk_internal.h index 86868e305ee..9ab16b060b3 100644 --- a/src/chk/chk_internal.h +++ b/src/chk/chk_internal.h @@ -596,6 +596,7 @@ struct chk_pool_rec { cpr_stop:1, cpr_done:1, cpr_skip:1, + cpr_immutable:1, cpr_dangling:1, cpr_for_orphan:1, cpr_notified_exit:1, diff --git a/src/client/dfs/cont.c b/src/client/dfs/cont.c index 910e1819aa3..27822914056 100644 --- a/src/client/dfs/cont.c +++ b/src/client/dfs/cont.c @@ -970,7 +970,9 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char * out_snap: D_FREE(oit_args); epr.epr_hi = epr.epr_lo = snap_epoch; - rc2 = daos_cont_destroy_snap(coh, epr, NULL); + rc2 = daos_cont_destroy_snap(coh, epr, NULL); + if (rc2 != 0) + D_ERROR("Failed to destroy OID table: " DF_RC "\n", DP_RC(rc2)); if (rc == 0) rc = daos_der2errno(rc2); out_dfs: diff --git a/src/container/srv_container.c b/src/container/srv_container.c index 372da43afe4..91d87d9b978 100644 --- a/src/container/srv_container.c +++ b/src/container/srv_container.c @@ -1555,9 +1555,9 @@ cont_destroy(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, * - Users who can delete any container in the pool * - Users who have been given access to delete the specific container */ - if (!ds_sec_pool_can_delete_cont(pool_hdl->sph_sec_capas) && - !ds_sec_cont_can_delete(pool_hdl->sph_flags, &pool_hdl->sph_cred, - &owner, acl)) { + if (pool_hdl->sph_pool->sp_immutable || + (!ds_sec_pool_can_delete_cont(pool_hdl->sph_sec_capas) && + !ds_sec_cont_can_delete(pool_hdl->sph_flags, &pool_hdl->sph_cred, &owner, acl))) { D_ERROR(DF_CONT": permission denied to delete cont\n", DP_CONT(pool_hdl->sph_pool->sp_uuid, cont->c_uuid)); D_GOTO(out_prop, rc = -DER_NO_PERM); @@ -2253,6 +2253,15 @@ cont_open(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, cr goto out; } + if (pool_hdl->sph_pool->sp_immutable && (flags & DAOS_COO_IO_BASE_MASK) != DAOS_COO_RO) { + rc = -DER_NO_PERM; + D_ERROR(DF_UUID "/" DF_UUID "/" DF_UUID ": failed to open the immutable " + "container with flags " DF_X64 ", sec_capas " DF_X64 ": " DF_RC "\n", + DP_UUID(cont->c_svc->cs_pool_uuid), DP_UUID(pool_hdl->sph_uuid), + DP_UUID(cont->c_uuid), flags, pool_hdl->sph_sec_capas, DP_RC(rc)); + goto out; + } + /* * Need props to check for pool redundancy requirements and access * control. @@ -2274,6 +2283,11 @@ cont_open(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, cr D_GOTO(out, rc); } + D_DEBUG(DB_MD, DF_UUID "/" DF_UUID "/" DF_UUID ": opening container with flags " + DF_X64", sec_capas " DF_X64 "/" DF_X64 "\n", + DP_UUID(cont->c_svc->cs_pool_uuid), DP_UUID(pool_hdl->sph_uuid), + DP_UUID(cont->c_uuid), flags, pool_hdl->sph_sec_capas, sec_capas); + if ((flags & DAOS_COO_EVICT_ALL) && !ds_sec_cont_can_evict_all(sec_capas)) { D_ERROR(DF_CONT": permission denied evicting all handles\n", DP_CONT(cont->c_svc->cs_pool_uuid, cont->c_uuid)); @@ -2282,11 +2296,15 @@ cont_open(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, cr goto out; } - if ((flags & DAOS_COO_EX) && !ds_sec_cont_can_open_ex(sec_capas)) { - D_ERROR(DF_CONT": permission denied opening exclusively\n", - DP_CONT(cont->c_svc->cs_pool_uuid, cont->c_uuid)); - daos_prop_free(prop); + if (((flags & DAOS_COO_EX) && !ds_sec_cont_can_open_ex(sec_capas)) || + ((flags & DAOS_COO_RW) && !ds_sec_cont_can_modify(sec_capas))) { rc = -DER_NO_PERM; + D_ERROR(DF_UUID "/" DF_UUID "/" DF_UUID ": failed to open the container " + "with flags " DF_X64 ", capas " DF_X64 "/" DF_X64 ": " DF_RC "\n", + DP_UUID(cont->c_svc->cs_pool_uuid), DP_UUID(pool_hdl->sph_uuid), + DP_UUID(cont->c_uuid), flags, pool_hdl->sph_sec_capas, sec_capas, + DP_RC(rc)); + daos_prop_free(prop); goto out; } diff --git a/src/container/srv_target.c b/src/container/srv_target.c index b0f3b693580..a6d3d3c5286 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -935,7 +935,7 @@ cont_child_start(struct ds_pool_child *pool_child, const uuid_t co_uuid, cont_child->sc_stopping, cont_child->sc_destroying); rc = -DER_SHUTDOWN; } else if (!cont_child_started(cont_child)) { - if (!ds_pool_skip_for_check(pool_child->spc_pool)) { + if (!ds_pool_restricted(pool_child->spc_pool, false)) { rc = cont_start_agg(cont_child); if (rc != 0) goto out; @@ -1591,11 +1591,15 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid, * but for creating rebuild global container handle. */ D_ASSERT(hdl->sch_cont != NULL); + D_ASSERT(hdl->sch_cont->sc_pool != NULL); hdl->sch_cont->sc_open++; if (hdl->sch_cont->sc_open > 1) goto opened; + if (ds_pool_restricted(hdl->sch_cont->sc_pool->spc_pool, false)) + goto csum_init; + rc = dtx_cont_open(hdl->sch_cont); if (rc != 0) { D_ASSERTF(hdl->sch_cont->sc_open == 1, "Unexpected open count for cont " @@ -1623,10 +1627,8 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid, D_GOTO(err_dtx, rc); } - D_ASSERT(hdl->sch_cont != NULL); - D_ASSERT(hdl->sch_cont->sc_pool != NULL); +csum_init: rc = ds_cont_csummer_init(hdl->sch_cont); - if (rc != 0) D_GOTO(err_dtx, rc); } diff --git a/src/include/daos/container.h b/src/include/daos/container.h index 282dfb8ec49..57ef06f63a2 100644 --- a/src/include/daos/container.h +++ b/src/include/daos/container.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -130,7 +130,7 @@ dc_cont_open_flags_valid(uint64_t flags) f = flags; /* One and only one of DAOS_COO_RO, DAOS_COO_RW, and DAOS_COO_EX. */ - m = f & (DAOS_COO_RO | DAOS_COO_RW | DAOS_COO_EX); + m = f & DAOS_COO_IO_BASE_MASK; if (m != DAOS_COO_RO && m != DAOS_COO_RW && m != DAOS_COO_EX) return false; diff --git a/src/include/daos_cont.h b/src/include/daos_cont.h index c3bda9837d8..632eafe8130 100644 --- a/src/include/daos_cont.h +++ b/src/include/daos_cont.h @@ -79,6 +79,9 @@ extern "C" { /** Mask for all of the bits in the container open mode flag, DAOS_COO_ bits */ #define DAOS_COO_MASK ((1U << DAOS_COO_NBITS) - 1) +/** The basic IO mode: read-only, read-write or exclusively read-write. */ +#define DAOS_COO_IO_BASE_MASK (DAOS_COO_RO | DAOS_COO_RW | DAOS_COO_EX) + /** Maximum length for container hints */ #define DAOS_CONT_HINT_MAX_LEN 128 diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index 6cbe3873f0a..de22d55ed5d 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -84,6 +84,7 @@ struct ds_pool { uuid_t sp_srv_pool_hdl; uint32_t sp_stopping:1, sp_cr_checked:1, + sp_immutable:1, sp_fetch_hdls:1, sp_need_discard:1, sp_disable_rebuild:1; @@ -277,9 +278,9 @@ int ds_pool_tgt_finish_rebuild(uuid_t pool_uuid, struct pool_target_id_list *lis int ds_pool_tgt_map_update(struct ds_pool *pool, struct pool_buf *buf, unsigned int map_version); -bool ds_pool_skip_for_check(struct ds_pool *pool); -int ds_pool_start_after_check(uuid_t uuid); -int ds_pool_start(uuid_t uuid, bool aft_chk); +bool ds_pool_restricted(struct ds_pool *pool, bool immutable); +int ds_pool_start_after_check(uuid_t uuid, bool immutable); +int ds_pool_start(uuid_t uuid, bool aft_chk, bool immutable); int ds_pool_stop(uuid_t uuid); int dsc_pool_svc_extend(uuid_t pool_uuid, d_rank_list_t *svc_ranks, uint64_t deadline, int ntargets, const d_rank_list_t *rank_list, int ndomains, const uint32_t *domains); diff --git a/src/include/daos_srv/security.h b/src/include/daos_srv/security.h index 203b9b548ee..28fe83852b3 100644 --- a/src/include/daos_srv/security.h +++ b/src/include/daos_srv/security.h @@ -1,5 +1,5 @@ /* - * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -304,6 +304,19 @@ ds_sec_cont_can_open_ex(uint64_t cont_capas); bool ds_sec_cont_can_evict_all(uint64_t cont_capas); +/** + * Determine if the container can be modified based on the container security + * capabilities. + * + * \param[in] cont_capas Capability bits acquired via + * ds_sec_cont_get_capabilities + * + * \return True Access allowed + * False Access denied + */ +bool +ds_sec_cont_can_modify(uint64_t cont_capas); + /** * Get the security capabilities for a rebuild container handle created by the * DAOS server. diff --git a/src/mgmt/srv_target.c b/src/mgmt/srv_target.c index e1d6ee79bce..7975a2115d4 100644 --- a/src/mgmt/srv_target.c +++ b/src/mgmt/srv_target.c @@ -1212,7 +1212,7 @@ ds_mgmt_hdlr_tgt_create(crt_rpc_t *tc_req) tc_out->tc_ranks.ca_arrays = rank; tc_out->tc_ranks.ca_count = 1; - rc = ds_pool_start(tc_in->tc_pool_uuid, false); + rc = ds_pool_start(tc_in->tc_pool_uuid, false, false); if (rc) { D_ERROR(DF_UUID": failed to start pool: "DF_RC"\n", DP_UUID(tc_in->tc_pool_uuid), DP_RC(rc)); diff --git a/src/pool/srv_internal.h b/src/pool/srv_internal.h index 8f864c8c11a..612f7760fd1 100644 --- a/src/pool/srv_internal.h +++ b/src/pool/srv_internal.h @@ -57,6 +57,12 @@ pool_tls_get() return tls; } +static inline bool +ds_pool_skip_for_check(struct ds_pool *pool) +{ + return engine_in_check() && !pool->sp_cr_checked; +} + struct pool_iv_map { d_rank_t piv_master_rank; uint32_t piv_pool_map_ver; diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c index 2d80088d4b6..6e3a01379fa 100644 --- a/src/pool/srv_pool.c +++ b/src/pool/srv_pool.c @@ -1675,7 +1675,7 @@ init_events(struct pool_svc *svc) D_ASSERT(!events->pse_stop); D_ASSERT(!events->pse_paused); - if (!ds_pool_skip_for_check(svc->ps_pool)) { + if (!ds_pool_restricted(svc->ps_pool, false)) { rc = crt_register_event_cb(ds_pool_crt_event_cb, svc); if (rc != 0) { D_ERROR(DF_UUID": failed to register event callback: "DF_RC"\n", @@ -1706,7 +1706,7 @@ init_events(struct pool_svc *svc) return 0; err_cb: - if (!ds_pool_skip_for_check(svc->ps_pool)) + if (!ds_pool_restricted(svc->ps_pool, false)) crt_unregister_event_cb(ds_pool_crt_event_cb, svc); if (events->pse_pending != NULL) free_event_set(&events->pse_pending); @@ -1721,7 +1721,7 @@ fini_events(struct pool_svc *svc) D_ASSERT(events->pse_handler != ABT_THREAD_NULL); - if (!ds_pool_skip_for_check(svc->ps_pool)) + if (!ds_pool_restricted(svc->ps_pool, false)) crt_unregister_event_cb(ds_pool_crt_event_cb, svc); ABT_mutex_lock(events->pse_mutex); @@ -2565,6 +2565,11 @@ int ds_pool_failed_lookup(uuid_t uuid) return 0; } +struct pool_start_args { + bool psa_aft_chk; + bool psa_immutable; +}; + /* * Try to start the pool. Continue the iteration upon errors as other pools may * still be able to work. @@ -2572,13 +2577,26 @@ int ds_pool_failed_lookup(uuid_t uuid) static int start_one(uuid_t uuid, void *varg) { - int rc; + struct pool_start_args *psa = varg; + bool aft_chk; + bool immutable; + int rc; + + if (psa != NULL) { + aft_chk = psa->psa_aft_chk; + immutable = psa->psa_immutable; + } else { + aft_chk = false; + immutable = false; + } - D_DEBUG(DB_MD, DF_UUID ": starting pool\n", DP_UUID(uuid)); + D_DEBUG(DB_MD, DF_UUID ": starting pool, aft_chk %s, immutable %s\n", + DP_UUID(uuid), aft_chk ? "yes" : "no", immutable ? "yes" : "no"); - rc = ds_pool_start(uuid, varg != NULL ? true : false); + rc = ds_pool_start(uuid, aft_chk, immutable); if (rc != 0) { - DL_ERROR(rc, DF_UUID ": failed to start pool", DP_UUID(uuid)); + DL_ERROR(rc, DF_UUID ": failed to start pool, aft_chk %s, immutable %s", + DP_UUID(uuid), aft_chk ? "yes" : "no", immutable ? "yes" : "no"); ds_pool_failed_add(uuid, rc); } @@ -2597,12 +2615,27 @@ pool_start_all(void *arg) DP_RC(rc)); } +bool +ds_pool_restricted(struct ds_pool *pool, bool immutable) +{ + if (ds_pool_skip_for_check(pool)) + return true; + + if (pool->sp_immutable && !immutable) + return true; + + return false; +} + int -ds_pool_start_after_check(uuid_t uuid) +ds_pool_start_after_check(uuid_t uuid, bool immutable) { - bool aft_chk = true; + struct pool_start_args psa; - return start_one(uuid, &aft_chk); + psa.psa_aft_chk = true; + psa.psa_immutable = immutable; + + return start_one(uuid, &psa); } /* Note that this function is currently called from the main xstream. */ @@ -3820,7 +3853,6 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version) &ds_pool_prop_connectable, &value); if (rc != 0) goto out_lock; - D_DEBUG(DB_MD, DF_UUID ": connectable=%u\n", DP_UUID(in->pci_op.pi_uuid), connectable); if (!connectable) { D_ERROR(DF_UUID": being destroyed, not accepting connections\n", DP_UUID(in->pci_op.pi_uuid)); @@ -3829,12 +3861,21 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version) /* * NOTE: Under check mode, there is a small race window between ds_pool_mark_connectable() - * the PS restart for full service. If some client tries to connect the pool during + * and PS restart with full service. If some client tries to connect the pool during * such internal, it will get -DER_BUSY temporarily. */ if (unlikely(ds_pool_skip_for_check(svc->ps_pool))) { - D_ERROR(DF_UUID" is not ready for full pool service\n", DP_UUID(in->pci_op.pi_uuid)); - D_GOTO(out_lock, rc = -DER_BUSY); + rc = -DER_BUSY; + D_ERROR(DF_UUID " is not ready for full pool service: " DF_RC "\n", + DP_UUID(in->pci_op.pi_uuid), DP_RC(rc)); + goto out_lock; + } + + if (svc->ps_pool->sp_immutable && flags != DAOS_PC_RO) { + rc = -DER_NO_PERM; + D_ERROR(DF_UUID " failed to connect immutable pool, flags " DF_X64 ": " DF_RC "\n", + DP_UUID(in->pci_op.pi_uuid), flags, DP_RC(rc)); + goto out_lock; } /* Check existing pool handles. */ @@ -3997,6 +4038,11 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version) } } + D_DEBUG(DB_MD, DF_UUID "/" DF_UUID ": connecting to %s pool with flags " + DF_X64", sec_capas " DF_X64 "\n", + DP_UUID(in->pci_op.pi_uuid), DP_UUID(in->pci_op.pi_hdl), + svc->ps_pool->sp_immutable ? "immutable" : "regular", flags, sec_capas); + rc = pool_connect_iv_dist(svc, in->pci_op.pi_hdl, flags, sec_capas, credp, global_ver, obj_layout_ver); if (rc == 0 && DAOS_FAIL_CHECK(DAOS_POOL_CONNECT_FAIL_CORPC)) { @@ -6522,7 +6568,7 @@ pool_svc_schedule(struct pool_svc *svc, struct pool_svc_sched *sched, void (*fun D_DEBUG(DB_MD, DF_UUID": begin\n", DP_UUID(svc->ps_uuid)); - if (ds_pool_skip_for_check(svc->ps_pool)) { + if (ds_pool_restricted(svc->ps_pool, false)) { D_DEBUG(DB_MD, DF_UUID": end: skip in check mode\n", DP_UUID(svc->ps_uuid)); return -DER_OP_CANCELED; } @@ -8761,9 +8807,3 @@ ds_pool_svc_upgrade_vos_pool(struct ds_pool *pool) ds_rsvc_put(rsvc); return rc; } - -bool -ds_pool_skip_for_check(struct ds_pool *pool) -{ - return engine_in_check() && !pool->sp_cr_checked; -} diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index cfa837e8b2a..113fb757fd9 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -505,7 +505,7 @@ pool_child_start(struct ds_pool_child *child, bool recreate) goto done; } - if (!ds_pool_skip_for_check(child->spc_pool)) { + if (!ds_pool_restricted(child->spc_pool, false)) { rc = start_gc_ult(child); if (rc != 0) goto out_close; @@ -1128,7 +1128,7 @@ pool_fetch_hdls_ult_abort(struct ds_pool *pool) * till ds_pool_stop. Only for mgmt and pool modules. */ int -ds_pool_start(uuid_t uuid, bool aft_chk) +ds_pool_start(uuid_t uuid, bool aft_chk, bool immutable) { struct ds_pool *pool; struct daos_llink *llink; @@ -1185,6 +1185,11 @@ ds_pool_start(uuid_t uuid, bool aft_chk) else pool->sp_cr_checked = 0; + if (immutable) + pool->sp_immutable = 1; + else + pool->sp_immutable = 0; + rc = pool_child_add_all(pool); if (rc != 0) goto failure_pool; @@ -1199,7 +1204,9 @@ ds_pool_start(uuid_t uuid, bool aft_chk) } pool->sp_fetch_hdls = 1; + } + if (!ds_pool_restricted(pool, false)) { rc = ds_pool_start_ec_eph_query_ult(pool); if (rc != 0) { D_ERROR(DF_UUID": failed to start ec eph query ult: "DF_RC"\n", @@ -1882,13 +1889,10 @@ ds_pool_tgt_map_update(struct ds_pool *pool, struct pool_buf *buf, DP_UUID(pool->sp_uuid), map_version_before, map_version); } - if (map_updated) { + if (map_updated && !ds_pool_restricted(pool, false)) { struct dtx_scan_args *arg; int ret; - if (ds_pool_skip_for_check(pool)) - D_GOTO(out, rc = 0); - D_ALLOC_PTR(arg); if (arg == NULL) D_GOTO(out, rc = -DER_NOMEM); diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index b1f722b8254..5c463af9e90 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -1941,7 +1941,7 @@ ds_rebuild_schedule(struct ds_pool *pool, uint32_t map_ver, return 0; } - if (ds_pool_skip_for_check(pool)) { + if (ds_pool_restricted(pool, false)) { D_DEBUG(DB_REBUILD, DF_UUID" skip rebuild under check mode\n", DP_UUID(pool->sp_uuid)); return 0; diff --git a/src/security/srv_acl.c b/src/security/srv_acl.c index ad3c0107e3b..8f05216e971 100644 --- a/src/security/srv_acl.c +++ b/src/security/srv_acl.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -754,6 +754,12 @@ ds_sec_cont_can_evict_all(uint64_t cont_capas) return (cont_capas & CONT_CAPA_EVICT_ALL) != 0; } +bool +ds_sec_cont_can_modify(uint64_t cont_capas) +{ + return (cont_capas & CONT_CAPAS_W_MASK) != 0; +} + uint64_t ds_sec_get_rebuild_cont_capabilities(void) { diff --git a/src/tests/suite/daos_cr.c b/src/tests/suite/daos_cr.c index 3827a0eda6a..4c981f29645 100644 --- a/src/tests/suite/daos_cr.c +++ b/src/tests/suite/daos_cr.c @@ -18,6 +18,8 @@ #include +#include "daos_iotest.h" + /* * Will enable accurate query result verification after DAOS-13520 resolved. * #define CR_ACCURATE_QUERY_RESULT 1 @@ -3573,6 +3575,146 @@ cr_handle_fail_pool2(void **state) cr_cleanup(arg, &pool, 1); } +#define CR_IO_SIZE 16 + +/* + * 1. Create pool and container without inconsistency. + * 2. Write something to the container. + * 3. Start checker with --dry-run option. + * 4. Query checker, it should be completed without any inconsistency reported. + * 5. Verify the pool only can be connected with read-only permission. + * 6. Verify the container only can be opened with read-only permission. + * 7. Verify the object only can be opened with read-only permission. + * 8. Verify the object cannot be written. + * 9. Read former wrote data and verify its correctness. + * 10. Switch to normal mode and cleanup. + */ +static void +cr_maintenance_mode(void **state) +{ + test_arg_t *arg = *state; + struct test_pool pool = { 0 }; + struct test_cont cont = { 0 }; + daos_handle_t coh; + daos_obj_id_t oid; + struct ioreq req; + const char *dkey = "cr_dkey"; + const char *akey = "cr_akey"; + char update_buf[CR_IO_SIZE]; + char fetch_buf[CR_IO_SIZE]; + struct daos_check_info dci = { 0 }; + int rc; + + print_message("CR28: maintenance mode after dry-run check\n"); + + rc = cr_pool_create(state, &pool, true, TCC_NONE); + assert_rc_equal(rc, 0); + + rc = cr_cont_create(state, &pool, &cont, 0); + assert_rc_equal(rc, 0); + + rc = daos_cont_open(pool.poh, cont.label, DAOS_COO_RW, &coh, NULL, NULL); + assert_rc_equal(rc, 0); + + oid = daos_test_oid_gen(coh, OC_SX, 0, 0, arg->myrank); + arg->fail_loc = DAOS_DTX_COMMIT_SYNC | DAOS_FAIL_ALWAYS; + arg->async = 0; + ioreq_init(&req, coh, oid, DAOS_IOD_SINGLE, arg); + dts_buf_render(update_buf, CR_IO_SIZE); + + print_message("Generate some data.\n"); + + insert_single(dkey, akey, 0, update_buf, CR_IO_SIZE, DAOS_TX_NONE, &req); + daos_fail_loc_set(0); + + rc = daos_obj_close(req.oh, NULL); + assert_rc_equal(rc, 0); + + rc = daos_cont_close(coh, NULL); + assert_rc_equal(rc, 0); + + rc = daos_pool_disconnect(pool.poh, NULL); + assert_rc_equal(rc, 0); + + rc = cr_system_stop(false); + assert_rc_equal(rc, 0); + + rc = cr_mode_switch(true); + assert_rc_equal(rc, 0); + + rc = cr_check_start(TCSF_DRYRUN, 0, NULL, NULL); + assert_rc_equal(rc, 0); + + cr_ins_wait(1, &pool.pool_uuid, &dci); + + rc = cr_ins_verify(&dci, TCIS_COMPLETED); + assert_rc_equal(rc, 0); + + rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_CHECKED, 0, NULL, NULL, NULL); + assert_rc_equal(rc, 0); + + print_message("Verify the pool only can be connected as RDONLY mode.\n"); + + rc = daos_pool_connect(pool.pool_str, arg->group, DAOS_PC_RW, &pool.poh, NULL, NULL); + assert_rc_equal(rc, -DER_NO_PERM); + + rc = daos_pool_connect(pool.pool_str, arg->group, DAOS_PC_RO, &pool.poh, NULL, NULL); + assert_rc_equal(rc, 0); + + print_message("Verify the container only can be opened as RDONLY mode.\n"); + + rc = daos_cont_open(pool.poh, cont.label, DAOS_COO_RW, &coh, NULL, NULL); + assert_rc_equal(rc, -DER_NO_PERM); + + rc = daos_cont_open(pool.poh, cont.label, DAOS_COO_RO, &coh, NULL, NULL); + assert_rc_equal(rc, 0); + +#if 0 + /* + * NOTE: Currently, DAOS security check is server-side logic, but object open may not + * talk with server. So related permission check will not be done until read or + * write really happened. If not consider exclusively open and cache case, then + * it seems fine; otherwise, need some enhancement in future. + */ + + print_message("Verify the object only can be opened as RDONLY mode.\n"); + + rc = daos_obj_open(coh, oid, DAOS_OO_RW, &req.oh, NULL); + assert_rc_equal(rc, -DER_NO_PERM); +#endif + + rc = daos_obj_open(coh, oid, DAOS_OO_RO, &req.oh, NULL); + assert_rc_equal(rc, 0); + + print_message("Verify the object cannot be modified.\n"); + + req.arg->expect_result = -DER_NO_PERM; + insert_single(dkey, akey, 100, update_buf, CR_IO_SIZE, DAOS_TX_NONE, &req); + daos_fail_loc_set(0); + + print_message("Verify former wrote data.\n"); + + req.arg->expect_result = 0; + lookup_single(dkey, akey, 0, fetch_buf, CR_IO_SIZE, DAOS_TX_NONE, &req); + assert_int_equal(req.iod[0].iod_size, CR_IO_SIZE); + assert_memory_equal(update_buf, fetch_buf, CR_IO_SIZE); + + rc = daos_cont_close(coh, NULL); + assert_rc_equal(rc, 0); + + rc = daos_pool_disconnect(pool.poh, NULL); + assert_rc_equal(rc, 0); + + rc = cr_mode_switch(false); + assert_rc_equal(rc, 0); + + rc = cr_system_start(); + assert_rc_equal(rc, 0); + + cr_dci_fini(&dci); + cr_cleanup(arg, &pool, 1); +} + static const struct CMUnitTest cr_tests[] = { { "CR1: start checker for specified pools", cr_start_specified, async_disable, test_case_teardown}, @@ -3628,6 +3770,8 @@ static const struct CMUnitTest cr_tests[] = { cr_handle_fail_pool1, async_disable, test_case_teardown}, { "CR27: handle the pool if some engine failed to report some pool service", cr_handle_fail_pool2, async_disable, test_case_teardown}, + { "CR28: maintenance mode after dry-run check", + cr_maintenance_mode, async_disable, test_case_teardown}, }; static int From b913d3e9685fe8b31b046d197f66477460053e68 Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Mon, 21 Oct 2024 14:19:00 -0400 Subject: [PATCH 11/34] DAOS-16265 test: Fix erasurecode/rebuild_fio.py out of space (#15020) (#15340) Prevent accumulating large server log files caused by temporarily enabling the DEBUG log mask while creating or destroying pools. Signed-off-by: Phil Henderson --- .../ftest/erasurecode/multiple_failure.yaml | 1 + src/tests/ftest/erasurecode/rebuild_fio.yaml | 1 + src/tests/ftest/util/apricot/apricot/test.py | 22 +++++++++++++++---- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/tests/ftest/erasurecode/multiple_failure.yaml b/src/tests/ftest/erasurecode/multiple_failure.yaml index 78f132474b5..95aab541329 100644 --- a/src/tests/ftest/erasurecode/multiple_failure.yaml +++ b/src/tests/ftest/erasurecode/multiple_failure.yaml @@ -25,6 +25,7 @@ server_config: storage: auto pool: size: 93% + set_logmasks: False container: type: POSIX control_method: daos diff --git a/src/tests/ftest/erasurecode/rebuild_fio.yaml b/src/tests/ftest/erasurecode/rebuild_fio.yaml index a895c356707..a3539d86579 100644 --- a/src/tests/ftest/erasurecode/rebuild_fio.yaml +++ b/src/tests/ftest/erasurecode/rebuild_fio.yaml @@ -39,6 +39,7 @@ pool: aggregation: threshold: 50000000 aggr_timeout: 180 + set_logmasks: False container: type: POSIX control_method: daos diff --git a/src/tests/ftest/util/apricot/apricot/test.py b/src/tests/ftest/util/apricot/apricot/test.py index 42e05937f37..563ff7adece 100644 --- a/src/tests/ftest/util/apricot/apricot/test.py +++ b/src/tests/ftest/util/apricot/apricot/test.py @@ -643,6 +643,7 @@ def __init__(self, *args, **kwargs): self.setup_start_agents = True self.slurm_exclude_servers = False self.slurm_exclude_nodes = NodeSet() + self.max_test_dir_usage_check = 90 self.host_info = HostInfo() self.hostlist_servers = NodeSet() self.hostlist_clients = NodeSet() @@ -693,6 +694,11 @@ def setUp(self): self.slurm_exclude_servers = self.params.get( "slurm_exclude_servers", "/run/setup/*", self.slurm_exclude_servers) + # Max test directory usage percentage - when exceeded will display sizes of files in the + # test directory + self.max_test_dir_usage_check = self.params.get( + "max_test_dir_usage_check", "/run/setup/*", self.max_test_dir_usage_check) + # The server config name should be obtained from each ServerManager # object, but some tests still use this TestWithServers attribute. self.server_group = self.params.get("name", "/run/server_config/*", "daos_server") @@ -765,12 +771,20 @@ def setUp(self): # List common test directory contents before running the test self.log.info("-" * 100) - self.log.debug("Common test directory (%s) contents:", os.path.dirname(self.test_dir)) + self.log.debug( + "Common test directory (%s) contents (check > %s%%):", + os.path.dirname(self.test_dir), self.max_test_dir_usage_check) all_hosts = include_local_host(self.host_info.all_hosts) test_dir_parent = os.path.dirname(self.test_dir) - result = run_remote(self.log, all_hosts, f"df -h {test_dir_parent}") - if int(max(re.findall(r" ([\d+])% ", result.joined_stdout) + ["0"])) > 90: - run_remote(self.log, all_hosts, f"du -sh {test_dir_parent}/*") + _result = run_remote(self.log, all_hosts, f"df -h {test_dir_parent}") + _details = NodeSet() + for _host, _stdout in _result.all_stdout.items(): + _test_dir_usage = re.findall(r"\s+([\d]+)%\s+", _stdout) + _test_dir_usage_int = int(max(_test_dir_usage + ["0"])) + if _test_dir_usage_int > self.max_test_dir_usage_check: + _details.add(_host) + if _details: + run_remote(self.log, _details, f"du -sh {test_dir_parent}/*") self.log.info("-" * 100) if not self.start_servers_once or self.name.uid == 1: From ffa1c9d952fb62f72524abec119e7d829bce5633 Mon Sep 17 00:00:00 2001 From: Michael MacDonald Date: Tue, 22 Oct 2024 11:38:08 -0400 Subject: [PATCH 12/34] DAOS-16693 telemetry: Avoid race between init/read (#15306) (#15322) In rare cases, a reader may attempt to access a telemetry node after it has been added to the tree, but before it has been fully initialized. Use an atomic to prevent reads before the initialization has completed. Unlucky readers will get a -DER_AGAIN instead of crashing. Signed-off-by: Michael MacDonald --- src/control/lib/telemetry/counter.go | 20 ++++++++----- src/control/lib/telemetry/duration.go | 20 ++++++++----- src/control/lib/telemetry/gauge.go | 38 ++++++++++++++---------- src/control/lib/telemetry/snapshot.go | 20 ++++++++----- src/control/lib/telemetry/telemetry.go | 12 ++++++++ src/control/lib/telemetry/timestamp.go | 22 +++++++++----- src/gurt/telemetry.c | 41 ++++++++++++++++++++++++++ src/include/gurt/telemetry_common.h | 3 ++ 8 files changed, 129 insertions(+), 47 deletions(-) diff --git a/src/control/lib/telemetry/counter.go b/src/control/lib/telemetry/counter.go index 81549a32daf..e6e59a8d0ea 100644 --- a/src/control/lib/telemetry/counter.go +++ b/src/control/lib/telemetry/counter.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -37,18 +37,22 @@ func (c *Counter) FloatValue() float64 { } func (c *Counter) Value() uint64 { + ctrVal := BadUintVal if c.handle == nil || c.node == nil { - return BadUintVal + return ctrVal } - var val C.uint64_t - - res := C.d_tm_get_counter(c.handle.ctx, &val, c.node) - if res == C.DER_SUCCESS { - return uint64(val) + fetch := func() C.int { + var val C.uint64_t + res := C.d_tm_get_counter(c.handle.ctx, &val, c.node) + if res == C.DER_SUCCESS { + ctrVal = uint64(val) + } + return res } + c.fetchValWithRetry(fetch) - return BadUintVal + return ctrVal } func newCounter(hdl *handle, path string, name *string, node *C.struct_d_tm_node_t) *Counter { diff --git a/src/control/lib/telemetry/duration.go b/src/control/lib/telemetry/duration.go index 1f32125bc90..3cfd240bde7 100644 --- a/src/control/lib/telemetry/duration.go +++ b/src/control/lib/telemetry/duration.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -34,18 +34,22 @@ func (d *Duration) Type() MetricType { } func (d *Duration) Value() time.Duration { + durValue := BadDuration if d.handle == nil || d.node == nil { - return BadDuration + return durValue } - var tms C.struct_timespec - - res := C.d_tm_get_duration(d.handle.ctx, &tms, &d.stats, d.node) - if res == C.DER_SUCCESS { - return time.Duration(tms.tv_sec)*time.Second + time.Duration(tms.tv_nsec)*time.Nanosecond + fetch := func() C.int { + var tms C.struct_timespec + res := C.d_tm_get_duration(d.handle.ctx, &tms, &d.stats, d.node) + if res == C.DER_SUCCESS { + durValue = time.Duration(tms.tv_sec)*time.Second + time.Duration(tms.tv_nsec)*time.Nanosecond + } + return res } + d.fetchValWithRetry(fetch) - return BadDuration + return durValue } func (d *Duration) FloatValue() float64 { diff --git a/src/control/lib/telemetry/gauge.go b/src/control/lib/telemetry/gauge.go index ea84ff90504..93db24ab9fc 100644 --- a/src/control/lib/telemetry/gauge.go +++ b/src/control/lib/telemetry/gauge.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -41,18 +41,22 @@ func (g *Gauge) FloatValue() float64 { // Value returns the value as an unsigned integer. func (g *Gauge) Value() uint64 { + gaugeVal := BadUintVal if g.handle == nil || g.node == nil { - return BadUintVal + return gaugeVal } - var val C.uint64_t - - res := C.d_tm_get_gauge(g.handle.ctx, &val, nil, g.node) - if res == C.DER_SUCCESS { - return uint64(val) + fetch := func() C.int { + var val C.uint64_t + res := C.d_tm_get_gauge(g.handle.ctx, &val, nil, g.node) + if res == C.DER_SUCCESS { + gaugeVal = uint64(val) + } + return res } + g.fetchValWithRetry(fetch) - return BadUintVal + return gaugeVal } func newGauge(hdl *handle, path string, name *string, node *C.struct_d_tm_node_t) *Gauge { @@ -103,18 +107,22 @@ func (g *StatsGauge) FloatValue() float64 { // Value returns the gauge value as an unsigned integer. func (g *StatsGauge) Value() uint64 { + gaugeVal := BadUintVal if g.handle == nil || g.node == nil { - return BadUintVal + return gaugeVal } - var val C.uint64_t - - res := C.d_tm_get_gauge(g.handle.ctx, &val, &g.stats, g.node) - if res == C.DER_SUCCESS { - return uint64(val) + fetch := func() C.int { + var val C.uint64_t + res := C.d_tm_get_gauge(g.handle.ctx, &val, &g.stats, g.node) + if res == C.DER_SUCCESS { + gaugeVal = uint64(val) + } + return res } + g.fetchValWithRetry(fetch) - return BadUintVal + return gaugeVal } func newStatsGauge(hdl *handle, path string, name *string, node *C.struct_d_tm_node_t) *StatsGauge { diff --git a/src/control/lib/telemetry/snapshot.go b/src/control/lib/telemetry/snapshot.go index 2ffa23296c3..5b2af9f0747 100644 --- a/src/control/lib/telemetry/snapshot.go +++ b/src/control/lib/telemetry/snapshot.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -34,18 +34,22 @@ func (s *Snapshot) Type() MetricType { } func (s *Snapshot) Value() time.Time { + timeVal := time.Time{} // zero val if s.handle == nil || s.node == nil { - return time.Time{} + return timeVal } - var tms C.struct_timespec - - res := C.d_tm_get_timer_snapshot(s.handle.ctx, &tms, s.node) - if res == C.DER_SUCCESS { - return time.Unix(int64(tms.tv_sec), int64(tms.tv_nsec)) + fetch := func() C.int { + var tms C.struct_timespec + res := C.d_tm_get_timer_snapshot(s.handle.ctx, &tms, s.node) + if res == C.DER_SUCCESS { + timeVal = time.Unix(int64(tms.tv_sec), int64(tms.tv_nsec)) + } + return res } + s.fetchValWithRetry(fetch) - return time.Time{} + return timeVal } func (s *Snapshot) FloatValue() float64 { diff --git a/src/control/lib/telemetry/telemetry.go b/src/control/lib/telemetry/telemetry.go index bb0593240b6..479c41e2aab 100644 --- a/src/control/lib/telemetry/telemetry.go +++ b/src/control/lib/telemetry/telemetry.go @@ -84,6 +84,8 @@ const ( BadDuration = time.Duration(BadIntVal) PathSep = filepath.Separator + + maxFetchRetries = 1 ) type ( @@ -304,6 +306,16 @@ func (mb *metricBase) String() string { return strings.TrimSpace(string(buf[:bytes.Index(buf, []byte{0})])) } +func (mb *metricBase) fetchValWithRetry(fetchFn func() C.int) C.int { + var rc C.int + for i := 0; i < maxFetchRetries; i++ { + if rc = fetchFn(); rc == C.DER_SUCCESS { + return rc + } + } + return rc +} + func (sm *statsMetric) Min() uint64 { return uint64(sm.stats.dtm_min) } diff --git a/src/control/lib/telemetry/timestamp.go b/src/control/lib/telemetry/timestamp.go index 97ef5bb1ed9..c787aed488d 100644 --- a/src/control/lib/telemetry/timestamp.go +++ b/src/control/lib/telemetry/timestamp.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -34,16 +34,22 @@ func (t *Timestamp) Type() MetricType { } func (t *Timestamp) Value() time.Time { - zero := time.Time{} + timeVal := time.Time{} // zero val if t.handle == nil || t.node == nil { - return zero + return timeVal } - var clk C.time_t - res := C.d_tm_get_timestamp(t.handle.ctx, &clk, t.node) - if res == C.DER_SUCCESS { - return time.Unix(int64(clk), 0) + + fetch := func() C.int { + var clk C.time_t + res := C.d_tm_get_timestamp(t.handle.ctx, &clk, t.node) + if res == C.DER_SUCCESS { + timeVal = time.Unix(int64(clk), 0) + } + return res } - return zero + t.fetchValWithRetry(fetch) + + return timeVal } // FloatValue converts the timestamp to time in seconds since the UNIX epoch. diff --git a/src/gurt/telemetry.c b/src/gurt/telemetry.c index 8f08192d072..be7762184a9 100644 --- a/src/gurt/telemetry.c +++ b/src/gurt/telemetry.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -624,6 +625,7 @@ alloc_node(struct d_tm_shmem_hdr *shmem, struct d_tm_node_t **newnode, goto out; tmp->dtn_metric = NULL; tmp->dtn_sibling = NULL; + atomic_store_relaxed(&tmp->dtn_readable, false); *newnode = node; out: @@ -2409,6 +2411,9 @@ add_metric(struct d_tm_context *ctx, struct d_tm_node_t **node, int metric_type, pthread_mutexattr_destroy(&mattr); temp->dtn_protect = true; } + + atomic_store_relaxed(&temp->dtn_readable, true); + if (node != NULL) *node = temp; @@ -3090,6 +3095,15 @@ d_tm_try_del_ephemeral_dir(const char *fmt, ...) return rc; } +static bool +node_is_readable(struct d_tm_node_t *node) +{ + if (node == NULL) + return false; + + return atomic_load_relaxed(&node->dtn_readable); +} + /** * Creates histogram counters for the given node. It calculates the * extents of each bucket and creates counters at the path specified that @@ -3278,6 +3292,9 @@ d_tm_get_num_buckets(struct d_tm_context *ctx, if (ctx == NULL || histogram == NULL || node == NULL) return -DER_INVAL; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + rc = validate_node_ptr(ctx, node, &shmem); if (rc != 0) return rc; @@ -3341,6 +3358,9 @@ d_tm_get_bucket_range(struct d_tm_context *ctx, struct d_tm_bucket_t *bucket, if (rc != 0) return rc; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + if (!has_stats(node)) return -DER_OP_NOT_PERMITTED; @@ -3392,6 +3412,9 @@ d_tm_get_counter(struct d_tm_context *ctx, uint64_t *val, if (node->dtn_type != D_TM_COUNTER) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + /* "ctx == NULL" is server side fast version to read the counter. */ if (ctx == NULL) { metric_data = node->dtn_metric; @@ -3441,6 +3464,9 @@ d_tm_get_timestamp(struct d_tm_context *ctx, time_t *val, if (node->dtn_type != D_TM_TIMESTAMP) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data != NULL) { d_tm_node_lock(node); @@ -3470,6 +3496,9 @@ d_tm_get_meminfo(struct d_tm_context *ctx, struct d_tm_meminfo_t *meminfo, if (node->dtn_type != D_TM_MEMINFO) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data != NULL) { d_tm_node_lock(node); @@ -3513,6 +3542,9 @@ d_tm_get_timer_snapshot(struct d_tm_context *ctx, struct timespec *tms, if (!(node->dtn_type & D_TM_TIMER_SNAPSHOT)) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data != NULL) { d_tm_node_lock(node); @@ -3563,6 +3595,9 @@ d_tm_get_duration(struct d_tm_context *ctx, struct timespec *tms, if (!(node->dtn_type & D_TM_DURATION)) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data == NULL) return -DER_METRIC_NOT_FOUND; @@ -3628,6 +3663,9 @@ d_tm_get_gauge(struct d_tm_context *ctx, uint64_t *val, if (!is_gauge(node)) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data != NULL) { dtm_stats = conv_ptr(shmem, metric_data->dtm_stats); @@ -3700,6 +3738,9 @@ int d_tm_get_metadata(struct d_tm_context *ctx, char **desc, char **units, if (node->dtn_type == D_TM_DIRECTORY) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data != NULL) { d_tm_node_lock(node); diff --git a/src/include/gurt/telemetry_common.h b/src/include/gurt/telemetry_common.h index 4e7d3b3a02d..2068c1c0cce 100644 --- a/src/include/gurt/telemetry_common.h +++ b/src/include/gurt/telemetry_common.h @@ -8,6 +8,8 @@ #include +#include + #define D_TM_VERSION 1 #define D_TM_MAX_NAME_LEN 256 #define D_TM_MAX_DESC_LEN 128 @@ -236,6 +238,7 @@ struct d_tm_node_t { pthread_mutex_t dtn_lock; /** individual mutex */ struct d_tm_metric_t *dtn_metric; /** values */ bool dtn_protect; /** synchronized access */ + _Atomic bool dtn_readable; /** fully initialized and ready for reads */ }; struct d_tm_nodeList_t { From 42a0d35d815505ba0435e192933160b52e55c7c2 Mon Sep 17 00:00:00 2001 From: Alexander Oganezov Date: Tue, 22 Oct 2024 09:10:51 -0700 Subject: [PATCH 13/34] DAOS-16696 cart: Fix rc in error path (#15313) (#15357) - Fix rc in error path during ivo_on_update failure Required-githooks: true Signed-off-by: Alexander A Oganezov --- src/cart/crt_iv.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/cart/crt_iv.c b/src/cart/crt_iv.c index 603e565ac1f..bf8124a8a6a 100644 --- a/src/cart/crt_iv.c +++ b/src/cart/crt_iv.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -2911,8 +2911,12 @@ bulk_update_transfer_done_aux(const struct crt_bulk_cb_info *info) return rc; send_error: - rc = crt_bulk_free(cb_info->buc_bulk_hdl); + /* send back whatever error got us here */ output->rc = rc; + rc = crt_bulk_free(cb_info->buc_bulk_hdl); + if (rc != 0) + DL_ERROR(rc, "crt_bulk_free() failed"); + iv_ops->ivo_on_put(ivns_internal, &cb_info->buc_iv_value, cb_info->buc_user_priv); From 1ae3f29dcbebe9a4ae123302cc11b9dfac870c8f Mon Sep 17 00:00:00 2001 From: Nasf-Fan Date: Wed, 23 Oct 2024 00:57:32 +0800 Subject: [PATCH 14/34] DAOS-16574 vos: shrink DTX table blob size - b26 (#15220) (#15221) Use 4KB blob for committed DTX table and 16KB for active DTX table. It is more efficient for lower allocator and reduce the possibility of space allocation failure when space pressure. Simplify vos_dtx_commit logic and code cleanup. Signed-off-by: Fan Yong --- src/vos/vos_dtx.c | 101 +++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 54 deletions(-) diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index 1c60f781507..f1d69b54f4e 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -15,11 +15,10 @@ #include "vos_layout.h" #include "vos_internal.h" -/* 128 KB per SCM blob */ -#define DTX_BLOB_SIZE (1 << 17) -/** Ensure 16-bit signed int is sufficient to store record index */ -D_CASSERT((DTX_BLOB_SIZE / sizeof(struct vos_dtx_act_ent_df)) < (1 << 15)); -D_CASSERT((DTX_BLOB_SIZE / sizeof(struct vos_dtx_cmt_ent_df)) < (1 << 15)); +/* 16 KB blob for each active DTX blob */ +#define DTX_ACT_BLOB_SIZE (1 << 14) +/* 4 KB for committed DTX blob */ +#define DTX_CMT_BLOB_SIZE (1 << 12) #define DTX_ACT_BLOB_MAGIC 0x14130a2b #define DTX_CMT_BLOB_MAGIC 0x2502191c @@ -767,7 +766,7 @@ dtx_rec_release(struct vos_container *cont, struct vos_dtx_act_ent *dae, static int vos_dtx_commit_one(struct vos_container *cont, struct dtx_id *dti, daos_epoch_t epoch, daos_epoch_t cmt_time, struct vos_dtx_cmt_ent **dce_p, - struct vos_dtx_act_ent **dae_p, bool *rm_cos, bool *fatal) + struct vos_dtx_act_ent **dae_p, bool *rm_cos) { struct vos_dtx_act_ent *dae = NULL; struct vos_dtx_cmt_ent *dce = NULL; @@ -866,10 +865,8 @@ vos_dtx_commit_one(struct vos_container *cont, struct dtx_id *dti, daos_epoch_t goto out; rc = dtx_rec_release(cont, dae, false); - if (rc != 0) { - *fatal = true; + if (rc != 0) goto out; - } D_ASSERT(dae_p != NULL); *dae_p = dae; @@ -915,7 +912,7 @@ vos_dtx_extend_act_table(struct vos_container *cont) umem_off_t dbd_off; int rc; - dbd_off = umem_zalloc(umm, DTX_BLOB_SIZE); + dbd_off = umem_zalloc(umm, DTX_ACT_BLOB_SIZE); if (UMOFF_IS_NULL(dbd_off)) { D_ERROR("No space when create active DTX table.\n"); return -DER_NOSPACE; @@ -923,7 +920,7 @@ vos_dtx_extend_act_table(struct vos_container *cont) dbd = umem_off2ptr(umm, dbd_off); dbd->dbd_magic = DTX_ACT_BLOB_MAGIC; - dbd->dbd_cap = (DTX_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) / + dbd->dbd_cap = (DTX_ACT_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) / sizeof(struct vos_dtx_act_ent_df); dbd->dbd_count = 0; dbd->dbd_index = 0; @@ -1994,12 +1991,11 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[], umem_off_t dbd_off; uint64_t cmt_time = daos_gettime_coarse(); int committed = 0; - int cur = 0; int rc = 0; - int rc1 = 0; + int p = 0; int i = 0; int j; - bool fatal = false; + int k; bool allocated = false; dbd = umem_off2ptr(umm, cont_df->cd_dtx_committed_tail); @@ -2017,67 +2013,64 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[], goto new_blob; again: - for (j = dbd->dbd_count; i < count && j < dbd->dbd_cap && rc1 == 0; - i++, cur++) { - struct vos_dtx_cmt_ent *dce = NULL; - - rc = vos_dtx_commit_one(cont, &dtis[cur], epoch, cmt_time, &dce, - daes != NULL ? &daes[cur] : NULL, - rm_cos != NULL ? &rm_cos[cur] : NULL, &fatal); - if (dces != NULL) - dces[cur] = dce; - - if (fatal) - goto out; - - if (rc == 0 && (daes == NULL || daes[cur] != NULL)) + for (j = dbd->dbd_count; j < dbd->dbd_cap && i < count; i++) { + rc = vos_dtx_commit_one(cont, &dtis[i], epoch, cmt_time, &dces[i], + daes != NULL ? &daes[i] : NULL, + rm_cos != NULL ? &rm_cos[i] : NULL); + if (rc == 0 && (daes == NULL || daes[i] != NULL)) committed++; if (rc == -DER_ALREADY || rc == -DER_NONEXIST) rc = 0; - if (rc1 == 0) - rc1 = rc; + if (rc != 0) + goto out; + + if (dces[i] != NULL) + j++; + } - if (dce != NULL) { - rc = umem_tx_xadd_ptr(umm, &dbd->dbd_committed_data[j], - sizeof(struct vos_dtx_cmt_ent_df), - UMEM_XADD_NO_SNAPSHOT); + if (j > dbd->dbd_count) { + if (!allocated) { + rc = umem_tx_xadd_ptr(umm, &dbd->dbd_committed_data[dbd->dbd_count], + sizeof(struct vos_dtx_cmt_ent_df) * + (j - dbd->dbd_count), UMEM_XADD_NO_SNAPSHOT); if (rc != 0) - D_GOTO(out, fatal = true); + goto out; - memcpy(&dbd->dbd_committed_data[j++], &dce->dce_base, + /* Only need to add range for the first partial blob. */ + rc = umem_tx_add_ptr(umm, &dbd->dbd_count, sizeof(dbd->dbd_count)); + if (rc != 0) + goto out; + } + + for (k = dbd->dbd_count; k < j; k++, p++) { + while (dces[p] == NULL) + p++; + + memcpy(&dbd->dbd_committed_data[k], &dces[p]->dce_base, sizeof(struct vos_dtx_cmt_ent_df)); } - } - if (!allocated) { - /* Only need to add range for the first partial blob. */ - rc = umem_tx_add_ptr(umm, &dbd->dbd_count, - sizeof(dbd->dbd_count)); - if (rc != 0) - D_GOTO(out, fatal = true); + dbd->dbd_count = j; } - dbd->dbd_count = j; - - if (i == count || rc1 != 0) + if (i == count) goto out; new_blob: dbd_prev = dbd; /* Need new @dbd */ - dbd_off = umem_zalloc(umm, DTX_BLOB_SIZE); + dbd_off = umem_zalloc(umm, DTX_CMT_BLOB_SIZE); if (UMOFF_IS_NULL(dbd_off)) { D_ERROR("No space to store committed DTX %d "DF_DTI"\n", - count, DP_DTI(&dtis[cur])); - fatal = true; + count, DP_DTI(&dtis[i])); D_GOTO(out, rc = -DER_NOSPACE); } dbd = umem_off2ptr(umm, dbd_off); dbd->dbd_magic = DTX_CMT_BLOB_MAGIC; - dbd->dbd_cap = (DTX_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) / + dbd->dbd_cap = (DTX_CMT_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) / sizeof(struct vos_dtx_cmt_ent_df); dbd->dbd_prev = umem_ptr2off(umm, dbd_prev); @@ -2090,21 +2083,21 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[], sizeof(cont_df->cd_dtx_committed_head) + sizeof(cont_df->cd_dtx_committed_tail)); if (rc != 0) - D_GOTO(out, fatal = true); + goto out; cont_df->cd_dtx_committed_head = dbd_off; } else { rc = umem_tx_add_ptr(umm, &dbd_prev->dbd_next, sizeof(dbd_prev->dbd_next)); if (rc != 0) - D_GOTO(out, fatal = true); + goto out; dbd_prev->dbd_next = dbd_off; rc = umem_tx_add_ptr(umm, &cont_df->cd_dtx_committed_tail, sizeof(cont_df->cd_dtx_committed_tail)); if (rc != 0) - D_GOTO(out, fatal = true); + goto out; } D_DEBUG(DB_IO, "Allocated DTX committed blob %p ("UMOFF_PF") for cont "DF_UUID"\n", @@ -2115,7 +2108,7 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[], goto again; out: - return fatal ? rc : (committed > 0 ? committed : rc1); + return rc < 0 ? rc : committed; } void From dcf84195f80f2d0367fd03264b76d7a9674fdd19 Mon Sep 17 00:00:00 2001 From: Li Wei Date: Wed, 23 Oct 2024 23:20:15 +0900 Subject: [PATCH 15/34] DAOS-16653 doc: Fix CRT_EVENT_DELAY description (#15351) (#15371) Fix the description of the CRT_EVENT_DELAY environment variable in docs/admin/env_variables.md. Signed-off-by: Li Wei --- docs/admin/env_variables.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/admin/env_variables.md b/docs/admin/env_variables.md index 31e0f3e8767..3de0a079203 100644 --- a/docs/admin/env_variables.md +++ b/docs/admin/env_variables.md @@ -44,7 +44,7 @@ Environment variables in this section only apply to the server side. |DAOS\_MD\_CAP |Size of a metadata pmem pool/file in MBs. INTEGER. Default to 128 MB.| |DAOS\_START\_POOL\_SVC|Determines whether to start existing pool services when starting a daos\_server. BOOL. Default to true.| |CRT\_DISABLE\_MEM\_PIN|Disable memory pinning workaround on a server side. BOOL. Default to 0.| -|CRT\_EVENT\_DELAY|Delay in seconds before handling each CaRT event. INTEGER. Default to 10 s. A longer delay enables batching of successive CaRT events, leading to fewer pool map changes when multiple engines become unavailable at around the same time.| +|CRT\_EVENT\_DELAY|Delay in seconds before handling a set of CaRT events. INTEGER. Default to 10 s. A longer delay enables batching of successive CaRT events, leading to fewer pool map changes when multiple engines become unavailable at around the same time.| |DAOS\_SCHED\_PRIO\_DISABLED|Disable server ULT prioritizing. BOOL. Default to 0.| |DAOS\_SCHED\_RELAX\_MODE|The mode of CPU relaxing on idle. "disabled":disable relaxing; "net":wait on network request for INTVL; "sleep":sleep for INTVL. STRING. Default to "net"| |DAOS\_SCHED\_RELAX\_INTVL|CPU relax interval in milliseconds. INTEGER. Default to 1 ms.| From 4c49f3626726e561dfab41c21a9345573828abc3 Mon Sep 17 00:00:00 2001 From: Ken Cain Date: Wed, 23 Oct 2024 16:54:02 -0400 Subject: [PATCH 16/34] DAOS-16650 control: dmg system exclude, update group version (#15288) (#15349) With this change, when a daos administrator runs dmg system exclude for a given set of engines, the system map version / cart primary group version will be updated. In turn, daos_engines will more immediately detect the "loss" of the administratively excluded engines, update pool maps and perform rebuild. This change supports a use case of a proactive exclusion of ranks that are expected to be impacted by planned maintenance that would cut off connectivity to certain engines. Signed-off-by: Kenneth Cain --- src/control/server/mgmt_system.go | 2 ++ src/control/server/mgmt_system_test.go | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/src/control/server/mgmt_system.go b/src/control/server/mgmt_system.go index e85091b5529..a35753d95ba 100644 --- a/src/control/server/mgmt_system.go +++ b/src/control/server/mgmt_system.go @@ -1002,6 +1002,8 @@ func (svc *mgmtSvc) SystemExclude(ctx context.Context, req *mgmtpb.SystemExclude }) } + svc.reqGroupUpdate(ctx, false) + return resp, nil } diff --git a/src/control/server/mgmt_system_test.go b/src/control/server/mgmt_system_test.go index f482b1943a1..0cffe7f63d8 100644 --- a/src/control/server/mgmt_system_test.go +++ b/src/control/server/mgmt_system_test.go @@ -1765,6 +1765,7 @@ func TestServer_MgmtSvc_SystemExclude(t *testing.T) { mockMember(t, 3, 2, "joined"), }, }, + "unexclude hosts": { req: &mgmtpb.SystemExcludeReq{Hosts: test.MockHostAddr(1).String(), Clear: true}, members: system.Members{ @@ -1795,12 +1796,32 @@ func TestServer_MgmtSvc_SystemExclude(t *testing.T) { if tc.req != nil && tc.req.Sys == "" { tc.req.Sys = build.DefaultSystemName } + + startMapVer, err := svc.sysdb.CurMapVersion() + if err != nil { + t.Fatalf("startMapVer CurMapVersion() failed\n") + return + } gotResp, gotAPIErr := svc.SystemExclude(ctx, tc.req) test.CmpErr(t, tc.expAPIErr, gotAPIErr) if tc.expAPIErr != nil { return } + // Check for any system map version increase by the (asynchronous) update. + // Test will time out if it never happens, thus choice of an infinite loop here. + for { + curMapVer, err := svc.sysdb.CurMapVersion() + if err != nil { + t.Fatalf("CurMapVersion() failed\n") + return + } + + if curMapVer > startMapVer { + break + } + } + checkRankResults(t, tc.expResults, gotResp.Results) checkMembers(t, tc.expMembers, svc.membership) }) From 2819d4532e00b2395396ba3294e0bdc2c6a0449c Mon Sep 17 00:00:00 2001 From: Nasf-Fan Date: Thu, 24 Oct 2024 15:04:10 +0800 Subject: [PATCH 17/34] DAOS-16488 chk: take sd_lock before accessing VOS sys_db - b26 (#15269) The VOS sys_db may have multuiple users, such as SMD and CHK. It is caller's duty to take lock against the VOS sys_db before accessing it to handle concurrent operations from multiple XS. Signed-off-by: Fan Yong --- src/chk/chk_vos.c | 67 ++++++++++++++++++++++++++--------------------- src/common/lru.c | 4 ++- src/vos/sys_db.c | 27 +++++++++++++++---- 3 files changed, 62 insertions(+), 36 deletions(-) diff --git a/src/chk/chk_vos.c b/src/chk/chk_vos.c index fdefc2995f7..af2904affd1 100644 --- a/src/chk/chk_vos.c +++ b/src/chk/chk_vos.c @@ -19,11 +19,20 @@ chk_db_fetch(char *key, int key_size, void *val, int val_size) { d_iov_t key_iov; d_iov_t val_iov; + int rc; d_iov_set(&key_iov, key, key_size); d_iov_set(&val_iov, val, val_size); - return chk_db->sd_fetch(chk_db, CHK_DB_TABLE, &key_iov, &val_iov); + if (chk_db->sd_lock) + chk_db->sd_lock(chk_db); + + rc = chk_db->sd_fetch(chk_db, CHK_DB_TABLE, &key_iov, &val_iov); + + if (chk_db->sd_unlock) + chk_db->sd_unlock(chk_db); + + return rc; } static int @@ -33,21 +42,17 @@ chk_db_update(char *key, int key_size, void *val, int val_size) d_iov_t val_iov; int rc; - if (chk_db->sd_tx_begin) { - rc = chk_db->sd_tx_begin(chk_db); - if (rc != 0) - goto out; - } - d_iov_set(&key_iov, key, key_size); d_iov_set(&val_iov, val, val_size); + if (chk_db->sd_lock) + chk_db->sd_lock(chk_db); + rc = chk_db->sd_upsert(chk_db, CHK_DB_TABLE, &key_iov, &val_iov); - if (chk_db->sd_tx_end) - rc = chk_db->sd_tx_end(chk_db, rc); + if (chk_db->sd_unlock) + chk_db->sd_unlock(chk_db); -out: return rc; } @@ -57,27 +62,33 @@ chk_db_delete(char *key, int key_size) d_iov_t key_iov; int rc; - if (chk_db->sd_tx_begin) { - rc = chk_db->sd_tx_begin(chk_db); - if (rc != 0) - goto out; - } - d_iov_set(&key_iov, key, key_size); + if (chk_db->sd_lock) + chk_db->sd_lock(chk_db); + rc = chk_db->sd_delete(chk_db, CHK_DB_TABLE, &key_iov); - if (chk_db->sd_tx_end) - rc = chk_db->sd_tx_end(chk_db, rc); + if (chk_db->sd_unlock) + chk_db->sd_unlock(chk_db); -out: return rc; } static int chk_db_traverse(sys_db_trav_cb_t cb, void *args) { - return chk_db->sd_traverse(chk_db, CHK_DB_TABLE, cb, args); + int rc; + + if (chk_db->sd_lock) + chk_db->sd_lock(chk_db); + + rc = chk_db->sd_traverse(chk_db, CHK_DB_TABLE, cb, args); + + if (chk_db->sd_unlock) + chk_db->sd_unlock(chk_db); + + return rc; } int @@ -243,11 +254,8 @@ chk_prop_update(struct chk_property *cpp, d_rank_list_t *rank_list) d_iov_t val_iov; int rc; - if (chk_db->sd_tx_begin) { - rc = chk_db->sd_tx_begin(chk_db); - if (rc != 0) - goto out; - } + if (chk_db->sd_lock) + chk_db->sd_lock(chk_db); if (cpp->cp_rank_nr != 0 && rank_list != NULL) { D_ASSERTF(cpp->cp_rank_nr == rank_list->rl_nr, "Invalid rank nr %u/%u\n", @@ -259,7 +267,7 @@ chk_prop_update(struct chk_property *cpp, d_rank_list_t *rank_list) rc = chk_db->sd_upsert(chk_db, CHK_DB_TABLE, &key_iov, &val_iov); if (rc != 0) - goto end; + goto out; } d_iov_set(&key_iov, CHK_PROPERTY, strlen(CHK_PROPERTY)); @@ -267,11 +275,10 @@ chk_prop_update(struct chk_property *cpp, d_rank_list_t *rank_list) rc = chk_db->sd_upsert(chk_db, CHK_DB_TABLE, &key_iov, &val_iov); -end: - if (chk_db->sd_tx_end) - rc = chk_db->sd_tx_end(chk_db, rc); - out: + if (chk_db->sd_unlock) + chk_db->sd_unlock(chk_db); + if (rc != 0) D_ERROR("Failed to update check property on rank %u: "DF_RC"\n", dss_self_rank(), DP_RC(rc)); diff --git a/src/common/lru.c b/src/common/lru.c index de86d367e0e..87dbdaddaa9 100644 --- a/src/common/lru.c +++ b/src/common/lru.c @@ -255,7 +255,9 @@ void daos_lru_ref_release(struct daos_lru_cache *lcache, struct daos_llink *llink) { D_ASSERT(lcache != NULL && llink != NULL && llink->ll_ref > 1); - D_ASSERT(d_list_empty(&llink->ll_qlink)); + D_ASSERTF(d_list_empty(&llink->ll_qlink), + "May hit corrupted item in LRU cache %p: llink %p, refs %d, prev %p, next %p\n", + lcache, llink, llink->ll_ref, llink->ll_qlink.prev, llink->ll_qlink.next); lru_hop_rec_decref(&lcache->dlc_htable, &llink->ll_link); diff --git a/src/vos/sys_db.c b/src/vos/sys_db.c index e7b4a2baa20..d1f6d4bce98 100644 --- a/src/vos/sys_db.c +++ b/src/vos/sys_db.c @@ -349,8 +349,9 @@ vos_db_init(const char *db_path) int vos_db_init_ex(const char *db_path, const char *db_name, bool force_create, bool destroy_db_on_fini) { - int create; - int rc; + ABT_mutex_attr attr = ABT_MUTEX_ATTR_NULL; + int create; + int rc; D_ASSERT(db_path != NULL); @@ -373,12 +374,26 @@ vos_db_init_ex(const char *db_path, const char *db_name, bool force_create, bool goto failed; } - rc = ABT_mutex_create(&vos_db.db_lock); + rc = ABT_mutex_attr_create(&attr); if (rc != ABT_SUCCESS) { - rc = -DER_NOMEM; - goto failed; + D_ERROR("Failed to create mutex attr: %d\n", rc); + D_GOTO(failed, rc = dss_abterr2der(rc)); + } + + rc = ABT_mutex_attr_set_recursive(attr, ABT_TRUE); + if (rc != ABT_SUCCESS) { + D_ERROR("Failed to set mutex attr: %d\n", rc); + D_GOTO(failed, rc = dss_abterr2der(rc)); } + rc = ABT_mutex_create_with_attr(attr, &vos_db.db_lock); + if (rc != ABT_SUCCESS) { + D_ERROR("Failed to create mutex with attr: %d\n", rc); + D_GOTO(failed, rc = dss_abterr2der(rc)); + } + + ABT_mutex_attr_free(&attr); + vos_db.db_poh = DAOS_HDL_INVAL; vos_db.db_coh = DAOS_HDL_INVAL; @@ -416,6 +431,8 @@ vos_db_init_ex(const char *db_path, const char *db_name, bool force_create, bool } return 0; failed: + if (attr != ABT_MUTEX_ATTR_NULL) + ABT_mutex_attr_free(&attr); vos_db_fini(); return rc; } From ec3aa1c64d02d959289ec9417df96a8e945b5a0b Mon Sep 17 00:00:00 2001 From: Nasf-Fan Date: Thu, 24 Oct 2024 20:34:28 +0800 Subject: [PATCH 18/34] DAOS-16469 dtx: optimize DTX CoS cache - b26 (#15085) If there are a lot of committable DTX entries in DTX CoS cache, then it may be inefficient to locate the DTX entry in CoS cache with given oid + dkey_hash, that may happen under the case of that DTX batched commit is blocked (such as because of network trouble) as to trigger DTX refresh (for DTX cleanup) on other related engines. If that happened, it will increase the system load on such engine and slow down DTX commit further more. The patch reduces unnecessary search operation inside CoS cache. Other changes: 1. Metrics (io/dtx/async_cmt_lat/tgt_id) for DTX asynchronously commit latency (with unit ms). 2. Fix a bug in sched_ult2xs() with multiple numa sockets for DSS_XS_OFFLOAD case. 3. Delay commit (or abort) collective DTX on the leader target to handle resent race. 4. Avoid blocking dtx_req_wait() if chore failed to send out some DTX RPC. 5. Some cleanup for error handling. Signed-off-by: Fan Yong --- src/container/srv_target.c | 1 + src/dtx/dtx_common.c | 43 ++-- src/dtx/dtx_cos.c | 308 ++++++++++++++++-------- src/dtx/dtx_internal.h | 4 + src/dtx/dtx_resync.c | 4 +- src/dtx/dtx_rpc.c | 104 +++++--- src/dtx/dtx_srv.c | 13 +- src/engine/ult.c | 2 +- src/include/daos_srv/container.h | 2 + src/include/daos_srv/dtx_srv.h | 9 +- src/tests/ftest/util/telemetry_utils.py | 6 +- 11 files changed, 322 insertions(+), 174 deletions(-) diff --git a/src/container/srv_target.c b/src/container/srv_target.c index a6d3d3c5286..35ac3d4285c 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -669,6 +669,7 @@ cont_child_alloc_ref(void *co_uuid, unsigned int ksize, void *po_uuid, cont->sc_dtx_committable_coll_count = 0; D_INIT_LIST_HEAD(&cont->sc_dtx_cos_list); D_INIT_LIST_HEAD(&cont->sc_dtx_coll_list); + D_INIT_LIST_HEAD(&cont->sc_dtx_batched_list); *link = &cont->sc_list; return 0; diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c index ff4f2dfe4ef..ecb156729ed 100644 --- a/src/dtx/dtx_common.c +++ b/src/dtx/dtx_common.c @@ -392,11 +392,11 @@ dtx_cleanup(void *arg) if (rc == 0) { D_ASSERT(dce != NULL); - rc = dtx_coll_commit(cont, dce, NULL); + rc = dtx_coll_commit(cont, dce, NULL, false); dtx_coll_entry_put(dce); } } else { - rc = dtx_commit(cont, &dte, NULL, 1); + rc = dtx_commit(cont, &dte, NULL, 1, false); } } @@ -620,17 +620,16 @@ dtx_batched_commit_one(void *arg) tls->dt_batched_ult_cnt++; /* dbca->dbca_reg_gen != cont->sc_dtx_batched_gen means someone reopen the container. */ - while (!dss_ult_exiting(dbca->dbca_commit_req) && + while (!dss_ult_exiting(dbca->dbca_commit_req) && dtx_cont_opened(cont) && dbca->dbca_reg_gen == cont->sc_dtx_batched_gen) { struct dtx_entry **dtes = NULL; - struct dtx_cos_key *dcks = NULL; struct dtx_coll_entry *dce = NULL; struct dtx_stat stat = { 0 }; int cnt; int rc; cnt = dtx_fetch_committable(cont, DTX_THRESHOLD_COUNT, NULL, - DAOS_EPOCH_MAX, false, &dtes, &dcks, &dce); + DAOS_EPOCH_MAX, false, &dtes, NULL, &dce); if (cnt == 0) break; @@ -644,11 +643,11 @@ dtx_batched_commit_one(void *arg) /* Currently, commit collective DTX one by one. */ D_ASSERT(cnt == 1); - rc = dtx_coll_commit(cont, dce, dcks); + rc = dtx_coll_commit(cont, dce, NULL, true); } else { - rc = dtx_commit(cont, dtes, dcks, cnt); + rc = dtx_commit(cont, dtes, NULL, cnt, true); } - dtx_free_committable(dtes, dcks, dce, cnt); + dtx_free_committable(dtes, NULL, dce, cnt); if (rc != 0) { D_WARN("Fail to batched commit %d entries for "DF_UUID": "DF_RC"\n", cnt, DP_UUID(cont->sc_uuid), DP_RC(rc)); @@ -1271,7 +1270,6 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul uint32_t flags; int status = -1; int rc = 0; - int i; bool aborted = false; bool unpin = false; @@ -1424,10 +1422,10 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul vos_dtx_mark_committable(dth); if (dlh->dlh_coll) { - rc = dtx_coll_commit(cont, dlh->dlh_coll_entry, NULL); + rc = dtx_coll_commit(cont, dlh->dlh_coll_entry, NULL, false); } else { dte = &dth->dth_dte; - rc = dtx_commit(cont, &dte, NULL, 1); + rc = dtx_commit(cont, &dte, NULL, 1, false); } if (rc != 0) @@ -1487,15 +1485,9 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul /* If piggyback DTX has been done everywhere, then need to handle CoS cache. * It is harmless to keep some partially committed DTX entries in CoS cache. */ - if (result == 0 && dth->dth_cos_done) { - for (i = 0; i < dth->dth_dti_cos_count; i++) - dtx_cos_del(cont, &dth->dth_dti_cos[i], - &dth->dth_leader_oid, dth->dth_dkey_hash); - } else { - for (i = 0; i < dth->dth_dti_cos_count; i++) - dtx_cos_put_piggyback(cont, &dth->dth_dti_cos[i], - &dth->dth_leader_oid, dth->dth_dkey_hash); - } + dtx_cos_put_piggyback(cont, &dth->dth_leader_oid, dth->dth_dkey_hash, dth->dth_dti_cos, + dth->dth_dti_cos_count, + (result == 0 && dth->dth_cos_done) ? true : false); D_DEBUG(DB_IO, "Stop the DTX "DF_DTI" ver %u, dkey %lu, %s, cos %d/%d: result "DF_RC"\n", DP_DTI(&dth->dth_xid), dth->dth_ver, (unsigned long)dth->dth_dkey_hash, @@ -1654,7 +1646,8 @@ dtx_flush_on_close(struct dss_module_info *dmi, struct dtx_batched_cont_args *db struct dtx_coll_entry *dce = NULL; cnt = dtx_fetch_committable(cont, DTX_THRESHOLD_COUNT, - NULL, DAOS_EPOCH_MAX, true, &dtes, &dcks, &dce); + NULL, DAOS_EPOCH_MAX, true, &dtes, + dbca->dbca_commit_req != NULL ? &dcks : NULL, &dce); if (cnt <= 0) D_GOTO(out, rc = cnt); @@ -1675,9 +1668,9 @@ dtx_flush_on_close(struct dss_module_info *dmi, struct dtx_batched_cont_args *db if (dce != NULL) { D_ASSERT(cnt == 1); - rc = dtx_coll_commit(cont, dce, dcks); + rc = dtx_coll_commit(cont, dce, dcks, true); } else { - rc = dtx_commit(cont, dtes, dcks, cnt); + rc = dtx_commit(cont, dtes, dcks, cnt, true); } dtx_free_committable(dtes, dcks, dce, cnt); } @@ -2365,9 +2358,9 @@ dtx_obj_sync(struct ds_cont_child *cont, daos_unit_oid_t *oid, if (dce != NULL) { D_ASSERT(cnt == 1); - rc = dtx_coll_commit(cont, dce, dcks); + rc = dtx_coll_commit(cont, dce, dcks, true); } else { - rc = dtx_commit(cont, dtes, dcks, cnt); + rc = dtx_commit(cont, dtes, dcks, cnt, true); } dtx_free_committable(dtes, dcks, dce, cnt); if (rc < 0) { diff --git a/src/dtx/dtx_cos.c b/src/dtx/dtx_cos.c index 6e1d042b82b..4c165f94d0c 100644 --- a/src/dtx/dtx_cos.c +++ b/src/dtx/dtx_cos.c @@ -54,6 +54,8 @@ struct dtx_cos_rec_child { d_list_t dcrc_gl_committable; /* Link into related dcr_{reg,prio}_list. */ d_list_t dcrc_lo_link; + /* Link into container::sc_dtx_batched_list. */ + d_list_t dcrc_batched_link; union { struct dtx_entry *dcrc_dte; struct dtx_coll_entry *dcrc_dce; @@ -61,8 +63,12 @@ struct dtx_cos_rec_child { /* The DTX epoch. */ daos_epoch_t dcrc_epoch; struct dtx_cos_rec *dcrc_ptr; + uint64_t dcrc_ready_time; uint32_t dcrc_piggyback_refs; - uint32_t dcrc_coll:1; /* For collective DTX. */ + uint32_t dcrc_expcmt:1, + dcrc_prio:1, + dcrc_reg:1, + dcrc_coll:1; /* For collective DTX. */ }; struct dtx_cos_rec_bundle { @@ -129,6 +135,8 @@ dtx_cos_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, return -DER_NOMEM; } + D_INIT_LIST_HEAD(&dcrc->dcrc_batched_link); + dcrc->dcrc_ready_time = daos_getmtime_coarse(); dcrc->dcrc_epoch = rbund->epoch; dcrc->dcrc_ptr = dcr; if (rbund->flags & DCF_COLL) { @@ -144,12 +152,15 @@ dtx_cos_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, d_tm_inc_gauge(tls->dt_committable, 1); if (rbund->flags & DCF_EXP_CMT) { + dcrc->dcrc_expcmt = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_expcmt_list); dcr->dcr_expcmt_count = 1; } else if (rbund->flags & DCF_SHARED) { + dcrc->dcrc_prio = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list); dcr->dcr_prio_count = 1; } else { + dcrc->dcrc_reg = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_reg_list); dcr->dcr_reg_count = 1; } @@ -177,6 +188,7 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) dcrc_lo_link) { d_list_del(&dcrc->dcrc_lo_link); d_list_del(&dcrc->dcrc_gl_committable); + d_list_del(&dcrc->dcrc_batched_link); if (dcrc->dcrc_coll) { dtx_coll_entry_put(dcrc->dcrc_dce); coll++; @@ -190,6 +202,7 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) dcrc_lo_link) { d_list_del(&dcrc->dcrc_lo_link); d_list_del(&dcrc->dcrc_gl_committable); + d_list_del(&dcrc->dcrc_batched_link); if (dcrc->dcrc_coll) { dtx_coll_entry_put(dcrc->dcrc_dce); coll++; @@ -203,6 +216,7 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) dcrc_lo_link) { d_list_del(&dcrc->dcrc_lo_link); d_list_del(&dcrc->dcrc_gl_committable); + d_list_del(&dcrc->dcrc_batched_link); if (dcrc->dcrc_coll) { dtx_coll_entry_put(dcrc->dcrc_dce); coll++; @@ -256,6 +270,8 @@ dtx_cos_rec_update(struct btr_instance *tins, struct btr_record *rec, if (dcrc == NULL) return -DER_NOMEM; + D_INIT_LIST_HEAD(&dcrc->dcrc_batched_link); + dcrc->dcrc_ready_time = daos_getmtime_coarse(); dcrc->dcrc_epoch = rbund->epoch; dcrc->dcrc_ptr = dcr; if (rbund->flags & DCF_COLL) { @@ -271,12 +287,15 @@ dtx_cos_rec_update(struct btr_instance *tins, struct btr_record *rec, d_tm_inc_gauge(tls->dt_committable, 1); if (rbund->flags & DCF_EXP_CMT) { + dcrc->dcrc_expcmt = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_expcmt_list); dcr->dcr_expcmt_count++; } else if (rbund->flags & DCF_SHARED) { + dcrc->dcrc_prio = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list); dcr->dcr_prio_count++; } else { + dcrc->dcrc_reg = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_reg_list); dcr->dcr_reg_count++; } @@ -294,6 +313,53 @@ btr_ops_t dtx_btr_cos_ops = { .to_rec_update = dtx_cos_rec_update, }; +static int +dtx_cos_del_one(struct ds_cont_child *cont, struct dtx_cos_rec_child *dcrc) +{ + struct dtx_cos_key key; + d_iov_t kiov; + struct dtx_cos_rec *dcr = dcrc->dcrc_ptr; + uint64_t time = daos_getmtime_coarse() - dcrc->dcrc_ready_time; + int rc = 0; + + d_list_del(&dcrc->dcrc_gl_committable); + d_list_del(&dcrc->dcrc_lo_link); + if (!d_list_empty(&dcrc->dcrc_batched_link)) + d_list_del_init(&dcrc->dcrc_batched_link); + + if (dcrc->dcrc_expcmt) + dcr->dcr_expcmt_count--; + else if (dcrc->dcrc_prio) + dcr->dcr_prio_count--; + else + dcr->dcr_reg_count--; + + if (dcrc->dcrc_coll) + cont->sc_dtx_committable_coll_count--; + cont->sc_dtx_committable_count--; + + d_tm_set_gauge(dtx_tls_get()->dt_async_cmt_lat, time); + + if (dcr->dcr_reg_count == 0 && dcr->dcr_prio_count == 0 && dcr->dcr_expcmt_count == 0) { + key.oid = dcr->dcr_oid; + key.dkey_hash = dcr->dcr_dkey_hash; + d_iov_set(&kiov, &key, sizeof(key)); + rc = dbtree_delete(cont->sc_dtx_cos_hdl, BTR_PROBE_EQ, &kiov, NULL); + } + + DL_CDEBUG(rc != 0, DLOG_ERR, DB_IO, rc, + "Remove DTX "DF_DTI" from CoS cache", DP_DTI(&dcrc->dcrc_dte->dte_xid)); + + if (dcrc->dcrc_coll) + dtx_coll_entry_put(dcrc->dcrc_dce); + else + dtx_entry_put(dcrc->dcrc_dte); + + D_FREE(dcrc); + + return rc; +} + int dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, daos_unit_oid_t *oid, daos_epoch_t epoch, bool force, @@ -306,18 +372,45 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, uint32_t count; uint32_t i = 0; + /* Last batched commit failed, let's re-commit them. */ + if (dcks == NULL && !d_list_empty(&cont->sc_dtx_batched_list)) { + dcrc = d_list_entry(cont->sc_dtx_batched_list.next, struct dtx_cos_rec_child, + dcrc_batched_link); + if (unlikely(dcrc->dcrc_coll)) { + *p_dce = dtx_coll_entry_get(dcrc->dcrc_dce); + return 1; + } + + D_ALLOC_ARRAY(dte_buf, max_cnt); + if (dte_buf == NULL) + return -DER_NOMEM; + + d_list_for_each_entry(dcrc, &cont->sc_dtx_batched_list, dcrc_batched_link) { + D_ASSERT(i < max_cnt); + dte_buf[i++] = dtx_entry_get(dcrc->dcrc_dte); + } + + *dtes = dte_buf; + return i; + } + /* Process collective DXT with higher priority. */ if (!d_list_empty(&cont->sc_dtx_coll_list) && oid == NULL) { d_list_for_each_entry(dcrc, &cont->sc_dtx_coll_list, dcrc_gl_committable) { if (epoch >= dcrc->dcrc_epoch && (dcrc->dcrc_piggyback_refs == 0 || force)) { - D_ALLOC_PTR(dck_buf); - if (dck_buf == NULL) - return -DER_NOMEM; - - dck_buf->oid = dcrc->dcrc_ptr->dcr_oid; - dck_buf->dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; - *dcks = dck_buf; + if (dcks != NULL) { + D_ALLOC_PTR(dck_buf); + if (dck_buf == NULL) + return -DER_NOMEM; + + dck_buf->oid = dcrc->dcrc_ptr->dcr_oid; + dck_buf->dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; + *dcks = dck_buf; + } else { + d_list_add_tail(&dcrc->dcrc_batched_link, + &cont->sc_dtx_batched_list); + } *p_dce = dtx_coll_entry_get(dcrc->dcrc_dce); return 1; @@ -326,19 +419,19 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, } count = min(cont->sc_dtx_committable_count, max_cnt); - if (count == 0) { - *dtes = NULL; + if (count == 0) return 0; - } D_ALLOC_ARRAY(dte_buf, count); if (dte_buf == NULL) return -DER_NOMEM; - D_ALLOC_ARRAY(dck_buf, count); - if (dck_buf == NULL) { - D_FREE(dte_buf); - return -DER_NOMEM; + if (dcks != NULL) { + D_ALLOC_ARRAY(dck_buf, count); + if (dck_buf == NULL) { + D_FREE(dte_buf); + return -DER_NOMEM; + } } d_list_for_each_entry(dcrc, &cont->sc_dtx_cos_list, dcrc_gl_committable) { @@ -353,17 +446,26 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, continue; D_FREE(dte_buf); - dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid; - dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; - *dcks = dck_buf; + if (dcks != NULL) { + dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid; + dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; + *dcks = dck_buf; + } else { + d_list_add_tail(&dcrc->dcrc_batched_link, + &cont->sc_dtx_batched_list); + } *p_dce = dtx_coll_entry_get(dcrc->dcrc_dce); return 1; } dte_buf[i] = dtx_entry_get(dcrc->dcrc_dte); - dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid; - dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; + if (dcks != NULL) { + dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid; + dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; + } else { + d_list_add_tail(&dcrc->dcrc_batched_link, &cont->sc_dtx_batched_list); + } if (++i >= count) break; @@ -372,10 +474,10 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, if (i == 0) { D_FREE(dte_buf); D_FREE(dck_buf); - *dtes = NULL; } else { *dtes = dte_buf; - *dcks = dck_buf; + if (dcks != NULL) + *dcks = dck_buf; } return i; @@ -436,32 +538,44 @@ dtx_cos_get_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid, } void -dtx_cos_put_piggyback(struct ds_cont_child *cont, struct dtx_id *xid, - daos_unit_oid_t *oid, uint64_t dkey_hash) +dtx_cos_put_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid, uint64_t dkey_hash, + struct dtx_id xid[], uint32_t count, bool rm) { struct dtx_cos_key key; d_iov_t kiov; d_iov_t riov; struct dtx_cos_rec *dcr; struct dtx_cos_rec_child *dcrc; + int del = 0; int rc; + int i; key.oid = *oid; key.dkey_hash = dkey_hash; d_iov_set(&kiov, &key, sizeof(key)); d_iov_set(&riov, NULL, 0); - /* It is normal that the DTX entry (to be put) in CoS has already been removed by race. */ - rc = dbtree_lookup(cont->sc_dtx_cos_hdl, &kiov, &riov); if (rc == 0) { dcr = (struct dtx_cos_rec *)riov.iov_buf; - d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) { - if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { - dcrc->dcrc_piggyback_refs--; - return; + for (i = 0; i < count; i++) { + d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) { + if (memcmp(&dcrc->dcrc_dte->dte_xid, &xid[i], + sizeof(struct dtx_id)) == 0) { + if (rm) { + rc = dtx_cos_del_one(cont, dcrc); + if (rc == 0) + del++; + } else { + dcrc->dcrc_piggyback_refs--; + } + break; + } } } + + if (del > 0) + d_tm_dec_gauge(dtx_tls_get()->dt_committable, del); } } @@ -493,12 +607,12 @@ dtx_cos_add(struct ds_cont_child *cont, void *entry, daos_unit_oid_t *oid, DAOS_INTENT_UPDATE, &kiov, &riov, NULL); if (flags & DCF_COLL) - D_CDEBUG(rc != 0, DLOG_ERR, DB_IO, "Insert coll DTX "DF_DTI" to CoS cache, " + D_CDEBUG(rc != 0, DLOG_ERR, DB_TRACE, "Insert coll DTX "DF_DTI" to CoS cache, " DF_UOID", key %lu, flags %x: "DF_RC"\n", DP_DTI(&((struct dtx_coll_entry *)entry)->dce_xid), DP_UOID(*oid), (unsigned long)dkey_hash, flags, DP_RC(rc)); else - D_CDEBUG(rc != 0, DLOG_ERR, DB_IO, "Insert reg DTX "DF_DTI" to CoS cache, " + D_CDEBUG(rc != 0, DLOG_ERR, DB_TRACE, "Insert reg DTX "DF_DTI" to CoS cache, " DF_UOID", key %lu, flags %x: "DF_RC"\n", DP_DTI(&((struct dtx_entry *)entry)->dte_xid), DP_UOID(*oid), (unsigned long)dkey_hash, flags, DP_RC(rc)); @@ -530,82 +644,36 @@ dtx_cos_del(struct ds_cont_child *cont, struct dtx_id *xid, dcr = (struct dtx_cos_rec *)riov.iov_buf; d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) { - if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) != 0) - continue; - - d_list_del(&dcrc->dcrc_gl_committable); - d_list_del(&dcrc->dcrc_lo_link); - if (dcrc->dcrc_coll) { - dtx_coll_entry_put(dcrc->dcrc_dce); - cont->sc_dtx_committable_coll_count--; - } else { - dtx_entry_put(dcrc->dcrc_dte); + if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { + rc = dtx_cos_del_one(cont, dcrc); + D_GOTO(out, found = 1); } - D_FREE(dcrc); - - cont->sc_dtx_committable_count--; - dcr->dcr_prio_count--; - - D_GOTO(out, found = 1); } d_list_for_each_entry(dcrc, &dcr->dcr_reg_list, dcrc_lo_link) { - if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) != 0) - continue; - - d_list_del(&dcrc->dcrc_gl_committable); - d_list_del(&dcrc->dcrc_lo_link); - if (dcrc->dcrc_coll) { - dtx_coll_entry_put(dcrc->dcrc_dce); - cont->sc_dtx_committable_coll_count--; - } else { - dtx_entry_put(dcrc->dcrc_dte); + if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { + rc = dtx_cos_del_one(cont, dcrc); + D_GOTO(out, found = 2); } - D_FREE(dcrc); - - cont->sc_dtx_committable_count--; - dcr->dcr_reg_count--; - - D_GOTO(out, found = 2); } d_list_for_each_entry(dcrc, &dcr->dcr_expcmt_list, dcrc_lo_link) { - if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) != 0) - continue; - - d_list_del(&dcrc->dcrc_gl_committable); - d_list_del(&dcrc->dcrc_lo_link); - if (dcrc->dcrc_coll) { - dtx_coll_entry_put(dcrc->dcrc_dce); - cont->sc_dtx_committable_coll_count--; - } else { - dtx_entry_put(dcrc->dcrc_dte); + if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { + rc = dtx_cos_del_one(cont, dcrc); + D_GOTO(out, found = 3); } - D_FREE(dcrc); - - cont->sc_dtx_committable_count--; - dcr->dcr_expcmt_count--; - - D_GOTO(out, found = 3); } out: - if (found > 0) { + if (found > 0) d_tm_dec_gauge(dtx_tls_get()->dt_committable, 1); - if (dcr->dcr_reg_count == 0 && dcr->dcr_prio_count == 0 && - dcr->dcr_expcmt_count == 0) - rc = dbtree_delete(cont->sc_dtx_cos_hdl, BTR_PROBE_EQ, &kiov, NULL); - } - if (rc == 0 && found == 0) rc = -DER_NONEXIST; - D_CDEBUG(rc != 0 && rc != -DER_NONEXIST, DLOG_ERR, DB_IO, - "Remove DTX "DF_DTI" from CoS " - "cache, "DF_UOID", key %lu, %s shared entry: rc = "DF_RC"\n", - DP_DTI(xid), DP_UOID(*oid), (unsigned long)dkey_hash, - found == 1 ? "has" : "has not", DP_RC(rc)); + DL_CDEBUG(rc != 0 && rc != -DER_NONEXIST, DLOG_ERR, DB_TRACE, rc, + "Remove DTX from CoS cache "DF_UOID", key %lu", + DP_UOID(*oid), (unsigned long)dkey_hash); return rc == -DER_NONEXIST ? 0 : rc; } @@ -624,6 +692,12 @@ dtx_cos_oldest(struct ds_cont_child *cont) return dcrc->dcrc_epoch; } +/* + * It is inefficient to search some item on a very long list. So let's skip + * the search if the length exceeds DTX_COS_SEARCH_MAX. That is not fatal. + */ +#define DTX_COS_SEARCH_MAX 32 + void dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid, daos_unit_oid_t *oid, uint64_t dkey_hash) @@ -647,8 +721,13 @@ dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid, dcr = (struct dtx_cos_rec *)riov.iov_buf; + if (dcr->dcr_reg_count > DTX_COS_SEARCH_MAX) + goto expcmt; + d_list_for_each_entry(dcrc, &dcr->dcr_reg_list, dcrc_lo_link) { if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { + dcrc->dcrc_reg = 0; + dcrc->dcrc_prio = 1; d_list_del(&dcrc->dcrc_lo_link); d_list_add(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list); dcr->dcr_reg_count--; @@ -658,14 +737,9 @@ dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid, } } - d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) { - if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { - d_list_del(&dcrc->dcrc_lo_link); - d_list_add(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list); - - D_GOTO(out, found = true); - } - } +expcmt: + if (dcr->dcr_expcmt_count > DTX_COS_SEARCH_MAX) + goto out; d_list_for_each_entry(dcrc, &dcr->dcr_expcmt_list, dcrc_lo_link) { if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) @@ -683,3 +757,39 @@ dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid, /* It is normal that the DTX entry (for priority) in CoS has been committed by race. */ } + +void +dtx_cos_batched_del(struct ds_cont_child *cont, struct dtx_id xid[], bool rm[], uint32_t count) +{ + struct dtx_cos_rec_child *dcrc; + int del = 0; + int rc; + int i = 0; + bool found; + + while ((dcrc = d_list_pop_entry(&cont->sc_dtx_batched_list, struct dtx_cos_rec_child, + dcrc_batched_link)) != NULL) { + for (found = false; i < count && !found; i++) { + /* + * Some entries in the sc_dtx_batched_list may have been committed by + * others by race. Since the entries order in the sc_dtx_batched_list + * will not be changed, let's compare with xid[i] via one cycle scan. + */ + if (memcmp(&dcrc->dcrc_dte->dte_xid, &xid[i], sizeof(struct dtx_id)) == 0) { + found = true; + + if (rm[i]) { + rc = dtx_cos_del_one(cont, dcrc); + if (rc == 0) + del++; + } + } + } + + /* There must be one in xid array that matches current dcrc. */ + D_ASSERT(found); + } + + if (del > 0) + d_tm_dec_gauge(dtx_tls_get()->dt_committable, del); +} diff --git a/src/dtx/dtx_internal.h b/src/dtx/dtx_internal.h index a42bcc1d7d6..06d0333dd77 100644 --- a/src/dtx/dtx_internal.h +++ b/src/dtx/dtx_internal.h @@ -213,6 +213,7 @@ struct dtx_pool_metrics { struct dtx_tls { struct d_tm_node_t *dt_committable; struct d_tm_node_t *dt_dtx_leader_total; + struct d_tm_node_t *dt_async_cmt_lat; uint64_t dt_agg_gen; uint32_t dt_batched_ult_cnt; }; @@ -263,6 +264,9 @@ uint64_t dtx_cos_oldest(struct ds_cont_child *cont); void dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid, daos_unit_oid_t *oid, uint64_t dkey_hash); +void dtx_cos_batched_del(struct ds_cont_child *cont, struct dtx_id xid[], bool rm[], + uint32_t count); + /* dtx_rpc.c */ int dtx_check(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch); diff --git a/src/dtx/dtx_resync.c b/src/dtx/dtx_resync.c index b98a6469954..756dfe0ca44 100644 --- a/src/dtx/dtx_resync.c +++ b/src/dtx/dtx_resync.c @@ -115,7 +115,7 @@ dtx_resync_commit(struct ds_cont_child *cont, } if (j > 0) { - rc = dtx_commit(cont, dtes, dcks, j); + rc = dtx_commit(cont, dtes, dcks, j, true); if (rc < 0) D_ERROR("Failed to commit the DTXs: rc = "DF_RC"\n", DP_RC(rc)); @@ -359,7 +359,7 @@ dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, daos_un dck.oid = oid; dck.dkey_hash = dkey_hash; - rc = dtx_coll_commit(cont, dce, &dck); + rc = dtx_coll_commit(cont, dce, &dck, true); } dtx_coll_entry_put(dce); diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c index e654047a621..5423f6c0cc8 100644 --- a/src/dtx/dtx_rpc.c +++ b/src/dtx/dtx_rpc.c @@ -225,9 +225,10 @@ dtx_req_cb(const struct crt_cb_info *cb_info) } out: - D_DEBUG(DB_TRACE, "DTX req for opc %x (req %p future %p) got reply from %d/%d: " - "epoch :"DF_X64", result %d\n", dra->dra_opc, req, dra->dra_future, - drr->drr_rank, drr->drr_tag, din != NULL ? din->di_epoch : 0, rc); + DL_CDEBUG(rc < 0 && rc != -DER_NONEXIST, DLOG_ERR, DB_TRACE, rc, + "DTX req for opc %x (req %p future %p) got reply from %d/%d: " + "epoch :"DF_X64, dra->dra_opc, req, dra->dra_future, + drr->drr_rank, drr->drr_tag, din != NULL ? din->di_epoch : 0); drr->drr_comp = 1; drr->drr_result = rc; @@ -397,19 +398,14 @@ dtx_req_list_send(struct dtx_common_args *dca, bool is_reentrance) if (unlikely(dra->dra_opc == DTX_COMMIT && dca->dca_i == 0 && DAOS_FAIL_CHECK(DAOS_DTX_FAIL_COMMIT))) - rc = dtx_req_send(dca->dca_drr, 1); + dtx_req_send(dca->dca_drr, 1); else - rc = dtx_req_send(dca->dca_drr, dca->dca_epoch); - if (rc != 0) { - /* If the first sub-RPC failed, then break, otherwise - * other remote replicas may have already received the - * RPC and executed it, so have to go ahead. - */ - if (dca->dca_i == 0) { - ABT_future_free(&dra->dra_future); - return rc; - } - } + dtx_req_send(dca->dca_drr, dca->dca_epoch); + /* + * Do not care dtx_req_send result, itself or its cb func will set dra->dra_future. + * Each RPC is independent from the others, let's go head to handle the other RPCs + * and set dra->dra_future that will avoid blocking the RPC sponsor - dtx_req_wait. + */ /* dca->dca_drr maybe not points to a real entry if all RPCs have been sent. */ dca->dca_drr = d_list_entry(dca->dca_drr->drr_link.next, @@ -616,12 +612,8 @@ dtx_rpc_helper(struct dss_chore *chore, bool is_reentrance) rc = dtx_req_list_send(dca, is_reentrance); if (rc == DSS_CHORE_YIELD) return DSS_CHORE_YIELD; - if (rc == DSS_CHORE_DONE) - rc = 0; - if (rc != 0) - dca->dca_dra.dra_result = rc; - D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE, "%p: DTX RPC chore for %u done: %d\n", chore, - dca->dca_dra.dra_opc, rc); + D_ASSERTF(rc == DSS_CHORE_DONE, "Unexpected helper return value for RPC %u: %d\n", + dca->dca_dra.dra_opc, rc); if (dca->dca_chore_eventual != ABT_EVENTUAL_NULL) { rc = ABT_eventual_set(dca->dca_chore_eventual, NULL, 0); D_ASSERTF(rc == ABT_SUCCESS, "ABT_eventual_set: %d\n", rc); @@ -737,8 +729,10 @@ dtx_rpc(struct ds_cont_child *cont,d_list_t *dti_list, struct dtx_entry **dtes, } rc = dss_chore_delegate(&dca->dca_chore, dtx_rpc_helper); - if (rc != 0) + if (rc != 0) { + ABT_eventual_free(&dca->dca_chore_eventual); goto out; + } rc = ABT_eventual_wait(dca->dca_chore_eventual, NULL); D_ASSERTF(rc == ABT_SUCCESS, "ABT_eventual_wait: %d\n", rc); @@ -809,7 +803,7 @@ dtx_rpc(struct ds_cont_child *cont,d_list_t *dti_list, struct dtx_entry **dtes, */ int dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, - struct dtx_cos_key *dcks, int count) + struct dtx_cos_key *dcks, int count, bool has_cos) { struct dtx_common_args dca; struct dtx_req_args *dra = &dca.dca_dra; @@ -842,7 +836,7 @@ dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, rc1 = vos_dtx_set_flags(cont->sc_hdl, dca.dca_dtis, count, DTE_PARTIAL_COMMITTED); } else { - if (dcks != NULL) { + if (has_cos) { if (count > 1) { D_ALLOC_ARRAY(rm_cos, count); if (rm_cos == NULL) @@ -862,12 +856,16 @@ dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, } if (rc1 == 0 && rm_cos != NULL) { - for (i = 0; i < count; i++) { - if (rm_cos[i]) { - D_ASSERT(!daos_oid_is_null(dcks[i].oid.id_pub)); - dtx_cos_del(cont, &dca.dca_dtis[i], &dcks[i].oid, - dcks[i].dkey_hash); + if (dcks != NULL) { + for (i = 0; i < count; i++) { + if (rm_cos[i]) { + D_ASSERT(!daos_oid_is_null(dcks[i].oid.id_pub)); + dtx_cos_del(cont, &dca.dca_dtis[i], &dcks[i].oid, + dcks[i].dkey_hash); + } } + } else { + dtx_cos_batched_del(cont, dca.dca_dtis, rm_cos, count); } } @@ -1237,7 +1235,7 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che case DTX_ST_COMMITTABLE: dck.oid = dsp->dsp_oid; dck.dkey_hash = dsp->dsp_dkey_hash; - rc = dtx_commit(cont, &pdte, &dck, 1); + rc = dtx_commit(cont, &pdte, &dck, 1, true); if (rc < 0 && rc != -DER_NONEXIST && for_io) d_list_add_tail(&dsp->dsp_link, cmt_list); else @@ -1258,7 +1256,7 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che case DSHR_NEED_COMMIT: { dck.oid = dsp->dsp_oid; dck.dkey_hash = dsp->dsp_dkey_hash; - rc = dtx_commit(cont, &pdte, &dck, 1); + rc = dtx_commit(cont, &pdte, &dck, 1, true); if (rc < 0 && rc != -DER_NONEXIST && for_io) d_list_add_tail(&dsp->dsp_link, cmt_list); else @@ -1571,17 +1569,20 @@ dtx_coll_rpc_post(struct dtx_coll_rpc_args *dcra, int ret) { int rc; - rc = ABT_future_wait(dcra->dcra_future); - D_CDEBUG(rc != ABT_SUCCESS, DLOG_ERR, DB_TRACE, - "Collective DTX wait req for opc %u, future %p done, rc %d, result %d\n", - dcra->dcra_opc, dcra->dcra_future, rc, dcra->dcra_result); - ABT_future_free(&dcra->dcra_future); + if (dcra->dcra_future != ABT_FUTURE_NULL) { + rc = ABT_future_wait(dcra->dcra_future); + D_CDEBUG(rc != ABT_SUCCESS, DLOG_ERR, DB_TRACE, + "Collective DTX wait req for opc %u, future %p done, rc %d, result %d\n", + dcra->dcra_opc, dcra->dcra_future, rc, dcra->dcra_result); + ABT_future_free(&dcra->dcra_future); + } return ret != 0 ? ret : dcra->dcra_result; } int -dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck) +dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck, + bool has_cos) { struct dtx_coll_rpc_args dcra = { 0 }; int *results = NULL; @@ -1591,11 +1592,22 @@ dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct d int rc1 = 0; int rc2 = 0; int i; + bool cos = true; if (dce->dce_ranks != NULL) rc = dtx_coll_rpc_prep(cont, dce, DTX_COLL_COMMIT, 0, &dcra); + /* + * NOTE: Before committing the DTX on remote participants, we cannot remove the active + * DTX locally; otherwise, the local committed DTX entry may be removed via DTX + * aggregation before remote participants commit done. Under such case, if some + * remote DTX participant triggere DTX_REFRESH for such DTX during the interval, + * then it will get -DER_TX_UNCERTAIN, that may cause related application to be + * failed. So here, we let remote participants to commit firstly, if failed, we + * will ask the leader to retry the commit until all participants got committed. + */ if (dce->dce_bitmap != NULL) { + clrbit(dce->dce_bitmap, dss_get_module_info()->dmi_tgt_id); len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, 0, DTX_COLL_COMMIT, dce->dce_bitmap_sz, dce->dce_bitmap, &results); @@ -1634,8 +1646,12 @@ dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct d * to remove the collective DTX entry from the CoS even if the commit failed remotely. * Otherwise, the batched commit ULT may be blocked by such "bad" entry. */ - if (rc2 == 0 && dck != NULL) - dtx_cos_del(cont, &dce->dce_xid, &dck->oid, dck->dkey_hash); + if (rc2 == 0 && has_cos) { + if (dck != NULL) + dtx_cos_del(cont, &dce->dce_xid, &dck->oid, dck->dkey_hash); + else + dtx_cos_batched_del(cont, &dce->dce_xid, &cos, 1); + } D_CDEBUG(rc != 0 || rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE, "Collectively commit DTX "DF_DTI": %d/%d/%d\n", @@ -1658,7 +1674,17 @@ dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc if (dce->dce_ranks != NULL) rc = dtx_coll_rpc_prep(cont, dce, DTX_COLL_ABORT, epoch, &dcra); + /* + * NOTE: The DTX abort maybe triggered by dtx_leader_end() for timeout on some DTX + * participant(s). Under such case, the client side RPC sponsor may also hit + * the RPC timeout and resends related RPC to the leader. Here, to avoid DTX + * abort and resend RPC forwarding being executed in parallel, we will abort + * local DTX after remote done, before that the logic of handling resent RPC + * on server will find the local pinned DTX entry then notify related client + * to resend sometime later. + */ if (dce->dce_bitmap != NULL) { + clrbit(dce->dce_bitmap, dss_get_module_info()->dmi_tgt_id); len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, epoch, DTX_COLL_ABORT, dce->dce_bitmap_sz, dce->dce_bitmap, &results); diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c index 93cc6744a2d..9e78100c451 100644 --- a/src/dtx/dtx_srv.c +++ b/src/dtx/dtx_srv.c @@ -34,8 +34,8 @@ dtx_tls_init(int tags, int xs_id, int tgt_id) tls->dt_agg_gen = 1; rc = d_tm_add_metric(&tls->dt_committable, D_TM_STATS_GAUGE, - "total number of committable DTX entries", - "entries", "io/dtx/committable/tgt_%u", tgt_id); + "total number of committable DTX entries", "entry", + "io/dtx/committable/tgt_%u", tgt_id); if (rc != DER_SUCCESS) D_WARN("Failed to create DTX committable metric: " DF_RC"\n", DP_RC(rc)); @@ -48,6 +48,13 @@ dtx_tls_init(int tags, int xs_id, int tgt_id) D_WARN("Failed to create DTX leader metric: " DF_RC"\n", DP_RC(rc)); + rc = d_tm_add_metric(&tls->dt_async_cmt_lat, D_TM_STATS_GAUGE, + "DTX async commit latency", "ms", + "io/dtx/async_cmt_lat/tgt_%u", tgt_id); + if (rc != DER_SUCCESS) + D_WARN("Failed to create DTX async commit latency metric: " DF_RC"\n", + DP_RC(rc)); + return tls; } @@ -117,7 +124,7 @@ dtx_metrics_alloc(const char *path, int tgt_id) rc = d_tm_add_metric(&metrics->dpm_total[DTX_PROTO_SRV_RPC_COUNT], D_TM_COUNTER, "total number of processed sync DTX_COMMIT", "ops", - "%s/ops/sync_dtx_commit/tgt_%u", path, tgt_id); + "%s/ops/dtx_sync_commit/tgt_%u", path, tgt_id); if (rc != DER_SUCCESS) D_WARN("Failed to create sync DTX_COMMIT RPC cnt metric: "DF_RC"\n", DP_RC(rc)); diff --git a/src/engine/ult.c b/src/engine/ult.c index 94a2a0f9390..fbeb3f538fa 100644 --- a/src/engine/ult.c +++ b/src/engine/ult.c @@ -458,7 +458,7 @@ sched_ult2xs(int xs_type, int tgt_id) break; case DSS_XS_OFFLOAD: if (dss_numa_nr > 1) - xs_id = sched_ult2xs_multisocket(xs_type, tgt_id); + return sched_ult2xs_multisocket(xs_type, tgt_id); if (!dss_helper_pool) { if (dss_tgt_offload_xs_nr > 0) xs_id = DSS_MAIN_XS_ID(tgt_id) + dss_tgt_offload_xs_nr / dss_tgt_nr; diff --git a/src/include/daos_srv/container.h b/src/include/daos_srv/container.h index 793d585c07a..f99f4df14e3 100644 --- a/src/include/daos_srv/container.h +++ b/src/include/daos_srv/container.h @@ -134,6 +134,8 @@ struct ds_cont_child { d_list_t sc_dtx_cos_list; /* The global list for committable collective DTXs. */ d_list_t sc_dtx_coll_list; + /* The list for current DTX batched commit. */ + d_list_t sc_dtx_batched_list; /* the pool map version of updating DAOS_PROP_CO_STATUS prop */ uint32_t sc_status_pm_ver; }; diff --git a/src/include/daos_srv/dtx_srv.h b/src/include/daos_srv/dtx_srv.h index 7b3f7e9ee57..7c60d2deaa0 100644 --- a/src/include/daos_srv/dtx_srv.h +++ b/src/include/daos_srv/dtx_srv.h @@ -318,8 +318,8 @@ int dtx_cos_get_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid, uint64_t dkey_hash, int max, struct dtx_id **dtis); void -dtx_cos_put_piggyback(struct ds_cont_child *cont, struct dtx_id *xid, - daos_unit_oid_t *oid, uint64_t dkey_hash); +dtx_cos_put_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid, uint64_t dkey_hash, + struct dtx_id xid[], uint32_t count, bool rm); int dtx_leader_exec_ops(struct dtx_leader_handle *dlh, dtx_sub_func_t func, dtx_agg_cb_t agg_cb, int allow_failure, void *func_arg); @@ -338,14 +338,15 @@ int dtx_obj_sync(struct ds_cont_child *cont, daos_unit_oid_t *oid, daos_epoch_t epoch); int dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, - struct dtx_cos_key *dcks, int count); + struct dtx_cos_key *dcks, int count, bool has_cos); int dtx_abort(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch); int dtx_refresh(struct dtx_handle *dth, struct ds_cont_child *cont); int -dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck); +dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck, + bool has_cos); int dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoch_t epoch); diff --git a/src/tests/ftest/util/telemetry_utils.py b/src/tests/ftest/util/telemetry_utils.py index db424b6de68..17f1753b100 100644 --- a/src/tests/ftest/util/telemetry_utils.py +++ b/src/tests/ftest/util/telemetry_utils.py @@ -90,6 +90,7 @@ class TelemetryUtils(): "engine_pool_ops_dtx_coll_commit", "engine_pool_ops_dtx_commit", "engine_pool_ops_dtx_refresh", + "engine_pool_ops_dtx_sync_commit", "engine_pool_ops_ec_agg", "engine_pool_ops_ec_rep", "engine_pool_ops_fetch", @@ -200,6 +201,8 @@ class TelemetryUtils(): "engine_dmabuff_queued_reqs", "engine_dmabuff_grab_errs", *_gen_stats_metrics("engine_dmabuff_grab_retries")] + ENGINE_IO_DTX_ASYNC_CMT_LAT_METRICS = \ + _gen_stats_metrics("engine_io_dtx_async_cmt_lat") ENGINE_IO_DTX_COMMITTABLE_METRICS = \ _gen_stats_metrics("engine_io_dtx_committable") ENGINE_IO_DTX_COMMITTED_METRICS = \ @@ -304,7 +307,8 @@ class TelemetryUtils(): _gen_stats_metrics("engine_io_ops_tgt_update_active") ENGINE_IO_OPS_UPDATE_ACTIVE_METRICS = \ _gen_stats_metrics("engine_io_ops_update_active") - ENGINE_IO_METRICS = ENGINE_IO_DTX_COMMITTABLE_METRICS +\ + ENGINE_IO_METRICS = ENGINE_IO_DTX_ASYNC_CMT_LAT_METRICS +\ + ENGINE_IO_DTX_COMMITTABLE_METRICS +\ ENGINE_IO_DTX_COMMITTED_METRICS +\ ENGINE_IO_LATENCY_FETCH_METRICS +\ ENGINE_IO_LATENCY_BULK_FETCH_METRICS +\ From 2b5620b230df6a7c3700f10128998cfb5360b2fc Mon Sep 17 00:00:00 2001 From: Jerome Soumagne Date: Thu, 24 Oct 2024 10:47:16 -0500 Subject: [PATCH 19/34] DAOS-14262 cart: add ability to select traffic class for SWIM context (#14893) (#14917) Add SWIM_TRAFFIC_CLASS env var (default is unspec) Signed-off-by: Jerome Soumagne --- src/cart/README.env | 12 +++++++----- src/cart/crt_hg.c | 3 +++ src/cart/crt_init.c | 36 ++++++++++++++++++++++++++--------- src/cart/crt_internal_types.h | 15 +++++++++++++++ 4 files changed, 52 insertions(+), 14 deletions(-) diff --git a/src/cart/README.env b/src/cart/README.env index 00f270d7a41..b90939c8c72 100644 --- a/src/cart/README.env +++ b/src/cart/README.env @@ -1,13 +1,10 @@ This file lists the environment variables used in CaRT. . D_PROVIDER (Deprecated: CRT_PHY_ADDR_STR) - It determines which mercury NA plugin to be used: + It determines which mercury NA plugin and transport to be used: - set it as "ofi+verbs;ofi_rxm" to use OFI verbs;ofi_rxm provider - - set it as "ofi+gni" to use OFI gni provider - set it as "sm" to use SM plugin which only works within single node - - set it as "ofi+tcp;ofi_rxm" to use OFI tcp;ofi_rxm provider. - - set it as "ofi+sockets" to use OFI sockets provider - NOTE: This provider is deprecated in favor of "ofi+tcp;ofi_rxm" + - set it as "ofi+tcp" to use OFI tcp provider. - by default (not set or set as any other value) it will use ofi tcp provider. @@ -205,3 +202,8 @@ This file lists the environment variables used in CaRT. start copying data in an effort to release multi-recv buffers. Copy will occur when at most D_MRECV_BUF_COPY buffers remain. + SWIM_TRAFFIC_CLASS + (server only) Select a traffic class for the SWIM protocol to use and prevent potential + traffic congestion. Available options are: "unspec" (default), "best_effort", + "low_latency", "bulk_data". + diff --git a/src/cart/crt_hg.c b/src/cart/crt_hg.c index 789d75ceb31..244373c9228 100644 --- a/src/cart/crt_hg.c +++ b/src/cart/crt_hg.c @@ -863,6 +863,9 @@ crt_hg_class_init(crt_provider_t provider, int ctx_idx, bool primary, int iface_ init_info.request_post_incr = crt_gdata.cg_post_incr; init_info.multi_recv_op_max = crt_gdata.cg_mrecv_buf; init_info.multi_recv_copy_threshold = crt_gdata.cg_mrecv_buf_copy; + /* Separate SWIM traffic in an effort to prevent potential congestion. */ + if (crt_is_service() && ctx_idx == crt_gdata.cg_swim_crt_idx) + init_info.traffic_class = (enum na_traffic_class)crt_gdata.cg_swim_tc; hg_class = HG_Init_opt2(info_string, crt_is_service(), HG_VERSION(2, 4), &init_info); if (hg_class == NULL) { diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index 21fc184d446..48d2090a5b3 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -18,6 +18,10 @@ static volatile int gdata_init_flag; struct crt_plugin_gdata crt_plugin_gdata; static bool g_prov_settings_applied[CRT_PROV_COUNT]; +#define X(a, b) b, +static const char *const crt_tc_name[] = {CRT_TRAFFIC_CLASSES}; +#undef X + static void crt_lib_init(void) __attribute__((__constructor__)); @@ -237,18 +241,30 @@ crt_gdata_dump(void) DUMP_GDATA_FIELD("%d", cg_rpc_quota); } +static enum crt_traffic_class +crt_str_to_tc(const char *str) +{ + enum crt_traffic_class i = 0; + + while (str != NULL && strcmp(crt_tc_name[i], str) != 0 && i < CRT_TC_UNKNOWN) + i++; + + return i == CRT_TC_UNKNOWN ? CRT_TC_UNSPEC : i; +} + /* first step init - for initializing crt_gdata */ static int data_init(int server, crt_init_options_t *opt) { - uint32_t timeout = 0; - uint32_t credits; - uint32_t fi_univ_size = 0; - uint32_t mem_pin_enable = 0; - uint32_t is_secondary; - uint32_t post_init = CRT_HG_POST_INIT, post_incr = CRT_HG_POST_INCR; - unsigned int mrecv_buf = CRT_HG_MRECV_BUF; - unsigned int mrecv_buf_copy = 0; /* buf copy disabled by default */ - int rc = 0; + uint32_t timeout = 0; + uint32_t credits; + uint32_t fi_univ_size = 0; + uint32_t mem_pin_enable = 0; + uint32_t is_secondary; + uint32_t post_init = CRT_HG_POST_INIT, post_incr = CRT_HG_POST_INCR; + unsigned int mrecv_buf = CRT_HG_MRECV_BUF; + unsigned int mrecv_buf_copy = 0; /* buf copy disabled by default */ + char *swim_traffic_class = NULL; + int rc = 0; crt_env_dump(); @@ -261,6 +277,8 @@ static int data_init(int server, crt_init_options_t *opt) crt_gdata.cg_mrecv_buf = mrecv_buf; crt_env_get(D_MRECV_BUF_COPY, &mrecv_buf_copy); crt_gdata.cg_mrecv_buf_copy = mrecv_buf_copy; + crt_env_get(SWIM_TRAFFIC_CLASS, &swim_traffic_class); + crt_gdata.cg_swim_tc = crt_str_to_tc(swim_traffic_class); is_secondary = 0; /* Apply CART-890 workaround for server side only */ diff --git a/src/cart/crt_internal_types.h b/src/cart/crt_internal_types.h index 38133e0cf17..c3eb9cae12c 100644 --- a/src/cart/crt_internal_types.h +++ b/src/cart/crt_internal_types.h @@ -42,6 +42,17 @@ struct crt_na_config { char **noc_domain_str; /* Array of domains */ }; +#define CRT_TRAFFIC_CLASSES \ + X(CRT_TC_UNSPEC, "unspec") /* Leave it upon plugin to choose */ \ + X(CRT_TC_BEST_EFFORT, "best_effort") /* Best effort */ \ + X(CRT_TC_LOW_LATENCY, "low_latency") /* Low latency */ \ + X(CRT_TC_BULK_DATA, "bulk_data") /* Bulk data */ \ + X(CRT_TC_UNKNOWN, "unknown") /* Unknown */ + +#define X(a, b) a, +enum crt_traffic_class { CRT_TRAFFIC_CLASSES }; +#undef X + struct crt_prov_gdata { /** NA plugin type */ int cpg_provider; @@ -105,6 +116,9 @@ struct crt_gdata { /** global swim index for all servers */ int32_t cg_swim_crt_idx; + /** traffic class used by SWIM */ + enum crt_traffic_class cg_swim_tc; + /** credits limitation for #in-flight RPCs per target EP CTX */ uint32_t cg_credit_ep_ctx; @@ -220,6 +234,7 @@ struct crt_event_cb_priv { ENV(SWIM_PING_TIMEOUT) \ ENV(SWIM_PROTOCOL_PERIOD_LEN) \ ENV(SWIM_SUSPECT_TIMEOUT) \ + ENV_STR(SWIM_TRAFFIC_CLASS) \ ENV_STR(UCX_IB_FORK_INIT) /* uint env */ From 70b12e3331fea2c44beeba6792b316f1157eec51 Mon Sep 17 00:00:00 2001 From: Nasf-Fan Date: Fri, 25 Oct 2024 00:35:58 +0800 Subject: [PATCH 20/34] DAOS-16469 container: Lower log level for cont_aggregate_interval (#15283) To reduce the side-effect caused by frequent log with -DER_INPROGRESS. Signed-off-by: Fan Yong --- src/container/srv_target.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/container/srv_target.c b/src/container/srv_target.c index 35ac3d4285c..caf2a4b2ee7 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -475,7 +475,7 @@ cont_aggregate_interval(struct ds_cont_child *cont, cont_aggregate_cb_t cb, if (rc == -DER_SHUTDOWN) { break; /* pool destroyed */ } else if (rc < 0) { - DL_CDEBUG(rc == -DER_BUSY, DB_EPC, DLOG_ERR, rc, + DL_CDEBUG(rc == -DER_BUSY || rc == -DER_INPROGRESS, DB_EPC, DLOG_ERR, rc, DF_CONT ": %s aggregate failed", DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), param->ap_vos_agg ? "VOS" : "EC"); From 2a1892f02c08135e9d7d90d23c589df30068b330 Mon Sep 17 00:00:00 2001 From: Jeff Olivier Date: Thu, 24 Oct 2024 12:33:26 -0600 Subject: [PATCH 21/34] DAOS-16716 ci: Set reference build for PRs (#15379) Release branch PRs should use the release branch build instead of master branch build for Fault Injection reference Signed-off-by: Jeff Olivier --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 13ccc512895..8b2c027b15f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1038,7 +1038,7 @@ pipeline { } post { always { - discoverGitReferenceBuild referenceJob: 'daos-stack/daos/master', + discoverGitReferenceBuild referenceJob: 'daos-stack/daos/release%252F2.6', scm: 'daos-stack/daos', requiredResult: hudson.model.Result.UNSTABLE recordIssues enabledForFailure: true, From 23f0787ab188704aeb0c0d000e2e181777831690 Mon Sep 17 00:00:00 2001 From: Alexander Oganezov Date: Fri, 25 Oct 2024 08:51:07 -0700 Subject: [PATCH 22/34] DAOS-15914: crt_reply_send_input_free() (#14817) - New crt_reply_send_input_free() API added which releases input buffer right after HG_Respond() instead of waiting until the handle is destroyed. - srv_obj.c calls changed to use new crt_reply_send_input_free() - I/O context takes refcount on RPC - only release input buffer for target update Signed-off-by: Alexander A Oganezov Signed-off-by: Liang Zhen Co-authored-by: Liang Zhen --- src/cart/crt_hg.c | 10 +++++++++ src/cart/crt_rpc.c | 20 ++++++++++++++++++ src/cart/crt_rpc.h | 47 +++++++++++++++++++++--------------------- src/dtx/dtx_srv.c | 2 +- src/include/cart/api.h | 15 ++++++++++++++ src/object/srv_obj.c | 22 ++++++++++++++------ 6 files changed, 86 insertions(+), 30 deletions(-) diff --git a/src/cart/crt_hg.c b/src/cart/crt_hg.c index 244373c9228..26b54b52ec3 100644 --- a/src/cart/crt_hg.c +++ b/src/cart/crt_hg.c @@ -1482,6 +1482,16 @@ crt_hg_reply_send(struct crt_rpc_priv *rpc_priv) rc = crt_hgret_2_der(hg_ret); } + /* Release input buffer */ + if (rpc_priv->crp_release_input_early && !rpc_priv->crp_forward) { + hg_ret = HG_Release_input_buf(rpc_priv->crp_hg_hdl); + if (hg_ret != HG_SUCCESS) { + RPC_ERROR(rpc_priv, "HG_Release_input_buf failed, hg_ret: " DF_HG_RC "\n", + DP_HG_RC(hg_ret)); + /* Fall through */ + } + } + return rc; } diff --git a/src/cart/crt_rpc.c b/src/cart/crt_rpc.c index 72d8214aa09..ead03d1bf29 100644 --- a/src/cart/crt_rpc.c +++ b/src/cart/crt_rpc.c @@ -1550,6 +1550,26 @@ crt_req_send(crt_rpc_t *req, crt_cb_t complete_cb, void *arg) return rc; } +int +crt_reply_send_input_free(crt_rpc_t *req) +{ + struct crt_rpc_priv *rpc_priv = NULL; + int rc = 0; + + if (req == NULL) { + D_ERROR("invalid parameter (NULL req).\n"); + D_GOTO(out, rc = -DER_INVAL); + } + + rpc_priv = container_of(req, struct crt_rpc_priv, crp_pub); + rpc_priv->crp_release_input_early = 1; + + return crt_reply_send(req); + +out: + return rc; +} + int crt_reply_send(crt_rpc_t *req) { diff --git a/src/cart/crt_rpc.h b/src/cart/crt_rpc.h index a5e6afee780..3a590933103 100644 --- a/src/cart/crt_rpc.h +++ b/src/cart/crt_rpc.h @@ -166,29 +166,30 @@ struct crt_rpc_priv { * match with crp_req_hdr.cch_flags. */ uint32_t crp_flags; - uint32_t crp_srv:1, /* flag of server received request */ - crp_output_got:1, - crp_input_got:1, - /* flag of collective RPC request */ - crp_coll:1, - /* flag of crp_tgt_uri need to be freed */ - crp_uri_free:1, - /* flag of forwarded rpc for corpc */ - crp_forward:1, - /* flag of in timeout binheap */ - crp_in_binheap:1, - /* set if a call to crt_req_reply pending */ - crp_reply_pending:1, - /* set to 1 if target ep is set */ - crp_have_ep:1, - /* RPC is tracked by the context */ - crp_ctx_tracked:1, - /* 1 if RPC fails HLC epsilon check */ - crp_fail_hlc:1, - /* RPC completed flag */ - crp_completed:1, - /* RPC originated from a primary provider */ - crp_src_is_primary:1; + uint32_t crp_srv : 1, /* flag of server received request */ + crp_output_got : 1, crp_input_got : 1, + /* flag of collective RPC request */ + crp_coll : 1, + /* flag of crp_tgt_uri need to be freed */ + crp_uri_free : 1, + /* flag of forwarded rpc for corpc */ + crp_forward : 1, + /* flag of in timeout binheap */ + crp_in_binheap : 1, + /* set if a call to crt_req_reply pending */ + crp_reply_pending : 1, + /* set to 1 if target ep is set */ + crp_have_ep : 1, + /* RPC is tracked by the context */ + crp_ctx_tracked : 1, + /* 1 if RPC fails HLC epsilon check */ + crp_fail_hlc : 1, + /* RPC completed flag */ + crp_completed : 1, + /* RPC originated from a primary provider */ + crp_src_is_primary : 1, + /* release input buffer early */ + crp_release_input_early : 1; struct crt_opc_info *crp_opc_info; /* corpc info, only valid when (crp_coll == 1) */ diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c index 9e78100c451..0fd4e7c513f 100644 --- a/src/dtx/dtx_srv.c +++ b/src/dtx/dtx_srv.c @@ -337,7 +337,7 @@ dtx_handler(crt_rpc_t *rpc) dout->do_status = rc; /* For DTX_COMMIT, it is the count of real committed DTX entries. */ dout->do_misc = committed; - rc = crt_reply_send(rpc); + rc = crt_reply_send_input_free(rpc); if (rc != 0) D_ERROR("send reply failed for DTX rpc %u: rc = "DF_RC"\n", opc, DP_RC(rc)); diff --git a/src/include/cart/api.h b/src/include/cart/api.h index 21941d7cb98..16fc2091d4b 100644 --- a/src/include/cart/api.h +++ b/src/include/cart/api.h @@ -474,6 +474,21 @@ crt_req_send(crt_rpc_t *req, crt_cb_t complete_cb, void *arg); int crt_reply_send(crt_rpc_t *req); +/** + * Send an RPC reply and free the input buffer immediately. + * Only to be called on the server side. + * + * \param[in] req pointer to RPC request + * + * \return DER_SUCCESS on success, negative value if error + * + * \note the crt_rpc_t is exported to user, caller should fill the + * crt_rpc_t::cr_output before sending the RPC reply. + * See \ref crt_req_create. + */ +int +crt_reply_send_input_free(crt_rpc_t *req); + /** * Return request buffer * diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index a51682b4785..f220149a532 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -162,7 +162,7 @@ obj_rw_complete(crt_rpc_t *rpc, struct obj_io_context *ioc, } static void -obj_rw_reply(crt_rpc_t *rpc, int status, uint64_t epoch, +obj_rw_reply(crt_rpc_t *rpc, int status, uint64_t epoch, bool release_input, struct obj_io_context *ioc) { struct obj_rw_out *orwo = crt_reply_get(rpc); @@ -187,7 +187,11 @@ obj_rw_reply(crt_rpc_t *rpc, int status, uint64_t epoch, ioc->ioc_map_ver, orwo->orw_epoch, status); if (!ioc->ioc_lost_reply) { - rc = crt_reply_send(rpc); + if (release_input) + rc = crt_reply_send_input_free(rpc); + else + rc = crt_reply_send(rpc); + if (rc != 0) D_ERROR("send reply failed: "DF_RC"\n", DP_RC(rc)); } else { @@ -2079,6 +2083,8 @@ obj_ioc_init(uuid_t pool_uuid, uuid_t coh_uuid, uuid_t cont_uuid, crt_rpc_t *rpc D_ASSERT(ioc != NULL); memset(ioc, 0, sizeof(*ioc)); + + crt_req_addref(rpc); ioc->ioc_rpc = rpc; ioc->ioc_opc = opc_get(rpc->cr_opc); rc = ds_cont_find_hdl(pool_uuid, coh_uuid, &coh); @@ -2154,6 +2160,10 @@ obj_ioc_fini(struct obj_io_context *ioc, int err) ds_cont_child_put(ioc->ioc_coc); ioc->ioc_coc = NULL; } + if (ioc->ioc_rpc) { + crt_req_decref(ioc->ioc_rpc); + ioc->ioc_rpc = NULL; + } } /* Setup lite IO context, it is only for compound RPC so far: @@ -2508,7 +2518,7 @@ ds_obj_ec_rep_handler(crt_rpc_t *rpc) rc = vos_obj_array_remove(ioc.ioc_coc->sc_hdl, oer->er_oid, &oer->er_epoch_range, dkey, &iod->iod_name, &recx); out: - obj_rw_reply(rpc, rc, 0, &ioc); + obj_rw_reply(rpc, rc, 0, false, &ioc); obj_ioc_end(&ioc, rc); } @@ -2604,7 +2614,7 @@ ds_obj_ec_agg_handler(crt_rpc_t *rpc) D_ERROR(DF_UOID ": array_remove failed: " DF_RC "\n", DP_UOID(oea->ea_oid), DP_RC(rc1)); out: - obj_rw_reply(rpc, rc, 0, &ioc); + obj_rw_reply(rpc, rc, 0, false, &ioc); obj_ioc_end(&ioc, rc); } @@ -2733,7 +2743,7 @@ ds_obj_tgt_update_handler(crt_rpc_t *rpc) out: if (dth != NULL) rc = dtx_end(dth, ioc.ioc_coc, rc); - obj_rw_reply(rpc, rc, 0, &ioc); + obj_rw_reply(rpc, rc, 0, true, &ioc); D_FREE(mbs); obj_ioc_end(&ioc, rc); } @@ -3073,7 +3083,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc) if (ioc.ioc_map_ver < max_ver) ioc.ioc_map_ver = max_ver; - obj_rw_reply(rpc, rc, epoch.oe_value, &ioc); + obj_rw_reply(rpc, rc, epoch.oe_value, false, &ioc); D_FREE(mbs); D_FREE(dti_cos); obj_ioc_end(&ioc, rc); From 67da3b9b924c16b7b240fcac94e6cdd3044d6415 Mon Sep 17 00:00:00 2001 From: Nasf-Fan Date: Fri, 25 Oct 2024 23:52:33 +0800 Subject: [PATCH 23/34] DAOS-16721 object: fix coll RPC for obj with sparse layout - b26 (#15376) The old implementation did not correctly calculate some collective object RPC size, and may cause trouble when need bulk data transfer for large collective object RPC. It also potentially affects how to dispatch collective RPCs from leader to other engines. The patch also addes more sanity check for coll-punch RPC to detect potential DRAM corruption. Signed-off-by: Fan Yong --- src/dtx/dtx_coll.c | 6 ++- src/dtx/dtx_rpc.c | 5 +- src/object/cli_coll.c | 111 +++++++++++++++++++++++++------------- src/object/cli_shard.c | 7 +-- src/object/obj_internal.h | 8 ++- src/object/obj_utils.c | 2 +- src/object/srv_coll.c | 12 +++-- src/object/srv_obj.c | 8 +-- src/vos/vos_dtx.c | 3 -- 9 files changed, 107 insertions(+), 55 deletions(-) diff --git a/src/dtx/dtx_coll.c b/src/dtx/dtx_coll.c index 863307e9a7f..7e7c8199155 100644 --- a/src/dtx/dtx_coll.c +++ b/src/dtx/dtx_coll.c @@ -80,8 +80,12 @@ dtx_coll_prep_ult(void *arg) DP_UUID(cont->sc_uuid), DP_RC(rc)); } - if (dcpa->dcpa_result != 0) + if (dcpa->dcpa_result != 0) { + if (dcpa->dcpa_result != -DER_INPROGRESS && dcpa->dcpa_result != -DER_NONEXIST) + D_ERROR("Failed to load mbs for "DF_DTI", opc %u: "DF_RC"\n", + DP_DTI(&dci->dci_xid), opc, DP_RC(rc)); goto out; + } dcpa->dcpa_result = dtx_coll_prep(dci->dci_po_uuid, dcpa->dcpa_oid, &dci->dci_xid, mbs, -1, dci->dci_version, cont->sc_pool->spc_map_version, diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c index 5423f6c0cc8..2ccbfec2734 100644 --- a/src/dtx/dtx_rpc.c +++ b/src/dtx/dtx_rpc.c @@ -974,8 +974,11 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che if (dsp->dsp_mbs == NULL) { rc = vos_dtx_load_mbs(cont->sc_hdl, &dsp->dsp_xid, NULL, &dsp->dsp_mbs); if (rc != 0) { - if (rc < 0 && rc != -DER_NONEXIST && for_io) + if (rc < 0 && rc != -DER_NONEXIST && for_io) { + D_ERROR("Failed to load mbs for "DF_DTI": "DF_RC"\n", + DP_DTI(&dsp->dsp_xid), DP_RC(rc)); goto out; + } drop = true; goto next; diff --git a/src/object/cli_coll.c b/src/object/cli_coll.c index e05abadf3cf..3285bec58b3 100644 --- a/src/object/cli_coll.c +++ b/src/object/cli_coll.c @@ -178,6 +178,10 @@ obj_coll_oper_args_init(struct coll_oper_args *coa, struct dc_object *obj, bool if (coa->coa_sparse == 0) coa->coa_dct_cap = obj_ranks; } + + /* Save obj->cob_min_rank for verification during subsequent obj_coll_prep_one(). */ + coa->coa_min_rank = obj->cob_min_rank; + D_RWLOCK_UNLOCK(&obj->cob_lock); if (coa->coa_sparse) { @@ -208,7 +212,6 @@ obj_coll_oper_args_init(struct coll_oper_args *coa, struct dc_object *obj, bool coa->coa_dct_nr = -1; } - coa->coa_max_dct_sz = 0; coa->coa_max_shard_nr = 0; coa->coa_max_bitmap_sz = 0; coa->coa_target_nr = 0; @@ -236,17 +239,55 @@ obj_coll_oper_args_fini(struct coll_oper_args *coa) coa->coa_dct_nr = 0; } +static void +obj_coll_collapse_one(struct coll_oper_args *coa, struct daos_coll_target *dct, + uint32_t *size, bool copy) +{ + struct daos_coll_shard *dcs; + uint32_t dct_size; + int i; + + /* The size may be over estimated, no matter. */ + dct_size = sizeof(*dct) + dct->dct_bitmap_sz + + sizeof(dct->dct_shards[0]) * (dct->dct_max_shard + 1); + + for (i = 0; i <= dct->dct_max_shard; i++) { + dcs = &dct->dct_shards[i]; + if (dcs->dcs_nr > 1) + dct_size += sizeof(dcs->dcs_buf[0]) * dcs->dcs_nr; + } + + if (coa->coa_for_modify) + dct_size += sizeof(dct->dct_tgt_ids[0]) * dct->dct_tgt_nr; + + if (coa->coa_max_dct_sz < dct_size) + coa->coa_max_dct_sz = dct_size; + + if (copy) + memcpy(&coa->coa_dcts[coa->coa_dct_nr], dct, sizeof(*dct)); + + coa->coa_dct_nr++; + *size += dct_size; +} + +struct obj_coll_tree_args { + struct coll_oper_args *coa; + uint32_t *size; +}; + static int obj_coll_tree_cb(daos_handle_t ih, d_iov_t *key, d_iov_t *val, void *arg) { - struct coll_oper_args *coa = arg; - struct daos_coll_target *dct = val->iov_buf; + struct obj_coll_tree_args *octa = arg; + struct coll_oper_args *coa = octa->coa; + struct daos_coll_target *dct = val->iov_buf; D_ASSERTF(coa->coa_dct_nr < coa->coa_dct_cap, "Too short pre-allcoated dct_array: %u vs %u\n", coa->coa_dct_nr, coa->coa_dct_cap); + D_ASSERT(dct->dct_bitmap != NULL); - memcpy(&coa->coa_dcts[coa->coa_dct_nr++], dct, sizeof(*dct)); + obj_coll_collapse_one(coa, dct, octa->size, true); /* The following members have been migrated into coa->coa_dcts. */ dct->dct_bitmap = NULL; @@ -259,6 +300,7 @@ obj_coll_tree_cb(daos_handle_t ih, d_iov_t *key, d_iov_t *val, void *arg) static int obj_coll_collapse_tree(struct coll_oper_args *coa, uint32_t *size) { + struct obj_coll_tree_args octa; struct coll_sparse_targets *tree = coa->coa_tree; int rc = 0; @@ -270,7 +312,14 @@ obj_coll_collapse_tree(struct coll_oper_args *coa, uint32_t *size) D_GOTO(out, rc = -DER_NOMEM); coa->coa_sparse = 0; - rc = dbtree_iterate(tree->cst_tree_hdl, DAOS_INTENT_DEFAULT, false, obj_coll_tree_cb, coa); + coa->coa_raw_sparse = 1; + coa->coa_dct_nr = 0; + coa->coa_max_dct_sz = 0; + + octa.coa = coa; + octa.size = size; + rc = dbtree_iterate(tree->cst_tree_hdl, DAOS_INTENT_DEFAULT, false, + obj_coll_tree_cb, &octa); if (rc == 0) D_ASSERTF(coa->coa_dct_nr == coa->coa_dct_cap, "Something is wrong when prepare coll target array: %u vs %u\n", @@ -287,36 +336,13 @@ static int obj_coll_collapse_array(struct coll_oper_args *coa, uint32_t *size) { struct daos_coll_target *dct; - struct daos_coll_shard *dcs; - uint32_t dct_size; int i; - int j; - for (i = 0, *size = 0, coa->coa_dct_nr = 0; i < coa->coa_dct_cap; i++) { + for (i = 0, *size = 0, coa->coa_dct_nr = 0, coa->coa_max_dct_sz = 0; + i < coa->coa_dct_cap; i++) { dct = &coa->coa_dcts[i]; - if (dct->dct_bitmap != NULL) { - /* The size may be over estimated, no matter. */ - dct_size = sizeof(*dct) + dct->dct_bitmap_sz + - sizeof(dct->dct_shards[0]) * (dct->dct_max_shard + 1); - - for (j = 0; j <= dct->dct_max_shard; j++) { - dcs = &dct->dct_shards[j]; - if (dcs->dcs_nr > 1) - dct_size += sizeof(dcs->dcs_buf[0]) * dcs->dcs_nr; - } - - if (coa->coa_for_modify) - dct_size += sizeof(dct->dct_tgt_ids[0]) * dct->dct_tgt_nr; - - if (coa->coa_max_dct_sz < dct_size) - coa->coa_max_dct_sz = dct_size; - - if (coa->coa_dct_nr < i) - memcpy(&coa->coa_dcts[coa->coa_dct_nr], dct, sizeof(*dct)); - - coa->coa_dct_nr++; - *size += dct_size; - } + if (dct->dct_bitmap != NULL) + obj_coll_collapse_one(coa, dct, size, coa->coa_dct_nr < i); } /* Reset the other dct slots to avoid double free during cleanup. */ @@ -373,8 +399,9 @@ obj_coll_prep_one(struct coll_oper_args *coa, struct dc_object *obj, D_RWLOCK_RDLOCK(&obj->cob_lock); - D_ASSERTF(shard->do_target_rank <= obj->cob_max_rank, - "Unexpected shard with rank %u > %u\n", shard->do_target_rank, obj->cob_max_rank); + D_ASSERTF(coa->coa_min_rank == obj->cob_min_rank, + "Object "DF_OID" layout has been changed unexpectedly %u => %u, idx %u, ver %u\n", + DP_OID(obj->cob_md.omd_id), coa->coa_min_rank, obj->cob_min_rank, idx, map_ver); D_ASSERTF(shard->do_target_rank >= obj->cob_min_rank, "Unexpected shard with rank %u < %u\n", shard->do_target_rank, obj->cob_min_rank); @@ -669,7 +696,6 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo uint32_t tgt_size = 0; uint32_t mbs_max_size; uint32_t inline_size; - uint32_t flags = ORF_LEADER; uint32_t leader = -1; uint32_t len; int rc; @@ -746,6 +772,12 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo memcpy(dct, &tmp_tgt, sizeof(tmp_tgt)); } + /* 'shard' is on the leader target that is must be the coa->coa_dcts[0]. */ + D_ASSERTF(shard->do_target_rank == coa->coa_dcts[0].dct_rank, + "Object "DF_OID" target array corrupted: rank %u vs %ur, nr %u\n", + DP_OID(obj->cob_md.omd_id), shard->do_target_rank, + coa->coa_dcts[0].dct_rank, coa->coa_dct_nr); + rc = dc_obj_coll_punch_mbs(coa, obj, shard->do_target_id, &mbs); if (rc < 0) goto out; @@ -767,12 +799,14 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo if (rc != 0) goto out; + auxi->flags = ORF_LEADER; if (auxi->io_retry) { - flags |= ORF_RESEND; + auxi->flags |= ORF_RESEND; /* Reset @enqueue_id if resend to new leader. */ if (spa->pa_auxi.target != shard->do_target_id) spa->pa_auxi.enqueue_id = 0; } else { + auxi->flags &= ~ORF_RESEND; spa->pa_auxi.obj_auxi = auxi; daos_dti_gen(&spa->pa_dti, false); } @@ -781,14 +815,15 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo spa->pa_auxi.shard = shard->do_shard_idx; if (obj_is_ec(obj)) - flags |= ORF_EC; + auxi->flags |= ORF_EC; mbs_max_size = sizeof(*mbs) + mbs->dm_data_size + sizeof(coa->coa_targets[0]) * coa->coa_max_shard_nr + coa->coa_max_bitmap_sz; return dc_obj_shard_coll_punch(shard, spa, mbs, mbs_max_size, cpca.cpca_bulks, tgt_size, coa->coa_dcts, coa->coa_dct_nr, coa->coa_max_dct_sz, epoch, - args->flags, flags, map_ver, &auxi->map_ver_reply, task); + args->flags, auxi->flags, map_ver, + &auxi->map_ver_reply, task); out: if (rc > 0) diff --git a/src/object/cli_shard.c b/src/object/cli_shard.c index 36c5c5f1e0c..0c9dfc1418e 100644 --- a/src/object/cli_shard.c +++ b/src/object/cli_shard.c @@ -1453,9 +1453,10 @@ obj_shard_coll_punch_cb(tse_task_t *task, void *data) DL_CDEBUG(task->dt_result < 0, DLOG_ERR, DB_IO, task->dt_result, "DAOS_OBJ_RPC_COLL_PUNCH RPC %p for "DF_UOID" with DTX " - DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x", rpc, DP_UOID(ocpi->ocpi_oid), - DP_DTI(&ocpi->ocpi_xid), task, ocpi->ocpi_map_ver, *cb_args->cpca_ver, - (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags); + DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x, %s layout", + rpc, DP_UOID(ocpi->ocpi_oid), DP_DTI(&ocpi->ocpi_xid), task, ocpi->ocpi_map_ver, + *cb_args->cpca_ver, (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags, + cb_args->cpca_shard_args->pa_coa.coa_raw_sparse ? "sparse" : "continuous"); crt_req_decref(rpc); diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h index ec6aa3b817e..cfe3385e959 100644 --- a/src/object/obj_internal.h +++ b/src/object/obj_internal.h @@ -292,10 +292,16 @@ struct coll_oper_args { struct shard_auxi_args coa_auxi; int coa_dct_nr; uint32_t coa_dct_cap; - uint32_t coa_max_dct_sz; + union { + /* Save obj->cob_min_rank for verification during obj_coll_prep_one(). */ + uint32_t coa_min_rank; + /* Only can be used since obj_coll_oper_args_collapse() after object layout scan. */ + uint32_t coa_max_dct_sz; + }; uint8_t coa_max_shard_nr; uint8_t coa_max_bitmap_sz; uint8_t coa_for_modify:1, + coa_raw_sparse:1, coa_sparse:1; uint8_t coa_target_nr; /* diff --git a/src/object/obj_utils.c b/src/object/obj_utils.c index 7bf0ef4aaf9..82d91c966ac 100644 --- a/src/object/obj_utils.c +++ b/src/object/obj_utils.c @@ -633,7 +633,7 @@ obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *t * use the "cur_pos" as the relay engine. */ pos = rand % (ocdc->tgt_nr - ocdc->cur_pos) + ocdc->cur_pos; - if (pos != ocdc->cur_pos && tgts[pos].dct_rank > dct->dct_rank) { + if (pos > ocdc->cur_pos && tgts[pos].dct_rank > dct->dct_rank) { memcpy(&tmp, &tgts[pos], sizeof(tmp)); memcpy(&tgts[pos], dct, sizeof(tmp)); memcpy(dct, &tmp, sizeof(tmp)); diff --git a/src/object/srv_coll.c b/src/object/srv_coll.c index 2a152b47bd6..5b59f954f86 100644 --- a/src/object/srv_coll.c +++ b/src/object/srv_coll.c @@ -239,9 +239,15 @@ obj_coll_punch_prep(struct obj_coll_punch_in *ocpi, struct daos_coll_target *dct int i; int j; - /* dcts[0] is for current engine. */ - if (dcts[0].dct_bitmap == NULL || dcts[0].dct_bitmap_sz == 0 || - dcts[0].dct_shards == NULL) { + /* dcts[0] must be for current engine. */ + if (unlikely(dcts[0].dct_rank != dss_self_rank())) { + D_ERROR("Invalid targets array: rank %u vs %u, nr %u, flags %x\n", + dcts[0].dct_rank, dss_self_rank(), dct_nr, ocpi->ocpi_flags); + D_GOTO(out, rc = -DER_INVAL); + } + + if (unlikely(dcts[0].dct_bitmap == NULL || dcts[0].dct_bitmap_sz == 0 || + dcts[0].dct_shards == NULL)) { D_ERROR("Invalid input for current engine: bitmap %s, bitmap_sz %u, shards %s\n", dcts[0].dct_bitmap == NULL ? "empty" : "non-empty", dcts[0].dct_bitmap_sz, dcts[0].dct_shards == NULL ? "empty" : "non-empty"); diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index f220149a532..041ea903c4f 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -5615,12 +5615,12 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) D_DEBUG(DB_IO, "(%s) handling collective punch RPC %p for obj " DF_UOID" on XS %u/%u epc "DF_X64" pmv %u, with dti " - DF_DTI", forward width %u, forward depth %u\n", + DF_DTI", forward width %u, forward depth %u, flags %x\n", (ocpi->ocpi_flags & ORF_LEADER) ? "leader" : (ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"), rpc, DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, ocpi->ocpi_epoch, ocpi->ocpi_map_ver, DP_DTI(&ocpi->ocpi_xid), - ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth); + ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth, ocpi->ocpi_flags); D_ASSERT(dmi->dmi_xs_id != 0); @@ -5757,13 +5757,13 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) DL_CDEBUG(rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART, DLOG_ERR, DB_IO, rc, "(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u epc " DF_X64" pmv %u/%u, with dti "DF_DTI", bulk_tgt_sz %u, bulk_tgt_nr %u, " - "tgt_nr %u, forward width %u, forward depth %u", + "tgt_nr %u, forward width %u, forward depth %u, flags %x", (ocpi->ocpi_flags & ORF_LEADER) ? "leader" : (ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"), rpc, DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, ocpi->ocpi_epoch, ocpi->ocpi_map_ver, max_ver, DP_DTI(&ocpi->ocpi_xid), ocpi->ocpi_bulk_tgt_sz, ocpi->ocpi_bulk_tgt_nr, (unsigned int)ocpi->ocpi_tgts.ca_count, - ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth); + ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth, ocpi->ocpi_flags); obj_punch_complete(rpc, rc, max_ver); diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index f1d69b54f4e..0f87bfd4d0e 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -1973,9 +1973,6 @@ vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, daos_unit_oid_t *oid, rc = -DER_INPROGRESS; } - if (rc < 0) - D_ERROR("Failed to load mbs for "DF_DTI": "DF_RC"\n", DP_DTI(dti), DP_RC(rc)); - return rc; } From c4cf4f7b58f9315c2a294c899d807edb6b8917b0 Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Mon, 28 Oct 2024 13:49:15 +0000 Subject: [PATCH 24/34] DAOS-16687 control: Handle missing PCIe caps in storage query usage (#15296) (#15392) Missing PCIe capabilities when querying a NVMe SSD's configuration space is unusual but should be handled gracefully by the control-plane and shouldn't cause a failure to return usage statistics when calling dmg storage query usage. Update so that pciutils lib is only called when attempting to display health stats via dmg and not when fetching usage info. Improve clarity of workflow to ease maintenance and add test coverage for updates. Enable continued functionality when NVMe device doesn't return any extended capabilities in PCIe configuration space data by adding sentinel error to library for such a case. Signed-off-by: Tom Nabarro --- .../common/proto/ctl/storage_nvme.pb.go | 46 +++-- src/control/lib/control/storage.go | 4 +- src/control/lib/hardware/pciutils/bindings.go | 3 +- src/control/server/ctl_smd_rpc_test.go | 49 +++++ src/control/server/instance_storage_rpc.go | 134 ++++++++++---- .../server/instance_storage_rpc_test.go | 172 ++++++++++++------ src/proto/ctl/storage_nvme.proto | 3 +- 7 files changed, 308 insertions(+), 103 deletions(-) diff --git a/src/control/common/proto/ctl/storage_nvme.pb.go b/src/control/common/proto/ctl/storage_nvme.pb.go index 62fede43ed4..cb2dc5099d4 100644 --- a/src/control/common/proto/ctl/storage_nvme.pb.go +++ b/src/control/common/proto/ctl/storage_nvme.pb.go @@ -95,11 +95,12 @@ type ScanNvmeReq struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields - Health bool `protobuf:"varint,1,opt,name=Health,proto3" json:"Health,omitempty"` // Retrieve NVMe device health statistics - Meta bool `protobuf:"varint,2,opt,name=Meta,proto3" json:"Meta,omitempty"` // Retrieve metadata relating to NVMe device - Basic bool `protobuf:"varint,3,opt,name=Basic,proto3" json:"Basic,omitempty"` // Strip NVMe device details to only basic - MetaSize uint64 `protobuf:"varint,4,opt,name=MetaSize,proto3" json:"MetaSize,omitempty"` // Size of the metadata blob - RdbSize uint64 `protobuf:"varint,5,opt,name=RdbSize,proto3" json:"RdbSize,omitempty"` // Size of the RDB blob + Health bool `protobuf:"varint,1,opt,name=Health,proto3" json:"Health,omitempty"` // Retrieve NVMe device health statistics + Meta bool `protobuf:"varint,2,opt,name=Meta,proto3" json:"Meta,omitempty"` // Retrieve metadata relating to NVMe device + Basic bool `protobuf:"varint,3,opt,name=Basic,proto3" json:"Basic,omitempty"` // Strip NVMe device details to only basic + MetaSize uint64 `protobuf:"varint,4,opt,name=MetaSize,proto3" json:"MetaSize,omitempty"` // Size of the metadata blob + RdbSize uint64 `protobuf:"varint,5,opt,name=RdbSize,proto3" json:"RdbSize,omitempty"` // Size of the RDB blob + LinkStats bool `protobuf:"varint,6,opt,name=LinkStats,proto3" json:"LinkStats,omitempty"` // Populate PCIe link info in health statistics } func (x *ScanNvmeReq) Reset() { @@ -169,6 +170,13 @@ func (x *ScanNvmeReq) GetRdbSize() uint64 { return 0 } +func (x *ScanNvmeReq) GetLinkStats() bool { + if x != nil { + return x.LinkStats + } + return false +} + type ScanNvmeResp struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache @@ -276,7 +284,7 @@ var file_ctl_storage_nvme_proto_rawDesc = []byte{ 0x32, 0x12, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x1b, 0x0a, 0x09, 0x72, 0x6f, 0x6c, 0x65, 0x5f, 0x62, 0x69, 0x74, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x08, - 0x72, 0x6f, 0x6c, 0x65, 0x42, 0x69, 0x74, 0x73, 0x22, 0x85, 0x01, 0x0a, 0x0b, 0x53, 0x63, 0x61, + 0x72, 0x6f, 0x6c, 0x65, 0x42, 0x69, 0x74, 0x73, 0x22, 0xa3, 0x01, 0x0a, 0x0b, 0x53, 0x63, 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x12, 0x16, 0x0a, 0x06, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x18, 0x01, 0x20, 0x01, 0x28, 0x08, 0x52, 0x06, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x12, 0x12, 0x0a, 0x04, 0x4d, 0x65, 0x74, 0x61, 0x18, 0x02, 0x20, 0x01, 0x28, 0x08, 0x52, 0x04, @@ -285,18 +293,20 @@ var file_ctl_storage_nvme_proto_rawDesc = []byte{ 0x74, 0x61, 0x53, 0x69, 0x7a, 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x08, 0x4d, 0x65, 0x74, 0x61, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x52, 0x64, 0x62, 0x53, 0x69, 0x7a, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x52, 0x64, 0x62, 0x53, 0x69, 0x7a, 0x65, - 0x22, 0x65, 0x0a, 0x0c, 0x53, 0x63, 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x73, 0x70, - 0x12, 0x2b, 0x0a, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, - 0x32, 0x13, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x4e, 0x76, 0x6d, 0x65, 0x43, 0x6f, 0x6e, 0x74, 0x72, - 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x52, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x12, 0x28, 0x0a, - 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x63, - 0x74, 0x6c, 0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, - 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x22, 0x0f, 0x0a, 0x0d, 0x46, 0x6f, 0x72, 0x6d, 0x61, - 0x74, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x42, 0x39, 0x5a, 0x37, 0x67, 0x69, 0x74, 0x68, - 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, - 0x6b, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, - 0x6f, 0x6c, 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, - 0x63, 0x74, 0x6c, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, + 0x12, 0x1c, 0x0a, 0x09, 0x4c, 0x69, 0x6e, 0x6b, 0x53, 0x74, 0x61, 0x74, 0x73, 0x18, 0x06, 0x20, + 0x01, 0x28, 0x08, 0x52, 0x09, 0x4c, 0x69, 0x6e, 0x6b, 0x53, 0x74, 0x61, 0x74, 0x73, 0x22, 0x65, + 0x0a, 0x0c, 0x53, 0x63, 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x2b, + 0x0a, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x13, + 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x4e, 0x76, 0x6d, 0x65, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, + 0x6c, 0x65, 0x72, 0x52, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x12, 0x28, 0x0a, 0x05, 0x73, + 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x63, 0x74, 0x6c, + 0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, + 0x73, 0x74, 0x61, 0x74, 0x65, 0x22, 0x0f, 0x0a, 0x0d, 0x46, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4e, + 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x42, 0x39, 0x5a, 0x37, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, + 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x2f, + 0x64, 0x61, 0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, + 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x63, 0x74, + 0x6c, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( diff --git a/src/control/lib/control/storage.go b/src/control/lib/control/storage.go index 65942a6c12d..9d5fe470de6 100644 --- a/src/control/lib/control/storage.go +++ b/src/control/lib/control/storage.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -258,6 +258,8 @@ func StorageScan(ctx context.Context, rpcClient UnaryInvoker, req *StorageScanRe // Health and meta details required to populate usage statistics. Health: req.NvmeHealth || req.Usage, Meta: req.Usage, + // Only request link stats if health explicitly requested. + LinkStats: req.NvmeHealth, }, }) }) diff --git a/src/control/lib/hardware/pciutils/bindings.go b/src/control/lib/hardware/pciutils/bindings.go index 86686896c80..f4d43c50fa3 100644 --- a/src/control/lib/hardware/pciutils/bindings.go +++ b/src/control/lib/hardware/pciutils/bindings.go @@ -40,6 +40,7 @@ var ( ErrMultiDevices = errors.New("want single device config got multiple") ErrCfgNotTerminated = errors.New("device config content not new-line terminated") ErrCfgMissing = errors.New("incomplete device config") + ErrNoPCIeCaps = errors.New("no pci-express capabilities found") ) // api provides the PCIeLinkStatsProvider interface by exposing a concrete implementation of @@ -150,7 +151,7 @@ func (ap *api) PCIeCapsFromConfig(cfgBytes []byte, dev *hardware.PCIDevice) erro var cp *C.struct_pci_cap = C.pci_find_cap(pciDev, C.PCI_CAP_ID_EXP, C.PCI_CAP_NORMAL) if cp == nil { - return errors.New("no pci-express capabilities found") + return ErrNoPCIeCaps } cpAddr := uint32(cp.addr) diff --git a/src/control/server/ctl_smd_rpc_test.go b/src/control/server/ctl_smd_rpc_test.go index 6378e81d23c..06f1276fa25 100644 --- a/src/control/server/ctl_smd_rpc_test.go +++ b/src/control/server/ctl_smd_rpc_test.go @@ -17,6 +17,7 @@ import ( "github.com/daos-stack/daos/src/control/common/test" "github.com/daos-stack/daos/src/control/drpc" "github.com/daos-stack/daos/src/control/lib/daos" + "github.com/daos-stack/daos/src/control/lib/hardware/pciutils" "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/logging" "github.com/daos-stack/daos/src/control/server/config" @@ -88,6 +89,7 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) { drpcResps map[int][]*mockDrpcResponse harnessStopped bool ioStopped bool + pciDevErr error expResp *ctlpb.SmdQueryResp expErr error }{ @@ -658,6 +660,46 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) { }, expErr: daos.FreeMemError, }, + "list-devices; with health; update link stats": { + req: &ctlpb.SmdQueryReq{ + OmitPools: true, + Rank: uint32(ranklist.NilRank), + Uuid: test.MockUUID(1), + IncludeBioHealth: true, + }, + drpcResps: map[int][]*mockDrpcResponse{ + 0: { + { + Message: &ctlpb.SmdDevResp{ + Devices: []*ctlpb.SmdDevice{pbNormDev(0)}, + }, + }, + }, + 1: { + { + Message: &ctlpb.SmdDevResp{ + Devices: []*ctlpb.SmdDevice{ + func() *ctlpb.SmdDevice { + sd := pbFaultDev(1) + sd.Ctrlr.PciCfg = "ABCD" + return sd + }(), + }, + }, + }, + { + Message: &ctlpb.BioHealthResp{ + Temperature: 1000000, + TempWarn: true, + }, + }, + }, + }, + // Prove mock link stats provider gets called when IncludeBioHealth + // flag is set and Ctrlr.PciCfg string is not empty. + pciDevErr: errors.New("link stats provider fail"), + expErr: errors.New("link stats provider fail"), + }, "ambiguous UUID": { req: &ctlpb.SmdQueryReq{ Rank: uint32(ranklist.NilRank), @@ -680,6 +722,13 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) + linkStatsProv = &mockPCIeLinkStatsProvider{ + pciDevErr: tc.pciDevErr, + } + defer func() { + linkStatsProv = pciutils.NewPCIeLinkStatsProvider() + }() + engineCount := len(tc.drpcResps) if engineCount == 0 { engineCount = 1 diff --git a/src/control/server/instance_storage_rpc.go b/src/control/server/instance_storage_rpc.go index fbca1e534a0..3097440f500 100644 --- a/src/control/server/instance_storage_rpc.go +++ b/src/control/server/instance_storage_rpc.go @@ -23,14 +23,19 @@ import ( "github.com/daos-stack/daos/src/control/events" "github.com/daos-stack/daos/src/control/fault" "github.com/daos-stack/daos/src/control/lib/hardware" - "github.com/daos-stack/daos/src/control/lib/hardware/hwprov" + "github.com/daos-stack/daos/src/control/lib/hardware/pciutils" "github.com/daos-stack/daos/src/control/server/storage" ) var ( - scanSmd = listSmdDevices - getCtrlrHealth = getBioHealth + // Function pointers to enable mocking. + scanSmd = listSmdDevices + scanHealth = getBioHealth + linkStatsProv = pciutils.NewPCIeLinkStatsProvider() + + // Sentinel errors to enable comparison. errEngineBdevScanEmptyDevList = errors.New("empty device list for engine instance") + errCtrlrHealthSkipped = errors.New("controller health update was skipped") ) // newMntRet creates and populates SCM mount result. @@ -168,13 +173,17 @@ func (ei *EngineInstance) StorageFormatSCM(ctx context.Context, force bool) (mRe } func addLinkInfoToHealthStats(prov hardware.PCIeLinkStatsProvider, pciCfg string, health *ctlpb.BioHealthResp) error { + if health == nil { + return errors.New("nil BioHealthResp") + } + // Convert byte-string to lspci-format. sb := new(strings.Builder) formatBytestring(pciCfg, sb) pciDev := &hardware.PCIDevice{} if err := prov.PCIeCapsFromConfig([]byte(sb.String()), pciDev); err != nil { - return err + return errors.Wrap(err, "pciutils lib") } // Copy link details from PCIDevice to health stats. @@ -243,31 +252,74 @@ func publishLinkStatEvents(engine Engine, pciAddr string, stats *ctlpb.BioHealth lastMaxWidthStr, lastWidthStr, stats.LinkPortId) } -func populateCtrlrHealth(ctx context.Context, engine Engine, req *ctlpb.BioHealthReq, ctrlr *ctlpb.NvmeController, prov hardware.PCIeLinkStatsProvider) (bool, error) { - stateName := ctlpb.NvmeDevState_name[int32(ctrlr.DevState)] - if !ctrlr.CanSupplyHealthStats() { - engine.Debugf("skip fetching health stats on device %q in %q state", - ctrlr.PciAddr, stateName) - return false, nil +type ctrlrHealthReq struct { + meta bool + engine Engine + bhReq *ctlpb.BioHealthReq + ctrlr *ctlpb.NvmeController + linkStatsProv hardware.PCIeLinkStatsProvider +} + +// Retrieve NVMe controller health statistics for those in an acceptable state. Return nil health +// resp if in a bad state. +func getCtrlrHealth(ctx context.Context, req ctrlrHealthReq) (*ctlpb.BioHealthResp, error) { + stateName := ctlpb.NvmeDevState_name[int32(req.ctrlr.DevState)] + if !req.ctrlr.CanSupplyHealthStats() { + req.engine.Debugf("skip fetching health stats on device %q in %q state", + req.ctrlr.PciAddr, stateName) + return nil, errCtrlrHealthSkipped } - health, err := getCtrlrHealth(ctx, engine, req) + health, err := scanHealth(ctx, req.engine, req.bhReq) if err != nil { - return false, errors.Wrapf(err, "retrieve health stats for %q (state %q)", ctrlr, + return nil, errors.Wrapf(err, "retrieve health stats for %q (state %q)", req.ctrlr, stateName) } - if ctrlr.PciCfg != "" { - if err := addLinkInfoToHealthStats(prov, ctrlr.PciCfg, health); err != nil { - return false, errors.Wrapf(err, "add link stats for %q", ctrlr) + return health, nil +} + +// Add link state and capability information to input health statistics for the given controller +// then if successful publish events based on link statistic changes. Link updated health stats to +// controller. +func setCtrlrHealthWithLinkInfo(req ctrlrHealthReq, health *ctlpb.BioHealthResp) error { + err := addLinkInfoToHealthStats(req.linkStatsProv, req.ctrlr.PciCfg, health) + if err == nil { + publishLinkStatEvents(req.engine, req.ctrlr.PciAddr, health) + } else { + if errors.Cause(err) != pciutils.ErrNoPCIeCaps { + return errors.Wrapf(err, "add link stats for %q", req.ctrlr) + } + req.engine.Debugf("device %q not reporting PCIe capabilities", req.ctrlr.PciAddr) + } + + return nil +} + +// Update controller health statistics and include link info if required and available. +func populateCtrlrHealth(ctx context.Context, req ctrlrHealthReq) (bool, error) { + health, err := getCtrlrHealth(ctx, req) + if err != nil { + if err == errCtrlrHealthSkipped { + // Nothing to do. + return false, nil } - publishLinkStatEvents(engine, ctrlr.PciAddr, health) + return false, errors.Wrap(err, "get ctrlr health") + } + + if req.linkStatsProv == nil { + req.engine.Debugf("device %q skip adding link stats; nil provider", + req.ctrlr.PciAddr) + } else if req.ctrlr.PciCfg == "" { + req.engine.Debugf("device %q skip adding link stats; empty pci cfg", + req.ctrlr.PciAddr) } else { - engine.Debugf("no pcie config space received for %q, skip add link stats", ctrlr) + if err = setCtrlrHealthWithLinkInfo(req, health); err != nil { + return false, errors.Wrap(err, "set ctrlr health") + } } - ctrlr.HealthStats = health - ctrlr.PciCfg = "" + req.ctrlr.HealthStats = health return true, nil } @@ -305,12 +357,13 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc c := seenCtrlrs[addr] - // Only minimal info provided in standard scan to enable result aggregation across - // homogeneous hosts. engineRank, err := engine.GetRank() if err != nil { return nil, errors.Wrapf(err, "instance %d GetRank", engine.Index()) } + + // Only provide minimal info in standard scan to enable result aggregation across + // homogeneous hosts. nsd := &ctlpb.SmdDevice{ RoleBits: sd.RoleBits, CtrlrNamespaceId: sd.CtrlrNamespaceId, @@ -326,18 +379,29 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc // Populate health if requested. healthUpdated := false if pbReq.Health && c.HealthStats == nil { - bhReq := &ctlpb.BioHealthReq{ - DevUuid: sd.Uuid, - MetaSize: pbReq.MetaSize, - RdbSize: pbReq.RdbSize, + bhReq := &ctlpb.BioHealthReq{DevUuid: sd.Uuid} + if pbReq.Meta { + bhReq.MetaSize = pbReq.MetaSize + bhReq.RdbSize = pbReq.RdbSize } - upd, err := populateCtrlrHealth(ctx, engine, bhReq, c, - hwprov.DefaultPCIeLinkStatsProvider()) + + chReq := ctrlrHealthReq{ + engine: engine, + bhReq: bhReq, + ctrlr: c, + } + if pbReq.LinkStats { + // Add link stats to health if flag set. + chReq.linkStatsProv = linkStatsProv + } + + healthUpdated, err = populateCtrlrHealth(ctx, chReq) if err != nil { return nil, err } - healthUpdated = upd } + // Used to update health with link stats, now redundant. + c.PciCfg = "" // Populate usage data if requested. if pbReq.Meta { @@ -510,12 +574,20 @@ func smdQueryEngine(ctx context.Context, engine Engine, pbReq *ctlpb.SmdQueryReq continue // Skip health query if UUID doesn't match requested. } if pbReq.IncludeBioHealth { - bhReq := &ctlpb.BioHealthReq{DevUuid: dev.Uuid} - if _, err := populateCtrlrHealth(ctx, engine, bhReq, dev.Ctrlr, - hwprov.DefaultPCIeLinkStatsProvider()); err != nil { + chReq := ctrlrHealthReq{ + engine: engine, + bhReq: &ctlpb.BioHealthReq{DevUuid: dev.Uuid}, + ctrlr: dev.Ctrlr, + linkStatsProv: linkStatsProv, + } + + if _, err = populateCtrlrHealth(ctx, chReq); err != nil { return nil, err } } + // Used to update health with link stats, now redundant. + dev.Ctrlr.PciCfg = "" + if pbReq.Uuid != "" && dev.Uuid == pbReq.Uuid { rResp.Devices = []*ctlpb.SmdDevice{dev} found = true diff --git a/src/control/server/instance_storage_rpc_test.go b/src/control/server/instance_storage_rpc_test.go index b199adb6b8d..0f72f739970 100644 --- a/src/control/server/instance_storage_rpc_test.go +++ b/src/control/server/instance_storage_rpc_test.go @@ -21,6 +21,7 @@ import ( "github.com/daos-stack/daos/src/control/common/test" "github.com/daos-stack/daos/src/control/events" "github.com/daos-stack/daos/src/control/lib/hardware" + "github.com/daos-stack/daos/src/control/lib/hardware/pciutils" "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/logging" "github.com/daos-stack/daos/src/control/server/config" @@ -59,24 +60,27 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { } for name, tc := range map[string]struct { - badDevState bool - noPciCfgSpc bool - pciDev *hardware.PCIDevice - pciDevErr error - emptyHealthRes bool - healthErr error - lastStats map[string]*ctlpb.BioHealthResp - expCtrlr *ctlpb.NvmeController - expNotUpdated bool - expErr error - expDispatched []*events.RASEvent - expLastStats map[string]*ctlpb.BioHealthResp + badDevState bool + nilLinkStatsProv bool + noPciCfgSpc bool + pciDev *hardware.PCIDevice + pciDevErr error + healthReq *ctlpb.BioHealthReq + healthRes *ctlpb.BioHealthResp + nilHealthRes bool + healthErr error + lastStats map[string]*ctlpb.BioHealthResp + expCtrlr *ctlpb.NvmeController + expNotUpdated bool + expErr error + expDispatched []*events.RASEvent + expLastStats map[string]*ctlpb.BioHealthResp }{ "bad state; skip health": { badDevState: true, - noPciCfgSpc: true, expCtrlr: &ctlpb.NvmeController{ PciAddr: pciAddr, + PciCfg: "ABCD", }, expNotUpdated: true, }, @@ -88,11 +92,16 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { HealthStats: healthWithLinkStats(0, 0, 0, 0), }, }, + "nil bio health response": { + nilHealthRes: true, + expErr: errors.New("nil BioHealthResp"), + }, "empty bio health response; empty link stats": { - emptyHealthRes: true, - pciDev: new(hardware.PCIDevice), + healthRes: new(ctlpb.BioHealthResp), + pciDev: new(hardware.PCIDevice), expCtrlr: &ctlpb.NvmeController{ PciAddr: pciAddr, + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: new(ctlpb.BioHealthResp), }, @@ -106,9 +115,19 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { pciDevErr: errors.New("fail"), expErr: errors.New("fail"), }, + "update health; add link stats; pciutils lib error; missing pcie caps": { + pciDevErr: pciutils.ErrNoPCIeCaps, + expCtrlr: &ctlpb.NvmeController{ + PciAddr: pciAddr, + PciCfg: "ABCD", + DevState: ctlpb.NvmeDevState_NORMAL, + HealthStats: healthWithLinkStats(0, 0, 0, 0), + }, + }, "update health; add link stats; normal link state; no event published": { expCtrlr: &ctlpb.NvmeController{ PciAddr: pciAddr, + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: proto.MockNvmeHealth(), }, @@ -128,6 +147,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { }, expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(2.5e+9, 1e+9, 4, 4), }, @@ -152,6 +172,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { }, expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(1e+9, 1e+9, 8, 4), }, @@ -167,6 +188,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(proto.MockNvmeHealth()), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: proto.MockNvmeHealth(), }, @@ -176,6 +198,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(1e+9, 0.5e+9, 4, 4)), expCtrlr: &ctlpb.NvmeController{ PciAddr: pciAddr, + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: proto.MockNvmeHealth(), }, @@ -191,6 +214,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(1e+9, 1e+9, 4, 1)), expCtrlr: &ctlpb.NvmeController{ PciAddr: pciAddr, + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: proto.MockNvmeHealth(), }, @@ -213,6 +237,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(2.5e+9, 1e+9, 4, 4)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(2.5e+9, 1e+9, 4, 4), }, @@ -229,6 +254,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(1e+9, 1e+9, 8, 4)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(1e+9, 1e+9, 8, 4), }, @@ -245,6 +271,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(2.5e+9, 2.5e+9, 8, 4)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(2.5e+9, 1e+9, 8, 8), }, @@ -271,6 +298,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(2.5e+9, 1e+9, 8, 8)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(2.5e+9, 2.5e+9, 8, 4), }, @@ -297,6 +325,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(8e+9, 2.5e+9, 4, 4)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(8e+9, 1e+9, 4, 4), }, @@ -319,6 +348,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(1e+9, 1e+9, 16, 8)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(1e+9, 1e+9, 16, 4), }, @@ -335,15 +365,11 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) - healthRes := healthWithLinkStats(0, 0, 0, 0) - if tc.emptyHealthRes { - healthRes = new(ctlpb.BioHealthResp) - } - getCtrlrHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) { - return healthRes, tc.healthErr + scanHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) { + return tc.healthRes, tc.healthErr } defer func() { - getCtrlrHealth = getBioHealth + scanHealth = getBioHealth }() var devState ctlpb.NvmeDevState @@ -363,10 +389,16 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { LinkMaxWidth: 4, } } + if tc.healthRes == nil && !tc.nilHealthRes { + tc.healthRes = healthWithLinkStats(0, 0, 0, 0) + } - mockProv := &mockPCIeLinkStatsProvider{ - pciDev: tc.pciDev, - pciDevErr: tc.pciDevErr, + var mockProv *mockPCIeLinkStatsProvider + if !tc.nilLinkStatsProv { + mockProv = &mockPCIeLinkStatsProvider{ + pciDev: tc.pciDev, + pciDevErr: tc.pciDevErr, + } } ctrlr := &ctlpb.NvmeController{ @@ -387,8 +419,14 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { subscriber := newMockSubscriber(2) ps.Subscribe(events.RASTypeInfoOnly, subscriber) - upd, err := populateCtrlrHealth(test.Context(t), ei, - &ctlpb.BioHealthReq{}, ctrlr, mockProv) + chReq := ctrlrHealthReq{ + engine: ei, + bhReq: tc.healthReq, + ctrlr: ctrlr, + linkStatsProv: mockProv, + } + + upd, err := populateCtrlrHealth(test.Context(t), chReq) test.CmpErr(t, tc.expErr, err) if err != nil { return @@ -436,6 +474,7 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { } defSmdScanRes := func() *ctlpb.SmdDevResp { sd := proto.MockSmdDevice(c, 2) + sd.Rank = 2 return &ctlpb.SmdDevResp{Devices: []*ctlpb.SmdDevice{sd}} } healthRespWithUsage := func() *ctlpb.BioHealthResp { @@ -444,6 +483,19 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { mh.MetaWalSize, mh.RdbWalSize = 4, 5 return mh } + ctrlrWithUsageAndMeta := func() *ctlpb.NvmeController { + c := proto.MockNvmeController(2) + c.HealthStats = healthRespWithUsage() + sd := proto.MockSmdDevice(nil, 2) + sd.Rank = 1 + sd.TotalBytes = c.HealthStats.TotalBytes + sd.AvailBytes = c.HealthStats.AvailBytes + sd.ClusterSize = c.HealthStats.ClusterSize + sd.MetaWalSize = c.HealthStats.MetaWalSize + sd.RdbWalSize = c.HealthStats.RdbWalSize + c.SmdDevices = []*ctlpb.SmdDevice{sd} + return c + } for name, tc := range map[string]struct { req ctlpb.ScanNvmeReq @@ -640,28 +692,14 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { State: new(ctlpb.ResponseState), }, }, - "scan over drpc; with smd and health; usage and wal size reported": { + "scan over drpc; with meta and health; usage and wal size reported": { req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, rank: 1, smdRes: defSmdScanRes(), healthRes: healthRespWithUsage(), expResp: &ctlpb.ScanNvmeResp{ - Ctrlrs: proto.NvmeControllers{ - func() *ctlpb.NvmeController { - c := proto.MockNvmeController(2) - c.HealthStats = healthRespWithUsage() - sd := proto.MockSmdDevice(nil, 2) - sd.Rank = 1 - sd.TotalBytes = c.HealthStats.TotalBytes - sd.AvailBytes = c.HealthStats.AvailBytes - sd.ClusterSize = c.HealthStats.ClusterSize - sd.MetaWalSize = c.HealthStats.MetaWalSize - sd.RdbWalSize = c.HealthStats.RdbWalSize - c.SmdDevices = []*ctlpb.SmdDevice{sd} - return c - }(), - }, - State: new(ctlpb.ResponseState), + Ctrlrs: proto.NvmeControllers{ctrlrWithUsageAndMeta()}, + State: new(ctlpb.ResponseState), }, }, "scan over drpc; only ctrlrs with valid states shown": { @@ -703,7 +741,7 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { State: new(ctlpb.ResponseState), }, }, - "scan over drpc; with smd and health; missing ctrlr in smd": { + "scan over drpc; with meta and health; missing ctrlr in smd": { req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, smdRes: func() *ctlpb.SmdDevResp { ssr := defSmdScanRes() @@ -713,23 +751,48 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { healthRes: healthRespWithUsage(), expErr: errors.New("no ctrlr ref"), }, - "scan over drpc; with smd and health; health scan fails": { + "scan over drpc; with meta and health; health scan fails": { req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, smdRes: defSmdScanRes(), healthErr: errors.New("health scan failed"), expErr: errors.New("health scan failed"), }, - "scan over drpc; with smd and health; smd list fails": { + "scan over drpc; with meta and health; smd list fails": { req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, smdErr: errors.New("smd scan failed"), healthRes: healthRespWithUsage(), expErr: errors.New("smd scan failed"), }, - "scan over drpc; with smd and health; nil smd list returned": { + "scan over drpc; with meta and health; nil smd list returned": { req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, healthRes: healthRespWithUsage(), expErr: errors.New("nil smd scan resp"), }, + "scan over drpc; with meta and health; link info update skipped": { + req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, + rank: 1, + smdRes: func() *ctlpb.SmdDevResp { + ssr := defSmdScanRes() + ssr.Devices[0].Ctrlr.PciCfg = "ABCD" + return ssr + }(), + healthRes: healthRespWithUsage(), + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ctrlrWithUsageAndMeta()}, + State: new(ctlpb.ResponseState), + }, + }, + "scan over drpc; with health; link info update run but failed": { + req: ctlpb.ScanNvmeReq{Health: true, LinkStats: true}, + smdRes: func() *ctlpb.SmdDevResp { + ssr := defSmdScanRes() + ssr.Devices[0].Ctrlr.PciCfg = "ABCD" + return ssr + }(), + healthRes: healthRespWithUsage(), + // Prove link stat provider gets called when LinkStats flag set. + expErr: errors.New("link stats provider fail"), + }, } { t.Run(name, func(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) @@ -741,11 +804,17 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { defer func() { scanSmd = listSmdDevices }() - getCtrlrHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) { + scanHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) { return tc.healthRes, tc.healthErr } defer func() { - getCtrlrHealth = getBioHealth + scanHealth = getBioHealth + }() + linkStatsProv = &mockPCIeLinkStatsProvider{ + pciDevErr: errors.New("link stats provider fail"), + } + defer func() { + linkStatsProv = pciutils.NewPCIeLinkStatsProvider() }() if tc.provRes == nil { @@ -776,7 +845,8 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { ei.setSuperblock(nil) } else { ei.setSuperblock(&Superblock{ - Rank: ranklist.NewRankPtr(uint32(tc.rank)), ValidRank: true, + Rank: ranklist.NewRankPtr(uint32(tc.rank)), + ValidRank: true, }) } diff --git a/src/proto/ctl/storage_nvme.proto b/src/proto/ctl/storage_nvme.proto index 944d8e943ba..be068e5274d 100644 --- a/src/proto/ctl/storage_nvme.proto +++ b/src/proto/ctl/storage_nvme.proto @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2023 Intel Corporation. +// (C) Copyright 2019-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -28,6 +28,7 @@ message ScanNvmeReq { bool Basic = 3; // Strip NVMe device details to only basic uint64 MetaSize = 4; // Size of the metadata blob uint64 RdbSize = 5; // Size of the RDB blob + bool LinkStats = 6; // Populate PCIe link info in health statistics } message ScanNvmeResp { From eb95b5555016ccb40c6915ededd17943f652f1e0 Mon Sep 17 00:00:00 2001 From: wiliamhuang Date: Mon, 28 Oct 2024 09:00:18 -0500 Subject: [PATCH 25/34] DAOS-16722 client: to intercept PMPI_Init() in libpil4dfs (#15387) Intercept PMPI_Init() to avoid calling daos_init() if MPI_Init() is intercepted by other library (like darshan and mpip). Signed-off-by: Lei Huang --- src/client/dfuse/pil4dfs/int_dfs.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/client/dfuse/pil4dfs/int_dfs.c b/src/client/dfuse/pil4dfs/int_dfs.c index 3087cb09c75..b69449f1e3d 100644 --- a/src/client/dfuse/pil4dfs/int_dfs.c +++ b/src/client/dfuse/pil4dfs/int_dfs.c @@ -470,6 +470,7 @@ static int (*next_tcgetattr)(int fd, void *termios_p); /* end NOT supported by DAOS */ static int (*next_mpi_init)(int *argc, char ***argv); +static int (*next_pmpi_init)(int *argc, char ***argv); /* to do!! */ /** @@ -1041,6 +1042,22 @@ MPI_Init(int *argc, char ***argv) return rc; } +int +PMPI_Init(int *argc, char ***argv) +{ + int rc; + + if (next_pmpi_init == NULL) { + next_pmpi_init = dlsym(RTLD_NEXT, "PMPI_Init"); + D_ASSERT(next_pmpi_init != NULL); + } + + atomic_fetch_add_relaxed(&mpi_init_count, 1); + rc = next_pmpi_init(argc, argv); + atomic_fetch_add_relaxed(&mpi_init_count, -1); + return rc; +} + /** determine whether a path (both relative and absolute) is on DAOS or not. If yes, * returns parent object, item name, full path of parent dir, full absolute path, and * the pointer to struct dfs_mt. From bde13c310ef66e302d3bfb55af2b252a67386cb7 Mon Sep 17 00:00:00 2001 From: mjean308 <48688872+mjean308@users.noreply.github.com> Date: Mon, 28 Oct 2024 11:25:47 -0400 Subject: [PATCH 26/34] DAOS-15943 test: Remove server logging from pre-teardown (#15282) (#15386) Signed-off-by: Maureen Jean --- src/tests/ftest/util/soak_test_base.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/src/tests/ftest/util/soak_test_base.py b/src/tests/ftest/util/soak_test_base.py index f32e068cb16..2d1c11e722e 100644 --- a/src/tests/ftest/util/soak_test_base.py +++ b/src/tests/ftest/util/soak_test_base.py @@ -25,8 +25,8 @@ from soak_utils import (SoakTestError, add_pools, build_job_script, cleanup_dfuse, create_app_cmdline, create_dm_cmdline, create_fio_cmdline, create_ior_cmdline, create_macsio_cmdline, create_mdtest_cmdline, - create_racer_cmdline, ddhhmmss_format, get_daos_server_logs, get_harassers, - get_journalctl, launch_exclude_reintegrate, launch_extend, launch_reboot, + create_racer_cmdline, ddhhmmss_format, get_harassers, + launch_exclude_reintegrate, launch_extend, launch_reboot, launch_server_stop_start, launch_snapshot, launch_vmd_identify_check, reserved_file_copy, run_event_check, run_metrics_check, run_monitor_check) @@ -164,17 +164,6 @@ def pre_tear_down(self): # display final metrics run_metrics_check(self, prefix="final") - # Gather server logs - try: - get_daos_server_logs(self) - except SoakTestError as error: - errors.append(f"<>") - # Gather journalctl logs - hosts = list(set(self.hostlist_servers)) - since = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.start_time)) - until = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.end_time)) - for journalctl_type in ["kernel", "daos_server"]: - get_journalctl(self, hosts, since, until, journalctl_type, logging=True) if self.all_failed_harassers: errors.extend(self.all_failed_harassers) From 4637c0058cceb07a6918c3c426d2ce45ab0750c8 Mon Sep 17 00:00:00 2001 From: Joseph Moore <26410038+jgmoore-or@users.noreply.github.com> Date: Thu, 31 Oct 2024 10:11:09 -0600 Subject: [PATCH 27/34] DAOS-16685 dfuse: Change eq poll to use NOWAIT. (#15425) Signed-off-by: Joseph Moore --- src/client/dfuse/dfuse_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/client/dfuse/dfuse_core.c b/src/client/dfuse/dfuse_core.c index 4f654fa3209..6397b283e97 100644 --- a/src/client/dfuse/dfuse_core.c +++ b/src/client/dfuse/dfuse_core.c @@ -53,7 +53,7 @@ dfuse_progress_thread(void *arg) return NULL; } - rc = daos_eq_poll(eqt->de_eq, 1, DAOS_EQ_WAIT, 128, &dev[0]); + rc = daos_eq_poll(eqt->de_eq, 1, DAOS_EQ_NOWAIT, 128, &dev[0]); if (rc >= 1) { for (i = 0; i < rc; i++) { struct dfuse_event *ev; From c10428a132d072c9187fab6dd02bdb0208e6b570 Mon Sep 17 00:00:00 2001 From: Johann Lombardi Date: Tue, 5 Nov 2024 05:40:16 +0100 Subject: [PATCH 28/34] DAOS-16508 csum: retry a few times on checksum mismatch on update (#15069) (#15111) Unlike fetch, we return DER_CSUM on update (turned into EIO by dfs) without any retry. We should retry a few times in case it is a transient error. The patch also prints more information about the actual checksum mismatch. Signed-off-by: Johann Lombardi Co-authored-by: Dalton Bohning --- src/common/checksum.c | 4 ++++ src/object/cli_csum.h | 3 +++ src/object/cli_obj.c | 38 ++++++++++++++++++++++++++++++-------- src/object/obj_internal.h | 3 ++- 4 files changed, 39 insertions(+), 9 deletions(-) diff --git a/src/common/checksum.c b/src/common/checksum.c index 69cf89ab07a..c36f14e3c6d 100644 --- a/src/common/checksum.c +++ b/src/common/checksum.c @@ -278,6 +278,10 @@ daos_csummer_compare_csum_info(struct daos_csummer *obj, match = daos_csummer_csum_compare(obj, ci_idx2csum(a, i), ci_idx2csum(b, i), a->cs_len); + if (unlikely(!match)) + D_ERROR("Checksum mismatch at index %d/%d "DF_CI_BUF" != "DF_CI_BUF"\n", i, + a->cs_nr, DP_CI_BUF(ci_idx2csum(a, i), a->cs_len), + DP_CI_BUF(ci_idx2csum(b, i), b->cs_len)); } return match; diff --git a/src/object/cli_csum.h b/src/object/cli_csum.h index 3b1431b8c1f..566c707054d 100644 --- a/src/object/cli_csum.h +++ b/src/object/cli_csum.h @@ -11,6 +11,9 @@ #include #include "obj_internal.h" +/** How many times to retry UPDATE RPCs on checksum error */ +#define MAX_CSUM_RETRY 10 + int dc_obj_csum_update(struct daos_csummer *csummer, struct cont_props props, daos_obj_id_t param, daos_key_t *dkey, daos_iod_t *iods, d_sg_list_t *sgls, const uint32_t iod_nr, struct dcs_layout *layout, struct dcs_csum_info **dkey_csum, diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index 2c0c0a952ea..318ba0cb51e 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -4684,12 +4684,15 @@ obj_comp_cb(tse_task_t *task, void *data) int rc; obj_auxi = tse_task_stack_pop(task, sizeof(*obj_auxi)); - obj_auxi->io_retry = 0; - obj_auxi->result = 0; - obj_auxi->csum_retry = 0; - obj_auxi->tx_uncertain = 0; - obj_auxi->nvme_io_err = 0; obj = obj_auxi->obj; + + /** Clear various bits for a new attempt */ + obj_auxi->io_retry = 0; + obj_auxi->result = 0; + obj_auxi->csum_retry = 0; + obj_auxi->tx_uncertain = 0; + obj_auxi->nvme_io_err = 0; + rc = obj_comp_cb_internal(obj_auxi); if (rc != 0 || obj_auxi->result) { if (task->dt_result == 0) @@ -4760,9 +4763,28 @@ obj_comp_cb(tse_task_t *task, void *data) obj_auxi->tx_uncertain = 1; else obj_auxi->nvme_io_err = 1; - } else if (task->dt_result != -DER_NVME_IO) { - /* Don't retry update for CSUM & UNCERTAIN errors */ - obj_auxi->io_retry = 0; + } else { + if (obj_auxi->opc == DAOS_OBJ_RPC_UPDATE && + task->dt_result == -DER_CSUM) { + struct shard_rw_args *rw_arg = &obj_auxi->rw_args; + + /** Retry a few times on checksum error on update */ + if (rw_arg->csum_retry_cnt < MAX_CSUM_RETRY) { + obj_auxi->csum_retry = 1; + rw_arg->csum_retry_cnt++; + D_DEBUG(DB_IO, DF_OID" checksum error on " + "update, retrying\n", + DP_OID(obj->cob_md.omd_id)); + } else { + D_ERROR(DF_OID" checksum error on update, " + "too many retries. Failing I/O\n", + DP_OID(obj->cob_md.omd_id)); + obj_auxi->io_retry = 0; + } + } else if (task->dt_result != -DER_NVME_IO) { + /* Don't retry update for UNCERTAIN errors */ + obj_auxi->io_retry = 0; + } } } else { obj_auxi->io_retry = 0; diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h index cfe3385e959..06cfdb5b195 100644 --- a/src/object/obj_internal.h +++ b/src/object/obj_internal.h @@ -281,6 +281,7 @@ struct shard_rw_args { struct dcs_csum_info *dkey_csum; struct dcs_iod_csums *iod_csums; struct obj_reasb_req *reasb_req; + uint16_t csum_retry_cnt; }; struct coll_sparse_targets { @@ -480,8 +481,8 @@ struct obj_auxi_args { rebuilding:1, for_migrate:1; /* request flags. currently only: ORF_RESEND */ - uint32_t flags; uint32_t specified_shard; + uint32_t flags; uint16_t retry_cnt; uint16_t inprogress_cnt; struct obj_req_tgts req_tgts; From d17d3d6f14508cad773a4e72eefacd3bf4fdc3a3 Mon Sep 17 00:00:00 2001 From: Jeff Olivier Date: Tue, 5 Nov 2024 01:55:46 -0700 Subject: [PATCH 29/34] DAOS-16211 vos: Avoid race condition with discard (#15370) (#15432) * DAOS-16211 vos: Avoid race condition with discard (#15370) There is a possible race between aggregation deleting the object tree and discard working on the same object tree. Add a check to avoid this race Signed-off-by: Jeff Olivier --- src/vos/vos_obj_index.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/src/vos/vos_obj_index.c b/src/vos/vos_obj_index.c index ea47cf4454c..58699453f40 100644 --- a/src/vos/vos_obj_index.c +++ b/src/vos/vos_obj_index.c @@ -791,13 +791,14 @@ oi_iter_process(struct vos_iterator *iter, vos_iter_proc_op_t op, void *args) int oi_iter_check_punch(daos_handle_t ih) { - struct vos_iterator *iter = vos_hdl2iter(ih); - struct vos_oi_iter *oiter = iter2oiter(iter); - struct vos_obj_df *obj; - struct oi_delete_arg del_arg; - daos_unit_oid_t oid; - d_iov_t rec_iov; - int rc; + struct vos_iterator *iter = vos_hdl2iter(ih); + struct vos_oi_iter *oiter = iter2oiter(iter); + struct vos_container *cont = oiter->oit_cont; + struct vos_obj_df *obj; + struct oi_delete_arg del_arg; + daos_unit_oid_t oid; + d_iov_t rec_iov; + int rc; D_ASSERT(iter->it_type == VOS_ITER_OBJ); @@ -811,10 +812,22 @@ oi_iter_check_punch(daos_handle_t ih) obj = (struct vos_obj_df *)rec_iov.iov_buf; oid = obj->vo_id; - if (!vos_ilog_is_punched(vos_cont2hdl(oiter->oit_cont), &obj->vo_ilog, &oiter->oit_epr, - NULL, &oiter->oit_ilog_info)) + if (!vos_ilog_is_punched(vos_cont2hdl(cont), &obj->vo_ilog, &oiter->oit_epr, NULL, + &oiter->oit_ilog_info)) return 0; + rc = vos_obj_hold(vos_obj_cache_current(cont->vc_pool->vp_sysdb), cont, oid, + &oiter->oit_epr, iter->it_bound, VOS_OBJ_AGGREGATE | VOS_OBJ_NO_HOLD, + DAOS_INTENT_PURGE, NULL, NULL); + if (rc != 0) { + /** -DER_BUSY means the object is in-use already. We will after a yield in this + * case. + */ + D_CDEBUG(rc == -DER_BUSY, DB_EPC, DLOG_ERR, "Hold check failed for " DF_UOID "\n", + DP_UOID(oid)); + return rc; + } + /** Ok, ilog is fully punched, so we can move it to gc heap */ rc = umem_tx_begin(vos_cont2umm(oiter->oit_cont), NULL); if (rc != 0) From 9ab3200f94dcb980cc812d4c234b1003982bb4b0 Mon Sep 17 00:00:00 2001 From: Jerome Soumagne Date: Tue, 5 Nov 2024 07:37:34 -0600 Subject: [PATCH 30/34] DAOS-15162 build: update to libfabric 1.22.0 (#15441) Signed-off-by: Jerome Soumagne --- utils/build.config | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/utils/build.config b/utils/build.config index 52749b3f1ce..b7f86add782 100644 --- a/utils/build.config +++ b/utils/build.config @@ -8,7 +8,7 @@ pmdk=2.1.0 isal=v2.30.0 isal_crypto=v2.23.0 spdk=v22.01.2 -ofi=v1.19.1 +ofi=v1.22.0 mercury=v2.4.0rc5 protobufc=v1.3.3 ucx=v1.14.1 @@ -27,7 +27,6 @@ ucx=https://github.com/openucx/ucx.git [patch_versions] spdk=https://github.com/spdk/spdk/commit/b0aba3fcd5aceceea530a702922153bc75664978.diff,https://github.com/spdk/spdk/commit/445a4c808badbad3942696ecf16fa60e8129a747.diff -ofi=https://github.com/ofiwg/libfabric/commit/d827c6484cc5bf67dfbe395890e258860c3f0979.diff fuse=https://github.com/libfuse/libfuse/commit/c9905341ea34ff9acbc11b3c53ba8bcea35eeed8.diff mercury=https://raw.githubusercontent.com/daos-stack/mercury/f3dc286fb40ec1a3a38a2e17c45497bc2aa6290d/na_ucx.patch pmdk=https://github.com/pmem/pmdk/commit/2abe15ac0b4eed894b6768cd82a3b0a7c4336284.diff From f7d12a48846a6121489ca8026441c88f33971806 Mon Sep 17 00:00:00 2001 From: Nasf-Fan Date: Wed, 6 Nov 2024 06:49:31 +0800 Subject: [PATCH 31/34] DAOS-16721 dtx: handle potential DTX ID reusing trouble - b26 (#15409) The patch contains the following improvements: 1. When VOS level logic returns -DER_TX_RESATRT, the object level RPC handler should set 'RESEND' flag then restart the transaction with newer epoch. Because dtx_abort() logic cannot guarantee all former prepared DTX entries (on all related participants) can be aborted, especially if the former one failed for some network trouble, that may cause restarted transaction hit -DER_TX_ID_REUSED unexpectedly. 2. Compare the epoch for DTX entries with the same transaction ID for distinguishing potential reused TX ID more accurately. 3. Add DTX entry into DTX CoS cache if cannot commit it synchronously. Then subsequent batched commit logic can handle it. 4. If server complains suspected TX ID reusing, then reports -EIO to related application instead of assertion on client. 5. Control DTX related warning message frequency to avoid log flood. 6. Collect more information when generate some error/warning message. Signed-off-by: Fan Yong --- src/dtx/dtx_coll.c | 8 ++- src/dtx/dtx_common.c | 83 ++++++++++++---------- src/dtx/dtx_rpc.c | 15 ++-- src/dtx/dtx_srv.c | 5 +- src/include/daos_srv/container.h | 3 + src/object/cli_coll.c | 2 +- src/object/cli_obj.c | 23 ++++-- src/object/cli_shard.c | 13 ++-- src/object/obj_internal.h | 2 +- src/object/obj_utils.c | 17 +++-- src/object/srv_ec_aggregate.c | 33 ++++++++- src/object/srv_obj.c | 116 ++++++++++++++++++------------- src/object/srv_obj_remote.c | 12 ++-- src/vos/vos_dtx.c | 63 ++++++++++++----- src/vos/vos_internal.h | 2 + 15 files changed, 254 insertions(+), 143 deletions(-) diff --git a/src/dtx/dtx_coll.c b/src/dtx/dtx_coll.c index 7e7c8199155..ba45aaa2616 100644 --- a/src/dtx/dtx_coll.c +++ b/src/dtx/dtx_coll.c @@ -81,9 +81,11 @@ dtx_coll_prep_ult(void *arg) } if (dcpa->dcpa_result != 0) { - if (dcpa->dcpa_result != -DER_INPROGRESS && dcpa->dcpa_result != -DER_NONEXIST) - D_ERROR("Failed to load mbs for "DF_DTI", opc %u: "DF_RC"\n", - DP_DTI(&dci->dci_xid), opc, DP_RC(rc)); + if (dcpa->dcpa_result < 0 && + dcpa->dcpa_result != -DER_INPROGRESS && dcpa->dcpa_result != -DER_NONEXIST) + D_ERROR("Failed to load mbs for "DF_DTI" in "DF_UUID"/"DF_UUID", opc %u: " + DF_RC"\n", DP_DTI(&dci->dci_xid), DP_UUID(dci->dci_po_uuid), + DP_UUID(dci->dci_co_uuid), opc, DP_RC(dcpa->dcpa_result)); goto out; } diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c index ecb156729ed..1ee74ae11a4 100644 --- a/src/dtx/dtx_common.c +++ b/src/dtx/dtx_common.c @@ -1271,7 +1271,6 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul int status = -1; int rc = 0; bool aborted = false; - bool unpin = false; D_ASSERT(cont != NULL); @@ -1339,7 +1338,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul * it persistently. Otherwise, the subsequent DTX resync may not find it as * to regard it as failed transaction and abort it. */ - if (result == 0 && !dth->dth_active && !dth->dth_prepared && !dth->dth_solo && + if (!dth->dth_active && !dth->dth_prepared && (dth->dth_dist || dth->dth_modification_cnt > 0)) { result = vos_dtx_attach(dth, true, dth->dth_ent != NULL ? true : false); if (unlikely(result < 0)) { @@ -1349,7 +1348,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul } } - if (dth->dth_prepared || dtx_batched_ult_max == 0) { + if ((dth->dth_prepared && !dlh->dlh_coll) || dtx_batched_ult_max == 0) { dth->dth_sync = 1; goto sync; } @@ -1363,14 +1362,12 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul if (DAOS_FAIL_CHECK(DAOS_DTX_MISS_COMMIT)) dth->dth_sync = 1; - /* For synchronous DTX, do not add it into CoS cache, otherwise, - * we may have no way to remove it from the cache. - */ if (dth->dth_sync) goto sync; D_ASSERT(dth->dth_mbs != NULL); +cache: if (dlh->dlh_coll) { rc = dtx_cos_add(cont, dlh->dlh_coll_entry, &dth->dth_leader_oid, dth->dth_dkey_hash, dth->dth_epoch, DCF_EXP_CMT | DCF_COLL); @@ -1378,38 +1375,47 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul size = sizeof(*dte) + sizeof(*mbs) + dth->dth_mbs->dm_data_size; D_ALLOC(dte, size); if (dte == NULL) { - dth->dth_sync = 1; - goto sync; - } - - mbs = (struct dtx_memberships *)(dte + 1); - memcpy(mbs, dth->dth_mbs, size - sizeof(*dte)); - - dte->dte_xid = dth->dth_xid; - dte->dte_ver = dth->dth_ver; - dte->dte_refs = 1; - dte->dte_mbs = mbs; + rc = -DER_NOMEM; + } else { + mbs = (struct dtx_memberships *)(dte + 1); + memcpy(mbs, dth->dth_mbs, size - sizeof(*dte)); + + dte->dte_xid = dth->dth_xid; + dte->dte_ver = dth->dth_ver; + dte->dte_refs = 1; + dte->dte_mbs = mbs; + + if (!(mbs->dm_flags & DMF_SRDG_REP)) + flags = DCF_EXP_CMT; + else if (dth->dth_modify_shared) + flags = DCF_SHARED; + else + flags = 0; - if (!(mbs->dm_flags & DMF_SRDG_REP)) - flags = DCF_EXP_CMT; - else if (dth->dth_modify_shared) - flags = DCF_SHARED; - else - flags = 0; + rc = dtx_cos_add(cont, dte, &dth->dth_leader_oid, dth->dth_dkey_hash, + dth->dth_epoch, flags); + dtx_entry_put(dte); + } + } - rc = dtx_cos_add(cont, dte, &dth->dth_leader_oid, dth->dth_dkey_hash, - dth->dth_epoch, flags); - dtx_entry_put(dte); + /* + * NOTE: If we failed to add the committable DTX into CoS cache, then we also have no way + * to commit (or abort) the DTX because of out of memory. Such DTX will be finally + * committed via next DTX resync (after recovered from OOM). + * + * Here, we only warning to notify the trouble, but not failed the transaction. + */ + if (rc != 0) { + D_WARN(DF_UUID": Fail to cache %s DTX "DF_DTI": "DF_RC"\n", + DP_UUID(cont->sc_uuid), dlh->dlh_coll ? "collective" : "regular", + DP_DTI(&dth->dth_xid), DP_RC(rc)); + D_GOTO(out, result = 0); } - if (rc == 0) { - if (!DAOS_FAIL_CHECK(DAOS_DTX_NO_COMMITTABLE)) { - vos_dtx_mark_committable(dth); - if (cont->sc_dtx_committable_count > DTX_THRESHOLD_COUNT || dlh->dlh_coll) - sched_req_wakeup(dss_get_module_info()->dmi_dtx_cmt_req); - } - } else { - dth->dth_sync = 1; + if (!DAOS_FAIL_CHECK(DAOS_DTX_NO_COMMITTABLE)) { + vos_dtx_mark_committable(dth); + if (cont->sc_dtx_committable_count > DTX_THRESHOLD_COUNT || dlh->dlh_coll) + sched_req_wakeup(dss_get_module_info()->dmi_dtx_cmt_req); } sync: @@ -1428,10 +1434,15 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul rc = dtx_commit(cont, &dte, NULL, 1, false); } - if (rc != 0) + if (rc != 0) { D_WARN(DF_UUID": Fail to sync %s commit DTX "DF_DTI": "DF_RC"\n", DP_UUID(cont->sc_uuid), dlh->dlh_coll ? "collective" : "regular", DP_DTI(&dth->dth_xid), DP_RC(rc)); + if (likely(dtx_batched_ult_max != 0)) { + dth->dth_sync = 0; + goto cache; + } + } /* * NOTE: The semantics of 'sync' commit does not guarantee that all @@ -1451,7 +1462,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul * to locally retry for avoiding related forwarded RPC timeout, instead, * The leader will trigger retry globally without abort 'prepared' ones. */ - if (unpin || (result < 0 && result != -DER_AGAIN && !dth->dth_solo)) { + if (result < 0 && result != -DER_AGAIN && !dth->dth_solo) { /* 1. Drop partial modification for distributed transaction. * 2. Remove the pinned DTX entry. */ diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c index 2ccbfec2734..6d34e871269 100644 --- a/src/dtx/dtx_rpc.c +++ b/src/dtx/dtx_rpc.c @@ -1657,8 +1657,9 @@ dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct d } D_CDEBUG(rc != 0 || rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE, - "Collectively commit DTX "DF_DTI": %d/%d/%d\n", - DP_DTI(&dce->dce_xid), rc, rc1, rc2); + "Collectively commit DTX "DF_DTI" in "DF_UUID"/"DF_UUID": %d/%d/%d\n", + DP_DTI(&dce->dce_xid), DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid), + rc, rc1, rc2); return rc != 0 ? rc : rc1 != 0 ? rc1 : rc2; } @@ -1717,8 +1718,9 @@ dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc rc2 = 0; D_CDEBUG(rc != 0 || rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE, - "Collectively abort DTX "DF_DTI": %d/%d/%d\n", - DP_DTI(&dce->dce_xid), rc, rc1, rc2); + "Collectively abort DTX "DF_DTI" with epoch "DF_X64" in " + DF_UUID"/"DF_UUID": %d/%d/%d\n", DP_DTI(&dce->dce_xid), epoch, + DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid), rc, rc1, rc2); return rc != 0 ? rc : rc1 != 0 ? rc1 : rc2; } @@ -1766,8 +1768,9 @@ dtx_coll_check(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc } D_CDEBUG((rc < 0 && rc != -DER_NONEXIST) || (rc1 < 0 && rc1 != -DER_NONEXIST), DLOG_ERR, - DB_TRACE, "Collectively check DTX "DF_DTI": %d/%d/\n", - DP_DTI(&dce->dce_xid), rc, rc1); + DB_TRACE, "Collectively check DTX "DF_DTI" in "DF_UUID"/"DF_UUID": %d/%d/\n", + DP_DTI(&dce->dce_xid), DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid), + rc, rc1); return dce->dce_ranks != NULL ? rc : rc1; } diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c index 0fd4e7c513f..084f804ec81 100644 --- a/src/dtx/dtx_srv.c +++ b/src/dtx/dtx_srv.c @@ -474,8 +474,9 @@ dtx_coll_handler(crt_rpc_t *rpc) out: D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE, - "Handled collective DTX PRC %u on rank %u for "DF_DTI": "DF_RC"\n", - opc, myrank, DP_DTI(&dci->dci_xid), DP_RC(rc)); + "Handled collective DTX PRC %u on rank %u for "DF_DTI" in " + DF_UUID"/"DF_UUID": "DF_RC"\n", opc, myrank, DP_DTI(&dci->dci_xid), + DP_UUID(dci->dci_po_uuid), DP_UUID(dci->dci_co_uuid), DP_RC(rc)); dco->dco_status = rc; rc = crt_reply_send(rpc); diff --git a/src/include/daos_srv/container.h b/src/include/daos_srv/container.h index f99f4df14e3..9fc615c2a8b 100644 --- a/src/include/daos_srv/container.h +++ b/src/include/daos_srv/container.h @@ -108,6 +108,9 @@ struct ds_cont_child { uint32_t sc_dtx_committable_count; uint32_t sc_dtx_committable_coll_count; + /* Last timestamp when EC aggregation reports -DER_INPROGRESS. */ + uint64_t sc_ec_agg_busy_ts; + /* The global minimum EC aggregation epoch, which will be upper * limit for VOS aggregation, i.e. EC object VOS aggregation can * not cross this limit. For simplification purpose, all objects diff --git a/src/object/cli_coll.c b/src/object/cli_coll.c index 3285bec58b3..d517e3269d6 100644 --- a/src/object/cli_coll.c +++ b/src/object/cli_coll.c @@ -873,7 +873,7 @@ queue_coll_query_task(tse_task_t *api_task, struct obj_auxi_args *obj_auxi, stru 0, 0, ocdc); for (i = 0; i < ocdc->grp_nr; i++) { - obj_coll_disp_dest(ocdc, coa->coa_dcts, &tgt_ep); + obj_coll_disp_dest(ocdc, coa->coa_dcts, &tgt_ep, obj->cob_md.omd_id); tmp = coa->coa_dcts[ocdc->cur_pos].dct_shards[tgt_ep.ep_tag].dcs_idx; rc = queue_shard_query_key_task(api_task, obj_auxi, epoch, tmp, map_ver, diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index 318ba0cb51e..46161bf7a2d 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -1718,15 +1718,20 @@ dc_obj_retry_delay(tse_task_t *task, int err, uint16_t *retry_cnt, uint16_t *inp uint32_t timeout_sec) { uint32_t delay = 0; + uint32_t limit = 4; /* - * Randomly delay 5 - 68 us if it is not the first retry for + * Randomly delay 5 ~ 1028 us if it is not the first retry for * -DER_INPROGRESS || -DER_UPDATE_AGAIN cases. */ ++(*retry_cnt); if (err == -DER_INPROGRESS || err == -DER_UPDATE_AGAIN) { if (++(*inprogress_cnt) > 1) { - delay = (d_rand() & ((1 << 6) - 1)) + 5; + limit += *inprogress_cnt; + if (limit > 10) + limit = 10; + + delay = (d_rand() & ((1 << limit) - 1)) + 5; /* Rebuild is being established on the server side, wait a bit longer */ if (err == -DER_UPDATE_AGAIN) delay <<= 10; @@ -4856,11 +4861,14 @@ obj_comp_cb(tse_task_t *task, void *data) D_ASSERT(daos_handle_is_inval(obj_auxi->th)); D_ASSERT(obj_is_modification_opc(obj_auxi->opc)); - if (task->dt_result == -DER_TX_ID_REUSED && obj_auxi->retry_cnt != 0) - /* XXX: it is must because miss to set "RESEND" flag, that is bug. */ - D_ASSERTF(0, - "Miss 'RESEND' flag (%x) when resend the RPC for task %p: %u\n", - obj_auxi->flags, task, obj_auxi->retry_cnt); + if (task->dt_result == -DER_TX_ID_REUSED && obj_auxi->retry_cnt != 0) { + D_ERROR("TX ID maybe reused for unknown reason, " + "task %p, opc %u, flags %x, retry_cnt %u\n", + task, obj_auxi->opc, obj_auxi->flags, obj_auxi->retry_cnt); + task->dt_result = -DER_IO; + obj_auxi->io_retry = 0; + goto args_fini; + } if (obj_auxi->opc == DAOS_OBJ_RPC_UPDATE) { daos_obj_rw_t *api_args = dc_task_get_args(obj_auxi->obj_task); @@ -4886,6 +4894,7 @@ obj_comp_cb(tse_task_t *task, void *data) } } +args_fini: if (obj_auxi->opc == DAOS_OBJ_RPC_COLL_PUNCH) obj_coll_oper_args_fini(&obj_auxi->p_args.pa_coa); diff --git a/src/object/cli_shard.c b/src/object/cli_shard.c index 0c9dfc1418e..9f084140f80 100644 --- a/src/object/cli_shard.c +++ b/src/object/cli_shard.c @@ -1451,11 +1451,14 @@ obj_shard_coll_punch_cb(tse_task_t *task, void *data) shard_args->pa_auxi.obj_auxi->max_delay = timeout; } - DL_CDEBUG(task->dt_result < 0, DLOG_ERR, DB_IO, task->dt_result, - "DAOS_OBJ_RPC_COLL_PUNCH RPC %p for "DF_UOID" with DTX " - DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x, %s layout", - rpc, DP_UOID(ocpi->ocpi_oid), DP_DTI(&ocpi->ocpi_xid), task, ocpi->ocpi_map_ver, - *cb_args->cpca_ver, (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags, + DL_CDEBUG(task->dt_result < 0 && task->dt_result != -DER_INPROGRESS, + DLOG_ERR, DB_IO, task->dt_result, + "DAOS_OBJ_RPC_COLL_PUNCH RPC %p for "DF_UOID" in "DF_UUID"/"DF_UUID"/" + DF_UUID" with DTX "DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x, %s layout", + rpc, DP_UOID(ocpi->ocpi_oid), DP_UUID(ocpi->ocpi_po_uuid), + DP_UUID(ocpi->ocpi_co_hdl), DP_UUID(ocpi->ocpi_co_uuid), DP_DTI(&ocpi->ocpi_xid), + task, ocpi->ocpi_map_ver, *cb_args->cpca_ver, + (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags, cb_args->cpca_shard_args->pa_coa.coa_raw_sparse ? "sparse" : "continuous"); crt_req_decref(rpc); diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h index 06cfdb5b195..c0df21dd009 100644 --- a/src/object/obj_internal.h +++ b/src/object/obj_internal.h @@ -1100,7 +1100,7 @@ int daos_obj_query_merge(struct obj_query_merge_args *oqma); void obj_coll_disp_init(uint32_t tgt_nr, uint32_t max_tgt_size, uint32_t inline_size, uint32_t start, uint32_t max_width, struct obj_coll_disp_cursor *ocdc); void obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *tgts, - crt_endpoint_t *tgt_ep); + crt_endpoint_t *tgt_ep, daos_obj_id_t oid); void obj_coll_disp_move(struct obj_coll_disp_cursor *ocdc); int obj_utils_init(void); void obj_utils_fini(void); diff --git a/src/object/obj_utils.c b/src/object/obj_utils.c index 82d91c966ac..c01947a05a1 100644 --- a/src/object/obj_utils.c +++ b/src/object/obj_utils.c @@ -616,23 +616,22 @@ obj_coll_disp_init(uint32_t tgt_nr, uint32_t max_tgt_size, uint32_t inline_size, void obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *tgts, - crt_endpoint_t *tgt_ep) + crt_endpoint_t *tgt_ep, daos_obj_id_t oid) { struct daos_coll_target *dct = &tgts[ocdc->cur_pos]; struct daos_coll_target tmp; - unsigned long rand = 0; uint32_t size; int pos; int i; if (ocdc->cur_step > 2) { - rand = d_rand(); /* - * Randomly choose an engine as the relay one for load balance. - * If the one corresponding to "pos" is former moved one, then - * use the "cur_pos" as the relay engine. + * Choose an engine (according to the given oid) as the relay one for load balance. + * If the one corresponding to "pos" is former moved one, then use the "cur_pos" as + * the relay engine. Then even if related RPC was resent without changing pool map, + * then the relay one will be the same as the original case. */ - pos = rand % (ocdc->tgt_nr - ocdc->cur_pos) + ocdc->cur_pos; + pos = oid.lo % (ocdc->tgt_nr - ocdc->cur_pos) + ocdc->cur_pos; if (pos > ocdc->cur_pos && tgts[pos].dct_rank > dct->dct_rank) { memcpy(&tmp, &tgts[pos], sizeof(tmp)); memcpy(&tgts[pos], dct, sizeof(tmp)); @@ -642,8 +641,8 @@ obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *t size = dct->dct_bitmap_sz << 3; - /* Randomly choose a XS as the local leader on target engine for load balance. */ - for (i = 0, pos = (rand != 0 ? rand : d_rand()) % dct->dct_tgt_nr; i < size; i++) { + /* Choose a target as the local agent on the engine for load balance. */ + for (i = 0, pos = oid.lo % dct->dct_tgt_nr; i < size; i++) { if (isset(dct->dct_bitmap, i)) { pos -= dct->dct_shards[i].dcs_nr; if (pos < 0) diff --git a/src/object/srv_ec_aggregate.c b/src/object/srv_ec_aggregate.c index 71c630fa947..50b513d1612 100644 --- a/src/object/srv_ec_aggregate.c +++ b/src/object/srv_ec_aggregate.c @@ -2667,8 +2667,13 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr, struct ec_agg_param *ec_agg_param = agg_param->ap_data; vos_iter_param_t iter_param = { 0 }; struct vos_iter_anchors anchors = { 0 }; + struct dtx_handle *dth = NULL; + struct dtx_share_peer *dsp; + struct dtx_id dti = { 0 }; + struct dtx_epoch epoch = { 0 }; + daos_unit_oid_t oid = { 0 }; + int blocks = 0; int rc = 0; - int blocks = 0; /* * Avoid calling into vos_aggregate() when aborting aggregation @@ -2715,8 +2720,32 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr, agg_reset_entry(&ec_agg_param->ap_agg_entry, NULL, NULL); retry: + epoch.oe_value = epr->epr_hi; + rc = dtx_begin(cont->sc_hdl, &dti, &epoch, 0, cont->sc_pool->spc_map_version, &oid, + NULL, 0, 0, NULL, &dth); + if (rc != 0) + goto update_hae; + rc = vos_iterate(&iter_param, VOS_ITER_OBJ, true, &anchors, agg_iterate_pre_cb, - agg_iterate_post_cb, ec_agg_param, NULL); + agg_iterate_post_cb, ec_agg_param, dth); + if (rc == -DER_INPROGRESS && !d_list_empty(&dth->dth_share_tbd_list)) { + uint64_t now = daos_gettime_coarse(); + + /* Report warning per each 10 seconds to avoid log flood. */ + if (now - cont->sc_ec_agg_busy_ts > 10) { + while ((dsp = d_list_pop_entry(&dth->dth_share_tbd_list, + struct dtx_share_peer, dsp_link)) != NULL) { + D_WARN(DF_CONT ": EC aggregate hit non-committed DTX " DF_DTI "\n", + DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), + DP_DTI(&dsp->dsp_xid)); + dtx_dsp_free(dsp); + } + + cont->sc_ec_agg_busy_ts = now; + } + } + + dtx_end(dth, cont, rc); /* Post_cb may not being executed in some cases */ agg_clear_extents(&ec_agg_param->ap_agg_entry); diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 041ea903c4f..4762e04b898 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2953,8 +2953,11 @@ ds_obj_rw_handler(crt_rpc_t *rpc) d_tm_inc_counter(opm->opm_update_resent, 1); -again1: - e = 0; +again: + if (flags & ORF_RESEND) + e = orw->orw_epoch; + else + e = 0; rc = dtx_handle_resend(ioc.ioc_vos_coh, &orw->orw_dti, &e, &version); switch (rc) { @@ -2965,8 +2968,13 @@ ds_obj_rw_handler(crt_rpc_t *rpc) orw->orw_epoch = e; /* TODO: Also recover the epoch uncertainty. */ break; + case -DER_MISMATCH: + rc = vos_dtx_abort(ioc.ioc_vos_coh, &orw->orw_dti, e); + if (rc < 0 && rc != -DER_NONEXIST) + D_GOTO(out, rc); + /* Fall through */ case -DER_NONEXIST: - rc = 0; + flags = 0; break; default: D_GOTO(out, rc); @@ -2976,7 +2984,6 @@ ds_obj_rw_handler(crt_rpc_t *rpc) D_GOTO(out, rc); } -again2: /* For leader case, we need to find out the potential conflict * (or share the same non-committed object/dkey) DTX(s) in the * CoS (committable) cache, piggyback them via the dispdatched @@ -3021,7 +3028,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc) exec_arg.rpc = rpc; exec_arg.ioc = &ioc; - exec_arg.flags = flags; + exec_arg.flags |= flags; exec_arg.start = orw->orw_start_shard; /* Execute the operation on all targets */ @@ -3036,28 +3043,25 @@ ds_obj_rw_handler(crt_rpc_t *rpc) case -DER_TX_RESTART: /* * If this is a standalone operation, we can restart the - * internal transaction right here. Otherwise, we have to defer - * the restart to the RPC client. + * internal transaction right here. Otherwise we have to + * defer the restart to the RPC sponsor. */ - if (opc == DAOS_OBJ_RPC_UPDATE) { - /* - * Only standalone updates use this RPC. Retry with - * newer epoch. - */ - orw->orw_epoch = d_hlc_get(); - orw->orw_flags &= ~ORF_RESEND; - flags = 0; - d_tm_inc_counter(opm->opm_update_restart, 1); - goto again2; - } + if (opc != DAOS_OBJ_RPC_UPDATE) + break; - break; + /* Only standalone updates use this RPC. Retry with newer epoch. */ + orw->orw_epoch = d_hlc_get(); + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; + d_tm_inc_counter(opm->opm_update_restart, 1); + goto again; case -DER_AGAIN: - orw->orw_flags |= ORF_RESEND; need_abort = true; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; d_tm_inc_counter(opm->opm_update_retry, 1); ABT_thread_yield(); - goto again1; + goto again; default: break; } @@ -3875,8 +3879,11 @@ ds_obj_punch_handler(crt_rpc_t *rpc) if (opi->opi_flags & ORF_RESEND) { daos_epoch_t e; -again1: - e = 0; +again: + if (flags & ORF_RESEND) + e = opi->opi_epoch; + else + e = 0; rc = dtx_handle_resend(ioc.ioc_vos_coh, &opi->opi_dti, &e, &version); switch (rc) { @@ -3887,8 +3894,13 @@ ds_obj_punch_handler(crt_rpc_t *rpc) flags |= ORF_RESEND; /* TODO: Also recovery the epoch uncertainty. */ break; + case -DER_MISMATCH: + rc = vos_dtx_abort(ioc.ioc_vos_coh, &opi->opi_dti, e); + if (rc < 0 && rc != -DER_NONEXIST) + D_GOTO(out, rc); + /* Fall through */ case -DER_NONEXIST: - rc = 0; + flags = 0; break; default: D_GOTO(out, rc); @@ -3898,7 +3910,6 @@ ds_obj_punch_handler(crt_rpc_t *rpc) goto cleanup; } -again2: /* For leader case, we need to find out the potential conflict * (or share the same non-committed object/dkey) DTX(s) in the * CoS (committable) cache, piggyback them via the dispdatched @@ -3943,7 +3954,7 @@ ds_obj_punch_handler(crt_rpc_t *rpc) exec_arg.rpc = rpc; exec_arg.ioc = &ioc; - exec_arg.flags = flags; + exec_arg.flags |= flags; /* Execute the operation on all shards */ if (opi->opi_api_flags & DAOS_COND_PUNCH) @@ -3959,19 +3970,17 @@ ds_obj_punch_handler(crt_rpc_t *rpc) rc = dtx_leader_end(dlh, ioc.ioc_coh, rc); switch (rc) { case -DER_TX_RESTART: - /* - * Only standalone punches use this RPC. Retry with newer - * epoch. - */ + /* Only standalone punches use this RPC. Retry with newer epoch. */ opi->opi_epoch = d_hlc_get(); - opi->opi_flags &= ~ORF_RESEND; - flags = 0; - goto again2; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; + goto again; case -DER_AGAIN: - opi->opi_flags |= ORF_RESEND; need_abort = true; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; ABT_thread_yield(); - goto again1; + goto again; default: break; } @@ -5663,8 +5672,11 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) if (ocpi->ocpi_flags & ORF_RESEND) { -again1: - tmp = 0; +again: + if (!(ocpi->ocpi_flags & ORF_LEADER) || (flags & ORF_RESEND)) + tmp = ocpi->ocpi_epoch; + else + tmp = 0; rc = dtx_handle_resend(ioc.ioc_vos_coh, &ocpi->ocpi_xid, &tmp, &version); switch (rc) { case -DER_ALREADY: @@ -5674,7 +5686,13 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) flags |= ORF_RESEND; /* TODO: Also recovery the epoch uncertainty. */ break; + case -DER_MISMATCH: + rc = vos_dtx_abort(ioc.ioc_vos_coh, &ocpi->ocpi_xid, tmp); + if (rc < 0 && rc != -DER_NONEXIST) + D_GOTO(out, rc); + /* Fall through */ case -DER_NONEXIST: + flags = 0; break; default: D_GOTO(out, rc); @@ -5683,7 +5701,6 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) dce->dce_ver = version; } -again2: epoch.oe_value = ocpi->ocpi_epoch; epoch.oe_first = epoch.oe_value; epoch.oe_flags = orf_to_dtx_epoch_flags(ocpi->ocpi_flags); @@ -5695,7 +5712,7 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) exec_arg.rpc = rpc; exec_arg.ioc = &ioc; - exec_arg.flags = flags; + exec_arg.flags |= flags; exec_arg.coll_shards = dcts[0].dct_shards; exec_arg.coll_tgts = dcts; obj_coll_disp_init(dct_nr, ocpi->ocpi_max_tgt_sz, @@ -5728,14 +5745,15 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) switch (rc) { case -DER_TX_RESTART: ocpi->ocpi_epoch = d_hlc_get(); - ocpi->ocpi_flags &= ~ORF_RESEND; - flags = 0; - goto again2; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; + goto again; case -DER_AGAIN: - ocpi->ocpi_flags |= ORF_RESEND; need_abort = true; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; ABT_thread_yield(); - goto again1; + goto again; default: break; } @@ -5755,12 +5773,14 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) max_ver = version; DL_CDEBUG(rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART, DLOG_ERR, DB_IO, rc, - "(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u epc " - DF_X64" pmv %u/%u, with dti "DF_DTI", bulk_tgt_sz %u, bulk_tgt_nr %u, " - "tgt_nr %u, forward width %u, forward depth %u, flags %x", + "(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u in "DF_UUID"/" + DF_UUID"/"DF_UUID" with epc "DF_X64", pmv %u/%u, dti "DF_DTI", bulk_tgt_sz %u, " + "bulk_tgt_nr %u, tgt_nr %u, forward width %u, forward depth %u, flags %x", (ocpi->ocpi_flags & ORF_LEADER) ? "leader" : (ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"), rpc, - DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, ocpi->ocpi_epoch, + DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, + DP_UUID(ocpi->ocpi_po_uuid), DP_UUID(ocpi->ocpi_co_hdl), + DP_UUID(ocpi->ocpi_co_uuid), ocpi->ocpi_epoch, ocpi->ocpi_map_ver, max_ver, DP_DTI(&ocpi->ocpi_xid), ocpi->ocpi_bulk_tgt_sz, ocpi->ocpi_bulk_tgt_nr, (unsigned int)ocpi->ocpi_tgts.ca_count, ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth, ocpi->ocpi_flags); diff --git a/src/object/srv_obj_remote.c b/src/object/srv_obj_remote.c index ce06723621b..f64d851e5b4 100644 --- a/src/object/srv_obj_remote.c +++ b/src/object/srv_obj_remote.c @@ -136,7 +136,7 @@ ds_obj_remote_update(struct dtx_leader_handle *dlh, void *data, int idx, *orw = *orw_parent; orw->orw_oid.id_shard = shard_tgt->st_shard_id; - orw->orw_flags |= ORF_BULK_BIND | obj_exec_arg->flags; + orw->orw_flags |= (ORF_BULK_BIND | obj_exec_arg->flags) & ~ORF_LEADER; if (shard_tgt->st_flags & DTF_DELAY_FORWARD && dlh->dlh_drop_cond) orw->orw_api_flags &= ~DAOS_COND_MASK; orw->orw_dti_cos.ca_count = dth->dth_dti_cos_count; @@ -247,7 +247,7 @@ ds_obj_remote_punch(struct dtx_leader_handle *dlh, void *data, int idx, *opi = *opi_parent; opi->opi_oid.id_shard = shard_tgt->st_shard_id; - opi->opi_flags |= obj_exec_arg->flags; + opi->opi_flags |= obj_exec_arg->flags & ~ORF_LEADER; if (shard_tgt->st_flags & DTF_DELAY_FORWARD && dlh->dlh_drop_cond) opi->opi_api_flags &= ~DAOS_COND_PUNCH; opi->opi_dti_cos.ca_count = dth->dth_dti_cos_count; @@ -495,7 +495,7 @@ ds_obj_coll_punch_remote(struct dtx_leader_handle *dlh, void *data, int idx, crt_endpoint_t tgt_ep = { 0 }; crt_rpc_t *parent_req = exec_arg->rpc; crt_rpc_t *req; - struct obj_coll_punch_in *ocpi_parent; + struct obj_coll_punch_in *ocpi_parent = crt_req_get(parent_req); struct obj_coll_punch_in *ocpi; int tag; int rc = 0; @@ -509,7 +509,7 @@ ds_obj_coll_punch_remote(struct dtx_leader_handle *dlh, void *data, int idx, if (remote_arg == NULL) D_GOTO(out, rc = -DER_NOMEM); - obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep); + obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep, ocpi_parent->ocpi_oid.id_pub); tag = tgt_ep.ep_tag; crt_req_addref(parent_req); @@ -524,9 +524,7 @@ ds_obj_coll_punch_remote(struct dtx_leader_handle *dlh, void *data, int idx, D_GOTO(out, rc); } - ocpi_parent = crt_req_get(parent_req); ocpi = crt_req_get(req); - ocpi->ocpi_odm = ocpi_parent->ocpi_odm; uuid_copy(ocpi->ocpi_po_uuid, ocpi_parent->ocpi_po_uuid); uuid_copy(ocpi->ocpi_co_hdl, ocpi_parent->ocpi_co_hdl); @@ -634,7 +632,7 @@ ds_obj_coll_query_remote(struct dtx_leader_handle *dlh, void *data, int idx, if (remote_arg == NULL) D_GOTO(out, rc = -DER_NOMEM); - obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep); + obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep, ocqi_parent->ocqi_oid.id_pub); tag = tgt_ep.ep_tag; remote_arg->dlh = dlh; diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index 0f87bfd4d0e..47e410fc1e7 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -312,16 +312,38 @@ dtx_act_ent_update(struct btr_instance *tins, struct btr_record *rec, if (unlikely(!dae_old->dae_aborted)) { /* - * XXX: There are two possible reasons for that: - * - * 1. Client resent the RPC but without set 'RESEND' flag. - * 2. Client reused the DTX ID for different modifications. - * - * Currently, the 1st case is more suspected. + * If the new entry and the old entry are for the same transaction, then the RPC + * for the new one will take 'RESEND' flag, that will cause the old one has been + * aborted before arriving at here. So it is quite possible that the new one and + * the old one are for different transactions. */ - D_ERROR("The TX ID "DF_DTI" may be reused for epoch "DF_X64" vs "DF_X64"\n", - DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old), DAE_EPOCH(dae_new)); - return -DER_TX_ID_REUSED; + if (DAE_EPOCH(dae_old) < DAE_EPOCH(dae_new)) { + D_ERROR("The TX ID "DF_DTI" may be reused for epoch "DF_X64" vs "DF_X64"\n", + DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old), DAE_EPOCH(dae_new)); + return -DER_TX_ID_REUSED; + } + + /* + * If the old entry has higher epoch, it is quite possible that the resent RPC + * was handled before the original RPC (corresponding to 'dae_new'). Returning + * -DER_INPROGRESS to make the RPC sponsor to retry the RPC with 'RESEND' flag, + * then related RPC handler logic will handle such case. + */ + if (DAE_EPOCH(dae_old) > DAE_EPOCH(dae_new)) { + D_ERROR("Resent RPC may be handled before original one for DTX "DF_DTI + " with epoch "DF_X64" vs "DF_X64"\n", + DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old), DAE_EPOCH(dae_new)); + return -DER_INPROGRESS; + } + + /* + * The two entries uses the same epoch, then it may be caused by repeated RPCs + * from different sources, such as multiple relay engines forward the same RPC + * to current target. We need to notify related caller for such buggy case. + */ + D_ERROR("Receive repeated DTX "DF_DTI" with epoch "DF_X64"\n", + DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old)); + return -DER_MISC; } rec->rec_off = umem_ptr2off(&tins->ti_umm, dae_new); @@ -1172,16 +1194,20 @@ vos_dtx_check_availability(daos_handle_t coh, uint32_t entry, } if (intent == DAOS_INTENT_PURGE) { - uint32_t age = d_hlc_age2sec(DAE_XID(dae).dti_hlc); + uint64_t now = daos_gettime_coarse(); /* * The DTX entry still references related data record, * then we cannot (vos) aggregate related data record. + * Report warning per each 10 seconds to avoid log flood. */ - if (age >= DAOS_AGG_THRESHOLD) - D_WARN("DTX "DF_DTI" (state:%u, age:%u) still references the data, " - "cannot be (VOS) aggregated\n", - DP_DTI(&DAE_XID(dae)), vos_dtx_status(dae), age); + if (now - cont->vc_agg_busy_ts > 10) { + D_WARN("DTX "DF_DTI" (state:%u, flags:%x, age:%u) still references " + "the modification, cannot be (VOS) aggregated\n", + DP_DTI(&DAE_XID(dae)), vos_dtx_status(dae), DAE_FLAGS(dae), + (unsigned int)d_hlc_age2sec(DAE_XID(dae).dti_hlc)); + cont->vc_agg_busy_ts = now; + } return ALB_AVAILABLE_DIRTY; } @@ -1909,8 +1935,13 @@ vos_dtx_check(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch, daos_epoch_t e = *epoch; *epoch = DAE_EPOCH(dae); - if (e != 0 && e != DAE_EPOCH(dae)) - return -DER_MISMATCH; + if (e != 0) { + if (e > DAE_EPOCH(dae)) + return -DER_MISMATCH; + + if (e < DAE_EPOCH(dae)) + return -DER_TX_RESTART; + } } return vos_dae_is_prepare(dae) ? DTX_ST_PREPARED : DTX_ST_INITED; diff --git a/src/vos/vos_internal.h b/src/vos/vos_internal.h index 7d4dd3ac166..5aa0a1fadbb 100644 --- a/src/vos/vos_internal.h +++ b/src/vos/vos_internal.h @@ -353,6 +353,8 @@ struct vos_container { daos_epoch_range_t vc_epr_aggregation; /* Current ongoing discard EPR */ daos_epoch_range_t vc_epr_discard; + /* Last timestamp when VOS aggregation reports -DER_TX_BUSY */ + uint64_t vc_agg_busy_ts; /* Last timestamp when VOS aggregation reporting ENOSPACE */ uint64_t vc_agg_nospc_ts; /* Last timestamp when IO reporting ENOSPACE */ From cf2a8b9571ea02cccfed6f96112d1582bd44e8c9 Mon Sep 17 00:00:00 2001 From: wiliamhuang Date: Tue, 5 Nov 2024 20:12:12 -0600 Subject: [PATCH 32/34] DAOS-16365 client: intercept dlsym() and zeInit() to avoid nested call (#14932) (#15446) libfabric loads libze_loader.so which calls zeInit(). We observed deadlock due to nested calls when daos_init() is called inside zeInit(). We intercept dlsym() and zeInit() to avoid calling daos_init() inside zeInit(). dlsym(RTLD_NEXT, ) checks returning address to determine caller's module. To maintain expected behavior of dlsym(RTLD_NEXT, ) with our interception, new_dlsym() is implemented with assembly code to use jmp instruction instead of call. dlsym() has been moved from libdl.so to libc.so since version 2.34. Signed-off-by: Lei Huang --- src/client/dfuse/pil4dfs/hook.c | 49 +++++-- src/client/dfuse/pil4dfs/hook.h | 6 + src/client/dfuse/pil4dfs/int_dfs.c | 189 ++++++++++++++++++++++++- src/client/dfuse/pil4dfs/pil4dfs_int.h | 2 +- 4 files changed, 226 insertions(+), 20 deletions(-) diff --git a/src/client/dfuse/pil4dfs/hook.c b/src/client/dfuse/pil4dfs/hook.c index 0ec1a0b5374..c30061387c1 100644 --- a/src/client/dfuse/pil4dfs/hook.c +++ b/src/client/dfuse/pil4dfs/hook.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "hook.h" #include "hook_int.h" @@ -89,10 +90,15 @@ static uint64_t lib_base_addr[MAX_NUM_LIB]; /* List of names of loaded libraries */ static char **lib_name_list; +/* libc version number in current process. e.g., 2.28 */ +static float libc_version; +static char *libc_version_str; + /* end to compile list of memory blocks in /proc/pid/maps */ static char *path_ld; static char *path_libc; +static char *path_libdl; static char *path_libpthread; /* This holds the path of libpil4dfs.so. It is needed when we want to * force child processes append libpil4dfs.so to env LD_PRELOAD. */ @@ -212,7 +218,7 @@ determine_lib_path(void) { int path_offset = 0, read_size, i, rc; char *read_buff_map = NULL; - char *pos, *start, *end, lib_ver_str[32] = "", *lib_dir_str = NULL; + char *pos, *start, *end, *lib_dir_str = NULL; read_size = read_map_file(&read_buff_map); @@ -289,19 +295,17 @@ determine_lib_path(void) goto err; path_libc[end - start] = 0; - pos = strstr(path_libc, "libc-2."); - if (pos) { - /* containing version in name. example, 2.17 */ - memcpy(lib_ver_str, pos + 5, 4); - lib_ver_str[4] = 0; + if (libc_version_str == NULL) { + libc_version_str = (char *)gnu_get_libc_version(); + if (libc_version_str == NULL) { + DS_ERROR(errno, "Failed to determine libc version"); + goto err; + } + libc_version = atof(libc_version_str); } - if (lib_ver_str[0]) { - /* with version in name */ - rc = asprintf(&path_libpthread, "%s/libpthread-%s.so", lib_dir_str, lib_ver_str); - } else { - rc = asprintf(&path_libpthread, "%s/libpthread.so.0", lib_dir_str); - } + /* with version in name */ + rc = asprintf(&path_libpthread, "%s/libpthread-%s.so", lib_dir_str, libc_version_str); if (rc < 0) { DS_ERROR(ENOMEM, "Failed to allocate memory for path_libpthread"); goto err_1; @@ -311,7 +315,18 @@ determine_lib_path(void) path_libpthread = NULL; DS_ERROR(ENAMETOOLONG, "path_libpthread is too long"); goto err_1; - } + } + rc = asprintf(&path_libdl, "%s/libdl-%s.so", lib_dir_str, libc_version_str); + if (rc < 0) { + DS_ERROR(ENOMEM, "Failed to allocate memory for path_libdl"); + goto err_1; + } + if (rc >= PATH_MAX) { + free(path_libdl); + path_libdl = NULL; + DS_ERROR(ENAMETOOLONG, "path_libdl is too long"); + goto err_1; + } D_FREE(lib_dir_str); pos = strstr(read_buff_map, "libpil4dfs.so"); @@ -348,6 +363,11 @@ query_pil4dfs_path(void) return path_libpil4dfs; } +float +query_libc_version(void) +{ + return libc_version; +} /* * query_func_addr - Determine the addresses and code sizes of functions in func_name_list[]. @@ -754,6 +774,7 @@ free_memory_in_hook(void) D_FREE(path_ld); D_FREE(path_libc); D_FREE(module_list); + free(path_libdl); free(path_libpthread); if (lib_name_list) { @@ -1034,6 +1055,8 @@ register_a_hook(const char *module_name, const char *func_name, const void *new_ module_name_local = path_ld; else if (strncmp(module_name, "libc", 5) == 0) module_name_local = path_libc; + else if (strncmp(module_name, "libdl", 6) == 0) + module_name_local = path_libdl; else if (strncmp(module_name, "libpthread", 11) == 0) module_name_local = path_libpthread; else diff --git a/src/client/dfuse/pil4dfs/hook.h b/src/client/dfuse/pil4dfs/hook.h index 7742faaff53..b686d99ce4e 100644 --- a/src/client/dfuse/pil4dfs/hook.h +++ b/src/client/dfuse/pil4dfs/hook.h @@ -60,4 +60,10 @@ free_memory_in_hook(void); char * query_pil4dfs_path(void); +/** + * return glibc version in current process + */ +float +query_libc_version(void); + #endif diff --git a/src/client/dfuse/pil4dfs/int_dfs.c b/src/client/dfuse/pil4dfs/int_dfs.c index b69449f1e3d..84567ac5f2d 100644 --- a/src/client/dfuse/pil4dfs/int_dfs.c +++ b/src/client/dfuse/pil4dfs/int_dfs.c @@ -161,6 +161,7 @@ static long int page_size; #define DAOS_INIT_RUNNING 1 static _Atomic uint64_t mpi_init_count; +static _Atomic int64_t zeInit_count; static long int daos_initing; _Atomic bool d_daos_inited; @@ -471,6 +472,9 @@ static int (*next_tcgetattr)(int fd, void *termios_p); static int (*next_mpi_init)(int *argc, char ***argv); static int (*next_pmpi_init)(int *argc, char ***argv); +static int (*next_ze_init)(int flags); +static void *(*next_dlsym)(void *handle, const char *symbol); +static void *(*new_dlsym)(void *handle, const char *symbol); /* to do!! */ /** @@ -1058,6 +1062,143 @@ PMPI_Init(int *argc, char ***argv) return rc; } +int +zeInit(int flags) +{ + int rc; + + if (next_ze_init == NULL) { + if (d_hook_enabled) + next_ze_init = next_dlsym(RTLD_NEXT, "zeInit"); + else + next_ze_init = dlsym(RTLD_NEXT, "zeInit"); + } + D_ASSERT(next_ze_init != NULL); + atomic_fetch_add_relaxed(&zeInit_count, 1); + rc = next_ze_init(flags); + atomic_fetch_add_relaxed(&zeInit_count, -1); + return rc; +} + +#if defined(__x86_64__) +/* This is used to work around compiling warning and limitations of using asm function. */ +static void * +query_new_dlsym_addr(void *addr) +{ + int i; + + /* assume little endian */ + for (i = 0; i < 64; i++) { + /* 0x56579090 is corresponding to the first four instructions at new_dlsym_asm. + * 0x90 - nop, 0x90 - nop, 0x57 - push %rdi, 0x56 - push %rsi + */ + if (*((int *)(addr + i)) == 0x56579090) { + /* two nop are added for easier positioning. offset +2 here to skip two + * nop and start from the real entry. + */ + return ((void *)(addr + i + 2)); + } + } + return NULL; +} + +_Pragma("GCC diagnostic push") +_Pragma("GCC diagnostic ignored \"-Wunused-function\"") +_Pragma("GCC diagnostic ignored \"-Wunused-variable\"") + +_Pragma("GCC push_options") +_Pragma("GCC optimize(\"-O0\")") +static char str_zeinit[] = "zeInit"; + +static int +is_hook_enabled(void) +{ + return (d_hook_enabled ? (1) : (0)); +} + +/* This wrapper function is introduced to avoid compiling issue with Intel-C on Leap 15.5 */ +static int +my_strcmp(const char *s1, const char *s2) +{ + return strcmp(s1, s2); +} + +static void * +get_zeinit_addr(void) +{ + return (void *)zeInit; +} + +__attribute__((aligned(16))) static void +new_dlsym_marker(void) +{ +} + +__asm__( + "new_dlsym_asm:\n" + "nop\n" + "nop\n" + "push %rdi\n" + "push %rsi\n" + + "call is_hook_enabled\n" + "test %eax,%eax\n" + "je org_dlsym\n" + + "mov %rsi, %rdi\n" + "lea str_zeinit(%rip), %rsi\n" + "call my_strcmp\n" + "test %eax,%eax\n" + "jne org_dlsym\n" + + "pop %rsi\n" + "pop %rdi\n" + "call *next_dlsym(%rip)\n" + "mov %rax, next_ze_init(%rip)\n" + + "test %eax,%eax\n" + "jne found\n" + "ret\n" + + "found:\n" + "call get_zeinit_addr\n" + "ret\n" + + "org_dlsym:\n" + "pop %rsi\n" + "pop %rdi\n" + "jmp *next_dlsym(%rip)\n" +); +_Pragma("GCC pop_options") +_Pragma("GCC diagnostic pop") + +#else +/* c code for other architecture. caller info could be wrong inside libc dlsym() when handle is set + * RTLD_NEXT. Assembly version implementation similar to above is needed to fix the issue by using + * jump instead of call instruction. + */ +static void * +new_dlsym_c(void *handle, const char *symbol) +{ + if (!d_hook_enabled) + goto org_dlsym; + printf("Inside my dlsym().\n"); + if (strcmp(symbol, "zeInit") != 0) + goto org_dlsym; + + next_ze_init = next_dlsym(handle, symbol); + if (next_ze_init) + /* dlsym() finished successfully, then intercept zeInit() */ + return zeInit; + else + return next_ze_init; + +org_dlsym: + /* Ideally we need to adjust stack and jump to next_dlsym(). */ + return next_dlsym(handle, symbol); +} +#endif + /** determine whether a path (both relative and absolute) is on DAOS or not. If yes, * returns parent object, item name, full path of parent dir, full absolute path, and * the pointer to struct dfs_mt. @@ -1164,6 +1305,15 @@ query_path(const char *szInput, int *is_target_path, struct dcache_rec **parent, goto out_normal; } + /* Check whether zeInit() is running. If yes, pass to the original + * libc functions. Avoid possible zeInit reentrancy/nested call. + */ + + if (atomic_load_relaxed(&zeInit_count) > 0) { + *is_target_path = 0; + goto out_normal; + } + /* daos_init() is expensive to call. We call it only when necessary. */ /* Check whether daos_init() is running. If yes, pass to the original @@ -2034,6 +2184,7 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char if (!is_target_path) goto org_func; + atomic_fetch_add_relaxed(&num_open, 1); if (oflags & O_CREAT && (oflags & O_DIRECTORY || oflags & O_PATH)) { /* Create a dir is not supported. */ errno = ENOENT; @@ -2061,7 +2212,6 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char } /* Need to create a fake fd and associate with fd_kernel */ - atomic_fetch_add_relaxed(&num_open, 1); dfs_get_mode(dfs_obj, &mode_query); /* regular file */ @@ -2237,7 +2387,6 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char return (idx_dirfd + FD_DIR_BASE); } - atomic_fetch_add_relaxed(&num_open, 1); rc = find_next_available_fd(NULL, &idx_fd); if (rc) @@ -6014,7 +6163,7 @@ ioctl(int fd, unsigned long request, ...) va_list arg; void *param; struct dfuse_user_reply *reply; - int fd_directed; + int fd_directed = fd; va_start(arg, request); param = va_arg(arg, void *); @@ -6040,12 +6189,11 @@ ioctl(int fd, unsigned long request, ...) return next_ioctl(fd, request, param); fd_directed = d_get_fd_redirected(fd); - if (fd_directed < FD_FILE_BASE) + if ((fd_directed < FD_FILE_BASE) || (fd_directed >= (FD_DIR_BASE + MAX_OPENED_DIR))) return next_ioctl(fd, request, param); errno = ENOTSUP; - - return -1; + return (-1); } int @@ -6718,6 +6866,18 @@ check_exe_sh_bash(void) return; } +#define SMALL_DIFF (0.0001) +static int +libc_ver_cmp(float ver_a, float ver_b) +{ + if ((ver_a + SMALL_DIFF) < ver_b) + return (-1); + else if (ver_a > (ver_b + SMALL_DIFF)) + return (1); + else + return (0); +} + static __attribute__((constructor)) void init_myhook(void) { @@ -6725,6 +6885,7 @@ init_myhook(void) char *env_log; int rc; uint64_t eq_count_loc = 0; + float libc_version; umask_old = umask(0); umask(umask_old); @@ -6873,6 +7034,18 @@ init_myhook(void) register_a_hook("libc", "exit", (void *)new_exit, (long int *)(&next_exit)); register_a_hook("libc", "dup3", (void *)new_dup3, (long int *)(&libc_dup3)); +#if defined(__x86_64__) + new_dlsym = query_new_dlsym_addr(new_dlsym_marker); +#else + new_dlsym = new_dlsym_c; +#endif + D_ASSERT(new_dlsym != NULL); + libc_version = query_libc_version(); + if (libc_ver_cmp(libc_version, 2.34) < 0) + register_a_hook("libdl", "dlsym", (void *)new_dlsym, (long int *)(&next_dlsym)); + else + register_a_hook("libc", "dlsym", (void *)new_dlsym, (long int *)(&next_dlsym)); + init_fd_dup2_list(); /* Need to check whether current process is bash or not under regular & compatible modes.*/ @@ -6884,6 +7057,10 @@ init_myhook(void) dcache_rec_timeout = 0; install_hook(); + + /* Check it here to minimize the work in function new_dlsym() written in assembly */ + D_ASSERT(next_dlsym != NULL); + d_hook_enabled = 1; hook_enabled_bak = d_hook_enabled; } diff --git a/src/client/dfuse/pil4dfs/pil4dfs_int.h b/src/client/dfuse/pil4dfs/pil4dfs_int.h index a9c54b55555..0693123b51f 100644 --- a/src/client/dfuse/pil4dfs/pil4dfs_int.h +++ b/src/client/dfuse/pil4dfs/pil4dfs_int.h @@ -30,7 +30,7 @@ /* FD_FILE_BASE - The base number of the file descriptor for a directory. * The fd allocate from this lib is always larger than FD_FILE_BASE. */ -#define FD_DIR_BASE (0x40000000) +#define FD_DIR_BASE (FD_FILE_BASE + MAX_OPENED_FILE) /* structure allocated for a FD for a file */ struct file_obj { From d1af13a52492f72474d5249a45cbd123b3d05da9 Mon Sep 17 00:00:00 2001 From: Jerome Soumagne Date: Tue, 5 Nov 2024 22:18:28 -0600 Subject: [PATCH 33/34] DAOS-16752 build: update mercury to 2.4.0 (#15443) Signed-off-by: Jerome Soumagne --- utils/build.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/build.config b/utils/build.config index b7f86add782..a14ad039c15 100644 --- a/utils/build.config +++ b/utils/build.config @@ -9,7 +9,7 @@ isal=v2.30.0 isal_crypto=v2.23.0 spdk=v22.01.2 ofi=v1.22.0 -mercury=v2.4.0rc5 +mercury=v2.4.0 protobufc=v1.3.3 ucx=v1.14.1 From 12bd641a222537cc4baa7c4d0c76028a7e9f4942 Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Wed, 6 Nov 2024 00:54:21 -0500 Subject: [PATCH 34/34] DAOS-16784 build: Tag 2.6.2 tb1 (#15455) Tag first test build for 2.6.2. Signed-off-by: Phil Henderson --- TAG | 2 +- VERSION | 2 +- debian/changelog | 6 ++++++ utils/rpms/daos.spec | 7 +++++-- 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/TAG b/TAG index 47d92ef13d9..045cac3ed40 100644 --- a/TAG +++ b/TAG @@ -1 +1 @@ -2.6.1-rc3 +2.6.2-tb1 diff --git a/VERSION b/VERSION index 6a6a3d8e35c..097a15a2af3 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.6.1 +2.6.2 diff --git a/debian/changelog b/debian/changelog index 9790dcbee3e..4a48f35bab2 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +daos (2.6.2-1) unstable; urgency=medium + [ Phillip Henderson ] + * First test build for 2.6.2 + + -- Phillip Henderson Tue, 05 Nov 2024 23:25:00 -0500 + daos (2.6.1-4) unstable; urgency=medium [ Tomasz Gromadzki ] * Add support of the PMDK package 2.1.0 with NDCTL enabled. diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec index 29576bc71a1..3d9ffa6b9fc 100644 --- a/utils/rpms/daos.spec +++ b/utils/rpms/daos.spec @@ -14,8 +14,8 @@ %endif Name: daos -Version: 2.6.1 -Release: 4%{?relval}%{?dist} +Version: 2.6.2 +Release: 1%{?relval}%{?dist} Summary: DAOS Storage Engine License: BSD-2-Clause-Patent @@ -599,6 +599,9 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent %changelog +* Tue Nov 05 2024 Phillip Henderson 2.6.2-1 +- First test build for 2.6.2 + * Wed Oct 02 2024 Tomasz Gromadzki 2.6.1-4 - Add support of the PMDK package 2.1.0 with NDCTL enabled. * Increase the default ULT stack size to 20KiB if the engine uses