diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 5fce435168a..67c57bd3154 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -26,7 +26,7 @@ jobs: - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: '3' - - uses: isort/isort-action@master + - uses: isort/isort-action@f14e57e1d457956c45a19c05a89cccdf087846e5 # v1.1.0 with: requirementsFiles: "requirements.txt" - name: Run on SConstruct file. diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml new file mode 100644 index 00000000000..8f5524d4513 --- /dev/null +++ b/.github/workflows/trivy.yml @@ -0,0 +1,70 @@ +name: Trivy scan + +on: + workflow_dispatch: + push: + branches: ["master", "release/**"] + pull_request: + branches: ["master", "release/**"] + +# Declare default permissions as nothing. +permissions: {} + +jobs: + build: + name: Build + runs-on: ubuntu-20.04 + steps: + - name: Checkout code + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + + - name: Run Trivy vulnerability scanner in repo mode + uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8 # 0.24.0 + with: + scan-type: 'fs' + scan-ref: '.' + trivy-config: 'utils/trivy/trivy.yaml' + + - name: Prepare the report to be uploaded to the GitHub artifact store + run: | + mkdir report + cp trivy-report-daos.txt report + cp utils/trivy/.trivyignore report/trivyignore.txt + + - name: Upload the report to the GitHub artifact store + uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 + with: + path: report/* + name: trivy-report-daos + + - name: Adjust config file to use sarif format + run: | + sed -i 's/output: "trivy-report-daos.txt"/output: "trivy-results.sarif"/g' \ + utils/trivy/trivy.yaml + sed -i 's/format: template/format: sarif/g' utils/trivy/trivy.yaml + + - name: Run Trivy vulnerability scanner in repo mode + uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8 # 0.24.0 + with: + scan-type: 'fs' + scan-ref: '.' + trivy-config: 'utils/trivy/trivy.yaml' + + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@afb54ba388a7dca6ecae48f608c4ff05ff4cc77a + # 3.25.15 (v3) + with: + sarif_file: 'trivy-results.sarif' + + - name: Adjust config file to show and validate scan results + run: | + sed -i 's/output: "trivy-results.sarif"//g' utils/trivy/trivy.yaml + sed -i 's/format: sarif/format: table/g' utils/trivy/trivy.yaml + sed -i 's/exit-code: 0/exit-code: 1/g' utils/trivy/trivy.yaml + + - name: Run Trivy vulnerability scanner in repo mode + uses: aquasecurity/trivy-action@6e7b7d1fd3e4fef0c5fa8cce1229c54b2c9bd0d8 # 0.24.0 + with: + scan-type: 'fs' + scan-ref: '.' + trivy-config: 'utils/trivy/trivy.yaml' diff --git a/TAG b/TAG index 47d92ef13d9..045cac3ed40 100644 --- a/TAG +++ b/TAG @@ -1 +1 @@ -2.6.1-rc3 +2.6.2-tb1 diff --git a/VERSION b/VERSION index 6a6a3d8e35c..097a15a2af3 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.6.1 +2.6.2 diff --git a/debian/changelog b/debian/changelog index 9790dcbee3e..4a48f35bab2 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +daos (2.6.2-1) unstable; urgency=medium + [ Phillip Henderson ] + * First test build for 2.6.2 + + -- Phillip Henderson Tue, 05 Nov 2024 23:25:00 -0500 + daos (2.6.1-4) unstable; urgency=medium [ Tomasz Gromadzki ] * Add support of the PMDK package 2.1.0 with NDCTL enabled. 
diff --git a/docs/admin/env_variables.md b/docs/admin/env_variables.md index 060c3790d57..3de0a079203 100644 --- a/docs/admin/env_variables.md +++ b/docs/admin/env_variables.md @@ -44,6 +44,7 @@ Environment variables in this section only apply to the server side. |DAOS\_MD\_CAP |Size of a metadata pmem pool/file in MBs. INTEGER. Default to 128 MB.| |DAOS\_START\_POOL\_SVC|Determines whether to start existing pool services when starting a daos\_server. BOOL. Default to true.| |CRT\_DISABLE\_MEM\_PIN|Disable memory pinning workaround on a server side. BOOL. Default to 0.| +|CRT\_EVENT\_DELAY|Delay in seconds before handling a set of CaRT events. INTEGER. Default to 10 s. A longer delay enables batching of successive CaRT events, leading to fewer pool map changes when multiple engines become unavailable at around the same time.| |DAOS\_SCHED\_PRIO\_DISABLED|Disable server ULT prioritizing. BOOL. Default to 0.| |DAOS\_SCHED\_RELAX\_MODE|The mode of CPU relaxing on idle. "disabled":disable relaxing; "net":wait on network request for INTVL; "sleep":sleep for INTVL. STRING. Default to "net"| |DAOS\_SCHED\_RELAX\_INTVL|CPU relax interval in milliseconds. INTEGER. Default to 1 ms.| diff --git a/src/cart/README.env b/src/cart/README.env index 00f270d7a41..b90939c8c72 100644 --- a/src/cart/README.env +++ b/src/cart/README.env @@ -1,13 +1,10 @@ This file lists the environment variables used in CaRT. . D_PROVIDER (Deprecated: CRT_PHY_ADDR_STR) - It determines which mercury NA plugin to be used: + It determines which mercury NA plugin and transport to be used: - set it as "ofi+verbs;ofi_rxm" to use OFI verbs;ofi_rxm provider - - set it as "ofi+gni" to use OFI gni provider - set it as "sm" to use SM plugin which only works within single node - - set it as "ofi+tcp;ofi_rxm" to use OFI tcp;ofi_rxm provider. - - set it as "ofi+sockets" to use OFI sockets provider - NOTE: This provider is deprecated in favor of "ofi+tcp;ofi_rxm" + - set it as "ofi+tcp" to use OFI tcp provider. - by default (not set or set as any other value) it will use ofi tcp provider. @@ -205,3 +202,8 @@ This file lists the environment variables used in CaRT. start copying data in an effort to release multi-recv buffers. Copy will occur when at most D_MRECV_BUF_COPY buffers remain. + SWIM_TRAFFIC_CLASS + (server only) Select a traffic class for the SWIM protocol to use and prevent potential + traffic congestion. Available options are: "unspec" (default), "best_effort", + "low_latency", "bulk_data". + diff --git a/src/cart/crt_hg.c b/src/cart/crt_hg.c index 789d75ceb31..26b54b52ec3 100644 --- a/src/cart/crt_hg.c +++ b/src/cart/crt_hg.c @@ -863,6 +863,9 @@ crt_hg_class_init(crt_provider_t provider, int ctx_idx, bool primary, int iface_ init_info.request_post_incr = crt_gdata.cg_post_incr; init_info.multi_recv_op_max = crt_gdata.cg_mrecv_buf; init_info.multi_recv_copy_threshold = crt_gdata.cg_mrecv_buf_copy; + /* Separate SWIM traffic in an effort to prevent potential congestion. 
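+	 * Only the dedicated SWIM context (ctx_idx == cg_swim_crt_idx) on a service rank gets this class; client contexts keep the provider default.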
*/ + if (crt_is_service() && ctx_idx == crt_gdata.cg_swim_crt_idx) + init_info.traffic_class = (enum na_traffic_class)crt_gdata.cg_swim_tc; hg_class = HG_Init_opt2(info_string, crt_is_service(), HG_VERSION(2, 4), &init_info); if (hg_class == NULL) { @@ -1479,6 +1482,16 @@ crt_hg_reply_send(struct crt_rpc_priv *rpc_priv) rc = crt_hgret_2_der(hg_ret); } + /* Release input buffer */ + if (rpc_priv->crp_release_input_early && !rpc_priv->crp_forward) { + hg_ret = HG_Release_input_buf(rpc_priv->crp_hg_hdl); + if (hg_ret != HG_SUCCESS) { + RPC_ERROR(rpc_priv, "HG_Release_input_buf failed, hg_ret: " DF_HG_RC "\n", + DP_HG_RC(hg_ret)); + /* Fall through */ + } + } + return rc; } diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index 21fc184d446..48d2090a5b3 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -18,6 +18,10 @@ static volatile int gdata_init_flag; struct crt_plugin_gdata crt_plugin_gdata; static bool g_prov_settings_applied[CRT_PROV_COUNT]; +#define X(a, b) b, +static const char *const crt_tc_name[] = {CRT_TRAFFIC_CLASSES}; +#undef X + static void crt_lib_init(void) __attribute__((__constructor__)); @@ -237,18 +241,30 @@ crt_gdata_dump(void) DUMP_GDATA_FIELD("%d", cg_rpc_quota); } +static enum crt_traffic_class +crt_str_to_tc(const char *str) +{ + enum crt_traffic_class i = 0; + + while (str != NULL && strcmp(crt_tc_name[i], str) != 0 && i < CRT_TC_UNKNOWN) + i++; + + return i == CRT_TC_UNKNOWN ? CRT_TC_UNSPEC : i; +} + /* first step init - for initializing crt_gdata */ static int data_init(int server, crt_init_options_t *opt) { - uint32_t timeout = 0; - uint32_t credits; - uint32_t fi_univ_size = 0; - uint32_t mem_pin_enable = 0; - uint32_t is_secondary; - uint32_t post_init = CRT_HG_POST_INIT, post_incr = CRT_HG_POST_INCR; - unsigned int mrecv_buf = CRT_HG_MRECV_BUF; - unsigned int mrecv_buf_copy = 0; /* buf copy disabled by default */ - int rc = 0; + uint32_t timeout = 0; + uint32_t credits; + uint32_t fi_univ_size = 0; + uint32_t mem_pin_enable = 0; + uint32_t is_secondary; + uint32_t post_init = CRT_HG_POST_INIT, post_incr = CRT_HG_POST_INCR; + unsigned int mrecv_buf = CRT_HG_MRECV_BUF; + unsigned int mrecv_buf_copy = 0; /* buf copy disabled by default */ + char *swim_traffic_class = NULL; + int rc = 0; crt_env_dump(); @@ -261,6 +277,8 @@ static int data_init(int server, crt_init_options_t *opt) crt_gdata.cg_mrecv_buf = mrecv_buf; crt_env_get(D_MRECV_BUF_COPY, &mrecv_buf_copy); crt_gdata.cg_mrecv_buf_copy = mrecv_buf_copy; + crt_env_get(SWIM_TRAFFIC_CLASS, &swim_traffic_class); + crt_gdata.cg_swim_tc = crt_str_to_tc(swim_traffic_class); is_secondary = 0; /* Apply CART-890 workaround for server side only */ diff --git a/src/cart/crt_internal_types.h b/src/cart/crt_internal_types.h index 513d6c3db96..0c480b06c2c 100644 --- a/src/cart/crt_internal_types.h +++ b/src/cart/crt_internal_types.h @@ -42,6 +42,17 @@ struct crt_na_config { char **noc_domain_str; /* Array of domains */ }; +#define CRT_TRAFFIC_CLASSES \ + X(CRT_TC_UNSPEC, "unspec") /* Leave it upon plugin to choose */ \ + X(CRT_TC_BEST_EFFORT, "best_effort") /* Best effort */ \ + X(CRT_TC_LOW_LATENCY, "low_latency") /* Low latency */ \ + X(CRT_TC_BULK_DATA, "bulk_data") /* Bulk data */ \ + X(CRT_TC_UNKNOWN, "unknown") /* Unknown */ + +#define X(a, b) a, +enum crt_traffic_class { CRT_TRAFFIC_CLASSES }; +#undef X + struct crt_prov_gdata { /** NA plugin type */ int cpg_provider; @@ -105,6 +116,9 @@ struct crt_gdata { /** global swim index for all servers */ int32_t cg_swim_crt_idx; + /** traffic class used by SWIM 
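+	 * (parsed from the SWIM_TRAFFIC_CLASS environment variable via crt_str_to_tc() in data_init())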
*/ + enum crt_traffic_class cg_swim_tc; + /** credits limitation for #in-flight RPCs per target EP CTX */ uint32_t cg_credit_ep_ctx; @@ -220,6 +234,7 @@ struct crt_event_cb_priv { ENV(SWIM_PING_TIMEOUT) \ ENV(SWIM_PROTOCOL_PERIOD_LEN) \ ENV(SWIM_SUSPECT_TIMEOUT) \ + ENV_STR(SWIM_TRAFFIC_CLASS) \ ENV_STR(UCX_IB_FORK_INIT) /* uint env */ diff --git a/src/cart/crt_iv.c b/src/cart/crt_iv.c index 603e565ac1f..bf8124a8a6a 100644 --- a/src/cart/crt_iv.c +++ b/src/cart/crt_iv.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -2911,8 +2911,12 @@ bulk_update_transfer_done_aux(const struct crt_bulk_cb_info *info) return rc; send_error: - rc = crt_bulk_free(cb_info->buc_bulk_hdl); + /* send back whatever error got us here */ output->rc = rc; + rc = crt_bulk_free(cb_info->buc_bulk_hdl); + if (rc != 0) + DL_ERROR(rc, "crt_bulk_free() failed"); + iv_ops->ivo_on_put(ivns_internal, &cb_info->buc_iv_value, cb_info->buc_user_priv); diff --git a/src/cart/crt_rpc.c b/src/cart/crt_rpc.c index 72d8214aa09..ead03d1bf29 100644 --- a/src/cart/crt_rpc.c +++ b/src/cart/crt_rpc.c @@ -1550,6 +1550,26 @@ crt_req_send(crt_rpc_t *req, crt_cb_t complete_cb, void *arg) return rc; } +int +crt_reply_send_input_free(crt_rpc_t *req) +{ + struct crt_rpc_priv *rpc_priv = NULL; + int rc = 0; + + if (req == NULL) { + D_ERROR("invalid parameter (NULL req).\n"); + D_GOTO(out, rc = -DER_INVAL); + } + + rpc_priv = container_of(req, struct crt_rpc_priv, crp_pub); + rpc_priv->crp_release_input_early = 1; + + return crt_reply_send(req); + +out: + return rc; +} + int crt_reply_send(crt_rpc_t *req) { diff --git a/src/cart/crt_rpc.h b/src/cart/crt_rpc.h index a5e6afee780..3a590933103 100644 --- a/src/cart/crt_rpc.h +++ b/src/cart/crt_rpc.h @@ -166,29 +166,30 @@ struct crt_rpc_priv { * match with crp_req_hdr.cch_flags. 
*/ uint32_t crp_flags; - uint32_t crp_srv:1, /* flag of server received request */ - crp_output_got:1, - crp_input_got:1, - /* flag of collective RPC request */ - crp_coll:1, - /* flag of crp_tgt_uri need to be freed */ - crp_uri_free:1, - /* flag of forwarded rpc for corpc */ - crp_forward:1, - /* flag of in timeout binheap */ - crp_in_binheap:1, - /* set if a call to crt_req_reply pending */ - crp_reply_pending:1, - /* set to 1 if target ep is set */ - crp_have_ep:1, - /* RPC is tracked by the context */ - crp_ctx_tracked:1, - /* 1 if RPC fails HLC epsilon check */ - crp_fail_hlc:1, - /* RPC completed flag */ - crp_completed:1, - /* RPC originated from a primary provider */ - crp_src_is_primary:1; + uint32_t crp_srv : 1, /* flag of server received request */ + crp_output_got : 1, crp_input_got : 1, + /* flag of collective RPC request */ + crp_coll : 1, + /* flag of crp_tgt_uri need to be freed */ + crp_uri_free : 1, + /* flag of forwarded rpc for corpc */ + crp_forward : 1, + /* flag of in timeout binheap */ + crp_in_binheap : 1, + /* set if a call to crt_req_reply pending */ + crp_reply_pending : 1, + /* set to 1 if target ep is set */ + crp_have_ep : 1, + /* RPC is tracked by the context */ + crp_ctx_tracked : 1, + /* 1 if RPC fails HLC epsilon check */ + crp_fail_hlc : 1, + /* RPC completed flag */ + crp_completed : 1, + /* RPC originated from a primary provider */ + crp_src_is_primary : 1, + /* release input buffer early */ + crp_release_input_early : 1; struct crt_opc_info *crp_opc_info; /* corpc info, only valid when (crp_coll == 1) */ diff --git a/src/chk/chk_common.c b/src/chk/chk_common.c index ace1c736791..39821ec6328 100644 --- a/src/chk/chk_common.c +++ b/src/chk/chk_common.c @@ -403,7 +403,7 @@ chk_pool_restart_svc(struct chk_pool_rec *cpr) if (cpr->cpr_started) chk_pool_shutdown(cpr, true); - rc = ds_pool_start_after_check(cpr->cpr_uuid); + rc = ds_pool_start_after_check(cpr->cpr_uuid, cpr->cpr_immutable); if (rc != 0) { D_WARN("Cannot start full PS for "DF_UUIDF" after CR check: "DF_RC"\n", DP_UUID(cpr->cpr_uuid), DP_RC(rc)); diff --git a/src/chk/chk_engine.c b/src/chk/chk_engine.c index 56e6da3ad9b..ad61af851ce 100644 --- a/src/chk/chk_engine.c +++ b/src/chk/chk_engine.c @@ -1797,10 +1797,8 @@ chk_engine_pool_ult(void *args) } rc = chk_engine_cont_cleanup(cpr, svc, &aggregator); - if (rc != 0) - goto out; - - rc = ds_pool_svc_schedule_reconf(svc); + if (rc == 0 && !cpr->cpr_immutable) + rc = ds_pool_svc_schedule_reconf(svc); out: chk_engine_cont_list_fini(&aggregator); @@ -2113,6 +2111,11 @@ chk_engine_start_post(struct chk_instance *ins) if (pool_cbk->cb_phase == CHK__CHECK_SCAN_PHASE__CSP_DONE) continue; + if (ins->ci_prop.cp_flags & CHK__CHECK_FLAG__CF_DRYRUN) + cpr->cpr_immutable = 1; + else + cpr->cpr_immutable = 0; + if (phase > pool_cbk->cb_phase) phase = pool_cbk->cb_phase; @@ -2950,7 +2953,7 @@ chk_engine_pool_start(uint64_t gen, uuid_t uuid, uint32_t phase, uint32_t flags) cbk = &cpr->cpr_bk; chk_pool_get(cpr); - rc = ds_pool_start(uuid, false); + rc = ds_pool_start(uuid, false, cpr->cpr_immutable); if (rc != 0) D_GOTO(put, rc = (rc == -DER_NONEXIST ? 
1 : rc)); diff --git a/src/chk/chk_internal.h b/src/chk/chk_internal.h index 86868e305ee..9ab16b060b3 100644 --- a/src/chk/chk_internal.h +++ b/src/chk/chk_internal.h @@ -596,6 +596,7 @@ struct chk_pool_rec { cpr_stop:1, cpr_done:1, cpr_skip:1, + cpr_immutable:1, cpr_dangling:1, cpr_for_orphan:1, cpr_notified_exit:1, diff --git a/src/chk/chk_vos.c b/src/chk/chk_vos.c index fdefc2995f7..af2904affd1 100644 --- a/src/chk/chk_vos.c +++ b/src/chk/chk_vos.c @@ -19,11 +19,20 @@ chk_db_fetch(char *key, int key_size, void *val, int val_size) { d_iov_t key_iov; d_iov_t val_iov; + int rc; d_iov_set(&key_iov, key, key_size); d_iov_set(&val_iov, val, val_size); - return chk_db->sd_fetch(chk_db, CHK_DB_TABLE, &key_iov, &val_iov); + if (chk_db->sd_lock) + chk_db->sd_lock(chk_db); + + rc = chk_db->sd_fetch(chk_db, CHK_DB_TABLE, &key_iov, &val_iov); + + if (chk_db->sd_unlock) + chk_db->sd_unlock(chk_db); + + return rc; } static int @@ -33,21 +42,17 @@ chk_db_update(char *key, int key_size, void *val, int val_size) d_iov_t val_iov; int rc; - if (chk_db->sd_tx_begin) { - rc = chk_db->sd_tx_begin(chk_db); - if (rc != 0) - goto out; - } - d_iov_set(&key_iov, key, key_size); d_iov_set(&val_iov, val, val_size); + if (chk_db->sd_lock) + chk_db->sd_lock(chk_db); + rc = chk_db->sd_upsert(chk_db, CHK_DB_TABLE, &key_iov, &val_iov); - if (chk_db->sd_tx_end) - rc = chk_db->sd_tx_end(chk_db, rc); + if (chk_db->sd_unlock) + chk_db->sd_unlock(chk_db); -out: return rc; } @@ -57,27 +62,33 @@ chk_db_delete(char *key, int key_size) d_iov_t key_iov; int rc; - if (chk_db->sd_tx_begin) { - rc = chk_db->sd_tx_begin(chk_db); - if (rc != 0) - goto out; - } - d_iov_set(&key_iov, key, key_size); + if (chk_db->sd_lock) + chk_db->sd_lock(chk_db); + rc = chk_db->sd_delete(chk_db, CHK_DB_TABLE, &key_iov); - if (chk_db->sd_tx_end) - rc = chk_db->sd_tx_end(chk_db, rc); + if (chk_db->sd_unlock) + chk_db->sd_unlock(chk_db); -out: return rc; } static int chk_db_traverse(sys_db_trav_cb_t cb, void *args) { - return chk_db->sd_traverse(chk_db, CHK_DB_TABLE, cb, args); + int rc; + + if (chk_db->sd_lock) + chk_db->sd_lock(chk_db); + + rc = chk_db->sd_traverse(chk_db, CHK_DB_TABLE, cb, args); + + if (chk_db->sd_unlock) + chk_db->sd_unlock(chk_db); + + return rc; } int @@ -243,11 +254,8 @@ chk_prop_update(struct chk_property *cpp, d_rank_list_t *rank_list) d_iov_t val_iov; int rc; - if (chk_db->sd_tx_begin) { - rc = chk_db->sd_tx_begin(chk_db); - if (rc != 0) - goto out; - } + if (chk_db->sd_lock) + chk_db->sd_lock(chk_db); if (cpp->cp_rank_nr != 0 && rank_list != NULL) { D_ASSERTF(cpp->cp_rank_nr == rank_list->rl_nr, "Invalid rank nr %u/%u\n", @@ -259,7 +267,7 @@ chk_prop_update(struct chk_property *cpp, d_rank_list_t *rank_list) rc = chk_db->sd_upsert(chk_db, CHK_DB_TABLE, &key_iov, &val_iov); if (rc != 0) - goto end; + goto out; } d_iov_set(&key_iov, CHK_PROPERTY, strlen(CHK_PROPERTY)); @@ -267,11 +275,10 @@ chk_prop_update(struct chk_property *cpp, d_rank_list_t *rank_list) rc = chk_db->sd_upsert(chk_db, CHK_DB_TABLE, &key_iov, &val_iov); -end: - if (chk_db->sd_tx_end) - rc = chk_db->sd_tx_end(chk_db, rc); - out: + if (chk_db->sd_unlock) + chk_db->sd_unlock(chk_db); + if (rc != 0) D_ERROR("Failed to update check property on rank %u: "DF_RC"\n", dss_self_rank(), DP_RC(rc)); diff --git a/src/client/dfs/cont.c b/src/client/dfs/cont.c index 910e1819aa3..27822914056 100644 --- a/src/client/dfs/cont.c +++ b/src/client/dfs/cont.c @@ -970,7 +970,9 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char * out_snap: 
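	/* common exit: free the OIT args and destroy the snapshot backing the OID table */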
D_FREE(oit_args); epr.epr_hi = epr.epr_lo = snap_epoch; - rc2 = daos_cont_destroy_snap(coh, epr, NULL); + rc2 = daos_cont_destroy_snap(coh, epr, NULL); + if (rc2 != 0) + D_ERROR("Failed to destroy OID table: " DF_RC "\n", DP_RC(rc2)); if (rc == 0) rc = daos_der2errno(rc2); out_dfs: diff --git a/src/client/dfuse/dfuse_core.c b/src/client/dfuse/dfuse_core.c index 4f654fa3209..6397b283e97 100644 --- a/src/client/dfuse/dfuse_core.c +++ b/src/client/dfuse/dfuse_core.c @@ -53,7 +53,7 @@ dfuse_progress_thread(void *arg) return NULL; } - rc = daos_eq_poll(eqt->de_eq, 1, DAOS_EQ_WAIT, 128, &dev[0]); + rc = daos_eq_poll(eqt->de_eq, 1, DAOS_EQ_NOWAIT, 128, &dev[0]); if (rc >= 1) { for (i = 0; i < rc; i++) { struct dfuse_event *ev; diff --git a/src/client/dfuse/pil4dfs/hook.c b/src/client/dfuse/pil4dfs/hook.c index 0ec1a0b5374..c30061387c1 100644 --- a/src/client/dfuse/pil4dfs/hook.c +++ b/src/client/dfuse/pil4dfs/hook.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "hook.h" #include "hook_int.h" @@ -89,10 +90,15 @@ static uint64_t lib_base_addr[MAX_NUM_LIB]; /* List of names of loaded libraries */ static char **lib_name_list; +/* libc version number in current process. e.g., 2.28 */ +static float libc_version; +static char *libc_version_str; + /* end to compile list of memory blocks in /proc/pid/maps */ static char *path_ld; static char *path_libc; +static char *path_libdl; static char *path_libpthread; /* This holds the path of libpil4dfs.so. It is needed when we want to * force child processes append libpil4dfs.so to env LD_PRELOAD. */ @@ -212,7 +218,7 @@ determine_lib_path(void) { int path_offset = 0, read_size, i, rc; char *read_buff_map = NULL; - char *pos, *start, *end, lib_ver_str[32] = "", *lib_dir_str = NULL; + char *pos, *start, *end, *lib_dir_str = NULL; read_size = read_map_file(&read_buff_map); @@ -289,19 +295,17 @@ determine_lib_path(void) goto err; path_libc[end - start] = 0; - pos = strstr(path_libc, "libc-2."); - if (pos) { - /* containing version in name. example, 2.17 */ - memcpy(lib_ver_str, pos + 5, 4); - lib_ver_str[4] = 0; + if (libc_version_str == NULL) { + libc_version_str = (char *)gnu_get_libc_version(); + if (libc_version_str == NULL) { + DS_ERROR(errno, "Failed to determine libc version"); + goto err; + } + libc_version = atof(libc_version_str); } - if (lib_ver_str[0]) { - /* with version in name */ - rc = asprintf(&path_libpthread, "%s/libpthread-%s.so", lib_dir_str, lib_ver_str); - } else { - rc = asprintf(&path_libpthread, "%s/libpthread.so.0", lib_dir_str); - } + /* with version in name */ + rc = asprintf(&path_libpthread, "%s/libpthread-%s.so", lib_dir_str, libc_version_str); if (rc < 0) { DS_ERROR(ENOMEM, "Failed to allocate memory for path_libpthread"); goto err_1; @@ -311,7 +315,18 @@ determine_lib_path(void) path_libpthread = NULL; DS_ERROR(ENAMETOOLONG, "path_libpthread is too long"); goto err_1; - } + } + rc = asprintf(&path_libdl, "%s/libdl-%s.so", lib_dir_str, libc_version_str); + if (rc < 0) { + DS_ERROR(ENOMEM, "Failed to allocate memory for path_libdl"); + goto err_1; + } + if (rc >= PATH_MAX) { + free(path_libdl); + path_libdl = NULL; + DS_ERROR(ENAMETOOLONG, "path_libdl is too long"); + goto err_1; + } D_FREE(lib_dir_str); pos = strstr(read_buff_map, "libpil4dfs.so"); @@ -348,6 +363,11 @@ query_pil4dfs_path(void) return path_libpil4dfs; } +float +query_libc_version(void) +{ + return libc_version; +} /* * query_func_addr - Determine the addresses and code sizes of functions in func_name_list[]. 
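Reviewer note: the determine_lib_path() rework above stops parsing version numbers out of "libc-2.*" file names and instead asks glibc directly, then builds the versioned libpthread/libdl paths from the reported string. A minimal standalone sketch of that approach, assuming a glibc system; the /usr/lib64 directory below is an illustrative placeholder rather than anything taken from the patch:

```c
#define _GNU_SOURCE /* for asprintf() */
#include <gnu/libc-version.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	/* Ask the running glibc for its version instead of parsing file names. */
	const char *ver_str = gnu_get_libc_version(); /* e.g. "2.28" */
	float       ver     = atof(ver_str);          /* numeric form for comparisons */
	char       *path_libdl = NULL;

	/* Older glibc installs version-suffixed DSOs such as libdl-2.28.so;
	 * from 2.34 on dlsym() lives in libc itself, so a caller would pick
	 * the hook target accordingly (as init_myhook() does below). */
	if (asprintf(&path_libdl, "/usr/lib64/libdl-%s.so", ver_str) < 0)
		return 1;

	printf("glibc %s (%.2f) -> %s\n", ver_str, (double)ver, path_libdl);
	free(path_libdl);
	return 0;
}
```

Caching the string once (libc_version_str in the patch) keeps later callers such as query_libc_version() cheap.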
@@ -754,6 +774,7 @@ free_memory_in_hook(void) D_FREE(path_ld); D_FREE(path_libc); D_FREE(module_list); + free(path_libdl); free(path_libpthread); if (lib_name_list) { @@ -1034,6 +1055,8 @@ register_a_hook(const char *module_name, const char *func_name, const void *new_ module_name_local = path_ld; else if (strncmp(module_name, "libc", 5) == 0) module_name_local = path_libc; + else if (strncmp(module_name, "libdl", 6) == 0) + module_name_local = path_libdl; else if (strncmp(module_name, "libpthread", 11) == 0) module_name_local = path_libpthread; else diff --git a/src/client/dfuse/pil4dfs/hook.h b/src/client/dfuse/pil4dfs/hook.h index 7742faaff53..b686d99ce4e 100644 --- a/src/client/dfuse/pil4dfs/hook.h +++ b/src/client/dfuse/pil4dfs/hook.h @@ -60,4 +60,10 @@ free_memory_in_hook(void); char * query_pil4dfs_path(void); +/** + * return glibc version in current process + */ +float +query_libc_version(void); + #endif diff --git a/src/client/dfuse/pil4dfs/int_dfs.c b/src/client/dfuse/pil4dfs/int_dfs.c index fff49d9f0ae..84567ac5f2d 100644 --- a/src/client/dfuse/pil4dfs/int_dfs.c +++ b/src/client/dfuse/pil4dfs/int_dfs.c @@ -161,6 +161,7 @@ static long int page_size; #define DAOS_INIT_RUNNING 1 static _Atomic uint64_t mpi_init_count; +static _Atomic int64_t zeInit_count; static long int daos_initing; _Atomic bool d_daos_inited; @@ -470,6 +471,10 @@ static int (*next_tcgetattr)(int fd, void *termios_p); /* end NOT supported by DAOS */ static int (*next_mpi_init)(int *argc, char ***argv); +static int (*next_pmpi_init)(int *argc, char ***argv); +static int (*next_ze_init)(int flags); +static void *(*next_dlsym)(void *handle, const char *symbol); +static void *(*new_dlsym)(void *handle, const char *symbol); /* to do!! */ /** @@ -1041,6 +1046,159 @@ MPI_Init(int *argc, char ***argv) return rc; } +int +PMPI_Init(int *argc, char ***argv) +{ + int rc; + + if (next_pmpi_init == NULL) { + next_pmpi_init = dlsym(RTLD_NEXT, "PMPI_Init"); + D_ASSERT(next_pmpi_init != NULL); + } + + atomic_fetch_add_relaxed(&mpi_init_count, 1); + rc = next_pmpi_init(argc, argv); + atomic_fetch_add_relaxed(&mpi_init_count, -1); + return rc; +} + +int +zeInit(int flags) +{ + int rc; + + if (next_ze_init == NULL) { + if (d_hook_enabled) + next_ze_init = next_dlsym(RTLD_NEXT, "zeInit"); + else + next_ze_init = dlsym(RTLD_NEXT, "zeInit"); + } + D_ASSERT(next_ze_init != NULL); + atomic_fetch_add_relaxed(&zeInit_count, 1); + rc = next_ze_init(flags); + atomic_fetch_add_relaxed(&zeInit_count, -1); + return rc; +} + +#if defined(__x86_64__) +/* This is used to work around compiling warning and limitations of using asm function. */ +static void * +query_new_dlsym_addr(void *addr) +{ + int i; + + /* assume little endian */ + for (i = 0; i < 64; i++) { + /* 0x56579090 is corresponding to the first four instructions at new_dlsym_asm. + * 0x90 - nop, 0x90 - nop, 0x57 - push %rdi, 0x56 - push %rsi + */ + if (*((int *)(addr + i)) == 0x56579090) { + /* two nop are added for easier positioning. offset +2 here to skip two + * nop and start from the real entry. + */ + return ((void *)(addr + i + 2)); + } + } + return NULL; +} + +_Pragma("GCC diagnostic push") +_Pragma("GCC diagnostic ignored \"-Wunused-function\"") +_Pragma("GCC diagnostic ignored \"-Wunused-variable\"") + +_Pragma("GCC push_options") +_Pragma("GCC optimize(\"-O0\")") +static char str_zeinit[] = "zeInit"; + +static int +is_hook_enabled(void) +{ + return (d_hook_enabled ? 
(1) : (0)); +} + +/* This wrapper function is introduced to avoid compiling issue with Intel-C on Leap 15.5 */ +static int +my_strcmp(const char *s1, const char *s2) +{ + return strcmp(s1, s2); +} + +static void * +get_zeinit_addr(void) +{ + return (void *)zeInit; +} + +__attribute__((aligned(16))) static void +new_dlsym_marker(void) +{ +} + +__asm__( + "new_dlsym_asm:\n" + "nop\n" + "nop\n" + "push %rdi\n" + "push %rsi\n" + + "call is_hook_enabled\n" + "test %eax,%eax\n" + "je org_dlsym\n" + + "mov %rsi, %rdi\n" + "lea str_zeinit(%rip), %rsi\n" + "call my_strcmp\n" + "test %eax,%eax\n" + "jne org_dlsym\n" + + "pop %rsi\n" + "pop %rdi\n" + "call *next_dlsym(%rip)\n" + "mov %rax, next_ze_init(%rip)\n" + + "test %eax,%eax\n" + "jne found\n" + "ret\n" + + "found:\n" + "call get_zeinit_addr\n" + "ret\n" + + "org_dlsym:\n" + "pop %rsi\n" + "pop %rdi\n" + "jmp *next_dlsym(%rip)\n" +); +_Pragma("GCC pop_options") +_Pragma("GCC diagnostic pop") + +#else +/* c code for other architecture. caller info could be wrong inside libc dlsym() when handle is set + * RTLD_NEXT. Assembly version implementation similar to above is needed to fix the issue by using + * jump instead of call instruction. + */ +static void * +new_dlsym_c(void *handle, const char *symbol) +{ + if (!d_hook_enabled) + goto org_dlsym; + printf("Inside my dlsym().\n"); + if (strcmp(symbol, "zeInit") != 0) + goto org_dlsym; + + next_ze_init = next_dlsym(handle, symbol); + if (next_ze_init) + /* dlsym() finished successfully, then intercept zeInit() */ + return zeInit; + else + return next_ze_init; + +org_dlsym: + /* Ideally we need to adjust stack and jump to next_dlsym(). */ + return next_dlsym(handle, symbol); +} +#endif + /** determine whether a path (both relative and absolute) is on DAOS or not. If yes, * returns parent object, item name, full path of parent dir, full absolute path, and * the pointer to struct dfs_mt. @@ -1147,6 +1305,15 @@ query_path(const char *szInput, int *is_target_path, struct dcache_rec **parent, goto out_normal; } + /* Check whether zeInit() is running. If yes, pass to the original + * libc functions. Avoid possible zeInit reentrancy/nested call. + */ + + if (atomic_load_relaxed(&zeInit_count) > 0) { + *is_target_path = 0; + goto out_normal; + } + /* daos_init() is expensive to call. We call it only when necessary. */ /* Check whether daos_init() is running. If yes, pass to the original @@ -2017,6 +2184,7 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char if (!is_target_path) goto org_func; + atomic_fetch_add_relaxed(&num_open, 1); if (oflags & O_CREAT && (oflags & O_DIRECTORY || oflags & O_PATH)) { /* Create a dir is not supported. */ errno = ENOENT; @@ -2044,7 +2212,6 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char } /* Need to create a fake fd and associate with fd_kernel */ - atomic_fetch_add_relaxed(&num_open, 1); dfs_get_mode(dfs_obj, &mode_query); /* regular file */ @@ -2220,7 +2387,6 @@ open_common(int (*real_open)(const char *pathname, int oflags, ...), const char return (idx_dirfd + FD_DIR_BASE); } - atomic_fetch_add_relaxed(&num_open, 1); rc = find_next_available_fd(NULL, &idx_fd); if (rc) @@ -5997,7 +6163,7 @@ ioctl(int fd, unsigned long request, ...) va_list arg; void *param; struct dfuse_user_reply *reply; - int fd_directed; + int fd_directed = fd; va_start(arg, request); param = va_arg(arg, void *); @@ -6023,12 +6189,11 @@ ioctl(int fd, unsigned long request, ...) 
return next_ioctl(fd, request, param); fd_directed = d_get_fd_redirected(fd); - if (fd_directed < FD_FILE_BASE) + if ((fd_directed < FD_FILE_BASE) || (fd_directed >= (FD_DIR_BASE + MAX_OPENED_DIR))) return next_ioctl(fd, request, param); errno = ENOTSUP; - - return -1; + return (-1); } int @@ -6224,6 +6389,14 @@ new_mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) atomic_fetch_add_relaxed(&num_mmap, 1); + if ((fd < FD_FILE_BASE) && (fd_directed >= FD_FILE_BASE) && d_compatible_mode) { + /* DAOS-14494: Force the kernel to update the size before mapping. */ + rc = next_fxstat(1, fd, &stat_buf); + if (rc == -1) + return MAP_FAILED; + return next_mmap(addr, length, prot, flags, fd, offset); + } + addr_ret = next_mmap(addr, length, prot, flags | MAP_ANONYMOUS, -1, offset); if (addr_ret == MAP_FAILED) return MAP_FAILED; @@ -6693,6 +6866,18 @@ check_exe_sh_bash(void) return; } +#define SMALL_DIFF (0.0001) +static int +libc_ver_cmp(float ver_a, float ver_b) +{ + if ((ver_a + SMALL_DIFF) < ver_b) + return (-1); + else if (ver_a > (ver_b + SMALL_DIFF)) + return (1); + else + return (0); +} + static __attribute__((constructor)) void init_myhook(void) { @@ -6700,6 +6885,7 @@ init_myhook(void) char *env_log; int rc; uint64_t eq_count_loc = 0; + float libc_version; umask_old = umask(0); umask(umask_old); @@ -6842,14 +7028,24 @@ init_myhook(void) register_a_hook("libc", "fcntl", (void *)new_fcntl, (long int *)(&libc_fcntl)); - if (d_compatible_mode == false) { - register_a_hook("libc", "mmap", (void *)new_mmap, (long int *)(&next_mmap)); - register_a_hook("libc", "munmap", (void *)new_munmap, (long int *)(&next_munmap)); - } + register_a_hook("libc", "mmap", (void *)new_mmap, (long int *)(&next_mmap)); + register_a_hook("libc", "munmap", (void *)new_munmap, (long int *)(&next_munmap)); register_a_hook("libc", "exit", (void *)new_exit, (long int *)(&next_exit)); register_a_hook("libc", "dup3", (void *)new_dup3, (long int *)(&libc_dup3)); +#if defined(__x86_64__) + new_dlsym = query_new_dlsym_addr(new_dlsym_marker); +#else + new_dlsym = new_dlsym_c; +#endif + D_ASSERT(new_dlsym != NULL); + libc_version = query_libc_version(); + if (libc_ver_cmp(libc_version, 2.34) < 0) + register_a_hook("libdl", "dlsym", (void *)new_dlsym, (long int *)(&next_dlsym)); + else + register_a_hook("libc", "dlsym", (void *)new_dlsym, (long int *)(&next_dlsym)); + init_fd_dup2_list(); /* Need to check whether current process is bash or not under regular & compatible modes.*/ @@ -6861,6 +7057,10 @@ init_myhook(void) dcache_rec_timeout = 0; install_hook(); + + /* Check it here to minimize the work in function new_dlsym() written in assembly */ + D_ASSERT(next_dlsym != NULL); + d_hook_enabled = 1; hook_enabled_bak = d_hook_enabled; } diff --git a/src/client/dfuse/pil4dfs/pil4dfs_int.h b/src/client/dfuse/pil4dfs/pil4dfs_int.h index a9c54b55555..0693123b51f 100644 --- a/src/client/dfuse/pil4dfs/pil4dfs_int.h +++ b/src/client/dfuse/pil4dfs/pil4dfs_int.h @@ -30,7 +30,7 @@ /* FD_FILE_BASE - The base number of the file descriptor for a directory. * The fd allocate from this lib is always larger than FD_FILE_BASE. 
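 * (FD_DIR_BASE below is now derived as FD_FILE_BASE + MAX_OPENED_FILE, keeping directory fds in a contiguous range directly above file fds so range checks such as ioctl()'s FD_DIR_BASE + MAX_OPENED_DIR upper bound remain valid.)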
*/ -#define FD_DIR_BASE (0x40000000) +#define FD_DIR_BASE (FD_FILE_BASE + MAX_OPENED_FILE) /* structure allocated for a FD for a file */ struct file_obj { diff --git a/src/common/lru.c b/src/common/lru.c index de86d367e0e..87dbdaddaa9 100644 --- a/src/common/lru.c +++ b/src/common/lru.c @@ -255,7 +255,9 @@ void daos_lru_ref_release(struct daos_lru_cache *lcache, struct daos_llink *llink) { D_ASSERT(lcache != NULL && llink != NULL && llink->ll_ref > 1); - D_ASSERT(d_list_empty(&llink->ll_qlink)); + D_ASSERTF(d_list_empty(&llink->ll_qlink), + "May hit corrupted item in LRU cache %p: llink %p, refs %d, prev %p, next %p\n", + lcache, llink, llink->ll_ref, llink->ll_qlink.prev, llink->ll_qlink.next); lru_hop_rec_decref(&lcache->dlc_htable, &llink->ll_link); diff --git a/src/container/srv_container.c b/src/container/srv_container.c index 372da43afe4..91d87d9b978 100644 --- a/src/container/srv_container.c +++ b/src/container/srv_container.c @@ -1555,9 +1555,9 @@ cont_destroy(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, * - Users who can delete any container in the pool * - Users who have been given access to delete the specific container */ - if (!ds_sec_pool_can_delete_cont(pool_hdl->sph_sec_capas) && - !ds_sec_cont_can_delete(pool_hdl->sph_flags, &pool_hdl->sph_cred, - &owner, acl)) { + if (pool_hdl->sph_pool->sp_immutable || + (!ds_sec_pool_can_delete_cont(pool_hdl->sph_sec_capas) && + !ds_sec_cont_can_delete(pool_hdl->sph_flags, &pool_hdl->sph_cred, &owner, acl))) { D_ERROR(DF_CONT": permission denied to delete cont\n", DP_CONT(pool_hdl->sph_pool->sp_uuid, cont->c_uuid)); D_GOTO(out_prop, rc = -DER_NO_PERM); @@ -2253,6 +2253,15 @@ cont_open(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, cr goto out; } + if (pool_hdl->sph_pool->sp_immutable && (flags & DAOS_COO_IO_BASE_MASK) != DAOS_COO_RO) { + rc = -DER_NO_PERM; + D_ERROR(DF_UUID "/" DF_UUID "/" DF_UUID ": failed to open the immutable " + "container with flags " DF_X64 ", sec_capas " DF_X64 ": " DF_RC "\n", + DP_UUID(cont->c_svc->cs_pool_uuid), DP_UUID(pool_hdl->sph_uuid), + DP_UUID(cont->c_uuid), flags, pool_hdl->sph_sec_capas, DP_RC(rc)); + goto out; + } + /* * Need props to check for pool redundancy requirements and access * control. 
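Reviewer note: the cont_open() hunk above denies any open of a container in an immutable pool unless the base I/O mode is read-only, mirroring the cont_destroy() gate added earlier in this file. A self-contained sketch of that gating logic, with illustrative stand-in values for the DAOS_COO_* flags and error code (the real definitions live in the DAOS headers):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in flag values for illustration only; not the daos_cont.h values. */
#define DAOS_COO_RO           (1U << 0)
#define DAOS_COO_RW           (1U << 1)
#define DAOS_COO_EX           (1U << 2)
#define DAOS_COO_IO_BASE_MASK (DAOS_COO_RO | DAOS_COO_RW | DAOS_COO_EX)
#define DER_NO_PERM           1007 /* placeholder error number */

static int
check_immutable_open(bool pool_immutable, uint64_t flags)
{
	/* Same shape as the patch: only the base I/O mode bits are inspected,
	 * and anything beyond read-only on an immutable pool is refused. */
	if (pool_immutable && (flags & DAOS_COO_IO_BASE_MASK) != DAOS_COO_RO)
		return -DER_NO_PERM;
	return 0;
}

int
main(void)
{
	printf("RO open: %d\n", check_immutable_open(true, DAOS_COO_RO)); /* allowed: 0 */
	printf("RW open: %d\n", check_immutable_open(true, DAOS_COO_RW)); /* denied */
	return 0;
}
```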
@@ -2274,6 +2283,11 @@ cont_open(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, cr D_GOTO(out, rc); } + D_DEBUG(DB_MD, DF_UUID "/" DF_UUID "/" DF_UUID ": opening container with flags " + DF_X64", sec_capas " DF_X64 "/" DF_X64 "\n", + DP_UUID(cont->c_svc->cs_pool_uuid), DP_UUID(pool_hdl->sph_uuid), + DP_UUID(cont->c_uuid), flags, pool_hdl->sph_sec_capas, sec_capas); + if ((flags & DAOS_COO_EVICT_ALL) && !ds_sec_cont_can_evict_all(sec_capas)) { D_ERROR(DF_CONT": permission denied evicting all handles\n", DP_CONT(cont->c_svc->cs_pool_uuid, cont->c_uuid)); @@ -2282,11 +2296,15 @@ cont_open(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, cr goto out; } - if ((flags & DAOS_COO_EX) && !ds_sec_cont_can_open_ex(sec_capas)) { - D_ERROR(DF_CONT": permission denied opening exclusively\n", - DP_CONT(cont->c_svc->cs_pool_uuid, cont->c_uuid)); - daos_prop_free(prop); + if (((flags & DAOS_COO_EX) && !ds_sec_cont_can_open_ex(sec_capas)) || + ((flags & DAOS_COO_RW) && !ds_sec_cont_can_modify(sec_capas))) { rc = -DER_NO_PERM; + D_ERROR(DF_UUID "/" DF_UUID "/" DF_UUID ": failed to open the container " + "with flags " DF_X64 ", capas " DF_X64 "/" DF_X64 ": " DF_RC "\n", + DP_UUID(cont->c_svc->cs_pool_uuid), DP_UUID(pool_hdl->sph_uuid), + DP_UUID(cont->c_uuid), flags, pool_hdl->sph_sec_capas, sec_capas, + DP_RC(rc)); + daos_prop_free(prop); goto out; } diff --git a/src/container/srv_target.c b/src/container/srv_target.c index b0f3b693580..caf2a4b2ee7 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -475,7 +475,7 @@ cont_aggregate_interval(struct ds_cont_child *cont, cont_aggregate_cb_t cb, if (rc == -DER_SHUTDOWN) { break; /* pool destroyed */ } else if (rc < 0) { - DL_CDEBUG(rc == -DER_BUSY, DB_EPC, DLOG_ERR, rc, + DL_CDEBUG(rc == -DER_BUSY || rc == -DER_INPROGRESS, DB_EPC, DLOG_ERR, rc, DF_CONT ": %s aggregate failed", DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), param->ap_vos_agg ? "VOS" : "EC"); @@ -669,6 +669,7 @@ cont_child_alloc_ref(void *co_uuid, unsigned int ksize, void *po_uuid, cont->sc_dtx_committable_coll_count = 0; D_INIT_LIST_HEAD(&cont->sc_dtx_cos_list); D_INIT_LIST_HEAD(&cont->sc_dtx_coll_list); + D_INIT_LIST_HEAD(&cont->sc_dtx_batched_list); *link = &cont->sc_list; return 0; @@ -935,7 +936,7 @@ cont_child_start(struct ds_pool_child *pool_child, const uuid_t co_uuid, cont_child->sc_stopping, cont_child->sc_destroying); rc = -DER_SHUTDOWN; } else if (!cont_child_started(cont_child)) { - if (!ds_pool_skip_for_check(pool_child->spc_pool)) { + if (!ds_pool_restricted(pool_child->spc_pool, false)) { rc = cont_start_agg(cont_child); if (rc != 0) goto out; @@ -1591,11 +1592,15 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid, * but for creating rebuild global container handle. 
*/ D_ASSERT(hdl->sch_cont != NULL); + D_ASSERT(hdl->sch_cont->sc_pool != NULL); hdl->sch_cont->sc_open++; if (hdl->sch_cont->sc_open > 1) goto opened; + if (ds_pool_restricted(hdl->sch_cont->sc_pool->spc_pool, false)) + goto csum_init; + rc = dtx_cont_open(hdl->sch_cont); if (rc != 0) { D_ASSERTF(hdl->sch_cont->sc_open == 1, "Unexpected open count for cont " @@ -1623,10 +1628,8 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid, D_GOTO(err_dtx, rc); } - D_ASSERT(hdl->sch_cont != NULL); - D_ASSERT(hdl->sch_cont->sc_pool != NULL); +csum_init: rc = ds_cont_csummer_init(hdl->sch_cont); - if (rc != 0) D_GOTO(err_dtx, rc); } diff --git a/src/control/common/proto/ctl/storage_nvme.pb.go b/src/control/common/proto/ctl/storage_nvme.pb.go index 62fede43ed4..cb2dc5099d4 100644 --- a/src/control/common/proto/ctl/storage_nvme.pb.go +++ b/src/control/common/proto/ctl/storage_nvme.pb.go @@ -95,11 +95,12 @@ type ScanNvmeReq struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields - Health bool `protobuf:"varint,1,opt,name=Health,proto3" json:"Health,omitempty"` // Retrieve NVMe device health statistics - Meta bool `protobuf:"varint,2,opt,name=Meta,proto3" json:"Meta,omitempty"` // Retrieve metadata relating to NVMe device - Basic bool `protobuf:"varint,3,opt,name=Basic,proto3" json:"Basic,omitempty"` // Strip NVMe device details to only basic - MetaSize uint64 `protobuf:"varint,4,opt,name=MetaSize,proto3" json:"MetaSize,omitempty"` // Size of the metadata blob - RdbSize uint64 `protobuf:"varint,5,opt,name=RdbSize,proto3" json:"RdbSize,omitempty"` // Size of the RDB blob + Health bool `protobuf:"varint,1,opt,name=Health,proto3" json:"Health,omitempty"` // Retrieve NVMe device health statistics + Meta bool `protobuf:"varint,2,opt,name=Meta,proto3" json:"Meta,omitempty"` // Retrieve metadata relating to NVMe device + Basic bool `protobuf:"varint,3,opt,name=Basic,proto3" json:"Basic,omitempty"` // Strip NVMe device details to only basic + MetaSize uint64 `protobuf:"varint,4,opt,name=MetaSize,proto3" json:"MetaSize,omitempty"` // Size of the metadata blob + RdbSize uint64 `protobuf:"varint,5,opt,name=RdbSize,proto3" json:"RdbSize,omitempty"` // Size of the RDB blob + LinkStats bool `protobuf:"varint,6,opt,name=LinkStats,proto3" json:"LinkStats,omitempty"` // Populate PCIe link info in health statistics } func (x *ScanNvmeReq) Reset() { @@ -169,6 +170,13 @@ func (x *ScanNvmeReq) GetRdbSize() uint64 { return 0 } +func (x *ScanNvmeReq) GetLinkStats() bool { + if x != nil { + return x.LinkStats + } + return false +} + type ScanNvmeResp struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache @@ -276,7 +284,7 @@ var file_ctl_storage_nvme_proto_rawDesc = []byte{ 0x32, 0x12, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, 0x1b, 0x0a, 0x09, 0x72, 0x6f, 0x6c, 0x65, 0x5f, 0x62, 0x69, 0x74, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x08, - 0x72, 0x6f, 0x6c, 0x65, 0x42, 0x69, 0x74, 0x73, 0x22, 0x85, 0x01, 0x0a, 0x0b, 0x53, 0x63, 0x61, + 0x72, 0x6f, 0x6c, 0x65, 0x42, 0x69, 0x74, 0x73, 0x22, 0xa3, 0x01, 0x0a, 0x0b, 0x53, 0x63, 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x12, 0x16, 0x0a, 0x06, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x18, 0x01, 0x20, 0x01, 0x28, 0x08, 0x52, 0x06, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x12, 0x12, 0x0a, 0x04, 0x4d, 0x65, 0x74, 0x61, 0x18, 0x02, 0x20, 0x01, 0x28, 0x08, 0x52, 0x04, @@ -285,18 +293,20 @@ var 
file_ctl_storage_nvme_proto_rawDesc = []byte{ 0x74, 0x61, 0x53, 0x69, 0x7a, 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x08, 0x4d, 0x65, 0x74, 0x61, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x52, 0x64, 0x62, 0x53, 0x69, 0x7a, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x52, 0x64, 0x62, 0x53, 0x69, 0x7a, 0x65, - 0x22, 0x65, 0x0a, 0x0c, 0x53, 0x63, 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x73, 0x70, - 0x12, 0x2b, 0x0a, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, - 0x32, 0x13, 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x4e, 0x76, 0x6d, 0x65, 0x43, 0x6f, 0x6e, 0x74, 0x72, - 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x52, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x12, 0x28, 0x0a, - 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x63, - 0x74, 0x6c, 0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, - 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x22, 0x0f, 0x0a, 0x0d, 0x46, 0x6f, 0x72, 0x6d, 0x61, - 0x74, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x42, 0x39, 0x5a, 0x37, 0x67, 0x69, 0x74, 0x68, - 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, - 0x6b, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, - 0x6f, 0x6c, 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, - 0x63, 0x74, 0x6c, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, + 0x12, 0x1c, 0x0a, 0x09, 0x4c, 0x69, 0x6e, 0x6b, 0x53, 0x74, 0x61, 0x74, 0x73, 0x18, 0x06, 0x20, + 0x01, 0x28, 0x08, 0x52, 0x09, 0x4c, 0x69, 0x6e, 0x6b, 0x53, 0x74, 0x61, 0x74, 0x73, 0x22, 0x65, + 0x0a, 0x0c, 0x53, 0x63, 0x61, 0x6e, 0x4e, 0x76, 0x6d, 0x65, 0x52, 0x65, 0x73, 0x70, 0x12, 0x2b, + 0x0a, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x13, + 0x2e, 0x63, 0x74, 0x6c, 0x2e, 0x4e, 0x76, 0x6d, 0x65, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, + 0x6c, 0x65, 0x72, 0x52, 0x06, 0x63, 0x74, 0x72, 0x6c, 0x72, 0x73, 0x12, 0x28, 0x0a, 0x05, 0x73, + 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x63, 0x74, 0x6c, + 0x2e, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, + 0x73, 0x74, 0x61, 0x74, 0x65, 0x22, 0x0f, 0x0a, 0x0d, 0x46, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4e, + 0x76, 0x6d, 0x65, 0x52, 0x65, 0x71, 0x42, 0x39, 0x5a, 0x37, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, + 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x61, 0x6f, 0x73, 0x2d, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x2f, + 0x64, 0x61, 0x6f, 0x73, 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, + 0x2f, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2f, 0x63, 0x74, + 0x6c, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( diff --git a/src/control/lib/control/storage.go b/src/control/lib/control/storage.go index 65942a6c12d..9d5fe470de6 100644 --- a/src/control/lib/control/storage.go +++ b/src/control/lib/control/storage.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -258,6 +258,8 @@ func StorageScan(ctx context.Context, rpcClient UnaryInvoker, req *StorageScanRe // Health and meta details required to populate usage statistics. Health: req.NvmeHealth || req.Usage, Meta: req.Usage, + // Only request link stats if health explicitly requested. 
+ LinkStats: req.NvmeHealth, }, }) }) diff --git a/src/control/lib/hardware/pciutils/bindings.go b/src/control/lib/hardware/pciutils/bindings.go index 86686896c80..f4d43c50fa3 100644 --- a/src/control/lib/hardware/pciutils/bindings.go +++ b/src/control/lib/hardware/pciutils/bindings.go @@ -40,6 +40,7 @@ var ( ErrMultiDevices = errors.New("want single device config got multiple") ErrCfgNotTerminated = errors.New("device config content not new-line terminated") ErrCfgMissing = errors.New("incomplete device config") + ErrNoPCIeCaps = errors.New("no pci-express capabilities found") ) // api provides the PCIeLinkStatsProvider interface by exposing a concrete implementation of @@ -150,7 +151,7 @@ func (ap *api) PCIeCapsFromConfig(cfgBytes []byte, dev *hardware.PCIDevice) erro var cp *C.struct_pci_cap = C.pci_find_cap(pciDev, C.PCI_CAP_ID_EXP, C.PCI_CAP_NORMAL) if cp == nil { - return errors.New("no pci-express capabilities found") + return ErrNoPCIeCaps } cpAddr := uint32(cp.addr) diff --git a/src/control/lib/telemetry/counter.go b/src/control/lib/telemetry/counter.go index 81549a32daf..e6e59a8d0ea 100644 --- a/src/control/lib/telemetry/counter.go +++ b/src/control/lib/telemetry/counter.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -37,18 +37,22 @@ func (c *Counter) FloatValue() float64 { } func (c *Counter) Value() uint64 { + ctrVal := BadUintVal if c.handle == nil || c.node == nil { - return BadUintVal + return ctrVal } - var val C.uint64_t - - res := C.d_tm_get_counter(c.handle.ctx, &val, c.node) - if res == C.DER_SUCCESS { - return uint64(val) + fetch := func() C.int { + var val C.uint64_t + res := C.d_tm_get_counter(c.handle.ctx, &val, c.node) + if res == C.DER_SUCCESS { + ctrVal = uint64(val) + } + return res } + c.fetchValWithRetry(fetch) - return BadUintVal + return ctrVal } func newCounter(hdl *handle, path string, name *string, node *C.struct_d_tm_node_t) *Counter { diff --git a/src/control/lib/telemetry/duration.go b/src/control/lib/telemetry/duration.go index 1f32125bc90..3cfd240bde7 100644 --- a/src/control/lib/telemetry/duration.go +++ b/src/control/lib/telemetry/duration.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -34,18 +34,22 @@ func (d *Duration) Type() MetricType { } func (d *Duration) Value() time.Duration { + durValue := BadDuration if d.handle == nil || d.node == nil { - return BadDuration + return durValue } - var tms C.struct_timespec - - res := C.d_tm_get_duration(d.handle.ctx, &tms, &d.stats, d.node) - if res == C.DER_SUCCESS { - return time.Duration(tms.tv_sec)*time.Second + time.Duration(tms.tv_nsec)*time.Nanosecond + fetch := func() C.int { + var tms C.struct_timespec + res := C.d_tm_get_duration(d.handle.ctx, &tms, &d.stats, d.node) + if res == C.DER_SUCCESS { + durValue = time.Duration(tms.tv_sec)*time.Second + time.Duration(tms.tv_nsec)*time.Nanosecond + } + return res } + d.fetchValWithRetry(fetch) - return BadDuration + return durValue } func (d *Duration) FloatValue() float64 { diff --git a/src/control/lib/telemetry/gauge.go b/src/control/lib/telemetry/gauge.go index ea84ff90504..93db24ab9fc 100644 --- a/src/control/lib/telemetry/gauge.go +++ b/src/control/lib/telemetry/gauge.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -41,18 +41,22 @@ func (g *Gauge) FloatValue() float64 { // Value returns the value as an unsigned integer. func (g *Gauge) Value() uint64 { + gaugeVal := BadUintVal if g.handle == nil || g.node == nil { - return BadUintVal + return gaugeVal } - var val C.uint64_t - - res := C.d_tm_get_gauge(g.handle.ctx, &val, nil, g.node) - if res == C.DER_SUCCESS { - return uint64(val) + fetch := func() C.int { + var val C.uint64_t + res := C.d_tm_get_gauge(g.handle.ctx, &val, nil, g.node) + if res == C.DER_SUCCESS { + gaugeVal = uint64(val) + } + return res } + g.fetchValWithRetry(fetch) - return BadUintVal + return gaugeVal } func newGauge(hdl *handle, path string, name *string, node *C.struct_d_tm_node_t) *Gauge { @@ -103,18 +107,22 @@ func (g *StatsGauge) FloatValue() float64 { // Value returns the gauge value as an unsigned integer. func (g *StatsGauge) Value() uint64 { + gaugeVal := BadUintVal if g.handle == nil || g.node == nil { - return BadUintVal + return gaugeVal } - var val C.uint64_t - - res := C.d_tm_get_gauge(g.handle.ctx, &val, &g.stats, g.node) - if res == C.DER_SUCCESS { - return uint64(val) + fetch := func() C.int { + var val C.uint64_t + res := C.d_tm_get_gauge(g.handle.ctx, &val, &g.stats, g.node) + if res == C.DER_SUCCESS { + gaugeVal = uint64(val) + } + return res } + g.fetchValWithRetry(fetch) - return BadUintVal + return gaugeVal } func newStatsGauge(hdl *handle, path string, name *string, node *C.struct_d_tm_node_t) *StatsGauge { diff --git a/src/control/lib/telemetry/snapshot.go b/src/control/lib/telemetry/snapshot.go index 2ffa23296c3..5b2af9f0747 100644 --- a/src/control/lib/telemetry/snapshot.go +++ b/src/control/lib/telemetry/snapshot.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -34,18 +34,22 @@ func (s *Snapshot) Type() MetricType { } func (s *Snapshot) Value() time.Time { + timeVal := time.Time{} // zero val if s.handle == nil || s.node == nil { - return time.Time{} + return timeVal } - var tms C.struct_timespec - - res := C.d_tm_get_timer_snapshot(s.handle.ctx, &tms, s.node) - if res == C.DER_SUCCESS { - return time.Unix(int64(tms.tv_sec), int64(tms.tv_nsec)) + fetch := func() C.int { + var tms C.struct_timespec + res := C.d_tm_get_timer_snapshot(s.handle.ctx, &tms, s.node) + if res == C.DER_SUCCESS { + timeVal = time.Unix(int64(tms.tv_sec), int64(tms.tv_nsec)) + } + return res } + s.fetchValWithRetry(fetch) - return time.Time{} + return timeVal } func (s *Snapshot) FloatValue() float64 { diff --git a/src/control/lib/telemetry/telemetry.go b/src/control/lib/telemetry/telemetry.go index bb0593240b6..479c41e2aab 100644 --- a/src/control/lib/telemetry/telemetry.go +++ b/src/control/lib/telemetry/telemetry.go @@ -84,6 +84,8 @@ const ( BadDuration = time.Duration(BadIntVal) PathSep = filepath.Separator + + maxFetchRetries = 1 ) type ( @@ -304,6 +306,16 @@ func (mb *metricBase) String() string { return strings.TrimSpace(string(buf[:bytes.Index(buf, []byte{0})])) } +func (mb *metricBase) fetchValWithRetry(fetchFn func() C.int) C.int { + var rc C.int + for i := 0; i < maxFetchRetries; i++ { + if rc = fetchFn(); rc == C.DER_SUCCESS { + return rc + } + } + return rc +} + func (sm *statsMetric) Min() uint64 { return uint64(sm.stats.dtm_min) } diff --git a/src/control/lib/telemetry/timestamp.go b/src/control/lib/telemetry/timestamp.go index 97ef5bb1ed9..c787aed488d 100644 --- a/src/control/lib/telemetry/timestamp.go +++ b/src/control/lib/telemetry/timestamp.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2022 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -34,16 +34,22 @@ func (t *Timestamp) Type() MetricType { } func (t *Timestamp) Value() time.Time { - zero := time.Time{} + timeVal := time.Time{} // zero val if t.handle == nil || t.node == nil { - return zero + return timeVal } - var clk C.time_t - res := C.d_tm_get_timestamp(t.handle.ctx, &clk, t.node) - if res == C.DER_SUCCESS { - return time.Unix(int64(clk), 0) + + fetch := func() C.int { + var clk C.time_t + res := C.d_tm_get_timestamp(t.handle.ctx, &clk, t.node) + if res == C.DER_SUCCESS { + timeVal = time.Unix(int64(clk), 0) + } + return res } - return zero + t.fetchValWithRetry(fetch) + + return timeVal } // FloatValue converts the timestamp to time in seconds since the UNIX epoch. 
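Reviewer note: all of the telemetry Value() rewrites above share one shape: the native d_tm_get_*() call moves into a fetch closure driven by fetchValWithRetry(), and the result variable is pre-seeded with the "bad" sentinel so a failed fetch needs no extra branch. With maxFetchRetries currently 1 the loop makes a single attempt, so the wrapper is really a seam for raising the budget later. A rough C rendering of the same pattern, with a dummy callback standing in for d_tm_get_counter():

```c
#include <stdint.h>
#include <stdio.h>

#define DER_SUCCESS       0
#define MAX_FETCH_RETRIES 1           /* mirrors maxFetchRetries in telemetry.go */
#define BAD_UINT_VAL      UINT64_MAX  /* stand-in for the Go BadUintVal sentinel */

typedef int (*fetch_fn_t)(uint64_t *val, void *arg);

/* Drive a fetch callback until it succeeds or the attempt budget runs out. */
static int
fetch_val_with_retry(fetch_fn_t fetch, uint64_t *val, void *arg)
{
	int rc = -1;

	for (int i = 0; i < MAX_FETCH_RETRIES; i++) {
		rc = fetch(val, arg);
		if (rc == DER_SUCCESS)
			break;
	}
	return rc;
}

/* Dummy fetch standing in for d_tm_get_counter(ctx, &val, node). */
static int
dummy_fetch(uint64_t *val, void *arg)
{
	(void)arg;
	*val = 42;
	return DER_SUCCESS;
}

int
main(void)
{
	uint64_t val = BAD_UINT_VAL; /* pre-seed the sentinel, as the Go code does */

	if (fetch_val_with_retry(dummy_fetch, &val, NULL) != DER_SUCCESS)
		printf("fetch failed, keeping sentinel\n");
	printf("val=%llu\n", (unsigned long long)val);
	return 0;
}
```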
diff --git a/src/control/server/ctl_smd_rpc_test.go b/src/control/server/ctl_smd_rpc_test.go index 6378e81d23c..06f1276fa25 100644 --- a/src/control/server/ctl_smd_rpc_test.go +++ b/src/control/server/ctl_smd_rpc_test.go @@ -17,6 +17,7 @@ import ( "github.com/daos-stack/daos/src/control/common/test" "github.com/daos-stack/daos/src/control/drpc" "github.com/daos-stack/daos/src/control/lib/daos" + "github.com/daos-stack/daos/src/control/lib/hardware/pciutils" "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/logging" "github.com/daos-stack/daos/src/control/server/config" @@ -88,6 +89,7 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) { drpcResps map[int][]*mockDrpcResponse harnessStopped bool ioStopped bool + pciDevErr error expResp *ctlpb.SmdQueryResp expErr error }{ @@ -658,6 +660,46 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) { }, expErr: daos.FreeMemError, }, + "list-devices; with health; update link stats": { + req: &ctlpb.SmdQueryReq{ + OmitPools: true, + Rank: uint32(ranklist.NilRank), + Uuid: test.MockUUID(1), + IncludeBioHealth: true, + }, + drpcResps: map[int][]*mockDrpcResponse{ + 0: { + { + Message: &ctlpb.SmdDevResp{ + Devices: []*ctlpb.SmdDevice{pbNormDev(0)}, + }, + }, + }, + 1: { + { + Message: &ctlpb.SmdDevResp{ + Devices: []*ctlpb.SmdDevice{ + func() *ctlpb.SmdDevice { + sd := pbFaultDev(1) + sd.Ctrlr.PciCfg = "ABCD" + return sd + }(), + }, + }, + }, + { + Message: &ctlpb.BioHealthResp{ + Temperature: 1000000, + TempWarn: true, + }, + }, + }, + }, + // Prove mock link stats provider gets called when IncludeBioHealth + // flag is set and Ctrlr.PciCfg string is not empty. + pciDevErr: errors.New("link stats provider fail"), + expErr: errors.New("link stats provider fail"), + }, "ambiguous UUID": { req: &ctlpb.SmdQueryReq{ Rank: uint32(ranklist.NilRank), @@ -680,6 +722,13 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) + linkStatsProv = &mockPCIeLinkStatsProvider{ + pciDevErr: tc.pciDevErr, + } + defer func() { + linkStatsProv = pciutils.NewPCIeLinkStatsProvider() + }() + engineCount := len(tc.drpcResps) if engineCount == 0 { engineCount = 1 diff --git a/src/control/server/instance_storage_rpc.go b/src/control/server/instance_storage_rpc.go index 90d66600dad..3097440f500 100644 --- a/src/control/server/instance_storage_rpc.go +++ b/src/control/server/instance_storage_rpc.go @@ -28,9 +28,14 @@ import ( ) var ( - scanSmd = listSmdDevices - getCtrlrHealth = getBioHealth + // Function pointers to enable mocking. + scanSmd = listSmdDevices + scanHealth = getBioHealth + linkStatsProv = pciutils.NewPCIeLinkStatsProvider() + + // Sentinel errors to enable comparison. errEngineBdevScanEmptyDevList = errors.New("empty device list for engine instance") + errCtrlrHealthSkipped = errors.New("controller health update was skipped") ) // newMntRet creates and populates SCM mount result. @@ -168,13 +173,17 @@ func (ei *EngineInstance) StorageFormatSCM(ctx context.Context, force bool) (mRe } func addLinkInfoToHealthStats(prov hardware.PCIeLinkStatsProvider, pciCfg string, health *ctlpb.BioHealthResp) error { + if health == nil { + return errors.New("nil BioHealthResp") + } + // Convert byte-string to lspci-format. 
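	// (PciCfg carries the controller's PCIe config space as a byte-string;
	// formatBytestring() rewrites it into the lspci-style dump that
	// PCIeCapsFromConfig() expects.)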
sb := new(strings.Builder) formatBytestring(pciCfg, sb) pciDev := &hardware.PCIDevice{} if err := prov.PCIeCapsFromConfig([]byte(sb.String()), pciDev); err != nil { - return err + return errors.Wrap(err, "pciutils lib") } // Copy link details from PCIDevice to health stats. @@ -243,31 +252,74 @@ func publishLinkStatEvents(engine Engine, pciAddr string, stats *ctlpb.BioHealth lastMaxWidthStr, lastWidthStr, stats.LinkPortId) } -func populateCtrlrHealth(ctx context.Context, engine Engine, req *ctlpb.BioHealthReq, ctrlr *ctlpb.NvmeController, prov hardware.PCIeLinkStatsProvider) (bool, error) { - stateName := ctlpb.NvmeDevState_name[int32(ctrlr.DevState)] - if !ctrlr.CanSupplyHealthStats() { - engine.Debugf("skip fetching health stats on device %q in %q state", - ctrlr.PciAddr, stateName) - return false, nil +type ctrlrHealthReq struct { + meta bool + engine Engine + bhReq *ctlpb.BioHealthReq + ctrlr *ctlpb.NvmeController + linkStatsProv hardware.PCIeLinkStatsProvider +} + +// Retrieve NVMe controller health statistics for those in an acceptable state. Return nil health +// resp if in a bad state. +func getCtrlrHealth(ctx context.Context, req ctrlrHealthReq) (*ctlpb.BioHealthResp, error) { + stateName := ctlpb.NvmeDevState_name[int32(req.ctrlr.DevState)] + if !req.ctrlr.CanSupplyHealthStats() { + req.engine.Debugf("skip fetching health stats on device %q in %q state", + req.ctrlr.PciAddr, stateName) + return nil, errCtrlrHealthSkipped } - health, err := getCtrlrHealth(ctx, engine, req) + health, err := scanHealth(ctx, req.engine, req.bhReq) if err != nil { - return false, errors.Wrapf(err, "retrieve health stats for %q (state %q)", ctrlr, + return nil, errors.Wrapf(err, "retrieve health stats for %q (state %q)", req.ctrlr, stateName) } - if ctrlr.PciCfg != "" { - if err := addLinkInfoToHealthStats(prov, ctrlr.PciCfg, health); err != nil { - return false, errors.Wrapf(err, "add link stats for %q", ctrlr) + return health, nil +} + +// Add link state and capability information to input health statistics for the given controller +// then if successful publish events based on link statistic changes. Link updated health stats to +// controller. +func setCtrlrHealthWithLinkInfo(req ctrlrHealthReq, health *ctlpb.BioHealthResp) error { + err := addLinkInfoToHealthStats(req.linkStatsProv, req.ctrlr.PciCfg, health) + if err == nil { + publishLinkStatEvents(req.engine, req.ctrlr.PciAddr, health) + } else { + if errors.Cause(err) != pciutils.ErrNoPCIeCaps { + return errors.Wrapf(err, "add link stats for %q", req.ctrlr) + } + req.engine.Debugf("device %q not reporting PCIe capabilities", req.ctrlr.PciAddr) + } + + return nil +} + +// Update controller health statistics and include link info if required and available. +func populateCtrlrHealth(ctx context.Context, req ctrlrHealthReq) (bool, error) { + health, err := getCtrlrHealth(ctx, req) + if err != nil { + if err == errCtrlrHealthSkipped { + // Nothing to do. 
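			// The controller keeps its existing HealthStats (nil here)
			// and the caller reports healthUpdated == false.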
+ return false, nil } - publishLinkStatEvents(engine, ctrlr.PciAddr, health) + return false, errors.Wrap(err, "get ctrlr health") + } + + if req.linkStatsProv == nil { + req.engine.Debugf("device %q skip adding link stats; nil provider", + req.ctrlr.PciAddr) + } else if req.ctrlr.PciCfg == "" { + req.engine.Debugf("device %q skip adding link stats; empty pci cfg", + req.ctrlr.PciAddr) } else { - engine.Debugf("no pcie config space received for %q, skip add link stats", ctrlr) + if err = setCtrlrHealthWithLinkInfo(req, health); err != nil { + return false, errors.Wrap(err, "set ctrlr health") + } } - ctrlr.HealthStats = health - ctrlr.PciCfg = "" + req.ctrlr.HealthStats = health return true, nil } @@ -305,12 +357,13 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc c := seenCtrlrs[addr] - // Only minimal info provided in standard scan to enable result aggregation across - // homogeneous hosts. engineRank, err := engine.GetRank() if err != nil { return nil, errors.Wrapf(err, "instance %d GetRank", engine.Index()) } + + // Only provide minimal info in standard scan to enable result aggregation across + // homogeneous hosts. nsd := &ctlpb.SmdDevice{ RoleBits: sd.RoleBits, CtrlrNamespaceId: sd.CtrlrNamespaceId, @@ -326,18 +379,29 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc // Populate health if requested. healthUpdated := false if pbReq.Health && c.HealthStats == nil { - bhReq := &ctlpb.BioHealthReq{ - DevUuid: sd.Uuid, - MetaSize: pbReq.MetaSize, - RdbSize: pbReq.RdbSize, + bhReq := &ctlpb.BioHealthReq{DevUuid: sd.Uuid} + if pbReq.Meta { + bhReq.MetaSize = pbReq.MetaSize + bhReq.RdbSize = pbReq.RdbSize } - upd, err := populateCtrlrHealth(ctx, engine, bhReq, c, - pciutils.NewPCIeLinkStatsProvider()) + + chReq := ctrlrHealthReq{ + engine: engine, + bhReq: bhReq, + ctrlr: c, + } + if pbReq.LinkStats { + // Add link stats to health if flag set. + chReq.linkStatsProv = linkStatsProv + } + + healthUpdated, err = populateCtrlrHealth(ctx, chReq) if err != nil { return nil, err } - healthUpdated = upd } + // Used to update health with link stats, now redundant. + c.PciCfg = "" // Populate usage data if requested. if pbReq.Meta { @@ -510,12 +574,20 @@ func smdQueryEngine(ctx context.Context, engine Engine, pbReq *ctlpb.SmdQueryReq continue // Skip health query if UUID doesn't match requested. } if pbReq.IncludeBioHealth { - bhReq := &ctlpb.BioHealthReq{DevUuid: dev.Uuid} - if _, err := populateCtrlrHealth(ctx, engine, bhReq, dev.Ctrlr, - pciutils.NewPCIeLinkStatsProvider()); err != nil { + chReq := ctrlrHealthReq{ + engine: engine, + bhReq: &ctlpb.BioHealthReq{DevUuid: dev.Uuid}, + ctrlr: dev.Ctrlr, + linkStatsProv: linkStatsProv, + } + + if _, err = populateCtrlrHealth(ctx, chReq); err != nil { return nil, err } } + // Used to update health with link stats, now redundant. 
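// populateCtrlrHealth above degrades gracefully: a nil linkStatsProv means
// link stats were not requested, an empty PciCfg means no config space was
// returned, and only a genuine provider failure propagates as an error. A
// small sketch of that guard ladder under hypothetical types, not the DAOS API:

package main

import "fmt"

// Stand-in for hardware.PCIeLinkStatsProvider; for illustration only.
type linkStatsProvider interface {
	capsFromConfig(cfg string) (string, error)
}

type okProvider struct{}

func (okProvider) capsFromConfig(string) (string, error) { return "speed/width", nil }

// Mirrors the guard ladder: nil provider and empty config skip quietly;
// only a provider failure is treated as an error.
func addLinkStats(prov linkStatsProvider, pciCfg string) (string, error) {
	switch {
	case prov == nil:
		return "skip link stats: nil provider", nil
	case pciCfg == "":
		return "skip link stats: empty pci cfg", nil
	default:
		caps, err := prov.capsFromConfig(pciCfg)
		if err != nil {
			return "", err
		}
		return "added " + caps, nil
	}
}

func main() {
	fmt.Println(addLinkStats(nil, "ABCD"))          // nil provider
	fmt.Println(addLinkStats(okProvider{}, ""))     // empty cfg
	fmt.Println(addLinkStats(okProvider{}, "ABCD")) // added speed/width
}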
+ dev.Ctrlr.PciCfg = "" + if pbReq.Uuid != "" && dev.Uuid == pbReq.Uuid { rResp.Devices = []*ctlpb.SmdDevice{dev} found = true diff --git a/src/control/server/instance_storage_rpc_test.go b/src/control/server/instance_storage_rpc_test.go index b199adb6b8d..0f72f739970 100644 --- a/src/control/server/instance_storage_rpc_test.go +++ b/src/control/server/instance_storage_rpc_test.go @@ -21,6 +21,7 @@ import ( "github.com/daos-stack/daos/src/control/common/test" "github.com/daos-stack/daos/src/control/events" "github.com/daos-stack/daos/src/control/lib/hardware" + "github.com/daos-stack/daos/src/control/lib/hardware/pciutils" "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/logging" "github.com/daos-stack/daos/src/control/server/config" @@ -59,24 +60,27 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { } for name, tc := range map[string]struct { - badDevState bool - noPciCfgSpc bool - pciDev *hardware.PCIDevice - pciDevErr error - emptyHealthRes bool - healthErr error - lastStats map[string]*ctlpb.BioHealthResp - expCtrlr *ctlpb.NvmeController - expNotUpdated bool - expErr error - expDispatched []*events.RASEvent - expLastStats map[string]*ctlpb.BioHealthResp + badDevState bool + nilLinkStatsProv bool + noPciCfgSpc bool + pciDev *hardware.PCIDevice + pciDevErr error + healthReq *ctlpb.BioHealthReq + healthRes *ctlpb.BioHealthResp + nilHealthRes bool + healthErr error + lastStats map[string]*ctlpb.BioHealthResp + expCtrlr *ctlpb.NvmeController + expNotUpdated bool + expErr error + expDispatched []*events.RASEvent + expLastStats map[string]*ctlpb.BioHealthResp }{ "bad state; skip health": { badDevState: true, - noPciCfgSpc: true, expCtrlr: &ctlpb.NvmeController{ PciAddr: pciAddr, + PciCfg: "ABCD", }, expNotUpdated: true, }, @@ -88,11 +92,16 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { HealthStats: healthWithLinkStats(0, 0, 0, 0), }, }, + "nil bio health response": { + nilHealthRes: true, + expErr: errors.New("nil BioHealthResp"), + }, "empty bio health response; empty link stats": { - emptyHealthRes: true, - pciDev: new(hardware.PCIDevice), + healthRes: new(ctlpb.BioHealthResp), + pciDev: new(hardware.PCIDevice), expCtrlr: &ctlpb.NvmeController{ PciAddr: pciAddr, + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: new(ctlpb.BioHealthResp), }, @@ -106,9 +115,19 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { pciDevErr: errors.New("fail"), expErr: errors.New("fail"), }, + "update health; add link stats; pciutils lib error; missing pcie caps": { + pciDevErr: pciutils.ErrNoPCIeCaps, + expCtrlr: &ctlpb.NvmeController{ + PciAddr: pciAddr, + PciCfg: "ABCD", + DevState: ctlpb.NvmeDevState_NORMAL, + HealthStats: healthWithLinkStats(0, 0, 0, 0), + }, + }, "update health; add link stats; normal link state; no event published": { expCtrlr: &ctlpb.NvmeController{ PciAddr: pciAddr, + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: proto.MockNvmeHealth(), }, @@ -128,6 +147,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { }, expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(2.5e+9, 1e+9, 4, 4), }, @@ -152,6 +172,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { }, expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(1e+9, 1e+9, 8, 4), }, @@ -167,6 
+188,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(proto.MockNvmeHealth()), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: proto.MockNvmeHealth(), }, @@ -176,6 +198,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(1e+9, 0.5e+9, 4, 4)), expCtrlr: &ctlpb.NvmeController{ PciAddr: pciAddr, + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: proto.MockNvmeHealth(), }, @@ -191,6 +214,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(1e+9, 1e+9, 4, 1)), expCtrlr: &ctlpb.NvmeController{ PciAddr: pciAddr, + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: proto.MockNvmeHealth(), }, @@ -213,6 +237,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(2.5e+9, 1e+9, 4, 4)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(2.5e+9, 1e+9, 4, 4), }, @@ -229,6 +254,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(1e+9, 1e+9, 8, 4)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(1e+9, 1e+9, 8, 4), }, @@ -245,6 +271,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(2.5e+9, 2.5e+9, 8, 4)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(2.5e+9, 1e+9, 8, 8), }, @@ -271,6 +298,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(2.5e+9, 1e+9, 8, 8)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(2.5e+9, 2.5e+9, 8, 4), }, @@ -297,6 +325,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(8e+9, 2.5e+9, 4, 4)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(8e+9, 1e+9, 4, 4), }, @@ -319,6 +348,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { lastStats: lastStatsMap(healthWithLinkStats(1e+9, 1e+9, 16, 8)), expCtrlr: &ctlpb.NvmeController{ PciAddr: test.MockPCIAddr(1), + PciCfg: "ABCD", DevState: ctlpb.NvmeDevState_NORMAL, HealthStats: healthWithLinkStats(1e+9, 1e+9, 16, 4), }, @@ -335,15 +365,11 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) - healthRes := healthWithLinkStats(0, 0, 0, 0) - if tc.emptyHealthRes { - healthRes = new(ctlpb.BioHealthResp) - } - getCtrlrHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) { - return healthRes, tc.healthErr + scanHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) { + return tc.healthRes, tc.healthErr } defer func() { - getCtrlrHealth = getBioHealth + scanHealth = getBioHealth }() var devState ctlpb.NvmeDevState @@ -363,10 +389,16 @@ func TestIOEngineInstance_populateCtrlrHealth(t 
*testing.T) { LinkMaxWidth: 4, } } + if tc.healthRes == nil && !tc.nilHealthRes { + tc.healthRes = healthWithLinkStats(0, 0, 0, 0) + } - mockProv := &mockPCIeLinkStatsProvider{ - pciDev: tc.pciDev, - pciDevErr: tc.pciDevErr, + var mockProv *mockPCIeLinkStatsProvider + if !tc.nilLinkStatsProv { + mockProv = &mockPCIeLinkStatsProvider{ + pciDev: tc.pciDev, + pciDevErr: tc.pciDevErr, + } } ctrlr := &ctlpb.NvmeController{ @@ -387,8 +419,14 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) { subscriber := newMockSubscriber(2) ps.Subscribe(events.RASTypeInfoOnly, subscriber) - upd, err := populateCtrlrHealth(test.Context(t), ei, - &ctlpb.BioHealthReq{}, ctrlr, mockProv) + chReq := ctrlrHealthReq{ + engine: ei, + bhReq: tc.healthReq, + ctrlr: ctrlr, + linkStatsProv: mockProv, + } + + upd, err := populateCtrlrHealth(test.Context(t), chReq) test.CmpErr(t, tc.expErr, err) if err != nil { return @@ -436,6 +474,7 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { } defSmdScanRes := func() *ctlpb.SmdDevResp { sd := proto.MockSmdDevice(c, 2) + sd.Rank = 2 return &ctlpb.SmdDevResp{Devices: []*ctlpb.SmdDevice{sd}} } healthRespWithUsage := func() *ctlpb.BioHealthResp { @@ -444,6 +483,19 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { mh.MetaWalSize, mh.RdbWalSize = 4, 5 return mh } + ctrlrWithUsageAndMeta := func() *ctlpb.NvmeController { + c := proto.MockNvmeController(2) + c.HealthStats = healthRespWithUsage() + sd := proto.MockSmdDevice(nil, 2) + sd.Rank = 1 + sd.TotalBytes = c.HealthStats.TotalBytes + sd.AvailBytes = c.HealthStats.AvailBytes + sd.ClusterSize = c.HealthStats.ClusterSize + sd.MetaWalSize = c.HealthStats.MetaWalSize + sd.RdbWalSize = c.HealthStats.RdbWalSize + c.SmdDevices = []*ctlpb.SmdDevice{sd} + return c + } for name, tc := range map[string]struct { req ctlpb.ScanNvmeReq @@ -640,28 +692,14 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { State: new(ctlpb.ResponseState), }, }, - "scan over drpc; with smd and health; usage and wal size reported": { + "scan over drpc; with meta and health; usage and wal size reported": { req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, rank: 1, smdRes: defSmdScanRes(), healthRes: healthRespWithUsage(), expResp: &ctlpb.ScanNvmeResp{ - Ctrlrs: proto.NvmeControllers{ - func() *ctlpb.NvmeController { - c := proto.MockNvmeController(2) - c.HealthStats = healthRespWithUsage() - sd := proto.MockSmdDevice(nil, 2) - sd.Rank = 1 - sd.TotalBytes = c.HealthStats.TotalBytes - sd.AvailBytes = c.HealthStats.AvailBytes - sd.ClusterSize = c.HealthStats.ClusterSize - sd.MetaWalSize = c.HealthStats.MetaWalSize - sd.RdbWalSize = c.HealthStats.RdbWalSize - c.SmdDevices = []*ctlpb.SmdDevice{sd} - return c - }(), - }, - State: new(ctlpb.ResponseState), + Ctrlrs: proto.NvmeControllers{ctrlrWithUsageAndMeta()}, + State: new(ctlpb.ResponseState), }, }, "scan over drpc; only ctrlrs with valid states shown": { @@ -703,7 +741,7 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { State: new(ctlpb.ResponseState), }, }, - "scan over drpc; with smd and health; missing ctrlr in smd": { + "scan over drpc; with meta and health; missing ctrlr in smd": { req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, smdRes: func() *ctlpb.SmdDevResp { ssr := defSmdScanRes() @@ -713,23 +751,48 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { healthRes: healthRespWithUsage(), expErr: errors.New("no ctrlr ref"), }, - "scan over drpc; with smd and health; health scan fails": { + "scan over drpc; with meta and health; health scan fails": { req: 
ctlpb.ScanNvmeReq{Meta: true, Health: true}, smdRes: defSmdScanRes(), healthErr: errors.New("health scan failed"), expErr: errors.New("health scan failed"), }, - "scan over drpc; with smd and health; smd list fails": { + "scan over drpc; with meta and health; smd list fails": { req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, smdErr: errors.New("smd scan failed"), healthRes: healthRespWithUsage(), expErr: errors.New("smd scan failed"), }, - "scan over drpc; with smd and health; nil smd list returned": { + "scan over drpc; with meta and health; nil smd list returned": { req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, healthRes: healthRespWithUsage(), expErr: errors.New("nil smd scan resp"), }, + "scan over drpc; with meta and health; link info update skipped": { + req: ctlpb.ScanNvmeReq{Meta: true, Health: true}, + rank: 1, + smdRes: func() *ctlpb.SmdDevResp { + ssr := defSmdScanRes() + ssr.Devices[0].Ctrlr.PciCfg = "ABCD" + return ssr + }(), + healthRes: healthRespWithUsage(), + expResp: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ctrlrWithUsageAndMeta()}, + State: new(ctlpb.ResponseState), + }, + }, + "scan over drpc; with health; link info update run but failed": { + req: ctlpb.ScanNvmeReq{Health: true, LinkStats: true}, + smdRes: func() *ctlpb.SmdDevResp { + ssr := defSmdScanRes() + ssr.Devices[0].Ctrlr.PciCfg = "ABCD" + return ssr + }(), + healthRes: healthRespWithUsage(), + // Prove link stat provider gets called when LinkStats flag set. + expErr: errors.New("link stats provider fail"), + }, } { t.Run(name, func(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) @@ -741,11 +804,17 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { defer func() { scanSmd = listSmdDevices }() - getCtrlrHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) { + scanHealth = func(_ context.Context, _ Engine, _ *ctlpb.BioHealthReq) (*ctlpb.BioHealthResp, error) { return tc.healthRes, tc.healthErr } defer func() { - getCtrlrHealth = getBioHealth + scanHealth = getBioHealth + }() + linkStatsProv = &mockPCIeLinkStatsProvider{ + pciDevErr: errors.New("link stats provider fail"), + } + defer func() { + linkStatsProv = pciutils.NewPCIeLinkStatsProvider() }() if tc.provRes == nil { @@ -776,7 +845,8 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { ei.setSuperblock(nil) } else { ei.setSuperblock(&Superblock{ - Rank: ranklist.NewRankPtr(uint32(tc.rank)), ValidRank: true, + Rank: ranklist.NewRankPtr(uint32(tc.rank)), + ValidRank: true, }) } diff --git a/src/control/server/mgmt_system.go b/src/control/server/mgmt_system.go index e85091b5529..a35753d95ba 100644 --- a/src/control/server/mgmt_system.go +++ b/src/control/server/mgmt_system.go @@ -1002,6 +1002,8 @@ func (svc *mgmtSvc) SystemExclude(ctx context.Context, req *mgmtpb.SystemExclude }) } + svc.reqGroupUpdate(ctx, false) + return resp, nil } diff --git a/src/control/server/mgmt_system_test.go b/src/control/server/mgmt_system_test.go index f482b1943a1..0cffe7f63d8 100644 --- a/src/control/server/mgmt_system_test.go +++ b/src/control/server/mgmt_system_test.go @@ -1765,6 +1765,7 @@ func TestServer_MgmtSvc_SystemExclude(t *testing.T) { mockMember(t, 3, 2, "joined"), }, }, + "unexclude hosts": { req: &mgmtpb.SystemExcludeReq{Hosts: test.MockHostAddr(1).String(), Clear: true}, members: system.Members{ @@ -1795,12 +1796,32 @@ func TestServer_MgmtSvc_SystemExclude(t *testing.T) { if tc.req != nil && tc.req.Sys == "" { tc.req.Sys = build.DefaultSystemName } + + startMapVer, err := 
svc.sysdb.CurMapVersion() + if err != nil { + t.Fatalf("startMapVer CurMapVersion() failed\n") + return + } gotResp, gotAPIErr := svc.SystemExclude(ctx, tc.req) test.CmpErr(t, tc.expAPIErr, gotAPIErr) if tc.expAPIErr != nil { return } + // Check for any system map version increase by the (asynchronous) update. + // Test will time out if it never happens, thus choice of an infinite loop here. + for { + curMapVer, err := svc.sysdb.CurMapVersion() + if err != nil { + t.Fatalf("CurMapVersion() failed\n") + return + } + + if curMapVer > startMapVer { + break + } + } + checkRankResults(t, tc.expResults, gotResp.Results) checkMembers(t, tc.expMembers, svc.membership) }) diff --git a/src/dtx/dtx_coll.c b/src/dtx/dtx_coll.c index 863307e9a7f..ba45aaa2616 100644 --- a/src/dtx/dtx_coll.c +++ b/src/dtx/dtx_coll.c @@ -80,8 +80,14 @@ dtx_coll_prep_ult(void *arg) DP_UUID(cont->sc_uuid), DP_RC(rc)); } - if (dcpa->dcpa_result != 0) + if (dcpa->dcpa_result != 0) { + if (dcpa->dcpa_result < 0 && + dcpa->dcpa_result != -DER_INPROGRESS && dcpa->dcpa_result != -DER_NONEXIST) + D_ERROR("Failed to load mbs for "DF_DTI" in "DF_UUID"/"DF_UUID", opc %u: " + DF_RC"\n", DP_DTI(&dci->dci_xid), DP_UUID(dci->dci_po_uuid), + DP_UUID(dci->dci_co_uuid), opc, DP_RC(dcpa->dcpa_result)); goto out; + } dcpa->dcpa_result = dtx_coll_prep(dci->dci_po_uuid, dcpa->dcpa_oid, &dci->dci_xid, mbs, -1, dci->dci_version, cont->sc_pool->spc_map_version, diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c index ff4f2dfe4ef..1ee74ae11a4 100644 --- a/src/dtx/dtx_common.c +++ b/src/dtx/dtx_common.c @@ -392,11 +392,11 @@ dtx_cleanup(void *arg) if (rc == 0) { D_ASSERT(dce != NULL); - rc = dtx_coll_commit(cont, dce, NULL); + rc = dtx_coll_commit(cont, dce, NULL, false); dtx_coll_entry_put(dce); } } else { - rc = dtx_commit(cont, &dte, NULL, 1); + rc = dtx_commit(cont, &dte, NULL, 1, false); } } @@ -620,17 +620,16 @@ dtx_batched_commit_one(void *arg) tls->dt_batched_ult_cnt++; /* dbca->dbca_reg_gen != cont->sc_dtx_batched_gen means someone reopen the container. */ - while (!dss_ult_exiting(dbca->dbca_commit_req) && + while (!dss_ult_exiting(dbca->dbca_commit_req) && dtx_cont_opened(cont) && dbca->dbca_reg_gen == cont->sc_dtx_batched_gen) { struct dtx_entry **dtes = NULL; - struct dtx_cos_key *dcks = NULL; struct dtx_coll_entry *dce = NULL; struct dtx_stat stat = { 0 }; int cnt; int rc; cnt = dtx_fetch_committable(cont, DTX_THRESHOLD_COUNT, NULL, - DAOS_EPOCH_MAX, false, &dtes, &dcks, &dce); + DAOS_EPOCH_MAX, false, &dtes, NULL, &dce); if (cnt == 0) break; @@ -644,11 +643,11 @@ dtx_batched_commit_one(void *arg) /* Currently, commit collective DTX one by one. */ D_ASSERT(cnt == 1); - rc = dtx_coll_commit(cont, dce, dcks); + rc = dtx_coll_commit(cont, dce, NULL, true); } else { - rc = dtx_commit(cont, dtes, dcks, cnt); + rc = dtx_commit(cont, dtes, NULL, cnt, true); } - dtx_free_committable(dtes, dcks, dce, cnt); + dtx_free_committable(dtes, NULL, dce, cnt); if (rc != 0) { D_WARN("Fail to batched commit %d entries for "DF_UUID": "DF_RC"\n", cnt, DP_UUID(cont->sc_uuid), DP_RC(rc)); @@ -1271,9 +1270,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul uint32_t flags; int status = -1; int rc = 0; - int i; bool aborted = false; - bool unpin = false; D_ASSERT(cont != NULL); @@ -1341,7 +1338,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul * it persistently. Otherwise, the subsequent DTX resync may not find it as * to regard it as failed transaction and abort it. 
*/ - if (result == 0 && !dth->dth_active && !dth->dth_prepared && !dth->dth_solo && + if (!dth->dth_active && !dth->dth_prepared && (dth->dth_dist || dth->dth_modification_cnt > 0)) { result = vos_dtx_attach(dth, true, dth->dth_ent != NULL ? true : false); if (unlikely(result < 0)) { @@ -1351,7 +1348,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul } } - if (dth->dth_prepared || dtx_batched_ult_max == 0) { + if ((dth->dth_prepared && !dlh->dlh_coll) || dtx_batched_ult_max == 0) { dth->dth_sync = 1; goto sync; } @@ -1365,14 +1362,12 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul if (DAOS_FAIL_CHECK(DAOS_DTX_MISS_COMMIT)) dth->dth_sync = 1; - /* For synchronous DTX, do not add it into CoS cache, otherwise, - * we may have no way to remove it from the cache. - */ if (dth->dth_sync) goto sync; D_ASSERT(dth->dth_mbs != NULL); +cache: if (dlh->dlh_coll) { rc = dtx_cos_add(cont, dlh->dlh_coll_entry, &dth->dth_leader_oid, dth->dth_dkey_hash, dth->dth_epoch, DCF_EXP_CMT | DCF_COLL); @@ -1380,38 +1375,47 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul size = sizeof(*dte) + sizeof(*mbs) + dth->dth_mbs->dm_data_size; D_ALLOC(dte, size); if (dte == NULL) { - dth->dth_sync = 1; - goto sync; - } - - mbs = (struct dtx_memberships *)(dte + 1); - memcpy(mbs, dth->dth_mbs, size - sizeof(*dte)); - - dte->dte_xid = dth->dth_xid; - dte->dte_ver = dth->dth_ver; - dte->dte_refs = 1; - dte->dte_mbs = mbs; + rc = -DER_NOMEM; + } else { + mbs = (struct dtx_memberships *)(dte + 1); + memcpy(mbs, dth->dth_mbs, size - sizeof(*dte)); + + dte->dte_xid = dth->dth_xid; + dte->dte_ver = dth->dth_ver; + dte->dte_refs = 1; + dte->dte_mbs = mbs; + + if (!(mbs->dm_flags & DMF_SRDG_REP)) + flags = DCF_EXP_CMT; + else if (dth->dth_modify_shared) + flags = DCF_SHARED; + else + flags = 0; - if (!(mbs->dm_flags & DMF_SRDG_REP)) - flags = DCF_EXP_CMT; - else if (dth->dth_modify_shared) - flags = DCF_SHARED; - else - flags = 0; + rc = dtx_cos_add(cont, dte, &dth->dth_leader_oid, dth->dth_dkey_hash, + dth->dth_epoch, flags); + dtx_entry_put(dte); + } + } - rc = dtx_cos_add(cont, dte, &dth->dth_leader_oid, dth->dth_dkey_hash, - dth->dth_epoch, flags); - dtx_entry_put(dte); + /* + * NOTE: If we failed to add the committable DTX into the CoS cache, then we also have no + * way to commit (or abort) the DTX because of the out-of-memory condition. Such a + * DTX will finally be committed via the next DTX resync (after recovering from OOM). + * + * Here, we only log a warning about the trouble instead of failing the transaction. + */ + if (rc != 0) { + D_WARN(DF_UUID": Fail to cache %s DTX "DF_DTI": "DF_RC"\n", + DP_UUID(cont->sc_uuid), dlh->dlh_coll ?
"collective" : "regular", + DP_DTI(&dth->dth_xid), DP_RC(rc)); + D_GOTO(out, result = 0); } - if (rc == 0) { - if (!DAOS_FAIL_CHECK(DAOS_DTX_NO_COMMITTABLE)) { - vos_dtx_mark_committable(dth); - if (cont->sc_dtx_committable_count > DTX_THRESHOLD_COUNT || dlh->dlh_coll) - sched_req_wakeup(dss_get_module_info()->dmi_dtx_cmt_req); - } - } else { - dth->dth_sync = 1; + if (!DAOS_FAIL_CHECK(DAOS_DTX_NO_COMMITTABLE)) { + vos_dtx_mark_committable(dth); + if (cont->sc_dtx_committable_count > DTX_THRESHOLD_COUNT || dlh->dlh_coll) + sched_req_wakeup(dss_get_module_info()->dmi_dtx_cmt_req); } sync: @@ -1424,16 +1428,21 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul vos_dtx_mark_committable(dth); if (dlh->dlh_coll) { - rc = dtx_coll_commit(cont, dlh->dlh_coll_entry, NULL); + rc = dtx_coll_commit(cont, dlh->dlh_coll_entry, NULL, false); } else { dte = &dth->dth_dte; - rc = dtx_commit(cont, &dte, NULL, 1); + rc = dtx_commit(cont, &dte, NULL, 1, false); } - if (rc != 0) + if (rc != 0) { D_WARN(DF_UUID": Fail to sync %s commit DTX "DF_DTI": "DF_RC"\n", DP_UUID(cont->sc_uuid), dlh->dlh_coll ? "collective" : "regular", DP_DTI(&dth->dth_xid), DP_RC(rc)); + if (likely(dtx_batched_ult_max != 0)) { + dth->dth_sync = 0; + goto cache; + } + } /* * NOTE: The semantics of 'sync' commit does not guarantee that all @@ -1453,7 +1462,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul * to locally retry for avoiding related forwarded RPC timeout, instead, * The leader will trigger retry globally without abort 'prepared' ones. */ - if (unpin || (result < 0 && result != -DER_AGAIN && !dth->dth_solo)) { + if (result < 0 && result != -DER_AGAIN && !dth->dth_solo) { /* 1. Drop partial modification for distributed transaction. * 2. Remove the pinned DTX entry. */ @@ -1487,15 +1496,9 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul /* If piggyback DTX has been done everywhere, then need to handle CoS cache. * It is harmless to keep some partially committed DTX entries in CoS cache. */ - if (result == 0 && dth->dth_cos_done) { - for (i = 0; i < dth->dth_dti_cos_count; i++) - dtx_cos_del(cont, &dth->dth_dti_cos[i], - &dth->dth_leader_oid, dth->dth_dkey_hash); - } else { - for (i = 0; i < dth->dth_dti_cos_count; i++) - dtx_cos_put_piggyback(cont, &dth->dth_dti_cos[i], - &dth->dth_leader_oid, dth->dth_dkey_hash); - } + dtx_cos_put_piggyback(cont, &dth->dth_leader_oid, dth->dth_dkey_hash, dth->dth_dti_cos, + dth->dth_dti_cos_count, + (result == 0 && dth->dth_cos_done) ? true : false); D_DEBUG(DB_IO, "Stop the DTX "DF_DTI" ver %u, dkey %lu, %s, cos %d/%d: result "DF_RC"\n", DP_DTI(&dth->dth_xid), dth->dth_ver, (unsigned long)dth->dth_dkey_hash, @@ -1654,7 +1657,8 @@ dtx_flush_on_close(struct dss_module_info *dmi, struct dtx_batched_cont_args *db struct dtx_coll_entry *dce = NULL; cnt = dtx_fetch_committable(cont, DTX_THRESHOLD_COUNT, - NULL, DAOS_EPOCH_MAX, true, &dtes, &dcks, &dce); + NULL, DAOS_EPOCH_MAX, true, &dtes, + dbca->dbca_commit_req != NULL ? 
&dcks : NULL, &dce); if (cnt <= 0) D_GOTO(out, rc = cnt); @@ -1675,9 +1679,9 @@ dtx_flush_on_close(struct dss_module_info *dmi, struct dtx_batched_cont_args *db if (dce != NULL) { D_ASSERT(cnt == 1); - rc = dtx_coll_commit(cont, dce, dcks); + rc = dtx_coll_commit(cont, dce, dcks, true); } else { - rc = dtx_commit(cont, dtes, dcks, cnt); + rc = dtx_commit(cont, dtes, dcks, cnt, true); } dtx_free_committable(dtes, dcks, dce, cnt); } @@ -2365,9 +2369,9 @@ dtx_obj_sync(struct ds_cont_child *cont, daos_unit_oid_t *oid, if (dce != NULL) { D_ASSERT(cnt == 1); - rc = dtx_coll_commit(cont, dce, dcks); + rc = dtx_coll_commit(cont, dce, dcks, true); } else { - rc = dtx_commit(cont, dtes, dcks, cnt); + rc = dtx_commit(cont, dtes, dcks, cnt, true); } dtx_free_committable(dtes, dcks, dce, cnt); if (rc < 0) { diff --git a/src/dtx/dtx_cos.c b/src/dtx/dtx_cos.c index 6e1d042b82b..4c165f94d0c 100644 --- a/src/dtx/dtx_cos.c +++ b/src/dtx/dtx_cos.c @@ -54,6 +54,8 @@ struct dtx_cos_rec_child { d_list_t dcrc_gl_committable; /* Link into related dcr_{reg,prio}_list. */ d_list_t dcrc_lo_link; + /* Link into container::sc_dtx_batched_list. */ + d_list_t dcrc_batched_link; union { struct dtx_entry *dcrc_dte; struct dtx_coll_entry *dcrc_dce; @@ -61,8 +63,12 @@ struct dtx_cos_rec_child { /* The DTX epoch. */ daos_epoch_t dcrc_epoch; struct dtx_cos_rec *dcrc_ptr; + uint64_t dcrc_ready_time; uint32_t dcrc_piggyback_refs; - uint32_t dcrc_coll:1; /* For collective DTX. */ + uint32_t dcrc_expcmt:1, + dcrc_prio:1, + dcrc_reg:1, + dcrc_coll:1; /* For collective DTX. */ }; struct dtx_cos_rec_bundle { @@ -129,6 +135,8 @@ dtx_cos_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, return -DER_NOMEM; } + D_INIT_LIST_HEAD(&dcrc->dcrc_batched_link); + dcrc->dcrc_ready_time = daos_getmtime_coarse(); dcrc->dcrc_epoch = rbund->epoch; dcrc->dcrc_ptr = dcr; if (rbund->flags & DCF_COLL) { @@ -144,12 +152,15 @@ dtx_cos_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, d_tm_inc_gauge(tls->dt_committable, 1); if (rbund->flags & DCF_EXP_CMT) { + dcrc->dcrc_expcmt = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_expcmt_list); dcr->dcr_expcmt_count = 1; } else if (rbund->flags & DCF_SHARED) { + dcrc->dcrc_prio = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list); dcr->dcr_prio_count = 1; } else { + dcrc->dcrc_reg = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_reg_list); dcr->dcr_reg_count = 1; } @@ -177,6 +188,7 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) dcrc_lo_link) { d_list_del(&dcrc->dcrc_lo_link); d_list_del(&dcrc->dcrc_gl_committable); + d_list_del(&dcrc->dcrc_batched_link); if (dcrc->dcrc_coll) { dtx_coll_entry_put(dcrc->dcrc_dce); coll++; @@ -190,6 +202,7 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) dcrc_lo_link) { d_list_del(&dcrc->dcrc_lo_link); d_list_del(&dcrc->dcrc_gl_committable); + d_list_del(&dcrc->dcrc_batched_link); if (dcrc->dcrc_coll) { dtx_coll_entry_put(dcrc->dcrc_dce); coll++; @@ -203,6 +216,7 @@ dtx_cos_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) dcrc_lo_link) { d_list_del(&dcrc->dcrc_lo_link); d_list_del(&dcrc->dcrc_gl_committable); + d_list_del(&dcrc->dcrc_batched_link); if (dcrc->dcrc_coll) { dtx_coll_entry_put(dcrc->dcrc_dce); coll++; @@ -256,6 +270,8 @@ dtx_cos_rec_update(struct btr_instance *tins, struct btr_record *rec, if (dcrc == NULL) return -DER_NOMEM; + D_INIT_LIST_HEAD(&dcrc->dcrc_batched_link); + dcrc->dcrc_ready_time = daos_getmtime_coarse(); dcrc->dcrc_epoch = 
rbund->epoch; dcrc->dcrc_ptr = dcr; if (rbund->flags & DCF_COLL) { @@ -271,12 +287,15 @@ dtx_cos_rec_update(struct btr_instance *tins, struct btr_record *rec, d_tm_inc_gauge(tls->dt_committable, 1); if (rbund->flags & DCF_EXP_CMT) { + dcrc->dcrc_expcmt = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_expcmt_list); dcr->dcr_expcmt_count++; } else if (rbund->flags & DCF_SHARED) { + dcrc->dcrc_prio = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list); dcr->dcr_prio_count++; } else { + dcrc->dcrc_reg = 1; d_list_add_tail(&dcrc->dcrc_lo_link, &dcr->dcr_reg_list); dcr->dcr_reg_count++; } @@ -294,6 +313,53 @@ btr_ops_t dtx_btr_cos_ops = { .to_rec_update = dtx_cos_rec_update, }; +static int +dtx_cos_del_one(struct ds_cont_child *cont, struct dtx_cos_rec_child *dcrc) +{ + struct dtx_cos_key key; + d_iov_t kiov; + struct dtx_cos_rec *dcr = dcrc->dcrc_ptr; + uint64_t time = daos_getmtime_coarse() - dcrc->dcrc_ready_time; + int rc = 0; + + d_list_del(&dcrc->dcrc_gl_committable); + d_list_del(&dcrc->dcrc_lo_link); + if (!d_list_empty(&dcrc->dcrc_batched_link)) + d_list_del_init(&dcrc->dcrc_batched_link); + + if (dcrc->dcrc_expcmt) + dcr->dcr_expcmt_count--; + else if (dcrc->dcrc_prio) + dcr->dcr_prio_count--; + else + dcr->dcr_reg_count--; + + if (dcrc->dcrc_coll) + cont->sc_dtx_committable_coll_count--; + cont->sc_dtx_committable_count--; + + d_tm_set_gauge(dtx_tls_get()->dt_async_cmt_lat, time); + + if (dcr->dcr_reg_count == 0 && dcr->dcr_prio_count == 0 && dcr->dcr_expcmt_count == 0) { + key.oid = dcr->dcr_oid; + key.dkey_hash = dcr->dcr_dkey_hash; + d_iov_set(&kiov, &key, sizeof(key)); + rc = dbtree_delete(cont->sc_dtx_cos_hdl, BTR_PROBE_EQ, &kiov, NULL); + } + + DL_CDEBUG(rc != 0, DLOG_ERR, DB_IO, rc, + "Remove DTX "DF_DTI" from CoS cache", DP_DTI(&dcrc->dcrc_dte->dte_xid)); + + if (dcrc->dcrc_coll) + dtx_coll_entry_put(dcrc->dcrc_dce); + else + dtx_entry_put(dcrc->dcrc_dte); + + D_FREE(dcrc); + + return rc; +} + int dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, daos_unit_oid_t *oid, daos_epoch_t epoch, bool force, @@ -306,18 +372,45 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, uint32_t count; uint32_t i = 0; + /* Last batched commit failed, let's re-commit them. */ + if (dcks == NULL && !d_list_empty(&cont->sc_dtx_batched_list)) { + dcrc = d_list_entry(cont->sc_dtx_batched_list.next, struct dtx_cos_rec_child, + dcrc_batched_link); + if (unlikely(dcrc->dcrc_coll)) { + *p_dce = dtx_coll_entry_get(dcrc->dcrc_dce); + return 1; + } + + D_ALLOC_ARRAY(dte_buf, max_cnt); + if (dte_buf == NULL) + return -DER_NOMEM; + + d_list_for_each_entry(dcrc, &cont->sc_dtx_batched_list, dcrc_batched_link) { + D_ASSERT(i < max_cnt); + dte_buf[i++] = dtx_entry_get(dcrc->dcrc_dte); + } + + *dtes = dte_buf; + return i; + } + /* Process collective DTX with higher priority.
*/ if (!d_list_empty(&cont->sc_dtx_coll_list) && oid == NULL) { d_list_for_each_entry(dcrc, &cont->sc_dtx_coll_list, dcrc_gl_committable) { if (epoch >= dcrc->dcrc_epoch && (dcrc->dcrc_piggyback_refs == 0 || force)) { - D_ALLOC_PTR(dck_buf); - if (dck_buf == NULL) - return -DER_NOMEM; - - dck_buf->oid = dcrc->dcrc_ptr->dcr_oid; - dck_buf->dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; - *dcks = dck_buf; + if (dcks != NULL) { + D_ALLOC_PTR(dck_buf); + if (dck_buf == NULL) + return -DER_NOMEM; + + dck_buf->oid = dcrc->dcrc_ptr->dcr_oid; + dck_buf->dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; + *dcks = dck_buf; + } else { + d_list_add_tail(&dcrc->dcrc_batched_link, + &cont->sc_dtx_batched_list); + } *p_dce = dtx_coll_entry_get(dcrc->dcrc_dce); return 1; @@ -326,19 +419,19 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, } count = min(cont->sc_dtx_committable_count, max_cnt); - if (count == 0) { - *dtes = NULL; + if (count == 0) return 0; - } D_ALLOC_ARRAY(dte_buf, count); if (dte_buf == NULL) return -DER_NOMEM; - D_ALLOC_ARRAY(dck_buf, count); - if (dck_buf == NULL) { - D_FREE(dte_buf); - return -DER_NOMEM; + if (dcks != NULL) { + D_ALLOC_ARRAY(dck_buf, count); + if (dck_buf == NULL) { + D_FREE(dte_buf); + return -DER_NOMEM; + } } d_list_for_each_entry(dcrc, &cont->sc_dtx_cos_list, dcrc_gl_committable) { @@ -353,17 +446,26 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, continue; D_FREE(dte_buf); - dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid; - dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; - *dcks = dck_buf; + if (dcks != NULL) { + dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid; + dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; + *dcks = dck_buf; + } else { + d_list_add_tail(&dcrc->dcrc_batched_link, + &cont->sc_dtx_batched_list); + } *p_dce = dtx_coll_entry_get(dcrc->dcrc_dce); return 1; } dte_buf[i] = dtx_entry_get(dcrc->dcrc_dte); - dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid; - dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; + if (dcks != NULL) { + dck_buf[i].oid = dcrc->dcrc_ptr->dcr_oid; + dck_buf[i].dkey_hash = dcrc->dcrc_ptr->dcr_dkey_hash; + } else { + d_list_add_tail(&dcrc->dcrc_batched_link, &cont->sc_dtx_batched_list); + } if (++i >= count) break; @@ -372,10 +474,10 @@ dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, if (i == 0) { D_FREE(dte_buf); D_FREE(dck_buf); - *dtes = NULL; } else { *dtes = dte_buf; - *dcks = dck_buf; + if (dcks != NULL) + *dcks = dck_buf; } return i; @@ -436,32 +538,44 @@ dtx_cos_get_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid, } void -dtx_cos_put_piggyback(struct ds_cont_child *cont, struct dtx_id *xid, - daos_unit_oid_t *oid, uint64_t dkey_hash) +dtx_cos_put_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid, uint64_t dkey_hash, + struct dtx_id xid[], uint32_t count, bool rm) { struct dtx_cos_key key; d_iov_t kiov; d_iov_t riov; struct dtx_cos_rec *dcr; struct dtx_cos_rec_child *dcrc; + int del = 0; int rc; + int i; key.oid = *oid; key.dkey_hash = dkey_hash; d_iov_set(&kiov, &key, sizeof(key)); d_iov_set(&riov, NULL, 0); - /* It is normal that the DTX entry (to be put) in CoS has already been removed by race. 
*/ - rc = dbtree_lookup(cont->sc_dtx_cos_hdl, &kiov, &riov); if (rc == 0) { dcr = (struct dtx_cos_rec *)riov.iov_buf; - d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) { - if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { - dcrc->dcrc_piggyback_refs--; - return; + for (i = 0; i < count; i++) { + d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) { + if (memcmp(&dcrc->dcrc_dte->dte_xid, &xid[i], + sizeof(struct dtx_id)) == 0) { + if (rm) { + rc = dtx_cos_del_one(cont, dcrc); + if (rc == 0) + del++; + } else { + dcrc->dcrc_piggyback_refs--; + } + break; + } } } + + if (del > 0) + d_tm_dec_gauge(dtx_tls_get()->dt_committable, del); } } @@ -493,12 +607,12 @@ dtx_cos_add(struct ds_cont_child *cont, void *entry, daos_unit_oid_t *oid, DAOS_INTENT_UPDATE, &kiov, &riov, NULL); if (flags & DCF_COLL) - D_CDEBUG(rc != 0, DLOG_ERR, DB_IO, "Insert coll DTX "DF_DTI" to CoS cache, " + D_CDEBUG(rc != 0, DLOG_ERR, DB_TRACE, "Insert coll DTX "DF_DTI" to CoS cache, " DF_UOID", key %lu, flags %x: "DF_RC"\n", DP_DTI(&((struct dtx_coll_entry *)entry)->dce_xid), DP_UOID(*oid), (unsigned long)dkey_hash, flags, DP_RC(rc)); else - D_CDEBUG(rc != 0, DLOG_ERR, DB_IO, "Insert reg DTX "DF_DTI" to CoS cache, " + D_CDEBUG(rc != 0, DLOG_ERR, DB_TRACE, "Insert reg DTX "DF_DTI" to CoS cache, " DF_UOID", key %lu, flags %x: "DF_RC"\n", DP_DTI(&((struct dtx_entry *)entry)->dte_xid), DP_UOID(*oid), (unsigned long)dkey_hash, flags, DP_RC(rc)); @@ -530,82 +644,36 @@ dtx_cos_del(struct ds_cont_child *cont, struct dtx_id *xid, dcr = (struct dtx_cos_rec *)riov.iov_buf; d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) { - if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) != 0) - continue; - - d_list_del(&dcrc->dcrc_gl_committable); - d_list_del(&dcrc->dcrc_lo_link); - if (dcrc->dcrc_coll) { - dtx_coll_entry_put(dcrc->dcrc_dce); - cont->sc_dtx_committable_coll_count--; - } else { - dtx_entry_put(dcrc->dcrc_dte); + if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { + rc = dtx_cos_del_one(cont, dcrc); + D_GOTO(out, found = 1); } - D_FREE(dcrc); - - cont->sc_dtx_committable_count--; - dcr->dcr_prio_count--; - - D_GOTO(out, found = 1); } d_list_for_each_entry(dcrc, &dcr->dcr_reg_list, dcrc_lo_link) { - if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) != 0) - continue; - - d_list_del(&dcrc->dcrc_gl_committable); - d_list_del(&dcrc->dcrc_lo_link); - if (dcrc->dcrc_coll) { - dtx_coll_entry_put(dcrc->dcrc_dce); - cont->sc_dtx_committable_coll_count--; - } else { - dtx_entry_put(dcrc->dcrc_dte); + if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { + rc = dtx_cos_del_one(cont, dcrc); + D_GOTO(out, found = 2); } - D_FREE(dcrc); - - cont->sc_dtx_committable_count--; - dcr->dcr_reg_count--; - - D_GOTO(out, found = 2); } d_list_for_each_entry(dcrc, &dcr->dcr_expcmt_list, dcrc_lo_link) { - if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) != 0) - continue; - - d_list_del(&dcrc->dcrc_gl_committable); - d_list_del(&dcrc->dcrc_lo_link); - if (dcrc->dcrc_coll) { - dtx_coll_entry_put(dcrc->dcrc_dce); - cont->sc_dtx_committable_coll_count--; - } else { - dtx_entry_put(dcrc->dcrc_dte); + if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { + rc = dtx_cos_del_one(cont, dcrc); + D_GOTO(out, found = 3); } - D_FREE(dcrc); - - cont->sc_dtx_committable_count--; - dcr->dcr_expcmt_count--; - - D_GOTO(out, found = 3); } out: - if (found > 0) { + if (found > 0) d_tm_dec_gauge(dtx_tls_get()->dt_committable, 1); - if (dcr->dcr_reg_count == 0 && 
dcr->dcr_prio_count == 0 && - dcr->dcr_expcmt_count == 0) - rc = dbtree_delete(cont->sc_dtx_cos_hdl, BTR_PROBE_EQ, &kiov, NULL); - } - if (rc == 0 && found == 0) rc = -DER_NONEXIST; - D_CDEBUG(rc != 0 && rc != -DER_NONEXIST, DLOG_ERR, DB_IO, - "Remove DTX "DF_DTI" from CoS " - "cache, "DF_UOID", key %lu, %s shared entry: rc = "DF_RC"\n", - DP_DTI(xid), DP_UOID(*oid), (unsigned long)dkey_hash, - found == 1 ? "has" : "has not", DP_RC(rc)); + DL_CDEBUG(rc != 0 && rc != -DER_NONEXIST, DLOG_ERR, DB_TRACE, rc, + "Remove DTX from CoS cache "DF_UOID", key %lu", + DP_UOID(*oid), (unsigned long)dkey_hash); return rc == -DER_NONEXIST ? 0 : rc; } @@ -624,6 +692,12 @@ dtx_cos_oldest(struct ds_cont_child *cont) return dcrc->dcrc_epoch; } +/* + * It is inefficient to search some item on a very long list. So let's skip + * the search if the length exceeds DTX_COS_SEARCH_MAX. That is not fatal. + */ +#define DTX_COS_SEARCH_MAX 32 + void dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid, daos_unit_oid_t *oid, uint64_t dkey_hash) @@ -647,8 +721,13 @@ dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid, dcr = (struct dtx_cos_rec *)riov.iov_buf; + if (dcr->dcr_reg_count > DTX_COS_SEARCH_MAX) + goto expcmt; + d_list_for_each_entry(dcrc, &dcr->dcr_reg_list, dcrc_lo_link) { if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { + dcrc->dcrc_reg = 0; + dcrc->dcrc_prio = 1; d_list_del(&dcrc->dcrc_lo_link); d_list_add(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list); dcr->dcr_reg_count--; @@ -658,14 +737,9 @@ dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid, } } - d_list_for_each_entry(dcrc, &dcr->dcr_prio_list, dcrc_lo_link) { - if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) { - d_list_del(&dcrc->dcrc_lo_link); - d_list_add(&dcrc->dcrc_lo_link, &dcr->dcr_prio_list); - - D_GOTO(out, found = true); - } - } +expcmt: + if (dcr->dcr_expcmt_count > DTX_COS_SEARCH_MAX) + goto out; d_list_for_each_entry(dcrc, &dcr->dcr_expcmt_list, dcrc_lo_link) { if (memcmp(&dcrc->dcrc_dte->dte_xid, xid, sizeof(*xid)) == 0) @@ -683,3 +757,39 @@ dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid, /* It is normal that the DTX entry (for priority) in CoS has been committed by race. */ } + +void +dtx_cos_batched_del(struct ds_cont_child *cont, struct dtx_id xid[], bool rm[], uint32_t count) +{ + struct dtx_cos_rec_child *dcrc; + int del = 0; + int rc; + int i = 0; + bool found; + + while ((dcrc = d_list_pop_entry(&cont->sc_dtx_batched_list, struct dtx_cos_rec_child, + dcrc_batched_link)) != NULL) { + for (found = false; i < count && !found; i++) { + /* + * Some entries in the sc_dtx_batched_list may have been committed by + * others by race. Since the entries order in the sc_dtx_batched_list + * will not be changed, let's compare with xid[i] via one cycle scan. + */ + if (memcmp(&dcrc->dcrc_dte->dte_xid, &xid[i], sizeof(struct dtx_id)) == 0) { + found = true; + + if (rm[i]) { + rc = dtx_cos_del_one(cont, dcrc); + if (rc == 0) + del++; + } + } + } + + /* There must be one in xid array that matches current dcrc. 
*/ + D_ASSERT(found); + } + + if (del > 0) + d_tm_dec_gauge(dtx_tls_get()->dt_committable, del); +} diff --git a/src/dtx/dtx_internal.h b/src/dtx/dtx_internal.h index a42bcc1d7d6..06d0333dd77 100644 --- a/src/dtx/dtx_internal.h +++ b/src/dtx/dtx_internal.h @@ -213,6 +213,7 @@ struct dtx_pool_metrics { struct dtx_tls { struct d_tm_node_t *dt_committable; struct d_tm_node_t *dt_dtx_leader_total; + struct d_tm_node_t *dt_async_cmt_lat; uint64_t dt_agg_gen; uint32_t dt_batched_ult_cnt; }; @@ -263,6 +264,9 @@ uint64_t dtx_cos_oldest(struct ds_cont_child *cont); void dtx_cos_prio(struct ds_cont_child *cont, struct dtx_id *xid, daos_unit_oid_t *oid, uint64_t dkey_hash); +void dtx_cos_batched_del(struct ds_cont_child *cont, struct dtx_id xid[], bool rm[], + uint32_t count); + /* dtx_rpc.c */ int dtx_check(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch); diff --git a/src/dtx/dtx_resync.c b/src/dtx/dtx_resync.c index b98a6469954..756dfe0ca44 100644 --- a/src/dtx/dtx_resync.c +++ b/src/dtx/dtx_resync.c @@ -115,7 +115,7 @@ dtx_resync_commit(struct ds_cont_child *cont, } if (j > 0) { - rc = dtx_commit(cont, dtes, dcks, j); + rc = dtx_commit(cont, dtes, dcks, j, true); if (rc < 0) D_ERROR("Failed to commit the DTXs: rc = "DF_RC"\n", DP_RC(rc)); @@ -359,7 +359,7 @@ dtx_status_handle_one(struct ds_cont_child *cont, struct dtx_entry *dte, daos_un dck.oid = oid; dck.dkey_hash = dkey_hash; - rc = dtx_coll_commit(cont, dce, &dck); + rc = dtx_coll_commit(cont, dce, &dck, true); } dtx_coll_entry_put(dce); diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c index e654047a621..6d34e871269 100644 --- a/src/dtx/dtx_rpc.c +++ b/src/dtx/dtx_rpc.c @@ -225,9 +225,10 @@ dtx_req_cb(const struct crt_cb_info *cb_info) } out: - D_DEBUG(DB_TRACE, "DTX req for opc %x (req %p future %p) got reply from %d/%d: " - "epoch :"DF_X64", result %d\n", dra->dra_opc, req, dra->dra_future, - drr->drr_rank, drr->drr_tag, din != NULL ? din->di_epoch : 0, rc); + DL_CDEBUG(rc < 0 && rc != -DER_NONEXIST, DLOG_ERR, DB_TRACE, rc, + "DTX req for opc %x (req %p future %p) got reply from %d/%d: " + "epoch :"DF_X64, dra->dra_opc, req, dra->dra_future, + drr->drr_rank, drr->drr_tag, din != NULL ? din->di_epoch : 0); drr->drr_comp = 1; drr->drr_result = rc; @@ -397,19 +398,14 @@ dtx_req_list_send(struct dtx_common_args *dca, bool is_reentrance) if (unlikely(dra->dra_opc == DTX_COMMIT && dca->dca_i == 0 && DAOS_FAIL_CHECK(DAOS_DTX_FAIL_COMMIT))) - rc = dtx_req_send(dca->dca_drr, 1); + dtx_req_send(dca->dca_drr, 1); else - rc = dtx_req_send(dca->dca_drr, dca->dca_epoch); - if (rc != 0) { - /* If the first sub-RPC failed, then break, otherwise - * other remote replicas may have already received the - * RPC and executed it, so have to go ahead. - */ - if (dca->dca_i == 0) { - ABT_future_free(&dra->dra_future); - return rc; - } - } + dtx_req_send(dca->dca_drr, dca->dca_epoch); + /* + * Do not care about the dtx_req_send() result; either it or its completion callback + * will set dra->dra_future. Each RPC is independent from the others, so let's go + * ahead to handle the other RPCs and set dra->dra_future, which avoids blocking the + * RPC sponsor - dtx_req_wait. + */ /* dca->dca_drr may not point to a real entry if all RPCs have been sent.
*/ dca->dca_drr = d_list_entry(dca->dca_drr->drr_link.next, @@ -616,12 +612,8 @@ dtx_rpc_helper(struct dss_chore *chore, bool is_reentrance) rc = dtx_req_list_send(dca, is_reentrance); if (rc == DSS_CHORE_YIELD) return DSS_CHORE_YIELD; - if (rc == DSS_CHORE_DONE) - rc = 0; - if (rc != 0) - dca->dca_dra.dra_result = rc; - D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE, "%p: DTX RPC chore for %u done: %d\n", chore, - dca->dca_dra.dra_opc, rc); + D_ASSERTF(rc == DSS_CHORE_DONE, "Unexpected helper return value for RPC %u: %d\n", + dca->dca_dra.dra_opc, rc); if (dca->dca_chore_eventual != ABT_EVENTUAL_NULL) { rc = ABT_eventual_set(dca->dca_chore_eventual, NULL, 0); D_ASSERTF(rc == ABT_SUCCESS, "ABT_eventual_set: %d\n", rc); @@ -737,8 +729,10 @@ dtx_rpc(struct ds_cont_child *cont,d_list_t *dti_list, struct dtx_entry **dtes, } rc = dss_chore_delegate(&dca->dca_chore, dtx_rpc_helper); - if (rc != 0) + if (rc != 0) { + ABT_eventual_free(&dca->dca_chore_eventual); goto out; + } rc = ABT_eventual_wait(dca->dca_chore_eventual, NULL); D_ASSERTF(rc == ABT_SUCCESS, "ABT_eventual_wait: %d\n", rc); @@ -809,7 +803,7 @@ dtx_rpc(struct ds_cont_child *cont,d_list_t *dti_list, struct dtx_entry **dtes, */ int dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, - struct dtx_cos_key *dcks, int count) + struct dtx_cos_key *dcks, int count, bool has_cos) { struct dtx_common_args dca; struct dtx_req_args *dra = &dca.dca_dra; @@ -842,7 +836,7 @@ dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, rc1 = vos_dtx_set_flags(cont->sc_hdl, dca.dca_dtis, count, DTE_PARTIAL_COMMITTED); } else { - if (dcks != NULL) { + if (has_cos) { if (count > 1) { D_ALLOC_ARRAY(rm_cos, count); if (rm_cos == NULL) @@ -862,12 +856,16 @@ dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, } if (rc1 == 0 && rm_cos != NULL) { - for (i = 0; i < count; i++) { - if (rm_cos[i]) { - D_ASSERT(!daos_oid_is_null(dcks[i].oid.id_pub)); - dtx_cos_del(cont, &dca.dca_dtis[i], &dcks[i].oid, - dcks[i].dkey_hash); + if (dcks != NULL) { + for (i = 0; i < count; i++) { + if (rm_cos[i]) { + D_ASSERT(!daos_oid_is_null(dcks[i].oid.id_pub)); + dtx_cos_del(cont, &dca.dca_dtis[i], &dcks[i].oid, + dcks[i].dkey_hash); + } } + } else { + dtx_cos_batched_del(cont, dca.dca_dtis, rm_cos, count); } } @@ -976,8 +974,11 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che if (dsp->dsp_mbs == NULL) { rc = vos_dtx_load_mbs(cont->sc_hdl, &dsp->dsp_xid, NULL, &dsp->dsp_mbs); if (rc != 0) { - if (rc < 0 && rc != -DER_NONEXIST && for_io) + if (rc < 0 && rc != -DER_NONEXIST && for_io) { + D_ERROR("Failed to load mbs for "DF_DTI": "DF_RC"\n", + DP_DTI(&dsp->dsp_xid), DP_RC(rc)); goto out; + } drop = true; goto next; @@ -1237,7 +1238,7 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che case DTX_ST_COMMITTABLE: dck.oid = dsp->dsp_oid; dck.dkey_hash = dsp->dsp_dkey_hash; - rc = dtx_commit(cont, &pdte, &dck, 1); + rc = dtx_commit(cont, &pdte, &dck, 1, true); if (rc < 0 && rc != -DER_NONEXIST && for_io) d_list_add_tail(&dsp->dsp_link, cmt_list); else @@ -1258,7 +1259,7 @@ dtx_refresh_internal(struct ds_cont_child *cont, int *check_count, d_list_t *che case DSHR_NEED_COMMIT: { dck.oid = dsp->dsp_oid; dck.dkey_hash = dsp->dsp_dkey_hash; - rc = dtx_commit(cont, &pdte, &dck, 1); + rc = dtx_commit(cont, &pdte, &dck, 1, true); if (rc < 0 && rc != -DER_NONEXIST && for_io) d_list_add_tail(&dsp->dsp_link, cmt_list); else @@ -1571,17 +1572,20 @@ dtx_coll_rpc_post(struct dtx_coll_rpc_args *dcra, int ret) { int rc; 
- rc = ABT_future_wait(dcra->dcra_future); - D_CDEBUG(rc != ABT_SUCCESS, DLOG_ERR, DB_TRACE, - "Collective DTX wait req for opc %u, future %p done, rc %d, result %d\n", - dcra->dcra_opc, dcra->dcra_future, rc, dcra->dcra_result); - ABT_future_free(&dcra->dcra_future); + if (dcra->dcra_future != ABT_FUTURE_NULL) { + rc = ABT_future_wait(dcra->dcra_future); + D_CDEBUG(rc != ABT_SUCCESS, DLOG_ERR, DB_TRACE, + "Collective DTX wait req for opc %u, future %p done, rc %d, result %d\n", + dcra->dcra_opc, dcra->dcra_future, rc, dcra->dcra_result); + ABT_future_free(&dcra->dcra_future); + } return ret != 0 ? ret : dcra->dcra_result; } int -dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck) +dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck, + bool has_cos) { struct dtx_coll_rpc_args dcra = { 0 }; int *results = NULL; @@ -1591,11 +1595,22 @@ dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct d int rc1 = 0; int rc2 = 0; int i; + bool cos = true; if (dce->dce_ranks != NULL) rc = dtx_coll_rpc_prep(cont, dce, DTX_COLL_COMMIT, 0, &dcra); + /* + * NOTE: Before committing the DTX on remote participants, we cannot remove the active + * DTX locally; otherwise, the locally committed DTX entry may be removed via DTX + * aggregation before the remote participants have committed. In that case, if some + * remote DTX participant triggers DTX_REFRESH for such DTX during the interval, + * then it will get -DER_TX_UNCERTAIN, which may cause the related application to + * fail. So here, we let the remote participants commit first; if that fails, we + * will ask the leader to retry the commit until all participants have committed. + */ if (dce->dce_bitmap != NULL) { + clrbit(dce->dce_bitmap, dss_get_module_info()->dmi_tgt_id); len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, 0, DTX_COLL_COMMIT, dce->dce_bitmap_sz, dce->dce_bitmap, &results); @@ -1634,12 +1649,17 @@ dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct d * to remove the collective DTX entry from the CoS even if the commit failed remotely. * Otherwise, the batched commit ULT may be blocked by such "bad" entry. */ - if (rc2 == 0 && dck != NULL) - dtx_cos_del(cont, &dce->dce_xid, &dck->oid, dck->dkey_hash); + if (rc2 == 0 && has_cos) { + if (dck != NULL) + dtx_cos_del(cont, &dce->dce_xid, &dck->oid, dck->dkey_hash); + else + dtx_cos_batched_del(cont, &dce->dce_xid, &cos, 1); + } D_CDEBUG(rc != 0 || rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE, - "Collectively commit DTX "DF_DTI": %d/%d/%d\n", - DP_DTI(&dce->dce_xid), rc, rc1, rc2); + "Collectively commit DTX "DF_DTI" in "DF_UUID"/"DF_UUID": %d/%d/%d\n", + DP_DTI(&dce->dce_xid), DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid), + rc, rc1, rc2); return rc != 0 ? rc : rc1 != 0 ? rc1 : rc2; } @@ -1658,7 +1678,17 @@ dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc if (dce->dce_ranks != NULL) rc = dtx_coll_rpc_prep(cont, dce, DTX_COLL_ABORT, epoch, &dcra); + /* + * NOTE: The DTX abort may be triggered by dtx_leader_end() because of a timeout on + * some DTX participant(s). In that case, the client side RPC sponsor may also + * hit the RPC timeout and resend the related RPC to the leader.
Here, to avoid the DTX + * abort and the forwarding of the resent RPC being executed in parallel, we abort + * the local DTX only after the remote abort is done; before that, the server side + * logic for handling the resent RPC will find the locally pinned DTX entry and + * notify the related client to resend sometime later. + */ if (dce->dce_bitmap != NULL) { + clrbit(dce->dce_bitmap, dss_get_module_info()->dmi_tgt_id); len = dtx_coll_local_exec(cont->sc_pool_uuid, cont->sc_uuid, &dce->dce_xid, epoch, DTX_COLL_ABORT, dce->dce_bitmap_sz, dce->dce_bitmap, &results); @@ -1688,8 +1718,9 @@ dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc rc2 = 0; D_CDEBUG(rc != 0 || rc1 != 0 || rc2 != 0, DLOG_ERR, DB_TRACE, - "Collectively abort DTX "DF_DTI": %d/%d/%d\n", - DP_DTI(&dce->dce_xid), rc, rc1, rc2); + "Collectively abort DTX "DF_DTI" with epoch "DF_X64" in " + DF_UUID"/"DF_UUID": %d/%d/%d\n", DP_DTI(&dce->dce_xid), epoch, + DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid), rc, rc1, rc2); return rc != 0 ? rc : rc1 != 0 ? rc1 : rc2; } @@ -1737,8 +1768,9 @@ dtx_coll_check(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoc } D_CDEBUG((rc < 0 && rc != -DER_NONEXIST) || (rc1 < 0 && rc1 != -DER_NONEXIST), DLOG_ERR, - DB_TRACE, "Collectively check DTX "DF_DTI": %d/%d/\n", - DP_DTI(&dce->dce_xid), rc, rc1); + DB_TRACE, "Collectively check DTX "DF_DTI" in "DF_UUID"/"DF_UUID": %d/%d/\n", - DP_DTI(&dce->dce_xid), DP_UUID(cont->sc_pool_uuid), DP_UUID(cont->sc_uuid), + rc, rc1); return dce->dce_ranks != NULL ? rc : rc1; } diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c index 93cc6744a2d..084f804ec81 100644 --- a/src/dtx/dtx_srv.c +++ b/src/dtx/dtx_srv.c @@ -34,8 +34,8 @@ dtx_tls_init(int tags, int xs_id, int tgt_id) tls->dt_agg_gen = 1; rc = d_tm_add_metric(&tls->dt_committable, D_TM_STATS_GAUGE, - "total number of committable DTX entries", - "entries", "io/dtx/committable/tgt_%u", tgt_id); + "total number of committable DTX entries", "entry", + "io/dtx/committable/tgt_%u", tgt_id); if (rc != DER_SUCCESS) D_WARN("Failed to create DTX committable metric: " DF_RC"\n", DP_RC(rc)); @@ -48,6 +48,13 @@ D_WARN("Failed to create DTX leader metric: " DF_RC"\n", DP_RC(rc)); + rc = d_tm_add_metric(&tls->dt_async_cmt_lat, D_TM_STATS_GAUGE, + "DTX async commit latency", "ms", + "io/dtx/async_cmt_lat/tgt_%u", tgt_id); + if (rc != DER_SUCCESS) + D_WARN("Failed to create DTX async commit latency metric: " DF_RC"\n", + DP_RC(rc)); + return tls; } @@ -117,7 +124,7 @@ dtx_metrics_alloc(const char *path, int tgt_id) rc = d_tm_add_metric(&metrics->dpm_total[DTX_PROTO_SRV_RPC_COUNT], D_TM_COUNTER, "total number of processed sync DTX_COMMIT", "ops", - "%s/ops/sync_dtx_commit/tgt_%u", path, tgt_id); + "%s/ops/dtx_sync_commit/tgt_%u", path, tgt_id); if (rc != DER_SUCCESS) D_WARN("Failed to create sync DTX_COMMIT RPC cnt metric: "DF_RC"\n", DP_RC(rc)); @@ -330,7 +337,7 @@ dtx_handler(crt_rpc_t *rpc) dout->do_status = rc; /* For DTX_COMMIT, it is the count of real committed DTX entries.
*/ dout->do_misc = committed; - rc = crt_reply_send(rpc); + rc = crt_reply_send_input_free(rpc); if (rc != 0) D_ERROR("send reply failed for DTX rpc %u: rc = "DF_RC"\n", opc, DP_RC(rc)); @@ -467,8 +474,9 @@ dtx_coll_handler(crt_rpc_t *rpc) out: D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE, - "Handled collective DTX PRC %u on rank %u for "DF_DTI": "DF_RC"\n", - opc, myrank, DP_DTI(&dci->dci_xid), DP_RC(rc)); + "Handled collective DTX RPC %u on rank %u for "DF_DTI" in " + DF_UUID"/"DF_UUID": "DF_RC"\n", opc, myrank, DP_DTI(&dci->dci_xid), + DP_UUID(dci->dci_po_uuid), DP_UUID(dci->dci_co_uuid), DP_RC(rc)); dco->dco_status = rc; rc = crt_reply_send(rpc); diff --git a/src/engine/ult.c b/src/engine/ult.c index 94a2a0f9390..fbeb3f538fa 100644 --- a/src/engine/ult.c +++ b/src/engine/ult.c @@ -458,7 +458,7 @@ sched_ult2xs(int xs_type, int tgt_id) break; case DSS_XS_OFFLOAD: if (dss_numa_nr > 1) - xs_id = sched_ult2xs_multisocket(xs_type, tgt_id); + return sched_ult2xs_multisocket(xs_type, tgt_id); if (!dss_helper_pool) { if (dss_tgt_offload_xs_nr > 0) xs_id = DSS_MAIN_XS_ID(tgt_id) + dss_tgt_offload_xs_nr / dss_tgt_nr; diff --git a/src/gurt/telemetry.c b/src/gurt/telemetry.c index 8f08192d072..be7762184a9 100644 --- a/src/gurt/telemetry.c +++ b/src/gurt/telemetry.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -624,6 +625,7 @@ alloc_node(struct d_tm_shmem_hdr *shmem, struct d_tm_node_t **newnode, goto out; tmp->dtn_metric = NULL; tmp->dtn_sibling = NULL; + atomic_store_relaxed(&tmp->dtn_readable, false); *newnode = node; out: @@ -2409,6 +2411,9 @@ add_metric(struct d_tm_context *ctx, struct d_tm_node_t **node, int metric_type, pthread_mutexattr_destroy(&mattr); temp->dtn_protect = true; } + + atomic_store_relaxed(&temp->dtn_readable, true); + if (node != NULL) *node = temp; @@ -3090,6 +3095,15 @@ d_tm_try_del_ephemeral_dir(const char *fmt, ...) return rc; } +static bool +node_is_readable(struct d_tm_node_t *node) +{ + if (node == NULL) + return false; + + return atomic_load_relaxed(&node->dtn_readable); +} + /** * Creates histogram counters for the given node. It calculates the * extents of each bucket and creates counters at the path specified that @@ -3278,6 +3292,9 @@ d_tm_get_num_buckets(struct d_tm_context *ctx, if (ctx == NULL || histogram == NULL || node == NULL) return -DER_INVAL; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + rc = validate_node_ptr(ctx, node, &shmem); if (rc != 0) return rc; @@ -3341,6 +3358,9 @@ d_tm_get_bucket_range(struct d_tm_context *ctx, struct d_tm_bucket_t *bucket, if (rc != 0) return rc; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + if (!has_stats(node)) return -DER_OP_NOT_PERMITTED; @@ -3392,6 +3412,9 @@ d_tm_get_counter(struct d_tm_context *ctx, uint64_t *val, if (node->dtn_type != D_TM_COUNTER) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + /* "ctx == NULL" is server side fast version to read the counter.
*/ if (ctx == NULL) { metric_data = node->dtn_metric; @@ -3441,6 +3464,9 @@ d_tm_get_timestamp(struct d_tm_context *ctx, time_t *val, if (node->dtn_type != D_TM_TIMESTAMP) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data != NULL) { d_tm_node_lock(node); @@ -3470,6 +3496,9 @@ d_tm_get_meminfo(struct d_tm_context *ctx, struct d_tm_meminfo_t *meminfo, if (node->dtn_type != D_TM_MEMINFO) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data != NULL) { d_tm_node_lock(node); @@ -3513,6 +3542,9 @@ d_tm_get_timer_snapshot(struct d_tm_context *ctx, struct timespec *tms, if (!(node->dtn_type & D_TM_TIMER_SNAPSHOT)) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data != NULL) { d_tm_node_lock(node); @@ -3563,6 +3595,9 @@ d_tm_get_duration(struct d_tm_context *ctx, struct timespec *tms, if (!(node->dtn_type & D_TM_DURATION)) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data == NULL) return -DER_METRIC_NOT_FOUND; @@ -3628,6 +3663,9 @@ d_tm_get_gauge(struct d_tm_context *ctx, uint64_t *val, if (!is_gauge(node)) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data != NULL) { dtm_stats = conv_ptr(shmem, metric_data->dtm_stats); @@ -3700,6 +3738,9 @@ int d_tm_get_metadata(struct d_tm_context *ctx, char **desc, char **units, if (node->dtn_type == D_TM_DIRECTORY) return -DER_OP_NOT_PERMITTED; + if (unlikely(!node_is_readable(node))) + return -DER_AGAIN; + metric_data = conv_ptr(shmem, node->dtn_metric); if (metric_data != NULL) { d_tm_node_lock(node); diff --git a/src/include/cart/api.h b/src/include/cart/api.h index 21941d7cb98..16fc2091d4b 100644 --- a/src/include/cart/api.h +++ b/src/include/cart/api.h @@ -474,6 +474,21 @@ crt_req_send(crt_rpc_t *req, crt_cb_t complete_cb, void *arg); int crt_reply_send(crt_rpc_t *req); +/** + * Send an RPC reply and free the input buffer immediately. + * Only to be called on the server side. + * + * \param[in] req pointer to RPC request + * + * \return DER_SUCCESS on success, negative value if error + * + * \note the crt_rpc_t is exported to user, caller should fill the + * crt_rpc_t::cr_output before sending the RPC reply. + * See \ref crt_req_create. + */ +int +crt_reply_send_input_free(crt_rpc_t *req); + /** * Return request buffer * diff --git a/src/include/daos/container.h b/src/include/daos/container.h index 282dfb8ec49..57ef06f63a2 100644 --- a/src/include/daos/container.h +++ b/src/include/daos/container.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -130,7 +130,7 @@ dc_cont_open_flags_valid(uint64_t flags) f = flags; /* One and only one of DAOS_COO_RO, DAOS_COO_RW, and DAOS_COO_EX. 
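Regarding the DAOS_COO_IO_BASE_MASK change just below: the validity check reduces to masking out the three base-mode bits and requiring the result to equal exactly one of them. A tiny stand-alone illustration with made-up flag values (the real bit definitions live in daos_cont.h):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define COO_RO (1U << 0)
    #define COO_RW (1U << 1)
    #define COO_EX (1U << 2)
    #define COO_IO_BASE_MASK (COO_RO | COO_RW | COO_EX)

    static bool open_mode_valid(uint64_t flags)
    {
        uint64_t m = flags & COO_IO_BASE_MASK;

        // Exactly one of RO/RW/EX must be set.
        return m == COO_RO || m == COO_RW || m == COO_EX;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               open_mode_valid(COO_RO),          // 1
               open_mode_valid(COO_RO | COO_RW), // 0
               open_mode_valid(0));              // 0
        return 0;
    }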
*/ - m = f & (DAOS_COO_RO | DAOS_COO_RW | DAOS_COO_EX); + m = f & DAOS_COO_IO_BASE_MASK; if (m != DAOS_COO_RO && m != DAOS_COO_RW && m != DAOS_COO_EX) return false; diff --git a/src/include/daos_cont.h b/src/include/daos_cont.h index c3bda9837d8..632eafe8130 100644 --- a/src/include/daos_cont.h +++ b/src/include/daos_cont.h @@ -79,6 +79,9 @@ extern "C" { /** Mask for all of the bits in the container open mode flag, DAOS_COO_ bits */ #define DAOS_COO_MASK ((1U << DAOS_COO_NBITS) - 1) +/** The basic IO mode: read-only, read-write or exclusively read-write. */ +#define DAOS_COO_IO_BASE_MASK (DAOS_COO_RO | DAOS_COO_RW | DAOS_COO_EX) + /** Maximum length for container hints */ #define DAOS_CONT_HINT_MAX_LEN 128 diff --git a/src/include/daos_srv/container.h b/src/include/daos_srv/container.h index 793d585c07a..9fc615c2a8b 100644 --- a/src/include/daos_srv/container.h +++ b/src/include/daos_srv/container.h @@ -108,6 +108,9 @@ struct ds_cont_child { uint32_t sc_dtx_committable_count; uint32_t sc_dtx_committable_coll_count; + /* Last timestamp when EC aggregation reports -DER_INPROGRESS. */ + uint64_t sc_ec_agg_busy_ts; + /* The global minimum EC aggregation epoch, which will be upper * limit for VOS aggregation, i.e. EC object VOS aggregation can * not cross this limit. For simplification purpose, all objects @@ -134,6 +137,8 @@ struct ds_cont_child { d_list_t sc_dtx_cos_list; /* The global list for committable collective DTXs. */ d_list_t sc_dtx_coll_list; + /* The list for current DTX batched commit. */ + d_list_t sc_dtx_batched_list; /* the pool map version of updating DAOS_PROP_CO_STATUS prop */ uint32_t sc_status_pm_ver; }; diff --git a/src/include/daos_srv/dtx_srv.h b/src/include/daos_srv/dtx_srv.h index 7b3f7e9ee57..7c60d2deaa0 100644 --- a/src/include/daos_srv/dtx_srv.h +++ b/src/include/daos_srv/dtx_srv.h @@ -318,8 +318,8 @@ int dtx_cos_get_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid, uint64_t dkey_hash, int max, struct dtx_id **dtis); void -dtx_cos_put_piggyback(struct ds_cont_child *cont, struct dtx_id *xid, - daos_unit_oid_t *oid, uint64_t dkey_hash); +dtx_cos_put_piggyback(struct ds_cont_child *cont, daos_unit_oid_t *oid, uint64_t dkey_hash, + struct dtx_id xid[], uint32_t count, bool rm); int dtx_leader_exec_ops(struct dtx_leader_handle *dlh, dtx_sub_func_t func, dtx_agg_cb_t agg_cb, int allow_failure, void *func_arg); @@ -338,14 +338,15 @@ int dtx_obj_sync(struct ds_cont_child *cont, daos_unit_oid_t *oid, daos_epoch_t epoch); int dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, - struct dtx_cos_key *dcks, int count); + struct dtx_cos_key *dcks, int count, bool has_cos); int dtx_abort(struct ds_cont_child *cont, struct dtx_entry *dte, daos_epoch_t epoch); int dtx_refresh(struct dtx_handle *dth, struct ds_cont_child *cont); int -dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck); +dtx_coll_commit(struct ds_cont_child *cont, struct dtx_coll_entry *dce, struct dtx_cos_key *dck, + bool has_cos); int dtx_coll_abort(struct ds_cont_child *cont, struct dtx_coll_entry *dce, daos_epoch_t epoch); diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index 6cbe3873f0a..de22d55ed5d 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -84,6 +84,7 @@ struct ds_pool { uuid_t sp_srv_pool_hdl; uint32_t sp_stopping:1, sp_cr_checked:1, + sp_immutable:1, sp_fetch_hdls:1, sp_need_discard:1, sp_disable_rebuild:1; @@ -277,9 +278,9 @@ int ds_pool_tgt_finish_rebuild(uuid_t pool_uuid, struct 
pool_target_id_list *lis int ds_pool_tgt_map_update(struct ds_pool *pool, struct pool_buf *buf, unsigned int map_version); -bool ds_pool_skip_for_check(struct ds_pool *pool); -int ds_pool_start_after_check(uuid_t uuid); -int ds_pool_start(uuid_t uuid, bool aft_chk); +bool ds_pool_restricted(struct ds_pool *pool, bool immutable); +int ds_pool_start_after_check(uuid_t uuid, bool immutable); +int ds_pool_start(uuid_t uuid, bool aft_chk, bool immutable); int ds_pool_stop(uuid_t uuid); int dsc_pool_svc_extend(uuid_t pool_uuid, d_rank_list_t *svc_ranks, uint64_t deadline, int ntargets, const d_rank_list_t *rank_list, int ndomains, const uint32_t *domains); diff --git a/src/include/daos_srv/security.h b/src/include/daos_srv/security.h index 203b9b548ee..28fe83852b3 100644 --- a/src/include/daos_srv/security.h +++ b/src/include/daos_srv/security.h @@ -1,5 +1,5 @@ /* - * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -304,6 +304,19 @@ ds_sec_cont_can_open_ex(uint64_t cont_capas); bool ds_sec_cont_can_evict_all(uint64_t cont_capas); +/** + * Determine if the container can be modified based on the container security + * capabilities. + * + * \param[in] cont_capas Capability bits acquired via + * ds_sec_cont_get_capabilities + * + * \return True Access allowed + * False Access denied + */ +bool +ds_sec_cont_can_modify(uint64_t cont_capas); + /** * Get the security capabilities for a rebuild container handle created by the * DAOS server. diff --git a/src/include/gurt/telemetry_common.h b/src/include/gurt/telemetry_common.h index 4e7d3b3a02d..2068c1c0cce 100644 --- a/src/include/gurt/telemetry_common.h +++ b/src/include/gurt/telemetry_common.h @@ -8,6 +8,8 @@ #include +#include + #define D_TM_VERSION 1 #define D_TM_MAX_NAME_LEN 256 #define D_TM_MAX_DESC_LEN 128 @@ -236,6 +238,7 @@ struct d_tm_node_t { pthread_mutex_t dtn_lock; /** individual mutex */ struct d_tm_metric_t *dtn_metric; /** values */ bool dtn_protect; /** synchronized access */ + _Atomic bool dtn_readable; /** fully initialized and ready for reads */ }; struct d_tm_nodeList_t { diff --git a/src/mgmt/srv_target.c b/src/mgmt/srv_target.c index e1d6ee79bce..7975a2115d4 100644 --- a/src/mgmt/srv_target.c +++ b/src/mgmt/srv_target.c @@ -1212,7 +1212,7 @@ ds_mgmt_hdlr_tgt_create(crt_rpc_t *tc_req) tc_out->tc_ranks.ca_arrays = rank; tc_out->tc_ranks.ca_count = 1; - rc = ds_pool_start(tc_in->tc_pool_uuid, false); + rc = ds_pool_start(tc_in->tc_pool_uuid, false, false); if (rc) { D_ERROR(DF_UUID": failed to start pool: "DF_RC"\n", DP_UUID(tc_in->tc_pool_uuid), DP_RC(rc)); diff --git a/src/object/cli_coll.c b/src/object/cli_coll.c index e05abadf3cf..d517e3269d6 100644 --- a/src/object/cli_coll.c +++ b/src/object/cli_coll.c @@ -178,6 +178,10 @@ obj_coll_oper_args_init(struct coll_oper_args *coa, struct dc_object *obj, bool if (coa->coa_sparse == 0) coa->coa_dct_cap = obj_ranks; } + + /* Save obj->cob_min_rank for verification during subsequent obj_coll_prep_one(). 
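The coa_min_rank snapshot mentioned here is a capture-then-assert consistency check: record a property of the layout under the object lock at init time, then assert later that the live value still matches, so an unexpected layout change fails loudly instead of silently corrupting the target array. A minimal sketch of the pattern with a plain pthread rwlock (illustrative types, not the DAOS structures; compile with -pthread):

    #include <assert.h>
    #include <pthread.h>
    #include <stdint.h>

    struct object {
        pthread_rwlock_t lock;
        uint32_t         min_rank; // may change if the layout is refreshed
    };

    struct op_args {
        uint32_t saved_min_rank; // snapshot taken at op init
    };

    static void op_init(struct op_args *a, struct object *o)
    {
        pthread_rwlock_rdlock(&o->lock);
        a->saved_min_rank = o->min_rank; // snapshot under the lock
        pthread_rwlock_unlock(&o->lock);
    }

    static void op_prep_one(struct op_args *a, struct object *o)
    {
        pthread_rwlock_rdlock(&o->lock);
        // The layout must not have changed between init and prep.
        assert(a->saved_min_rank == o->min_rank);
        pthread_rwlock_unlock(&o->lock);
    }

    int main(void)
    {
        struct object o = { .min_rank = 3 };
        struct op_args a;

        pthread_rwlock_init(&o.lock, NULL);
        op_init(&a, &o);
        op_prep_one(&a, &o); // passes while the layout is stable
        return 0;
    }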
*/ + coa->coa_min_rank = obj->cob_min_rank; + D_RWLOCK_UNLOCK(&obj->cob_lock); if (coa->coa_sparse) { @@ -208,7 +212,6 @@ obj_coll_oper_args_init(struct coll_oper_args *coa, struct dc_object *obj, bool coa->coa_dct_nr = -1; } - coa->coa_max_dct_sz = 0; coa->coa_max_shard_nr = 0; coa->coa_max_bitmap_sz = 0; coa->coa_target_nr = 0; @@ -236,17 +239,55 @@ obj_coll_oper_args_fini(struct coll_oper_args *coa) coa->coa_dct_nr = 0; } +static void +obj_coll_collapse_one(struct coll_oper_args *coa, struct daos_coll_target *dct, + uint32_t *size, bool copy) +{ + struct daos_coll_shard *dcs; + uint32_t dct_size; + int i; + + /* The size may be over estimated, no matter. */ + dct_size = sizeof(*dct) + dct->dct_bitmap_sz + + sizeof(dct->dct_shards[0]) * (dct->dct_max_shard + 1); + + for (i = 0; i <= dct->dct_max_shard; i++) { + dcs = &dct->dct_shards[i]; + if (dcs->dcs_nr > 1) + dct_size += sizeof(dcs->dcs_buf[0]) * dcs->dcs_nr; + } + + if (coa->coa_for_modify) + dct_size += sizeof(dct->dct_tgt_ids[0]) * dct->dct_tgt_nr; + + if (coa->coa_max_dct_sz < dct_size) + coa->coa_max_dct_sz = dct_size; + + if (copy) + memcpy(&coa->coa_dcts[coa->coa_dct_nr], dct, sizeof(*dct)); + + coa->coa_dct_nr++; + *size += dct_size; +} + +struct obj_coll_tree_args { + struct coll_oper_args *coa; + uint32_t *size; +}; + static int obj_coll_tree_cb(daos_handle_t ih, d_iov_t *key, d_iov_t *val, void *arg) { - struct coll_oper_args *coa = arg; - struct daos_coll_target *dct = val->iov_buf; + struct obj_coll_tree_args *octa = arg; + struct coll_oper_args *coa = octa->coa; + struct daos_coll_target *dct = val->iov_buf; D_ASSERTF(coa->coa_dct_nr < coa->coa_dct_cap, "Too short pre-allcoated dct_array: %u vs %u\n", coa->coa_dct_nr, coa->coa_dct_cap); + D_ASSERT(dct->dct_bitmap != NULL); - memcpy(&coa->coa_dcts[coa->coa_dct_nr++], dct, sizeof(*dct)); + obj_coll_collapse_one(coa, dct, octa->size, true); /* The following members have been migrated into coa->coa_dcts. */ dct->dct_bitmap = NULL; @@ -259,6 +300,7 @@ obj_coll_tree_cb(daos_handle_t ih, d_iov_t *key, d_iov_t *val, void *arg) static int obj_coll_collapse_tree(struct coll_oper_args *coa, uint32_t *size) { + struct obj_coll_tree_args octa; struct coll_sparse_targets *tree = coa->coa_tree; int rc = 0; @@ -270,7 +312,14 @@ obj_coll_collapse_tree(struct coll_oper_args *coa, uint32_t *size) D_GOTO(out, rc = -DER_NOMEM); coa->coa_sparse = 0; - rc = dbtree_iterate(tree->cst_tree_hdl, DAOS_INTENT_DEFAULT, false, obj_coll_tree_cb, coa); + coa->coa_raw_sparse = 1; + coa->coa_dct_nr = 0; + coa->coa_max_dct_sz = 0; + + octa.coa = coa; + octa.size = size; + rc = dbtree_iterate(tree->cst_tree_hdl, DAOS_INTENT_DEFAULT, false, + obj_coll_tree_cb, &octa); if (rc == 0) D_ASSERTF(coa->coa_dct_nr == coa->coa_dct_cap, "Something is wrong when prepare coll target array: %u vs %u\n", @@ -287,36 +336,13 @@ static int obj_coll_collapse_array(struct coll_oper_args *coa, uint32_t *size) { struct daos_coll_target *dct; - struct daos_coll_shard *dcs; - uint32_t dct_size; int i; - int j; - for (i = 0, *size = 0, coa->coa_dct_nr = 0; i < coa->coa_dct_cap; i++) { + for (i = 0, *size = 0, coa->coa_dct_nr = 0, coa->coa_max_dct_sz = 0; + i < coa->coa_dct_cap; i++) { dct = &coa->coa_dcts[i]; - if (dct->dct_bitmap != NULL) { - /* The size may be over estimated, no matter. 
*/ - dct_size = sizeof(*dct) + dct->dct_bitmap_sz + - sizeof(dct->dct_shards[0]) * (dct->dct_max_shard + 1); - - for (j = 0; j <= dct->dct_max_shard; j++) { - dcs = &dct->dct_shards[j]; - if (dcs->dcs_nr > 1) - dct_size += sizeof(dcs->dcs_buf[0]) * dcs->dcs_nr; - } - - if (coa->coa_for_modify) - dct_size += sizeof(dct->dct_tgt_ids[0]) * dct->dct_tgt_nr; - - if (coa->coa_max_dct_sz < dct_size) - coa->coa_max_dct_sz = dct_size; - - if (coa->coa_dct_nr < i) - memcpy(&coa->coa_dcts[coa->coa_dct_nr], dct, sizeof(*dct)); - - coa->coa_dct_nr++; - *size += dct_size; - } + if (dct->dct_bitmap != NULL) + obj_coll_collapse_one(coa, dct, size, coa->coa_dct_nr < i); } /* Reset the other dct slots to avoid double free during cleanup. */ @@ -373,8 +399,9 @@ obj_coll_prep_one(struct coll_oper_args *coa, struct dc_object *obj, D_RWLOCK_RDLOCK(&obj->cob_lock); - D_ASSERTF(shard->do_target_rank <= obj->cob_max_rank, - "Unexpected shard with rank %u > %u\n", shard->do_target_rank, obj->cob_max_rank); + D_ASSERTF(coa->coa_min_rank == obj->cob_min_rank, + "Object "DF_OID" layout has been changed unexpectedly %u => %u, idx %u, ver %u\n", + DP_OID(obj->cob_md.omd_id), coa->coa_min_rank, obj->cob_min_rank, idx, map_ver); D_ASSERTF(shard->do_target_rank >= obj->cob_min_rank, "Unexpected shard with rank %u < %u\n", shard->do_target_rank, obj->cob_min_rank); @@ -669,7 +696,6 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo uint32_t tgt_size = 0; uint32_t mbs_max_size; uint32_t inline_size; - uint32_t flags = ORF_LEADER; uint32_t leader = -1; uint32_t len; int rc; @@ -746,6 +772,12 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo memcpy(dct, &tmp_tgt, sizeof(tmp_tgt)); } + /* 'shard' is on the leader target, which must be coa->coa_dcts[0]. */ + D_ASSERTF(shard->do_target_rank == coa->coa_dcts[0].dct_rank, + "Object "DF_OID" target array corrupted: rank %u vs %u, nr %u\n", + DP_OID(obj->cob_md.omd_id), shard->do_target_rank, + coa->coa_dcts[0].dct_rank, coa->coa_dct_nr); + rc = dc_obj_coll_punch_mbs(coa, obj, shard->do_target_id, &mbs); if (rc < 0) goto out; @@ -767,12 +799,14 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo if (rc != 0) goto out; + auxi->flags = ORF_LEADER; if (auxi->io_retry) { - flags |= ORF_RESEND; + auxi->flags |= ORF_RESEND; /* Reset @enqueue_id if resend to new leader.
*/ if (spa->pa_auxi.target != shard->do_target_id) spa->pa_auxi.enqueue_id = 0; } else { + auxi->flags &= ~ORF_RESEND; spa->pa_auxi.obj_auxi = auxi; daos_dti_gen(&spa->pa_dti, false); } @@ -781,14 +815,15 @@ dc_obj_coll_punch(tse_task_t *task, struct dc_object *obj, struct dtx_epoch *epo spa->pa_auxi.shard = shard->do_shard_idx; if (obj_is_ec(obj)) - flags |= ORF_EC; + auxi->flags |= ORF_EC; mbs_max_size = sizeof(*mbs) + mbs->dm_data_size + sizeof(coa->coa_targets[0]) * coa->coa_max_shard_nr + coa->coa_max_bitmap_sz; return dc_obj_shard_coll_punch(shard, spa, mbs, mbs_max_size, cpca.cpca_bulks, tgt_size, coa->coa_dcts, coa->coa_dct_nr, coa->coa_max_dct_sz, epoch, - args->flags, flags, map_ver, &auxi->map_ver_reply, task); + args->flags, auxi->flags, map_ver, + &auxi->map_ver_reply, task); out: if (rc > 0) @@ -838,7 +873,7 @@ queue_coll_query_task(tse_task_t *api_task, struct obj_auxi_args *obj_auxi, stru 0, 0, ocdc); for (i = 0; i < ocdc->grp_nr; i++) { - obj_coll_disp_dest(ocdc, coa->coa_dcts, &tgt_ep); + obj_coll_disp_dest(ocdc, coa->coa_dcts, &tgt_ep, obj->cob_md.omd_id); tmp = coa->coa_dcts[ocdc->cur_pos].dct_shards[tgt_ep.ep_tag].dcs_idx; rc = queue_shard_query_key_task(api_task, obj_auxi, epoch, tmp, map_ver, diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index 318ba0cb51e..46161bf7a2d 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -1718,15 +1718,20 @@ dc_obj_retry_delay(tse_task_t *task, int err, uint16_t *retry_cnt, uint16_t *inp uint32_t timeout_sec) { uint32_t delay = 0; + uint32_t limit = 4; /* - * Randomly delay 5 - 68 us if it is not the first retry for + * Randomly delay 5 ~ 1028 us if it is not the first retry for * -DER_INPROGRESS || -DER_UPDATE_AGAIN cases. */ ++(*retry_cnt); if (err == -DER_INPROGRESS || err == -DER_UPDATE_AGAIN) { if (++(*inprogress_cnt) > 1) { - delay = (d_rand() & ((1 << 6) - 1)) + 5; + limit += *inprogress_cnt; + if (limit > 10) + limit = 10; + + delay = (d_rand() & ((1 << limit) - 1)) + 5; /* Rebuild is being established on the server side, wait a bit longer */ if (err == -DER_UPDATE_AGAIN) delay <<= 10; @@ -4856,11 +4861,14 @@ obj_comp_cb(tse_task_t *task, void *data) D_ASSERT(daos_handle_is_inval(obj_auxi->th)); D_ASSERT(obj_is_modification_opc(obj_auxi->opc)); - if (task->dt_result == -DER_TX_ID_REUSED && obj_auxi->retry_cnt != 0) - /* XXX: it is must because miss to set "RESEND" flag, that is bug. 
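On the dc_obj_retry_delay() change above: the random backoff window now grows with the number of -DER_INPROGRESS retries, from 4 + inprogress_cnt mask bits up to a cap of 10 bits, i.e. delays between 5 us and (1 << 10) - 1 + 5 = 1028 us. A stand-alone sketch of the same arithmetic, with rand() standing in for d_rand():

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    // Mirrors the capped, growing random backoff in dc_obj_retry_delay().
    static uint32_t retry_delay_us(uint16_t inprogress_cnt)
    {
        uint32_t limit = 4;

        if (inprogress_cnt <= 1)
            return 0; // no delay on the first retry

        limit += inprogress_cnt;
        if (limit > 10)
            limit = 10;

        // Random value in [5, (1 << limit) - 1 + 5].
        return ((uint32_t)rand() & ((1U << limit) - 1)) + 5;
    }

    int main(void)
    {
        for (uint16_t c = 2; c <= 8; c++)
            printf("retry %u -> %u us\n", c, retry_delay_us(c));
        return 0;
    }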
*/ - D_ASSERTF(0, - "Miss 'RESEND' flag (%x) when resend the RPC for task %p: %u\n", - obj_auxi->flags, task, obj_auxi->retry_cnt); + if (task->dt_result == -DER_TX_ID_REUSED && obj_auxi->retry_cnt != 0) { + D_ERROR("TX ID may be reused for unknown reason, " + "task %p, opc %u, flags %x, retry_cnt %u\n", + task, obj_auxi->opc, obj_auxi->flags, obj_auxi->retry_cnt); + task->dt_result = -DER_IO; + obj_auxi->io_retry = 0; + goto args_fini; + } if (obj_auxi->opc == DAOS_OBJ_RPC_UPDATE) { daos_obj_rw_t *api_args = dc_task_get_args(obj_auxi->obj_task); @@ -4886,6 +4894,7 @@ obj_comp_cb(tse_task_t *task, void *data) } } +args_fini: if (obj_auxi->opc == DAOS_OBJ_RPC_COLL_PUNCH) obj_coll_oper_args_fini(&obj_auxi->p_args.pa_coa); diff --git a/src/object/cli_shard.c b/src/object/cli_shard.c index 36c5c5f1e0c..9f084140f80 100644 --- a/src/object/cli_shard.c +++ b/src/object/cli_shard.c @@ -1451,11 +1451,15 @@ obj_shard_coll_punch_cb(tse_task_t *task, void *data) shard_args->pa_auxi.obj_auxi->max_delay = timeout; } - DL_CDEBUG(task->dt_result < 0, DLOG_ERR, DB_IO, task->dt_result, - "DAOS_OBJ_RPC_COLL_PUNCH RPC %p for "DF_UOID" with DTX " - DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x", rpc, DP_UOID(ocpi->ocpi_oid), - DP_DTI(&ocpi->ocpi_xid), task, ocpi->ocpi_map_ver, *cb_args->cpca_ver, - (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags); + DL_CDEBUG(task->dt_result < 0 && task->dt_result != -DER_INPROGRESS, + DLOG_ERR, DB_IO, task->dt_result, + "DAOS_OBJ_RPC_COLL_PUNCH RPC %p for "DF_UOID" in "DF_UUID"/"DF_UUID"/" + DF_UUID" with DTX "DF_DTI" for task %p, map_ver %u/%u, flags %lx/%x, %s layout", + rpc, DP_UOID(ocpi->ocpi_oid), DP_UUID(ocpi->ocpi_po_uuid), + DP_UUID(ocpi->ocpi_co_hdl), DP_UUID(ocpi->ocpi_co_uuid), DP_DTI(&ocpi->ocpi_xid), + task, ocpi->ocpi_map_ver, *cb_args->cpca_ver, + (unsigned long)ocpi->ocpi_api_flags, ocpi->ocpi_flags, + cb_args->cpca_shard_args->pa_coa.coa_raw_sparse ? "sparse" : "continuous"); crt_req_decref(rpc); diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h index ae5c9c82fd1..c0df21dd009 100644 --- a/src/object/obj_internal.h +++ b/src/object/obj_internal.h @@ -293,10 +293,16 @@ struct coll_oper_args { struct shard_auxi_args coa_auxi; int coa_dct_nr; uint32_t coa_dct_cap; - uint32_t coa_max_dct_sz; + union { + /* Save obj->cob_min_rank for verification during obj_coll_prep_one(). */ + uint32_t coa_min_rank; + /* Can only be used after obj_coll_oper_args_collapse(), once the object layout scan is done.
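About the union introduced above: it overlays two 32-bit fields with disjoint lifetimes, coa_min_rank during the layout scan and coa_max_dct_sz after the collapse step overwrites it. A minimal stand-alone illustration of such a phase-split union (illustrative names, not the DAOS structures):

    #include <stdint.h>
    #include <stdio.h>

    struct coll_args {
        union {
            uint32_t min_rank;   // valid during the layout scan
            uint32_t max_dct_sz; // valid only after collapse() overwrites it
        };
    };

    static void scan(struct coll_args *a)     { a->min_rank = 7; }
    static void collapse(struct coll_args *a) { a->max_dct_sz = 4096; } // ends min_rank's lifetime

    int main(void)
    {
        struct coll_args a;

        scan(&a);
        printf("scan phase: min_rank=%u\n", a.min_rank);
        collapse(&a);
        printf("post-collapse: max_dct_sz=%u\n", a.max_dct_sz);
        return 0;
    }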
*/ + uint32_t coa_max_dct_sz; + }; uint8_t coa_max_shard_nr; uint8_t coa_max_bitmap_sz; uint8_t coa_for_modify:1, + coa_raw_sparse:1, coa_sparse:1; uint8_t coa_target_nr; /* @@ -1094,7 +1100,7 @@ int daos_obj_query_merge(struct obj_query_merge_args *oqma); void obj_coll_disp_init(uint32_t tgt_nr, uint32_t max_tgt_size, uint32_t inline_size, uint32_t start, uint32_t max_width, struct obj_coll_disp_cursor *ocdc); void obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *tgts, - crt_endpoint_t *tgt_ep); + crt_endpoint_t *tgt_ep, daos_obj_id_t oid); void obj_coll_disp_move(struct obj_coll_disp_cursor *ocdc); int obj_utils_init(void); void obj_utils_fini(void); diff --git a/src/object/obj_utils.c b/src/object/obj_utils.c index 7bf0ef4aaf9..c01947a05a1 100644 --- a/src/object/obj_utils.c +++ b/src/object/obj_utils.c @@ -616,24 +616,23 @@ obj_coll_disp_init(uint32_t tgt_nr, uint32_t max_tgt_size, uint32_t inline_size, void obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *tgts, - crt_endpoint_t *tgt_ep) + crt_endpoint_t *tgt_ep, daos_obj_id_t oid) { struct daos_coll_target *dct = &tgts[ocdc->cur_pos]; struct daos_coll_target tmp; - unsigned long rand = 0; uint32_t size; int pos; int i; if (ocdc->cur_step > 2) { - rand = d_rand(); /* - * Randomly choose an engine as the relay one for load balance. - * If the one corresponding to "pos" is former moved one, then - * use the "cur_pos" as the relay engine. + * Choose an engine (according to the given oid) as the relay one for load balance. + * If the one corresponding to "pos" is a formerly moved one, then use the "cur_pos" as + * the relay engine. Then, even if the related RPC is resent without a pool map change, + * the relay engine will be the same as in the original case. */ - pos = rand % (ocdc->tgt_nr - ocdc->cur_pos) + ocdc->cur_pos; - if (pos != ocdc->cur_pos && tgts[pos].dct_rank > dct->dct_rank) { + pos = oid.lo % (ocdc->tgt_nr - ocdc->cur_pos) + ocdc->cur_pos; + if (pos > ocdc->cur_pos && tgts[pos].dct_rank > dct->dct_rank) { memcpy(&tmp, &tgts[pos], sizeof(tmp)); memcpy(&tgts[pos], dct, sizeof(tmp)); memcpy(dct, &tmp, sizeof(tmp)); @@ -642,8 +641,8 @@ obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *t size = dct->dct_bitmap_sz << 3; - /* Randomly choose a XS as the local leader on target engine for load balance. */ - for (i = 0, pos = (rand != 0 ? rand : d_rand()) % dct->dct_tgt_nr; i < size; i++) { + /* Choose a target as the local agent on the engine for load balance. */ + for (i = 0, pos = oid.lo % dct->dct_tgt_nr; i < size; i++) { if (isset(dct->dct_bitmap, i)) { pos -= dct->dct_shards[i].dcs_nr; if (pos < 0) diff --git a/src/object/srv_coll.c b/src/object/srv_coll.c index 2a152b47bd6..5b59f954f86 100644 --- a/src/object/srv_coll.c +++ b/src/object/srv_coll.c @@ -239,9 +239,15 @@ obj_coll_punch_prep(struct obj_coll_punch_in *ocpi, struct daos_coll_target *dct int i; int j; - /* dcts[0] is for current engine. */ - if (dcts[0].dct_bitmap == NULL || dcts[0].dct_bitmap_sz == 0 || - dcts[0].dct_shards == NULL) { + /* dcts[0] must be for current engine.
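The obj_coll_disp_dest() rework above swaps d_rand() for oid.lo modulo arithmetic, making relay and local-agent selection a pure function of the object ID: a resent RPC with the same oid and an unchanged pool map deterministically lands on the same relay engine, while different objects still spread the load. A small sketch of the selection rule:

    #include <stdint.h>
    #include <stdio.h>

    // Deterministic pick: the same (id, n) always yields the same index,
    // unlike a random pick, while different ids still spread the load.
    static uint32_t pick_relay(uint64_t id_lo, uint32_t n_candidates)
    {
        return (uint32_t)(id_lo % n_candidates);
    }

    int main(void)
    {
        uint64_t oid_lo = 0x12345678abcdULL;

        // A resend with the same oid picks the same relay.
        printf("first:  %u\n", pick_relay(oid_lo, 7));
        printf("resend: %u\n", pick_relay(oid_lo, 7));
        printf("other:  %u\n", pick_relay(oid_lo + 1, 7));
        return 0;
    }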
*/ + if (unlikely(dcts[0].dct_rank != dss_self_rank())) { + D_ERROR("Invalid targets array: rank %u vs %u, nr %u, flags %x\n", + dcts[0].dct_rank, dss_self_rank(), dct_nr, ocpi->ocpi_flags); + D_GOTO(out, rc = -DER_INVAL); + } + + if (unlikely(dcts[0].dct_bitmap == NULL || dcts[0].dct_bitmap_sz == 0 || + dcts[0].dct_shards == NULL)) { D_ERROR("Invalid input for current engine: bitmap %s, bitmap_sz %u, shards %s\n", dcts[0].dct_bitmap == NULL ? "empty" : "non-empty", dcts[0].dct_bitmap_sz, dcts[0].dct_shards == NULL ? "empty" : "non-empty"); diff --git a/src/object/srv_ec_aggregate.c b/src/object/srv_ec_aggregate.c index 71c630fa947..50b513d1612 100644 --- a/src/object/srv_ec_aggregate.c +++ b/src/object/srv_ec_aggregate.c @@ -2667,8 +2667,13 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr, struct ec_agg_param *ec_agg_param = agg_param->ap_data; vos_iter_param_t iter_param = { 0 }; struct vos_iter_anchors anchors = { 0 }; + struct dtx_handle *dth = NULL; + struct dtx_share_peer *dsp; + struct dtx_id dti = { 0 }; + struct dtx_epoch epoch = { 0 }; + daos_unit_oid_t oid = { 0 }; + int blocks = 0; int rc = 0; - int blocks = 0; /* * Avoid calling into vos_aggregate() when aborting aggregation @@ -2715,8 +2720,32 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr, agg_reset_entry(&ec_agg_param->ap_agg_entry, NULL, NULL); retry: + epoch.oe_value = epr->epr_hi; + rc = dtx_begin(cont->sc_hdl, &dti, &epoch, 0, cont->sc_pool->spc_map_version, &oid, + NULL, 0, 0, NULL, &dth); + if (rc != 0) + goto update_hae; + rc = vos_iterate(&iter_param, VOS_ITER_OBJ, true, &anchors, agg_iterate_pre_cb, - agg_iterate_post_cb, ec_agg_param, NULL); + agg_iterate_post_cb, ec_agg_param, dth); + if (rc == -DER_INPROGRESS && !d_list_empty(&dth->dth_share_tbd_list)) { + uint64_t now = daos_gettime_coarse(); + + /* Report warnings at most once per 10 seconds to avoid flooding the log.
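sc_ec_agg_busy_ts, used just below, is a coarse per-container log throttle: the warning burst is emitted only when more than 10 seconds have elapsed since the previous one, then the timestamp advances. A generic stand-alone sketch, with time() standing in for daos_gettime_coarse():

    #include <stdio.h>
    #include <time.h>

    struct throttle {
        time_t last_ts; // last time a warning burst was emitted
    };

    // Emit at most one warning burst per 'interval' seconds.
    static void warn_throttled(struct throttle *t, time_t interval, const char *msg)
    {
        time_t now = time(NULL);

        if (now - t->last_ts <= interval)
            return; // suppressed to avoid flooding the log
        fprintf(stderr, "WARN: %s\n", msg);
        t->last_ts = now;
    }

    int main(void)
    {
        struct throttle t = { 0 };

        warn_throttled(&t, 10, "EC aggregate hit non-committed DTX"); // printed
        warn_throttled(&t, 10, "EC aggregate hit non-committed DTX"); // suppressed
        return 0;
    }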
*/ + if (now - cont->sc_ec_agg_busy_ts > 10) { + while ((dsp = d_list_pop_entry(&dth->dth_share_tbd_list, + struct dtx_share_peer, dsp_link)) != NULL) { + D_WARN(DF_CONT ": EC aggregate hit non-committed DTX " DF_DTI "\n", + DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), + DP_DTI(&dsp->dsp_xid)); + dtx_dsp_free(dsp); + } + + cont->sc_ec_agg_busy_ts = now; + } + } + + dtx_end(dth, cont, rc); /* Post_cb may not being executed in some cases */ agg_clear_extents(&ec_agg_param->ap_agg_entry); diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index a51682b4785..4762e04b898 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -162,7 +162,7 @@ obj_rw_complete(crt_rpc_t *rpc, struct obj_io_context *ioc, } static void -obj_rw_reply(crt_rpc_t *rpc, int status, uint64_t epoch, +obj_rw_reply(crt_rpc_t *rpc, int status, uint64_t epoch, bool release_input, struct obj_io_context *ioc) { struct obj_rw_out *orwo = crt_reply_get(rpc); @@ -187,7 +187,11 @@ obj_rw_reply(crt_rpc_t *rpc, int status, uint64_t epoch, ioc->ioc_map_ver, orwo->orw_epoch, status); if (!ioc->ioc_lost_reply) { - rc = crt_reply_send(rpc); + if (release_input) + rc = crt_reply_send_input_free(rpc); + else + rc = crt_reply_send(rpc); + if (rc != 0) D_ERROR("send reply failed: "DF_RC"\n", DP_RC(rc)); } else { @@ -2079,6 +2083,8 @@ obj_ioc_init(uuid_t pool_uuid, uuid_t coh_uuid, uuid_t cont_uuid, crt_rpc_t *rpc D_ASSERT(ioc != NULL); memset(ioc, 0, sizeof(*ioc)); + + crt_req_addref(rpc); ioc->ioc_rpc = rpc; ioc->ioc_opc = opc_get(rpc->cr_opc); rc = ds_cont_find_hdl(pool_uuid, coh_uuid, &coh); @@ -2154,6 +2160,10 @@ obj_ioc_fini(struct obj_io_context *ioc, int err) ds_cont_child_put(ioc->ioc_coc); ioc->ioc_coc = NULL; } + if (ioc->ioc_rpc) { + crt_req_decref(ioc->ioc_rpc); + ioc->ioc_rpc = NULL; + } } /* Setup lite IO context, it is only for compound RPC so far: @@ -2508,7 +2518,7 @@ ds_obj_ec_rep_handler(crt_rpc_t *rpc) rc = vos_obj_array_remove(ioc.ioc_coc->sc_hdl, oer->er_oid, &oer->er_epoch_range, dkey, &iod->iod_name, &recx); out: - obj_rw_reply(rpc, rc, 0, &ioc); + obj_rw_reply(rpc, rc, 0, false, &ioc); obj_ioc_end(&ioc, rc); } @@ -2604,7 +2614,7 @@ ds_obj_ec_agg_handler(crt_rpc_t *rpc) D_ERROR(DF_UOID ": array_remove failed: " DF_RC "\n", DP_UOID(oea->ea_oid), DP_RC(rc1)); out: - obj_rw_reply(rpc, rc, 0, &ioc); + obj_rw_reply(rpc, rc, 0, false, &ioc); obj_ioc_end(&ioc, rc); } @@ -2733,7 +2743,7 @@ ds_obj_tgt_update_handler(crt_rpc_t *rpc) out: if (dth != NULL) rc = dtx_end(dth, ioc.ioc_coc, rc); - obj_rw_reply(rpc, rc, 0, &ioc); + obj_rw_reply(rpc, rc, 0, true, &ioc); D_FREE(mbs); obj_ioc_end(&ioc, rc); } @@ -2943,8 +2953,11 @@ ds_obj_rw_handler(crt_rpc_t *rpc) d_tm_inc_counter(opm->opm_update_resent, 1); -again1: - e = 0; +again: + if (flags & ORF_RESEND) + e = orw->orw_epoch; + else + e = 0; rc = dtx_handle_resend(ioc.ioc_vos_coh, &orw->orw_dti, &e, &version); switch (rc) { @@ -2955,8 +2968,13 @@ ds_obj_rw_handler(crt_rpc_t *rpc) orw->orw_epoch = e; /* TODO: Also recover the epoch uncertainty. 
*/ break; + case -DER_MISMATCH: + rc = vos_dtx_abort(ioc.ioc_vos_coh, &orw->orw_dti, e); + if (rc < 0 && rc != -DER_NONEXIST) + D_GOTO(out, rc); + /* Fall through */ case -DER_NONEXIST: - rc = 0; + flags = 0; break; default: D_GOTO(out, rc); @@ -2966,7 +2984,6 @@ ds_obj_rw_handler(crt_rpc_t *rpc) D_GOTO(out, rc); } -again2: /* For leader case, we need to find out the potential conflict * (or share the same non-committed object/dkey) DTX(s) in the * CoS (committable) cache, piggyback them via the dispdatched @@ -3011,7 +3028,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc) exec_arg.rpc = rpc; exec_arg.ioc = &ioc; - exec_arg.flags = flags; + exec_arg.flags |= flags; exec_arg.start = orw->orw_start_shard; /* Execute the operation on all targets */ @@ -3026,28 +3043,25 @@ ds_obj_rw_handler(crt_rpc_t *rpc) case -DER_TX_RESTART: /* * If this is a standalone operation, we can restart the - * internal transaction right here. Otherwise, we have to defer - * the restart to the RPC client. + * internal transaction right here. Otherwise we have to + * defer the restart to the RPC sponsor. */ - if (opc == DAOS_OBJ_RPC_UPDATE) { - /* - * Only standalone updates use this RPC. Retry with - * newer epoch. - */ - orw->orw_epoch = d_hlc_get(); - orw->orw_flags &= ~ORF_RESEND; - flags = 0; - d_tm_inc_counter(opm->opm_update_restart, 1); - goto again2; - } + if (opc != DAOS_OBJ_RPC_UPDATE) + break; - break; + /* Only standalone updates use this RPC. Retry with newer epoch. */ + orw->orw_epoch = d_hlc_get(); + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; + d_tm_inc_counter(opm->opm_update_restart, 1); + goto again; case -DER_AGAIN: - orw->orw_flags |= ORF_RESEND; need_abort = true; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; d_tm_inc_counter(opm->opm_update_retry, 1); ABT_thread_yield(); - goto again1; + goto again; default: break; } @@ -3073,7 +3087,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc) if (ioc.ioc_map_ver < max_ver) ioc.ioc_map_ver = max_ver; - obj_rw_reply(rpc, rc, epoch.oe_value, &ioc); + obj_rw_reply(rpc, rc, epoch.oe_value, false, &ioc); D_FREE(mbs); D_FREE(dti_cos); obj_ioc_end(&ioc, rc); @@ -3865,8 +3879,11 @@ ds_obj_punch_handler(crt_rpc_t *rpc) if (opi->opi_flags & ORF_RESEND) { daos_epoch_t e; -again1: - e = 0; +again: + if (flags & ORF_RESEND) + e = opi->opi_epoch; + else + e = 0; rc = dtx_handle_resend(ioc.ioc_vos_coh, &opi->opi_dti, &e, &version); switch (rc) { @@ -3877,8 +3894,13 @@ ds_obj_punch_handler(crt_rpc_t *rpc) flags |= ORF_RESEND; /* TODO: Also recovery the epoch uncertainty. */ break; + case -DER_MISMATCH: + rc = vos_dtx_abort(ioc.ioc_vos_coh, &opi->opi_dti, e); + if (rc < 0 && rc != -DER_NONEXIST) + D_GOTO(out, rc); + /* Fall through */ case -DER_NONEXIST: - rc = 0; + flags = 0; break; default: D_GOTO(out, rc); @@ -3888,7 +3910,6 @@ ds_obj_punch_handler(crt_rpc_t *rpc) goto cleanup; } -again2: /* For leader case, we need to find out the potential conflict * (or share the same non-committed object/dkey) DTX(s) in the * CoS (committable) cache, piggyback them via the dispdatched @@ -3933,7 +3954,7 @@ ds_obj_punch_handler(crt_rpc_t *rpc) exec_arg.rpc = rpc; exec_arg.ioc = &ioc; - exec_arg.flags = flags; + exec_arg.flags |= flags; /* Execute the operation on all shards */ if (opi->opi_api_flags & DAOS_COND_PUNCH) @@ -3949,19 +3970,17 @@ ds_obj_punch_handler(crt_rpc_t *rpc) rc = dtx_leader_end(dlh, ioc.ioc_coh, rc); switch (rc) { case -DER_TX_RESTART: - /* - * Only standalone punches use this RPC. Retry with newer - * epoch. - */ + /* Only standalone punches use this RPC. 
Retry with newer epoch. */ opi->opi_epoch = d_hlc_get(); - opi->opi_flags &= ~ORF_RESEND; - flags = 0; - goto again2; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; + goto again; case -DER_AGAIN: - opi->opi_flags |= ORF_RESEND; need_abort = true; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; ABT_thread_yield(); - goto again1; + goto again; default: break; } @@ -5605,12 +5624,12 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) D_DEBUG(DB_IO, "(%s) handling collective punch RPC %p for obj " DF_UOID" on XS %u/%u epc "DF_X64" pmv %u, with dti " - DF_DTI", forward width %u, forward depth %u\n", + DF_DTI", forward width %u, forward depth %u, flags %x\n", (ocpi->ocpi_flags & ORF_LEADER) ? "leader" : (ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"), rpc, DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, ocpi->ocpi_epoch, ocpi->ocpi_map_ver, DP_DTI(&ocpi->ocpi_xid), - ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth); + ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth, ocpi->ocpi_flags); D_ASSERT(dmi->dmi_xs_id != 0); @@ -5653,8 +5672,11 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) if (ocpi->ocpi_flags & ORF_RESEND) { -again1: - tmp = 0; +again: + if (!(ocpi->ocpi_flags & ORF_LEADER) || (flags & ORF_RESEND)) + tmp = ocpi->ocpi_epoch; + else + tmp = 0; rc = dtx_handle_resend(ioc.ioc_vos_coh, &ocpi->ocpi_xid, &tmp, &version); switch (rc) { case -DER_ALREADY: @@ -5664,7 +5686,13 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) flags |= ORF_RESEND; /* TODO: Also recovery the epoch uncertainty. */ break; + case -DER_MISMATCH: + rc = vos_dtx_abort(ioc.ioc_vos_coh, &ocpi->ocpi_xid, tmp); + if (rc < 0 && rc != -DER_NONEXIST) + D_GOTO(out, rc); + /* Fall through */ case -DER_NONEXIST: + flags = 0; break; default: D_GOTO(out, rc); @@ -5673,7 +5701,6 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) dce->dce_ver = version; } -again2: epoch.oe_value = ocpi->ocpi_epoch; epoch.oe_first = epoch.oe_value; epoch.oe_flags = orf_to_dtx_epoch_flags(ocpi->ocpi_flags); @@ -5685,7 +5712,7 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) exec_arg.rpc = rpc; exec_arg.ioc = &ioc; - exec_arg.flags = flags; + exec_arg.flags |= flags; exec_arg.coll_shards = dcts[0].dct_shards; exec_arg.coll_tgts = dcts; obj_coll_disp_init(dct_nr, ocpi->ocpi_max_tgt_sz, @@ -5718,14 +5745,15 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) switch (rc) { case -DER_TX_RESTART: ocpi->ocpi_epoch = d_hlc_get(); - ocpi->ocpi_flags &= ~ORF_RESEND; - flags = 0; - goto again2; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; + goto again; case -DER_AGAIN: - ocpi->ocpi_flags |= ORF_RESEND; need_abort = true; + exec_arg.flags |= ORF_RESEND; + flags = ORF_RESEND; ABT_thread_yield(); - goto again1; + goto again; default: break; } @@ -5745,15 +5773,17 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) max_ver = version; DL_CDEBUG(rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART, DLOG_ERR, DB_IO, rc, - "(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u epc " - DF_X64" pmv %u/%u, with dti "DF_DTI", bulk_tgt_sz %u, bulk_tgt_nr %u, " - "tgt_nr %u, forward width %u, forward depth %u", + "(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u in "DF_UUID"/" + DF_UUID"/"DF_UUID" with epc "DF_X64", pmv %u/%u, dti "DF_DTI", bulk_tgt_sz %u, " + "bulk_tgt_nr %u, tgt_nr %u, forward width %u, forward depth %u, flags %x", (ocpi->ocpi_flags & ORF_LEADER) ? "leader" : (ocpi->ocpi_tgts.ca_count == 1 ? 
"non-leader" : "relay-engine"), rpc, - DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, ocpi->ocpi_epoch, + DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, + DP_UUID(ocpi->ocpi_po_uuid), DP_UUID(ocpi->ocpi_co_hdl), + DP_UUID(ocpi->ocpi_co_uuid), ocpi->ocpi_epoch, ocpi->ocpi_map_ver, max_ver, DP_DTI(&ocpi->ocpi_xid), ocpi->ocpi_bulk_tgt_sz, ocpi->ocpi_bulk_tgt_nr, (unsigned int)ocpi->ocpi_tgts.ca_count, - ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth); + ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth, ocpi->ocpi_flags); obj_punch_complete(rpc, rc, max_ver); diff --git a/src/object/srv_obj_remote.c b/src/object/srv_obj_remote.c index ce06723621b..f64d851e5b4 100644 --- a/src/object/srv_obj_remote.c +++ b/src/object/srv_obj_remote.c @@ -136,7 +136,7 @@ ds_obj_remote_update(struct dtx_leader_handle *dlh, void *data, int idx, *orw = *orw_parent; orw->orw_oid.id_shard = shard_tgt->st_shard_id; - orw->orw_flags |= ORF_BULK_BIND | obj_exec_arg->flags; + orw->orw_flags |= (ORF_BULK_BIND | obj_exec_arg->flags) & ~ORF_LEADER; if (shard_tgt->st_flags & DTF_DELAY_FORWARD && dlh->dlh_drop_cond) orw->orw_api_flags &= ~DAOS_COND_MASK; orw->orw_dti_cos.ca_count = dth->dth_dti_cos_count; @@ -247,7 +247,7 @@ ds_obj_remote_punch(struct dtx_leader_handle *dlh, void *data, int idx, *opi = *opi_parent; opi->opi_oid.id_shard = shard_tgt->st_shard_id; - opi->opi_flags |= obj_exec_arg->flags; + opi->opi_flags |= obj_exec_arg->flags & ~ORF_LEADER; if (shard_tgt->st_flags & DTF_DELAY_FORWARD && dlh->dlh_drop_cond) opi->opi_api_flags &= ~DAOS_COND_PUNCH; opi->opi_dti_cos.ca_count = dth->dth_dti_cos_count; @@ -495,7 +495,7 @@ ds_obj_coll_punch_remote(struct dtx_leader_handle *dlh, void *data, int idx, crt_endpoint_t tgt_ep = { 0 }; crt_rpc_t *parent_req = exec_arg->rpc; crt_rpc_t *req; - struct obj_coll_punch_in *ocpi_parent; + struct obj_coll_punch_in *ocpi_parent = crt_req_get(parent_req); struct obj_coll_punch_in *ocpi; int tag; int rc = 0; @@ -509,7 +509,7 @@ ds_obj_coll_punch_remote(struct dtx_leader_handle *dlh, void *data, int idx, if (remote_arg == NULL) D_GOTO(out, rc = -DER_NOMEM); - obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep); + obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep, ocpi_parent->ocpi_oid.id_pub); tag = tgt_ep.ep_tag; crt_req_addref(parent_req); @@ -524,9 +524,7 @@ ds_obj_coll_punch_remote(struct dtx_leader_handle *dlh, void *data, int idx, D_GOTO(out, rc); } - ocpi_parent = crt_req_get(parent_req); ocpi = crt_req_get(req); - ocpi->ocpi_odm = ocpi_parent->ocpi_odm; uuid_copy(ocpi->ocpi_po_uuid, ocpi_parent->ocpi_po_uuid); uuid_copy(ocpi->ocpi_co_hdl, ocpi_parent->ocpi_co_hdl); @@ -634,7 +632,7 @@ ds_obj_coll_query_remote(struct dtx_leader_handle *dlh, void *data, int idx, if (remote_arg == NULL) D_GOTO(out, rc = -DER_NOMEM); - obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep); + obj_coll_disp_dest(cursor, exec_arg->coll_tgts, &tgt_ep, ocqi_parent->ocqi_oid.id_pub); tag = tgt_ep.ep_tag; remote_arg->dlh = dlh; diff --git a/src/pool/srv_internal.h b/src/pool/srv_internal.h index 8f864c8c11a..612f7760fd1 100644 --- a/src/pool/srv_internal.h +++ b/src/pool/srv_internal.h @@ -57,6 +57,12 @@ pool_tls_get() return tls; } +static inline bool +ds_pool_skip_for_check(struct ds_pool *pool) +{ + return engine_in_check() && !pool->sp_cr_checked; +} + struct pool_iv_map { d_rank_t piv_master_rank; uint32_t piv_pool_map_ver; diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c index 667e4bc6ed6..6e3a01379fa 100644 --- a/src/pool/srv_pool.c +++ b/src/pool/srv_pool.c @@ 
-62,7 +62,6 @@ ds_pool_get_vos_df_version(uint32_t pool_global_version) /* Pool service crt event */ struct pool_svc_event { - d_list_t psv_link; d_rank_t psv_rank; uint64_t psv_incarnation; enum crt_event_source psv_src; @@ -72,15 +71,40 @@ struct pool_svc_event { #define DF_PS_EVENT "rank=%u inc="DF_U64" src=%d type=%d" #define DP_PS_EVENT(e) e->psv_rank, e->psv_incarnation, e->psv_src, e->psv_type -#define RECHOOSE_SLEEP_MS 250 +/* + * Pool service crt event set + * + * This stores an unordered array of pool_svc_event objects. For all different + * i and j, we have pss_buf[i].psv_rank != pss_buf[j].psv_rank. + * + * An event set facilitates the merging of a sequence of events. For instance, + * sequence (in the format <rank, type>) + * <3, D>, <5, D>, <1, D>, <5, A>, <1, A>, <1, D> + * will merge into set + * <3, D>, <5, A>, <1, D> + * (that is, during the merge, an event overrides a previous event of the same + * rank in the set). + */ +struct pool_svc_event_set { + struct pool_svc_event *pss_buf; + uint32_t pss_len; + uint32_t pss_cap; +}; + +#define DF_PS_EVENT_SET "len=%u" +#define DP_PS_EVENT_SET(s) s->pss_len /* Pool service crt-event-handling state */ struct pool_svc_events { - ABT_mutex pse_mutex; - ABT_cond pse_cv; - d_list_t pse_queue; - ABT_thread pse_handler; - bool pse_stop; + ABT_mutex pse_mutex; + ABT_cond pse_cv; + struct pool_svc_event_set *pse_pending; + uint64_t pse_timeout; /* s */ + uint64_t pse_time; /* s */ + struct sched_request *pse_timer; + ABT_thread pse_handler; + bool pse_stop; + bool pse_paused; }; /* Pool service schedule state */ @@ -1162,6 +1186,15 @@ pool_svc_locate_cb(d_iov_t *id, char **path) return 0; } +static unsigned int +get_crt_event_delay(void) +{ + unsigned int t = 10 /* s */; + + d_getenv_uint("CRT_EVENT_DELAY", &t); + return t; +} + static int pool_svc_alloc_cb(d_iov_t *id, struct ds_rsvc **rsvc) { @@ -1182,7 +1215,7 @@ pool_svc_alloc_cb(d_iov_t *id, struct ds_rsvc **rsvc) d_iov_set(&svc->ps_rsvc.s_id, svc->ps_uuid, sizeof(uuid_t)); uuid_copy(svc->ps_uuid, id->iov_buf); - D_INIT_LIST_HEAD(&svc->ps_events.pse_queue); + svc->ps_events.pse_timeout = get_crt_event_delay(); svc->ps_events.pse_handler = ABT_THREAD_NULL; svc->ps_svc_rf = -1; svc->ps_force_notify = false; @@ -1300,98 +1333,221 @@ ds_pool_enable_exclude(void) pool_disable_exclude = false; } +static int +alloc_event_set(struct pool_svc_event_set **event_set) +{ + D_ALLOC_PTR(*event_set); + if (*event_set == NULL) + return -DER_NOMEM; + return 0; +} + +static void +free_event_set(struct pool_svc_event_set **event_set) +{ + D_FREE((*event_set)->pss_buf); + D_FREE(*event_set); +} + +static int +add_to_event_set(struct pool_svc_event_set *event_set, d_rank_t rank, uint64_t incarnation, + enum crt_event_source src, enum crt_event_type type) +{ + int i; + + /* Find rank in event_set. */ + for (i = 0; i < event_set->pss_len; i++) + if (event_set->pss_buf[i].psv_rank == rank) + break; + + /* If not found, prepare to add a new event.
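add_to_event_set(), continued below, gives the last-writer-wins-per-rank semantics described in the struct comment above, backed by a doubling array. A compact stand-alone model that reproduces the documented example (D = dead, A = alive):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct ev { uint32_t rank; char type; };

    struct ev_set { struct ev *buf; uint32_t len, cap; };

    // Insert or override: at most one event per rank (the last one wins).
    static int ev_set_add(struct ev_set *s, uint32_t rank, char type)
    {
        uint32_t i;

        for (i = 0; i < s->len; i++)
            if (s->buf[i].rank == rank)
                break;
        if (i == s->len) {
            if (s->len == s->cap) {
                uint32_t   cap = s->cap == 0 ? 1 : 2 * s->cap;
                struct ev *buf = realloc(s->buf, cap * sizeof(*buf));

                if (buf == NULL)
                    return -1;
                s->buf = buf;
                s->cap = cap;
            }
            s->len++;
        }
        s->buf[i] = (struct ev){ rank, type };
        return 0;
    }

    int main(void)
    {
        struct ev_set s = { 0 };
        struct ev seq[] = { {3,'D'}, {5,'D'}, {1,'D'}, {5,'A'}, {1,'A'}, {1,'D'} };

        for (unsigned i = 0; i < sizeof(seq) / sizeof(seq[0]); i++)
            ev_set_add(&s, seq[i].rank, seq[i].type);
        for (unsigned i = 0; i < s.len; i++)
            printf("<%u, %c> ", s.buf[i].rank, s.buf[i].type); // <3, D> <5, A> <1, D>
        printf("\n");
        free(s.buf);
        return 0;
    }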
*/ + if (i == event_set->pss_len) { + if (event_set->pss_len == event_set->pss_cap) { + uint32_t cap; + struct pool_svc_event *buf; + + if (event_set->pss_cap == 0) + cap = 1; + else + cap = 2 * event_set->pss_cap; + D_REALLOC_ARRAY(buf, event_set->pss_buf, event_set->pss_cap, cap); + if (buf == NULL) + return -DER_NOMEM; + event_set->pss_buf = buf; + event_set->pss_cap = cap; + } + event_set->pss_len++; + } + + event_set->pss_buf[i].psv_rank = rank; + event_set->pss_buf[i].psv_incarnation = incarnation; + event_set->pss_buf[i].psv_src = src; + event_set->pss_buf[i].psv_type = type; + return 0; +} + +/* Merge next into prev. */ +static int +merge_event_sets(struct pool_svc_event_set *prev, struct pool_svc_event_set *next) +{ + int i; + + for (i = 0; i < next->pss_len; i++) { + struct pool_svc_event *event = &next->pss_buf[i]; + int rc; + + rc = add_to_event_set(prev, event->psv_rank, event->psv_incarnation, event->psv_src, + event->psv_type); + if (rc != 0) + return rc; + } + return 0; +} + static int queue_event(struct pool_svc *svc, d_rank_t rank, uint64_t incarnation, enum crt_event_source src, enum crt_event_type type) { struct pool_svc_events *events = &svc->ps_events; - struct pool_svc_event *event; + int rc; + bool allocated = false; - D_ALLOC_PTR(event); - if (event == NULL) - return -DER_NOMEM; + D_DEBUG(DB_MD, DF_UUID ": queuing event: " DF_PS_EVENT "\n", DP_UUID(svc->ps_uuid), rank, + incarnation, src, type); - event->psv_rank = rank; - event->psv_incarnation = incarnation; - event->psv_src = src; - event->psv_type = type; + ABT_mutex_lock(events->pse_mutex); - D_DEBUG(DB_MD, DF_UUID": queuing event: "DF_PS_EVENT"\n", DP_UUID(svc->ps_uuid), - DP_PS_EVENT(event)); + if (events->pse_pending == NULL) { + rc = alloc_event_set(&events->pse_pending); + if (rc != 0) + goto out; + allocated = true; + } + + rc = add_to_event_set(events->pse_pending, rank, incarnation, src, type); + if (rc != 0) + goto out; + + events->pse_time = daos_gettime_coarse(); + + if (events->pse_paused) { + D_DEBUG(DB_MD, DF_UUID ": resuming event handling\n", DP_UUID(svc->ps_uuid)); + events->pse_paused = false; + } - ABT_mutex_lock(events->pse_mutex); - d_list_add_tail(&event->psv_link, &events->pse_queue); ABT_cond_broadcast(events->pse_cv); + +out: + if (rc != 0 && allocated) + free_event_set(&events->pse_pending); ABT_mutex_unlock(events->pse_mutex); - return 0; + return rc; } static void -discard_events(d_list_t *queue) +resume_event_handling(struct pool_svc *svc) { - struct pool_svc_event *event; - struct pool_svc_event *tmp; + struct pool_svc_events *events = &svc->ps_events; - d_list_for_each_entry_safe(event, tmp, queue, psv_link) { - D_DEBUG(DB_MD, "discard event: "DF_PS_EVENT"\n", DP_PS_EVENT(event)); - d_list_del_init(&event->psv_link); - D_FREE(event); + ABT_mutex_lock(events->pse_mutex); + if (events->pse_paused) { + D_DEBUG(DB_MD, DF_UUID ": resuming event handling\n", DP_UUID(svc->ps_uuid)); + events->pse_paused = false; + ABT_cond_broadcast(events->pse_cv); } + ABT_mutex_unlock(events->pse_mutex); } -static int pool_svc_exclude_rank(struct pool_svc *svc, d_rank_t rank); +static int pool_svc_exclude_ranks(struct pool_svc *svc, struct pool_svc_event_set *event_set); -static void -handle_event(struct pool_svc *svc, struct pool_svc_event *event) +static int +handle_event(struct pool_svc *svc, struct pool_svc_event_set *event_set) { + int i; int rc; - if ((event->psv_src != CRT_EVS_GRPMOD && event->psv_src != CRT_EVS_SWIM) || - (event->psv_type == CRT_EVT_DEAD && pool_disable_exclude)) { - 
D_DEBUG(DB_MD, "ignore event: "DF_PS_EVENT" exclude=%d\n", DP_PS_EVENT(event), - pool_disable_exclude); - goto out; + D_INFO(DF_UUID ": handling event set: " DF_PS_EVENT_SET "\n", DP_UUID(svc->ps_uuid), + DP_PS_EVENT_SET(event_set)); + + if (!pool_disable_exclude) { + rc = pool_svc_exclude_ranks(svc, event_set); + if (rc != 0) { + DL_ERROR(rc, DF_UUID ": failed to exclude ranks", DP_UUID(svc->ps_uuid)); + return rc; + } } - if (event->psv_rank == dss_self_rank() && event->psv_src == CRT_EVS_GRPMOD && - event->psv_type == CRT_EVT_DEAD) { - D_DEBUG(DB_MD, "ignore exclusion of self\n"); - goto out; + /* + * Check if the alive ranks are up in the pool map. If in the future we + * add automatic reintegration below, for instance, we may need + * to not only take svc->ps_lock, but also employ an RDB TX by + * the book. + */ + ABT_rwlock_rdlock(svc->ps_pool->sp_lock); + for (i = 0; i < event_set->pss_len; i++) { + struct pool_svc_event *event = &event_set->pss_buf[i]; + + if (event->psv_src != CRT_EVS_SWIM || event->psv_type != CRT_EVT_ALIVE) + continue; + if (ds_pool_map_rank_up(svc->ps_pool->sp_map, event->psv_rank)) { + /* + * The rank is up in the pool map. Request a pool map + * distribution just in case the rank has recently + * restarted and does not have a copy of the pool map. + */ + ds_rsvc_request_map_dist(&svc->ps_rsvc); + D_DEBUG(DB_MD, DF_UUID ": requested map dist for rank %u\n", + DP_UUID(svc->ps_uuid), event->psv_rank); + break; + } } + ABT_rwlock_unlock(svc->ps_pool->sp_lock); - D_INFO(DF_UUID": handling event: "DF_PS_EVENT"\n", DP_UUID(svc->ps_uuid), - DP_PS_EVENT(event)); + return 0; +} - if (event->psv_src == CRT_EVS_SWIM && event->psv_type == CRT_EVT_ALIVE) { - /* - * Check if the rank is up in the pool map. If in the future we - * add automatic reintegration below, for instance, we may need - * to not only take svc->ps_lock, but also employ an RDB TX by - * the book. - */ - ABT_rwlock_rdlock(svc->ps_pool->sp_lock); - rc = ds_pool_map_rank_up(svc->ps_pool->sp_map, event->psv_rank); - ABT_rwlock_unlock(svc->ps_pool->sp_lock); - if (!rc) - goto out; +struct event_timer_arg { + struct pool_svc_events *eta_events; + uint64_t eta_deadline; +}; - /* - * The rank is up in the pool map. Request a pool map - * distribution just in case the rank has recently restarted - * and does not have a copy of the pool map. 
- */ - ds_rsvc_request_map_dist(&svc->ps_rsvc); - D_DEBUG(DB_MD, DF_UUID": requested map dist for rank %u\n", - DP_UUID(svc->ps_uuid), event->psv_rank); - } else if (event->psv_type == CRT_EVT_DEAD) { - rc = pool_svc_exclude_rank(svc, event->psv_rank); - if (rc != 0) - D_ERROR(DF_UUID": failed to exclude rank %u: "DF_RC"\n", - DP_UUID(svc->ps_uuid), event->psv_rank, DP_RC(rc)); - } +static void +event_timer(void *varg) +{ + struct event_timer_arg *arg = varg; + struct pool_svc_events *events = arg->eta_events; + int64_t time_left = arg->eta_deadline - daos_gettime_coarse(); -out: - return; + if (time_left > 0) + sched_req_sleep(events->pse_timer, time_left * 1000); + ABT_cond_broadcast(events->pse_cv); +} + +static int +start_event_timer(struct event_timer_arg *arg) +{ + struct pool_svc_events *events = arg->eta_events; + uuid_t uuid; + struct sched_req_attr attr; + + D_ASSERT(events->pse_timer == NULL); + uuid_clear(uuid); + sched_req_attr_init(&attr, SCHED_REQ_ANONYM, &uuid); + events->pse_timer = sched_create_ult(&attr, event_timer, arg, 0); + if (events->pse_timer == NULL) + return -DER_NOMEM; + return 0; +} + +static void +stop_event_timer(struct event_timer_arg *arg) +{ + struct pool_svc_events *events = arg->eta_events; + + D_ASSERT(events->pse_timer != NULL); + sched_req_wait(events->pse_timer, true /* abort */); + sched_req_put(events->pse_timer); + events->pse_timer = NULL; } static void @@ -1403,31 +1559,83 @@ events_handler(void *arg) D_DEBUG(DB_MD, DF_UUID": starting\n", DP_UUID(svc->ps_uuid)); for (;;) { - struct pool_svc_event *event = NULL; - bool stop; + struct pool_svc_event_set *event_set = NULL; + bool stop; + int rc; ABT_mutex_lock(events->pse_mutex); for (;;) { + struct event_timer_arg timer_arg; + int64_t time_left; + stop = events->pse_stop; if (stop) { - discard_events(&events->pse_queue); + events->pse_paused = false; + if (events->pse_pending != NULL) + free_event_set(&events->pse_pending); break; } - if (!d_list_empty(&events->pse_queue)) { - event = d_list_entry(events->pse_queue.next, struct pool_svc_event, - psv_link); - d_list_del_init(&event->psv_link); + + timer_arg.eta_events = events; + timer_arg.eta_deadline = events->pse_time + events->pse_timeout; + + time_left = timer_arg.eta_deadline - daos_gettime_coarse(); + if (events->pse_pending != NULL && !events->pse_paused && time_left <= 0) { + event_set = events->pse_pending; + events->pse_pending = NULL; break; } + + /* A simple timed cond_wait without polling. */ + if (time_left > 0) { + rc = start_event_timer(&timer_arg); + if (rc != 0) { + /* No delay then. */ + DL_ERROR(rc, DF_UUID ": failed to start event timer", + DP_UUID(svc->ps_uuid)); + events->pse_time = 0; + continue; + } + } sched_cond_wait(events->pse_cv, events->pse_mutex); + if (time_left > 0) + stop_event_timer(&timer_arg); } ABT_mutex_unlock(events->pse_mutex); if (stop) break; - handle_event(svc, event); + rc = handle_event(svc, event_set); + if (rc != 0) { + /* Put event_set back to events->pse_pending. */ + D_DEBUG(DB_MD, DF_UUID ": returning event set\n", DP_UUID(svc->ps_uuid)); + ABT_mutex_lock(events->pse_mutex); + if (events->pse_pending == NULL) { + /* + * No pending events; pause the handling until + * next event or pool map change. + */ + D_DEBUG(DB_MD, DF_UUID ": pausing event handling\n", + DP_UUID(svc->ps_uuid)); + events->pse_paused = true; + } else { + /* + * There are pending events; do not pause the + * handling. 
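On the timer machinery above: since sched_cond_wait() takes no timeout, the patch composes a timed wait from parts, a helper ULT that sleeps until the deadline and then broadcasts the CV, so the handler wakes either on a new event or on deadline expiry, without polling. A portable pthread sketch of the same shape (in plain POSIX one would normally just use pthread_cond_timedwait; compile with -pthread):

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cv  = PTHREAD_COND_INITIALIZER;
    static int             fired;

    // Timer thread: sleep until the deadline, then wake the waiter.
    static void *timer_fn(void *arg)
    {
        sleep(*(unsigned *)arg);
        pthread_mutex_lock(&mtx);
        fired = 1;
        pthread_cond_broadcast(&cv);
        pthread_mutex_unlock(&mtx);
        return NULL;
    }

    int main(void)
    {
        pthread_t tid;
        unsigned  delay = 1; // s

        pthread_create(&tid, NULL, timer_fn, &delay);
        pthread_mutex_lock(&mtx);
        while (!fired) // would also wake early on a newly queued event
            pthread_cond_wait(&cv, &mtx);
        pthread_mutex_unlock(&mtx);
        pthread_join(tid, NULL);
        printf("deadline reached\n");
        return 0;
    }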
+ */ + rc = merge_event_sets(event_set, events->pse_pending); + if (rc != 0) + DL_ERROR(rc, DF_UUID ": failed to merge events", + DP_UUID(svc->ps_uuid)); + free_event_set(&events->pse_pending); + } + events->pse_pending = event_set; + event_set = NULL; + ABT_mutex_unlock(events->pse_mutex); + } - D_FREE(event); + if (event_set != NULL) + free_event_set(&event_set); ABT_thread_yield(); } @@ -1437,7 +1645,7 @@ events_handler(void *arg) static bool events_pending(struct pool_svc *svc) { - return !d_list_empty(&svc->ps_events.pse_queue); + return svc->ps_events.pse_pending != NULL; } static void @@ -1461,11 +1669,13 @@ init_events(struct pool_svc *svc) struct pool_svc_events *events = &svc->ps_events; int rc; - D_ASSERT(d_list_empty(&events->pse_queue)); + D_ASSERT(events->pse_pending == NULL); + D_ASSERT(events->pse_timer == NULL); D_ASSERT(events->pse_handler == ABT_THREAD_NULL); - D_ASSERT(events->pse_stop == false); + D_ASSERT(!events->pse_stop); + D_ASSERT(!events->pse_paused); - if (!ds_pool_skip_for_check(svc->ps_pool)) { + if (!ds_pool_restricted(svc->ps_pool, false)) { rc = crt_register_event_cb(ds_pool_crt_event_cb, svc); if (rc != 0) { D_ERROR(DF_UUID": failed to register event callback: "DF_RC"\n", @@ -1496,9 +1706,10 @@ init_events(struct pool_svc *svc) return 0; err_cb: - if (!ds_pool_skip_for_check(svc->ps_pool)) + if (!ds_pool_restricted(svc->ps_pool, false)) crt_unregister_event_cb(ds_pool_crt_event_cb, svc); - discard_events(&events->pse_queue); + if (events->pse_pending != NULL) + free_event_set(&events->pse_pending); err: return rc; } @@ -1507,11 +1718,10 @@ static void fini_events(struct pool_svc *svc) { struct pool_svc_events *events = &svc->ps_events; - int rc; D_ASSERT(events->pse_handler != ABT_THREAD_NULL); - if (!ds_pool_skip_for_check(svc->ps_pool)) + if (!ds_pool_restricted(svc->ps_pool, false)) crt_unregister_event_cb(ds_pool_crt_event_cb, svc); ABT_mutex_lock(events->pse_mutex); @@ -1519,8 +1729,6 @@ fini_events(struct pool_svc *svc) ABT_cond_broadcast(events->pse_cv); ABT_mutex_unlock(events->pse_mutex); - rc = ABT_thread_join(events->pse_handler); - D_ASSERTF(rc == 0, DF_RC"\n", DP_RC(rc)); ABT_thread_free(&events->pse_handler); events->pse_handler = ABT_THREAD_NULL; events->pse_stop = false; @@ -2357,6 +2565,11 @@ int ds_pool_failed_lookup(uuid_t uuid) return 0; } +struct pool_start_args { + bool psa_aft_chk; + bool psa_immutable; +}; + /* * Try to start the pool. Continue the iteration upon errors as other pools may * still be able to work. @@ -2364,13 +2577,26 @@ int ds_pool_failed_lookup(uuid_t uuid) static int start_one(uuid_t uuid, void *varg) { - int rc; + struct pool_start_args *psa = varg; + bool aft_chk; + bool immutable; + int rc; + + if (psa != NULL) { + aft_chk = psa->psa_aft_chk; + immutable = psa->psa_immutable; + } else { + aft_chk = false; + immutable = false; + } - D_DEBUG(DB_MD, DF_UUID ": starting pool\n", DP_UUID(uuid)); + D_DEBUG(DB_MD, DF_UUID ": starting pool, aft_chk %s, immutable %s\n", + DP_UUID(uuid), aft_chk ? "yes" : "no", immutable ? "yes" : "no"); - rc = ds_pool_start(uuid, varg != NULL ? true : false); + rc = ds_pool_start(uuid, aft_chk, immutable); if (rc != 0) { - DL_ERROR(rc, DF_UUID ": failed to start pool", DP_UUID(uuid)); + DL_ERROR(rc, DF_UUID ": failed to start pool, aft_chk %s, immutable %s", + DP_UUID(uuid), aft_chk ? "yes" : "no", immutable ? 
"yes" : "no"); ds_pool_failed_add(uuid, rc); } @@ -2389,12 +2615,27 @@ pool_start_all(void *arg) DP_RC(rc)); } +bool +ds_pool_restricted(struct ds_pool *pool, bool immutable) +{ + if (ds_pool_skip_for_check(pool)) + return true; + + if (pool->sp_immutable && !immutable) + return true; + + return false; +} + int -ds_pool_start_after_check(uuid_t uuid) +ds_pool_start_after_check(uuid_t uuid, bool immutable) { - bool aft_chk = true; + struct pool_start_args psa; + + psa.psa_aft_chk = true; + psa.psa_immutable = immutable; - return start_one(uuid, &aft_chk); + return start_one(uuid, &psa); } /* Note that this function is currently called from the main xstream. */ @@ -3612,7 +3853,6 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version) &ds_pool_prop_connectable, &value); if (rc != 0) goto out_lock; - D_DEBUG(DB_MD, DF_UUID ": connectable=%u\n", DP_UUID(in->pci_op.pi_uuid), connectable); if (!connectable) { D_ERROR(DF_UUID": being destroyed, not accepting connections\n", DP_UUID(in->pci_op.pi_uuid)); @@ -3621,12 +3861,21 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version) /* * NOTE: Under check mode, there is a small race window between ds_pool_mark_connectable() - * the PS restart for full service. If some client tries to connect the pool during + * and PS restart with full service. If some client tries to connect the pool during * such internal, it will get -DER_BUSY temporarily. */ if (unlikely(ds_pool_skip_for_check(svc->ps_pool))) { - D_ERROR(DF_UUID" is not ready for full pool service\n", DP_UUID(in->pci_op.pi_uuid)); - D_GOTO(out_lock, rc = -DER_BUSY); + rc = -DER_BUSY; + D_ERROR(DF_UUID " is not ready for full pool service: " DF_RC "\n", + DP_UUID(in->pci_op.pi_uuid), DP_RC(rc)); + goto out_lock; + } + + if (svc->ps_pool->sp_immutable && flags != DAOS_PC_RO) { + rc = -DER_NO_PERM; + D_ERROR(DF_UUID " failed to connect immutable pool, flags " DF_X64 ": " DF_RC "\n", + DP_UUID(in->pci_op.pi_uuid), flags, DP_RC(rc)); + goto out_lock; } /* Check existing pool handles. */ @@ -3789,6 +4038,11 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version) } } + D_DEBUG(DB_MD, DF_UUID "/" DF_UUID ": connecting to %s pool with flags " + DF_X64", sec_capas " DF_X64 "\n", + DP_UUID(in->pci_op.pi_uuid), DP_UUID(in->pci_op.pi_hdl), + svc->ps_pool->sp_immutable ? "immutable" : "regular", flags, sec_capas); + rc = pool_connect_iv_dist(svc, in->pci_op.pi_hdl, flags, sec_capas, credp, global_ver, obj_layout_ver); if (rc == 0 && DAOS_FAIL_CHECK(DAOS_POOL_CONNECT_FAIL_CORPC)) { @@ -4332,7 +4586,7 @@ ds_pool_svc_list_cont(uuid_t uuid, d_rank_list_t *ranks, list_cont_bulk_destroy(bulk); D_FREE(resp_cont); crt_req_decref(rpc); - dss_sleep(RECHOOSE_SLEEP_MS); + dss_sleep(250); D_GOTO(rechoose, rc); } @@ -6314,7 +6568,7 @@ pool_svc_schedule(struct pool_svc *svc, struct pool_svc_sched *sched, void (*fun D_DEBUG(DB_MD, DF_UUID": begin\n", DP_UUID(svc->ps_uuid)); - if (ds_pool_skip_for_check(svc->ps_pool)) { + if (ds_pool_restricted(svc->ps_pool, false)) { D_DEBUG(DB_MD, DF_UUID": end: skip in check mode\n", DP_UUID(svc->ps_uuid)); return -DER_OP_CANCELED; } @@ -6501,7 +6755,7 @@ pool_svc_schedule_reconf(struct pool_svc *svc, struct pool_map *map, uint32_t ma } static int -pool_map_crit_prompt(struct pool_svc *svc, struct pool_map *map, d_rank_t rank) +pool_map_crit_prompt(struct pool_svc *svc, struct pool_map *map) { crt_group_t *primary_grp; struct pool_domain *doms; @@ -6517,13 +6771,10 @@ pool_map_crit_prompt(struct pool_svc *svc, struct pool_map *map, d_rank_t rank) D_CRIT("!!! 
Please try to recover these engines in top priority -\n"); D_CRIT("!!! Please refer \"Pool-Wise Redundancy Factor\" section in pool_operations.md\n"); - D_CRIT("!!! pool "DF_UUID": intolerable unavailability: engine rank %u\n", - DP_UUID(svc->ps_uuid), rank); for (i = 0; i < doms_cnt; i++) { struct swim_member_state state; - if (!(doms[i].do_comp.co_status & PO_COMP_ST_UPIN) || - (doms[i].do_comp.co_rank == rank)) + if (!(doms[i].do_comp.co_status & PO_COMP_ST_UPIN)) continue; rc = crt_rank_state_get(primary_grp, doms[i].do_comp.co_rank, &state); @@ -6701,8 +6952,7 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc, * with CRIT log message to ask administrator to bring back the engine. */ if (src == MUS_SWIM && opc == MAP_EXCLUDE) { - d_rank_t rank; - int failed_cnt; + int failed_cnt; rc = pool_map_update_failed_cnt(map); if (rc != 0) { @@ -6711,15 +6961,19 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc, goto out_map; } - D_ASSERT(tgt_addrs->pta_number == 1); - rank = tgt_addrs->pta_addrs->pta_rank; failed_cnt = pool_map_get_failed_cnt(map, PO_COMP_TP_NODE); - D_INFO(DF_UUID": SWIM exclude rank %d, failed NODE %d\n", - DP_UUID(svc->ps_uuid), rank, failed_cnt); + D_INFO(DF_UUID": SWIM exclude %d ranks, failed NODE %d\n", + DP_UUID(svc->ps_uuid), tgt_addrs->pta_number, failed_cnt); if (failed_cnt > pw_rf) { - D_CRIT(DF_UUID": exclude rank %d will break pw_rf %d, failed_cnt %d\n", - DP_UUID(svc->ps_uuid), rank, pw_rf, failed_cnt); - rc = pool_map_crit_prompt(svc, map, rank); + D_CRIT(DF_UUID": exclude %d ranks will break pool RF %d, failed_cnt %d\n", + DP_UUID(svc->ps_uuid), tgt_addrs->pta_number, pw_rf, failed_cnt); + ABT_rwlock_rdlock(svc->ps_pool->sp_lock); + rc = pool_map_crit_prompt(svc, svc->ps_pool->sp_map); + ABT_rwlock_unlock(svc->ps_pool->sp_lock); + if (rc != 0) + DL_ERROR(rc, DF_UUID ": failed to log prompt", + DP_UUID(svc->ps_uuid)); + rc = -DER_RF; goto out_map; } } @@ -6768,6 +7022,9 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc, ds_rsvc_request_map_dist(&svc->ps_rsvc); + /* See events_handler. 
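	 * Resume handling of any CaRT events that were deferred while this map
	 * update was in flight (a best-effort summary of the pse_pending /
	 * pse_paused logic there).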
*/ + resume_event_handling(svc); + rc = pool_svc_schedule_reconf(svc, NULL /* map */, map_version, false /* sync_remove */); if (rc != 0) { DL_INFO(rc, DF_UUID": failed to schedule pool service reconfiguration", @@ -7167,28 +7424,51 @@ ds_pool_update_handler_v5(crt_rpc_t *rpc) } static int -pool_svc_exclude_rank(struct pool_svc *svc, d_rank_t rank) +pool_svc_exclude_ranks(struct pool_svc *svc, struct pool_svc_event_set *event_set) { struct pool_target_addr_list list; struct pool_target_addr_list inval_list_out = { 0 }; - struct pool_target_addr tgt_rank; + struct pool_target_addr *addrs; + d_rank_t self_rank = dss_self_rank(); uint32_t map_version = 0; + int n = 0; + int i; int rc; - tgt_rank.pta_rank = rank; - tgt_rank.pta_target = -1; - list.pta_number = 1; - list.pta_addrs = &tgt_rank; + D_ALLOC_ARRAY(addrs, event_set->pss_len); + if (addrs == NULL) + return -DER_NOMEM; + for (i = 0; i < event_set->pss_len; i++) { + struct pool_svc_event *event = &event_set->pss_buf[i]; + + if (event->psv_type != CRT_EVT_DEAD) + continue; + if (event->psv_src == CRT_EVS_GRPMOD && event->psv_rank == self_rank) { + D_DEBUG(DB_MD, DF_UUID ": ignore exclusion of self\n", + DP_UUID(svc->ps_uuid)); + continue; + } + addrs[n].pta_rank = event->psv_rank; + addrs[n].pta_target = -1; + n++; + } + if (n == 0) { + rc = 0; + goto out; + } + list.pta_number = n; + list.pta_addrs = addrs; rc = pool_svc_update_map(svc, pool_opc_2map_opc(POOL_EXCLUDE), true /* exclude_rank */, NULL, NULL, 0, &list, &inval_list_out, &map_version, NULL /* hint */, MUS_SWIM); - D_DEBUG(DB_MD, "Exclude pool "DF_UUID"/%u rank %u: rc %d\n", - DP_UUID(svc->ps_uuid), map_version, rank, rc); + D_DEBUG(DB_MD, "Exclude pool "DF_UUID"/%u ranks %u: rc %d\n", + DP_UUID(svc->ps_uuid), map_version, n, rc); pool_target_addr_list_free(&inval_list_out); - +out: + D_FREE(addrs); return rc; } @@ -8527,9 +8807,3 @@ ds_pool_svc_upgrade_vos_pool(struct ds_pool *pool) ds_rsvc_put(rsvc); return rc; } - -bool -ds_pool_skip_for_check(struct ds_pool *pool) -{ - return engine_in_check() && !pool->sp_cr_checked; -} diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index cfa837e8b2a..113fb757fd9 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -505,7 +505,7 @@ pool_child_start(struct ds_pool_child *child, bool recreate) goto done; } - if (!ds_pool_skip_for_check(child->spc_pool)) { + if (!ds_pool_restricted(child->spc_pool, false)) { rc = start_gc_ult(child); if (rc != 0) goto out_close; @@ -1128,7 +1128,7 @@ pool_fetch_hdls_ult_abort(struct ds_pool *pool) * till ds_pool_stop. Only for mgmt and pool modules. 
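+ * The "aft_chk" flag starts the pool for after-check service; the "immutable"
+ * flag marks the pool read-only via sp_immutable (see ds_pool_restricted()),
+ * e.g. for a pool left in maintenance mode after a dry-run check.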
*/ int -ds_pool_start(uuid_t uuid, bool aft_chk) +ds_pool_start(uuid_t uuid, bool aft_chk, bool immutable) { struct ds_pool *pool; struct daos_llink *llink; @@ -1185,6 +1185,11 @@ ds_pool_start(uuid_t uuid, bool aft_chk) else pool->sp_cr_checked = 0; + if (immutable) + pool->sp_immutable = 1; + else + pool->sp_immutable = 0; + rc = pool_child_add_all(pool); if (rc != 0) goto failure_pool; @@ -1199,7 +1204,9 @@ ds_pool_start(uuid_t uuid, bool aft_chk) } pool->sp_fetch_hdls = 1; + } + if (!ds_pool_restricted(pool, false)) { rc = ds_pool_start_ec_eph_query_ult(pool); if (rc != 0) { D_ERROR(DF_UUID": failed to start ec eph query ult: "DF_RC"\n", @@ -1882,13 +1889,10 @@ ds_pool_tgt_map_update(struct ds_pool *pool, struct pool_buf *buf, DP_UUID(pool->sp_uuid), map_version_before, map_version); } - if (map_updated) { + if (map_updated && !ds_pool_restricted(pool, false)) { struct dtx_scan_args *arg; int ret; - if (ds_pool_skip_for_check(pool)) - D_GOTO(out, rc = 0); - D_ALLOC_PTR(arg); if (arg == NULL) D_GOTO(out, rc = -DER_NOMEM); diff --git a/src/proto/ctl/storage_nvme.proto b/src/proto/ctl/storage_nvme.proto index 944d8e943ba..be068e5274d 100644 --- a/src/proto/ctl/storage_nvme.proto +++ b/src/proto/ctl/storage_nvme.proto @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2023 Intel Corporation. +// (C) Copyright 2019-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -28,6 +28,7 @@ message ScanNvmeReq { bool Basic = 3; // Strip NVMe device details to only basic uint64 MetaSize = 4; // Size of the metadata blob uint64 RdbSize = 5; // Size of the RDB blob + bool LinkStats = 6; // Populate PCIe link info in health statistics } message ScanNvmeResp { diff --git a/src/rebuild/rebuild_iv.c b/src/rebuild/rebuild_iv.c index cc585e037e1..f4b9f75f407 100644 --- a/src/rebuild/rebuild_iv.c +++ b/src/rebuild/rebuild_iv.c @@ -167,6 +167,7 @@ rebuild_iv_ent_refresh(struct ds_iv_entry *entry, struct ds_iv_key *key, struct rebuild_tgt_pool_tracker *rpt; struct rebuild_iv *dst_iv = entry->iv_value.sg_iovs[0].iov_buf; struct rebuild_iv *src_iv = src->sg_iovs[0].iov_buf; + uint32_t old_ver; int rc = 0; rpt = rpt_lookup(src_iv->riv_pool_uuid, -1, src_iv->riv_ver, @@ -200,16 +201,18 @@ rebuild_iv_ent_refresh(struct ds_iv_entry *entry, struct ds_iv_key *key, dst_iv->riv_stable_epoch); rpt->rt_global_done = dst_iv->riv_global_done; rpt->rt_global_scan_done = dst_iv->riv_global_scan_done; - if (rpt->rt_global_dtx_resync_version < rpt->rt_rebuild_ver && + old_ver = rpt->rt_global_dtx_resync_version; + if (rpt->rt_global_dtx_resync_version < dst_iv->riv_global_dtx_resyc_version) + rpt->rt_global_dtx_resync_version = dst_iv->riv_global_dtx_resyc_version; + if (old_ver < rpt->rt_rebuild_ver && dst_iv->riv_global_dtx_resyc_version >= rpt->rt_rebuild_ver) { D_INFO(DF_UUID " global/iv/rebuild_ver %u/%u/%u signal wait cond\n", - DP_UUID(src_iv->riv_pool_uuid), rpt->rt_global_dtx_resync_version, + DP_UUID(src_iv->riv_pool_uuid), old_ver, dst_iv->riv_global_dtx_resyc_version, rpt->rt_rebuild_ver); ABT_mutex_lock(rpt->rt_lock); ABT_cond_signal(rpt->rt_global_dtx_wait_cond); ABT_mutex_unlock(rpt->rt_lock); } - rpt->rt_global_dtx_resync_version = dst_iv->riv_global_dtx_resyc_version; } out: diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index b1f722b8254..5c463af9e90 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -1941,7 +1941,7 @@ ds_rebuild_schedule(struct ds_pool *pool, uint32_t map_ver, return 0; } - if (ds_pool_skip_for_check(pool)) { + if (ds_pool_restricted(pool, false)) { 
D_DEBUG(DB_REBUILD, DF_UUID" skip rebuild under check mode\n",
			DP_UUID(pool->sp_uuid));
		return 0;
diff --git a/src/security/srv_acl.c b/src/security/srv_acl.c
index ad3c0107e3b..8f05216e971 100644
--- a/src/security/srv_acl.c
+++ b/src/security/srv_acl.c
@@ -1,5 +1,5 @@
 /*
- * (C) Copyright 2019-2023 Intel Corporation.
+ * (C) Copyright 2019-2024 Intel Corporation.
 *
 * SPDX-License-Identifier: BSD-2-Clause-Patent
 */
@@ -754,6 +754,12 @@ ds_sec_cont_can_evict_all(uint64_t cont_capas)
 	return (cont_capas & CONT_CAPA_EVICT_ALL) != 0;
 }
 
+bool
+ds_sec_cont_can_modify(uint64_t cont_capas)
+{
+	return (cont_capas & CONT_CAPAS_W_MASK) != 0;
+}
+
 uint64_t
 ds_sec_get_rebuild_cont_capabilities(void)
 {
diff --git a/src/tests/ftest/erasurecode/multiple_failure.yaml b/src/tests/ftest/erasurecode/multiple_failure.yaml
index 78f132474b5..95aab541329 100644
--- a/src/tests/ftest/erasurecode/multiple_failure.yaml
+++ b/src/tests/ftest/erasurecode/multiple_failure.yaml
@@ -25,6 +25,7 @@ server_config:
     storage: auto
 pool:
   size: 93%
+  set_logmasks: False
 container:
   type: POSIX
   control_method: daos
diff --git a/src/tests/ftest/erasurecode/rebuild_fio.yaml b/src/tests/ftest/erasurecode/rebuild_fio.yaml
index a895c356707..a3539d86579 100644
--- a/src/tests/ftest/erasurecode/rebuild_fio.yaml
+++ b/src/tests/ftest/erasurecode/rebuild_fio.yaml
@@ -39,6 +39,7 @@ pool:
   aggregation:
     threshold: 50000000
     aggr_timeout: 180
+  set_logmasks: False
 container:
   type: POSIX
   control_method: daos
diff --git a/src/tests/ftest/util/apricot/apricot/test.py b/src/tests/ftest/util/apricot/apricot/test.py
index 42e05937f37..563ff7adece 100644
--- a/src/tests/ftest/util/apricot/apricot/test.py
+++ b/src/tests/ftest/util/apricot/apricot/test.py
@@ -643,6 +643,7 @@ def __init__(self, *args, **kwargs):
         self.setup_start_agents = True
         self.slurm_exclude_servers = False
         self.slurm_exclude_nodes = NodeSet()
+        self.max_test_dir_usage_check = 90
         self.host_info = HostInfo()
         self.hostlist_servers = NodeSet()
         self.hostlist_clients = NodeSet()
@@ -693,6 +694,11 @@ def setUp(self):
         self.slurm_exclude_servers = self.params.get(
             "slurm_exclude_servers", "/run/setup/*", self.slurm_exclude_servers)
 
+        # Max test directory usage percentage - when exceeded, the sizes of the files in the
+        # test directory will be displayed
+        self.max_test_dir_usage_check = self.params.get(
+            "max_test_dir_usage_check", "/run/setup/*", self.max_test_dir_usage_check)
+
         # The server config name should be obtained from each ServerManager
         # object, but some tests still use this TestWithServers attribute.
self.server_group = self.params.get("name", "/run/server_config/*", "daos_server") @@ -765,12 +771,20 @@ def setUp(self): # List common test directory contents before running the test self.log.info("-" * 100) - self.log.debug("Common test directory (%s) contents:", os.path.dirname(self.test_dir)) + self.log.debug( + "Common test directory (%s) contents (check > %s%%):", + os.path.dirname(self.test_dir), self.max_test_dir_usage_check) all_hosts = include_local_host(self.host_info.all_hosts) test_dir_parent = os.path.dirname(self.test_dir) - result = run_remote(self.log, all_hosts, f"df -h {test_dir_parent}") - if int(max(re.findall(r" ([\d+])% ", result.joined_stdout) + ["0"])) > 90: - run_remote(self.log, all_hosts, f"du -sh {test_dir_parent}/*") + _result = run_remote(self.log, all_hosts, f"df -h {test_dir_parent}") + _details = NodeSet() + for _host, _stdout in _result.all_stdout.items(): + _test_dir_usage = re.findall(r"\s+([\d]+)%\s+", _stdout) + _test_dir_usage_int = int(max(_test_dir_usage + ["0"])) + if _test_dir_usage_int > self.max_test_dir_usage_check: + _details.add(_host) + if _details: + run_remote(self.log, _details, f"du -sh {test_dir_parent}/*") self.log.info("-" * 100) if not self.start_servers_once or self.name.uid == 1: diff --git a/src/tests/ftest/util/file_count_test_base.py b/src/tests/ftest/util/file_count_test_base.py index 12c66d76b8c..f95e22bd4bd 100644 --- a/src/tests/ftest/util/file_count_test_base.py +++ b/src/tests/ftest/util/file_count_test_base.py @@ -17,15 +17,15 @@ class FileCountTestBase(IorTestBase, MdtestBase): :avocado: recursive """ - def add_containers(self, file_oclass=None, dir_oclass=None): - """Create a list of containers that the various jobs use for storage. + def get_file_write_container(self, file_oclass=None, dir_oclass=None): + """Create a container, set oclass, dir_oclass, and add rd_fac property based on oclass. Args: - file_oclass (str, optional): file object class of container. - Defaults to None. - dir_oclass (str, optional): dir object class of container. - Defaults to None. + file_oclass (str, optional): file object class of container. Defaults to None. + dir_oclass (str, optional): dir object class of container. Defaults to None. + Returns: + TestContainer: Created container with oclass, dir_oclass, and rd_fac set. """ # Create a container and add it to the overall list of containers @@ -92,7 +92,7 @@ def run_file_count(self): rd_fac = extract_redundancy_factor(oclass) dir_oclass = self.get_diroclass(rd_fac) self.mdtest_cmd.dfs_dir_oclass.update(dir_oclass) - self.container = self.add_containers(oclass, dir_oclass) + self.container = self.get_file_write_container(oclass, dir_oclass) try: self.processes = mdtest_np self.ppn = mdtest_ppn @@ -111,14 +111,27 @@ def run_file_count(self): # run ior self.log.info("=======>>>Starting IOR with %s and %s", api, oclass) self.ior_cmd.dfs_oclass.update(oclass) - self.container = self.add_containers(oclass) + self.container = self.get_file_write_container(oclass) self.update_ior_cmd_with_pool(False) try: self.processes = ior_np self.ppn = ior_ppn if api == 'HDF5-VOL': + # Format the container properties so that it works with HDF5-VOL env var. + # Each entry:value pair needs to be separated by a semicolon. Since we're + # using this in the mpirun command, semicolon would indicate the end of the + # command, so quote the whole thing. 
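+                            # For example (a hypothetical properties string):
+                            #   "rd_fac:1,cksum:crc32" becomes '"rd_fac:1;cksum:crc32"'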
+ cont_props = self.container.properties.value + cont_props_hdf5_vol = '"' + cont_props.replace(",", ";") + '"' + self.log.info("cont_props_hdf5_vol = %s", cont_props_hdf5_vol) + env = self.ior_cmd.env.copy() + env.update({ + "HDF5_DAOS_OBJ_CLASS": oclass, + "HDF5_DAOS_FILE_PROP": cont_props_hdf5_vol + }) self.ior_cmd.api.update('HDF5') - self.run_ior_with_pool(create_pool=False, plugin_path=hdf5_plugin_path) + self.run_ior_with_pool( + create_pool=False, plugin_path=hdf5_plugin_path, env=env) elif self.ior_cmd.api.value == 'POSIX': self.run_ior_with_pool(create_pool=False, intercept=intercept) else: diff --git a/src/tests/ftest/util/server_utils_params.py b/src/tests/ftest/util/server_utils_params.py index 440ffe68f82..7cda958d242 100644 --- a/src/tests/ftest/util/server_utils_params.py +++ b/src/tests/ftest/util/server_utils_params.py @@ -435,6 +435,7 @@ class EngineYamlParameters(YamlParameters): "common": [ "D_LOG_FILE_APPEND_PID=1", "DAOS_POOL_RF=4", + "CRT_EVENT_DELAY=1", "COVFILE=/tmp/test.cov"], "ofi+tcp": [], "ofi+tcp;ofi_rxm": [], diff --git a/src/tests/ftest/util/soak_test_base.py b/src/tests/ftest/util/soak_test_base.py index f32e068cb16..2d1c11e722e 100644 --- a/src/tests/ftest/util/soak_test_base.py +++ b/src/tests/ftest/util/soak_test_base.py @@ -25,8 +25,8 @@ from soak_utils import (SoakTestError, add_pools, build_job_script, cleanup_dfuse, create_app_cmdline, create_dm_cmdline, create_fio_cmdline, create_ior_cmdline, create_macsio_cmdline, create_mdtest_cmdline, - create_racer_cmdline, ddhhmmss_format, get_daos_server_logs, get_harassers, - get_journalctl, launch_exclude_reintegrate, launch_extend, launch_reboot, + create_racer_cmdline, ddhhmmss_format, get_harassers, + launch_exclude_reintegrate, launch_extend, launch_reboot, launch_server_stop_start, launch_snapshot, launch_vmd_identify_check, reserved_file_copy, run_event_check, run_metrics_check, run_monitor_check) @@ -164,17 +164,6 @@ def pre_tear_down(self): # display final metrics run_metrics_check(self, prefix="final") - # Gather server logs - try: - get_daos_server_logs(self) - except SoakTestError as error: - errors.append(f"<>") - # Gather journalctl logs - hosts = list(set(self.hostlist_servers)) - since = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.start_time)) - until = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.end_time)) - for journalctl_type in ["kernel", "daos_server"]: - get_journalctl(self, hosts, since, until, journalctl_type, logging=True) if self.all_failed_harassers: errors.extend(self.all_failed_harassers) diff --git a/src/tests/ftest/util/telemetry_utils.py b/src/tests/ftest/util/telemetry_utils.py index d59c8d39e81..70d2352881d 100644 --- a/src/tests/ftest/util/telemetry_utils.py +++ b/src/tests/ftest/util/telemetry_utils.py @@ -90,6 +90,7 @@ class TelemetryUtils(): "engine_pool_ops_dtx_coll_commit", "engine_pool_ops_dtx_commit", "engine_pool_ops_dtx_refresh", + "engine_pool_ops_dtx_sync_commit", "engine_pool_ops_ec_agg", "engine_pool_ops_ec_rep", "engine_pool_ops_fetch", @@ -200,6 +201,8 @@ class TelemetryUtils(): "engine_dmabuff_queued_reqs", "engine_dmabuff_grab_errs", *_gen_stats_metrics("engine_dmabuff_grab_retries")] + ENGINE_IO_DTX_ASYNC_CMT_LAT_METRICS = \ + _gen_stats_metrics("engine_io_dtx_async_cmt_lat") ENGINE_IO_DTX_COMMITTABLE_METRICS = \ _gen_stats_metrics("engine_io_dtx_committable") ENGINE_IO_DTX_COMMITTED_METRICS = \ @@ -304,7 +307,8 @@ class TelemetryUtils(): _gen_stats_metrics("engine_io_ops_tgt_update_active") ENGINE_IO_OPS_UPDATE_ACTIVE_METRICS = \ 
_gen_stats_metrics("engine_io_ops_update_active")
-    ENGINE_IO_METRICS = ENGINE_IO_DTX_COMMITTABLE_METRICS +\
+    ENGINE_IO_METRICS = ENGINE_IO_DTX_ASYNC_CMT_LAT_METRICS +\
+        ENGINE_IO_DTX_COMMITTABLE_METRICS +\
         ENGINE_IO_DTX_COMMITTED_METRICS +\
         ENGINE_IO_LATENCY_FETCH_METRICS +\
         ENGINE_IO_LATENCY_BULK_FETCH_METRICS +\
diff --git a/src/tests/suite/daos_container.c b/src/tests/suite/daos_container.c
index 958f9fd1d16..243a829c317 100644
--- a/src/tests/suite/daos_container.c
+++ b/src/tests/suite/daos_container.c
@@ -3835,7 +3835,7 @@ co_op_dup_timing(void **state)
 	size_t const	   in_sizes[] = {strlen(in_values[0]), strlen(in_values[1])};
 	int		   n = (int)ARRAY_SIZE(names);
 	const unsigned int NUM_FP = 3;
-	const uint32_t	   NUM_OPS = 200;
+	const uint32_t	   NUM_OPS = 500;
 	uint32_t	   num_failures = 0;
 	const uint32_t	   SVC_OPS_ENABLED = 1;
 	const uint32_t	   SVC_OPS_ENTRY_AGE = 60;
diff --git a/src/tests/suite/daos_cr.c b/src/tests/suite/daos_cr.c
index 3827a0eda6a..4c981f29645 100644
--- a/src/tests/suite/daos_cr.c
+++ b/src/tests/suite/daos_cr.c
@@ -18,6 +18,8 @@
 #include 
 
+#include "daos_iotest.h"
+
 /*
  * Will enable accurate query result verification after DAOS-13520 resolved.
  * #define CR_ACCURATE_QUERY_RESULT	1
@@ -3573,6 +3575,146 @@ cr_handle_fail_pool2(void **state)
 	cr_cleanup(arg, &pool, 1);
 }
 
+#define CR_IO_SIZE	16
+
+/*
+ * 1. Create pool and container without inconsistency.
+ * 2. Write something to the container.
+ * 3. Start checker with --dry-run option.
+ * 4. Query the checker; it should complete without any inconsistency reported.
+ * 5. Verify the pool can only be connected with read-only permission.
+ * 6. Verify the container can only be opened with read-only permission.
+ * 7. Verify the object can only be opened with read-only permission.
+ * 8. Verify the object cannot be written.
+ * 9. Read the formerly written data and verify its correctness.
+ * 10. Switch to normal mode and cleanup.
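+ *
+ * While in maintenance mode the expected access results (asserted below) are:
+ *	daos_pool_connect(..., DAOS_PC_RW, ...)	=> -DER_NO_PERM
+ *	daos_pool_connect(..., DAOS_PC_RO, ...)	=> 0
+ *	daos_cont_open(..., DAOS_COO_RW, ...)	=> -DER_NO_PERM
+ *	daos_cont_open(..., DAOS_COO_RO, ...)	=> 0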
+ */
+static void
+cr_maintenance_mode(void **state)
+{
+	test_arg_t		*arg = *state;
+	struct test_pool	 pool = { 0 };
+	struct test_cont	 cont = { 0 };
+	daos_handle_t		 coh;
+	daos_obj_id_t		 oid;
+	struct ioreq		 req;
+	const char		*dkey = "cr_dkey";
+	const char		*akey = "cr_akey";
+	char			 update_buf[CR_IO_SIZE];
+	char			 fetch_buf[CR_IO_SIZE];
+	struct daos_check_info	 dci = { 0 };
+	int			 rc;
+
+	print_message("CR28: maintenance mode after dry-run check\n");
+
+	rc = cr_pool_create(state, &pool, true, TCC_NONE);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_cont_create(state, &pool, &cont, 0);
+	assert_rc_equal(rc, 0);
+
+	rc = daos_cont_open(pool.poh, cont.label, DAOS_COO_RW, &coh, NULL, NULL);
+	assert_rc_equal(rc, 0);
+
+	oid = daos_test_oid_gen(coh, OC_SX, 0, 0, arg->myrank);
+	arg->fail_loc = DAOS_DTX_COMMIT_SYNC | DAOS_FAIL_ALWAYS;
+	arg->async = 0;
+	ioreq_init(&req, coh, oid, DAOS_IOD_SINGLE, arg);
+	dts_buf_render(update_buf, CR_IO_SIZE);
+
+	print_message("Generate some data.\n");
+
+	insert_single(dkey, akey, 0, update_buf, CR_IO_SIZE, DAOS_TX_NONE, &req);
+	daos_fail_loc_set(0);
+
+	rc = daos_obj_close(req.oh, NULL);
+	assert_rc_equal(rc, 0);
+
+	rc = daos_cont_close(coh, NULL);
+	assert_rc_equal(rc, 0);
+
+	rc = daos_pool_disconnect(pool.poh, NULL);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_system_stop(false);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_mode_switch(true);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_check_start(TCSF_DRYRUN, 0, NULL, NULL);
+	assert_rc_equal(rc, 0);
+
+	cr_ins_wait(1, &pool.pool_uuid, &dci);
+
+	rc = cr_ins_verify(&dci, TCIS_COMPLETED);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_CHECKED, 0, NULL, NULL, NULL);
+	assert_rc_equal(rc, 0);
+
+	print_message("Verify the pool can only be connected in RDONLY mode.\n");
+
+	rc = daos_pool_connect(pool.pool_str, arg->group, DAOS_PC_RW, &pool.poh, NULL, NULL);
+	assert_rc_equal(rc, -DER_NO_PERM);
+
+	rc = daos_pool_connect(pool.pool_str, arg->group, DAOS_PC_RO, &pool.poh, NULL, NULL);
+	assert_rc_equal(rc, 0);
+
+	print_message("Verify the container can only be opened in RDONLY mode.\n");
+
+	rc = daos_cont_open(pool.poh, cont.label, DAOS_COO_RW, &coh, NULL, NULL);
+	assert_rc_equal(rc, -DER_NO_PERM);
+
+	rc = daos_cont_open(pool.poh, cont.label, DAOS_COO_RO, &coh, NULL, NULL);
+	assert_rc_equal(rc, 0);
+
+#if 0
+	/*
+	 * NOTE: Currently, the DAOS security check is server-side logic, but an object open
+	 *	 may not talk with the server. So the related permission check will not happen
+	 *	 until a read or write actually occurs. If we do not consider the exclusive
+	 *	 open and cache cases, that seems fine; otherwise, some enhancement is needed
+	 *	 in the future.
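+	 *
+	 *	 The disabled daos_obj_open(DAOS_OO_RW) check below illustrates the behavior
+	 *	 intended once client-side enforcement is added.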
+	 */
+
+	print_message("Verify the object can only be opened in RDONLY mode.\n");
+
+	rc = daos_obj_open(coh, oid, DAOS_OO_RW, &req.oh, NULL);
+	assert_rc_equal(rc, -DER_NO_PERM);
+#endif
+
+	rc = daos_obj_open(coh, oid, DAOS_OO_RO, &req.oh, NULL);
+	assert_rc_equal(rc, 0);
+
+	print_message("Verify the object cannot be modified.\n");
+
+	req.arg->expect_result = -DER_NO_PERM;
+	insert_single(dkey, akey, 100, update_buf, CR_IO_SIZE, DAOS_TX_NONE, &req);
+	daos_fail_loc_set(0);
+
+	print_message("Verify the formerly written data.\n");
+
+	req.arg->expect_result = 0;
+	lookup_single(dkey, akey, 0, fetch_buf, CR_IO_SIZE, DAOS_TX_NONE, &req);
+	assert_int_equal(req.iod[0].iod_size, CR_IO_SIZE);
+	assert_memory_equal(update_buf, fetch_buf, CR_IO_SIZE);
+
+	rc = daos_cont_close(coh, NULL);
+	assert_rc_equal(rc, 0);
+
+	rc = daos_pool_disconnect(pool.poh, NULL);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_mode_switch(false);
+	assert_rc_equal(rc, 0);
+
+	rc = cr_system_start();
+	assert_rc_equal(rc, 0);
+
+	cr_dci_fini(&dci);
+	cr_cleanup(arg, &pool, 1);
+}
+
 static const struct CMUnitTest cr_tests[] = {
 	{ "CR1: start checker for specified pools",
 	  cr_start_specified, async_disable, test_case_teardown},
@@ -3628,6 +3770,8 @@ static const struct CMUnitTest cr_tests[] = {
 	  cr_handle_fail_pool1, async_disable, test_case_teardown},
 	{ "CR27: handle the pool if some engine failed to report some pool service",
 	  cr_handle_fail_pool2, async_disable, test_case_teardown},
+	{ "CR28: maintenance mode after dry-run check",
+	  cr_maintenance_mode, async_disable, test_case_teardown},
 };
 
 static int
diff --git a/src/vos/sys_db.c b/src/vos/sys_db.c
index e7b4a2baa20..d1f6d4bce98 100644
--- a/src/vos/sys_db.c
+++ b/src/vos/sys_db.c
@@ -349,8 +349,9 @@ vos_db_init(const char *db_path)
 int
 vos_db_init_ex(const char *db_path, const char *db_name, bool force_create, bool destroy_db_on_fini)
 {
-	int	create;
-	int	rc;
+	ABT_mutex_attr	attr = ABT_MUTEX_ATTR_NULL;
+	int		create;
+	int		rc;
 
 	D_ASSERT(db_path != NULL);
 
@@ -373,12 +374,26 @@ vos_db_init_ex(const char *db_path, const char *db_name, bool force_create, bool
 		goto failed;
 	}
 
-	rc = ABT_mutex_create(&vos_db.db_lock);
+	rc = ABT_mutex_attr_create(&attr);
 	if (rc != ABT_SUCCESS) {
-		rc = -DER_NOMEM;
-		goto failed;
+		D_ERROR("Failed to create mutex attr: %d\n", rc);
+		D_GOTO(failed, rc = dss_abterr2der(rc));
+	}
+
+	rc = ABT_mutex_attr_set_recursive(attr, ABT_TRUE);
+	if (rc != ABT_SUCCESS) {
+		D_ERROR("Failed to set mutex attr: %d\n", rc);
+		D_GOTO(failed, rc = dss_abterr2der(rc));
 	}
 
+	rc = ABT_mutex_create_with_attr(attr, &vos_db.db_lock);
+	if (rc != ABT_SUCCESS) {
+		D_ERROR("Failed to create mutex with attr: %d\n", rc);
+		D_GOTO(failed, rc = dss_abterr2der(rc));
+	}
+
+	ABT_mutex_attr_free(&attr);
+
 	vos_db.db_poh = DAOS_HDL_INVAL;
 	vos_db.db_coh = DAOS_HDL_INVAL;
 
@@ -416,6 +431,8 @@ vos_db_init_ex(const char *db_path, const char *db_name, bool force_create, bool
 	}
 	return 0;
 failed:
+	if (attr != ABT_MUTEX_ATTR_NULL)
+		ABT_mutex_attr_free(&attr);
 	vos_db_fini();
 	return rc;
 }
diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c
index 1c60f781507..47e410fc1e7 100644
--- a/src/vos/vos_dtx.c
+++ b/src/vos/vos_dtx.c
@@ -15,11 +15,10 @@
 #include "vos_layout.h"
 #include "vos_internal.h"
 
-/* 128 KB per SCM blob */
-#define DTX_BLOB_SIZE		(1 << 17)
-/** Ensure 16-bit signed int is sufficient to store record index */
-D_CASSERT((DTX_BLOB_SIZE / sizeof(struct vos_dtx_act_ent_df)) < (1 << 15));
-D_CASSERT((DTX_BLOB_SIZE / sizeof(struct vos_dtx_cmt_ent_df)) < (1 << 15));
+/* 16 KB for each active DTX blob */
+#define DTX_ACT_BLOB_SIZE	(1 << 14)
+/* 4 KB for each committed DTX blob */
+#define DTX_CMT_BLOB_SIZE	(1 << 12)
 
 #define DTX_ACT_BLOB_MAGIC	0x14130a2b
 #define DTX_CMT_BLOB_MAGIC	0x2502191c
@@ -313,16 +312,38 @@ dtx_act_ent_update(struct btr_instance *tins, struct btr_record *rec,
 
 	if (unlikely(!dae_old->dae_aborted)) {
 		/*
-		 * XXX: There are two possible reasons for that:
-		 *
-		 *	1. Client resent the RPC but without set 'RESEND' flag.
-		 *	2. Client reused the DTX ID for different modifications.
-		 *
-		 *	Currently, the 1st case is more suspected.
+		 * If the new entry and the old entry are for the same transaction, then the RPC
+		 * for the new one will carry the 'RESEND' flag, which would have caused the old
+		 * one to be aborted before we arrive here. So it is quite possible that the new
+		 * one and the old one are for different transactions.
 		 */
-		D_ERROR("The TX ID "DF_DTI" may be reused for epoch "DF_X64" vs "DF_X64"\n",
-			DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old), DAE_EPOCH(dae_new));
-		return -DER_TX_ID_REUSED;
+		if (DAE_EPOCH(dae_old) < DAE_EPOCH(dae_new)) {
+			D_ERROR("The TX ID "DF_DTI" may be reused for epoch "DF_X64" vs "DF_X64"\n",
+				DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old), DAE_EPOCH(dae_new));
+			return -DER_TX_ID_REUSED;
+		}
+
+		/*
+		 * If the old entry has the higher epoch, it is quite possible that the resent RPC
+		 * was handled before the original RPC (corresponding to 'dae_new'). Return
+		 * -DER_INPROGRESS to make the RPC sponsor retry the RPC with the 'RESEND' flag;
+		 * the related RPC handler logic will then handle such a case.
+		 */
+		if (DAE_EPOCH(dae_old) > DAE_EPOCH(dae_new)) {
+			D_ERROR("Resent RPC may be handled before original one for DTX "DF_DTI
+				" with epoch "DF_X64" vs "DF_X64"\n",
+				DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old), DAE_EPOCH(dae_new));
+			return -DER_INPROGRESS;
+		}
+
+		/*
+		 * The two entries use the same epoch; this may be caused by repeated RPCs from
+		 * different sources, such as multiple relay engines forwarding the same RPC to
+		 * the current target. We need to notify the related caller about such a buggy
+		 * case.
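+		 *
+		 * In summary, when the old entry has not been aborted:
+		 *	DAE_EPOCH(old) <  DAE_EPOCH(new)  => -DER_TX_ID_REUSED
+		 *	DAE_EPOCH(old) >  DAE_EPOCH(new)  => -DER_INPROGRESS (retry with 'RESEND')
+		 *	DAE_EPOCH(old) == DAE_EPOCH(new)  => -DER_MISC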
+		 */
+		D_ERROR("Received repeated DTX "DF_DTI" with epoch "DF_X64"\n",
+			DP_DTI(&DAE_XID(dae_old)), DAE_EPOCH(dae_old));
+		return -DER_MISC;
 	}
 
 	rec->rec_off = umem_ptr2off(&tins->ti_umm, dae_new);
@@ -767,7 +788,7 @@ dtx_rec_release(struct vos_container *cont, struct vos_dtx_act_ent *dae,
 static int
 vos_dtx_commit_one(struct vos_container *cont, struct dtx_id *dti, daos_epoch_t epoch,
 		   daos_epoch_t cmt_time, struct vos_dtx_cmt_ent **dce_p,
-		   struct vos_dtx_act_ent **dae_p, bool *rm_cos, bool *fatal)
+		   struct vos_dtx_act_ent **dae_p, bool *rm_cos)
 {
 	struct vos_dtx_act_ent		*dae = NULL;
 	struct vos_dtx_cmt_ent		*dce = NULL;
@@ -866,10 +887,8 @@ vos_dtx_commit_one(struct vos_container *cont, struct dtx_id *dti, daos_epoch_t
 		goto out;
 
 	rc = dtx_rec_release(cont, dae, false);
-	if (rc != 0) {
-		*fatal = true;
+	if (rc != 0)
 		goto out;
-	}
 
 	D_ASSERT(dae_p != NULL);
 	*dae_p = dae;
@@ -915,7 +934,7 @@ vos_dtx_extend_act_table(struct vos_container *cont)
 	umem_off_t			 dbd_off;
 	int				 rc;
 
-	dbd_off = umem_zalloc(umm, DTX_BLOB_SIZE);
+	dbd_off = umem_zalloc(umm, DTX_ACT_BLOB_SIZE);
 	if (UMOFF_IS_NULL(dbd_off)) {
 		D_ERROR("No space when create active DTX table.\n");
 		return -DER_NOSPACE;
@@ -923,7 +942,7 @@ vos_dtx_extend_act_table(struct vos_container *cont)
 
 	dbd = umem_off2ptr(umm, dbd_off);
 	dbd->dbd_magic = DTX_ACT_BLOB_MAGIC;
-	dbd->dbd_cap = (DTX_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) /
+	dbd->dbd_cap = (DTX_ACT_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) /
 		       sizeof(struct vos_dtx_act_ent_df);
 	dbd->dbd_count = 0;
 	dbd->dbd_index = 0;
@@ -1175,16 +1194,20 @@ vos_dtx_check_availability(daos_handle_t coh, uint32_t entry,
 	}
 
 	if (intent == DAOS_INTENT_PURGE) {
-		uint32_t	age = d_hlc_age2sec(DAE_XID(dae).dti_hlc);
+		uint64_t	now = daos_gettime_coarse();
 
 		/*
 		 * The DTX entry still references related data record,
 		 * then we cannot (vos) aggregate related data record.
+		 * Report the warning at most once per 10 seconds to avoid flooding the log.
 		 */
-		if (age >= DAOS_AGG_THRESHOLD)
-			D_WARN("DTX "DF_DTI" (state:%u, age:%u) still references the data, "
-			       "cannot be (VOS) aggregated\n",
-			       DP_DTI(&DAE_XID(dae)), vos_dtx_status(dae), age);
+		if (now - cont->vc_agg_busy_ts > 10) {
+			D_WARN("DTX "DF_DTI" (state:%u, flags:%x, age:%u) still references "
+			       "the modification, cannot be (VOS) aggregated\n",
+			       DP_DTI(&DAE_XID(dae)), vos_dtx_status(dae), DAE_FLAGS(dae),
+			       (unsigned int)d_hlc_age2sec(DAE_XID(dae).dti_hlc));
+			cont->vc_agg_busy_ts = now;
+		}
 
 		return ALB_AVAILABLE_DIRTY;
 	}
@@ -1912,8 +1935,13 @@ vos_dtx_check(daos_handle_t coh, struct dtx_id *dti, daos_epoch_t *epoch,
 			daos_epoch_t	e = *epoch;
 
 			*epoch = DAE_EPOCH(dae);
-			if (e != 0 && e != DAE_EPOCH(dae))
-				return -DER_MISMATCH;
+			if (e != 0) {
+				if (e > DAE_EPOCH(dae))
+					return -DER_MISMATCH;
+
+				if (e < DAE_EPOCH(dae))
+					return -DER_TX_RESTART;
+			}
 		}
 
 		return vos_dae_is_prepare(dae) ? 
DTX_ST_PREPARED : DTX_ST_INITED; @@ -1976,9 +2004,6 @@ vos_dtx_load_mbs(daos_handle_t coh, struct dtx_id *dti, daos_unit_oid_t *oid, rc = -DER_INPROGRESS; } - if (rc < 0) - D_ERROR("Failed to load mbs for "DF_DTI": "DF_RC"\n", DP_DTI(dti), DP_RC(rc)); - return rc; } @@ -1994,12 +2019,11 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[], umem_off_t dbd_off; uint64_t cmt_time = daos_gettime_coarse(); int committed = 0; - int cur = 0; int rc = 0; - int rc1 = 0; + int p = 0; int i = 0; int j; - bool fatal = false; + int k; bool allocated = false; dbd = umem_off2ptr(umm, cont_df->cd_dtx_committed_tail); @@ -2017,67 +2041,64 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[], goto new_blob; again: - for (j = dbd->dbd_count; i < count && j < dbd->dbd_cap && rc1 == 0; - i++, cur++) { - struct vos_dtx_cmt_ent *dce = NULL; - - rc = vos_dtx_commit_one(cont, &dtis[cur], epoch, cmt_time, &dce, - daes != NULL ? &daes[cur] : NULL, - rm_cos != NULL ? &rm_cos[cur] : NULL, &fatal); - if (dces != NULL) - dces[cur] = dce; - - if (fatal) - goto out; - - if (rc == 0 && (daes == NULL || daes[cur] != NULL)) + for (j = dbd->dbd_count; j < dbd->dbd_cap && i < count; i++) { + rc = vos_dtx_commit_one(cont, &dtis[i], epoch, cmt_time, &dces[i], + daes != NULL ? &daes[i] : NULL, + rm_cos != NULL ? &rm_cos[i] : NULL); + if (rc == 0 && (daes == NULL || daes[i] != NULL)) committed++; if (rc == -DER_ALREADY || rc == -DER_NONEXIST) rc = 0; - if (rc1 == 0) - rc1 = rc; + if (rc != 0) + goto out; + + if (dces[i] != NULL) + j++; + } + + if (j > dbd->dbd_count) { + if (!allocated) { + rc = umem_tx_xadd_ptr(umm, &dbd->dbd_committed_data[dbd->dbd_count], + sizeof(struct vos_dtx_cmt_ent_df) * + (j - dbd->dbd_count), UMEM_XADD_NO_SNAPSHOT); + if (rc != 0) + goto out; - if (dce != NULL) { - rc = umem_tx_xadd_ptr(umm, &dbd->dbd_committed_data[j], - sizeof(struct vos_dtx_cmt_ent_df), - UMEM_XADD_NO_SNAPSHOT); + /* Only need to add range for the first partial blob. */ + rc = umem_tx_add_ptr(umm, &dbd->dbd_count, sizeof(dbd->dbd_count)); if (rc != 0) - D_GOTO(out, fatal = true); + goto out; + } - memcpy(&dbd->dbd_committed_data[j++], &dce->dce_base, + for (k = dbd->dbd_count; k < j; k++, p++) { + while (dces[p] == NULL) + p++; + + memcpy(&dbd->dbd_committed_data[k], &dces[p]->dce_base, sizeof(struct vos_dtx_cmt_ent_df)); } - } - if (!allocated) { - /* Only need to add range for the first partial blob. 
*/ - rc = umem_tx_add_ptr(umm, &dbd->dbd_count, - sizeof(dbd->dbd_count)); - if (rc != 0) - D_GOTO(out, fatal = true); + dbd->dbd_count = j; } - dbd->dbd_count = j; - - if (i == count || rc1 != 0) + if (i == count) goto out; new_blob: dbd_prev = dbd; /* Need new @dbd */ - dbd_off = umem_zalloc(umm, DTX_BLOB_SIZE); + dbd_off = umem_zalloc(umm, DTX_CMT_BLOB_SIZE); if (UMOFF_IS_NULL(dbd_off)) { D_ERROR("No space to store committed DTX %d "DF_DTI"\n", - count, DP_DTI(&dtis[cur])); - fatal = true; + count, DP_DTI(&dtis[i])); D_GOTO(out, rc = -DER_NOSPACE); } dbd = umem_off2ptr(umm, dbd_off); dbd->dbd_magic = DTX_CMT_BLOB_MAGIC; - dbd->dbd_cap = (DTX_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) / + dbd->dbd_cap = (DTX_CMT_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) / sizeof(struct vos_dtx_cmt_ent_df); dbd->dbd_prev = umem_ptr2off(umm, dbd_prev); @@ -2090,21 +2111,21 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[], sizeof(cont_df->cd_dtx_committed_head) + sizeof(cont_df->cd_dtx_committed_tail)); if (rc != 0) - D_GOTO(out, fatal = true); + goto out; cont_df->cd_dtx_committed_head = dbd_off; } else { rc = umem_tx_add_ptr(umm, &dbd_prev->dbd_next, sizeof(dbd_prev->dbd_next)); if (rc != 0) - D_GOTO(out, fatal = true); + goto out; dbd_prev->dbd_next = dbd_off; rc = umem_tx_add_ptr(umm, &cont_df->cd_dtx_committed_tail, sizeof(cont_df->cd_dtx_committed_tail)); if (rc != 0) - D_GOTO(out, fatal = true); + goto out; } D_DEBUG(DB_IO, "Allocated DTX committed blob %p ("UMOFF_PF") for cont "DF_UUID"\n", @@ -2115,7 +2136,7 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[], goto again; out: - return fatal ? rc : (committed > 0 ? committed : rc1); + return rc < 0 ? rc : committed; } void diff --git a/src/vos/vos_internal.h b/src/vos/vos_internal.h index 7d4dd3ac166..5aa0a1fadbb 100644 --- a/src/vos/vos_internal.h +++ b/src/vos/vos_internal.h @@ -353,6 +353,8 @@ struct vos_container { daos_epoch_range_t vc_epr_aggregation; /* Current ongoing discard EPR */ daos_epoch_range_t vc_epr_discard; + /* Last timestamp when VOS aggregation reports -DER_TX_BUSY */ + uint64_t vc_agg_busy_ts; /* Last timestamp when VOS aggregation reporting ENOSPACE */ uint64_t vc_agg_nospc_ts; /* Last timestamp when IO reporting ENOSPACE */ diff --git a/src/vos/vos_obj_index.c b/src/vos/vos_obj_index.c index ea47cf4454c..58699453f40 100644 --- a/src/vos/vos_obj_index.c +++ b/src/vos/vos_obj_index.c @@ -791,13 +791,14 @@ oi_iter_process(struct vos_iterator *iter, vos_iter_proc_op_t op, void *args) int oi_iter_check_punch(daos_handle_t ih) { - struct vos_iterator *iter = vos_hdl2iter(ih); - struct vos_oi_iter *oiter = iter2oiter(iter); - struct vos_obj_df *obj; - struct oi_delete_arg del_arg; - daos_unit_oid_t oid; - d_iov_t rec_iov; - int rc; + struct vos_iterator *iter = vos_hdl2iter(ih); + struct vos_oi_iter *oiter = iter2oiter(iter); + struct vos_container *cont = oiter->oit_cont; + struct vos_obj_df *obj; + struct oi_delete_arg del_arg; + daos_unit_oid_t oid; + d_iov_t rec_iov; + int rc; D_ASSERT(iter->it_type == VOS_ITER_OBJ); @@ -811,10 +812,22 @@ oi_iter_check_punch(daos_handle_t ih) obj = (struct vos_obj_df *)rec_iov.iov_buf; oid = obj->vo_id; - if (!vos_ilog_is_punched(vos_cont2hdl(oiter->oit_cont), &obj->vo_ilog, &oiter->oit_epr, - NULL, &oiter->oit_ilog_info)) + if (!vos_ilog_is_punched(vos_cont2hdl(cont), &obj->vo_ilog, &oiter->oit_epr, NULL, + &oiter->oit_ilog_info)) return 0; + rc = vos_obj_hold(vos_obj_cache_current(cont->vc_pool->vp_sysdb), cont, oid, + &oiter->oit_epr, 
iter->it_bound, VOS_OBJ_AGGREGATE | VOS_OBJ_NO_HOLD,
+			  DAOS_INTENT_PURGE, NULL, NULL);
+	if (rc != 0) {
+		/** -DER_BUSY means the object is in use already. We will retry after a yield
+		 *  in this case.
+		 */
+		D_CDEBUG(rc == -DER_BUSY, DB_EPC, DLOG_ERR, "Hold check failed for " DF_UOID "\n",
+			 DP_UOID(oid));
+		return rc;
+	}
+
 	/** Ok, ilog is fully punched, so we can move it to gc heap */
 	rc = umem_tx_begin(vos_cont2umm(oiter->oit_cont), NULL);
 	if (rc != 0)
diff --git a/utils/build.config b/utils/build.config
index 4d770e6c924..a14ad039c15 100644
--- a/utils/build.config
+++ b/utils/build.config
@@ -8,8 +8,8 @@ pmdk=2.1.0
 isal=v2.30.0
 isal_crypto=v2.23.0
 spdk=v22.01.2
-ofi=v1.19.1
-mercury=v2.4.0rc5
+ofi=v1.22.0
+mercury=v2.4.0
 protobufc=v1.3.3
 ucx=v1.14.1
 
@@ -27,7 +27,6 @@ ucx=https://github.com/openucx/ucx.git
 [patch_versions]
 spdk=https://github.com/spdk/spdk/commit/b0aba3fcd5aceceea530a702922153bc75664978.diff,https://github.com/spdk/spdk/commit/445a4c808badbad3942696ecf16fa60e8129a747.diff
-ofi=https://github.com/ofiwg/libfabric/commit/d827c6484cc5bf67dfbe395890e258860c3f0979.diff
-mercury=https://raw.githubusercontent.com/daos-stack/mercury/f3dc286fb40ec1a3a38a2e17c45497bc2aa6290d/na_ucx.patch
 fuse=https://github.com/libfuse/libfuse/commit/c9905341ea34ff9acbc11b3c53ba8bcea35eeed8.diff
+mercury=https://raw.githubusercontent.com/daos-stack/mercury/f3dc286fb40ec1a3a38a2e17c45497bc2aa6290d/na_ucx.patch
 pmdk=https://github.com/pmem/pmdk/commit/2abe15ac0b4eed894b6768cd82a3b0a7c4336284.diff
diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec
index e58af4e6d66..59376d93d40 100644
--- a/utils/rpms/daos.spec
+++ b/utils/rpms/daos.spec
@@ -14,8 +14,8 @@
 %endif
 
 Name:          daos
-Version:       2.6.1
-Release:       4%{?relval}%{?dist}
+Version:       2.6.2
+Release:       1%{?relval}%{?dist}
 Summary:       DAOS Storage Engine
 
 License:       BSD-2-Clause-Patent
@@ -602,6 +602,9 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent
 
 %changelog
+* Tue Nov 05 2024 Phillip Henderson 2.6.2-1
+- First test build for 2.6.2
+
 * Wed Oct 02 2024 Tomasz Gromadzki 2.6.1-4
 - Add support of the PMDK package 2.1.0 with NDCTL enabled.
* Increase the default ULT stack size to 20KiB if the engine uses diff --git a/utils/trivy/.trivyignore b/utils/trivy/.trivyignore new file mode 100644 index 00000000000..3a3b4cff1ce --- /dev/null +++ b/utils/trivy/.trivyignore @@ -0,0 +1,27 @@ +## Ignored hadoop 3.3.6 related CVE +## CVE-2023-52428,MEDIUM,,"Denial of Service in Connect2id Nimbus JOSE+JWT","com.nimbusds:nimbus-jose-jwt","9.8.1","9.37.2",https://avd.aquasec.com/nvd/cve-2023-52428 +CVE-2023-52428 +## CVE-2023-39410,HIGH,7.5,"apache-avro: Apache Avro Java SDK: Memory when deserializing untrusted data in Avro Java SDK","org.apache.avro:avro","1.7.7","1.11.3",https://avd.aquasec.com/nvd/cve-2023-39410 +CVE-2023-39410 +## CVE-2024-25710,HIGH,5.5,"commons-compress: Denial of service caused by an infinite loop for a corrupted DUMP file","org.apache.commons:commons-compress","1.21","1.26.0",https://avd.aquasec.com/nvd/cve-2024-25710 +CVE-2024-25710 +## CVE-2024-26308,HIGH,5.5,"commons-compress: OutOfMemoryError unpacking broken Pack200 file","org.apache.commons:commons-compress","1.21","1.26.0",https://avd.aquasec.com/nvd/cve-2024-26308 +CVE-2024-26308 +## CVE-2024-29131,MEDIUM,,"commons-configuration: StackOverflowError adding property in AbstractListDelimiterHandler.flattenIterator()","org.apache.commons:commons-configuration2","2.8.0","2.10.1",https://avd.aquasec.com/nvd/cve-2024-29131 +CVE-2024-29131 +## CVE-2024-29133,MEDIUM,,"commons-configuration: StackOverflowError calling ListDelimiterHandler.flatten(Object, int) with a cyclical object tree","org.apache.commons:commons-configuration2","2.8.0","2.10.1",https://avd.aquasec.com/nvd/cve-2024-29133 +CVE-2024-29133 +## CVE-2024-25638,HIGH,,"dnsjava: Improper response validation allowing DNSSEC bypass","dnsjava:dnsjava","2.1.7","3.6.0",https://avd.aquasec.com/nvd/cve-2024-25638 +CVE-2024-25638 + +## Ignored hadoop 3.4.0 related CVE +## CVE-2024-47561,CRITICAL,,"apache-avro: Schema parsing may trigger Remote Code Execution (RCE)","org.apache.avro:avro","1.9.2","1.11.4",https://avd.aquasec.com/nvd/cve-2024-47561 +CVE-2024-47561 +## CVE-2023-33201,MEDIUM,5.3,"bouncycastle: potential blind LDAP injection attack using a self-signed certificate","org.bouncycastle:bcprov-jdk15on","1.70","",https://avd.aquasec.com/nvd/cve-2023-33201 +CVE-2023-33201 +## CVE-2024-29857,MEDIUM,,"org.bouncycastle: Importing an EC certificate with crafted F2m parameters may lead to Denial of Service","org.bouncycastle:bcprov-jdk15on","1.70","1.78",https://avd.aquasec.com/nvd/cve-2024-29857 +CVE-2024-29857 +## CVE-2024-30171,MEDIUM,,"bc-java: BouncyCastle vulnerable to a timing variant of Bleichenbacher (Marvin Attack)","org.bouncycastle:bcprov-jdk15on","1.70","1.78",https://avd.aquasec.com/nvd/cve-2024-30171 +CVE-2024-30171 +## CVE-2024-30172,MEDIUM,,"org.bouncycastle:bcprov-jdk18on: Infinite loop in ED25519 verification in the ScalarUtil class","org.bouncycastle:bcprov-jdk15on","1.70","1.78",https://avd.aquasec.com/nvd/cve-2024-30172 +CVE-2024-30172 diff --git a/utils/trivy/csv.tpl b/utils/trivy/csv.tpl new file mode 100644 index 00000000000..d3bcafcef5e --- /dev/null +++ b/utils/trivy/csv.tpl @@ -0,0 +1,29 @@ +{{ range . 
}} +Trivy Vulnerability Scan Results ({{- .Target -}}) +VulnerabilityID,Severity,CVSS Score,Title,Library,Vulnerable Version,Fixed Version,Information URL,Triage Information +{{ range .Vulnerabilities }} + {{- .VulnerabilityID }}, + {{- .Severity }}, + {{- range $key, $value := .CVSS }} + {{- if (eq $key "nvd") }} + {{- .V3Score -}} + {{- end }} + {{- end }}, + {{- quote .Title }}, + {{- quote .PkgName }}, + {{- quote .InstalledVersion }}, + {{- quote .FixedVersion }}, + {{- .PrimaryURL }} +{{ else -}} + No vulnerabilities found at this time. +{{ end }} +Trivy Dependency Scan Results ({{ .Target }}) +ID,Name,Version,Notes +{{ range .Packages -}} + {{- quote .ID }}, + {{- quote .Name }}, + {{- quote .Version }} +{{ else -}} + No dependencies found at this time. +{{ end }} +{{ end }} \ No newline at end of file diff --git a/utils/trivy/trivy.yaml b/utils/trivy/trivy.yaml new file mode 100644 index 00000000000..cfb13b5c40f --- /dev/null +++ b/utils/trivy/trivy.yaml @@ -0,0 +1,249 @@ +cache: + backend: fs + dir: + redis: + ca: "" + cert: "" + key: "" + tls: false + ttl: 0s +config: trivy.yaml +db: + download-java-only: false + download-only: false + java-repository: ghcr.io/aquasecurity/trivy-java-db + java-skip-update: false + no-progress: false + repository: ghcr.io/aquasecurity/trivy-db + skip-update: false +debug: false +dependency-tree: true +exit-code: 0 +generate-default-config: false +ignore-policy: "" +ignorefile: ./utils/trivy/.trivyignore +include-dev-deps: false +insecure: false +license: + confidencelevel: "0.9" + forbidden: + - AGPL-1.0 + - AGPL-3.0 + - CC-BY-NC-1.0 + - CC-BY-NC-2.0 + - CC-BY-NC-2.5 + - CC-BY-NC-3.0 + - CC-BY-NC-4.0 + - CC-BY-NC-ND-1.0 + - CC-BY-NC-ND-2.0 + - CC-BY-NC-ND-2.5 + - CC-BY-NC-ND-3.0 + - CC-BY-NC-ND-4.0 + - CC-BY-NC-SA-1.0 + - CC-BY-NC-SA-2.0 + - CC-BY-NC-SA-2.5 + - CC-BY-NC-SA-3.0 + - CC-BY-NC-SA-4.0 + - Commons-Clause + - Facebook-2-Clause + - Facebook-3-Clause + - Facebook-Examples + - WTFPL + full: false + ignored: [] + notice: + - AFL-1.1 + - AFL-1.2 + - AFL-2.0 + - AFL-2.1 + - AFL-3.0 + - Apache-1.0 + - Apache-1.1 + - Apache-2.0 + - Artistic-1.0-cl8 + - Artistic-1.0-Perl + - Artistic-1.0 + - Artistic-2.0 + - BSL-1.0 + - BSD-2-Clause-FreeBSD + - BSD-2-Clause-NetBSD + - BSD-2-Clause + - BSD-3-Clause-Attribution + - BSD-3-Clause-Clear + - BSD-3-Clause-LBNL + - BSD-3-Clause + - BSD-4-Clause + - BSD-4-Clause-UC + - BSD-Protection + - CC-BY-1.0 + - CC-BY-2.0 + - CC-BY-2.5 + - CC-BY-3.0 + - CC-BY-4.0 + - FTL + - ISC + - ImageMagick + - Libpng + - Lil-1.0 + - Linux-OpenIB + - LPL-1.02 + - LPL-1.0 + - MS-PL + - MIT + - NCSA + - OpenSSL + - PHP-3.01 + - PHP-3.0 + - PIL + - Python-2.0 + - Python-2.0-complete + - PostgreSQL + - SGI-B-1.0 + - SGI-B-1.1 + - SGI-B-2.0 + - Unicode-DFS-2015 + - Unicode-DFS-2016 + - Unicode-TOU + - UPL-1.0 + - W3C-19980720 + - W3C-20150513 + - W3C + - X11 + - Xnet + - Zend-2.0 + - zlib-acknowledgement + - Zlib + - ZPL-1.1 + - ZPL-2.0 + - ZPL-2.1 + permissive: [] + reciprocal: + - APSL-1.0 + - APSL-1.1 + - APSL-1.2 + - APSL-2.0 + - CDDL-1.0 + - CDDL-1.1 + - CPL-1.0 + - EPL-1.0 + - EPL-2.0 + - FreeImage + - IPL-1.0 + - MPL-1.0 + - MPL-1.1 + - MPL-2.0 + - Ruby + restricted: + - BCL + - CC-BY-ND-1.0 + - CC-BY-ND-2.0 + - CC-BY-ND-2.5 + - CC-BY-ND-3.0 + - CC-BY-ND-4.0 + - CC-BY-SA-1.0 + - CC-BY-SA-2.0 + - CC-BY-SA-2.5 + - CC-BY-SA-3.0 + - CC-BY-SA-4.0 + - GPL-1.0 + - GPL-2.0 + - GPL-2.0-with-autoconf-exception + - GPL-2.0-with-bison-exception + - GPL-2.0-with-classpath-exception + - GPL-2.0-with-font-exception + - 
GPL-2.0-with-GCC-exception + - GPL-3.0 + - GPL-3.0-with-autoconf-exception + - GPL-3.0-with-GCC-exception + - LGPL-2.0 + - LGPL-2.1 + - LGPL-3.0 + - NPL-1.0 + - NPL-1.1 + - OSL-1.0 + - OSL-1.1 + - OSL-2.0 + - OSL-2.1 + - OSL-3.0 + - QPL-1.0 + - Sleepycat + unencumbered: + - CC0-1.0 + - Unlicense + - 0BSD +list-all-pkgs: false +misconfiguration: + cloudformation: + params: [] + helm: + set: [] + set-file: [] + set-string: [] + values: [] + include-non-failures: false + check-bundle-repository: ghcr.io/aquasecurity/trivy-policies:0 + # scanners: + # - azure-arm + # - cloudformation + # - dockerfile + # - helm + # - kubernetes + # - terraform + # - terraformplan + terraform: + exclude-downloaded-modules: false + vars: [] +module: + dir: + enable-modules: [] +output: "trivy-report-daos.txt" +format: template +template: '@./utils/trivy/csv.tpl' +output-plugin-arg: "" +quiet: false +registry: + password: [] + token: "" + username: [] +rego: + data: [] + namespaces: [] + policy: [] + skip-policy-update: false + trace: false +report: all +scan: + compliance: "" + file-patterns: [] + offline: false + parallel: 1 + rekor-url: https://rekor.sigstore.dev + sbom-sources: [] + scanners: + - vuln + - secret + # ignore all hadoop dependencies + skip-dirs: + ./src/client/java/hadoop-daos + skip-files: [] + show-suppressed: true +secret: + config: trivy-secret.yaml +server: + addr: "" + custom-headers: [] + token: "" + token-header: Trivy-Token +severity: + - UNKNOWN + - MEDIUM + - HIGH + - CRITICAL +timeout: 5m0s +version: false +vulnerability: + ignore-status: [] + ignore-unfixed: false + type: + - os + - library
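+# To reproduce the CI scan locally (assuming a recent trivy binary is installed),
+# run from the repository root:
+#   trivy fs --config utils/trivy/trivy.yaml .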