-
Notifications
You must be signed in to change notification settings - Fork 301
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
DAOS-16721 dtx: handle potential DTX ID reusing trouble #15408
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1718,15 +1718,20 @@ dc_obj_retry_delay(tse_task_t *task, int err, uint16_t *retry_cnt, uint16_t *inp | |
uint32_t timeout_sec) | ||
{ | ||
uint32_t delay = 0; | ||
uint32_t limit = 4; | ||
|
||
/* | ||
* Randomly delay 5 - 68 us if it is not the first retry for | ||
* Randomly delay 5 ~ 1028 us if it is not the first retry for | ||
* -DER_INPROGRESS || -DER_UPDATE_AGAIN cases. | ||
*/ | ||
++(*retry_cnt); | ||
if (err == -DER_INPROGRESS || err == -DER_UPDATE_AGAIN) { | ||
if (++(*inprogress_cnt) > 1) { | ||
delay = (d_rand() & ((1 << 6) - 1)) + 5; | ||
limit += *inprogress_cnt; | ||
if (limit > 10) | ||
limit = 10; | ||
|
||
delay = (d_rand() & ((1 << limit) - 1)) + 5; | ||
/* Rebuild is being established on the server side, wait a bit longer */ | ||
if (err == -DER_UPDATE_AGAIN) | ||
delay <<= 10; | ||
|
@@ -4856,11 +4861,14 @@ obj_comp_cb(tse_task_t *task, void *data) | |
D_ASSERT(daos_handle_is_inval(obj_auxi->th)); | ||
D_ASSERT(obj_is_modification_opc(obj_auxi->opc)); | ||
|
||
if (task->dt_result == -DER_TX_ID_REUSED && obj_auxi->retry_cnt != 0) | ||
/* XXX: it is must because miss to set "RESEND" flag, that is bug. */ | ||
D_ASSERTF(0, | ||
"Miss 'RESEND' flag (%x) when resend the RPC for task %p: %u\n", | ||
obj_auxi->flags, task, obj_auxi->retry_cnt); | ||
if (task->dt_result == -DER_TX_ID_REUSED && obj_auxi->retry_cnt != 0) { | ||
D_ERROR("TX ID maybe reused for unknown reason, " | ||
"task %p, opc %u, flags %x, retry_cnt %u\n", | ||
task, obj_auxi->opc, obj_auxi->flags, obj_auxi->retry_cnt); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe some other bug not found somewhere? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. According to the test log, the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we actually see this error with this patch? |
||
task->dt_result = -DER_IO; | ||
obj_auxi->io_retry = 0; | ||
goto args_fini; | ||
} | ||
|
||
if (obj_auxi->opc == DAOS_OBJ_RPC_UPDATE) { | ||
daos_obj_rw_t *api_args = dc_task_get_args(obj_auxi->obj_task); | ||
|
@@ -4886,6 +4894,7 @@ obj_comp_cb(tse_task_t *task, void *data) | |
} | ||
} | ||
|
||
args_fini: | ||
if (obj_auxi->opc == DAOS_OBJ_RPC_COLL_PUNCH) | ||
obj_coll_oper_args_fini(&obj_auxi->p_args.pa_coa); | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -616,23 +616,22 @@ obj_coll_disp_init(uint32_t tgt_nr, uint32_t max_tgt_size, uint32_t inline_size, | |
|
||
void | ||
obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *tgts, | ||
crt_endpoint_t *tgt_ep) | ||
crt_endpoint_t *tgt_ep, daos_obj_id_t oid) | ||
{ | ||
struct daos_coll_target *dct = &tgts[ocdc->cur_pos]; | ||
struct daos_coll_target tmp; | ||
unsigned long rand = 0; | ||
uint32_t size; | ||
int pos; | ||
int i; | ||
|
||
if (ocdc->cur_step > 2) { | ||
rand = d_rand(); | ||
/* | ||
* Randomly choose an engine as the relay one for load balance. | ||
* If the one corresponding to "pos" is former moved one, then | ||
* use the "cur_pos" as the relay engine. | ||
* Choose an engine (according to the given oid) as the relay one for load balance. | ||
* If the one corresponding to "pos" is former moved one, then use the "cur_pos" as | ||
* the relay engine. Then even if related RPC was resent without changing pool map, | ||
* then the relay one will be the same as the original case. | ||
*/ | ||
pos = rand % (ocdc->tgt_nr - ocdc->cur_pos) + ocdc->cur_pos; | ||
pos = oid.lo % (ocdc->tgt_nr - ocdc->cur_pos) + ocdc->cur_pos; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I really think we should hash oid.lo instead of using it directly. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We hope that the relay engine helping to forwarding collective punch RPC for different objects can be changed instead of always using the same engine(s), then the load can be relative balanced. So either |
||
if (pos > ocdc->cur_pos && tgts[pos].dct_rank > dct->dct_rank) { | ||
memcpy(&tmp, &tgts[pos], sizeof(tmp)); | ||
memcpy(&tgts[pos], dct, sizeof(tmp)); | ||
|
@@ -642,8 +641,8 @@ obj_coll_disp_dest(struct obj_coll_disp_cursor *ocdc, struct daos_coll_target *t | |
|
||
size = dct->dct_bitmap_sz << 3; | ||
|
||
/* Randomly choose a XS as the local leader on target engine for load balance. */ | ||
for (i = 0, pos = (rand != 0 ? rand : d_rand()) % dct->dct_tgt_nr; i < size; i++) { | ||
/* Choose a target as the local agent on the engine for load balance. */ | ||
for (i = 0, pos = oid.lo % dct->dct_tgt_nr; i < size; i++) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same here. |
||
if (isset(dct->dct_bitmap, i)) { | ||
pos -= dct->dct_shards[i].dcs_nr; | ||
if (pos < 0) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2667,8 +2667,13 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr, | |
struct ec_agg_param *ec_agg_param = agg_param->ap_data; | ||
vos_iter_param_t iter_param = { 0 }; | ||
struct vos_iter_anchors anchors = { 0 }; | ||
struct dtx_handle *dth = NULL; | ||
struct dtx_share_peer *dsp; | ||
struct dtx_id dti = { 0 }; | ||
struct dtx_epoch epoch = { 0 }; | ||
daos_unit_oid_t oid = { 0 }; | ||
int blocks = 0; | ||
int rc = 0; | ||
int blocks = 0; | ||
|
||
/* | ||
* Avoid calling into vos_aggregate() when aborting aggregation | ||
|
@@ -2715,8 +2720,32 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr, | |
agg_reset_entry(&ec_agg_param->ap_agg_entry, NULL, NULL); | ||
|
||
retry: | ||
epoch.oe_value = epr->epr_hi; | ||
rc = dtx_begin(cont->sc_hdl, &dti, &epoch, 0, cont->sc_pool->spc_map_version, &oid, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the dth is just for returning tbd_list or for other reasons? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For this case, the |
||
NULL, 0, 0, NULL, &dth); | ||
if (rc != 0) | ||
goto update_hae; | ||
|
||
rc = vos_iterate(&iter_param, VOS_ITER_OBJ, true, &anchors, agg_iterate_pre_cb, | ||
agg_iterate_post_cb, ec_agg_param, NULL); | ||
agg_iterate_post_cb, ec_agg_param, dth); | ||
if (rc == -DER_INPROGRESS && !d_list_empty(&dth->dth_share_tbd_list)) { | ||
uint64_t now = daos_gettime_coarse(); | ||
|
||
/* Report warning per each 10 seconds to avoid log flood. */ | ||
if (now - cont->sc_ec_agg_busy_ts > 10) { | ||
while ((dsp = d_list_pop_entry(&dth->dth_share_tbd_list, | ||
struct dtx_share_peer, dsp_link)) != NULL) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think it's necessary to dump all TX ids; probably just a few of them and the total number should be good enough? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There will be at most 4 entries per 10 seconds. It will not cause too much overhead. |
||
D_WARN(DF_CONT ": EC aggregate hit non-committed DTX " DF_DTI "\n", | ||
DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), | ||
DP_DTI(&dsp->dsp_xid)); | ||
dtx_dsp_free(dsp); | ||
} | ||
|
||
cont->sc_ec_agg_busy_ts = now; | ||
} | ||
} | ||
|
||
dtx_end(dth, cont, rc); | ||
|
||
/* Post_cb may not being executed in some cases */ | ||
agg_clear_extents(&ec_agg_param->ap_agg_entry); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think RPC should use millisecond instead of microsecond?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is general logic for RPC retry because of a non-committed DTX, which usually happens for the fetch case. The next resent RPC may be sent to the related DTX leader on which the DTX may be committable. If we use milliseconds, the delay may be too large.