From a8ee909c348e4403f6a3326f777c663a3117c79d Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Tue, 22 Oct 2024 14:15:49 -0400 Subject: [PATCH] i#7044 hang: Fix deadlock regression on drmemtrace replay (#7049) Fixes a deadlock on replay introduced by PR #6985. If a recorded schedule has back-to-back entries with the same input (with a gap where another thread runs that input in between of course) and it has to wait for that other thread for its 2nd stint with that input, it will deadlock as it holds the input's lock while it tries to get the lock in set_cur_input(). That 2nd lock acquisition is what was added by PR #6985: that's the source of the regression. Tested on a 12-thread threadsig trace which hangs without the fix and succeeds with the fix. It is too large to turn into a convenient regression test. Issue: #7044 --- clients/drcachesim/scheduler/scheduler.cpp | 12 +++++++++--- clients/drcachesim/scheduler/scheduler.h | 5 ++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index 18c1a88e6dc..d22d94e29a3 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -2946,7 +2946,8 @@ scheduler_tmpl_t::syscall_incurs_switch(input_info_t *in template typename scheduler_tmpl_t::stream_status_t scheduler_tmpl_t::set_cur_input(output_ordinal_t output, - input_ordinal_t input) + input_ordinal_t input, + bool caller_holds_cur_input_lock) { // XXX i#5843: Merge tracking of current inputs with ready_queue.queue to better // manage the possible 3 states of each input (a live cur_input for an output stream, @@ -2958,7 +2959,9 @@ scheduler_tmpl_t::set_cur_input(output_ordinal_t output, if (prev_input >= 0) { if (prev_input != input) { input_info_t &prev_info = inputs_[prev_input]; - std::lock_guard lock(*prev_info.lock); + auto scoped_lock = caller_holds_cur_input_lock + ? std::unique_lock() + : std::unique_lock(*prev_info.lock); prev_info.cur_output = INVALID_OUTPUT_ORDINAL; prev_info.last_run_time = get_output_time(output); if (options_.schedule_record_ostream != nullptr) { @@ -3146,7 +3149,10 @@ scheduler_tmpl_t::pick_next_input_as_previously( output, index, segment.value.start_instruction); // Give up this input and go into a wait state. // We'll come back here on the next next_record() call. - set_cur_input(output, INVALID_INPUT_ORDINAL); + set_cur_input(output, INVALID_INPUT_ORDINAL, + // Avoid livelock if prev input == cur input which happens + // with back-to-back segments with the same input. + index == outputs_[output].cur_input); outputs_[output].waiting = true; return sched_type_t::STATUS_WAIT; } diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h index e752839c1f8..c996ee889a1 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -1920,8 +1920,11 @@ template class scheduler_tmpl_t { input_reader_info_t &reader_info); // The caller cannot hold the output or input lock. + // The caller can hold the output's current input's lock but must pass + // true for the 3rd parameter in that case. stream_status_t - set_cur_input(output_ordinal_t output, input_ordinal_t input); + set_cur_input(output_ordinal_t output, input_ordinal_t input, + bool caller_holds_cur_input_lock = false); // Finds the next input stream for the 'output_ordinal'-th output stream. // No input_info_t lock can be held on entry.