From 71f7490e5bd244dc71ea439d0520e4a033235974 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Tue, 21 Nov 2023 18:25:02 -0500 Subject: [PATCH 1/5] i#6471 sched idle: Add idle time Adds a new STATUS_IDLE return code, and a corresponding TRACE_MARKER_TYPE_CORE_IDLE record. Changes the scheduler behavior to no longer return STATUS_EOF for an output when the ready queue is empty: instead STATUS_IDLE is returned until every single input is at EOF. This results in a more realistic schedule where other cores can pick up work later rather than disappearing from the system. Augments the schedule_stats tool to count idle replies and compute a % cpu usage metric. Adds a unit test for counting idles. Augments the scheduler_launcher to also compute %cpu usage. Updates all the scheduler tests for the new change. Adding idle time due to blocking syscalls will be done separately. Issue: #6471 --- clients/drcachesim/analyzer.cpp | 33 +++++++++- clients/drcachesim/analyzer.h | 3 + clients/drcachesim/common/options.cpp | 4 +- clients/drcachesim/common/trace_entry.h | 11 ++++ clients/drcachesim/reader/reader.h | 3 +- clients/drcachesim/scheduler/scheduler.cpp | 61 ++++++++++++++----- clients/drcachesim/scheduler/scheduler.h | 27 +++++++- .../tests/schedule_stats_nopreempt.templatex | 10 +++ .../drcachesim/tests/schedule_stats_test.cpp | 58 +++++++++++++++++- .../drcachesim/tests/scheduler_launcher.cpp | 31 +++++++++- .../drcachesim/tests/scheduler_unit_tests.cpp | 61 +++++++++++-------- clients/drcachesim/tools/basic_counts.cpp | 3 +- clients/drcachesim/tools/schedule_stats.cpp | 51 ++++++++++++---- clients/drcachesim/tools/schedule_stats.h | 3 + clients/drcachesim/tools/view.cpp | 1 + 15 files changed, 292 insertions(+), 68 deletions(-) diff --git a/clients/drcachesim/analyzer.cpp b/clients/drcachesim/analyzer.cpp index 8b0452f3d91..ddd51ba291f 100644 --- a/clients/drcachesim/analyzer.cpp +++ b/clients/drcachesim/analyzer.cpp @@ -133,6 +133,17 @@ analyzer_t::create_wait_marker() 
return record; } +template <> +memref_t +analyzer_t::create_idle_marker() +{ + memref_t record = {}; // Zero the other fields. + record.marker.type = TRACE_TYPE_MARKER; + record.marker.marker_type = TRACE_MARKER_TYPE_CORE_IDLE; + record.marker.tid = INVALID_THREAD_ID; + return record; +} + /****************************************************************************** * Specializations for analyzer_tmpl_t, aka record_analyzer_t. */ @@ -182,6 +193,17 @@ record_analyzer_t::create_wait_marker() return record; } +template <> +trace_entry_t +record_analyzer_t::create_idle_marker() +{ + trace_entry_t record; + record.type = TRACE_TYPE_MARKER; + record.size = TRACE_MARKER_TYPE_CORE_IDLE; + record.addr = 0; // Marker value has no meaning so we zero it. + return record; +} + /******************************************************************** * Other analyzer_tmpl_t routines that do not need to be specialized. */ @@ -537,6 +559,11 @@ analyzer_tmpl_t::process_tasks(analyzer_worker_data_t *w // We synthesize a record here. If we wanted this to count toward output // stream ordinals we would need to add a scheduler API to inject it. record = create_wait_marker(); + } else if (status == sched_type_t::STATUS_IDLE) { + // We let tools know about idle time so they can analyze cpu usage. + // We synthesize a record here. If we wanted this to count toward output + // stream ordinals we would need to add a scheduler API to inject it. 
+ record = create_idle_marker(); } else if (status != sched_type_t::STATUS_OK) { if (status == sched_type_t::STATUS_REGION_INVALID) { worker->error = @@ -596,8 +623,10 @@ analyzer_tmpl_t::process_tasks(analyzer_worker_data_t *w } } if (shard_type_ == SHARD_BY_CORE) { - if (!process_shard_exit(worker, worker->index)) - return; + if (worker->shard_data.find(worker->index) != worker->shard_data.end()) { + if (!process_shard_exit(worker, worker->index)) + return; + } } for (const auto &keyval : worker->shard_data) { if (!keyval.second.exited) { diff --git a/clients/drcachesim/analyzer.h b/clients/drcachesim/analyzer.h index 35a1df83a56..8ebc10547b2 100644 --- a/clients/drcachesim/analyzer.h +++ b/clients/drcachesim/analyzer.h @@ -252,6 +252,9 @@ template class analyzer_tmpl_t { RecordType create_wait_marker(); + RecordType + create_idle_marker(); + // Invoked when the given interval finishes during serial or parallel // analysis of the trace. For parallel analysis, the shard_id // parameter should be set to the shard_id for which the interval diff --git a/clients/drcachesim/common/options.cpp b/clients/drcachesim/common/options.cpp index 71d7c5ac2f1..53fceb2c03d 100644 --- a/clients/drcachesim/common/options.cpp +++ b/clients/drcachesim/common/options.cpp @@ -809,7 +809,7 @@ droption_t op_core_sharded( "software threads. This option instead schedules those threads onto virtual cores " "and analyzes each core in parallel. Thus, each shard consists of pieces from " "many software threads. How the scheduling is performed is controlled by a set " - "of options with the prefix \"sched_\" along with -num_cores."); + "of options with the prefix \"sched_\" along with -cores."); droption_t op_core_serial( DROPTION_SCOPE_ALL, "core_serial", false, "Analyze per-core in serial.", @@ -817,7 +817,7 @@ droption_t op_core_serial( "However, the resulting schedule is acted upon by a single analysis thread" "which walks the N cores in lockstep in round robin fashion. 
" "How the scheduling is performed is controlled by a set " - "of options with the prefix \"sched_\" along with -num_cores."); + "of options with the prefix \"sched_\" along with -cores."); droption_t op_sched_quantum(DROPTION_SCOPE_ALL, "sched_quantum", 1 * 1000 * 1000, diff --git a/clients/drcachesim/common/trace_entry.h b/clients/drcachesim/common/trace_entry.h index c885ed1a72a..9cffd946533 100644 --- a/clients/drcachesim/common/trace_entry.h +++ b/clients/drcachesim/common/trace_entry.h @@ -583,6 +583,17 @@ typedef enum { */ TRACE_MARKER_TYPE_CORE_WAIT, + /** + * This marker is used for core-sharded analyses to indicate that the current + * core has no available inputs to run (all inputs are on other cores or are + * blocked waiting for kernel resources). A new marker is emitted each + * time the tool analysis framework requests a new record from the scheduler and + * is given a wait status. There are no units of time here but each repetition + * is roughly the time where a regular record could have been read and passed + * along. + */ + TRACE_MARKER_TYPE_CORE_IDLE, + // ... // These values are reserved for future built-in marker types. // ... diff --git a/clients/drcachesim/reader/reader.h b/clients/drcachesim/reader/reader.h index ad2b64b7f82..98dcae855c2 100644 --- a/clients/drcachesim/reader/reader.h +++ b/clients/drcachesim/reader/reader.h @@ -193,7 +193,8 @@ class reader_t : public std::iterator, is_record_synthetic() const override { if (cur_ref_.marker.type == TRACE_TYPE_MARKER && - cur_ref_.marker.marker_type == TRACE_MARKER_TYPE_CORE_WAIT) { + (cur_ref_.marker.marker_type == TRACE_MARKER_TYPE_CORE_WAIT || + cur_ref_.marker.marker_type == TRACE_MARKER_TYPE_CORE_IDLE)) { // These are synthetic records not part of the input and not // counting toward ordinals. 
return true; diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index 14fda834445..8786d8d5e0b 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -631,6 +631,8 @@ scheduler_tmpl_t::init( } } } + VPRINT(this, 1, "%zu inputs\n", inputs_.size()); + live_input_count_.store(inputs_.size(), std::memory_order_release); return set_initial_schedule(workload2inputs); } @@ -1313,7 +1315,7 @@ scheduler_tmpl_t::advance_region_of_interest( input.cur_region); if (input.cur_region >= static_cast(input.regions_of_interest.size())) { if (input.at_eof) - return sched_type_t::STATUS_EOF; + return eof_or_idle(output); else { // We let the user know we're done. if (options_.schedule_record_ostream != nullptr) { @@ -1329,7 +1331,7 @@ scheduler_tmpl_t::advance_region_of_interest( return status; } input.queue.push_back(create_thread_exit(input.tid)); - input.at_eof = true; + mark_input_eof(input); return sched_type_t::STATUS_SKIPPED; } } @@ -1408,7 +1410,7 @@ scheduler_tmpl_t::skip_instructions(output_ordinal_t out if (*input.reader == *input.reader_end) { // Raise error because the input region is out of bounds. VPRINT(this, 2, "skip_instructions: input=%d skip out of bounds\n", input.index); - input.at_eof = true; + mark_input_eof(input); return sched_type_t::STATUS_REGION_INVALID; } input.in_cur_region = true; @@ -1645,7 +1647,7 @@ scheduler_tmpl_t::pick_next_input_as_previously( { if (outputs_[output].record_index + 1 >= static_cast(outputs_[output].record.size())) - return sched_type_t::STATUS_EOF; + return eof_or_idle(output); const schedule_record_t &segment = outputs_[output].record[outputs_[output].record_index + 1]; index = segment.key.input; @@ -1719,7 +1721,7 @@ scheduler_tmpl_t::pick_next_input_as_previously( // queued candidate record, if any. 
clear_input_queue(inputs_[index]); inputs_[index].queue.push_back(create_thread_exit(inputs_[index].tid)); - inputs_[index].at_eof = true; + mark_input_eof(inputs_[index]); VPRINT(this, 2, "early end for input %d\n", index); // We're done with this entry but we need the queued record to be read, // so we do not move past the entry. @@ -1773,7 +1775,11 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu const schedule_record_t &segment = outputs_[output].record[outputs_[output].record_index]; int input = segment.key.input; - VPRINT(this, res == sched_type_t::STATUS_WAIT ? 3 : 2, + VPRINT(this, + (res == sched_type_t::STATUS_IDLE || + res == sched_type_t::STATUS_WAIT) + ? 3 + : 2, "next_record[%d]: replay segment in=%d (@%" PRId64 ") type=%d start=%" PRId64 " end=%" PRId64 "\n", output, input, @@ -1819,10 +1825,10 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu // We found a direct switch target above. } else if (ready_queue_empty()) { if (prev_index == INVALID_INPUT_ORDINAL) - return sched_type_t::STATUS_EOF; + return eof_or_idle(output); std::lock_guard lock(*inputs_[prev_index].lock); if (inputs_[prev_index].at_eof) - return sched_type_t::STATUS_EOF; + return eof_or_idle(output); else index = prev_index; // Go back to prior. 
} else { @@ -1836,7 +1842,7 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu } input_info_t *queue_next = pop_from_ready_queue(output); if (queue_next == nullptr) - return sched_type_t::STATUS_EOF; + return eof_or_idle(output); index = queue_next->index; } } else if (options_.deps == DEPENDENCY_TIMESTAMPS) { @@ -1850,7 +1856,7 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu } } if (index < 0) - return sched_type_t::STATUS_EOF; + return eof_or_idle(output); VPRINT(this, 2, "next_record[%d]: advancing to timestamp %" PRIu64 " == input #%d\n", @@ -1883,14 +1889,15 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu std::lock_guard lock(*inputs_[index].lock); if (inputs_[index].at_eof || *inputs_[index].reader == *inputs_[index].reader_end) { - VPRINT(this, 2, "next_record[%d]: local index %d == input #%d at eof\n", - output, outputs_[output].input_indices_index, index); + VPRINT(this, 2, "next_record[%d]: input #%d at eof\n", output, index); if (options_.schedule_record_ostream != nullptr && prev_index != INVALID_INPUT_ORDINAL) close_schedule_segment(output, inputs_[prev_index]); - inputs_[index].at_eof = true; + if (!inputs_[index].at_eof) + mark_input_eof(inputs_[index]); index = INVALID_INPUT_ORDINAL; // Loop and pick next thread. + prev_index = INVALID_INPUT_ORDINAL; continue; } break; @@ -1911,7 +1918,7 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, // check for quantum end. outputs_[output].cur_time = cur_time; // Invalid values are checked below. if (!outputs_[output].active) - return sched_type_t::STATUS_WAIT; + return sched_type_t::STATUS_IDLE; if (outputs_[output].waiting) { VPRINT(this, 5, "next_record[%d]: need new input (cur=waiting)\n", output); sched_type_t::stream_status_t res = pick_next_input(output, true); @@ -1922,7 +1929,7 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, if (outputs_[output].cur_input < 0) { // This happens with more outputs than inputs. 
For non-empty outputs we // require cur_input to be set to >=0 during init(). - return sched_type_t::STATUS_EOF; + return eof_or_idle(output); } input = &inputs_[outputs_[output].cur_input]; auto lock = std::unique_lock(*input->lock); @@ -1970,6 +1977,8 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, input->needs_advance = true; } if (input->at_eof || *input->reader == *input->reader_end) { + if (!input->at_eof) + mark_input_eof(*input); lock.unlock(); VPRINT(this, 5, "next_record[%d]: need new input (cur=%d eof)\n", output, input->index); @@ -1998,6 +2007,7 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, if (outputs_[output].record_index >= static_cast(outputs_[output].record.size())) { // We're on the last record. + VPRINT(this, 4, "next_record[%d]: on last record\n", output); } else if (outputs_[output].record[outputs_[output].record_index].type == schedule_record_t::SKIP) { VPRINT(this, 5, "next_record[%d]: need new input after skip\n", output); @@ -2257,6 +2267,27 @@ scheduler_tmpl_t::stop_speculation(output_ordinal_t outp return sched_type_t::STATUS_OK; } +template +void +scheduler_tmpl_t::mark_input_eof(input_info_t &input) +{ + input.at_eof = true; + assert(live_input_count_.load(std::memory_order_acquire) > 0); + live_input_count_.fetch_add(-1, std::memory_order_release); +} + +template +typename scheduler_tmpl_t::stream_status_t +scheduler_tmpl_t::eof_or_idle(output_ordinal_t output) +{ + if (live_input_count_.load(std::memory_order_acquire) == 0) { + return sched_type_t::STATUS_EOF; + } else { + outputs_[output].waiting = true; + return sched_type_t::STATUS_IDLE; + } +} + template typename scheduler_tmpl_t::stream_status_t scheduler_tmpl_t::set_output_active(output_ordinal_t output, diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h index 046c57e3085..e9f999117bb 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -45,6 +45,7 @@ #include 
#include +#include #include #include #include @@ -112,7 +113,9 @@ template class scheduler_tmpl_t { * (#DEPENDENCY_TIMESTAMPS) to avoid one stream getting ahead of another. For * replaying a schedule as it was traced with #MAP_TO_RECORDED_OUTPUT this can * indicate an idle period on a core where the traced workload was not currently - * scheduled. + * scheduled, but generally #STATUS_WAIT should be treated as artificial. + * Simulators are suggested to not advance simulated time for #STATUS_WAIT while + * they should advance time for #STATUS_IDLE. */ STATUS_WAIT, STATUS_INVALID, /**< Error condition. */ @@ -120,6 +123,15 @@ template class scheduler_tmpl_t { STATUS_NOT_IMPLEMENTED, /**< Feature not implemented. */ STATUS_SKIPPED, /**< Used for internal scheduler purposes. */ STATUS_RECORD_FAILED, /**< Failed to record schedule for future replay. */ + /** + * This code indicates that all inputs are blocked waiting for kernel resources + * (such as i/o). This is similar to #STATUS_WAIT, but #STATUS_WAIT indicates an + * artificial pause due to imposing the original ordering while #STATUS_IDLE + * indicates actual idle time in the application. Simulators are suggested + * to not advance simulated time for #STATUS_WAIT while they should advance + * time for #STATUS_IDLE. + */ + STATUS_IDLE, }; /** Identifies an input stream by its index. */ @@ -629,7 +641,7 @@ template class scheduler_tmpl_t { /** * Disables or re-enables this output stream. If "active" is false, this * stream becomes inactive and its currently assigned input is moved to the - * ready queue to be scheduled on other outputs. The #STATUS_WAIT code is + * ready queue to be scheduled on other outputs. The #STATUS_IDLE code is * returned to next_record() for inactive streams. If "active" is true, * this stream becomes active again. * This is only supported for #MAP_TO_ANY_OUTPUT. @@ -1076,7 +1088,7 @@ template class scheduler_tmpl_t { // sched_lock_. 
std::vector record; int record_index = 0; - bool waiting = false; + bool waiting = false; // Waiting or idling. bool active = true; // Used for time-based quanta. uint64_t cur_time = 0; @@ -1259,6 +1271,13 @@ template class scheduler_tmpl_t { stream_status_t set_output_active(output_ordinal_t output, bool active); + // Caller must hold the input's lock. + void + mark_input_eof(input_info_t &input); + + stream_status_t + eof_or_idle(output_ordinal_t output); + /////////////////////////////////////////////////////////////////////////// // Support for ready queues for who to schedule next: @@ -1325,6 +1344,8 @@ template class scheduler_tmpl_t { flexible_queue_t ready_priority_; // Global ready queue counter used to provide FIFO for same-priority inputs. uint64_t ready_counter_ = 0; + // Count of inputs not yet at eof. + std::atomic live_input_count_; // Map from workload,tid pair to input. struct workload_tid_t { workload_tid_t(int wl, memref_tid_t tid) diff --git a/clients/drcachesim/tests/schedule_stats_nopreempt.templatex b/clients/drcachesim/tests/schedule_stats_nopreempt.templatex index 9f1ef413e23..f96422fd667 100644 --- a/clients/drcachesim/tests/schedule_stats_nopreempt.templatex +++ b/clients/drcachesim/tests/schedule_stats_nopreempt.templatex @@ -14,6 +14,8 @@ Total counts: 2 maybe-blocking system calls 0 direct switch requests 0 waits + 0 idles + 100.00% cpu busy Core #0 counts: . threads *[0-9]* instructions @@ -28,6 +30,8 @@ Core #0 counts: . maybe-blocking system calls 0 direct switch requests 0 waits + 0 idles + 100.00% cpu busy Core #1 counts: . threads *[0-9]* instructions @@ -42,6 +46,8 @@ Core #1 counts: . maybe-blocking system calls 0 direct switch requests 0 waits + 0 idles + 100.00% cpu busy Core #2 counts: . threads *[0-9]* instructions @@ -56,6 +62,8 @@ Core #2 counts: . maybe-blocking system calls 0 direct switch requests 0 waits + 0 idles + 100.00% cpu busy Core #3 counts: . threads *[0-9]* instructions @@ -70,6 +78,8 @@ Core #3 counts: . 
maybe-blocking system calls 0 direct switch requests 0 waits + 0 idles + 100.00% cpu busy Core #0 schedule: [A-H,]* Core #1 schedule: [A-H,]* Core #2 schedule: [A-H,]* diff --git a/clients/drcachesim/tests/schedule_stats_test.cpp b/clients/drcachesim/tests/schedule_stats_test.cpp index 897ffe46fbf..48177635375 100644 --- a/clients/drcachesim/tests/schedule_stats_test.cpp +++ b/clients/drcachesim/tests/schedule_stats_test.cpp @@ -54,6 +54,7 @@ namespace drmemtrace { using ::dynamorio::drmemtrace::default_memtrace_stream_t; using ::dynamorio::drmemtrace::memref_t; using ::dynamorio::drmemtrace::memref_tid_t; +using ::dynamorio::drmemtrace::TRACE_MARKER_TYPE_CORE_IDLE; using ::dynamorio::drmemtrace::TRACE_MARKER_TYPE_CORE_WAIT; using ::dynamorio::drmemtrace::TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH; using ::dynamorio::drmemtrace::TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL; @@ -198,10 +199,65 @@ test_basic_stats() return true; } +static bool +test_idle() +{ + static constexpr int64_t TID_A = 42; + static constexpr int64_t TID_B = 142; + static constexpr int64_t TID_C = 242; + std::unordered_map tid2ord; + tid2ord[TID_A] = 0; + tid2ord[TID_B] = 1; + tid2ord[TID_C] = 2; + std::vector> memrefs = { + { + gen_instr(TID_B), + gen_instr(TID_B), + gen_marker(TID_B, TRACE_MARKER_TYPE_CORE_IDLE, 0), + gen_marker(TID_B, TRACE_MARKER_TYPE_CORE_IDLE, 0), + gen_marker(TID_B, TRACE_MARKER_TYPE_CORE_IDLE, 0), + gen_instr(TID_B), + gen_instr(TID_B), + gen_instr(TID_B), + }, + { + gen_instr(TID_C), + // Involuntary switch. + gen_instr(TID_A), + // Involuntary switch. + gen_instr(TID_C), + gen_marker(TID_C, TRACE_MARKER_TYPE_CORE_IDLE, 0), + gen_marker(TID_C, TRACE_MARKER_TYPE_CORE_IDLE, 0), + gen_marker(TID_C, TRACE_MARKER_TYPE_CORE_IDLE, 0), + gen_instr(TID_C), + gen_instr(TID_C), + // Wait. + gen_marker(TID_C, TRACE_MARKER_TYPE_CORE_WAIT, 0), + gen_marker(TID_C, TRACE_MARKER_TYPE_CORE_WAIT, 0), + gen_marker(TID_C, TRACE_MARKER_TYPE_CORE_WAIT, 0), + // Involuntary switch. 
+ gen_instr(TID_A), + gen_instr(TID_A), + gen_instr(TID_A), + }, + }; + auto result = run_schedule_stats(memrefs, tid2ord); + assert(result.instrs == 13); + assert(result.total_switches == 3); + assert(result.voluntary_switches == 0); + assert(result.direct_switches == 0); + assert(result.syscalls == 0); + assert(result.maybe_blocking_syscalls == 0); + assert(result.direct_switch_requests == 0); + assert(result.waits == 3); + assert(result.idles == 6); + return true; +} + int test_main(int argc, const char *argv[]) { - if (test_basic_stats()) { + if (test_basic_stats() && test_idle()) { std::cerr << "schedule_stats_test passed\n"; return 0; } diff --git a/clients/drcachesim/tests/scheduler_launcher.cpp b/clients/drcachesim/tests/scheduler_launcher.cpp index bfebb7e5861..814a0b703fa 100644 --- a/clients/drcachesim/tests/scheduler_launcher.cpp +++ b/clients/drcachesim/tests/scheduler_launcher.cpp @@ -155,6 +155,11 @@ simulate_core(int ordinal, scheduler_t::stream_t *stream, const scheduler_t &sch memref_t record; uint64_t micros = op_sched_time.get_value() ? get_current_microseconds() : 0; uint64_t cur_segment_instrs = 0; + // XXX: Should we remove this tool as the schedule_stats analysis tool has + superseded it? We're basically duplicating new features in both. + bool prev_was_wait = false, prev_was_idle = false; + // Measure cpu usage by counting each next_record() as one cycle. + uint64_t cycles_total = 0, cycles_busy = 0; // Thread ids can be duplicated, so use the input ordinals to distinguish. 
scheduler_t::input_ordinal_t prev_input = scheduler_t::INVALID_INPUT_ORDINAL; for (scheduler_t::stream_status_t status = stream->next_record(record, micros); @@ -162,13 +167,28 @@ simulate_core(int ordinal, scheduler_t::stream_t *stream, const scheduler_t &sch status = stream->next_record(record, micros)) { if (op_sched_time.get_value()) micros = get_current_microseconds(); + ++cycles_total; if (status == scheduler_t::STATUS_WAIT) { - thread_sequence += '-'; + if (!prev_was_wait || cur_segment_instrs == op_print_every.get_value()) + thread_sequence += '-'; + ++cur_segment_instrs; + if (cur_segment_instrs == op_print_every.get_value()) + cur_segment_instrs = 0; + prev_was_wait = true; std::this_thread::yield(); continue; - } - if (status != scheduler_t::STATUS_OK) + } else if (status == scheduler_t::STATUS_IDLE) { + if (!prev_was_idle || cur_segment_instrs == op_print_every.get_value()) + thread_sequence += '_'; + ++cur_segment_instrs; + if (cur_segment_instrs == op_print_every.get_value()) + cur_segment_instrs = 0; + prev_was_idle = true; + std::this_thread::yield(); + continue; + } else if (status != scheduler_t::STATUS_OK) FATAL_ERROR("scheduler failed to advance: %d", status); + ++cycles_busy; if (op_verbose.get_value() >= 4) { std::ostringstream line; line << "Core #" << std::setw(2) << ordinal << " @" << std::setw(9) @@ -249,6 +269,11 @@ simulate_core(int ordinal, scheduler_t::stream_t *stream, const scheduler_t &sch } #endif } + float usage = 0; + if (cycles_total > 0) + usage = 100.f * cycles_busy / static_cast(cycles_total); + std::cerr << "Core #" << std::setw(2) << ordinal << " usage: " << std::setw(9) + << usage << "%\n"; } } // namespace diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp index 117e6f73e98..2e111ccd20c 100644 --- a/clients/drcachesim/tests/scheduler_unit_tests.cpp +++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp @@ -842,6 +842,10 @@ run_lockstep_simulation(scheduler_t 
&scheduler, int num_outputs, memref_tid_t ti sched_as_string[i] += '-'; continue; } + if (status == scheduler_t::STATUS_IDLE) { + sched_as_string[i] += '_'; + continue; + } assert(status == scheduler_t::STATUS_OK); if (type_is_instr(memref.instr.type)) { sched_as_string[i] += @@ -895,7 +899,7 @@ test_synthetic() // core alternates; with an odd number the 2nd core finishes early. // The dots are thread exits. static const char *const CORE0_SCHED_STRING = "AAACCCEEEGGGBBBDDDFFFAAA.CCC.EEE.GGG."; - static const char *const CORE1_SCHED_STRING = "BBBDDDFFFAAACCCEEEGGGBBB.DDD.FFF."; + static const char *const CORE1_SCHED_STRING = "BBBDDDFFFAAACCCEEEGGGBBB.DDD.FFF.____"; { // Test instruction quanta. std::vector sched_inputs; @@ -1048,7 +1052,7 @@ test_synthetic_time_quanta() check_next(cpu0, ++time, scheduler_t::STATUS_OK, TID_C, TRACE_TYPE_INSTR); check_next(cpu0, ++time, scheduler_t::STATUS_OK, TID_C, TRACE_TYPE_INSTR); check_next(cpu0, time, scheduler_t::STATUS_OK, TID_C, TRACE_TYPE_THREAD_EXIT); - check_next(cpu0, time, scheduler_t::STATUS_EOF); + check_next(cpu0, time, scheduler_t::STATUS_IDLE); check_next(cpu1, time, scheduler_t::STATUS_OK, TID_B, TRACE_TYPE_THREAD_EXIT); check_next(cpu1, time, scheduler_t::STATUS_EOF); if (scheduler.write_recorded_schedule() != scheduler_t::STATUS_SUCCESS) @@ -1079,7 +1083,7 @@ test_synthetic_time_quanta() for (int i = 0; i < NUM_OUTPUTS; i++) { std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; } - assert(sched_as_string[0] == "..A..CCC."); + assert(sched_as_string[0] == "..A..CCC._"); assert(sched_as_string[1] == "..BAA.BB."); } #endif @@ -1158,8 +1162,9 @@ test_synthetic_with_timestamps() // workloads we should start with {C,F,I,J} and then move on to {B,E,H} and finish // with {A,D,G}. We should interleave within each group -- except once we reach J // we should completely finish it. 
- assert(sched_as_string[0] == - ".CC.C.II.IC.CC.F.FF.I.II.FF.F..BB.B.HH.HE.EE.BB.B.HH.H..DD.DA.AA.G.GG.DD.D."); + assert( + sched_as_string[0] == + ".CC.C.II.IC.CC.F.FF.I.II.FF.F..BB.B.HH.HE.EE.BB.B.HH.H..DD.DA.AA.G.GG.DD.D._"); assert(sched_as_string[1] == ".FF.F.JJ.JJ.JJ.JJ.J.CC.C.II.I..EE.EB.BB.H.HH.EE.E..AA.A.GG.GD.DD.AA.A.GG.G."); } @@ -1241,8 +1246,9 @@ test_synthetic_with_priorities() // See the test_synthetic_with_timestamps() test which has our base sequence. // We've elevated B, E, and H to higher priorities so they go // first. J remains uninterrupted due to lower timestamps. - assert(sched_as_string[0] == - ".BB.B.HH.HE.EE.BB.B.HH.H..FF.F.JJ.JJ.JJ.JJ.J.CC.C.II.I..DD.DA.AA.G.GG.DD.D."); + assert( + sched_as_string[0] == + ".BB.B.HH.HE.EE.BB.B.HH.H..FF.F.JJ.JJ.JJ.JJ.J.CC.C.II.I..DD.DA.AA.G.GG.DD.D._"); assert(sched_as_string[1] == ".EE.EB.BB.H.HH.EE.E..CC.C.II.IC.CC.F.FF.I.II.FF.F..AA.A.GG.GD.DD.AA.A.GG.G."); } @@ -1308,11 +1314,11 @@ test_synthetic_with_bindings() std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; } // We have {A,B,C} on {2,4}, {D,E,F} on {0,1}, and {G,H,I} on {1,2,3}: - assert(sched_as_string[0] == ".DD.D.FF.FD.DD.F.FF.DD.D.FF.F."); - assert(sched_as_string[1] == ".EE.E.HH.HE.EE.I.II.EE.E."); + assert(sched_as_string[0] == ".DD.D.FF.FD.DD.F.FF.DD.D.FF.F._"); + assert(sched_as_string[1] == ".EE.E.HH.HE.EE.I.II.EE.E.______"); assert(sched_as_string[2] == ".AA.A.CC.CG.GG.C.CC.HH.H.CC.C."); - assert(sched_as_string[3] == ".GG.G.II.IH.HH.GG.G.II.I."); - assert(sched_as_string[4] == ".BB.BA.AA.B.BB.AA.A.BB.B."); + assert(sched_as_string[3] == ".GG.G.II.IH.HH.GG.G.II.I._____"); + assert(sched_as_string[4] == ".BB.BA.AA.B.BB.AA.A.BB.B._____"); } static void @@ -1379,11 +1385,11 @@ test_synthetic_with_bindings_weighted() std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; } // We have {A,B,C} on {2,4}, {D,E,F} on {0,1}, and {G,H,I} on {1,2,3}: - assert(sched_as_string[0] == 
".FF.FF.FF.FF.F..EE.EE.EE.EE.E."); - assert(sched_as_string[1] == ".II.II.II.II.I..DD.DD.DD.DD.D."); - assert(sched_as_string[2] == ".CC.CC.CC.CC.C..AA.AA.AA.AA.A."); + assert(sched_as_string[0] == ".FF.FF.FF.FF.F..EE.EE.EE.EE.E._"); + assert(sched_as_string[1] == ".II.II.II.II.I..DD.DD.DD.DD.D._"); + assert(sched_as_string[2] == ".CC.CC.CC.CC.C..AA.AA.AA.AA.A._"); assert(sched_as_string[3] == ".HH.HH.HH.HH.H..GG.GG.GG.GG.G."); - assert(sched_as_string[4] == ".BB.BB.BB.BB.B."); + assert(sched_as_string[4] == ".BB.BB.BB.BB.B._______________"); } static void @@ -1472,7 +1478,7 @@ test_synthetic_with_syscalls_multiple() // blocking syscall. assert(sched_as_string[0] == ".B..HH.H.B.H.HH..B.HH.H..B.E.B...II.I.JJ.JJ.JJ.JJ.J.CC.C.II.I..DD.DA.AA.G.GG." - "DD.D."); + "DD.D.___"); assert(sched_as_string[1] == ".EE..B..EE..B..EE..B..EE...CC.C.FF.FB..C.CC.F.FF.I.II.FF.F..AA.A.GG.GD.DD.AA." "A.GG.G."); @@ -1538,7 +1544,7 @@ test_synthetic_with_syscalls_single() std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; } assert(sched_as_string[0] == ".AA..AA.A.A.AA..A."); - assert(sched_as_string[1] == ""); + assert(sched_as_string[1] == "__________________"); } static bool @@ -1778,7 +1784,7 @@ simulate_core(scheduler_t::stream_t *stream) memref_t record; for (scheduler_t::stream_status_t status = stream->next_record(record); status != scheduler_t::STATUS_EOF; status = stream->next_record(record)) { - if (status == scheduler_t::STATUS_WAIT) { + if (status == scheduler_t::STATUS_WAIT || status == scheduler_t::STATUS_IDLE) { std::this_thread::yield(); continue; } @@ -1998,7 +2004,7 @@ test_replay() // We expect 3 letter sequences (our quantum) alternating every-other as each // core alternates; with an odd number the 2nd core finishes early. 
static const char *const CORE0_SCHED_STRING = "AAACCCEEEGGGBBBDDDFFFAAA.CCC.EEE.GGG."; - static const char *const CORE1_SCHED_STRING = "BBBDDDFFFAAACCCEEEGGGBBB.DDD.FFF."; + static const char *const CORE1_SCHED_STRING = "BBBDDDFFFAAACCCEEEGGGBBB.DDD.FFF.____"; static constexpr memref_tid_t TID_BASE = 100; std::vector inputs[NUM_INPUTS]; @@ -2090,7 +2096,7 @@ simulate_core_and_record_schedule(scheduler_t::stream_t *stream, memtrace_stream_t *prev_stream = nullptr; for (scheduler_t::stream_status_t status = stream->next_record(record); status != scheduler_t::STATUS_EOF; status = stream->next_record(record)) { - if (status == scheduler_t::STATUS_WAIT) { + if (status == scheduler_t::STATUS_WAIT || status == scheduler_t::STATUS_IDLE) { std::this_thread::yield(); continue; } @@ -2287,7 +2293,7 @@ test_replay_timestamps() // Create a record file with timestamps requiring waiting. // We cooperate with the test_scheduler_t class which constructs this schedule: - static const char *const CORE0_SCHED_STRING = ".AAA-------------------------CCC."; + static const char *const CORE0_SCHED_STRING = ".AAA-------------------------CCC.____"; static const char *const CORE1_SCHED_STRING = ".BBB.CCCCCC.DDDAAABBBDDDAAA.BBB.DDD."; std::string record_fname = "tmp_test_replay_timestamp.zip"; test_scheduler_t test_scheduler; @@ -2515,7 +2521,8 @@ test_replay_limit() memref_t memref; for (scheduler_t::stream_status_t status = stream->next_record(memref); status != scheduler_t::STATUS_EOF; status = stream->next_record(memref)) { - if (status == scheduler_t::STATUS_WAIT) { + if (status == scheduler_t::STATUS_WAIT || + status == scheduler_t::STATUS_IDLE) { std::this_thread::yield(); continue; } @@ -2696,7 +2703,7 @@ test_replay_as_traced() // Synthesize a cpu-schedule file. 
std::string cpu_fname = "tmp_test_cpu_as_traced.zip"; - static const char *const CORE0_SCHED_STRING = "EEE-AAA-CCCAAACCCBBB.DDD."; + static const char *const CORE0_SCHED_STRING = "EEE-AAA-CCCAAACCCBBB.DDD.___"; static const char *const CORE1_SCHED_STRING = "---EEE.BBBDDDBBBDDDAAA.CCC."; { std::vector sched0; @@ -3018,11 +3025,11 @@ test_inactive() // Make cpu1 inactive. status = stream1->set_active(false); assert(status == scheduler_t::STATUS_OK); - check_next(stream1, scheduler_t::STATUS_WAIT); + check_next(stream1, scheduler_t::STATUS_IDLE); // Test making cpu1 inactive while it's already inactive. status = stream1->set_active(false); assert(status == scheduler_t::STATUS_OK); - check_next(stream1, scheduler_t::STATUS_WAIT); + check_next(stream1, scheduler_t::STATUS_IDLE); // Advance cpu0 to its quantum end. check_next(stream0, scheduler_t::STATUS_OK, TID_A, TRACE_TYPE_INSTR); // Ensure cpu0 now picks up the input that was on cpu1. @@ -3034,7 +3041,7 @@ test_inactive() // Make cpu0 inactive and cpu1 active. status = stream0->set_active(false); assert(status == scheduler_t::STATUS_OK); - check_next(stream0, scheduler_t::STATUS_WAIT); + check_next(stream0, scheduler_t::STATUS_IDLE); status = stream1->set_active(true); assert(status == scheduler_t::STATUS_OK); // Now cpu1 should finish things. 
@@ -3070,7 +3077,7 @@ test_inactive() for (int i = 0; i < NUM_OUTPUTS; i++) { std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; } - assert(sched_as_string[0] == "..AABBA."); + assert(sched_as_string[0] == "..AABBA._"); assert(sched_as_string[1] == "..B---B."); } #endif // HAS_ZIP diff --git a/clients/drcachesim/tools/basic_counts.cpp b/clients/drcachesim/tools/basic_counts.cpp index 620d28a02d8..0fbf02a47c5 100644 --- a/clients/drcachesim/tools/basic_counts.cpp +++ b/clients/drcachesim/tools/basic_counts.cpp @@ -169,7 +169,8 @@ basic_counts_t::parallel_shard_memref(void *shard_data, const memref_t &memref) } else if (memref.marker.marker_type == TRACE_MARKER_TYPE_KERNEL_EVENT || memref.marker.marker_type == TRACE_MARKER_TYPE_KERNEL_XFER) { ++counters->xfer_markers; - } else if (memref.marker.marker_type == TRACE_MARKER_TYPE_CORE_WAIT) { + } else if (memref.marker.marker_type == TRACE_MARKER_TYPE_CORE_WAIT || + memref.marker.marker_type == TRACE_MARKER_TYPE_CORE_IDLE) { // This is a synthetic record so do not increment any counts. 
} else { if (memref.marker.marker_type == TRACE_MARKER_TYPE_WINDOW_ID && diff --git a/clients/drcachesim/tools/schedule_stats.cpp b/clients/drcachesim/tools/schedule_stats.cpp index ad12644a3b5..fdc250fb4b8 100644 --- a/clients/drcachesim/tools/schedule_stats.cpp +++ b/clients/drcachesim/tools/schedule_stats.cpp @@ -136,9 +136,10 @@ schedule_stats_t::parallel_shard_error(void *shard_data) bool schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref) { - static constexpr char THREAD_LETTER_START = 'A'; - static constexpr char THREAD_SEPARATOR = ','; + static constexpr char THREAD_LETTER_INITIAL_START = 'A'; + static constexpr char THREAD_LETTER_SUBSEQUENT_START = 'a'; static constexpr char WAIT_SYMBOL = '-'; + static constexpr char IDLE_SYMBOL = '_'; per_shard_t *shard = reinterpret_cast(shard_data); if (knob_verbose_ >= 4) { std::ostringstream line; @@ -163,13 +164,29 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref memref.marker.marker_type == TRACE_MARKER_TYPE_CORE_WAIT) { ++shard->counters.waits; if (!shard->prev_was_wait) { - shard->thread_sequence += '-'; + shard->thread_sequence += WAIT_SYMBOL; shard->cur_segment_instrs = 0; shard->prev_was_wait = true; } else { ++shard->cur_segment_instrs; if (shard->cur_segment_instrs == knob_print_every_) { shard->thread_sequence += WAIT_SYMBOL; + shard->cur_segment_instrs = 0; + } + } + return true; + } else if (memref.marker.type == TRACE_TYPE_MARKER && + memref.marker.marker_type == TRACE_MARKER_TYPE_CORE_IDLE) { + ++shard->counters.idles; + if (!shard->prev_was_idle) { + shard->thread_sequence += IDLE_SYMBOL; + shard->cur_segment_instrs = 0; + shard->prev_was_idle = true; + } else { + ++shard->cur_segment_instrs; + if (shard->cur_segment_instrs == knob_print_every_) { + shard->thread_sequence += IDLE_SYMBOL; + shard->cur_segment_instrs = 0; } } return true; @@ -183,12 +200,9 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref 
++shard->counters.voluntary_switches; if (shard->direct_switch_target == memref.marker.tid) ++shard->counters.direct_switches; - // A comma separating each sequence makes it a little easier to - // read, and helps distinguish a switch from two threads with the - // same %26 letter. (We could remove this though to compact it.) - shard->thread_sequence += THREAD_SEPARATOR; } - shard->thread_sequence += THREAD_LETTER_START + static_cast(input % 26); + shard->thread_sequence += + THREAD_LETTER_INITIAL_START + static_cast(input % 26); shard->cur_segment_instrs = 0; if (knob_verbose_ >= 2) { std::ostringstream line; @@ -213,7 +227,8 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref ++shard->counters.instrs; ++shard->cur_segment_instrs; if (shard->cur_segment_instrs == knob_print_every_) { - shard->thread_sequence += THREAD_LETTER_START + static_cast(input % 26); + shard->thread_sequence += + THREAD_LETTER_SUBSEQUENT_START + static_cast(input % 26); shard->cur_segment_instrs = 0; } shard->direct_switch_target = INVALID_THREAD_ID; @@ -237,6 +252,7 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref } else if (memref.exit.type == TRACE_TYPE_THREAD_EXIT) shard->saw_exit = true; shard->prev_was_wait = false; + shard->prev_was_idle = false; return true; } @@ -246,11 +262,15 @@ schedule_stats_t::print_counters(const counters_t &counters) std::cerr << std::setw(12) << counters.threads.size() << " threads\n"; std::cerr << std::setw(12) << counters.instrs << " instructions\n"; std::cerr << std::setw(12) << counters.total_switches << " total context switches\n"; - std::cerr << std::setw(12) << std::fixed << std::setprecision(7) - << (1000 * counters.total_switches / static_cast(counters.instrs)) + double cspki = 0.; + if (counters.instrs > 0) + cspki = 1000 * counters.total_switches / static_cast(counters.instrs); + std::cerr << std::setw(12) << std::fixed << std::setprecision(7) << cspki << " CSPKI (context switches 
per 1000 instructions)\n"; - std::cerr << std::setw(12) << std::fixed << std::setprecision(0) - << (counters.instrs / static_cast(counters.total_switches)) + double ipcs = 0.; + if (counters.total_switches > 0) + ipcs = counters.instrs / static_cast(counters.total_switches); + std::cerr << std::setw(12) << std::fixed << std::setprecision(0) << ipcs << " instructions per context switch\n"; std::cerr << std::setw(12) << std::fixed << std::setprecision(7) << counters.voluntary_switches << " voluntary context switches\n"; @@ -273,6 +293,11 @@ schedule_stats_t::print_counters(const counters_t &counters) std::cerr << std::setw(12) << counters.direct_switch_requests << " direct switch requests\n"; std::cerr << std::setw(12) << counters.waits << " waits\n"; + std::cerr << std::setw(12) << counters.idles << " idles\n"; + std::cerr << std::setw(12) << std::setprecision(2) + << 100 * + (counters.instrs / static_cast(counters.instrs + counters.idles)) + << "% cpu busy\n"; } bool diff --git a/clients/drcachesim/tools/schedule_stats.h b/clients/drcachesim/tools/schedule_stats.h index 073d754b6e8..9d15ccc0638 100644 --- a/clients/drcachesim/tools/schedule_stats.h +++ b/clients/drcachesim/tools/schedule_stats.h @@ -88,6 +88,7 @@ class schedule_stats_t : public analysis_tool_t { maybe_blocking_syscalls += rhs.maybe_blocking_syscalls; direct_switch_requests += rhs.direct_switch_requests; waits += rhs.waits; + idles += rhs.idles; for (const memref_tid_t tid : rhs.threads) { threads.insert(tid); } @@ -101,6 +102,7 @@ class schedule_stats_t : public analysis_tool_t { int64_t maybe_blocking_syscalls = 0; int64_t direct_switch_requests = 0; int64_t waits = 0; + int64_t idles = 0; std::unordered_set threads; }; counters_t @@ -121,6 +123,7 @@ class schedule_stats_t : public analysis_tool_t { std::string thread_sequence; uint64_t cur_segment_instrs = 0; bool prev_was_wait = false; + bool prev_was_idle = false; }; void diff --git a/clients/drcachesim/tools/view.cpp 
b/clients/drcachesim/tools/view.cpp index fcda9cb7682..2c77e04d682 100644 --- a/clients/drcachesim/tools/view.cpp +++ b/clients/drcachesim/tools/view.cpp @@ -432,6 +432,7 @@ view_t::parallel_shard_memref(void *shard_data, const memref_t &memref) case TRACE_MARKER_TYPE_CORE_WAIT: std::cerr << "\n"; break; + case TRACE_MARKER_TYPE_CORE_IDLE: std::cerr << "\n"; break; default: std::cerr << "\n"; From 3358aee19713e51f233e26e340d9dcc4ec9b5c9b Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Wed, 22 Nov 2023 15:16:29 -0500 Subject: [PATCH 2/5] Fix Windows warning; fix nopreempt test templatex --- clients/drcachesim/scheduler/scheduler.cpp | 2 +- .../tests/schedule_stats_nopreempt.templatex | 58 +++++++++---------- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index 8786d8d5e0b..1d58484ba64 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -632,7 +632,7 @@ scheduler_tmpl_t::init( } } VPRINT(this, 1, "%zu inputs\n", inputs_.size()); - live_input_count_.store(inputs_.size(), std::memory_order_release); + live_input_count_.store(static_cast(inputs_.size()), std::memory_order_release); return set_initial_schedule(workload2inputs); } diff --git a/clients/drcachesim/tests/schedule_stats_nopreempt.templatex b/clients/drcachesim/tests/schedule_stats_nopreempt.templatex index f96422fd667..8eeada43d63 100644 --- a/clients/drcachesim/tests/schedule_stats_nopreempt.templatex +++ b/clients/drcachesim/tests/schedule_stats_nopreempt.templatex @@ -4,83 +4,83 @@ Total counts: 8 threads 638938 instructions 5 total context switches - 0.0078255 CSPKI \(context switches per 1000 instructions\) + 0\.0078255 CSPKI \(context switches per 1000 instructions\) 127788 instructions per context switch 5 voluntary context switches 0 direct context switches - 100.00% voluntary switches - 0.00% direct switches + 100\.00% voluntary switches 
+ 0\.00% direct switches 161 system calls 2 maybe-blocking system calls 0 direct switch requests 0 waits - 0 idles - 100.00% cpu busy + *[0-9]* idles + *[0-9\.]*% cpu busy Core #0 counts: . threads *[0-9]* instructions . total context switches - 0.0[0-9.]* CSPKI \(context switches per 1000 instructions\) + 0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\) *[0-9]* instructions per context switch . voluntary context switches 0 direct context switches - 100.00% voluntary switches - 0.00% direct switches + 100\.00% voluntary switches + 0\.00% direct switches *[0-9]* system calls . maybe-blocking system calls 0 direct switch requests 0 waits - 0 idles - 100.00% cpu busy + *[0-9]* idles + *[0-9\.]*% cpu busy Core #1 counts: . threads *[0-9]* instructions . total context switches - 0.0[0-9.]* CSPKI \(context switches per 1000 instructions\) + 0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\) *[0-9]* instructions per context switch . voluntary context switches 0 direct context switches - 100.00% voluntary switches - 0.00% direct switches + 100\.00% voluntary switches + 0\.00% direct switches *[0-9]* system calls . maybe-blocking system calls 0 direct switch requests 0 waits - 0 idles - 100.00% cpu busy + *[0-9]* idles + *[0-9\.]*% cpu busy Core #2 counts: . threads *[0-9]* instructions . total context switches - 0.0[0-9.]* CSPKI \(context switches per 1000 instructions\) + 0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\) *[0-9]* instructions per context switch . voluntary context switches 0 direct context switches - 100.00% voluntary switches - 0.00% direct switches + 100\.00% voluntary switches + 0\.00% direct switches *[0-9]* system calls . maybe-blocking system calls 0 direct switch requests 0 waits - 0 idles - 100.00% cpu busy + *[0-9]* idles + *[0-9\.]*% cpu busy Core #3 counts: . threads *[0-9]* instructions . 
total context switches - 0.0[0-9.]* CSPKI \(context switches per 1000 instructions\) + 0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\) *[0-9]* instructions per context switch . voluntary context switches 0 direct context switches - 100.00% voluntary switches - 0.00% direct switches + 100\.00% voluntary switches + 0\.00% direct switches *[0-9]* system calls . maybe-blocking system calls 0 direct switch requests 0 waits - 0 idles - 100.00% cpu busy -Core #0 schedule: [A-H,]* -Core #1 schedule: [A-H,]* -Core #2 schedule: [A-H,]* -Core #3 schedule: [A-H,]* + *[0-9]* idles + *[0-9\.]*% cpu busy +Core #0 schedule: [A-Ha-h_]* +Core #1 schedule: [A-Ha-h_]* +Core #2 schedule: [A-Ha-h_]* +Core #3 schedule: [A-Ha-h_]* From 790ffd94c76c9304268d3e5ca1e074af8e90910c Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Wed, 22 Nov 2023 15:46:59 -0500 Subject: [PATCH 3/5] Do return EOF for MAP_TO_CONSISTENT_OUTPUT to avoid workers confusing analysis tools --- clients/drcachesim/analyzer.cpp | 1 + clients/drcachesim/scheduler/scheduler.cpp | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/clients/drcachesim/analyzer.cpp b/clients/drcachesim/analyzer.cpp index ddd51ba291f..c158c00cd84 100644 --- a/clients/drcachesim/analyzer.cpp +++ b/clients/drcachesim/analyzer.cpp @@ -560,6 +560,7 @@ analyzer_tmpl_t::process_tasks(analyzer_worker_data_t *w // stream ordinals we would need to add a scheduler API to inject it. record = create_wait_marker(); } else if (status == sched_type_t::STATUS_IDLE) { + assert(shard_type_ == SHARD_BY_CORE); // We let tools know about idle time so they can analyze cpu usage. // We synthesize a record here. If we wanted this to count toward output // stream ordinals we would need to add a scheduler API to inject it. 
diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index 1d58484ba64..a4109158902 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -2280,7 +2280,8 @@ template typename scheduler_tmpl_t::stream_status_t scheduler_tmpl_t::eof_or_idle(output_ordinal_t output) { - if (live_input_count_.load(std::memory_order_acquire) == 0) { + if (options_.mapping == MAP_TO_CONSISTENT_OUTPUT || + live_input_count_.load(std::memory_order_acquire) == 0) { return sched_type_t::STATUS_EOF; } else { outputs_[output].waiting = true; From 6c06c536bb0a28ebbeb9890dfbd31beaa9857775 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Wed, 22 Nov 2023 17:18:04 -0500 Subject: [PATCH 4/5] Add check for idle/wait to analysis_unit_tests --- clients/drcachesim/tests/analysis_unit_tests.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clients/drcachesim/tests/analysis_unit_tests.cpp b/clients/drcachesim/tests/analysis_unit_tests.cpp index a4fa2646801..c580d27efbb 100644 --- a/clients/drcachesim/tests/analysis_unit_tests.cpp +++ b/clients/drcachesim/tests/analysis_unit_tests.cpp @@ -156,6 +156,10 @@ test_queries() parallel_shard_memref(void *shard_data, const memref_t &memref) override { per_shard_t *shard = reinterpret_cast(shard_data); + if (memref.marker.type == TRACE_TYPE_MARKER && + (memref.marker.marker_type == TRACE_MARKER_TYPE_CORE_WAIT || + memref.marker.marker_type == TRACE_MARKER_TYPE_CORE_IDLE)) + return true; // These are our testing goals: these queries. // We have one thread for each of our NUM_INPUTS workloads. 
assert(shard->stream->get_output_cpuid() == shard->index); From e0b56294d04c7914d1eec9aaada32cd5f875550a Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Mon, 27 Nov 2023 21:48:10 -0500 Subject: [PATCH 5/5] Review requests: more robust prev-wait/idle; named constants; improved comments --- clients/drcachesim/common/trace_entry.h | 6 ++-- clients/drcachesim/scheduler/scheduler.cpp | 5 +++ clients/drcachesim/scheduler/scheduler.h | 10 +++--- .../drcachesim/tests/scheduler_launcher.cpp | 35 ++++++++++++------- clients/drcachesim/tools/schedule_stats.cpp | 15 ++++---- 5 files changed, 46 insertions(+), 25 deletions(-) diff --git a/clients/drcachesim/common/trace_entry.h b/clients/drcachesim/common/trace_entry.h index 9cffd946533..166c5f32055 100644 --- a/clients/drcachesim/common/trace_entry.h +++ b/clients/drcachesim/common/trace_entry.h @@ -588,9 +588,11 @@ typedef enum { * core has no available inputs to run (all inputs are on other cores or are * blocked waiting for kernel resources). A new marker is emitted each * time the tool analysis framework requests a new record from the scheduler and - * is given a wait status. There are no units of time here but each repetition + * is given an idle status. There are no units of time here but each repetition * is roughly the time where a regular record could have been read and passed - * along. + * along. This idle marker indicates that a core actually had no work to do, + * as opposed to #TRACE_MARKER_TYPE_CORE_WAIT which is an artifact of an + * imposed re-created schedule. 
*/ TRACE_MARKER_TYPE_CORE_IDLE, diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index a4109158902..e9494f44a80 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -1683,6 +1683,11 @@ scheduler_tmpl_t::pick_next_input_as_previously( // XXX i#5843: We may want to provide a kernel-mediated wait // feature so a multi-threaded simulator doesn't have to do a // spinning poll loop. + // XXX i#5843: For replaying a schedule as it was traced with + // MAP_TO_RECORDED_OUTPUT there may have been true idle periods during + // tracing where some other process than the traced workload was + // scheduled on a core. If we could identify those, we should return + // STATUS_IDLE rather than STATUS_WAIT. VPRINT(this, 3, "next_record[%d]: waiting for input %d instr #%" PRId64 "\n", output, index, segment.start_instruction); // Give up this input and go into a wait state. diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h index e9f999117bb..bf5d1808921 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -110,12 +110,12 @@ template class scheduler_tmpl_t { * For dynamic scheduling with cross-stream dependencies, the scheduler may pause * a stream if it gets ahead of another stream it should have a dependence on. * This value is also used for schedules following the recorded timestamps - * (#DEPENDENCY_TIMESTAMPS) to avoid one stream getting ahead of another. For - * replaying a schedule as it was traced with #MAP_TO_RECORDED_OUTPUT this can - * indicate an idle period on a core where the traced workload was not currently - * scheduled, but generally #STATUS_WAIT should be treated as artificial. + * (#DEPENDENCY_TIMESTAMPS) to avoid one stream getting ahead of another. 
+ * #STATUS_WAIT should be treated as artificial, an artifact of enforcing a + * recorded schedule on concurrent differently-timed output streams. * Simulators are suggested to not advance simulated time for #STATUS_WAIT while - * they should advance time for #STATUS_IDLE. + * they should advance time for #STATUS_IDLE as the latter indicates a true + * lack of work. */ STATUS_WAIT, STATUS_INVALID, /**< Error condition. */ diff --git a/clients/drcachesim/tests/scheduler_launcher.cpp b/clients/drcachesim/tests/scheduler_launcher.cpp index 814a0b703fa..b0cac30df6e 100644 --- a/clients/drcachesim/tests/scheduler_launcher.cpp +++ b/clients/drcachesim/tests/scheduler_launcher.cpp @@ -152,11 +152,15 @@ void simulate_core(int ordinal, scheduler_t::stream_t *stream, const scheduler_t &scheduler, std::string &thread_sequence) { + // XXX: Could we share some code with the schedule_stats analysis tool? + // Some features are now duplicated in both. + static constexpr char THREAD_LETTER_INITIAL_START = 'A'; + static constexpr char THREAD_LETTER_SUBSEQUENT_START = 'a'; + static constexpr char WAIT_SYMBOL = '-'; + static constexpr char IDLE_SYMBOL = '_'; memref_t record; uint64_t micros = op_sched_time.get_value() ? get_current_microseconds() : 0; uint64_t cur_segment_instrs = 0; - // XXX: Should we remove this tool as the schedule_stats analysis tool has - // superceded it? We're basically duplicating new features in both. bool prev_was_wait = false, prev_was_idle = false; // Measure cpu usage by counting each next_record() as one cycle. uint64_t cycles_total = 0, cycles_busy = 0; @@ -168,9 +172,14 @@ simulate_core(int ordinal, scheduler_t::stream_t *stream, const scheduler_t &sch if (op_sched_time.get_value()) micros = get_current_microseconds(); ++cycles_total; + // Cache and reset here to ensure we reset on early return paths. 
+ bool was_wait = prev_was_wait; + bool was_idle = prev_was_idle; + prev_was_wait = false; + prev_was_idle = false; if (status == scheduler_t::STATUS_WAIT) { - if (!prev_was_wait || cur_segment_instrs == op_print_every.get_value()) - thread_sequence += '-'; + if (!was_wait || cur_segment_instrs == op_print_every.get_value()) + thread_sequence += WAIT_SYMBOL; ++cur_segment_instrs; if (cur_segment_instrs == op_print_every.get_value()) cur_segment_instrs = 0; @@ -178,8 +187,8 @@ simulate_core(int ordinal, scheduler_t::stream_t *stream, const scheduler_t &sch std::this_thread::yield(); continue; } else if (status == scheduler_t::STATUS_IDLE) { - if (!prev_was_idle || cur_segment_instrs == op_print_every.get_value()) - thread_sequence += '_'; + if (!was_idle || cur_segment_instrs == op_print_every.get_value()) + thread_sequence += IDLE_SYMBOL; ++cur_segment_instrs; if (cur_segment_instrs == op_print_every.get_value()) cur_segment_instrs = 0; @@ -215,9 +224,8 @@ simulate_core(int ordinal, scheduler_t::stream_t *stream, const scheduler_t &sch scheduler_t::input_ordinal_t input = stream->get_input_stream_ordinal(); if (input != prev_input) { // We convert to letters which only works well for <=26 inputs. 
- if (!thread_sequence.empty()) - thread_sequence += ','; - thread_sequence += 'A' + static_cast(input % 26); + thread_sequence += + THREAD_LETTER_INITIAL_START + static_cast(input % 26); cur_segment_instrs = 0; if (op_verbose.get_value() >= 2) { std::ostringstream line; @@ -248,7 +256,8 @@ simulate_core(int ordinal, scheduler_t::stream_t *stream, const scheduler_t &sch if (type_is_instr(record.instr.type)) { ++cur_segment_instrs; if (cur_segment_instrs == op_print_every.get_value()) { - thread_sequence += 'A' + static_cast(input % 26); + thread_sequence += + THREAD_LETTER_SUBSEQUENT_START + static_cast(input % 26); cur_segment_instrs = 0; } } @@ -272,8 +281,10 @@ simulate_core(int ordinal, scheduler_t::stream_t *stream, const scheduler_t &sch float usage = 0; if (cycles_total > 0) usage = 100.f * cycles_busy / static_cast(cycles_total); - std::cerr << "Core #" << std::setw(2) << ordinal << " usage: " << std::setw(9) - << usage << "%\n"; + std::ostringstream line; + line << "Core #" << std::setw(2) << ordinal << " usage: " << std::setw(9) << usage + << "%\n"; + std::cerr << line.str(); } } // namespace diff --git a/clients/drcachesim/tools/schedule_stats.cpp b/clients/drcachesim/tools/schedule_stats.cpp index fdc250fb4b8..67bfe69b0e5 100644 --- a/clients/drcachesim/tools/schedule_stats.cpp +++ b/clients/drcachesim/tools/schedule_stats.cpp @@ -160,13 +160,18 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref line << "\n"; std::cerr << line.str(); } + // Cache and reset here to ensure we reset on early return paths. 
+ bool was_wait = shard->prev_was_wait; + bool was_idle = shard->prev_was_idle; + shard->prev_was_wait = false; + shard->prev_was_idle = false; if (memref.marker.type == TRACE_TYPE_MARKER && memref.marker.marker_type == TRACE_MARKER_TYPE_CORE_WAIT) { ++shard->counters.waits; - if (!shard->prev_was_wait) { + shard->prev_was_wait = true; + if (!was_wait) { shard->thread_sequence += WAIT_SYMBOL; shard->cur_segment_instrs = 0; - shard->prev_was_wait = true; } else { ++shard->cur_segment_instrs; if (shard->cur_segment_instrs == knob_print_every_) { @@ -178,10 +183,10 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref } else if (memref.marker.type == TRACE_TYPE_MARKER && memref.marker.marker_type == TRACE_MARKER_TYPE_CORE_IDLE) { ++shard->counters.idles; - if (!shard->prev_was_idle) { + shard->prev_was_idle = true; + if (!was_idle) { shard->thread_sequence += IDLE_SYMBOL; shard->cur_segment_instrs = 0; - shard->prev_was_idle = true; } else { ++shard->cur_segment_instrs; if (shard->cur_segment_instrs == knob_print_every_) { @@ -251,8 +256,6 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref } } else if (memref.exit.type == TRACE_TYPE_THREAD_EXIT) shard->saw_exit = true; - shard->prev_was_wait = false; - shard->prev_was_idle = false; return true; }