i#6971: Use instr count instead of wallclock for simulated time (#7015)

When using the drmemtrace scheduler in an analyzer or other tool that does not track simulated time with the default QUANTUM_INSTRUCTIONS, the scheduler used to use wall-clock time to measure blocking-input and idle time. Here we change that to use the instruction count plus the idle count via a new idle counter. The time_units_per_us and sched_time_units_per_us defaults are set to 1000, reflecting a 2gHz machine with IPC=0.5. The old time_units_per_us=100 for wall clock was too low; to match it with counts, we need a low sched_time_units_per_us: 500 is better than 1000, but that seems unrealisitc. Instead we can get the results we want from our large traces by exiting earlier, since most of the unwanted idle is still in seemingly unrepresentative regions at the end. We raise exit_if_fraction_left from 0.05 to 0.1 here. Using counters provides a more reproducible result across different runs and machines. Wall-clock time is still used to measure idle time on replay. Switching to the idle count added here is left as separate work under #7023. (Replay also uses wall-clock time to coordinate concurrent outputs beyond shared input constraints; that will likely always remain.) The new default values of the options were tested on larger traces and found to produce a representative level of idle time. This change means that the clock going backward problem (#6966) is no longer seen in default runs. The analyzer still supports wall-clock with the -sched_time option so a check to avoid underflow is added. Fixes #6971 Fixes #6966
DynamoRIO · Oct 4, 2024 · dff98b7 · dff98b7
1 parent 94f5c44
commit dff98b7
Show file tree

Hide file tree

Showing 5 changed files with 66 additions and 33 deletions.
diff --git a/clients/drcachesim/analyzer.cpp b/clients/drcachesim/analyzer.cpp
@@ -524,7 +524,7 @@ analyzer_tmpl_t<RecordType, ReaderType>::process_serial(analyzer_worker_data_t &
     while (true) {
         RecordType record;
         // The current time is used for time quanta; for instr quanta, it's ignored and
-        // we pass 0.
+        // we pass 0 and let the scheduler use instruction + idle counts.
         uint64_t cur_micros = sched_by_time_ ? get_current_microseconds() : 0;
         typename sched_type_t::stream_status_t status =
             worker.stream->next_record(record, cur_micros);

diff --git a/clients/drcachesim/common/options.cpp b/clients/drcachesim/common/options.cpp
@@ -908,17 +908,17 @@ droption_t<int64_t>
     // We pick 10 million to match 2 instructions per nanosecond with a 5ms quantum.
     op_sched_quantum(DROPTION_SCOPE_ALL, "sched_quantum", 10 * 1000 * 1000,
                      "Scheduling quantum",
-                     "Applies to -core_sharded and -core_serial. "
-                     "Scheduling quantum in instructions, unless -sched_time is set in "
-                     "which case this value is multiplied by -sched_time_per_us to "
-                     "produce a quantum in wall-clock microseconds.");
+                     "Applies to -core_sharded and -core_serial.  Scheduling quantum in "
+                     "instructions, unless -sched_time is set in which case this value "
+                     "is the quantum in simulated microseconds (equal to wall-clock "
+                     "microseconds multiplied by -sched_time_per_us).");
 
 droption_t<bool>
     op_sched_time(DROPTION_SCOPE_ALL, "sched_time", false,
                   "Whether to use time for the scheduling quantum",
-                  "Applies to -core_sharded and -core_serial. "
-                  "Whether to use wall-clock time for the scheduling quantum, with a "
-                  "value equal to -sched_quantum in microseconds of wall-clock time.");
+                  "Applies to -core_sharded and -core_serial.  Whether to use wall-clock "
+                  "time (multiplied by -sched_time_per_us) for measuring idle time and "
+                  "for the scheduling quantum (see -sched_quantum).");
 
 droption_t<bool> op_sched_order_time(DROPTION_SCOPE_ALL, "sched_order_time", true,
                                      "Whether to honor recorded timestamps for ordering",
@@ -1016,11 +1016,12 @@ droption_t<bool> op_sched_infinite_timeouts(
     "(set to false).");
 
 droption_t<double> op_sched_time_units_per_us(
-    DROPTION_SCOPE_ALL, "sched_time_units_per_us", 100.,
+    DROPTION_SCOPE_ALL, "sched_time_units_per_us", 1000.,
     "Time units per simulated microsecond",
-    "Time units (currently wall-clock time) per simulated microsecond.  This scales all "
-    "of the -sched_*_us values as it converts wall-clock time into the simulated "
-    "microseconds measured by those options.");
+    "Time units per simulated microsecond.  The units are either the instruction count "
+    "plus the idle count (the default) or if -sched_time is selected wall-clock "
+    "microseconds.  This option value scales all of the -sched_*_us values as it "
+    "converts time units into the simulated microseconds measured by those options.");
 
 droption_t<uint64_t> op_sched_migration_threshold_us(
     DROPTION_SCOPE_ALL, "sched_migration_threshold_us", 500,
@@ -1035,7 +1036,7 @@ droption_t<uint64_t> op_sched_rebalance_period_us(
     "to redistribute load.");
 
 droption_t<double> op_sched_exit_if_fraction_inputs_left(
-    DROPTION_SCOPE_FRONTEND, "sched_exit_if_fraction_inputs_left", 0.05,
+    DROPTION_SCOPE_FRONTEND, "sched_exit_if_fraction_inputs_left", 0.1,
     "Exit if non-EOF inputs left are <= this fraction of the total",
     "Applies to -core_sharded and -core_serial.  When an input reaches EOF, if the "
     "number of non-EOF inputs left as a fraction of the original inputs is equal to or "

diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp
@@ -2721,8 +2721,8 @@ scheduler_tmpl_t<RecordType, ReaderType>::pop_from_ready_queue_hold_locks(
             if (res->blocked_time > 0 &&
                 // cur_time can be 0 at initialization time.
                 (cur_time == 0 ||
-                 // XXX i#6966: We have seen wall-clock time go backward, which
-                 // underflows here and then always unblocks the input.
+                 // Guard against time going backward (happens for wall-clock: i#6966).
+                 cur_time < res->blocked_start_time ||
                  cur_time - res->blocked_start_time < res->blocked_time)) {
                 VPRINT(this, 4, "pop queue: %d still blocked for %" PRIu64 "\n",
                        res->index,
@@ -2733,6 +2733,8 @@ scheduler_tmpl_t<RecordType, ReaderType>::pop_from_ready_queue_hold_locks(
                 // This input is no longer blocked.
                 res->blocked_time = 0;
                 res->unscheduled = false;
+                VPRINT(this, 4, "pop queue: %d @ %" PRIu64 " no longer blocked\n",
+                       res->index, cur_time);
                 // We've found a candidate.  One final check if this is a migration.
                 bool found_candidate = false;
                 if (from_output == for_output)
@@ -2745,7 +2747,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::pop_from_ready_queue_hold_locks(
                            from_output, for_output, cur_time, res->last_run_time,
                            cur_time - res->last_run_time,
                            options_.migration_threshold_us);
-                    // Guard against time going backward, which happens: i#6966.
+                    // Guard against time going backward (happens for wall-clock: i#6966).
                     if (options_.migration_threshold_us == 0 || res->last_run_time == 0 ||
                         (cur_time > res->last_run_time &&
                          cur_time - res->last_run_time >=
@@ -2771,6 +2773,8 @@ scheduler_tmpl_t<RecordType, ReaderType>::pop_from_ready_queue_hold_locks(
     if (res == nullptr && !blocked.empty()) {
         // Do not hand out EOF thinking we're done: we still have inputs blocked
         // on i/o, so just wait and retry.
+        if (for_output != INVALID_OUTPUT_ORDINAL)
+            ++outputs_[for_output].idle_count;
         status = STATUS_IDLE;
     }
     // Re-add the ones we skipped, but without changing their counters so we preserve
@@ -3039,6 +3043,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input_as_previously(
         outputs_[output].waiting = true;
         outputs_[output].wait_start_time = get_output_time(output);
         outputs_[output].record_index->fetch_add(1, std::memory_order_release);
+        ++outputs_[output].idle_count;
         return sched_type_t::STATUS_IDLE;
     }
     index = segment.key.input;
@@ -3729,17 +3734,25 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
     // do return an error on a time smaller than an input's current start time when we
     // check for quantum end.
     if (cur_time == 0) {
-        // It's more efficient for QUANTUM_INSTRUCTIONS to get the time here instead of
-        // in get_output_time().  This also makes the two more similarly behaved with
-        // respect to blocking system calls.
-        // TODO i#6971: Use INSTRS_PER_US to replace .cur_time completely
-        // with a counter-based time, weighted appropriately for STATUS_IDLE.
-        cur_time = get_time_micros();
+        if (options_.mapping == MAP_AS_PREVIOUSLY) {
+            // XXX i#7023: We should instead store the simulator's time (whether
+            // passed in or our instr-based formula below) in the records and do away
+            // with wall-clock time for idle measurement.  Either way, we should make
+            // it clear in the docs whether the user/simulator has to pass in the
+            // time on replay.
+            cur_time = get_time_micros();
+        } else {
+            // We add 1 to avoid an invalid value of 0.
+            cur_time = 1 + outputs_[output].stream->get_output_instruction_ordinal() +
+                outputs_[output].idle_count;
+        }
     }
     // Invalid values for cur_time are checked below.
     outputs_[output].cur_time->store(cur_time, std::memory_order_release);
-    if (!outputs_[output].active->load(std::memory_order_acquire))
+    if (!outputs_[output].active->load(std::memory_order_acquire)) {
+        ++outputs_[output].idle_count;
         return sched_type_t::STATUS_IDLE;
+    }
     if (outputs_[output].waiting) {
         if (options_.mapping == MAP_AS_PREVIOUSLY &&
             outputs_[output].wait_start_time > 0) {
@@ -3752,6 +3765,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
                 VPRINT(this, 4,
                        "next_record[%d]: elapsed %" PRIu64 " < duration %" PRIu64 "\n",
                        output, now - outputs_[output].wait_start_time, duration);
+                // XXX i#7023: This should always be STATUS_IDLE, right?
                 return sched_type_t::STATUS_WAIT;
             } else
                 outputs_[output].wait_start_time = 0;
@@ -4266,6 +4280,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::eof_or_idle(output_ordinal_t output,
     if (prev_input != INVALID_INPUT_ORDINAL)
         ++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_SWITCH_INPUT_TO_IDLE];
     set_cur_input(output, INVALID_INPUT_ORDINAL);
+    ++outputs_[output].idle_count;
     return sched_type_t::STATUS_IDLE;
 }
 

diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h
@@ -739,8 +739,11 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
          * other parameters that are in microseconds (they all end in "_us": e.g.,
          * #quantum_duration_us) so that they operate on the right time scale for the
          * passed-in simulator time (or wall-clock microseconds if no time is passed).
+         * The default value is a rough estimate when no accurate simulated time is
+         * available: the instruction count is used in that case, and we use the
+         * instructions per microsecond for a 2GHz clock at 0.5 IPC as our default.
          */
-        double time_units_per_us = 100.;
+        double time_units_per_us = 1000.;
         /**
          * The scheduling quantum duration for preemption, in simulated microseconds,
          * for #QUANTUM_TIME.  This value is multiplied by #time_units_per_us to
@@ -815,7 +818,7 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
          * instruction count is not considered (as it is not available), use discretion
          * when raising this value on uneven inputs.
          */
-        double exit_if_fraction_inputs_left = 0.05;
+        double exit_if_fraction_inputs_left = 0.1;
         // When adding new options, also add to print_configuration().
     };
 
@@ -874,19 +877,21 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
         // diminished.
         /**
          * Advances to the next record in the stream.  Returns a status code on whether
-         * and how to continue.
+         * and how to continue.  Uses the instruction count plus idle count to this point
+         * as the time; use the variant that takes "cur_time" to instead provide a time.
          */
         virtual stream_status_t
         next_record(RecordType &record);
 
         /**
          * Advances to the next record in the stream.  Returns a status code on whether
          * and how to continue.  Supplies the current time for #QUANTUM_TIME.  The time
-         * should be considered to be the time prior to processing the returned record.
-         * The time is unitless but needs to be a globally consistent increasing value
-         * across all output streams.  #STATUS_INVALID is returned if 0 or a value smaller
-         * than the start time of the current input's quantum is passed in when
-         * #QUANTUM_TIME and #MAP_TO_ANY_OUTPUT are specified.
+         * should be considered to be the simulated time prior to processing the returned
+         * record.  The time's units can be chosen by the caller, with
+         * #dynamorio::drmemtrace::scheduler_tmpl_t::scheduler_options_t.time_units_per_us
+         * providing the conversion to simulated microseconds.  #STATUS_INVALID is
+         * returned if 0 or a value smaller than the start time of the current input's
+         * quantum is passed in when #QUANTUM_TIME and #MAP_TO_ANY_OUTPUT are specified.
          */
         virtual stream_status_t
         next_record(RecordType &record, uint64_t cur_time);
@@ -996,6 +1001,15 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
                 return scheduler_->get_input_stream(ordinal_)->get_instruction_ordinal();
             return cur_instr_count_;
         }
+        /**
+         * Identical to get_instruction_ordinal() but ignores the
+         * #SCHEDULER_USE_INPUT_ORDINALS flag.
+         */
+        uint64_t
+        get_output_instruction_ordinal() const
+        {
+            return cur_instr_count_;
+        }
         /**
          * Returns a name for the current input stream feeding this output stream. For
          * stored offline traces, this is the base name of the trace on disk. For online
@@ -1683,6 +1697,9 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
         // Exported statistics. Currently all integers and cast to double on export.
         std::vector<int64_t> stats =
             std::vector<int64_t>(memtrace_stream_t::SCHED_STAT_TYPE_COUNT);
+        // When no simulation time is passed to us, we use the idle count plus
+        // instruction count to measure time.
+        uint64_t idle_count = 0;
     };
 
     // Used for reading as-traced schedules.

diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp
@@ -2245,7 +2245,7 @@ test_synthetic_with_syscalls_precise()
     scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT,
                                                scheduler_t::DEPENDENCY_TIMESTAMPS,
                                                scheduler_t::SCHEDULER_DEFAULTS,
-                                               /*verbosity=*/4);
+                                               /*verbosity=*/3);
     sched_ops.blocking_switch_threshold = BLOCK_THRESHOLD;
     scheduler_t scheduler;
     if (scheduler.init(sched_inputs, 1, std::move(sched_ops)) !=
@@ -2349,7 +2349,7 @@ test_synthetic_with_syscalls_latencies()
     scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT,
                                                scheduler_t::DEPENDENCY_TIMESTAMPS,
                                                scheduler_t::SCHEDULER_DEFAULTS,
-                                               /*verbosity=*/4);
+                                               /*verbosity=*/3);
     // We use a mock time for a deterministic result.
     sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME;
     sched_ops.time_units_per_us = 1.;