i#6643: Support drmemtrace analysis for #instr intervals #6644

Merged
12 commits merged on Feb 12, 2024
Changes from 9 commits
5 changes: 5 additions & 0 deletions api/docs/release.dox
@@ -142,6 +142,11 @@ changes:
refers to timestamps and direct switches, which is what most users should want.
- Rename the macro INSTR_CREATE_mul_sve to INSTR_CREATE_mul_sve_imm to
differentiate it from the other SVE MUL instructions.
- Added a new drmemtrace analyzer option \p -interval_instr_count that enables trace
analyzer interval results for every given count of instrs in each shard. This mode
does not support merging the shard interval snapshots to output the whole-trace
interval snapshots. Instead, the print_interval_results() API is called separately
for each shard with the interval state snapshots of that shard.

Further non-compatibility-affecting changes include:
- Added DWARF-5 support to the drsyms library by linking in 4 static libraries
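For orientation, here is a minimal sketch of enabling the new option programmatically, based on the updated analyzer_tmpl_t constructor shown in the analyzer.h diff below. The include paths, the dynamorio::drmemtrace namespace usage, my_tool_create(), and the print_stats() call are assumptions for illustration, not part of this PR.

```cpp
#include <string>

#include "analysis_tool.h"
#include "analyzer.h"

// Assumed namespace for the drmemtrace analysis framework headers.
using namespace dynamorio::drmemtrace;

// Hypothetical factory standing in for any analysis tool (e.g. a counting tool).
analysis_tool_t *
my_tool_create();

bool
run_with_instr_count_intervals(const std::string &trace_dir)
{
    analysis_tool_t *tools[] = { my_tool_create() };
    // Parameter order per the updated constructor: worker_count, skip_instrs,
    // interval_microseconds, interval_instr_count. Passing 0 for
    // interval_microseconds and a non-zero interval_instr_count selects
    // per-shard instr count intervals.
    analyzer_t analyzer(trace_dir, tools, /*num_tools=*/1, /*worker_count=*/0,
                        /*skip_instrs=*/0, /*interval_microseconds=*/0,
                        /*interval_instr_count=*/10000);
    if (!analyzer.run())
        return false;
    // Interval results are then printed separately per shard via each tool's
    // print_interval_results(), as described in the release note above.
    return analyzer.print_stats();
}
```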
8 changes: 4 additions & 4 deletions clients/drcachesim/analysis_tool.h
@@ -231,10 +231,10 @@ template <typename RecordType> class analysis_tool_tmpl_t {
// to the specified -interval_microseconds.
uint64_t interval_end_timestamp = 0;

// Count of instructions: cumulative till this interval, and the incremental
// delta in this interval vs the previous one. May be useful for tools to
// compute PKI (per kilo instruction) metrics; obviates the need for each
// tool to duplicate this.
// Count of instructions: cumulative till this interval's end, and the
// incremental delta in this interval vs the previous one. May be useful for
// tools to compute PKI (per kilo instruction) metrics; obviates the need for
// each tool to duplicate this.
uint64_t instr_count_cumulative = 0;
uint64_t instr_count_delta = 0;

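As a concrete illustration of the PKI use case mentioned in the comment above, here is a minimal sketch. The miss_count_delta field and the my_interval_snapshot_t type are hypothetical; only instr_count_cumulative and instr_count_delta come from the interval_state_snapshot_t base shown in the diff.

```cpp
#include <cstdint>

// Hypothetical tool-defined snapshot: a real tool would subclass
// analysis_tool_t::interval_state_snapshot_t, which supplies
// instr_count_cumulative and instr_count_delta.
struct my_interval_snapshot_t {
    uint64_t instr_count_delta = 0; // Mirrors the base snapshot's delta field.
    uint64_t miss_count_delta = 0;  // Tool-specific metric for this interval.
};

// Misses per kilo-instruction (PKI) for one interval, using the delta fields
// so that each interval is normalized by its own instruction count.
static double
misses_pki(const my_interval_snapshot_t &snap)
{
    if (snap.instr_count_delta == 0)
        return 0.0;
    return snap.miss_count_delta * 1000.0 / snap.instr_count_delta;
}
```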
205 changes: 171 additions & 34 deletions clients/drcachesim/analyzer.cpp

Large diffs are not rendered by default.

88 changes: 75 additions & 13 deletions clients/drcachesim/analyzer.h
@@ -119,7 +119,8 @@ template <typename RecordType, typename ReaderType> class analyzer_tmpl_t {
analyzer_tmpl_t(const std::string &trace_path,
analysis_tool_tmpl_t<RecordType> **tools, int num_tools,
int worker_count = 0, uint64_t skip_instrs = 0,
uint64_t interval_microseconds = 0, int verbosity = 0);
uint64_t interval_microseconds = 0, uint64_t interval_instr_count = 0,
int verbosity = 0);
/** Launches the analysis process. */
virtual bool
run();
@@ -167,6 +168,8 @@ template <typename RecordType, typename ReaderType> class analyzer_tmpl_t {
}

uint64_t cur_interval_index;
// Cumulative instr count as it was just before the start of the current
// interval.
uint64_t cur_interval_init_instr_count;
// Identifier for the shard (thread or core id).
int64_t shard_id;
@@ -250,6 +253,9 @@ template <typename RecordType, typename ReaderType> class analyzer_tmpl_t {
bool
record_is_timestamp(const RecordType &record);

bool
record_is_instr(const RecordType &record);

RecordType
create_wait_marker();

@@ -262,27 +268,36 @@ template <typename RecordType, typename ReaderType> class analyzer_tmpl_t {
// finished. For serial analysis, it should remain the default value.
bool
process_interval(uint64_t interval_id, uint64_t interval_init_instr_count,
analyzer_worker_data_t *worker, bool parallel, int shard_idx = 0);
analyzer_worker_data_t *worker, bool parallel, bool at_instr_record,
int shard_idx = 0);

// Compute interval id for the given latest_timestamp, assuming the trace (or
// trace shard) starts at the given first_timestamp.
// trace shard) starts at the given first_timestamp. This is relevant when
// timestamp intervals are enabled using interval_microseconds_.
uint64_t
compute_timestamp_interval_id(uint64_t first_timestamp, uint64_t latest_timestamp);

// Compute interval id at the given instr count. This is relevant when instr count
// intervals are enabled using interval_instr_count_.
uint64_t
compute_interval_id(uint64_t first_timestamp, uint64_t latest_timestamp);
compute_instr_count_interval_id(uint64_t cur_instr_count);

// Compute the interval end timestamp for the given interval_id, assuming the trace
// (or trace shard) starts at the given first_timestamp.
// Compute the interval end timestamp (non-inclusive) for the given interval_id,
// assuming the trace (or trace shard) starts at the given first_timestamp.
uint64_t
compute_interval_end_timestamp(uint64_t first_timestamp, uint64_t interval_id);

// Possibly advances the current interval id stored in the worker data, based
// on the most recent seen timestamp in the trace stream. Returns whether the
// current interval id was updated, and if so also sets the previous interval index
// in prev_interval_index.
// at_instr_record indicates that the next record that will be presented to
// the analysis tools is an instr record.
bool
advance_interval_id(
typename scheduler_tmpl_t<RecordType, ReaderType>::stream_t *stream,
analyzer_shard_data_t *shard, uint64_t &prev_interval_index,
uint64_t &prev_interval_init_instr_count);
uint64_t &prev_interval_init_instr_count, bool at_instr_record);

// Collects interval results for all shards from the workers, and then optionally
// merges the shard-local intervals to form the whole-trace interval results using
@@ -305,6 +320,11 @@ template <typename RecordType, typename ReaderType> class analyzer_tmpl_t {
*> &merged_intervals,
int tool_idx);

// Populates the unmerged_interval_snapshots_ field based on the interval snapshots
// stored in worker_data_.
void
populate_unmerged_shard_interval_results();

// Combines all interval snapshots in the given vector to create the interval
// snapshot for the whole-trace interval ending at interval_end_timestamp and
// stores it in 'result'. These snapshots are for the tool at tool_idx. Returns
@@ -329,21 +349,63 @@ template <typename RecordType, typename ReaderType> class analyzer_tmpl_t {
std::vector<analyzer_worker_data_t> worker_data_;
int num_tools_;
analysis_tool_tmpl_t<RecordType> **tools_;
// Stores the interval state snapshots for the whole trace, which for the parallel
// mode are the resulting interval state snapshots after merging from all shards
// in merge_shard_interval_results.
// Stores the interval state snapshots, merged across shards. These are
// produced when timestamp intervals are enabled using interval_microseconds_.
//
// merged_interval_snapshots_[tool_idx] is a vector of the interval snapshots
// (in order of the intervals) for that tool.
// This may not be set, depending on the derived class's implementation of
// collect_and_maybe_merge_shard_interval_results.
// (in order of the intervals) for that tool. For the parallel mode, these
// interval state snapshots are produced after merging corresponding shard
// interval snapshots using merge_shard_interval_results.
std::vector<std::vector<
typename analysis_tool_tmpl_t<RecordType>::interval_state_snapshot_t *>>
merged_interval_snapshots_;

// Key that combines tool and shard idx for use with an std::unordered_map.
struct key_tool_shard_t {
int tool_idx;
int shard_idx;
bool
operator==(const key_tool_shard_t &rhs) const
{
return tool_idx == rhs.tool_idx && shard_idx == rhs.shard_idx;
}
};
struct key_tool_shard_hash_t {
std::size_t
operator()(const key_tool_shard_t &t) const
{
return std::hash<int>()(t.tool_idx ^ t.shard_idx);
}
};

// Stores the interval state snapshots, unmerged across shards. These are
// produced when instr count intervals are enabled using interval_instr_count_.
//
// unmerged_interval_snapshots_[(tool_idx, shard_idx)] is a vector
// of the interval snapshots for that tool and shard. Note that the snapshots for
// each shard are separate; they are not merged across shards.
//
// TODO i#6643: Figure out a useful way to merge instr count intervals across shards.
// One way is to merge the shard interval snapshots that correspond to the same
// [interval_instr_count_ * interval_id, interval_instr_count_ * (interval_id + 1))
// shard-local instrs. But it is not clear whether this is useful.
// Another way is to merge the shard interval snapshots that correspond to the same
// [interval_instr_count_ * interval_id, interval_instr_count_ * (interval_id + 1))
// whole-trace instrs. But that is much harder to compute. We'd need some way to
// identify the whole-trace interval boundaries in each shard's stream (since we
// process each shard separately); this would likely need a pre-processing pass.
std::unordered_map<key_tool_shard_t,
std::vector<typename analysis_tool_tmpl_t<
RecordType>::interval_state_snapshot_t *>,
key_tool_shard_hash_t>
unmerged_interval_snapshots_;

bool parallel_;
int worker_count_;
const char *output_prefix_ = "[analyzer]";
uint64_t skip_instrs_ = 0;
uint64_t interval_microseconds_ = 0;
uint64_t interval_instr_count_ = 0;
int verbosity_ = 0;
shard_type_t shard_type_ = SHARD_BY_THREAD;
bool sched_by_time_ = false;
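To illustrate the instr count interval numbering used by the declarations above (compute_instr_count_interval_id and advance_interval_id), here is a minimal sketch of the assumed mapping from a shard-local instruction count to a 1-based interval id; the analyzer's exact boundary and rounding choices may differ.

```cpp
#include <cstdint>

// Assumed illustration only: with interval_instr_count = N, interval 1 covers a
// shard's instrs 1..N, interval 2 covers N+1..2N, and so on. Records seen before
// the shard's first instr are attributed to interval 1 here.
static uint64_t
instr_count_interval_id(uint64_t cur_instr_count, uint64_t interval_instr_count)
{
    if (cur_instr_count == 0)
        return 1;
    return (cur_instr_count - 1) / interval_instr_count + 1;
}
```

Under this assumption, with -interval_instr_count 10000 a shard's 10000th instr still falls in interval 1 and its 10001st instr starts interval 2, which is consistent with the 10000-instruction delta expected for interval #1 in the new test template later in this diff.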
1 change: 1 addition & 0 deletions clients/drcachesim/analyzer_multi.cpp
@@ -111,6 +111,7 @@ analyzer_multi_t::analyzer_multi_t()
worker_count_ = op_jobs.get_value();
skip_instrs_ = op_skip_instrs.get_value();
interval_microseconds_ = op_interval_microseconds.get_value();
interval_instr_count_ = op_interval_instr_count.get_value();
// Initial measurements show it's sometimes faster to keep the parallel model
// of using single-file readers but use them sequentially, as opposed to
// the every-file interleaving reader, but the user can specify -jobs 1, so
12 changes: 11 additions & 1 deletion clients/drcachesim/common/options.cpp
@@ -521,7 +521,17 @@ droption_t<bytesize_t> op_interval_microseconds(
"Enable periodic heartbeats for intervals of given microseconds in the trace.",
"Desired length of each trace interval, defined in microseconds of trace time. "
"Trace intervals are measured using the TRACE_MARKER_TYPE_TIMESTAMP marker values. "
"If set, analysis tools receive a callback at the end of each interval.");
"If set, analysis tools receive a callback at the end of each interval, and one "
"at the end of trace analysis to print the whole-trace interval results.");

droption_t<bytesize_t> op_interval_instr_count(
DROPTION_SCOPE_FRONTEND, "interval_instr_count", 0,
"Enable periodic heartbeats for intervals of given per-shard instr count. ",
"Desired length of each trace interval, defined in instr count of each shard. "
"With -parallel, this does not support whole trace intervals, only per-shard "
"intervals. If set, analysis tools receive a callback at the end of each interval, "
"and separate callbacks per shard at the end of trace analysis to print each "
"shard's interval results.");

droption_t<int>
op_only_thread(DROPTION_SCOPE_FRONTEND, "only_thread", 0,
2 changes: 2 additions & 0 deletions clients/drcachesim/common/options.h
@@ -158,6 +158,8 @@ extern dynamorio::droption::droption_t<std::string> op_tracer_alt;
extern dynamorio::droption::droption_t<std::string> op_tracer_ops;
extern dynamorio::droption::droption_t<dynamorio::droption::bytesize_t>
op_interval_microseconds;
extern dynamorio::droption::droption_t<dynamorio::droption::bytesize_t>
op_interval_instr_count;
extern dynamorio::droption::droption_t<int> op_only_thread;
extern dynamorio::droption::droption_t<dynamorio::droption::bytesize_t> op_skip_instrs;
extern dynamorio::droption::droption_t<dynamorio::droption::bytesize_t> op_skip_refs;
@@ -0,0 +1,37 @@
Hello, world!
Basic counts tool results:
Total counts:
[ 0-9]* total \(fetched\) instructions
[ 0-9]* total unique \(fetched\) instructions
[ 0-9]* total non-fetched instructions
[ 0-9]* total prefetches
[ 0-9]* total data loads
[ 0-9]* total data stores
[ 0-9]* total icache flushes
[ 0-9]* total dcache flushes
1 total threads
[ 0-9]* total scheduling markers
.*
Thread [0-9]* counts:
[ 0-9]* \(fetched\) instructions
[ 0-9]* unique \(fetched\) instructions
[ 0-9]* non-fetched instructions
[ 0-9]* prefetches
[ 0-9]* data loads
[ 0-9]* data stores
[ 0-9]* icache flushes
[ 0-9]* dcache flushes
[ 0-9]* scheduling markers
.*
Printing unmerged per-shard interval results:
Counts per trace interval for TID.*
Interval #1 ending at timestamp [0-9]*:
10000 interval delta \(fetched\) instructions
[ 0-9]* interval delta non-fetched instructions
[ 0-9]* interval delta prefetches
[ 0-9]* interval delta data loads
[ 0-9]* interval delta data stores
[ 0-9]* interval delta icache flushes
[ 0-9]* interval delta dcache flushes
[ 0-9]* interval delta scheduling markers
.*
@@ -23,6 +23,7 @@ Thread [0-9]* counts:
[ 0-9]* dcache flushes
[ 0-9]* scheduling markers
.*
Printing whole-trace interval results:
Counts per trace interval for whole trace:
Interval #1 ending at timestamp [0-9]*:
[ 0-9]* interval delta \(fetched\) instructions