diff --git a/api/docs/release.dox b/api/docs/release.dox
index 3d864c96cd6..5acf8fb5e9b 100644
--- a/api/docs/release.dox
+++ b/api/docs/release.dox
@@ -142,6 +142,11 @@ changes:
    refers to timestamps and direct switches, which is what most users should want.
  - Rename the macro INSTR_CREATE_mul_sve to INSTR_CREATE_mul_sve_imm to
    differentiate it from the other SVE MUL instructions.
+ - Added a new drmemtrace analyzer option \p -interval_instr_count that enables trace
+   analyzer interval results for every given count of instrs in each shard. This mode
+   does not support merging the shard interval snapshots to output the whole-trace
+   interval snapshots. Instead, the print_interval_results() API is called separately
+   for each shard with the interval state snapshots of that shard.
 
 Further non-compatibility-affecting changes include:
  - Added DWARF-5 support to the drsyms library by linking in 4 static libraries
diff --git a/clients/drcachesim/analysis_tool.h b/clients/drcachesim/analysis_tool.h
index fe5a6c097d6..0c4a03534da 100644
--- a/clients/drcachesim/analysis_tool.h
+++ b/clients/drcachesim/analysis_tool.h
@@ -231,10 +231,10 @@ template <typename RecordType> class analysis_tool_tmpl_t {
         // to the specified -interval_microseconds.
         uint64_t interval_end_timestamp = 0;
 
-        // Count of instructions: cumulative till this interval, and the incremental
-        // delta in this interval vs the previous one. May be useful for tools to
-        // compute PKI (per kilo instruction) metrics; obviates the need for each
-        // tool to duplicate this.
+        // Count of instructions: cumulative till this interval's end, and the
+        // incremental delta in this interval vs the previous one. May be useful for
+        // tools to compute PKI (per kilo instruction) metrics; obviates the need for
+        // each tool to duplicate this.
         uint64_t instr_count_cumulative = 0;
         uint64_t instr_count_delta = 0;
 
diff --git a/clients/drcachesim/analyzer.cpp b/clients/drcachesim/analyzer.cpp
index 851890684b4..62f9c29bd45 100644
--- a/clients/drcachesim/analyzer.cpp
+++ b/clients/drcachesim/analyzer.cpp
@@ -115,6 +115,13 @@ analyzer_t::record_is_timestamp(const memref_t &record)
         record.marker.marker_type == TRACE_MARKER_TYPE_TIMESTAMP;
 }
 
+template <>
+bool
+analyzer_t::record_is_instr(const memref_t &record)
+{
+    return type_is_instr(record.instr.type);
+}
+
 template <>
 memref_t
 analyzer_t::create_wait_marker()
@@ -175,6 +182,13 @@ record_analyzer_t::record_is_timestamp(const trace_entry_t &record)
     return record.type == TRACE_TYPE_MARKER && record.size == TRACE_MARKER_TYPE_TIMESTAMP;
 }
 
+template <>
+bool
+record_analyzer_t::record_is_instr(const trace_entry_t &record)
+{
+    return type_is_instr(static_cast<trace_type_t>(record.type));
+}
+
 template <>
 trace_entry_t
 record_analyzer_t::create_wait_marker()
@@ -321,7 +335,7 @@ template <typename RecordType, typename ReaderType>
 analyzer_tmpl_t<RecordType, ReaderType>::analyzer_tmpl_t(
     const std::string &trace_path, analysis_tool_tmpl_t<RecordType> **tools,
     int num_tools, int worker_count, uint64_t skip_instrs, uint64_t interval_microseconds,
-    int verbosity)
+    uint64_t interval_instr_count, int verbosity)
     : success_(true)
     , num_tools_(num_tools)
     , tools_(tools)
@@ -329,8 +343,14 @@ analyzer_tmpl_t<RecordType, ReaderType>::analyzer_tmpl_t(
     , worker_count_(worker_count)
     , skip_instrs_(skip_instrs)
     , interval_microseconds_(interval_microseconds)
+    , interval_instr_count_(interval_instr_count)
     , verbosity_(verbosity)
 {
+    if (interval_microseconds_ > 0 && interval_instr_count_ > 0) {
+        success_ = false;
+        error_string_ = "Cannot enable both kinds of interval analysis";
+        return;
+    }
     // The scheduler will call reader_t::init() for each input file.  We assume
     // that won't block (analyzer_multi_t separates out IPC readers).
     typename sched_type_t::scheduler_options_t sched_ops;
@@ -385,10 +405,11 @@ analyzer_tmpl_t<RecordType, ReaderType>::get_current_microseconds()
 
 template <typename RecordType, typename ReaderType>
 uint64_t
-analyzer_tmpl_t<RecordType, ReaderType>::compute_interval_id(uint64_t first_timestamp,
-                                                             uint64_t latest_timestamp)
+analyzer_tmpl_t<RecordType, ReaderType>::compute_timestamp_interval_id(
+    uint64_t first_timestamp, uint64_t latest_timestamp)
 {
     assert(first_timestamp <= latest_timestamp);
+    assert(interval_microseconds_ > 0);
     // We keep the interval end timestamps independent of the first timestamp of the
     // trace. For the parallel mode, where we need to merge intervals from different
     // shards that were active during the same final whole-trace interval, having aligned
@@ -399,17 +420,34 @@ analyzer_tmpl_t<RecordType, ReaderType>::compute_interval_id(uint64_t first_time
         first_timestamp / interval_microseconds_ + 1;
 }
 
+template <typename RecordType, typename ReaderType>
+uint64_t
+analyzer_tmpl_t<RecordType, ReaderType>::compute_instr_count_interval_id(
+    uint64_t cur_instr_count)
+{
+    assert(interval_instr_count_ > 0);
+    if (cur_instr_count == 0)
+        return 1;
+    // We want all memory access entries following an instr to stay in the same
+    // interval as the instr, so we increment interval_id at instr entries. Also,
+    // we want the last instr in each interval to have an ordinal that's a multiple
+    // of interval_instr_count_.
+    return (cur_instr_count - 1) / interval_instr_count_ + 1;
+}
+
 template <typename RecordType, typename ReaderType>
 uint64_t
 analyzer_tmpl_t<RecordType, ReaderType>::compute_interval_end_timestamp(
     uint64_t first_timestamp, uint64_t interval_id)
 {
+    assert(interval_microseconds_ > 0);
     assert(interval_id >= 1);
     uint64_t end_timestamp =
         (first_timestamp / interval_microseconds_ + interval_id) * interval_microseconds_;
     // Since the interval's end timestamp is exclusive, the end_timestamp would actually
     // fall under the next interval.
-    assert(compute_interval_id(first_timestamp, end_timestamp) == interval_id + 1);
+    assert(compute_timestamp_interval_id(first_timestamp, end_timestamp) ==
+           interval_id + 1);
     return end_timestamp;
 }
 
@@ -418,19 +456,33 @@ bool
 analyzer_tmpl_t<RecordType, ReaderType>::advance_interval_id(
     typename scheduler_tmpl_t<RecordType, ReaderType>::stream_t *stream,
     analyzer_shard_data_t *shard, uint64_t &prev_interval_index,
-    uint64_t &prev_interval_init_instr_count)
+    uint64_t &prev_interval_init_instr_count, bool at_instr_record)
 {
-    if (interval_microseconds_ == 0) {
+    uint64_t next_interval_index = 0;
+    if (interval_microseconds_ > 0) {
+        next_interval_index = compute_timestamp_interval_id(stream->get_first_timestamp(),
+                                                            stream->get_last_timestamp());
+    } else if (interval_instr_count_ > 0) {
+        // The interval callbacks are invoked just prior to the process_memref or
+        // parallel_shard_memref callback for the first instr of the new interval; This
+        // keeps the instr's memory accesses in the same interval as the instr.
+        next_interval_index =
+            compute_instr_count_interval_id(stream->get_instruction_ordinal());
+    } else {
         return false;
     }
-    uint64_t next_interval_index =
-        compute_interval_id(stream->get_first_timestamp(), stream->get_last_timestamp());
     if (next_interval_index != shard->cur_interval_index) {
         assert(next_interval_index > shard->cur_interval_index);
         prev_interval_index = shard->cur_interval_index;
         prev_interval_init_instr_count = shard->cur_interval_init_instr_count;
         shard->cur_interval_index = next_interval_index;
-        shard->cur_interval_init_instr_count = stream->get_instruction_ordinal();
+        // If the next record to be presented to the tools is an instr record, we need to
+        // adjust for the fact that the record has already been read from the stream.
+        // Since we know that the next record is a part of the new interval and
+        // cur_interval_init_instr_count is supposed to be the count just prior to the
+        // new interval, we need to subtract one count for the instr.
+        shard->cur_interval_init_instr_count =
+            stream->get_instruction_ordinal() - (at_instr_record ? 1 : 0);
         return true;
     }
     return false;
@@ -443,7 +495,7 @@ analyzer_tmpl_t<RecordType, ReaderType>::process_serial(analyzer_worker_data_t &
     std::vector<void *> user_worker_data(num_tools_);
 
     worker.shard_data[0].tool_data.resize(num_tools_);
-    if (interval_microseconds_ != 0)
+    if (interval_microseconds_ != 0 || interval_instr_count_ != 0)
         worker.shard_data[0].cur_interval_index = 1;
     for (int i = 0; i < num_tools_; ++i) {
         worker.error = tools_[i]->initialize_stream(worker.stream);
@@ -474,21 +526,22 @@ analyzer_tmpl_t<RecordType, ReaderType>::process_serial(analyzer_worker_data_t &
                     worker.error =
                         "Failed to read from trace: " + worker.stream->get_stream_name();
                 }
-            } else if (interval_microseconds_ != 0) {
+            } else if (interval_microseconds_ != 0 || interval_instr_count_ != 0) {
                 process_interval(worker.shard_data[0].cur_interval_index,
                                  worker.shard_data[0].cur_interval_init_instr_count,
                                  &worker,
-                                 /*parallel=*/false);
+                                 /*parallel=*/false, /*at_instr_record=*/false);
             }
             return;
         }
         uint64_t prev_interval_index;
         uint64_t prev_interval_init_instr_count;
-        if (record_is_timestamp(record) &&
+        if ((record_is_timestamp(record) || record_is_instr(record)) &&
             advance_interval_id(worker.stream, &worker.shard_data[0], prev_interval_index,
-                                prev_interval_init_instr_count) &&
+                                prev_interval_init_instr_count,
+                                record_is_instr(record)) &&
             !process_interval(prev_interval_index, prev_interval_init_instr_count,
-                              &worker, /*parallel=*/false)) {
+                              &worker, /*parallel=*/false, record_is_instr(record))) {
             return;
         }
         for (int i = 0; i < num_tools_; ++i) {
@@ -511,11 +564,11 @@ analyzer_tmpl_t<RecordType, ReaderType>::process_shard_exit(
     VPRINT(this, 1, "Worker %d finished trace shard %s\n", worker->index,
            worker->stream->get_stream_name().c_str());
     worker->shard_data[shard_index].exited = true;
-    if (interval_microseconds_ != 0 &&
+    if ((interval_microseconds_ != 0 || interval_instr_count_ != 0) &&
         !process_interval(worker->shard_data[shard_index].cur_interval_index,
                           worker->shard_data[shard_index].cur_interval_init_instr_count,
                           worker,
-                          /*parallel=*/true, shard_index))
+                          /*parallel=*/true, /*at_instr_record=*/false, shard_index))
         return false;
     for (int i = 0; i < num_tools_; ++i) {
         if (!tools_[i]->parallel_shard_exit(
@@ -578,7 +631,7 @@ analyzer_tmpl_t<RecordType, ReaderType>::process_tasks(analyzer_worker_data_t *w
             VPRINT(this, 1, "Worker %d starting on trace shard %d stream is %p\n",
                    worker->index, shard_index, worker->stream);
             worker->shard_data[shard_index].tool_data.resize(num_tools_);
-            if (interval_microseconds_ != 0)
+            if (interval_microseconds_ != 0 || interval_instr_count_ != 0)
                 worker->shard_data[shard_index].cur_interval_index = 1;
             for (int i = 0; i < num_tools_; ++i) {
                 worker->shard_data[shard_index].tool_data[i].shard_data =
@@ -596,11 +649,12 @@ analyzer_tmpl_t<RecordType, ReaderType>::process_tasks(analyzer_worker_data_t *w
         }
         uint64_t prev_interval_index;
         uint64_t prev_interval_init_instr_count;
-        if (record_is_timestamp(record) &&
+        if ((record_is_timestamp(record) || record_is_instr(record)) &&
             advance_interval_id(worker->stream, &worker->shard_data[shard_index],
-                                prev_interval_index, prev_interval_init_instr_count) &&
+                                prev_interval_index, prev_interval_init_instr_count,
+                                record_is_instr(record)) &&
             !process_interval(prev_interval_index, prev_interval_init_instr_count, worker,
-                              /*parallel=*/true, shard_index)) {
+                              /*parallel=*/true, record_is_instr(record), shard_index)) {
             return;
         }
         for (int i = 0; i < num_tools_; ++i) {
@@ -654,7 +708,8 @@ analyzer_tmpl_t<RecordType, ReaderType>::combine_interval_snapshots(
     result = tools_[tool_idx]->combine_interval_snapshots(latest_shard_snapshots,
                                                           interval_end_timestamp);
     if (result == nullptr) {
-        error_string_ = "combine_interval_snapshots unexpectedly returned nullptr";
+        error_string_ = "combine_interval_snapshots unexpectedly returned nullptr: " +
+            tools_[tool_idx]->get_error_string();
         return false;
     }
     result->instr_count_delta = 0;
@@ -758,7 +813,7 @@ analyzer_tmpl_t<RecordType, ReaderType>::merge_shard_interval_results(
         cur_merged_interval->shard_id = analysis_tool_tmpl_t<
             RecordType>::interval_state_snapshot_t::WHOLE_TRACE_SHARD_ID;
         cur_merged_interval->interval_end_timestamp = earliest_interval_end_timestamp;
-        cur_merged_interval->interval_id = compute_interval_id(
+        cur_merged_interval->interval_id = compute_timestamp_interval_id(
             earliest_ever_interval_end_timestamp, earliest_interval_end_timestamp);
         merged_intervals.push_back(cur_merged_interval);
     }
@@ -772,10 +827,74 @@ analyzer_tmpl_t<RecordType, ReaderType>::merge_shard_interval_results(
     return true;
 }
 
+template <typename RecordType, typename ReaderType>
+void
+analyzer_tmpl_t<RecordType, ReaderType>::drain_interval_snapshot_queue_to_vector(
+    std::queue<typename analysis_tool_tmpl_t<RecordType>::interval_state_snapshot_t *>
+        &que,
+    std::vector<typename analysis_tool_tmpl_t<RecordType>::interval_state_snapshot_t *>
+        &vec)
+{
+    // If the worker's interval snapshot data were an std::vector we could just
+    // std::move here, but that would make the implementation of
+    // merge_shard_interval_results more complex, so there are trade-offs.
+    while (!que.empty()) {
+        vec.push_back(que.front());
+        que.pop();
+    }
+}
+
+template <typename RecordType, typename ReaderType>
+void
+analyzer_tmpl_t<RecordType, ReaderType>::populate_unmerged_shard_interval_results()
+{
+    for (auto &worker : worker_data_) {
+        for (auto &shard_data : worker.shard_data) {
+            assert(static_cast<int>(shard_data.second.tool_data.size()) == num_tools_);
+            for (int tool_idx = 0; tool_idx < num_tools_; ++tool_idx) {
+                key_tool_shard_t tool_shard_key = { tool_idx,
+                                                    shard_data.second.shard_index };
+                drain_interval_snapshot_queue_to_vector(
+                    shard_data.second.tool_data[tool_idx].interval_snapshot_data,
+                    per_shard_interval_snapshots_[tool_shard_key]);
+            }
+        }
+    }
+}
+
+template <typename RecordType, typename ReaderType>
+void
+analyzer_tmpl_t<RecordType, ReaderType>::populate_serial_interval_results()
+{
+    assert(whole_trace_interval_snapshots_.empty());
+    whole_trace_interval_snapshots_.resize(num_tools_);
+    assert(worker_data_.size() == 1);
+    assert(worker_data_[0].shard_data.size() == 1 &&
+           worker_data_[0].shard_data.count(0) == 1);
+    assert(static_cast<int>(worker_data_[0].shard_data[0].tool_data.size()) ==
+           num_tools_);
+    for (int tool_idx = 0; tool_idx < num_tools_; ++tool_idx) {
+        drain_interval_snapshot_queue_to_vector(
+            worker_data_[0].shard_data[0].tool_data[tool_idx].interval_snapshot_data,
+            whole_trace_interval_snapshots_[tool_idx]);
+    }
+}
+
 template <typename RecordType, typename ReaderType>
 bool
 analyzer_tmpl_t<RecordType, ReaderType>::collect_and_maybe_merge_shard_interval_results()
 {
+    assert(interval_microseconds_ != 0 || interval_instr_count_ != 0);
+    if (!parallel_) {
+        populate_serial_interval_results();
+        return true;
+    }
+    if (interval_instr_count_ > 0) {
+        // We do not merge interval state snapshots across shards. See comment by
+        // per_shard_interval_snapshots for more details.
+        populate_unmerged_shard_interval_results();
+        return true;
+    }
     // all_intervals[tool_idx][shard_idx] contains a queue of the
     // interval_state_snapshot_t* that were output by that tool for that shard.
     std::vector<std::vector<std::queue<
@@ -783,20 +902,21 @@ analyzer_tmpl_t<RecordType, ReaderType>::collect_and_maybe_merge_shard_interval_
         all_intervals(num_tools_);
     for (const auto &worker : worker_data_) {
         for (const auto &shard_data : worker.shard_data) {
+            assert(static_cast<int>(shard_data.second.tool_data.size()) == num_tools_);
             for (int tool_idx = 0; tool_idx < num_tools_; ++tool_idx) {
                 all_intervals[tool_idx].emplace_back(std::move(
                     shard_data.second.tool_data[tool_idx].interval_snapshot_data));
             }
         }
     }
-    assert(merged_interval_snapshots_.empty());
-    merged_interval_snapshots_.resize(num_tools_);
+    assert(whole_trace_interval_snapshots_.empty());
+    whole_trace_interval_snapshots_.resize(num_tools_);
     for (int tool_idx = 0; tool_idx < num_tools_; ++tool_idx) {
         // We need to do this separately per tool because all tools may not
         // generate an interval_state_snapshot_t for the same intervals (even though
         // the framework notifies all tools of all intervals).
         if (!merge_shard_interval_results(all_intervals[tool_idx],
-                                          merged_interval_snapshots_[tool_idx],
+                                          whole_trace_interval_snapshots_[tool_idx],
                                           tool_idx)) {
             return false;
         }
@@ -844,12 +964,20 @@ analyzer_tmpl_t<RecordType, ReaderType>::run()
             }
         }
     }
-    if (interval_microseconds_ != 0) {
+    if (interval_microseconds_ != 0 || interval_instr_count_ != 0) {
         return collect_and_maybe_merge_shard_interval_results();
     }
     return true;
 }
 
+static void
+print_output_separator()
+{
+
+    std::cerr << "\n=========================================================="
+                 "=================\n";
+}
+
 template <typename RecordType, typename ReaderType>
 bool
 analyzer_tmpl_t<RecordType, ReaderType>::print_stats()
@@ -861,25 +989,58 @@ analyzer_tmpl_t<RecordType, ReaderType>::print_stats()
             error_string_ = tools_[i]->get_error_string();
             return false;
         }
-        if (interval_microseconds_ != 0 && !merged_interval_snapshots_.empty()) {
-            // merged_interval_snapshots_ may be empty depending on the derived class's
-            // implementation of collect_and_maybe_merge_shard_interval_results.
-            if (!merged_interval_snapshots_[i].empty() &&
-                !tools_[i]->print_interval_results(merged_interval_snapshots_[i])) {
+        if (i + 1 < num_tools_) {
+            // Separate tool output.
+            print_output_separator();
+        }
+    }
+    // Now print interval results.
+    // Should not have both whole-trace or per-shard interval snapshots.
+    assert(whole_trace_interval_snapshots_.empty() ||
+           per_shard_interval_snapshots_.empty());
+    // We may have whole-trace intervals snaphots for instr count intervals in serial
+    // mode, and for timestamp (microsecond) intervals in both serial and parallel mode.
+    if (!whole_trace_interval_snapshots_.empty()) {
+        // Separate non-interval and interval outputs.
+        print_output_separator();
+        std::cerr << "Printing whole-trace interval results:\n";
+        for (int i = 0; i < num_tools_; ++i) {
+            // whole_trace_interval_snapshots_[i] may be empty if the corresponding tool
+            // did not produce any interval results.
+            if (!whole_trace_interval_snapshots_[i].empty() &&
+                !tools_[i]->print_interval_results(whole_trace_interval_snapshots_[i])) {
                 error_string_ = tools_[i]->get_error_string();
                 return false;
             }
-            for (auto snapshot : merged_interval_snapshots_[i]) {
+            for (auto snapshot : whole_trace_interval_snapshots_[i]) {
                 if (!tools_[i]->release_interval_snapshot(snapshot)) {
                     error_string_ = tools_[i]->get_error_string();
                     return false;
                 }
             }
+            if (i + 1 < num_tools_) {
+                // Separate tool output.
+                print_output_separator();
+            }
         }
-        if (i + 1 < num_tools_) {
-            // Separate tool output.
-            std::cerr << "\n=========================================================="
-                         "=================\n";
+    } else if (!per_shard_interval_snapshots_.empty()) {
+        // Separate non-interval and interval outputs.
+        print_output_separator();
+        std::cerr << "Printing unmerged per-shard interval results:\n";
+        for (auto &interval_snapshots : per_shard_interval_snapshots_) {
+            int tool_idx = interval_snapshots.first.tool_idx;
+            if (!interval_snapshots.second.empty() &&
+                !tools_[tool_idx]->print_interval_results(interval_snapshots.second)) {
+                error_string_ = tools_[tool_idx]->get_error_string();
+                return false;
+            }
+            for (auto snapshot : interval_snapshots.second) {
+                if (!tools_[tool_idx]->release_interval_snapshot(snapshot)) {
+                    error_string_ = tools_[tool_idx]->get_error_string();
+                    return false;
+                }
+            }
+            print_output_separator();
         }
     }
     return true;
@@ -889,7 +1050,7 @@ template <typename RecordType, typename ReaderType>
 bool
 analyzer_tmpl_t<RecordType, ReaderType>::process_interval(
     uint64_t interval_id, uint64_t interval_init_instr_count,
-    analyzer_worker_data_t *worker, bool parallel, int shard_idx)
+    analyzer_worker_data_t *worker, bool parallel, bool at_instr_record, int shard_idx)
 {
     assert(parallel || shard_idx == 0); // Default to zero for the serial mode.
     for (int tool_idx = 0; tool_idx < num_tools_; ++tool_idx) {
@@ -917,9 +1078,21 @@ analyzer_tmpl_t<RecordType, ReaderType>::process_interval(
                 : analysis_tool_tmpl_t<
                       RecordType>::interval_state_snapshot_t::WHOLE_TRACE_SHARD_ID;
             snapshot->interval_id = interval_id;
-            snapshot->interval_end_timestamp = compute_interval_end_timestamp(
-                worker->stream->get_first_timestamp(), interval_id);
-            snapshot->instr_count_cumulative = worker->stream->get_instruction_ordinal();
+            if (interval_microseconds_ > 0) {
+                // For timestamp intervals, the interval_end_timestamp is the abstract
+                // non-inclusive end timestamp for the interval_id. This is to make it
+                // easier to line up the corresponding shard interval snapshots so that
+                // we can merge them to form the whole-trace interval snapshots.
+                snapshot->interval_end_timestamp = compute_interval_end_timestamp(
+                    worker->stream->get_first_timestamp(), interval_id);
+            } else {
+                snapshot->interval_end_timestamp = worker->stream->get_last_timestamp();
+            }
+            // instr_count_cumulative for the interval snapshot is supposed to be
+            // inclusive, so if the first record after the interval (that is, the record
+            // we're at right now) is an instr, it must be subtracted.
+            snapshot->instr_count_cumulative =
+                worker->stream->get_instruction_ordinal() - (at_instr_record ? 1 : 0);
             snapshot->instr_count_delta =
                 snapshot->instr_count_cumulative - interval_init_instr_count;
             worker->shard_data[shard_idx].tool_data[tool_idx].interval_snapshot_data.push(
diff --git a/clients/drcachesim/analyzer.h b/clients/drcachesim/analyzer.h
index 69696c6b91d..87857de7b7b 100644
--- a/clients/drcachesim/analyzer.h
+++ b/clients/drcachesim/analyzer.h
@@ -119,7 +119,8 @@ template <typename RecordType, typename ReaderType> class analyzer_tmpl_t {
     analyzer_tmpl_t(const std::string &trace_path,
                     analysis_tool_tmpl_t<RecordType> **tools, int num_tools,
                     int worker_count = 0, uint64_t skip_instrs = 0,
-                    uint64_t interval_microseconds = 0, int verbosity = 0);
+                    uint64_t interval_microseconds = 0, uint64_t interval_instr_count = 0,
+                    int verbosity = 0);
     /** Launches the analysis process. */
     virtual bool
     run();
@@ -167,6 +168,8 @@ template <typename RecordType, typename ReaderType> class analyzer_tmpl_t {
         }
 
         uint64_t cur_interval_index;
+        // Cumulative instr count as it was just before the start of the current
+        // interval.
         uint64_t cur_interval_init_instr_count;
         // Identifier for the shard (thread or core id).
         int64_t shard_id;
@@ -250,6 +253,9 @@ template <typename RecordType, typename ReaderType> class analyzer_tmpl_t {
     bool
     record_is_timestamp(const RecordType &record);
 
+    bool
+    record_is_instr(const RecordType &record);
+
     RecordType
     create_wait_marker();
 
@@ -262,15 +268,22 @@ template <typename RecordType, typename ReaderType> class analyzer_tmpl_t {
     // finished. For serial analysis, it should remain the default value.
     bool
     process_interval(uint64_t interval_id, uint64_t interval_init_instr_count,
-                     analyzer_worker_data_t *worker, bool parallel, int shard_idx = 0);
+                     analyzer_worker_data_t *worker, bool parallel, bool at_instr_record,
+                     int shard_idx = 0);
 
     // Compute interval id for the given latest_timestamp, assuming the trace (or
-    // trace shard) starts at the given first_timestamp.
+    // trace shard) starts at the given first_timestamp. This is relevant when
+    // timestamp intervals are enabled using interval_microseconds_.
     uint64_t
-    compute_interval_id(uint64_t first_timestamp, uint64_t latest_timestamp);
+    compute_timestamp_interval_id(uint64_t first_timestamp, uint64_t latest_timestamp);
 
-    // Compute the interval end timestamp for the given interval_id, assuming the trace
-    // (or trace shard) starts at the given first_timestamp.
+    // Compute interval id at the given instr count. This is relevant when instr count
+    // intervals are enabled using interval_instr_count_.
+    uint64_t
+    compute_instr_count_interval_id(uint64_t cur_instr_count);
+
+    // Compute the interval end timestamp (non-inclusive) for the given interval_id,
+    // assuming the trace (or trace shard) starts at the given first_timestamp.
     uint64_t
     compute_interval_end_timestamp(uint64_t first_timestamp, uint64_t interval_id);
 
@@ -278,11 +291,13 @@ template <typename RecordType, typename ReaderType> class analyzer_tmpl_t {
     // on the most recent seen timestamp in the trace stream. Returns whether the
     // current interval id was updated, and if so also sets the previous interval index
     // in prev_interval_index.
+    // at_instr_record indicates that the next record that will be presented to
+    // the analysis tools is an instr record.
     bool
     advance_interval_id(
         typename scheduler_tmpl_t<RecordType, ReaderType>::stream_t *stream,
         analyzer_shard_data_t *shard, uint64_t &prev_interval_index,
-        uint64_t &prev_interval_init_instr_count);
+        uint64_t &prev_interval_init_instr_count, bool at_instr_record);
 
     // Collects interval results for all shards from the workers, and then optional
     // merges the shard-local intervals to form the whole-trace interval results using
@@ -291,7 +306,7 @@ template <typename RecordType, typename ReaderType> class analyzer_tmpl_t {
     virtual bool
     collect_and_maybe_merge_shard_interval_results();
 
-    // Computes and stores the interval results in merged_interval_snapshots_. For
+    // Computes and stores the interval results in whole_trace_interval_snapshots_. For
     // serial analysis where we already have only a single shard, this involves
     // simply copying interval_state_snapshot_t* from the input. For parallel
     // analysis, this involves merging results from multiple shards for intervals
@@ -305,6 +320,16 @@ template <typename RecordType, typename ReaderType> class analyzer_tmpl_t {
                         *> &merged_intervals,
         int tool_idx);
 
+    // Populates the per_shard_interval_snapshots_ field based on the interval snapshots
+    // stored in worker_data_.
+    void
+    populate_unmerged_shard_interval_results();
+
+    // Populates the whole_trace_interval_snapshots_ field based on the interval snapshots
+    // stored in the only entry of worker_data_.
+    void
+    populate_serial_interval_results();
+
     // Combines all interval snapshots in the given vector to create the interval
     // snapshot for the whole-trace interval ending at interval_end_timestamp and
     // stores it in 'result'. These snapshots are for the tool at tool_idx. Returns
@@ -321,6 +346,13 @@ template <typename RecordType, typename ReaderType> class analyzer_tmpl_t {
     uint64_t
     get_current_microseconds();
 
+    void
+    drain_interval_snapshot_queue_to_vector(
+        std::queue<typename analysis_tool_tmpl_t<RecordType>::interval_state_snapshot_t *>
+            &que,
+        std::vector<
+            typename analysis_tool_tmpl_t<RecordType>::interval_state_snapshot_t *> &vec);
+
     bool success_;
     scheduler_tmpl_t<RecordType, ReaderType> scheduler_;
     std::string error_string_;
@@ -329,21 +361,63 @@ template <typename RecordType, typename ReaderType> class analyzer_tmpl_t {
     std::vector<analyzer_worker_data_t> worker_data_;
     int num_tools_;
     analysis_tool_tmpl_t<RecordType> **tools_;
-    // Stores the interval state snapshots for the whole trace, which for the parallel
-    // mode are the resulting interval state snapshots after merging from all shards
-    // in merge_shard_interval_results.
-    // merged_interval_snapshots_[tool_idx] is a vector of the interval snapshots
-    // (in order of the intervals) for that tool.
-    // This may not be set, depending on the derived class's implementation of
-    // collect_and_maybe_merge_shard_interval_results.
+    // Stores the interval state snapshots, merged across shards. These are
+    // produced when timestamp intervals are enabled using interval_microseconds_.
+    //
+    // whole_trace_interval_snapshots_[tool_idx] is a vector of the interval snapshots
+    // (in order of the intervals) for that tool. For the parallel mode, these
+    // interval state snapshots are produced after merging corresponding shard
+    // interval snapshots using merge_shard_interval_results.
     std::vector<std::vector<
         typename analysis_tool_tmpl_t<RecordType>::interval_state_snapshot_t *>>
-        merged_interval_snapshots_;
+        whole_trace_interval_snapshots_;
+
+    // Key that combines tool and shard idx for use with an std::unordered_map.
+    struct key_tool_shard_t {
+        int tool_idx;
+        int shard_idx;
+        bool
+        operator==(const key_tool_shard_t &rhs) const
+        {
+            return tool_idx == rhs.tool_idx && shard_idx == rhs.shard_idx;
+        }
+    };
+    struct key_tool_shard_hash_t {
+        std::size_t
+        operator()(const key_tool_shard_t &t) const
+        {
+            return std::hash<int>()(t.tool_idx ^ t.shard_idx);
+        }
+    };
+
+    // Stores the interval state snapshots, unmerged across shards. These are
+    // produced when instr count intervals are enabled using interval_instr_count_.
+    //
+    // per_shard_interval_snapshots_[(tool_idx, shard_idx)] is a vector
+    // of the interval snapshots for that tool and shard. Note that the snapshots for
+    // each shard are separate; they are not merged across shards.
+    //
+    // TODO i#6643: Figure out a useful way to merge instr count intervals across shards.
+    // One way is to merge the shard interval snapshots that correspond to the same
+    // [interval_instr_count_ * interval_id, interval_instr_count_ * (interval_id + 1))
+    // shard-local instrs. But it is not clear whether this is useful.
+    // Another way is to merge the shard interval snapshots that correspond to the same
+    // [interval_instr_count_ * interval_id, interval_instr_count_ * (interval_id + 1))
+    // whole-trace instrs. But that is much harder to compute. We'd need some way to
+    // identify the whole-trace interval boundaries in each shard's stream (since we
+    // process each shard separately); this would likely need a pre-processing pass.
+    std::unordered_map<key_tool_shard_t,
+                       std::vector<typename analysis_tool_tmpl_t<
+                           RecordType>::interval_state_snapshot_t *>,
+                       key_tool_shard_hash_t>
+        per_shard_interval_snapshots_;
+
     bool parallel_;
     int worker_count_;
     const char *output_prefix_ = "[analyzer]";
     uint64_t skip_instrs_ = 0;
     uint64_t interval_microseconds_ = 0;
+    uint64_t interval_instr_count_ = 0;
     int verbosity_ = 0;
     shard_type_t shard_type_ = SHARD_BY_THREAD;
     bool sched_by_time_ = false;
diff --git a/clients/drcachesim/analyzer_multi.cpp b/clients/drcachesim/analyzer_multi.cpp
index a4650803e53..eb1cbd21c2c 100644
--- a/clients/drcachesim/analyzer_multi.cpp
+++ b/clients/drcachesim/analyzer_multi.cpp
@@ -340,6 +340,7 @@ analyzer_multi_tmpl_t<RecordType, ReaderType>::analyzer_multi_tmpl_t()
     this->worker_count_ = op_jobs.get_value();
     this->skip_instrs_ = op_skip_instrs.get_value();
     this->interval_microseconds_ = op_interval_microseconds.get_value();
+    this->interval_instr_count_ = op_interval_instr_count.get_value();
     // Initial measurements show it's sometimes faster to keep the parallel model
     // of using single-file readers but use them sequentially, as opposed to
     // the every-file interleaving reader, but the user can specify -jobs 1, so
diff --git a/clients/drcachesim/common/options.cpp b/clients/drcachesim/common/options.cpp
index 499cb860bc6..390db324ba6 100644
--- a/clients/drcachesim/common/options.cpp
+++ b/clients/drcachesim/common/options.cpp
@@ -521,7 +521,17 @@ droption_t<bytesize_t> op_interval_microseconds(
     "Enable periodic heartbeats for intervals of given microseconds in the trace.",
     "Desired length of each trace interval, defined in microseconds of trace time. "
     "Trace intervals are measured using the TRACE_MARKER_TYPE_TIMESTAMP marker values. "
-    "If set, analysis tools receive a callback at the end of each interval.");
+    "If set, analysis tools receive a callback at the end of each interval, and one "
+    "at the end of trace analysis to print the whole-trace interval results.");
+
+droption_t<bytesize_t> op_interval_instr_count(
+    DROPTION_SCOPE_FRONTEND, "interval_instr_count", 0,
+    "Enable periodic heartbeats for intervals of given per-shard instr count. ",
+    "Desired length of each trace interval, defined in instr count of each shard. "
+    "With -parallel, this does not support whole trace intervals, only per-shard "
+    "intervals. If set, analysis tools receive a callback at the end of each interval, "
+    "and separate callbacks per shard at the end of trace analysis to print each "
+    "shard's interval results.");
 
 droption_t<int>
     op_only_thread(DROPTION_SCOPE_FRONTEND, "only_thread", 0,
diff --git a/clients/drcachesim/common/options.h b/clients/drcachesim/common/options.h
index 6c29a1af9d0..dbd297f2966 100644
--- a/clients/drcachesim/common/options.h
+++ b/clients/drcachesim/common/options.h
@@ -158,6 +158,8 @@ extern dynamorio::droption::droption_t<std::string> op_tracer_alt;
 extern dynamorio::droption::droption_t<std::string> op_tracer_ops;
 extern dynamorio::droption::droption_t<dynamorio::droption::bytesize_t>
     op_interval_microseconds;
+extern dynamorio::droption::droption_t<dynamorio::droption::bytesize_t>
+    op_interval_instr_count;
 extern dynamorio::droption::droption_t<int> op_only_thread;
 extern dynamorio::droption::droption_t<dynamorio::droption::bytesize_t> op_skip_instrs;
 extern dynamorio::droption::droption_t<dynamorio::droption::bytesize_t> op_skip_refs;
diff --git a/clients/drcachesim/tests/offline-interval-instr-count-output.templatex b/clients/drcachesim/tests/offline-interval-instr-count-output.templatex
new file mode 100644
index 00000000000..e0a1268b7c7
--- /dev/null
+++ b/clients/drcachesim/tests/offline-interval-instr-count-output.templatex
@@ -0,0 +1,37 @@
+Hello, world!
+Basic counts tool results:
+Total counts:
+     [ 0-9]* total \(fetched\) instructions
+     [ 0-9]* total unique \(fetched\) instructions
+     [ 0-9]* total non-fetched instructions
+     [ 0-9]* total prefetches
+     [ 0-9]* total data loads
+     [ 0-9]* total data stores
+     [ 0-9]* total icache flushes
+     [ 0-9]* total dcache flushes
+           1 total threads
+     [ 0-9]* total scheduling markers
+.*
+Thread [0-9]* counts:
+     [ 0-9]* \(fetched\) instructions
+     [ 0-9]* unique \(fetched\) instructions
+     [ 0-9]* non-fetched instructions
+     [ 0-9]* prefetches
+     [ 0-9]* data loads
+     [ 0-9]* data stores
+     [ 0-9]* icache flushes
+     [ 0-9]* dcache flushes
+     [ 0-9]* scheduling markers
+.*
+Printing unmerged per-shard interval results:
+Counts per trace interval for TID.*
+Interval #1 ending at timestamp [0-9]*:
+       10000 interval delta \(fetched\) instructions
+     [ 0-9]* interval delta non-fetched instructions
+     [ 0-9]* interval delta prefetches
+     [ 0-9]* interval delta data loads
+     [ 0-9]* interval delta data stores
+     [ 0-9]* interval delta icache flushes
+     [ 0-9]* interval delta dcache flushes
+     [ 0-9]* interval delta scheduling markers
+.*
diff --git a/clients/drcachesim/tests/offline-interval-count-output.templatex b/clients/drcachesim/tests/offline-interval-microseconds-count-output.templatex
similarity index 96%
rename from clients/drcachesim/tests/offline-interval-count-output.templatex
rename to clients/drcachesim/tests/offline-interval-microseconds-count-output.templatex
index 08b84128398..bb1b946ce5d 100644
--- a/clients/drcachesim/tests/offline-interval-count-output.templatex
+++ b/clients/drcachesim/tests/offline-interval-microseconds-count-output.templatex
@@ -23,6 +23,7 @@ Thread [0-9]* counts:
      [ 0-9]* dcache flushes
      [ 0-9]* scheduling markers
 .*
+Printing whole-trace interval results:
 Counts per trace interval for whole trace:
 Interval #1 ending at timestamp [0-9]*:
      [ 0-9]* interval delta \(fetched\) instructions
diff --git a/clients/drcachesim/tests/trace_interval_analysis_unit_tests.cpp b/clients/drcachesim/tests/trace_interval_analysis_unit_tests.cpp
index 384cd1ae440..533c5beeb00 100644
--- a/clients/drcachesim/tests/trace_interval_analysis_unit_tests.cpp
+++ b/clients/drcachesim/tests/trace_interval_analysis_unit_tests.cpp
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (c) 2023 Google, Inc.  All rights reserved.
+ * Copyright (c) 2023-2024 Google, Inc.  All rights reserved.
  * **********************************************************/
 
 /*
@@ -140,8 +140,10 @@ class test_stream_t : public scheduler_t::stream_t {
             return instr_count_;
         assert(at_ >= 0 && at_ < refs_.size());
         auto it = instr_counts_.find(refs_[at_].instr.tid);
-        assert(it != instr_counts_.end());
-        return it->second;
+        if (it != instr_counts_.end()) {
+            return it->second;
+        }
+        return 0;
     }
 
 private:
@@ -166,13 +168,15 @@ class test_stream_t : public scheduler_t::stream_t {
 class test_analyzer_t : public analyzer_t {
 public:
     test_analyzer_t(const std::vector<memref_t> &refs, analysis_tool_t **tools,
-                    int num_tools, bool parallel, uint64_t interval_microseconds)
+                    int num_tools, bool parallel, uint64_t interval_microseconds,
+                    uint64_t interval_instr_count)
         : analyzer_t()
     {
         num_tools_ = num_tools;
         tools_ = tools;
         parallel_ = parallel;
         interval_microseconds_ = interval_microseconds;
+        interval_instr_count_ = interval_instr_count;
         verbosity_ = 1;
         worker_count_ = 1;
         test_stream_ =
@@ -267,96 +271,105 @@ class dummy_analysis_tool_t : public analysis_tool_t {
     int generate_snapshot_count_;
 };
 
-// Test analysis_tool_t that records information about when the
-// generate_shard_interval_snapshot and generate_interval_snapshot APIs were invoked.
-class test_analysis_tool_t : public analysis_tool_t {
-public:
-    // Describes the point in trace when an interval ends. This is same as the point
-    // when the generate_*interval_snapshot API is invoked.
-    struct interval_end_point_t {
-        memref_tid_t tid;
-        int seen_memrefs; // For parallel mode, this is the shard-local count.
-        uint64_t interval_id;
+#define SERIAL_TID 0
 
-        bool
-        operator==(const interval_end_point_t &rhs) const
-        {
-            return tid == rhs.tid && seen_memrefs == rhs.seen_memrefs &&
-                interval_id == rhs.interval_id;
-        }
-        bool
-        operator<(const interval_end_point_t &rhs) const
-        {
-            if (tid != rhs.tid)
-                return tid < rhs.tid;
-            if (seen_memrefs != rhs.seen_memrefs)
-                return seen_memrefs < rhs.seen_memrefs;
-            return interval_id < rhs.interval_id;
-        }
-    };
+// Describes the point in trace when an interval ends. This is same as the point
+// when the generate_*interval_snapshot API is invoked.
+struct interval_end_point_t {
+    memref_tid_t tid;
+    int seen_memrefs; // For parallel mode, this is the shard-local count.
+    uint64_t interval_id;
 
-    // Describes the state recorded by test_analysis_tool_t at the end of each
-    // interval.
-    struct recorded_snapshot_t : public analysis_tool_t::interval_state_snapshot_t {
-        recorded_snapshot_t(uint64_t interval_id, uint64_t interval_end_timestamp,
-                            uint64_t instr_count_cumulative, uint64_t instr_count_delta,
-                            std::vector<interval_end_point_t> component_intervals)
-            // Actual tools do not need to supply values to construct the base
-            // interval_state_snapshot_t. This is only to make it easier to construct
-            // the expected snapshot objects in this test.
-            // Since the final verification happens only for the whole trace intervals,
-            // we simply use WHOLE_TRACE_SHARD_ID here and for tool_shard_id below.
-            : interval_state_snapshot_t(WHOLE_TRACE_SHARD_ID, interval_id,
-                                        interval_end_timestamp, instr_count_cumulative,
-                                        instr_count_delta)
-            , component_intervals(component_intervals)
-            , tool_shard_id(WHOLE_TRACE_SHARD_ID)
-        {
-        }
-        recorded_snapshot_t()
-        {
-        }
+    bool
+    operator==(const interval_end_point_t &rhs) const
+    {
+        return tid == rhs.tid && seen_memrefs == rhs.seen_memrefs &&
+            interval_id == rhs.interval_id;
+    }
+    bool
+    operator<(const interval_end_point_t &rhs) const
+    {
+        if (tid != rhs.tid)
+            return tid < rhs.tid;
+        if (seen_memrefs != rhs.seen_memrefs)
+            return seen_memrefs < rhs.seen_memrefs;
+        return interval_id < rhs.interval_id;
+    }
+};
 
-        bool
-        operator==(const recorded_snapshot_t &rhs) const
-        {
-            return shard_id == rhs.shard_id && tool_shard_id == rhs.tool_shard_id &&
-                interval_id == rhs.interval_id &&
-                interval_end_timestamp == rhs.interval_end_timestamp &&
-                instr_count_cumulative == rhs.instr_count_cumulative &&
-                instr_count_delta == rhs.instr_count_delta &&
-                component_intervals == rhs.component_intervals;
-        }
-        void
-        print() const
-        {
-            std::cerr << "(shard_id: " << shard_id << ", interval_id: " << interval_id
-                      << ", tool_shard_id: " << tool_shard_id
-                      << ", end_timestamp: " << interval_end_timestamp
-                      << ", instr_count_cumulative: " << instr_count_cumulative
-                      << ", instr_count_delta: " << instr_count_delta
-                      << ", component_intervals: ";
-            for (const auto &s : component_intervals) {
-                std::cerr << "(tid:" << s.tid << ", seen_memrefs:" << s.seen_memrefs
-                          << ", interval_id:" << s.interval_id << "),";
-            }
-            std::cerr << ")\n";
+// Describes the state recorded by test_analysis_tool_t at the end of each
+// interval.
+struct recorded_snapshot_t : public analysis_tool_t::interval_state_snapshot_t {
+    recorded_snapshot_t(uint64_t shard_id, uint64_t interval_id,
+                        uint64_t interval_end_timestamp, uint64_t instr_count_cumulative,
+                        uint64_t instr_count_delta,
+                        std::vector<interval_end_point_t> component_intervals)
+        // Actual tools do not need to supply values to construct the base
+        // interval_state_snapshot_t. This is only to make it easier to construct
+        // the expected snapshot objects in this test.
+        : interval_state_snapshot_t(shard_id, interval_id, interval_end_timestamp,
+                                    instr_count_cumulative, instr_count_delta)
+        , component_intervals(component_intervals)
+        , tool_shard_id(shard_id)
+    {
+    }
+    recorded_snapshot_t(uint64_t interval_id, uint64_t interval_end_timestamp,
+                        uint64_t instr_count_cumulative, uint64_t instr_count_delta,
+                        std::vector<interval_end_point_t> component_intervals)
+        : recorded_snapshot_t(WHOLE_TRACE_SHARD_ID, interval_id, interval_end_timestamp,
+                              instr_count_cumulative, instr_count_delta,
+                              component_intervals)
+    {
+    }
+    recorded_snapshot_t()
+    {
+    }
+
+    bool
+    operator==(const recorded_snapshot_t &rhs) const
+    {
+        return shard_id == rhs.shard_id && tool_shard_id == rhs.tool_shard_id &&
+            interval_id == rhs.interval_id &&
+            interval_end_timestamp == rhs.interval_end_timestamp &&
+            instr_count_cumulative == rhs.instr_count_cumulative &&
+            instr_count_delta == rhs.instr_count_delta &&
+            component_intervals == rhs.component_intervals;
+    }
+    void
+    print() const
+    {
+        std::cerr << "(shard_id: " << shard_id << ", interval_id: " << interval_id
+                  << ", tool_shard_id: " << tool_shard_id
+                  << ", end_timestamp: " << interval_end_timestamp
+                  << ", instr_count_cumulative: " << instr_count_cumulative
+                  << ", instr_count_delta: " << instr_count_delta
+                  << ", component_intervals: ";
+        for (const auto &s : component_intervals) {
+            std::cerr << "(tid:" << s.tid << ", seen_memrefs:" << s.seen_memrefs
+                      << ", interval_id:" << s.interval_id << "),";
         }
+        std::cerr << ")\n";
+    }
 
-        // Stores the list of intervals that were combined to produce this snapshot.
-        // In the serial case, this contains just a single value. In the parallel case,
-        // this contains a list of size equal to the count of shard interval snapshots
-        // that were combined to create this snapshot.
-        std::vector<interval_end_point_t> component_intervals;
-        // Stores the shard_id recorded by the test tool. Compared with the shard_id
-        // stored by the framework in the base struct.
-        int64_t tool_shard_id;
-    };
+    // Stores the list of intervals that were combined to produce this snapshot.
+    // In the serial case, this contains just a single value. In the parallel case,
+    // this contains a list of size equal to the count of shard interval snapshots
+    // that were combined to create this snapshot.
+    std::vector<interval_end_point_t> component_intervals;
+    // Stores the shard_id recorded by the test tool. Compared with the shard_id
+    // stored by the framework in the base struct.
+    int64_t tool_shard_id;
+};
 
+// Test analysis_tool_t that records information about when the
+// generate_shard_interval_snapshot and generate_interval_snapshot APIs were invoked.
+class test_analysis_tool_t : public analysis_tool_t {
+public:
     // Constructs an analysis_tool_t that expects the given interval state snapshots to be
     // produced.
-    test_analysis_tool_t(const std::vector<recorded_snapshot_t> &expected_state_snapshots,
-                         bool combine_only_active_shards)
+    test_analysis_tool_t(
+        const std::vector<std::vector<recorded_snapshot_t>> &expected_state_snapshots,
+        bool combine_only_active_shards)
         : seen_memrefs_(0)
         , expected_state_snapshots_(expected_state_snapshots)
         , outstanding_snapshots_(0)
@@ -376,7 +389,7 @@ class test_analysis_tool_t : public analysis_tool_t {
         snapshot->tool_shard_id =
             analysis_tool_t::interval_state_snapshot_t::WHOLE_TRACE_SHARD_ID;
         snapshot->component_intervals.push_back(
-            { /*tid=*/0, seen_memrefs_, interval_id });
+            { SERIAL_TID, seen_memrefs_, interval_id });
         ++outstanding_snapshots_;
         return snapshot;
     }
@@ -393,6 +406,7 @@ class test_analysis_tool_t : public analysis_tool_t {
     void *
     parallel_shard_init(int shard_index, void *worker_data) override
     {
+        parallel_mode_ = true;
         auto per_shard = new per_shard_t;
         per_shard->magic_num = kMagicNum;
         per_shard->tid = kInvalidTid;
@@ -440,6 +454,18 @@ class test_analysis_tool_t : public analysis_tool_t {
             latest_shard_snapshots,
         uint64_t interval_end_timestamp) override
     {
+        // If we expect multiple std::vector of interval snapshots (one for each shard),
+        // it means we're not merging the snapshots across shards, so there should not
+        // be any combine_interval_snapshot calls.
+        if (expected_state_snapshots_.size() != 1) {
+            error_string_ = "Did not expect any combine_interval_snapshots() calls";
+            return nullptr;
+        }
+        if (!parallel_mode_) {
+            error_string_ =
+                "Did not expect any combine_interval_snapshots() calls in serial mode.";
+            return nullptr;
+        }
         recorded_snapshot_t *result = new recorded_snapshot_t();
         result->tool_shard_id =
             analysis_tool_t::interval_state_snapshot_t::WHOLE_TRACE_SHARD_ID;
@@ -485,21 +511,29 @@ class test_analysis_tool_t : public analysis_tool_t {
     print_interval_results(
         const std::vector<interval_state_snapshot_t *> &snapshots) override
     {
+        if (seen_print_interval_results_calls_ >= expected_state_snapshots_.size()) {
+            error_string_ = "Saw more print_interval_results() calls than expected";
+            return false;
+        }
         std::vector<recorded_snapshot_t *> recorded_state_snapshots;
         for (const auto &p : snapshots) {
             recorded_state_snapshots.push_back(
                 reinterpret_cast<recorded_snapshot_t *>(p));
         }
-        if (!compare_results(recorded_state_snapshots, expected_state_snapshots_)) {
+        if (!compare_results(
+                recorded_state_snapshots,
+                expected_state_snapshots_[seen_print_interval_results_calls_])) {
             error_string_ = "Unexpected state snapshots";
             std::cerr << "Expected:\n";
-            for (const auto &snapshot : expected_state_snapshots_)
+            for (const auto &snapshot :
+                 expected_state_snapshots_[seen_print_interval_results_calls_])
                 snapshot.print();
             std::cerr << "Found:\n";
             for (const auto &snapshot : recorded_state_snapshots)
                 snapshot->print();
             return false;
         }
+        ++seen_print_interval_results_calls_;
         return true;
     }
     bool
@@ -515,12 +549,23 @@ class test_analysis_tool_t : public analysis_tool_t {
     {
         return outstanding_snapshots_;
     }
+    int
+    get_outstanding_print_interval_results_calls() const
+    {
+        return expected_state_snapshots_.size() - seen_print_interval_results_calls_;
+    }
 
 private:
     int seen_memrefs_;
-    std::vector<recorded_snapshot_t> expected_state_snapshots_;
+    // We expect to see one print_interval_results call per shard (we do not merge
+    // the shard interval snapshots for instr count intervals), or exactly one
+    // print_interval_results call for the whole-trace (we merge shard interval
+    // snapshots for timestamp intervals).
+    std::vector<std::vector<recorded_snapshot_t>> expected_state_snapshots_;
     int outstanding_snapshots_;
     bool combine_only_active_shards_;
+    int seen_print_interval_results_calls_ = 0;
+    bool parallel_mode_ = false;
 
     // Data tracked per shard.
     struct per_shard_t {
@@ -537,6 +582,7 @@ static bool
 test_non_zero_interval(bool parallel, bool combine_only_active_shards = true)
 {
     constexpr uint64_t kIntervalMicroseconds = 100;
+    constexpr uint64_t kNoIntervalInstrCount = 0;
     std::vector<memref_t> refs = {
         // Trace for a single worker which has two constituent shards. (scheduler_t
         // does not guarantee that workers will process shards one after the other.)
@@ -563,52 +609,51 @@ test_non_zero_interval(bool parallel, bool combine_only_active_shards = true)
         gen_exit(52)                                      // _ | 6 | 7
     };
 
-    std::vector<test_analysis_tool_t::recorded_snapshot_t> expected_state_snapshots;
+    std::vector<std::vector<recorded_snapshot_t>> expected_state_snapshots;
     if (!parallel) {
         // Each whole trace interval is made up of only one snapshot, the
         // serial snapshot.
-        expected_state_snapshots = {
-            // Format for interval_end_point_t: <tid, seen_memrefs, interval_id>
-            test_analysis_tool_t::recorded_snapshot_t(1, 100, 1, 1, { { 0, 3, 1 } }),
-            test_analysis_tool_t::recorded_snapshot_t(2, 200, 3, 2, { { 0, 7, 2 } }),
-            test_analysis_tool_t::recorded_snapshot_t(3, 300, 6, 3, { { 0, 13, 3 } }),
-            test_analysis_tool_t::recorded_snapshot_t(5, 500, 7, 1, { { 0, 15, 5 } }),
-            test_analysis_tool_t::recorded_snapshot_t(6, 600, 7, 0, { { 0, 17, 6 } }),
-            test_analysis_tool_t::recorded_snapshot_t(7, 700, 8, 1, { { 0, 20, 7 } }),
-        };
+        expected_state_snapshots = { {
+            // Format:
+            // <interval_id, interval_end_timestamp, instr_count_cumulative,
+            //  instr_count_delta, <tid, seen_memrefs, interval_id>>
+            recorded_snapshot_t(1, 100, 1, 1, { { SERIAL_TID, 3, 1 } }),
+            recorded_snapshot_t(2, 200, 3, 2, { { SERIAL_TID, 7, 2 } }),
+            recorded_snapshot_t(3, 300, 6, 3, { { SERIAL_TID, 13, 3 } }),
+            recorded_snapshot_t(5, 500, 7, 1, { { SERIAL_TID, 15, 5 } }),
+            recorded_snapshot_t(6, 600, 7, 0, { { SERIAL_TID, 17, 6 } }),
+            recorded_snapshot_t(7, 700, 8, 1, { { SERIAL_TID, 20, 7 } }),
+        } };
     } else if (combine_only_active_shards) {
         // Each whole trace interval is made up of snapshots from each
         // shard that was active in that interval.
         expected_state_snapshots = {
-            // Format for interval_end_point_t: <tid, seen_memrefs, interval_id>
-            test_analysis_tool_t::recorded_snapshot_t(1, 100, 1, 1, { { 51, 3, 1 } }),
-            // Narration: The whole-trace interval_id=1 with interval_end_timestamp=200
-            // is made up of the following two shard-local interval snapshots:
-            // - from shard_id=51, the interval_id=1 that ends at the local_memref=5
-            // - from shard_id=52, the interval_id=0 that ends at the local_memref=2
-            test_analysis_tool_t::recorded_snapshot_t(2, 200, 3, 2,
-                                                      { { 51, 5, 2 }, { 52, 2, 1 } }),
-            test_analysis_tool_t::recorded_snapshot_t(3, 300, 6, 3,
-                                                      { { 51, 7, 3 }, { 52, 6, 2 } }),
-            test_analysis_tool_t::recorded_snapshot_t(5, 500, 7, 1, { { 52, 8, 4 } }),
-            test_analysis_tool_t::recorded_snapshot_t(6, 600, 7, 0, { { 51, 9, 6 } }),
-            test_analysis_tool_t::recorded_snapshot_t(7, 700, 8, 1, { { 52, 11, 6 } })
+            { // Format:
+              // <interval_id, interval_end_timestamp, instr_count_cumulative,
+              //  instr_count_delta, <tid, seen_memrefs, interval_id>>
+              recorded_snapshot_t(1, 100, 1, 1, { { 51, 3, 1 } }),
+              // Narration: The whole-trace interval_id=2 with interval_end_timestamp=200
+              // is made up of the following two shard-local interval snapshots:
+              // - from shard_id=51, the interval_id=2 that ends at the local_memref=5
+              // - from shard_id=52, the interval_id=1 that ends at the local_memref=2
+              recorded_snapshot_t(2, 200, 3, 2, { { 51, 5, 2 }, { 52, 2, 1 } }),
+              recorded_snapshot_t(3, 300, 6, 3, { { 51, 7, 3 }, { 52, 6, 2 } }),
+              recorded_snapshot_t(5, 500, 7, 1, { { 52, 8, 4 } }),
+              recorded_snapshot_t(6, 600, 7, 0, { { 51, 9, 6 } }),
+              recorded_snapshot_t(7, 700, 8, 1, { { 52, 11, 6 } }) }
         };
     } else {
         // Each whole trace interval is made up of last snapshots from all trace shards.
         expected_state_snapshots = {
-            // Format for interval_end_point_t: <tid, seen_memrefs, interval_id>
-            test_analysis_tool_t::recorded_snapshot_t(1, 100, 1, 1, { { 51, 3, 1 } }),
-            test_analysis_tool_t::recorded_snapshot_t(2, 200, 3, 2,
-                                                      { { 51, 5, 2 }, { 52, 2, 1 } }),
-            test_analysis_tool_t::recorded_snapshot_t(3, 300, 6, 3,
-                                                      { { 51, 7, 3 }, { 52, 6, 2 } }),
-            test_analysis_tool_t::recorded_snapshot_t(5, 500, 7, 1,
-                                                      { { 51, 7, 3 }, { 52, 8, 4 } }),
-            test_analysis_tool_t::recorded_snapshot_t(6, 600, 7, 0,
-                                                      { { 51, 9, 6 }, { 52, 8, 4 } }),
-            test_analysis_tool_t::recorded_snapshot_t(7, 700, 8, 1,
-                                                      { { 51, 9, 6 }, { 52, 11, 6 } })
+            { { // Format:
+                // <interval_id, interval_end_timestamp, instr_count_cumulative,
+                //  instr_count_delta, <tid, seen_memrefs, interval_id>>
+                recorded_snapshot_t(1, 100, 1, 1, { { 51, 3, 1 } }),
+                recorded_snapshot_t(2, 200, 3, 2, { { 51, 5, 2 }, { 52, 2, 1 } }),
+                recorded_snapshot_t(3, 300, 6, 3, { { 51, 7, 3 }, { 52, 6, 2 } }),
+                recorded_snapshot_t(5, 500, 7, 1, { { 51, 7, 3 }, { 52, 8, 4 } }),
+                recorded_snapshot_t(6, 600, 7, 0, { { 51, 9, 6 }, { 52, 8, 4 } }),
+                recorded_snapshot_t(7, 700, 8, 1, { { 51, 9, 6 }, { 52, 11, 6 } }) } }
         };
     }
     std::vector<analysis_tool_t *> tools;
@@ -619,7 +664,7 @@ test_non_zero_interval(bool parallel, bool combine_only_active_shards = true)
         std::unique_ptr<dummy_analysis_tool_t>(new dummy_analysis_tool_t());
     tools.push_back(dummy_analysis_tool.get());
     test_analyzer_t test_analyzer(refs, &tools[0], (int)tools.size(), parallel,
-                                  kIntervalMicroseconds);
+                                  kIntervalMicroseconds, kNoIntervalInstrCount);
     if (!test_analyzer) {
         FATAL_ERROR("failed to initialize test analyzer: %s",
                     test_analyzer.get_error_string().c_str());
@@ -638,6 +683,13 @@ test_non_zero_interval(bool parallel, bool combine_only_active_shards = true)
                   << " left\n";
         return false;
     }
+    if (test_analysis_tool.get()->get_outstanding_print_interval_results_calls() != 0) {
+        std::cerr
+            << "Missing "
+            << test_analysis_tool.get()->get_outstanding_print_interval_results_calls()
+            << " print_interval_result() calls\n";
+        return false;
+    }
     int expected_generate_call_count = parallel ? 8 : 6;
     if (dummy_analysis_tool.get()->get_generate_snapshot_count() !=
         expected_generate_call_count) {
@@ -654,11 +706,120 @@ test_non_zero_interval(bool parallel, bool combine_only_active_shards = true)
     return true;
 }
 
+static bool
+test_non_zero_instr_interval(bool parallel)
+{
+    constexpr uint64_t kNoIntervalMicroseconds = 0;
+    constexpr uint64_t kIntervalInstrCount = 2;
+    std::vector<memref_t> refs = {
+        // Trace for a single worker which has two constituent shards. (scheduler_t
+        // does not guarantee that workers will process shards one after the other.)
+        // Expected active interval_id: tid_51_local | tid_52_local | whole_trace
+        gen_marker(51, TRACE_MARKER_TYPE_TIMESTAMP, 40),  // 1 | _ | 1
+        gen_instr(51, 10000),                             // 1 | _ | 1
+        gen_data(51, true, 1234, 4),                      // 1 | _ | 1
+        gen_marker(52, TRACE_MARKER_TYPE_TIMESTAMP, 151), // _ | 1 | 1
+        gen_instr(52, 20000),                             // _ | 1 | 1
+        gen_marker(51, TRACE_MARKER_TYPE_TIMESTAMP, 170), // 1 | _ | 1
+        gen_instr(51, 10008),                             // 1 | _ | 2
+        gen_marker(51, TRACE_MARKER_TYPE_TIMESTAMP, 201), // 1 | _ | 2
+        gen_instr(51, 20004),                             // 2 | _ | 2
+        gen_marker(52, TRACE_MARKER_TYPE_TIMESTAMP, 210), // _ | 1 | 2
+        gen_instr(52, 20008),                             // _ | 1 | 3
+        gen_marker(52, TRACE_MARKER_TYPE_TIMESTAMP, 270), // _ | 1 | 3
+        gen_instr(52, 20008),                             // _ | 2 | 3
+        gen_marker(52, TRACE_MARKER_TYPE_TIMESTAMP, 490), // _ | 2 | 3
+        gen_instr(52, 20012),                             // _ | 2 | 4
+        gen_marker(51, TRACE_MARKER_TYPE_TIMESTAMP, 590), // 2 | _ | 4
+        gen_exit(51),                                     // 2 | _ | 4
+        gen_marker(52, TRACE_MARKER_TYPE_TIMESTAMP, 610), // _ | 2 | 4
+        gen_instr(52, 20016),                             // _ | 3 | 4
+        gen_exit(52)                                      // _ | 3 | 4
+    };
+
+    std::vector<std::vector<recorded_snapshot_t>> expected_state_snapshots;
+    if (!parallel) {
+        // Each whole trace interval is made up of only one snapshot, the
+        // serial snapshot.
+        expected_state_snapshots = {
+            { // Format:
+              // <interval_id, interval_end_timestamp, instr_count_cumulative,
+              //  instr_count_delta, <tid, seen_memrefs, interval_id>>
+              recorded_snapshot_t(1, 170, 2, 2, { { SERIAL_TID, 6, 1 } }),
+              recorded_snapshot_t(2, 210, 4, 2, { { SERIAL_TID, 10, 2 } }),
+              recorded_snapshot_t(3, 490, 6, 2, { { SERIAL_TID, 14, 3 } }),
+              recorded_snapshot_t(4, 610, 8, 2, { { SERIAL_TID, 20, 4 } }) }
+        };
+    } else {
+        // For instr count intervals, we do not merge the shard intervals to form the
+        // whole-trace intervals. Instead, there are multiple print_interval_result
+        // calls, one for the interval snapshots of each shard. The shard_id is
+        // included in the provided interval snapshots (see below).
+        expected_state_snapshots = {
+            // Format:
+            // <shard_id, interval_id, interval_end_timestamp, instr_count_cumulative,
+            //  instr_count_delta, <tid, seen_memrefs, interval_id>>
+            { recorded_snapshot_t(51, 1, 201, 2, 2, { { 51, 6, 1 } }),
+              recorded_snapshot_t(51, 2, 590, 3, 1, { { 51, 9, 2 } }) },
+            { recorded_snapshot_t(52, 1, 270, 2, 2, { { 52, 5, 1 } }),
+              recorded_snapshot_t(52, 2, 610, 4, 2, { { 52, 9, 2 } }),
+              recorded_snapshot_t(52, 3, 610, 5, 1, { { 52, 11, 3 } }) },
+        };
+    }
+    std::vector<analysis_tool_t *> tools;
+    constexpr bool kNopCombineOnlyActiveShards = false;
+    auto test_analysis_tool = std::unique_ptr<test_analysis_tool_t>(
+        new test_analysis_tool_t(expected_state_snapshots, kNopCombineOnlyActiveShards));
+    tools.push_back(test_analysis_tool.get());
+    auto dummy_analysis_tool =
+        std::unique_ptr<dummy_analysis_tool_t>(new dummy_analysis_tool_t());
+    tools.push_back(dummy_analysis_tool.get());
+    test_analyzer_t test_analyzer(refs, &tools[0], (int)tools.size(), parallel,
+                                  kNoIntervalMicroseconds, kIntervalInstrCount);
+    if (!test_analyzer) {
+        FATAL_ERROR("failed to initialize test analyzer: %s",
+                    test_analyzer.get_error_string().c_str());
+    }
+    if (!test_analyzer.run()) {
+        FATAL_ERROR("failed to run test_analyzer: %s",
+                    test_analyzer.get_error_string().c_str());
+    }
+    if (!test_analyzer.print_stats()) {
+        FATAL_ERROR("failed to print stats: %s",
+                    test_analyzer.get_error_string().c_str());
+    }
+    if (test_analysis_tool.get()->get_outstanding_snapshot_count() != 0) {
+        std::cerr << "Failed to release all outstanding snapshots: "
+                  << test_analysis_tool.get()->get_outstanding_snapshot_count()
+                  << " left\n";
+        return false;
+    }
+    if (test_analysis_tool.get()->get_outstanding_print_interval_results_calls() != 0) {
+        std::cerr
+            << "Missing "
+            << test_analysis_tool.get()->get_outstanding_print_interval_results_calls()
+            << " print_interval_result() calls\n";
+        return false;
+    }
+    int expected_generate_call_count = parallel ? 5 : 4;
+    if (dummy_analysis_tool.get()->get_generate_snapshot_count() !=
+        expected_generate_call_count) {
+        std::cerr << "Dummy analysis tool got "
+                  << dummy_analysis_tool.get()->get_generate_snapshot_count()
+                  << " interval API calls, but expected " << expected_generate_call_count
+                  << "\n";
+        return false;
+    }
+    fprintf(stderr, "test_non_zero_instr_interval done for parallel=%d\n", parallel);
+    return true;
+}
+
 int
 test_main(int argc, const char *argv[])
 {
     if (!test_non_zero_interval(false) || !test_non_zero_interval(true, true) ||
-        !test_non_zero_interval(true, false))
+        !test_non_zero_interval(true, false) || !test_non_zero_instr_interval(false) ||
+        !test_non_zero_instr_interval(true))
         return 1;
     fprintf(stderr, "All done!\n");
     return 0;
diff --git a/suite/tests/CMakeLists.txt b/suite/tests/CMakeLists.txt
index 170ffeb787e..ab457b418fa 100644
--- a/suite/tests/CMakeLists.txt
+++ b/suite/tests/CMakeLists.txt
@@ -4064,9 +4064,12 @@ if (BUILD_CLIENTS)
       endif ()
     endif ()
 
-    torunonly_drcacheoff(interval-count-output ${ci_shared_app} ""
+    torunonly_drcacheoff(interval-microseconds-count-output ${ci_shared_app} ""
       "@-simulator_type@basic_counts@-interval_microseconds@1M" "")
 
+    torunonly_drcacheoff(interval-instr-count-output ${ci_shared_app} ""
+      "@-simulator_type@basic_counts@-interval_instr_count@10000" "")
+
     # As for the online test, we check that only 1 thread is in the final trace.
     torunonly_drcacheoff(max-global client.annotation-concurrency
       # Include function tracing to sanity test combining with delay and max.