diff --git a/api/docs/release.dox b/api/docs/release.dox
index d82ca7bf109..d73b1ed0848 100644
--- a/api/docs/release.dox
+++ b/api/docs/release.dox
@@ -142,11 +142,11 @@ changes:
refers to timestamps and direct switches, which is what most users should want.
- Rename the macro INSTR_CREATE_mul_sve to INSTR_CREATE_mul_sve_imm to
differentiate it from the other SVE MUL instructions.
- - Added a new drmemtrace analyzer option \p -interval_instr_count that enables trace
- analyzer interval results for every given count of instrs in each shard. This mode
- does not support merging the shard interval snapshots to output the whole-trace
- interval snapshots. Instead, the print_interval_results() API is called separately
- for each shard with the interval state snapshots of that shard.
+ - Renamed a protected data member in #dynamorio::drmemtrace::analyzer_tmpl_t from
+ merged_interval_snapshots_ to whole_trace_interval_snapshots_ (may be relevant for
+ users sub-classing analyzer_tmpl_t).
+ - Converted #dynamorio::drmemtrace::analysis_tool_tmpl_t::interval_state_snapshot_t
+ into a class with all its data members marked private with public accessor functions.
Further non-compatibility-affecting changes include:
- Added DWARF-5 support to the drsyms library by linking in 4 static libraries
@@ -203,11 +203,15 @@ Further non-compatibility-affecting changes include:
- Added #dynamorio::drmemtrace::TRACE_MARKER_TYPE_VECTOR_LENGTH marker to indicate the
current vector length for architectures with a hardware defined or runtime changeable
vector length (such as AArch64's SVE scalable vectors).
- - Renamed a protected data member in #dynamorio::drmemtrace::analyzer_tmpl_t from
- merged_interval_snapshots_ to whole_trace_interval_snapshots_ (may be relevant for
- users sub-classing analyzer_tmpl_t).
- - Converted #dynamorio::drmemtrace::analysis_tool_tmpl_t::interval_state_snapshot_t
- into a class with all its data members marked private with public accessor functions.
+ - Added a new drmemtrace analyzer option \p -interval_instr_count that enables trace
+ analyzer interval results for every given count of instrs in each shard. This mode
+ does not support merging the shard interval snapshots to output the whole-trace
+ interval snapshots. Instead, the print_interval_results() API is called separately
+ for each shard with the interval state snapshots of that shard.
+ - Added a new finalize_interval_snapshots() API to
+ #dynamorio::drmemtrace::analysis_tool_t that allows the tool to make holistic
+ adjustments to the interval snapshots after all of them have been generated,
+ and before they are (potentially) merged across shards and used to print results.
**************************************************
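To make the new interval support above concrete, here is a minimal tool-side sketch of a print_interval_results() override that tolerates being invoked once per shard, as happens with -interval_instr_count in parallel mode. This is illustrative only: the tool name and output format are hypothetical, the exact override signature is assumed from the declarations later in this patch, and get_instr_count_delta() is an assumed accessor name (only get_shard_id() is confirmed by the tests below).

    // Hypothetical sketch only; not part of this patch.
    #include <iostream>
    #include <vector>
    #include "analysis_tool.h" // Include path depends on the build setup.

    using dynamorio::drmemtrace::analysis_tool_t;

    class my_tool_t : public analysis_tool_t { // Other required overrides omitted.
    public:
        bool
        print_interval_results(
            const std::vector<interval_state_snapshot_t *> &interval_snapshots) override
        {
            // Called once per shard for -interval_instr_count in parallel mode, so
            // keep no global state and do not free the snapshots here.
            for (const auto *snapshot : interval_snapshots) {
                std::cerr << "shard " << snapshot->get_shard_id() << ": "
                          << snapshot->get_instr_count_delta()
                          << " instrs in this interval\n";
            }
            return true;
        }
    };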
diff --git a/clients/drcachesim/analysis_tool.h b/clients/drcachesim/analysis_tool.h
index a724d06a3af..24306cd7534 100644
--- a/clients/drcachesim/analysis_tool.h
+++ b/clients/drcachesim/analysis_tool.h
@@ -189,12 +189,13 @@ template <typename RecordType> class analysis_tool_tmpl_t {
print_results() = 0;
/**
- * Struct that stores details of a tool's state snapshot at an interval. This is
+ * Type that stores details of a tool's state snapshot at an interval. This is
* useful for computing and combining interval results. Tools should inherit from
- * this struct to define their own state snapshot structs. Tools do not need to
- * supply any values to construct this base struct; they can simply use the
+ * this type to define their own state snapshot types. Tools do not need to
+ * supply any values to construct this base class; they can simply use the
* default constructor. The members of this base class will be set by the
- * framework automatically.
+ * framework automatically, and must not be modified by the tool at any point.
*/
class interval_state_snapshot_t {
// Allow the analyzer framework access to private data members to set them
@@ -220,6 +221,10 @@ template <typename RecordType> class analysis_tool_tmpl_t {
, instr_count_delta_(instr_count_delta)
{
}
+ // This constructor should be used by tools that subclass
+ // interval_state_snapshot_t. The base-class data members will be set by the
+ // framework automatically when the tool returns a pointer to the created object
+ // from generate_*interval_snapshot or combine_interval_snapshots.
interval_state_snapshot_t()
{
}
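As a concrete illustration of the constructor comment above, a tool-defined snapshot can add only its own state and default-construct the base; the type and member names below are hypothetical.

    // Hypothetical tool-defined snapshot. The framework fills in the base-class
    // members (shard id, interval end timestamp, instruction counts) after the
    // tool returns a pointer to this object from generate_*interval_snapshot().
    #include <cstdint>
    #include "analysis_tool.h" // Include path depends on the build setup.

    class my_tool_snapshot_t
        : public dynamorio::drmemtrace::analysis_tool_t::interval_state_snapshot_t {
    public:
        uint64_t cumulative_loads = 0; // Running load count at the interval end.
        uint64_t delta_loads = 0;      // Loads in this interval alone.
    };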
@@ -257,8 +262,9 @@ template <typename RecordType> class analysis_tool_tmpl_t {
// The following fields are set automatically by the analyzer framework after
// the tool returns the interval_state_snapshot_t* in the
// generate_*interval_snapshot APIs. So they'll be available to the tool in
- // the combine_interval_snapshots (for the parameter snapshots) and
- // print_interval_results APIs via the above public accessor functions.
+ // the finalize_interval_snapshots(), combine_interval_snapshots() (for the
+ // parameter snapshots), and print_interval_results() APIs via the above
+ // public accessor functions.
// Identifier for the shard to which this interval belongs. Currently, shards
// map only to threads, so this is the thread id. Set to WHOLE_TRACE_SHARD_ID
@@ -280,23 +286,26 @@ template <typename RecordType> class analysis_tool_tmpl_t {
};
/**
* Notifies the analysis tool that the given trace \p interval_id has ended so
- * that it can generate a snapshot of its internal state in a struct derived
+ * that it can generate a snapshot of its internal state in a type derived
* from \p interval_state_snapshot_t, and return a pointer to it. The returned
- * pointer will be provided to the tool in later combine_interval_snapshots()
+ * pointer will be provided to the tool in later finalize_interval_snapshots(),
* and print_interval_result() calls.
*
* \p interval_id is a positive ordinal of the trace interval that just ended.
- * Trace intervals have a length equal to the \p -interval_microseconds specified
- * to the framework. Trace intervals are measured using the value of the
- * #TRACE_MARKER_TYPE_TIMESTAMP markers. The provided \p interval_id
- * values will be monotonically increasing but may not be continuous,
- * i.e. the tool may not see some \p interval_id if the trace did not have
- * any activity in that interval.
+ * Trace intervals have a length equal to either \p -interval_microseconds or
+ * \p -interval_instr_count. Time-based intervals are measured using the value
+ * of the #TRACE_MARKER_TYPE_TIMESTAMP markers. Instruction count intervals are
+ * measured in terms of shard-local instrs.
*
- * The returned \p interval_state_snapshot_t* will be passed to the
- * combine_interval_snapshots() API which is invoked by the framework to merge
- * multiple \p interval_state_snapshot_t from different shards in the parallel
- * mode of the analyzer.
+ * The provided \p interval_id values will be monotonically increasing. For
+ * \p -interval_microseconds intervals, these values may not be continuous,
+ * i.e. the tool may not see some \p interval_id if the trace did not have any
+ * activity in that interval.
+ *
+ * After all interval state snapshots are generated, the list of all returned
+ * \p interval_state_snapshot_t* is passed to finalize_interval_snapshots(),
+ * which gives the tool an opportunity to make holistic adjustments to the
+ * snapshots.
*
* Finally, the print_interval_result() API is invoked with a list of
* \p interval_state_snapshot_t* representing interval snapshots for the
@@ -313,6 +322,40 @@ template <typename RecordType> class analysis_tool_tmpl_t {
{
return nullptr;
}
+ /**
+ * Finalizes the interval snapshots in the given \p interval_snapshots list.
+ * This callback provides an opportunity for tools to make any holistic
+ * adjustments to the snapshot list now that we have all of them together. This
+ * may include, for example, computing the diff with the previous snapshot.
+ *
+ * Tools can modify the individual snapshots and also the list of snapshots itself.
+ * If some snapshots are removed, release_interval_snapshot() will not be invoked
+ * for them and the tool is responsible for de-allocating them. Adding new
+ * snapshots to the list is undefined behavior; tools should operate only on the
+ * provided snapshots which were generated in prior generate_*interval_snapshot
+ * calls.
+ *
+ * Tools cannot modify any data set by the framework in the base
+ * \p interval_state_snapshot_t; in any case, tools have only read-only access
+ * to those private data members, via the public accessor functions.
+ *
+ * In the parallel mode, this is invoked for each list of shard-local snapshots
+ * before they are possibly merged to create whole-trace snapshots using
+ * combine_interval_snapshots() and passed to print_interval_results(). In the
+ * serial mode, this is invoked with the list of whole-trace snapshots before it
+ * is passed to print_interval_results().
+ *
+ * This is an optional API. If a tool does not override it, the snapshot
+ * list is simply used unmodified.
+ *
+ * Returns whether it was successful.
+ */
+ virtual bool
+ finalize_interval_snapshots(
+ std::vector<interval_state_snapshot_t *> &interval_snapshots)
+ {
+ return true;
+ }
/**
* Invoked by the framework to combine the shard-local \p interval_state_snapshot_t
* objects pointed at by \p latest_shard_snapshots, to create the combined
@@ -338,6 +381,10 @@ template <typename RecordType> class analysis_tool_tmpl_t {
* \p interval_end_timestamp)
* - or if the tool mixes cumulative and delta metrics: some field-specific logic that
* combines the above two strategies.
+ *
+ * Note that after the given snapshots have been combined to create the whole-trace
+ * snapshot using this API, any change made by the tool to the snapshot contents will
+ * not have any effect.
*/
virtual interval_state_snapshot_t *
combine_interval_snapshots(
@@ -350,14 +397,14 @@ template <typename RecordType> class analysis_tool_tmpl_t {
* Prints the interval results for the given series of interval state snapshots in
* \p interval_snapshots.
*
- * This is currently invoked with the list of whole-trace interval snapshots (for
- * the parallel mode, these are the snapshots created by merging the shard-local
- * snapshots).
+ * This is invoked with the list of whole-trace interval snapshots (for the
+ * parallel mode, these are the snapshots created by merging the shard-local
+ * snapshots). For the \p -interval_instr_count snapshots in parallel mode, this is
+ * invoked separately for the snapshots of each shard.
*
* The framework should be able to invoke this multiple times, possibly with a
* different list of interval snapshots. So it should avoid free-ing memory or
- * changing global state. This is to keep open the possibility of the framework
- * printing interval results for each shard separately in future.
+ * changing global state.
*/
virtual bool
print_interval_results(
@@ -370,6 +417,10 @@ template <typename RecordType> class analysis_tool_tmpl_t {
* by \p interval_snapshot is no longer needed by the framework. The tool may
* de-allocate it right away or later, as it needs. Returns whether it was
* successful.
+ *
+ * Note that if the tool removed some snapshot from the list passed to
+ * finalize_interval_snapshots(), then release_interval_snapshot() will not be
+ * invoked for that snapshot.
*/
virtual bool
release_interval_snapshot(interval_state_snapshot_t *interval_snapshot)
@@ -476,10 +527,10 @@ template <typename RecordType> class analysis_tool_tmpl_t {
/**
* Notifies the analysis tool that the given trace \p interval_id in the shard
* represented by the given \p shard_data has ended, so that it can generate a
- * snapshot of its internal state in a struct derived from \p
+ * snapshot of its internal state in a type derived from \p
* interval_state_snapshot_t, and return a pointer to it. The returned pointer will
- * be provided to the tool in later combine_interval_snapshots() and
- * print_interval_result() calls.
+ * be provided to the tool in later finalize_interval_snapshots(),
+ * combine_interval_snapshots(), and print_interval_results() calls.
*
* Note that the provided \p interval_id is local to the shard that is
* represented by the given \p shard_data, and not the whole-trace interval. The
@@ -488,30 +539,22 @@ template class analysis_tool_tmpl_t {
* shard-local \p interval_state_snapshot_t corresponding to that whole-trace
* interval.
*
- * \p interval_id is a positive ordinal of the trace interval that just ended.
- * Trace intervals have a length equal to the \p -interval_microseconds specified
- * to the framework. Trace intervals are measured using the value of the
- * #TRACE_MARKER_TYPE_TIMESTAMP markers. The provided \p interval_id
- * values will be monotonically increasing but may not be continuous,
- * i.e. the tool may not see some \p interval_id if the trace shard did not
- * have any activity in that interval.
+ * The \p interval_id parameter has the same semantics as the corresponding
+ * parameter of generate_interval_snapshot().
*
- * The returned \p interval_state_snapshot_t* will be passed to the
- * combine_interval_snapshot() API which is invoked by the framework to merge
- * multiple \p interval_state_snapshot_t from different shards in the parallel
- * mode of the analyzer.
+ * The returned \p interval_state_snapshot_t* is treated in the same manner as
+ * for generate_interval_snapshot(), with the following additions:
*
- * Finally, the print_interval_result() API is invoked with a list of
- * \p interval_state_snapshot_t* representing interval snapshots for the
- * whole trace. In the parallel mode of the analyzer, this list is computed by
- * combining the shard-local \p interval_state_snapshot_t using the tool's
- * combine_interval_snapshot() API.
+ * In the case of \p -interval_microseconds in the parallel mode: after
+ * finalize_interval_snapshots() has been invoked, the \p interval_state_snapshot_t*
+ * objects generated for the same time period across different shards are passed to
+ * the combine_interval_snapshots() API by the framework to merge them into the
+ * whole-trace interval snapshots. The print_interval_results() API is then invoked
+ * with the list of whole-trace \p interval_state_snapshot_t* thus obtained.
*
- * The tool must not de-allocate the state snapshot until
- * release_interval_snapshot() is invoked by the framework.
- *
- * An example use case of this API is to create a time series of some output
- * metric over the whole trace.
+ * In the case of \p -interval_instr_count in the parallel mode: no merging across
+ * shards is done, and the print_interval_results() API is invoked for each list
+ * of shard-local \p interval_state_snapshot_t*.
*/
virtual interval_state_snapshot_t *
generate_shard_interval_snapshot(void *shard_data, uint64_t interval_id)
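To ground the parallel-mode flow and the combining strategies described above, here is a sketch of a combine_interval_snapshots() override; the types and fields are the hypothetical ones from the earlier sketches, the latest_shard_snapshots and interval_end_timestamp parameters follow the surrounding documentation, and get_interval_end_timestamp() is an assumed accessor name.

    // Hypothetical sketch: sum the delta metric over shards whose snapshot ends at
    // this whole-trace interval, and take the cumulative metric from the most
    // recent shard snapshot.
    analysis_tool_t::interval_state_snapshot_t *
    my_tool_t::combine_interval_snapshots(
        const std::vector<const interval_state_snapshot_t *> &latest_shard_snapshots,
        uint64_t interval_end_timestamp)
    {
        auto *result = new my_tool_snapshot_t;
        uint64_t latest_seen = 0;
        for (const auto *base : latest_shard_snapshots) {
            if (base == nullptr)
                continue; // Defensive: skip shards without a snapshot.
            const auto *snap = static_cast<const my_tool_snapshot_t *>(base);
            if (base->get_interval_end_timestamp() == interval_end_timestamp)
                result->delta_loads += snap->delta_loads;
            if (base->get_interval_end_timestamp() >= latest_seen) {
                latest_seen = base->get_interval_end_timestamp();
                result->cumulative_loads = snap->cumulative_loads;
            }
        }
        return result;
    }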
diff --git a/clients/drcachesim/analyzer.cpp b/clients/drcachesim/analyzer.cpp
index f0425eb586c..110a847fc75 100644
--- a/clients/drcachesim/analyzer.cpp
+++ b/clients/drcachesim/analyzer.cpp
@@ -40,7 +40,6 @@
#include
#include
#include
-#include <queue>
#include
#include
#include
@@ -528,10 +527,12 @@ analyzer_tmpl_t<RecordType, ReaderType>::process_serial(analyzer_worker_data_t &
"Failed to read from trace: " + worker.stream->get_stream_name();
}
} else if (interval_microseconds_ != 0 || interval_instr_count_ != 0) {
- process_interval(worker.shard_data[0].cur_interval_index,
- worker.shard_data[0].cur_interval_init_instr_count,
- &worker,
- /*parallel=*/false, /*at_instr_record=*/false);
+ if (!process_interval(worker.shard_data[0].cur_interval_index,
+ worker.shard_data[0].cur_interval_init_instr_count,
+ &worker,
+ /*parallel=*/false, /*at_instr_record=*/false) ||
+ !finalize_interval_snapshots(&worker, /*parallel=*/false))
+ return;
}
return;
}
@@ -566,10 +567,11 @@ analyzer_tmpl_t<RecordType, ReaderType>::process_shard_exit(
worker->stream->get_stream_name().c_str());
worker->shard_data[shard_index].exited = true;
if ((interval_microseconds_ != 0 || interval_instr_count_ != 0) &&
- !process_interval(worker->shard_data[shard_index].cur_interval_index,
- worker->shard_data[shard_index].cur_interval_init_instr_count,
- worker,
- /*parallel=*/true, /*at_instr_record=*/false, shard_index))
+ (!process_interval(worker->shard_data[shard_index].cur_interval_index,
+ worker->shard_data[shard_index].cur_interval_init_instr_count,
+ worker,
+ /*parallel=*/true, /*at_instr_record=*/false, shard_index) ||
+ !finalize_interval_snapshots(worker, /*parallel=*/true, shard_index)))
return false;
for (int i = 0; i < num_tools_; ++i) {
if (!tools_[i]->parallel_shard_exit(
@@ -756,11 +758,9 @@ analyzer_tmpl_t<RecordType, ReaderType>::combine_interval_snapshots(
template <typename RecordType, typename ReaderType>
bool
analyzer_tmpl_t<RecordType, ReaderType>::merge_shard_interval_results(
- // intervals[shard_idx] is a queue of interval_state_snapshot_t*
- // representing the interval snapshots for that shard. This is a queue as we
- // process the intervals here in a FIFO manner. Using a queue also makes code
- // a bit simpler.
-    std::vector<std::queue<typename analysis_tool_tmpl_t<RecordType>::interval_state_snapshot_t *>>
+    std::vector<std::vector<typename analysis_tool_tmpl_t<RecordType>::interval_state_snapshot_t *>>
&intervals,
// This function will write the resulting whole-trace intervals to
@@ -775,6 +775,7 @@ analyzer_tmpl_t<RecordType, ReaderType>::merge_shard_interval_results(
// numbered by the earliest shard's timestamp.
uint64_t earliest_ever_interval_end_timestamp = std::numeric_limits<uint64_t>::max();
size_t shard_count = intervals.size();
+ std::vector<size_t> at_idx(shard_count, 0);
bool any_shard_has_results_left = true;
std::vector<typename analysis_tool_tmpl_t<RecordType>::interval_state_snapshot_t *>
last_snapshot_per_shard(shard_count, nullptr);
@@ -783,11 +784,11 @@ analyzer_tmpl_t<RecordType, ReaderType>::merge_shard_interval_results(
// one with the earliest interval-end timestamp.
uint64_t earliest_interval_end_timestamp = std::numeric_limits<uint64_t>::max();
for (size_t shard_idx = 0; shard_idx < shard_count; ++shard_idx) {
- if (intervals[shard_idx].empty())
+ if (at_idx[shard_idx] == intervals[shard_idx].size())
continue;
- earliest_interval_end_timestamp =
- std::min(earliest_interval_end_timestamp,
- intervals[shard_idx].front()->interval_end_timestamp_);
+ earliest_interval_end_timestamp = std::min(
+ earliest_interval_end_timestamp,
+ intervals[shard_idx][at_idx[shard_idx]]->interval_end_timestamp_);
}
// We're done if no shard has any interval left unprocessed.
if (earliest_interval_end_timestamp == std::numeric_limits<uint64_t>::max()) {
@@ -802,10 +803,10 @@ analyzer_tmpl_t<RecordType, ReaderType>::merge_shard_interval_results(
// Update last_snapshot_per_shard for shards that were active during this
// interval, which have a timestamp == earliest_interval_end_timestamp.
for (size_t shard_idx = 0; shard_idx < shard_count; ++shard_idx) {
- if (intervals[shard_idx].empty())
+ if (at_idx[shard_idx] == intervals[shard_idx].size())
continue;
uint64_t cur_interval_end_timestamp =
- intervals[shard_idx].front()->interval_end_timestamp_;
+ intervals[shard_idx][at_idx[shard_idx]]->interval_end_timestamp_;
assert(cur_interval_end_timestamp >= earliest_interval_end_timestamp);
if (cur_interval_end_timestamp > earliest_interval_end_timestamp)
continue;
@@ -818,8 +819,8 @@ analyzer_tmpl_t<RecordType, ReaderType>::merge_shard_interval_results(
return false;
}
}
- last_snapshot_per_shard[shard_idx] = intervals[shard_idx].front();
- intervals[shard_idx].pop();
+ last_snapshot_per_shard[shard_idx] = intervals[shard_idx][at_idx[shard_idx]];
+ ++at_idx[shard_idx];
}
// Merge last_snapshot_per_shard to form the result of the current
// whole-trace interval.
@@ -853,23 +854,6 @@ analyzer_tmpl_t<RecordType, ReaderType>::merge_shard_interval_results(
return true;
}
-template <typename RecordType, typename ReaderType>
-void
-analyzer_tmpl_t<RecordType, ReaderType>::drain_interval_snapshot_queue_to_vector(
-    std::queue<typename analysis_tool_tmpl_t<RecordType>::interval_state_snapshot_t *>
-        &que,
-    std::vector<typename analysis_tool_tmpl_t<RecordType>::interval_state_snapshot_t *>
-        &vec)
-{
- // If the worker's interval snapshot data were an std::vector we could just
- // std::move here, but that would make the implementation of
- // merge_shard_interval_results more complex, so there are trade-offs.
- while (!que.empty()) {
- vec.push_back(que.front());
- que.pop();
- }
-}
-
template <typename RecordType, typename ReaderType>
void
analyzer_tmpl_t<RecordType, ReaderType>::populate_unmerged_shard_interval_results()
@@ -880,9 +864,8 @@ analyzer_tmpl_t<RecordType, ReaderType>::populate_unmerged_shard_interval_results()
for (int tool_idx = 0; tool_idx < num_tools_; ++tool_idx) {
key_tool_shard_t tool_shard_key = { tool_idx,
shard_data.second.shard_index };
- drain_interval_snapshot_queue_to_vector(
- shard_data.second.tool_data[tool_idx].interval_snapshot_data,
- per_shard_interval_snapshots_[tool_shard_key]);
+ per_shard_interval_snapshots_[tool_shard_key] = std::move(
+ shard_data.second.tool_data[tool_idx].interval_snapshot_data);
}
}
}
@@ -900,9 +883,8 @@ analyzer_tmpl_t<RecordType, ReaderType>::populate_serial_interval_results()
assert(static_cast<int>(worker_data_[0].shard_data[0].tool_data.size()) ==
num_tools_);
for (int tool_idx = 0; tool_idx < num_tools_; ++tool_idx) {
- drain_interval_snapshot_queue_to_vector(
- worker_data_[0].shard_data[0].tool_data[tool_idx].interval_snapshot_data,
- whole_trace_interval_snapshots_[tool_idx]);
+ whole_trace_interval_snapshots_[tool_idx] = std::move(
+ worker_data_[0].shard_data[0].tool_data[tool_idx].interval_snapshot_data);
}
}
@@ -921,9 +903,9 @@ analyzer_tmpl_t<RecordType, ReaderType>::collect_and_maybe_merge_shard_interval_results(
populate_unmerged_shard_interval_results();
return true;
}
- // all_intervals[tool_idx][shard_idx] contains a queue of the
+ // all_intervals[tool_idx][shard_idx] contains a vector of the
// interval_state_snapshot_t* that were output by that tool for that shard.
-    std::vector<std::vector<std::queue<typename analysis_tool_tmpl_t<RecordType>::interval_state_snapshot_t *>>>
+    std::vector<std::vector<std::vector<typename analysis_tool_tmpl_t<RecordType>::interval_state_snapshot_t *>>>
all_intervals(num_tools_);
for (const auto &worker : worker_data_) {
@@ -1072,13 +1054,40 @@ analyzer_tmpl_t<RecordType, ReaderType>::print_stats()
return true;
}
+template <typename RecordType, typename ReaderType>
+bool
+analyzer_tmpl_t<RecordType, ReaderType>::finalize_interval_snapshots(
+ analyzer_worker_data_t *worker, bool parallel, int shard_idx)
+{
+ assert(parallel ||
+ shard_idx == 0); // Only parallel mode supports a non-zero shard_idx.
+ for (int tool_idx = 0; tool_idx < num_tools_; ++tool_idx) {
+ if (!worker->shard_data[shard_idx]
+ .tool_data[tool_idx]
+ .interval_snapshot_data.empty() &&
+ !tools_[tool_idx]->finalize_interval_snapshots(worker->shard_data[shard_idx]
+ .tool_data[tool_idx]
+ .interval_snapshot_data)) {
+ worker->error = tools_[tool_idx]->get_error_string();
+ VPRINT(this, 1,
+ "Worker %d hit finalize_interval_snapshots error %s during %s "
+ "analysis in trace shard %s\n",
+ worker->index, worker->error.c_str(), parallel ? "parallel" : "serial",
+ worker->stream->get_stream_name().c_str());
+ return false;
+ }
+ }
+ return true;
+}
+
template <typename RecordType, typename ReaderType>
bool
analyzer_tmpl_t<RecordType, ReaderType>::process_interval(
uint64_t interval_id, uint64_t interval_init_instr_count,
analyzer_worker_data_t *worker, bool parallel, bool at_instr_record, int shard_idx)
{
- assert(parallel || shard_idx == 0); // Default to zero for the serial mode.
+ assert(parallel ||
+ shard_idx == 0); // Only parallel mode supports a non-zero shard_idx.
for (int tool_idx = 0; tool_idx < num_tools_; ++tool_idx) {
typename analysis_tool_tmpl_t<RecordType>::interval_state_snapshot_t *snapshot;
if (parallel) {
@@ -1121,8 +1130,9 @@ analyzer_tmpl_t<RecordType, ReaderType>::process_interval(
worker->stream->get_instruction_ordinal() - (at_instr_record ? 1 : 0);
snapshot->instr_count_delta_ =
snapshot->instr_count_cumulative_ - interval_init_instr_count;
- worker->shard_data[shard_idx].tool_data[tool_idx].interval_snapshot_data.push(
- snapshot);
+ worker->shard_data[shard_idx]
+ .tool_data[tool_idx]
+ .interval_snapshot_data.push_back(snapshot);
}
}
return true;
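The queue-to-vector change above keeps the FIFO traversal of merge_shard_interval_results by maintaining a per-shard read cursor (at_idx) instead of popping elements. Below is a standalone, simplified illustration of that pattern, using plain timestamps instead of the real snapshot types; it is a sketch of the idea, not code from this patch.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <limits>
    #include <vector>

    // Repeatedly pick the earliest unconsumed interval-end timestamp across all
    // shards, then advance the cursor of every shard whose next timestamp matches.
    int
    main()
    {
        std::vector<std::vector<uint64_t>> shard_end_timestamps = { { 10, 20, 40 },
                                                                    { 10, 30, 40 } };
        std::vector<size_t> at_idx(shard_end_timestamps.size(), 0);
        while (true) {
            uint64_t earliest = std::numeric_limits<uint64_t>::max();
            for (size_t s = 0; s < shard_end_timestamps.size(); ++s) {
                if (at_idx[s] < shard_end_timestamps[s].size())
                    earliest = std::min(earliest, shard_end_timestamps[s][at_idx[s]]);
            }
            if (earliest == std::numeric_limits<uint64_t>::max())
                break; // All shards fully consumed.
            std::printf("whole-trace interval ending at %llu\n",
                        static_cast<unsigned long long>(earliest));
            for (size_t s = 0; s < shard_end_timestamps.size(); ++s) {
                if (at_idx[s] < shard_end_timestamps[s].size() &&
                    shard_end_timestamps[s][at_idx[s]] == earliest)
                    ++at_idx[s];
            }
        }
        return 0;
    }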
diff --git a/clients/drcachesim/analyzer.h b/clients/drcachesim/analyzer.h
index 964159c6aee..f0b54bc4c05 100644
--- a/clients/drcachesim/analyzer.h
+++ b/clients/drcachesim/analyzer.h
@@ -47,7 +47,6 @@
#include
#include
-#include <queue>
#include
#include
#include
@@ -145,9 +144,10 @@ template <typename RecordType, typename ReaderType> class analyzer_tmpl_t {
}
void *shard_data;
- // This is a queue as merge_shard_interval_results processes the intervals in a
- // FIFO manner. Using a queue also makes code a bit simpler.
-    std::queue<typename analysis_tool_tmpl_t<RecordType>::interval_state_snapshot_t *>
+    // Stores the interval state snapshots generated by this tool for this shard
+    // in the same order as they are generated.
+    std::vector<
+        typename analysis_tool_tmpl_t<RecordType>::interval_state_snapshot_t *>
interval_snapshot_data;
private:
@@ -266,6 +266,14 @@ template <typename RecordType, typename ReaderType> class analyzer_tmpl_t {
RecordType
create_idle_marker();
+ // Invoked after all interval state snapshots have been generated for the given
+ // shard_idx and before any merging or printing of interval snapshots. This
+ // invokes the finalize_interval_snapshots API for all tools that returned some
+ // non-null interval snapshot.
+ bool
+ finalize_interval_snapshots(analyzer_worker_data_t *worker, bool parallel,
+ int shard_idx = 0);
+
// Invoked when the given interval finishes during serial or parallel
// analysis of the trace. For parallel analysis, the shard_id
// parameter should be set to the shard_id for which the interval
@@ -317,7 +325,7 @@ template <typename RecordType, typename ReaderType> class analyzer_tmpl_t {
// that map to the same final whole-trace interval.
bool
merge_shard_interval_results(
-        std::vector<std::queue<typename analysis_tool_tmpl_t<RecordType>::interval_state_snapshot_t *>>
+        std::vector<std::vector<typename analysis_tool_tmpl_t<RecordType>::interval_state_snapshot_t *>>
            &intervals,
        std::vector<typename analysis_tool_tmpl_t<RecordType>::interval_state_snapshot_t
@@ -350,13 +358,6 @@ template <typename RecordType, typename ReaderType> class analyzer_tmpl_t {
uint64_t
get_current_microseconds();
- void
- drain_interval_snapshot_queue_to_vector(
-        std::queue<typename analysis_tool_tmpl_t<RecordType>::interval_state_snapshot_t *>
-            &que,
-        std::vector<
-            typename analysis_tool_tmpl_t<RecordType>::interval_state_snapshot_t *> &vec);
-
bool success_;
scheduler_tmpl_t<RecordType, ReaderType> scheduler_;
std::string error_string_;
diff --git a/clients/drcachesim/tests/trace_interval_analysis_unit_tests.cpp b/clients/drcachesim/tests/trace_interval_analysis_unit_tests.cpp
index ec5546f32e5..ac684d21594 100644
--- a/clients/drcachesim/tests/trace_interval_analysis_unit_tests.cpp
+++ b/clients/drcachesim/tests/trace_interval_analysis_unit_tests.cpp
@@ -204,6 +204,7 @@ class dummy_analysis_tool_t : public analysis_tool_t {
analysis_tool_t::interval_state_snapshot_t *
generate_interval_snapshot(uint64_t interval_id) override
{
+ saw_serial_generate_snapshot_ = true;
++generate_snapshot_count_;
return nullptr;
}
@@ -236,7 +237,27 @@ class dummy_analysis_tool_t : public analysis_tool_t {
generate_shard_interval_snapshot(void *shard_data, uint64_t interval_id) override
{
++generate_snapshot_count_;
- return nullptr;
+ // We generate a snapshot here, but we clear them all in
+ // finalize_interval_snapshots to test that scenario.
+ auto *snapshot = new interval_state_snapshot_t();
+ return snapshot;
+ }
+ bool
+ finalize_interval_snapshots(
+ std::vector<analysis_tool_t::interval_state_snapshot_t *> &interval_snapshots)
+ {
+ if (saw_serial_generate_snapshot_) {
+ error_string_ = "Did not expect finalize_interval_snapshots call in serial "
+ "mode which does not generate any snapshot.";
+ return false;
+ }
+ for (auto snapshot : interval_snapshots) {
+ delete snapshot;
+ }
+ // We clear the snapshots here so that there will be no
+ // combine_interval_snapshots or print_interval_results calls.
+ interval_snapshots.clear();
+ return true;
}
analysis_tool_t::interval_state_snapshot_t *
combine_interval_snapshots(
@@ -269,6 +290,7 @@ class dummy_analysis_tool_t : public analysis_tool_t {
private:
int generate_snapshot_count_;
+ bool saw_serial_generate_snapshot_ = false;
};
#define SERIAL_TID 0
@@ -361,6 +383,8 @@ struct recorded_snapshot_t : public analysis_tool_t::interval_state_snapshot_t {
// Stores the shard_id recorded by the test tool. Compared with the shard_id
// stored by the framework in the base struct.
int64_t tool_shard_id;
+ // Stores whether this snapshot was seen by finalize_interval_snapshots.
+ bool saw_finalize_call = false;
};
// Test analysis_tool_t that records information about when the
@@ -450,6 +474,26 @@ class test_analysis_tool_t : public analysis_tool_t {
++outstanding_snapshots_;
return snapshot;
}
+ bool
+ finalize_interval_snapshots(
+ std::vector<analysis_tool_t::interval_state_snapshot_t *> &interval_snapshots)
+ {
+ for (auto snapshot : interval_snapshots) {
+ if (snapshot == nullptr) {
+ error_string_ =
+ "Did not expect a nullptr snapshot in finalize_interval_snapshots";
+ return false;
+ }
+ auto recorded_snapshot = dynamic_cast<recorded_snapshot_t *>(snapshot);
+ if (recorded_snapshot->saw_finalize_call) {
+ error_string_ = "interval_state_snapshot_t presented "
+ "to finalize_interval_snapshots multiple times";
+ return false;
+ }
+ recorded_snapshot->saw_finalize_call = true;
+ }
+ return true;
+ }
analysis_tool_t::interval_state_snapshot_t *
combine_interval_snapshots(
const std::vector
@@ -486,6 +530,11 @@ class test_analysis_tool_t : public analysis_tool_t {
recorded_snapshot->get_shard_id());
return nullptr;
}
+ if (!recorded_snapshot->saw_finalize_call) {
+ error_string_ =
+ "combine_interval_snapshots saw non-finalized snapshot";
+ return nullptr;
+ }
result->component_intervals.insert(
result->component_intervals.end(),
recorded_snapshot->component_intervals.begin(),