From ba7cf50970220c818d4c28e376ff8fcde09d7370 Mon Sep 17 00:00:00 2001
From: John Giorshev <johngiorshev@gmail.com>
Date: Mon, 11 Sep 2023 19:46:50 -0400
Subject: [PATCH] Bound unique sort case  (#43)

---
 .github/workflows/tests.yml |   2 +-
 CMakeLists.txt              |  15 +++-
 perf/gen_perf_stats.bash    |   3 +
 perf/perf.md                |  54 +++++++--------
 readme.md                   |  24 +++----
 src/algo_utils.hpp          |   1 +
 src/args.hpp                |  45 +++++++++---
 src/test.cpp                | 132 ++++++++++++++++++------------------
 src/token.hpp               |  41 ++++++-----
 src/uniqueness_utils.hpp    |  18 ++---
 10 files changed, 183 insertions(+), 152 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 58ffb9d..292828c 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -10,7 +10,7 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt-get update
-          sudo apt-get install -y build-essential libboost-test-dev cmake pkg-config libpcre2-dev libncursesw5-dev
+          sudo apt-get install -y build-essential libboost-test-dev cmake pkg-config libpcre2-dev libncursesw5-dev libtbb-dev
       - name: Build project
         run: |
           cd build
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 36a5e97..18b3d59 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,7 +14,7 @@ project(choose)
 add_executable(choose src/main.cpp)
 target_include_directories(choose PRIVATE src)
 
-target_compile_options(choose PRIVATE -Wall -Wextra -O3 -fno-rtti)
+target_compile_options(choose PRIVATE -Wall -Wextra -O3)
 
 set(CURSES_NEED_NCURSES TRUE)
 set(CURSES_NEED_WIDE TRUE)
@@ -29,6 +29,13 @@ pkg_check_modules(PCRE REQUIRED libpcre2-8)
 target_include_directories(choose PRIVATE ${PCRE_INCLUDEDIR})
 target_link_libraries(choose PRIVATE ${PCRE_LIBRARIES})
 
+# https://stackoverflow.com/a/74755391/15534181
+# optional link here since sometimes it's ok not to have. this is the least invasive
+find_package(TBB QUIET)
+if(TBB_FOUND)
+  target_link_libraries(choose PRIVATE TBB::tbb)
+endif()
+
 if (NO_SCROLL_BORDER)
   target_compile_definitions(choose PRIVATE
     CHOOSE_NO_SCROLL_BORDER
@@ -52,7 +59,7 @@ if(BUILD_TESTING)
   add_executable(unit_tests src/test.cpp)
   target_include_directories(unit_tests PRIVATE src)
   # -O0 gives better coverage info than -Og
-  target_compile_options(unit_tests PRIVATE -Wall -Wextra -O0 -g -fno-rtti)
+  target_compile_options(unit_tests PRIVATE -Wall -Wextra -O0 -g)
 
   target_include_directories(unit_tests PRIVATE ${Boost_UNIT_TEST_FRAMEWORK_HEADER_NAME})
   target_link_libraries(unit_tests PRIVATE ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY})
@@ -64,6 +71,10 @@ if(BUILD_TESTING)
   target_include_directories(unit_tests PRIVATE ${PCRE_INCLUDEDIR})
   target_link_libraries(unit_tests PRIVATE ${PCRE_LIBRARIES})
 
+  if(TBB_FOUND)
+    target_link_libraries(unit_tests PRIVATE TBB::tbb)
+  endif()
+
   if (DISABLE_FIELD)
     target_compile_definitions(unit_tests PRIVATE
       CHOOSE_DISABLE_FIELD
diff --git a/perf/gen_perf_stats.bash b/perf/gen_perf_stats.bash
index 23b4e16..9c84f72 100755
--- a/perf/gen_perf_stats.bash
+++ b/perf/gen_perf_stats.bash
@@ -5,6 +5,9 @@
 set -e
 sudo echo -n '' # do nothing. perf requires sudo. doing the prompt at the beginning
 
+# fairer performance for sort
+export LC_ALL=C
+
 # e.g. -n makes the benchmarks apply sorting and uniqueness numerically (but not for just uniqueness since awk doesn't support this)
 COMP_FLAGS=
 
diff --git a/perf/perf.md b/perf/perf.md
index b3af174..cad5534 100644
--- a/perf/perf.md
+++ b/perf/perf.md
@@ -20,7 +20,7 @@ Also note the compile time option called `DISABLE_FIELD`. It disables the `--fie
 
 ### Sorting, and Sorting + Uniqueness
 
-In most cases, `choose` is faster than `sort` and `sort -u` at sorting and sorting + uniqueness, respectively. Note however that the c locale isn't used for comparisons, and again it is the task clock that is reported.
+`sort` is using naive byte order (via `LC_ALL=C`), as this is the fairest. `sort` is faster than `choose` at sorting. If truncation is leveraged, or if there are many duplicates (when applying uniqueness as well), then `choose` is faster than `sort`.
 
 ## Input Data
 
@@ -82,80 +82,80 @@ garbage,5,garbage
 
 ### Versions
 ```txt
-choose 0.3.0, ncurses 6.1.20180127, pcre2 10.42
-pcre2grep version 10.42 2022-12-11
-sed (GNU sed) 4.4
-GNU Awk 4.1.4, API: 1.1 (GNU MPFR 4.0.1, GNU MP 6.1.2)
-sort (GNU coreutils) 8.28
+choose 0.3.0, ncurses 6.2.20200212, pcre2 10.43
+pcre2grep version 10.43-DEV 2023-04-14
+sed (GNU sed) 4.7
+GNU Awk 5.0.1, API: 2.0 (GNU MPFR 4.0.2, GNU MP 6.2.0)
+sort (GNU coreutils) 8.30
 ```
 ### Specs
 ```txt
 5.15.90.1-microsoft-standard-WSL2
-Intel(R) Core(TM) i5-8600K CPU @ 3.60GHz
-ram: 8116584 kB
+AMD Ryzen 7 3800X 8-Core Processor
+ram: 16331032 kB
 ```
 
 ### Grepping
 
 | (ms)             | choose | pcre2grep  |
 |------------------|--------|------------|
-| plain_text       | 238.334100 | 246.104400 | 
-| test_repeated    | 1536.390100 | 1446.540000 | 
-| no_duplicates    | 321.083200 | 313.054700 | 
+| plain_text       | 247.17 | 269.69 | 
+| test_repeated    | 1620.34 | 1583.77 | 
+| no_duplicates    | 323.59 | 370.16 | 
 
 ### Stream Editing
 
 | (ms)             | choose | sed  |
 |------------------|--------|------|
-| plain_text       | 173.019600 | 156.455300 | 
-| test_repeated    | 2563.258500 | 1024.358400 | 
-| no_duplicates    | 8.424300 | 46.834200 | 
+| plain_text       | 179.57 | 135.57 | 
+| test_repeated    | 2725.50 | 1157.39 | 
+| no_duplicates    | 5.10 | 44.00 | 
 
 (here is a cherry picked great case for choose compared to sed)
 
 | (ms)             | choose | sed (with newline delimiter) |
 |------------------|--------|------|
-| no_duplicates    | 8.245600 | 437.878300 | 
+| no_duplicates    | 5.13 | 543.93 | 
 
 (a special case, where choose cheats by using a literal replacement string)
 
 | (ms)             | choose (delimiter sub) | sed |
 |------------------|------------------------|-----|
-| test_repeated    | 1457.271000 | 1010.783600 | 
+| test_repeated    | 1521.93 | 1156.64 | 
 
 ### Sorting 
 
 | (ms)             | choose | sort |
 |------------------|--------|------|
-| plain_text       | 694.556000 | 1905.257700 | 
-| test_repeated    | 2226.087400 | 1987.113500 | 
-| no_duplicates    | 2120.992700 | 5092.179100 | 
+| plain_text       | 1628.88 | 448.06 | 
+| test_repeated    | 1850.89 | 1616.13 | 
+| no_duplicates    | 3714.94 | 1036.93 | 
 
 (a special case that leverages truncation)
 
 | (ms)             | choose -s --out 5 | sort \| head -n 5 |
 |------------------|--------|------|
-| no_duplicates    | 251.069600 | 5063.083100 | 
+| no_duplicates    | 354.20 | 1059.81 | 
 
 ### Uniqueness 
 
 | (ms)             | choose | awk |
 |------------------|--------|-----|
-| plain_text       | 114.649800 | 208.971700 | 
-| test_repeated    | 578.412600 | 972.325200 | 
-| no_duplicates    | 2480.435700 | 1477.912300 | 
+| plain_text       | 111.95 | 214.41 | 
+| test_repeated    | 565.31 | 1147.75 | 
+| no_duplicates    | 2340.37 | 1496.42 | 
 
 ### Sorting and Uniqueness   -u
 
 | (ms)             | choose | sort |
 |------------------|--------|------|
-| plain_text       | 106.970100 | 1906.801600 | 
-| test_repeated    | 574.516000 | 1961.279100 | 
-| no_duplicates    | 4165.485200 | 5670.807600 | 
+| plain_text       | 122.80 | 440.57 | 
+| test_repeated    | 558.86 | 1640.79 | 
+| no_duplicates    | 5742.11 | 1168.84 | 
 
 
 ### Sorting and Uniqueness based on field   -u
 
 | (ms)             | choose | sort |
 |------------------|--------|------|
-| csv_field        | 1779.289000 | 1987.503500 | 
+| csv_field        | 2770.27 | 474.02 | 
diff --git a/readme.md b/readme.md
index b69f33c..14c0c02 100644
--- a/readme.md
+++ b/readme.md
@@ -18,7 +18,7 @@ Also it's fast. See benchmarks [here](./perf/perf.md) comparing choose to other
 
 ## Install
 ```bash
-sudo apt-get install cmake pkg-config libpcre2-dev libncursesw5-dev
+sudo apt-get install cmake pkg-config libpcre2-dev libncursesw5-dev libtbb-dev
 git clone https://github.com/jagprog5/choose.git && cd choose
 make install
 source ~/.bashrc
@@ -187,7 +187,7 @@ ccc
 
 # Monitoring
 
-Suppose there's an input that's running for a _really_ long time. For example, a python http server, with an output like this:
+Suppose there's an input that's running for a _long_ time. For example, a python http server, with an output like:
 
 ```txt
 127.0.0.1 - - [08/Sep/2023 22:11:48] "GET /tester.txt HTTP/1.1" 200 -
@@ -199,8 +199,11 @@ The goal is to monitor the output and print unique IPs:
 
 ```bash
 # serves current dir on 8080
-python3 -m http.server --directory . 8080 2>&1 >/dev/null\
-  | choose --match --multiline -r "^(?>(?:\d++\.){3})\d++" --unique-limit 1000 --unique-expiry 900 --flush
+python3 -u -m http.server --directory . 8080 2>&1 >/dev/null\
+  | choose --match --multiline -r '^(?:\d++\.){3}\d++' \
+      --unique-limit 1000\
+      --unique-expiry 900\
+      --flush
 ```
 
 This applies a form of bounded uniqueness in the face of an infinite input; tokens can be forgotten based on space and time constraints, meaning they can pass to the output again:
@@ -249,20 +252,10 @@ sed can't make a substitution if the target contains the delimiter (a newline ch
 `ch_hist` is a bash function installed with choose. It allows a previous command to be re-run, like [fzf](https://github.com/junegunn/fzf).
 
 ```txt
-  git log --oneline
-  top
-  cat temp.txt
-  git commit --amend
-  git push
-  clear
-  cd ~/
-  ls
-  cd choose/
-  git pull
   cd build
   rm -rf *
   cmake ..
-  sudo make install
+  make install
 > choose -h
 ┌────────────────────────────────────────────────────────────────────────────────┐
 │Select a line to edit then run.                                                 │
@@ -271,6 +264,7 @@ sed can't make a substitution if the target contains the delimiter (a newline ch
 ## Examples
 
 ```bash
+ch_hist
 ch_hist git
 ch_hist hello there
 ```
diff --git a/src/algo_utils.hpp b/src/algo_utils.hpp
index 55e00e8..b32058f 100644
--- a/src/algo_utils.hpp
+++ b/src/algo_utils.hpp
@@ -46,6 +46,7 @@ void stable_partial_sort(ExecutionPolicy&& policy, it begin, it middle, it end,
 bool general_numeric_compare(const char* lhs_begin, const char* lhs_end, const char* rhs_begin, const char* rhs_end) { //
   float lhs, rhs;
   // if from_chars isn't found, get a newer compiler. e.g.
+  //    add-apt-repository -y ppa:ubuntu-toolchain-r/test
   //    apt-get install g++-11
   //    cd build && cmake .. -DCMAKE_C_COMPILER=gcc-11 -DCMAKE_CXX_COMPILER=g++-11
   std::from_chars_result lhs_ret = std::from_chars(lhs_begin, lhs_end, lhs, std::chars_format::general);
diff --git a/src/args.hpp b/src/args.hpp
index 25161b8..7e30ae1 100644
--- a/src/args.hpp
+++ b/src/args.hpp
@@ -117,6 +117,13 @@ struct Arguments {
     return is_direct_output() && !unique;
   }
 
+  // the elements in output vector are being inserted with any excess being discarded
+  bool mem_is_bounded() const {
+    return out_end.has_value()   //
+           && !truncate_no_bound //
+           && (unique ? sort && unique_type == sort_type : true);
+  }
+
   void drop_warning() {
     if (this->can_drop_warn) {
       this->can_drop_warn = false;
@@ -134,8 +141,9 @@ struct Arguments {
 namespace {
 
 struct UncompiledCodes {
-  // all args must be parsed before the args are compiled
-  // the uncompiled args are stored here before transfer to the Arguments output.
+  // all args must be parsed before the args are compiled. the uncompiled args
+  // are stored here before transfer to the Arguments output. this also contains
+  // fields that aren't needed in the rest of the program, past the arg parsing
   uint32_t re_options = PCRE2_LITERAL;
   std::vector<uncompiled::UncompiledOrderedOp> ordered_ops;
 
@@ -152,6 +160,8 @@ struct UncompiledCodes {
   bool bout_delimiter_set = false;
   bool primary_set = false;
 
+  bool is_bounded_query = false;
+
   void compile(Arguments& output) const {
     for (const uncompiled::UncompiledOrderedOp& op : ordered_ops) {
       OrderedOp oo = uncompiled::compile(op, re_options);
@@ -315,6 +325,9 @@ void print_help_message() {
       "                must have at least one digit. parse failures are smallest\n"
       "        -i, --ignore-case\n"
       "                make the positional argument case-insensitive\n"
+      "        --is-bounded\n"
+      "                prints a line indicating if the memory usage is bounded from\n"
+      "                truncation (--out/--tail), then exits\n"
       "        --load-factor <positive float, default: " choose_xstr(UNIQUE_LOAD_FACTOR_DEFAULT) ">\n"
       "                if a hash table is used for uniqueness, set the max load factor\n"
       "        --locale <locale>\n"
@@ -375,19 +388,19 @@ void print_help_message() {
       "                on tui confirmed selection, do not exit; but still flush the\n"
       "                current selection to the output as a batch\n"
       "        --truncate-no-bound\n"
-      "                if truncation is specified (--out/--tail) and uniqueness is not\n"
-      "                specified, then choose only retains the relevant n values in\n"
-      "                memory. This is only faster for small values of n, as elements\n"
-      "                are shifted within this storage space. If n is large, this\n"
-      "                option should be used to disable this optimization, leading to\n"
-      "                faster speed but more space used\n"
+      "                if truncation is specified (--out/--tail) then choose may retain\n"
+      "                only the relevant n values in memory. see --is-bounded. this is\n"
+      "                faster for small values of n, as elements are shifted within\n"
+      "                this storage space. If n is large, this option should be used to\n"
+      "                disable this optimization\n"
       "        -u, --unique\n"
       "                remove duplicate input tokens. leaves first occurrences. applied\n"
       "                before sorting\n"
       "        --uniq\n"
       "                unrelated to any other uniqueness options. after sorting, remove\n"
-      "                consecutive duplicate elements. requires --sort. ignored by\n"
-      "                truncation --out/--tail (use normal -u in these cases instead)\n"
+      "                consecutive duplicate elements. requires --sort. ignored if\n"
+      "                memory is bounded from truncation (see --is-bounded. use normal\n"
+      "                -u in these cases instead)\n"
       "        --unique-numeric\n"
       "                apply uniqueness numerically. implies -u\n"
       "        --unique-expiry <# seconds>\n"
@@ -396,9 +409,11 @@ void print_help_message() {
       "        --unique-general-numeric\n"
       "                apply uniqueness general numerically. implies -u\n"
       "        --unique-limit <#tokens>\n"
-      "                implies -u. forget least recently used tokens\n"
+      "                implies -u. forget least recently used tokens. ignored if memory\n"
+      "                is bounded from truncation (see --is-bounded)\n"
       "        --unique-use-set\n"
       "                implies -u. apply uniqueness with a tree instead of a hash table\n"
+      "                ignored if memory is bounded from truncation (see --is-bounded)\n"
       "        --use-delimiter\n"
       "                don't ignore a delimiter at the end of the input\n"
       "        --utf\n"
@@ -527,6 +542,7 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out
         {"flip", no_argument, NULL, 0},
         {"flush", no_argument, NULL, 0},
         {"ignore-case", no_argument, NULL, 'i'},
+        {"is-bounded", no_argument, NULL, 0},
         {"multi", no_argument, NULL, 'm'},
         {"multiline", no_argument, NULL, 0},
         {"match", no_argument, NULL, 0},
@@ -776,6 +792,8 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out
           } else if (strcmp("unique-general-numeric", name) == 0) {
             ret.unique = true;
             ret.unique_type = general_numeric;
+          } else if (strcmp("is-bounded", name) == 0) {
+            uncompiled_output.is_bounded_query = true;
           } else if (strcmp("multiline", name) == 0) {
             uncompiled_output.re_options &= ~PCRE2_LITERAL;
             uncompiled_output.re_options |= PCRE2_MULTILINE;
@@ -998,6 +1016,11 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out
     }
   }
 
+  if (uncompiled_output.is_bounded_query) {
+    int exit_code = puts(ret.mem_is_bounded() ? "yes" : "no") < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
+    exit(exit_code);
+  }
+
   if (isatty(fileno(ret.input))) {
     int exit_code = puts("Try 'choose --help' for more information.") < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
     exit(exit_code);
diff --git a/src/test.cpp b/src/test.cpp
index 5aee06d..4a520de 100644
--- a/src/test.cpp
+++ b/src/test.cpp
@@ -15,6 +15,11 @@ std::optional<size_t> output_size_bound_testing;
 /*
 valgrind should give a clean bill of health:
 valgrind --leak-check=full --show-leak-kinds=all --track-origins=yes --verbose --log-file=valgrind-out.txt ./unit_tests
+with one exception:
+on some platforms, tbb is used in the backend for std::execution.
+if tbb is used, then valgrind might indicate memory leaks. it uses an arena thread that doesn't clean up in time before termination.
+looks like "by 0x48D6B6F: ??? (in /usr/lib/x86_64-linux-gnu/libtbb.so.2)"
+ - https://github.com/oneapi-src/oneTBB/issues/206
 */
 
 using namespace choose;
@@ -579,6 +584,17 @@ BOOST_AUTO_TEST_CASE(unique_limit_lru) {
   BOOST_REQUIRE_EQUAL(out, correct_output);
 }
 
+// simulate passage of time
+struct FakeClock {
+  using duration = std::chrono::nanoseconds;
+  using rep = duration::rep;
+  using period = duration::period;
+  using time_point = std::chrono::time_point<FakeClock, duration>;
+  static time_point now() { return FakeClock::seconds_since_epoch; }
+
+  inline static time_point seconds_since_epoch; // set this
+};
+
 BOOST_AUTO_TEST_CASE(unique_expiry) {
   // visual inspections also include:
   // (echo "1";echo "2";echo "3";sleep 1;echo "2";sleep 1;echo "1";echo "2";echo "3") | choose --unique-limit 1000 --unique-expiry 2 --flush
@@ -595,80 +611,62 @@ BOOST_AUTO_TEST_CASE(unique_expiry) {
     return lhs.buffer == rhs.buffer;
   };
 
-  auto delay = std::chrono::milliseconds(10);
+  auto delay = FakeClock::duration(64); // time until staleness
 
   choose::Token token_1(to_vec("1"));
   choose::Token token_2(to_vec("2"));
+  choose::Token token_3(to_vec("3"));
 
-  std::atomic_bool has_errors = false;
-  // this test is tempermental for CI / valgrind, where the program is run in an
-  // environment where it can be paused unexpectantly or otherwise take too
-  // long. hence use of local_assert
-  auto local_assert = [&](bool p) {
-    if (!p) {
-      has_errors = true;
-    }
-  };
-
-  // running these tests in parallel since they incur delays.
-  std::thread set_thread([&]() {
-    ForgetfulSet<choose::Token, decltype(cmp)> s(cmp, 2, delay);
+  {
+    FakeClock::seconds_since_epoch = FakeClock::time_point();
+    ForgetfulSet<choose::Token, decltype(cmp), FakeClock> s(cmp, 2, delay);
     s.setup();
     // first insertions are successful
-    local_assert(s.insert(token_2).second);
-    local_assert(s.insert(token_1).second);
-    auto start_time1 = std::chrono::steady_clock::now();
-
-    // spin lock until delay has passed.
-    // deliberately not using sleep_until / sleep_for.
-    // this test is impossible to perfect unless running on a hard real time OS
-    while (start_time1 + delay > std::chrono::steady_clock::now()) {
-      local_assert(!s.insert(token_1).second);
+    BOOST_REQUIRE(s.insert(token_1).second);
+    BOOST_REQUIRE(s.insert(token_2).second);
+    BOOST_REQUIRE(s.insert(token_3).second);
+
+    // refresh token 1 over a staleness period, until 2 is now stale
+    for (int i = 0; i < 4; ++i) {
+      BOOST_REQUIRE(!s.insert(token_2).second);
+      FakeClock::seconds_since_epoch += delay / 4;
     }
 
     // correct token was being refreshed
-    local_assert(!s.insert(token_1).second);
-    local_assert(s.insert(token_2).second);
-    auto start_time2 = std::chrono::steady_clock::now();
-    // spin lock until delay has passed
-    while (start_time2 + delay > std::chrono::steady_clock::now()) {
-    }
-    local_assert(s.insert(token_1).second);
-    local_assert(s.insert(token_2).second);
-  });
+    BOOST_REQUIRE(s.insert(token_1).second);
+    BOOST_REQUIRE(!s.insert(token_2).second);
+    BOOST_REQUIRE(s.insert(token_3).second);
+
+    FakeClock::seconds_since_epoch += delay;
+    BOOST_REQUIRE(s.insert(token_1).second);
+    BOOST_REQUIRE(s.insert(token_2).second);
+    BOOST_REQUIRE(s.insert(token_3).second);
+  }
 
-  std::thread unordered_thread([&]() {
-    ForgetfulUnorderedSet<choose::Token, decltype(hash), decltype(eq)> s(hash, eq, 1, 2, delay);
+  {
+    FakeClock::seconds_since_epoch = FakeClock::time_point();
+    ForgetfulUnorderedSet<choose::Token, decltype(hash), decltype(eq), FakeClock> s(hash, eq, 1, 2, delay);
     s.setup();
     // first insertions are successful
-    local_assert(s.insert(token_2).second);
-    local_assert(s.insert(token_1).second);
-    auto start_time1 = std::chrono::steady_clock::now();
-
-    // spin lock until delay has passed.
-    // deliberately not using sleep_until / sleep_for.
-    // this test is impossible to perfect unless running on a hard real time OS
-    while (start_time1 + delay > std::chrono::steady_clock::now()) {
-      local_assert(!s.insert(token_1).second);
+    BOOST_REQUIRE(s.insert(token_1).second);
+    BOOST_REQUIRE(s.insert(token_2).second);
+    BOOST_REQUIRE(s.insert(token_3).second);
+
+    // refresh token 1 over a staleness period, until 2 is now stale
+    for (int i = 0; i < 4; ++i) {
+      BOOST_REQUIRE(!s.insert(token_2).second);
+      FakeClock::seconds_since_epoch += delay / 4;
     }
 
     // correct token was being refreshed
-    local_assert(!s.insert(token_1).second);
-    local_assert(s.insert(token_2).second);
-    auto start_time2 = std::chrono::steady_clock::now();
-    // spin lock until delay has passed
-    while (start_time2 + delay > std::chrono::steady_clock::now()) {
-    }
-    local_assert(s.insert(token_1).second);
-    local_assert(s.insert(token_2).second);
-  });
-
-  set_thread.join();
-  unordered_thread.join();
-
-  if (has_errors) {
-    std::cerr << boost::unit_test::framework::current_test_case().full_name().c_str();
-    std::cerr << ": test failed due to timing. generally ignore this\n";
+    BOOST_REQUIRE(s.insert(token_1).second);
+    BOOST_REQUIRE(!s.insert(token_2).second);
+    BOOST_REQUIRE(s.insert(token_3).second);
+
+    FakeClock::seconds_since_epoch += delay;
+    BOOST_REQUIRE(s.insert(token_1).second);
+    BOOST_REQUIRE(s.insert(token_2).second);
+    BOOST_REQUIRE(s.insert(token_3).second);
   }
 }
 
@@ -920,30 +918,34 @@ BOOST_AUTO_TEST_CASE(out_tail) {
 }
 
 BOOST_AUTO_TEST_CASE(sort_unique_out) {
-  // entirety of the input is read, but the mem size should not exceed the number of
-  // unique elements from the input
-  OutputSizeBoundFixture f(4);
+  OutputSizeBoundFixture f(2);
   choose_output out = run_choose("this\nis\nis\na\na\ntest", {"--sort", "--unique", "--out=2"});
   choose_output correct_output{to_vec("a\nis\n")};
   BOOST_REQUIRE_EQUAL(out, correct_output);
 }
 
+BOOST_AUTO_TEST_CASE(sort_unique_out_diff) {
+  choose_output out = run_choose("5\n-5\n-5\n-10\n-10\n0", {"--sort", "--unique-numeric", "--out=2"});
+  choose_output correct_output{to_vec("-10\n-5\n")};
+  BOOST_REQUIRE_EQUAL(out, correct_output);
+}
+
 BOOST_AUTO_TEST_CASE(sort_unique_out_min) {
-  OutputSizeBoundFixture f(4);
+  OutputSizeBoundFixture f(2);
   choose_output out = run_choose("this\nis\nis\na\na\ntest", {"--sort", "--unique", "--out=1,2"});
   choose_output correct_output{to_vec("is\n")};
   BOOST_REQUIRE_EQUAL(out, correct_output);
 }
 
 BOOST_AUTO_TEST_CASE(sort_unique_tail) {
-  OutputSizeBoundFixture f(4);
+  OutputSizeBoundFixture f(2);
   choose_output out = run_choose("this\nis\nis\na\na\ntest", {"--sort", "--unique", "--tail=2"});
   choose_output correct_output{to_vec("test\nthis\n")};
   BOOST_REQUIRE_EQUAL(out, correct_output);
 }
 
 BOOST_AUTO_TEST_CASE(sort_unique_tail_min) {
-  OutputSizeBoundFixture f(4);
+  OutputSizeBoundFixture f(2);
   choose_output out = run_choose("this\nis\nis\na\na\ntest", {"--sort", "--unique", "--tail=1,2"});
   choose_output correct_output{to_vec("test\n")};
   BOOST_REQUIRE_EQUAL(out, correct_output);
diff --git a/src/token.hpp b/src/token.hpp
index f9771b7..4ce2d95 100644
--- a/src/token.hpp
+++ b/src/token.hpp
@@ -182,7 +182,7 @@ struct TokenOutputStream {
   }
 };
 
-// leads to an exit unless this is a unit test
+// exit unless this is a unit test
 struct termination_request : public std::exception {};
 
 namespace {
@@ -243,14 +243,7 @@ std::vector<Token> create_tokens(choose::Arguments& args) {
   const bool sort = args.sort;
   const Comparison sort_type = args.sort_type;
   const bool sort_reversed = args.sort_reverse;
-
-  // the elements in output are being inserted with any excess being discarded.
-  // this branch is incompatible with uniqueness since the data structures point
-  // within output, and if things are moving around then the iterators' elements
-  // are also moved. this makes sense anyway, since if uniqueness is specified,
-  // then choose needs to keep track of what has been seen before (can't be
-  // bounded). this var also could be named "output_size_is_bounded"
-  const bool output_is_shifting = args.out_end.has_value() && !unique && !args.truncate_no_bound;
+  const bool mem_is_bounded = args.mem_is_bounded();
 
   char subject[args.buf_size]; // match buffer
   size_t subject_size = 0;     // how full is the buffer
@@ -439,7 +432,7 @@ std::vector<Token> create_tokens(choose::Arguments& args) {
 #ifndef CHOOSE_DISABLE_FIELD
         t.set_field(args.field, field_data);
 #endif
-        if (!output_is_shifting) {
+        if (!mem_is_bounded) {
           // typical case
           output.push_back(std::move(t));
           if (unique) {
@@ -450,24 +443,28 @@ std::vector<Token> create_tokens(choose::Arguments& args) {
             }
           }
         } else {
-          // output size is bounded.
-          // precondition here is that unique is false. but everything else should be handled
+          // output size is bounded. from --tail or --out
           if (sort) {
             // note that the sorting is reversed if tail is used. so this
             // handles tail and non tail cases. see UncompiledCodes.
-            // always a stable sort, given insertion position
             auto insertion_pos = std::upper_bound(output.begin(), output.end(), t, sort_comparison);
-            if (likely(output.size() == *args.out_end)) {
-              // a faster branch that avoids any vector realloc logic.
-              // it's basically a fixed length buffer at this point
-              while (insertion_pos < output.end()) {
-                std::swap(*insertion_pos++, t);
+            if (!unique || (insertion_pos == output.begin() || !consecutive_equality_predicate(insertion_pos[-1], t))) {
+              // uniqueness is not used, or t does not yet exist in output
+              if (likely(output.size() == *args.out_end)) {
+                while (insertion_pos < output.end()) {
+                  std::swap(*insertion_pos++, t);
+                }
+                return false;
+              } else {
+                output.insert(insertion_pos, std::move(t));
               }
-              return false;
             } else {
-              output.insert(insertion_pos, std::move(t));
+              // unique is being used and t already exists in the output
+              return false;
             }
           } else {
+            // unsorted memory bounded case.
+            // precondition unique is false (can't be applied in a mem bounded way)
             if (tail && likely(output.size() == *args.out_end)) {
               // same reasoning as above. fixed length buffer being moved around
               auto it = output.rbegin();
@@ -575,7 +572,7 @@ std::vector<Token> create_tokens(choose::Arguments& args) {
       } else {
         check_unique_then_append(); // result ignored
         // handle the case mentioned in check_unique_then_append
-        if (output_is_shifting && !sort && !tail) {
+        if (mem_is_bounded && !sort && !tail) {
           if (output.size() == *args.out_end) {
             return true;
           }
@@ -877,7 +874,7 @@ std::vector<Token> create_tokens(choose::Arguments& args) {
       }
     } else {
       // truncate the ends, leaving only the beginning elements
-      if (output_is_shifting) {
+      if (mem_is_bounded) {
         // sort and end truncation has already been applied
       } else {
         // truncate the end
diff --git a/src/uniqueness_utils.hpp b/src/uniqueness_utils.hpp
index 73937a7..7c9efc1 100644
--- a/src/uniqueness_utils.hpp
+++ b/src/uniqueness_utils.hpp
@@ -15,9 +15,9 @@ namespace {
 // this manages the least recently used logic and expiry logic
 // if a new element is inserted into a uniqueness set, it should also be inserted here.
 // upon insertion here, the least recently used element will be removed from the uniqueness set if capacity is reached.
-template <typename T>
+template <typename T, typename Clock_T>
 struct ForgetfulManager {
-  using time_point_t = std::chrono::time_point<std::chrono::steady_clock>;
+  using time_point_t = std::chrono::time_point<Clock_T>;
   using duration_t = typename time_point_t::duration;
 
   T* obj = 0;
@@ -52,7 +52,7 @@ struct ForgetfulManager {
   void remove_old() {
     if (expiry != duration_t::zero()) {
       while (!elems.empty()) {
-        if (std::chrono::steady_clock::now() - elems.back().t >= expiry) {
+        if (Clock_T::now() - elems.back().t >= expiry) {
           obj->erase(elems.back().it);
           elems.pop_back();
         } else {
@@ -65,7 +65,7 @@ struct ForgetfulManager {
   // if an insertion into the uniqueness set failed because it already existed,
   // call this function to update the recent-ness of that element
   void refresh(refresh_handle it) {
-    it->t = std::chrono::steady_clock::now();
+    it->t = Clock_T::now();
     if (it == elems.begin()) {
       // no splice is needed. refreshed element is already most recent
     } else {
@@ -76,7 +76,7 @@ struct ForgetfulManager {
 
   // call when an insertion into the uniqueness set succeeds
   void insert(typename T::iterator it) {
-    Entry e{it, std::chrono::steady_clock::now()};
+    Entry e{it, Clock_T::now()};
     elems.push_front(e);
     if (likely(elems.size() > n)) {
       obj->erase(elems.back().it);
@@ -93,7 +93,7 @@ struct ForgetfulManager {
 } // namespace
 
 // only remembers last n elements. least recently used forgotten
-template <typename Key, typename Compare>
+template <typename Key, typename Compare, typename Clock_T = std::chrono::steady_clock>
 struct ForgetfulSet {
   struct KeyInternal {
     Key key;
@@ -106,7 +106,7 @@ struct ForgetfulSet {
   };
 
   std::set<KeyInternal, Compare> s;
-  ForgetfulManager<decltype(s)> lru;
+  ForgetfulManager<decltype(s), Clock_T> lru;
 
  public:
   ForgetfulSet(const Compare& comp, //
@@ -157,7 +157,7 @@ struct ForgetfulSet {
 };
 
 // largely copy paste from ForgetfulSet.
-template <typename Key, typename Hash, typename KeyEqual>
+template <typename Key, typename Hash, typename KeyEqual, typename Clock_T = std::chrono::steady_clock>
 struct ForgetfulUnorderedSet {
   struct KeyInternal {
     Key key;
@@ -170,7 +170,7 @@ struct ForgetfulUnorderedSet {
   };
 
   std::unordered_set<KeyInternal, Hash, KeyEqual> s;
-  ForgetfulManager<decltype(s)> lru;
+  ForgetfulManager<decltype(s), Clock_T> lru;
 
  public:
   ForgetfulUnorderedSet(const Hash& hash, //