.

jagprog5 · Aug 5, 2023 · cc1ae5b · cc1ae5b
1 parent 8cd6581
commit cc1ae5b
Show file tree

Hide file tree

Showing 6 changed files with 147 additions and 54 deletions.
diff --git a/readme.md b/readme.md
@@ -141,9 +141,9 @@ cat some_content | choose -f "test" --head 5
 
 The former is restricted to working with `lines`, whereas the latter works with `tokens`. Tokens are arbitrary and can contain newline characters, whereas lines can't.
 
-# Ordering and Uniqueness
+# Sorting and Uniqueness
 
-choose allows for lexicographical comparison and **user defined** comparison between tokens. Using this comparison, it can apply ordering and uniqueness.
+choose allows for lexicographical comparison and **user defined** comparison between tokens. Using this comparison, it can apply sorting and uniqueness.
 
 For example, this command sorts the input and leaves only unique entries:
 
@@ -198,6 +198,8 @@ Banana
 </tr>
 </table>
 
+Additionally, if the output is truncated via `--out`, a partial sort is applied, so that only the range of elements that end up in the output is sorted.
+
 # Matching
 
 Rather than specifying how tokens are terminated, the tokens themselves can be matched for. A match and each match group form a token. This is like `grep -o`.

diff --git a/src/algo_utils.hpp b/src/algo_utils.hpp
@@ -0,0 +1,42 @@
+#pragma once
+
+#include <algorithm>
+#include <iterator>
+#include <utility>
+#include <vector>
+
+namespace choose {
+
+// stable counterpart of std::partial_sort, for random access iterators.
+//
+// after the call, [begin, middle) holds the (middle - begin) smallest elements of
+// [begin, end) in ascending order according to comp. elements that compare
+// equivalent keep their original relative order. the remaining elements end up
+// in [middle, end) in an unspecified order, so the range stays a permutation of
+// its input.
+//
+// begin, middle, end: the range, with middle inside [begin, end]
+// comp: strict weak ordering, invoked as comp(a, b) on the elements
+//
+// the element type must be move constructible and move assignable
+template <typename it, typename Comp>
+void stable_partial_sort(it begin, it middle, it end, Comp comp) {
+  // adapted from https://stackoverflow.com/a/27248519/15534181
+  using value_type = typename std::iterator_traits<it>::value_type;
+
+  if (begin == middle) {
+    // nothing is requested to be sorted. leave the range untouched
+    return;
+  }
+
+  // sort iterators rather than elements. an iterator doubles as the element's
+  // original position, which is what the tie-break below relies on
+  std::vector<it> sorted;
+  sorted.reserve(static_cast<typename std::vector<it>::size_type>(end - begin));
+  for (auto p = begin; p != end; ++p) {
+    sorted.push_back(p);
+  }
+  const auto n = middle - begin;
+
+  std::partial_sort(sorted.begin(), sorted.begin() + n, sorted.end(), [&](const it& lhs, const it& rhs) {
+    // First see if the underlying elements differ.
+    if (comp(*lhs, *rhs))
+      return true;
+    if (comp(*rhs, *lhs))
+      return false;
+    // Underlying elements are the same, so compare iterators; these represent
+    // position in original vector.
+    return lhs < rhs;
+  });
+
+  // move every element out in its new order, and then back into the range.
+  // nothing is copied, and no element is lost or duplicated
+  std::vector<value_type> replacement;
+  replacement.reserve(sorted.size());
+  for (const it& src : sorted) {
+    replacement.push_back(std::move(*src));
+  }
+  std::move(replacement.begin(), replacement.end(), begin);
+}
+
+} // namespace choose
diff --git a/src/args.hpp b/src/args.hpp
@@ -47,9 +47,9 @@ struct Arguments {
   bool delimit_on_empty = false;
 
   // truncate beginning of result, inclusive
-  std::optional<ssize_t> out_start;
+  std::optional<size_t> out_start;
   // truncate end of result, exclusive
-  std::optional<ssize_t> out_end;
+  std::optional<size_t> out_end;
 
   // number of bytes
   // args will set it to a default value if it is unset. max indicates unset
@@ -266,16 +266,16 @@ void print_help_message() {
       "        --no-warn\n"
       "        -o, --output-delimiter <delimiter, default: '\\n'>\n"
       "                an output delimiter is placed after each token in the output\n"
-      "        --out [<+/-# tokens>|<start inclusive>,<stop exclusive>|<default: +10>]\n"
-      "                send only the first n tokens to the output or tui. like --head\n"
-      "                applied after sorting, uniqueness, and reverse.\n"
+      "        --out [<# tokens>|<start inclusive>,<stop exclusive>|<default: +10>]\n"
+      "                truncate after sorting and uniqueness\n"
       "        -p, --prompt <tui prompt>\n"
       "        -r, --regex\n"
       "                use PCRE2 regex for the positional argument.\n"
       "        --read <# bytes, default: <buf-size>>\n"
       "                the number of bytes read from stdin per iteration\n"
       "        --reverse\n"
-      "                reverse the token order after sorting\n"
+      "                reverse the token order. this is the last step before being sent\n"
+      "                to the output or to the tui\n"
       "        -s, --sort\n"
       "                sort each token lexicographically\n"
       "        --sed\n"
@@ -495,21 +495,15 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out
             auto val = num::parse_number_pair<InLimitOp::T>(on_num_err, optarg);
             InLimitOp::T first = std::get<0>(val);
             std::optional<InLimitOp::T> second = std::get<1>(val);
-            if (first > std::numeric_limits<decltype(ret.out_start)::value_type>::max() //
-                || (second && *second > std::numeric_limits<decltype(ret.out_end)::value_type>::max())) {
-              // careful for bounds since out_start and out_end are signed
-              on_num_err();
+            if (second) {
+              ret.out_start = first;
+              ret.out_end = second;
+              uncompiled_output.ordered_ops.insert(uncompiled_output.ordered_ops.begin(), //
+                                                   uncompiled::UncompiledInLimitOp(first, *second));
             } else {
-              if (second) {
-                ret.out_start = first;
-                ret.out_end = second;
-                uncompiled_output.ordered_ops.insert(uncompiled_output.ordered_ops.begin(), //
-                                                    uncompiled::UncompiledInLimitOp(first, *second));
-              } else {
-                ret.out_end = first;
-                uncompiled_output.ordered_ops.insert(uncompiled_output.ordered_ops.begin(), //
-                                                    uncompiled::UncompiledInLimitOp(first));
-              }
+              ret.out_end = first;
+              uncompiled_output.ordered_ops.insert(uncompiled_output.ordered_ops.begin(), //
+                                                   uncompiled::UncompiledInLimitOp(first));
             }
           } else {
             ret.out_end = 10;
@@ -520,7 +514,7 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out
 
         auto out_handler = [&](bool has_arg) {
           if (has_arg || OPTIONAL_ARGUMENT_IS_PRESENT) {
-            auto val = num::parse_number_pair<ssize_t>(on_num_err, optarg);
+            auto val = num::parse_number_pair<size_t>(on_num_err, optarg);
             auto first = std::get<0>(val);
             auto second = std::get<1>(val);
             if (second) {
@@ -530,7 +524,7 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out
               ret.out_end = first;
             }
           } else {
-            ret.out_start = 10;
+            ret.out_end = 10;
           }
         };
 

diff --git a/src/numeric_utils.hpp b/src/numeric_utils.hpp
@@ -137,7 +137,7 @@ std::tuple<T, std::optional<T>> parse_number_pair(OnErr onErr, const char* str)
   while (1) {
     char ch = *str;
     if (ch == '\0') {
-      return { first, std::nullopt };
+      return {first, std::nullopt};
     } else if (ch == ',') {
       ++str;
       break;

diff --git a/src/test.cpp b/src/test.cpp
@@ -400,13 +400,13 @@ BOOST_AUTO_TEST_CASE(output_rm_filter) {
 }
 
 BOOST_AUTO_TEST_CASE(zero_with_tui) {
-  choose_output out = run_choose("anything", {"--head=0", "-t"});
+  choose_output out = run_choose("anything", {"--out=0", "-t"});  // --out=0 together with -t: the expected result below holds no tokens
   choose_output correct_output{std::vector<choose::Token>{}};
   BOOST_REQUIRE_EQUAL(out, correct_output);
 }
 
 BOOST_AUTO_TEST_CASE(zero_no_tui) {
-  choose_output out = run_choose("anything", {"--head=0"});
+  choose_output out = run_choose("anything", {"--out=0"});  // --out=0 without -t: the expected output below is empty
   choose_output correct_output{to_vec("")};
   BOOST_REQUIRE_EQUAL(out, correct_output);
 }
@@ -460,6 +460,13 @@ BOOST_AUTO_TEST_CASE(defined_sort) {
   BOOST_REQUIRE_EQUAL(out, correct_output);
 }
 
+BOOST_AUTO_TEST_CASE(partial_stable_sort) {
+  // this also checks the delimiter and sort stability: the expected tokens are the three "John" entries, in their input order
+  choose_output out = run_choose("John Doe\nApple\nJohn Doe\nBanana\nJohn Smith", {"-r", "--comp-sort", "^John", "--out=3", "-t"});
+  choose_output correct_output{std::vector<choose::Token>{"John Doe", "John Doe", "John Smith"}};
+  BOOST_REQUIRE_EQUAL(out, correct_output);
+}
+
 // mix of both lex and user defined
 
 BOOST_AUTO_TEST_CASE(lex_unique_defined_sort) {
@@ -513,6 +520,36 @@ BOOST_AUTO_TEST_CASE(out_limit) {
   BOOST_REQUIRE_EQUAL(out, correct_output);
 }
 
+BOOST_AUTO_TEST_CASE(out_limit_start) {
+  choose_output out = run_choose("0\n1\n2\n3\n4\n5\n6\n7\n8\n9", {"--out=2,5"});
+  choose_output correct_output{to_vec("2\n3\n4\n")};  // tokens at indices 2, 3, 4: start inclusive, stop exclusive
+  BOOST_REQUIRE_EQUAL(out, correct_output);
+}
+
+BOOST_AUTO_TEST_CASE(out_limit_with_index) {
+  choose_output out = run_choose("0\n1\n2\n3\n4\n5\n6\n7\n8\n9", {"--index=before", "--out=2,5"});
+  choose_output correct_output{to_vec("0 2\n1 3\n2 4\n")};  // each kept token (2, 3, 4) is preceded by an index that starts at 0
+  BOOST_REQUIRE_EQUAL(out, correct_output);
+}
+
+BOOST_AUTO_TEST_CASE(out_limit_start_with_sort) {
+  choose_output out = run_choose("this\nis\na\ntest", {"--sort", "--out=1,3"});
+  choose_output correct_output{to_vec("is\ntest\n")};  // sorted tokens are a, is, test, this; indices 1 and 2 are kept
+  BOOST_REQUIRE_EQUAL(out, correct_output);
+}
+
+BOOST_AUTO_TEST_CASE(out_limit_with_sort_past_end_stop) {
+  choose_output out = run_choose("a\nb\nc", {"--sort", "--out=70"});
+  choose_output correct_output{to_vec("a\nb\nc\n")};  // the stop of 70 exceeds the 3 tokens, so all of them are expected
+  BOOST_REQUIRE_EQUAL(out, correct_output);
+}
+
+BOOST_AUTO_TEST_CASE(out_limit_with_sort_past_end_start) {
+  choose_output out = run_choose("this\nis\na\ntest", {"--sort", "--out=100000,3"});
+  choose_output correct_output{to_vec("")};  // the start of 100000 is past the last token, so nothing is expected
+  BOOST_REQUIRE_EQUAL(out, correct_output);
+}
+
 BOOST_AUTO_TEST_CASE(out_limit_unique) {
   choose_output out = run_choose("d\nd\nd\nd\nc\nb\na", {"--out=2", "--unique"});
   choose_output correct_output{to_vec("d\nc\n")};

diff --git a/src/token.hpp b/src/token.hpp
@@ -6,6 +6,7 @@
 #include <unordered_set>
 #include <utility>
 
+#include "algo_utils.hpp"
 #include "args.hpp"
 #include "regex.hpp"
 #include "string_utils.hpp"
@@ -56,14 +57,20 @@ struct TokenOutputStream {
 
   TokenOutputStream(const Arguments& args) : args(args) {}
 
+  // true while the number of tokens counted so far is still below the requested
+  // output start, meaning the token currently being written is to be dropped
+  bool begin_discard() const {
+    if (!this->args.out_start.has_value()) {
+      return false;
+    }
+    return this->out_count < *this->args.out_start;
+  }
+
   // write a part of a token to the output.
   // the last part of a token must instead use write_output
   void write_output_fragment(const char* begin, const char* end) {
-    if (delimit_required_ && !args.sed) {
-      str::write_f(args.output, args.out_delimiter);
+    // tokens before the output start are discarded, so their fragments must not
+    // reach the output either. this mirrors write_output, which also keeps its
+    // write inside the begin_discard() guard
+    if (!begin_discard()) {
+      if (delimit_required_ && !args.sed) {
+        str::write_f(args.output, args.out_delimiter);
+      }
+      // more parts of this token follow, so no delimiter goes in front of them
+      delimit_required_ = false;
+      has_written = true;
+      str::write_f(args.output, begin, end);
     }
-    delimit_required_ = false;
-    has_written = true;
-    str::write_f(args.output, begin, end);
   }
 
@@ -75,12 +82,14 @@ struct TokenOutputStream {
   void write_output(const char* begin, //
                     const char* end,
                     T handler = TokenOutputStream::default_write) {
-    if (delimit_required_ && !args.sed) {
-      str::write_f(args.output, args.out_delimiter);
+    if (!begin_discard()) {
+      if (delimit_required_ && !args.sed) {
+        str::write_f(args.output, args.out_delimiter);
+      }
+      delimit_required_ = true;
+      has_written = true;
+      handler(args.output, begin, end);
     }
-    delimit_required_ = true;
-    has_written = true;
-    handler(args.output, begin, end);
     ++out_count;
   }
 
@@ -661,34 +670,43 @@ std::vector<Token> create_tokens(choose::Arguments& args) {
       set->clear();
     }
 
-    // apply sorting
-    // if (!args.out.has_value() && ) {
-
-    // }
-
-    if (args.out_end.has_value() && output.size() > *args.out_end && args.sort && !args.comp_sort) {
-      // if lexicographically sorting and the output is being truncated then do a
-      // partial sort instead. can only be applied to lexicographical since there's no
-      // stable partial sort (and stability is required for user defined comp sort)
-      std::partial_sort(output.begin(), output.begin() + *args.out_end, output.end(), lexicographical_comparison);
-    } else {
+    if (!args.out_start && !args.out_end) {
+      // no truncation needed. this is the simplest case
       if (args.comp_sort) {
         std::stable_sort(output.begin(), output.end(), user_defined_comparison);
       } else if (args.sort) {
         std::sort(output.begin(), output.end(), lexicographical_comparison);
       }
+    } else {
+      // truncate the end, leaving only the beginning elements
+      typename std::vector<Token>::iterator middle;
+      middle = output.begin() + *args.out_end;
+      if (middle > output.end()) {
+        middle = output.end();
+      }
+
+      if (args.comp_sort) {
+        choose::stable_partial_sort(output.begin(), middle, output.end(), user_defined_comparison);
+      } else if (args.sort) {
+        std::partial_sort(output.begin(), middle, output.end(), lexicographical_comparison);
+      }
+      output.resize(middle - output.begin());
+      if (args.out_start) {
+        if (*args.out_start < output.size()) {
+          output.erase(output.begin(), output.begin() + *args.out_start);
+        } else {
+          output.clear();
+        }
+      }
     }
+    // don't apply truncation again, since it was just done above
+    args.out_start = std::nullopt;
+    args.out_end = std::nullopt;
 
-    // apply after sorting
+    // apply reverse last
     if (args.reverse) {
       std::reverse(output.begin(), output.end());
     }
-
-    // last thing to be applied is truncating the result
-    if (args.out_end.has_value() && output.size() > *args.out_end) {
-      output.resize(*args.out_end);
-    }
-
   } // scope for goto
 
 skip_all: