Skip to content

Commit

Permalink
.
Browse files Browse the repository at this point in the history
  • Loading branch information
jagprog5 committed Aug 5, 2023
1 parent 8cd6581 commit cc1ae5b
Show file tree
Hide file tree
Showing 6 changed files with 147 additions and 54 deletions.
6 changes: 4 additions & 2 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -141,9 +141,9 @@ cat some_content | choose -f "test" --head 5

The former is restricted to working with `lines`, whereas the latter works with `tokens`. Tokens are arbitrary and can contain newline characters, whereas lines can't.

# Ordering and Uniqueness
# Sorting and Uniqueness

choose allows for lexicographical comparison and **user defined** comparison between tokens. Using this comparison, it can apply ordering and uniqueness.
choose allows for lexicographical comparison and **user defined** comparison between tokens. Using this comparison, it can apply sorting and uniqueness.

For example, this command sorts the input and leaves only unique entries:

Expand Down Expand Up @@ -198,6 +198,8 @@ Banana
</tr>
</table>

Additionally, if the output is truncated via `--out`, a partial sort is used, so only the elements that will actually be output are sorted.

# Matching

Rather than specifying how tokens are terminated, the tokens themselves can be matched for. A match and each match group form a token. This is like `grep -o`.
Expand Down
42 changes: 42 additions & 0 deletions src/algo_utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#pragma once

#include <algorithm>
#include <iterator>
#include <utility>
#include <vector>

namespace choose {

// Stable analogue of std::partial_sort.
//
// After the call, [begin, middle) holds the (middle - begin) smallest elements
// of [begin, end) according to comp, in ascending order. Elements that compare
// equivalent keep their original relative order. The remaining elements end up
// in [middle, end) in an unspecified order, so the range as a whole is still a
// permutation of its original contents.
//
// it must be a random access iterator (iterator distance and iterator ordering
// are used) and its value type must be move constructible and move assignable.
// comp(a, b) returns true if a is ordered before b.
template <typename it, typename Comp>
void stable_partial_sort(it begin, it middle, it end, Comp comp) {
  // adapted from https://stackoverflow.com/a/27248519/15534181
  if (begin == middle) {
    return;  // no sorted prefix was requested
  }

  // sort iterators instead of the elements themselves. an iterator doubles as
  // the element's original position, which is what makes the sort stable
  std::vector<it> sorted;
  sorted.reserve(end - begin);
  for (it p = begin; p != end; ++p) {
    sorted.push_back(p);
  }
  const auto n = middle - begin;

  std::partial_sort(sorted.begin(), sorted.begin() + n, sorted.end(), [&](const it& lhs, const it& rhs) {
    // First see if the underlying elements differ.
    if (comp(*lhs, *rhs)) {
      return true;
    }
    if (comp(*rhs, *lhs)) {
      return false;
    }
    // Underlying elements are equivalent, so compare iterators; these
    // represent position in the original range.
    return lhs < rhs;
  });

  // sorted is now a permutation of every position in the range. move each
  // element out exactly once, in the new order, and then move everything back.
  // going through a temporary avoids overwriting elements that have not been
  // relocated yet, so nothing is lost or duplicated
  using value_type = typename std::iterator_traits<it>::value_type;
  std::vector<value_type> reordered;
  reordered.reserve(sorted.size());
  for (const it& p : sorted) {
    reordered.push_back(std::move(*p));
  }
  std::move(reordered.begin(), reordered.end(), begin);
}

} // namespace choose
38 changes: 16 additions & 22 deletions src/args.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@ struct Arguments {
bool delimit_on_empty = false;

// truncate beginning of result, inclusive
std::optional<ssize_t> out_start;
std::optional<size_t> out_start;
// truncate end of result, exclusive
std::optional<ssize_t> out_end;
std::optional<size_t> out_end;

// number of bytes
// args will set it to a default value if it is unset. max indicates unset
Expand Down Expand Up @@ -266,16 +266,16 @@ void print_help_message() {
" --no-warn\n"
" -o, --output-delimiter <delimiter, default: '\\n'>\n"
" an output delimiter is placed after each token in the output\n"
" --out [<+/-# tokens>|<start inclusive>,<stop exclusive>|<default: +10>]\n"
" send only the first n tokens to the output or tui. like --head\n"
" applied after sorting, uniqueness, and reverse.\n"
" --out [<# tokens>|<start inclusive>,<stop exclusive>|<default: +10>]\n"
" truncate after sorting and uniqueness\n"
" -p, --prompt <tui prompt>\n"
" -r, --regex\n"
" use PCRE2 regex for the positional argument.\n"
" --read <# bytes, default: <buf-size>>\n"
" the number of bytes read from stdin per iteration\n"
" --reverse\n"
" reverse the token order after sorting\n"
" reverse the token order. this is the last step before being sent\n"
" to the output or to the tui\n"
" -s, --sort\n"
" sort each token lexicographically\n"
" --sed\n"
Expand Down Expand Up @@ -495,21 +495,15 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out
auto val = num::parse_number_pair<InLimitOp::T>(on_num_err, optarg);
InLimitOp::T first = std::get<0>(val);
std::optional<InLimitOp::T> second = std::get<1>(val);
if (first > std::numeric_limits<decltype(ret.out_start)::value_type>::max() //
|| (second && *second > std::numeric_limits<decltype(ret.out_end)::value_type>::max())) {
// careful for bounds since out_start and out_end are signed
on_num_err();
if (second) {
ret.out_start = first;
ret.out_end = second;
uncompiled_output.ordered_ops.insert(uncompiled_output.ordered_ops.begin(), //
uncompiled::UncompiledInLimitOp(first, *second));
} else {
if (second) {
ret.out_start = first;
ret.out_end = second;
uncompiled_output.ordered_ops.insert(uncompiled_output.ordered_ops.begin(), //
uncompiled::UncompiledInLimitOp(first, *second));
} else {
ret.out_end = first;
uncompiled_output.ordered_ops.insert(uncompiled_output.ordered_ops.begin(), //
uncompiled::UncompiledInLimitOp(first));
}
ret.out_end = first;
uncompiled_output.ordered_ops.insert(uncompiled_output.ordered_ops.begin(), //
uncompiled::UncompiledInLimitOp(first));
}
} else {
ret.out_end = 10;
Expand All @@ -520,7 +514,7 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out

auto out_handler = [&](bool has_arg) {
if (has_arg || OPTIONAL_ARGUMENT_IS_PRESENT) {
auto val = num::parse_number_pair<ssize_t>(on_num_err, optarg);
auto val = num::parse_number_pair<size_t>(on_num_err, optarg);
auto first = std::get<0>(val);
auto second = std::get<1>(val);
if (second) {
Expand All @@ -530,7 +524,7 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out
ret.out_end = first;
}
} else {
ret.out_start = 10;
ret.out_end = 10;
}
};

Expand Down
2 changes: 1 addition & 1 deletion src/numeric_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ std::tuple<T, std::optional<T>> parse_number_pair(OnErr onErr, const char* str)
while (1) {
char ch = *str;
if (ch == '\0') {
return { first, std::nullopt };
return {first, std::nullopt};
} else if (ch == ',') {
++str;
break;
Expand Down
41 changes: 39 additions & 2 deletions src/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -400,13 +400,13 @@ BOOST_AUTO_TEST_CASE(output_rm_filter) {
}

BOOST_AUTO_TEST_CASE(zero_with_tui) {
choose_output out = run_choose("anything", {"--head=0", "-t"});
choose_output out = run_choose("anything", {"--out=0", "-t"});
choose_output correct_output{std::vector<choose::Token>{}};
BOOST_REQUIRE_EQUAL(out, correct_output);
}

BOOST_AUTO_TEST_CASE(zero_no_tui) {
choose_output out = run_choose("anything", {"--head=0"});
choose_output out = run_choose("anything", {"--out=0"});
choose_output correct_output{to_vec("")};
BOOST_REQUIRE_EQUAL(out, correct_output);
}
Expand Down Expand Up @@ -460,6 +460,13 @@ BOOST_AUTO_TEST_CASE(defined_sort) {
BOOST_REQUIRE_EQUAL(out, correct_output);
}

BOOST_AUTO_TEST_CASE(partial_stable_sort) {
// combines a user defined comparison (--comp-sort) with --out=3, so only the
// first three tokens of the sorted result are expected.
// NOTE(review): presumably this exercises choose::stable_partial_sort - confirm
// this also checks the delimiter and sort stability
choose_output out = run_choose("John Doe\nApple\nJohn Doe\nBanana\nJohn Smith", {"-r", "--comp-sort", "^John", "--out=3", "-t"});
choose_output correct_output{std::vector<choose::Token>{"John Doe", "John Doe", "John Smith"}};
BOOST_REQUIRE_EQUAL(out, correct_output);
}

// mix of both lex and user defined

BOOST_AUTO_TEST_CASE(lex_unique_defined_sort) {
Expand Down Expand Up @@ -513,6 +520,36 @@ BOOST_AUTO_TEST_CASE(out_limit) {
BOOST_REQUIRE_EQUAL(out, correct_output);
}

BOOST_AUTO_TEST_CASE(out_limit_start) {
// --out=<start>,<stop> keeps the tokens at positions [2, 5) of the input
choose_output out = run_choose("0\n1\n2\n3\n4\n5\n6\n7\n8\n9", {"--out=2,5"});
choose_output correct_output{to_vec("2\n3\n4\n")};
BOOST_REQUIRE_EQUAL(out, correct_output);
}

BOOST_AUTO_TEST_CASE(out_limit_with_index) {
// with --index=before, the index prefix is expected to count the tokens that
// survive the --out truncation from 0, rather than their original positions
choose_output out = run_choose("0\n1\n2\n3\n4\n5\n6\n7\n8\n9", {"--index=before", "--out=2,5"});
choose_output correct_output{to_vec("0 2\n1 3\n2 4\n")};
BOOST_REQUIRE_EQUAL(out, correct_output);
}

BOOST_AUTO_TEST_CASE(out_limit_start_with_sort) {
// truncation is applied to the sorted result: sorted order is a, is, test, this
// and positions [1, 3) of that are expected
choose_output out = run_choose("this\nis\na\ntest", {"--sort", "--out=1,3"});
choose_output correct_output{to_vec("is\ntest\n")};
BOOST_REQUIRE_EQUAL(out, correct_output);
}

BOOST_AUTO_TEST_CASE(out_limit_with_sort_past_end_stop) {
// a stop position larger than the number of tokens must not drop or invent
// anything; every sorted token is expected in the output
choose_output out = run_choose("a\nb\nc", {"--sort", "--out=70"});
choose_output correct_output{to_vec("a\nb\nc\n")};
BOOST_REQUIRE_EQUAL(out, correct_output);
}

BOOST_AUTO_TEST_CASE(out_limit_with_sort_past_end_start) {
// a start position beyond the available tokens (and beyond the stop position)
// is expected to produce empty output
choose_output out = run_choose("this\nis\na\ntest", {"--sort", "--out=100000,3"});
choose_output correct_output{to_vec("")};
BOOST_REQUIRE_EQUAL(out, correct_output);
}

BOOST_AUTO_TEST_CASE(out_limit_unique) {
choose_output out = run_choose("d\nd\nd\nd\nc\nb\na", {"--out=2", "--unique"});
choose_output correct_output{to_vec("d\nc\n")};
Expand Down
72 changes: 45 additions & 27 deletions src/token.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <unordered_set>
#include <utility>

#include "algo_utils.hpp"
#include "args.hpp"
#include "regex.hpp"
#include "string_utils.hpp"
Expand Down Expand Up @@ -56,14 +57,20 @@ struct TokenOutputStream {

TokenOutputStream(const Arguments& args) : args(args) {}

// true when args.out_start is set and fewer than that many tokens have been
// counted in out_count so far; false when out_start is unset
bool begin_discard() const {
  if (!this->args.out_start.has_value()) {
    return false;
  }
  return this->out_count < *this->args.out_start;
}

// write a part of a token to the output.
// the last part of a token must instead use write_output
void write_output_fragment(const char* begin, const char* end) {
if (delimit_required_ && !args.sed) {
str::write_f(args.output, args.out_delimiter);
if (!begin_discard()) {
if (delimit_required_ && !args.sed) {
str::write_f(args.output, args.out_delimiter);
}
delimit_required_ = false;
has_written = true;
}
delimit_required_ = false;
has_written = true;
str::write_f(args.output, begin, end);
}

Expand All @@ -75,12 +82,14 @@ struct TokenOutputStream {
void write_output(const char* begin, //
const char* end,
T handler = TokenOutputStream::default_write) {
if (delimit_required_ && !args.sed) {
str::write_f(args.output, args.out_delimiter);
if (!begin_discard()) {
if (delimit_required_ && !args.sed) {
str::write_f(args.output, args.out_delimiter);
}
delimit_required_ = true;
has_written = true;
handler(args.output, begin, end);
}
delimit_required_ = true;
has_written = true;
handler(args.output, begin, end);
++out_count;
}

Expand Down Expand Up @@ -661,34 +670,43 @@ std::vector<Token> create_tokens(choose::Arguments& args) {
set->clear();
}

// apply sorting
// if (!args.out.has_value() && ) {

// }

if (args.out_end.has_value() && output.size() > *args.out_end && args.sort && !args.comp_sort) {
// if lexicographically sorting and the output is being truncated then do a
// partial sort instead. can only be applied to lexicographical since there's no
// stable partial sort (and stability is required for user defined comp sort)
std::partial_sort(output.begin(), output.begin() + *args.out_end, output.end(), lexicographical_comparison);
} else {
if (!args.out_start && !args.out_end) {
// no truncation needed. this is the simplest case
if (args.comp_sort) {
std::stable_sort(output.begin(), output.end(), user_defined_comparison);
} else if (args.sort) {
std::sort(output.begin(), output.end(), lexicographical_comparison);
}
} else {
// truncate the end, leaving only the beginning elements
typename std::vector<Token>::iterator middle;
middle = output.begin() + *args.out_end;
if (middle > output.end()) {
middle = output.end();
}

if (args.comp_sort) {
choose::stable_partial_sort(output.begin(), middle, output.end(), user_defined_comparison);
} else if (args.sort) {
std::partial_sort(output.begin(), middle, output.end(), lexicographical_comparison);
}
output.resize(middle - output.begin());
if (args.out_start) {
if (*args.out_start < output.size()) {
output.erase(output.begin(), output.begin() + *args.out_start);
} else {
output.clear();
}
}
}
// don't apply truncation again, since it was just done above
args.out_start = std::nullopt;
args.out_end = std::nullopt;

// apply after sorting
// apply reverse last
if (args.reverse) {
std::reverse(output.begin(), output.end());
}

// last thing to be applied is truncating the result
if (args.out_end.has_value() && output.size() > *args.out_end) {
output.resize(*args.out_end);
}

} // scope for goto

skip_all:
Expand Down

0 comments on commit cc1ae5b

Please sign in to comment.