Skip to content

Commit

Permalink
Unique limit (#40)
Browse files Browse the repository at this point in the history
  • Loading branch information
jagprog5 committed Sep 7, 2023
1 parent 0ebf32e commit 0b67392
Show file tree
Hide file tree
Showing 7 changed files with 164 additions and 21 deletions.
2 changes: 1 addition & 1 deletion readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ choose is a tool for performing transformations with regular expressions. It als

# Why?

Here's a few distinct use cases that other tools have trouble with:
Here's a few use cases that other tools have trouble with:

- [sort csv and truncate output](https://stackoverflow.com/a/77025562/15534181)
- [sort csv with embedded commas](https://stackoverflow.com/a/77034520/15534181)
Expand Down
103 changes: 98 additions & 5 deletions src/algo_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,18 @@

#include <algorithm>
#include <charconv>
#include <cstring> // memmove
#include <execution>
#include <set>
#include <unordered_set>
#include <vector>

#include "likely_unlikely.hpp"

namespace choose {

template <typename ExecutionPolicy, typename it, typename Comp>
void stable_partial_sort(ExecutionPolicy&& policy, it begin, it middle, it end, Comp comp) {
static_assert(std::is_same_v<typename std::iterator_traits<it>::iterator_category, //
std::random_access_iterator_tag>);
// adapted from https://stackoverflow.com/a/27248519/15534181
std::vector<it> sorted;
sorted.resize(end - begin);
Expand Down Expand Up @@ -43,6 +46,98 @@ void stable_partial_sort(ExecutionPolicy&& policy, it begin, it middle, it end,
}
}

// only remembers the last n elements inserted; older elements are forgotten.
// note: the trailing `unordered` template parameter is unused here (kept for
// interface compatibility with existing call sites)
template <typename Key, typename Compare = std::less<Key>, typename Allocator = std::allocator<Key>, bool unordered = false>
class ForgetfulSet {
  std::set<Key, Compare, Allocator> s;
  const size_t n; // max number of elements remembered (capacity of iters)
  // circular buffer of iterators pointing within s, ordered oldest -> newest.
  // set iterators stay valid across erase of other elements, so this is safe
  std::vector<typename decltype(s)::iterator> iters;
  size_t oldest = 0; // index into iters of the least recently inserted element

 public:
  ForgetfulSet(const Compare& comp, size_t n) : s(comp), n(n == 0 ? 1 : n) {
    // ^ n must be positive.
    // given the context where it is constructed, arg n is never 0 anyways. but this is for safety.
    // a zero capacity would break the eviction logic in insert()
    iters.reserve(this->n);
  }

  void clear() {
    s.clear();
    iters.clear();
    oldest = 0;
  }

  // inserts k. if more than n elements would be remembered, the oldest is
  // forgotten. returns the underlying set's insert result: (iterator, inserted)
  auto insert(Key k) {
    auto ret = this->s.insert(k);

    if (likely(this->s.size() > this->n)) {
      // only reachable when ret.second is true: a failed insert never grows s,
      // and the size is restored to n below. iters is full (size == n) here.
      this->s.erase(this->iters[this->oldest]);
      // reuse the vacated slot for the newest element. O(1) per eviction,
      // replacing the former std::memmove-based shift which was O(n) and
      // formally UB for a non-trivially-copyable iterator type
      this->iters[this->oldest] = ret.first;
      this->oldest = (this->oldest + 1) % this->n;
      return ret;
    }

    if (ret.second) {
      // still filling up to capacity
      this->iters.push_back(ret.first);
    }

    return ret;
  }
};

// only remembers the last n elements inserted; older elements are forgotten.
// largely copy paste from ForgetfulSet.
template <typename Key, typename Hash = std::hash<Key>, typename KeyEqual = std::equal_to<Key>, typename Allocator = std::allocator<Key>>
class ForgetfulUnorderedSet {
  std::unordered_set<Key, Hash, KeyEqual, Allocator> s;
  const size_t n; // max number of elements remembered (capacity of iters)
  // circular buffer of iterators pointing within s, ordered oldest -> newest
  std::vector<typename decltype(s)::iterator> iters;
  size_t oldest = 0; // index into iters of the least recently inserted element

 public:
  ForgetfulUnorderedSet(const Hash& hash, const KeyEqual key_equal, float load_factor, size_t n) : s(0, hash, key_equal), n(n == 0 ? 1 : n) {
    // ^ n must be positive.
    // given the context where it is constructed, arg n is never 0 anyways. but this is for safety.
    // a zero capacity would break the eviction logic in insert()
    iters.reserve(this->n);

    s.max_load_factor(load_factor);
    // prevent rehashing, which would invalidate every iterator held in iters.
    // the set transiently holds n + 1 elements inside insert() before the
    // oldest is erased, so one extra slot must be reserved (reserving only n
    // would permit a rehash on the n+1'th insertion)
    s.reserve(this->n + 1);
  }

  void clear() {
    s.clear();
    iters.clear();
    oldest = 0;
  }

  // inserts k. if more than n elements would be remembered, the oldest is
  // forgotten. returns the underlying set's insert result: (iterator, inserted)
  auto insert(Key k) {
    auto ret = this->s.insert(k);

    if (likely(this->s.size() > this->n)) {
      // only reachable when ret.second is true: a failed insert never grows s,
      // and the size is restored to n below. iters is full (size == n) here.
      // erasing an element only invalidates iterators to that element
      this->s.erase(this->iters[this->oldest]);
      // reuse the vacated slot for the newest element. O(1) per eviction,
      // replacing the former std::memmove-based shift which was O(n) and
      // formally UB for a non-trivially-copyable iterator type
      this->iters[this->oldest] = ret.first;
      this->oldest = (this->oldest + 1) % this->n;
      return ret;
    }

    if (ret.second) {
      // still filling up to capacity
      this->iters.push_back(ret.first);
    }

    return ret;
  }
};

bool general_numeric_compare(const char* lhs_begin, const char* lhs_end, const char* rhs_begin, const char* rhs_end) { //
float lhs, rhs;
// if from_chars isn't found, get a newer compiler. e.g.
Expand Down Expand Up @@ -96,11 +191,9 @@ size_t general_numeric_hash(const char* begin, const char* end) {
// implementation is based on c strings, versus here ranges were used. so this
// didn't exist yet.

// leveraged under the following assumptions:
// likely and unlikely are leveraged under the following assumptions:
// - end of string has not been reached
// - character frequency. e.g. obtaining any non-zero digit is more likely than zero digit (8/9 vs 1/9)
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

namespace {

Expand Down
16 changes: 13 additions & 3 deletions src/args.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ struct Arguments {
float unique_load_factor = UNIQUE_LOAD_FACTOR_DEFAULT;
bool unique_consecutive = false; // after sorting uniqueness

size_t unique_limit = 0; // 0 indicates unused

bool flip = false;
bool flush = false;
bool multiple_selections = false;
Expand Down Expand Up @@ -385,11 +387,14 @@ void print_help_message() {
" consecutive duplicate elements. requires --sort. ignored by\n"
" truncation --out/--tail (use normal -u in these cases instead)\n"
" --unique-numeric\n"
" apply uniqueness numerically. implies --unique\n"
" apply uniqueness numerically. implies -u\n"
" --unique-general-numeric\n"
" apply uniqueness general numerically. implies --unique\n"
" apply uniqueness general numerically. implies -u\n"
" --unique-limit [<#tokens>]\n"
" implies -u. checks uniqueness against only the previous tokens.\n"
" older tokens are forgotten\n"
" --unique-use-set\n"
" apply uniqueness with a tree instead of a hash table\n"
" implies -u. apply uniqueness with a tree instead of a hash table\n"
" --use-delimiter\n"
" don't ignore a delimiter at the end of the input\n"
" --utf\n"
Expand Down Expand Up @@ -502,6 +507,7 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out
{"load-factor", required_argument, NULL, 0},
{"locale", required_argument, NULL, 0},
{"replace", required_argument, NULL, 0},
{"unique-limit", required_argument, NULL, 0},
{"head", optional_argument, NULL, 0},
{"index", optional_argument, NULL, 0},
{"out", optional_argument, NULL, 0},
Expand Down Expand Up @@ -685,6 +691,9 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out
}
}
uncompiled_output.ordered_ops.push_back(uncompiled::UncompiledReplaceOp(optarg));
} else if (strcmp("unique-limit", name) == 0) {
ret.unique_limit = num::parse_number<decltype(ret.unique_limit)>(on_num_err, optarg, false);
ret.unique = true;
} else if (strcmp("sub", name) == 0 || strcmp("substitute", name) == 0) {
// special handling here since getopt doesn't normally support multiple arguments
if (optind >= argc) {
Expand Down Expand Up @@ -773,6 +782,7 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out
} else if (strcmp("index", name) == 0) {
index_handler(false);
} else if (strcmp("unique-use-set", name) == 0) {
ret.unique = true;
ret.unique_use_set = true;
} else if (strcmp("use-delimiter", name) == 0) {
ret.use_input_delimiter = true;
Expand Down
4 changes: 4 additions & 0 deletions src/likely_unlikely.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#pragma once

// branch prediction hints via the gcc/clang __builtin_expect intrinsic.
// lowercase macro names can collide with other definitions (e.g. the Linux
// kernel headers use the same names) — NOTE(review): confirm no such headers
// are included alongside this one
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
8 changes: 6 additions & 2 deletions src/string_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#include <stdexcept>
#include <vector>

#include "likely_unlikely.hpp"

namespace choose {

namespace str {
Expand Down Expand Up @@ -299,9 +301,10 @@ struct QueuedOutput {
}
};

// block until n bytes have been made available in the file, or EOF
size_t get_bytes(FILE* f, size_t n, char* out) {
size_t read_ret = fread(out, sizeof(char), n, f);
if (read_ret == 0) {
if (unlikely(read_ret == 0)) {
if (feof(f)) {
return read_ret;
} else if (ferror(f)) {
Expand All @@ -312,9 +315,10 @@ size_t get_bytes(FILE* f, size_t n, char* out) {
return read_ret;
}

// returns once any positive number of bytes are made available in the file, or EOF
size_t get_bytes_unbuffered(int fileno, size_t n, char* out) {
ssize_t read_ret = read(fileno, out, n);
if (read_ret == -1) {
if (unlikely(read_ret == -1)) {
const char* err_string = strerror(errno);
throw std::runtime_error(err_string);
}
Expand Down
12 changes: 12 additions & 0 deletions src/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -554,6 +554,18 @@ BOOST_AUTO_TEST_CASE(general_numeric_unique_with_parse_failure) {
BOOST_REQUIRE_EQUAL(out, correct_output);
}

// --unique-limit 3 combined with --unique-use-set (tree-based checker): only
// the previous 3 tokens are remembered for the uniqueness check, so the final
// duplicate "1" falls outside the window and is emitted a second time
BOOST_AUTO_TEST_CASE(unique_limit_set) {
choose_output out = run_choose("1\n1\n2\n2\n3\n3\n1\n2\n3\n4\n1\n1\n4\n4\n3\n3", {"--tui", "--unique-use-set", "--unique-limit", "3"});
choose_output correct_output{std::vector<choose::Token>{"1", "2", "3", "4", "1"}};
BOOST_REQUIRE_EQUAL(out, correct_output);
}

// --unique-limit 3 with the default hash-based checker: same input and same
// expected output as unique_limit_set above, exercising the unordered variant
BOOST_AUTO_TEST_CASE(unique_limit) {
choose_output out = run_choose("1\n1\n2\n2\n3\n3\n1\n2\n3\n4\n1\n1\n4\n4\n3\n3", {"--tui", "--unique-limit", "3"});
choose_output correct_output{std::vector<choose::Token>{"1", "2", "3", "4", "1"}};
BOOST_REQUIRE_EQUAL(out, correct_output);
}

BOOST_AUTO_TEST_CASE(numeric_unique_use_set) {
choose_output out = run_choose("-0\n0\n.0\n1\n1.0\n0001.0", {"--unique-numeric", "--unique-use-set"});
choose_output correct_output{to_vec("-0\n1\n")};
Expand Down
40 changes: 30 additions & 10 deletions src/token.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,6 @@ std::vector<Token> create_tokens(choose::Arguments& args) {
const bool tail = args.tail;

const bool unique = args.unique;
const bool unique_use_set = args.unique_use_set;
const Comparison unique_type = args.unique_type;
const bool sort = args.sort;
const Comparison sort_type = args.sort_type;
Expand Down Expand Up @@ -346,18 +345,31 @@ std::vector<Token> create_tokens(choose::Arguments& args) {
}
};

using uniqueness_set_T = std::set<indirect, decltype(uniqueness_set_comparison)>;
using unordered_uniqueness_set_T = std::unordered_set<indirect, decltype(unordered_set_hash), decltype(unordered_set_equals)>;
using unique_checker_T = std::variant<std::monostate, uniqueness_set_T, unordered_uniqueness_set_T>;
using unordered_uniqueness_limit_set_T = ForgetfulUnorderedSet<indirect, decltype(unordered_set_hash), decltype(unordered_set_equals)>;
using uniqueness_set_T = std::set<indirect, decltype(uniqueness_set_comparison)>;
using uniqueness_limit_set_T = ForgetfulSet<indirect, decltype(uniqueness_set_comparison)>;
using unique_checker_T = std::variant<std::monostate, unordered_uniqueness_set_T, unordered_uniqueness_limit_set_T, uniqueness_set_T, uniqueness_limit_set_T>;

unique_checker_T unique_checker = [&]() -> unique_checker_T {
if (unique) {
if (unique_use_set) {
return unique_checker_T(uniqueness_set_T(uniqueness_set_comparison));
if (args.unique_use_set) {
if (args.unique_limit == 0) {
return unique_checker_T(uniqueness_set_T(uniqueness_set_comparison));
} else {
return unique_checker_T(uniqueness_limit_set_T(uniqueness_set_comparison, args.unique_limit));
}
} else {
auto s = unordered_uniqueness_set_T(8, unordered_set_hash, unordered_set_equals);
s.max_load_factor(args.unique_load_factor);
return unique_checker_T(std::move(s));
if (args.unique_limit == 0) {
auto s = unordered_uniqueness_set_T(8, unordered_set_hash, unordered_set_equals);
s.max_load_factor(args.unique_load_factor);
return unique_checker_T(std::move(s));
} else {
return unique_checker_T(unordered_uniqueness_limit_set_T(unordered_set_hash, //
unordered_set_equals, //
args.unique_load_factor, //
args.unique_limit));
}
}
} else {
return unique_checker_T();
Expand All @@ -366,10 +378,14 @@ std::vector<Token> create_tokens(choose::Arguments& args) {

// returns true if output[elem] is unique. requires unique == true
auto uniqueness_check = [&](indirect elem) -> bool { //
if (unordered_uniqueness_set_T* set = std::get_if<unordered_uniqueness_set_T>(&unique_checker)) {
if (unordered_uniqueness_set_T* unordered_set = std::get_if<unordered_uniqueness_set_T>(&unique_checker)) {
return unordered_set->insert(elem).second;
} else if (unordered_uniqueness_limit_set_T* unordered_set_limit = std::get_if<unordered_uniqueness_limit_set_T>(&unique_checker)) {
return unordered_set_limit->insert(elem).second;
} else if (uniqueness_set_T* set = std::get_if<uniqueness_set_T>(&unique_checker)) {
return set->insert(elem).second;
} else {
return std::get<uniqueness_set_T>(unique_checker).insert(elem).second;
return std::get<uniqueness_limit_set_T>(unique_checker).insert(elem).second;
}
};

Expand Down Expand Up @@ -806,8 +822,12 @@ std::vector<Token> create_tokens(choose::Arguments& args) {

if (unordered_uniqueness_set_T* uniqueness_unordered_set = std::get_if<unordered_uniqueness_set_T>(&unique_checker)) {
uniqueness_unordered_set->clear();
} else if (unordered_uniqueness_limit_set_T* uniqueness_limit_unordered_set = std::get_if<unordered_uniqueness_limit_set_T>(&unique_checker)) {
uniqueness_limit_unordered_set->clear();
} else if (uniqueness_set_T* set = std::get_if<uniqueness_set_T>(&unique_checker)) {
set->clear();
} else if (uniqueness_limit_set_T* set_limit = std::get_if<uniqueness_limit_set_T>(&unique_checker)) {
set_limit->clear();
}

if (!args.out_start && !args.out_end) {
Expand Down

0 comments on commit 0b67392

Please sign in to comment.