Skip to content

Commit

Permalink
Bound unique sort case (#43)
Browse files Browse the repository at this point in the history
  • Loading branch information
jagprog5 committed Sep 12, 2023
1 parent 9a3a1c4 commit ba7cf50
Show file tree
Hide file tree
Showing 10 changed files with 183 additions and 152 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y build-essential libboost-test-dev cmake pkg-config libpcre2-dev libncursesw5-dev
sudo apt-get install -y build-essential libboost-test-dev cmake pkg-config libpcre2-dev libncursesw5-dev libtbb-dev
- name: Build project
run: |
cd build
Expand Down
15 changes: 13 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ project(choose)
add_executable(choose src/main.cpp)
target_include_directories(choose PRIVATE src)

target_compile_options(choose PRIVATE -Wall -Wextra -O3 -fno-rtti)
target_compile_options(choose PRIVATE -Wall -Wextra -O3)

set(CURSES_NEED_NCURSES TRUE)
set(CURSES_NEED_WIDE TRUE)
Expand All @@ -29,6 +29,13 @@ pkg_check_modules(PCRE REQUIRED libpcre2-8)
target_include_directories(choose PRIVATE ${PCRE_INCLUDEDIR})
target_link_libraries(choose PRIVATE ${PCRE_LIBRARIES})

# TBB is an optional dependency: it is linked only when an imported TBB::tbb
# target is available, and the build proceeds without it otherwise (QUIET keeps
# a missing package from producing configure noise).
# NOTE(review): presumably required by the standard library's parallel
# algorithm support on some toolchains - confirm against the linked answer.
# https://stackoverflow.com/a/74755391/15534181
find_package(TBB QUIET)
# check for the imported target itself rather than TBB_FOUND: a find module can
# report success without defining TBB::tbb, and linking a missing `::` target is
# a hard error at generate time
if(TARGET TBB::tbb)
  target_link_libraries(choose PRIVATE TBB::tbb)
endif()

if (NO_SCROLL_BORDER)
target_compile_definitions(choose PRIVATE
CHOOSE_NO_SCROLL_BORDER
Expand All @@ -52,7 +59,7 @@ if(BUILD_TESTING)
add_executable(unit_tests src/test.cpp)
target_include_directories(unit_tests PRIVATE src)
# -O0 gives better coverage info than -Og
target_compile_options(unit_tests PRIVATE -Wall -Wextra -O0 -g -fno-rtti)
target_compile_options(unit_tests PRIVATE -Wall -Wextra -O0 -g)

target_include_directories(unit_tests PRIVATE ${Boost_UNIT_TEST_FRAMEWORK_HEADER_NAME})
target_link_libraries(unit_tests PRIVATE ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY})
Expand All @@ -64,6 +71,10 @@ if(BUILD_TESTING)
target_include_directories(unit_tests PRIVATE ${PCRE_INCLUDEDIR})
target_link_libraries(unit_tests PRIVATE ${PCRE_LIBRARIES})

# TBB is optional: link it into the tests only when the imported target exists.
# checking the target (instead of TBB_FOUND) avoids a generate-time error when
# the package is reported as found but TBB::tbb was never defined
if(TARGET TBB::tbb)
  target_link_libraries(unit_tests PRIVATE TBB::tbb)
endif()

if (DISABLE_FIELD)
target_compile_definitions(unit_tests PRIVATE
CHOOSE_DISABLE_FIELD
Expand Down
3 changes: 3 additions & 0 deletions perf/gen_perf_stats.bash
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
set -e
sudo echo -n '' # do nothing. perf requires sudo. doing the prompt at the beginning

# fairer performance for sort
export LC_ALL=C

# e.g. -n makes the benchmarks apply sorting and uniqueness numerically (but not for just uniqueness since awk doesn't support this)
COMP_FLAGS=

Expand Down
54 changes: 27 additions & 27 deletions perf/perf.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ Also note the compile time option called `DISABLE_FIELD`. It disables the `--fie

### Sorting, and Sorting + Uniqueness

In most cases, `choose` is faster than `sort` and `sort -u` at sorting and sorting + uniqueness, respectively. Note however that the c locale isn't used for comparisons, and again it is the task clock that is reported.
`sort` is run with naive byte ordering (via `LC_ALL=C`), since this gives the fairest comparison. For plain sorting, `sort` is faster than `choose`. However, `choose` is faster than `sort` when truncation is leveraged, or when uniqueness is also applied and the input contains many duplicates.

## Input Data

Expand Down Expand Up @@ -82,80 +82,80 @@ garbage,5,garbage

### Versions
```txt
choose 0.3.0, ncurses 6.1.20180127, pcre2 10.42
pcre2grep version 10.42 2022-12-11
sed (GNU sed) 4.4
GNU Awk 4.1.4, API: 1.1 (GNU MPFR 4.0.1, GNU MP 6.1.2)
sort (GNU coreutils) 8.28
choose 0.3.0, ncurses 6.2.20200212, pcre2 10.43
pcre2grep version 10.43-DEV 2023-04-14
sed (GNU sed) 4.7
GNU Awk 5.0.1, API: 2.0 (GNU MPFR 4.0.2, GNU MP 6.2.0)
sort (GNU coreutils) 8.30
```
### Specs
```txt
5.15.90.1-microsoft-standard-WSL2
Intel(R) Core(TM) i5-8600K CPU @ 3.60GHz
ram: 8116584 kB
AMD Ryzen 7 3800X 8-Core Processor
ram: 16331032 kB
```

### Grepping

| (ms) | choose | pcre2grep |
|------------------|--------|------------|
| plain_text | 238.334100 | 246.104400 |
| test_repeated | 1536.390100 | 1446.540000 |
| no_duplicates | 321.083200 | 313.054700 |
| plain_text | 247.17 | 269.69 |
| test_repeated | 1620.34 | 1583.77 |
| no_duplicates | 323.59 | 370.16 |

### Stream Editing

| (ms) | choose | sed |
|------------------|--------|------|
| plain_text | 173.019600 | 156.455300 |
| test_repeated | 2563.258500 | 1024.358400 |
| no_duplicates | 8.424300 | 46.834200 |
| plain_text | 179.57 | 135.57 |
| test_repeated | 2725.50 | 1157.39 |
| no_duplicates | 5.10 | 44.00 |

(here is a cherry picked great case for choose compared to sed)

| (ms) | choose | sed (with newline delimiter) |
|------------------|--------|------|
| no_duplicates | 8.245600 | 437.878300 |
| no_duplicates | 5.13 | 543.93 |

(a special case, where choose cheats by using a literal replacement string)

| (ms) | choose (delimiter sub) | sed |
|------------------|------------------------|-----|
| test_repeated | 1457.271000 | 1010.783600 |
| test_repeated | 1521.93 | 1156.64 |

### Sorting

| (ms) | choose | sort |
|------------------|--------|------|
| plain_text | 694.556000 | 1905.257700 |
| test_repeated | 2226.087400 | 1987.113500 |
| no_duplicates | 2120.992700 | 5092.179100 |
| plain_text | 1628.88 | 448.06 |
| test_repeated | 1850.89 | 1616.13 |
| no_duplicates | 3714.94 | 1036.93 |

(a special case that leverages truncation)

| (ms) | choose -s --out 5 | sort \| head -n 5 |
|------------------|--------|------|
| no_duplicates | 251.069600 | 5063.083100 |
| no_duplicates | 354.20 | 1059.81 |

### Uniqueness

| (ms) | choose | awk |
|------------------|--------|-----|
| plain_text | 114.649800 | 208.971700 |
| test_repeated | 578.412600 | 972.325200 |
| no_duplicates | 2480.435700 | 1477.912300 |
| plain_text | 111.95 | 214.41 |
| test_repeated | 565.31 | 1147.75 |
| no_duplicates | 2340.37 | 1496.42 |

### Sorting and Uniqueness -u

| (ms) | choose | sort |
|------------------|--------|------|
| plain_text | 106.970100 | 1906.801600 |
| test_repeated | 574.516000 | 1961.279100 |
| no_duplicates | 4165.485200 | 5670.807600 |
| plain_text | 122.80 | 440.57 |
| test_repeated | 558.86 | 1640.79 |
| no_duplicates | 5742.11 | 1168.84 |


### Sorting and Uniqueness based on field -u

| (ms) | choose | sort |
|------------------|--------|------|
| csv_field | 1779.289000 | 1987.503500 |
| csv_field | 2770.27 | 474.02 |
24 changes: 9 additions & 15 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Also it's fast. See benchmarks [here](./perf/perf.md) comparing choose to other

## Install
```bash
sudo apt-get install cmake pkg-config libpcre2-dev libncursesw5-dev
sudo apt-get install cmake pkg-config libpcre2-dev libncursesw5-dev libtbb-dev
git clone https://github.com/jagprog5/choose.git && cd choose
make install
source ~/.bashrc
Expand Down Expand Up @@ -187,7 +187,7 @@ ccc

# Monitoring

Suppose there's an input that's running for a _really_ long time. For example, a python http server, with an output like this:
Suppose there's an input that's running for a _long_ time. For example, a python http server, with an output like:

```txt
127.0.0.1 - - [08/Sep/2023 22:11:48] "GET /tester.txt HTTP/1.1" 200 -
Expand All @@ -199,8 +199,11 @@ The goal is to monitor the output and print unique IPs:

```bash
# serves current dir on 8080
python3 -m http.server --directory . 8080 2>&1 >/dev/null\
| choose --match --multiline -r "^(?>(?:\d++\.){3})\d++" --unique-limit 1000 --unique-expiry 900 --flush
python3 -u -m http.server --directory . 8080 2>&1 >/dev/null \
| choose --match --multiline -r '^(?:\d++\.){3}\d++' \
--unique-limit 1000 \
--unique-expiry 900 \
--flush
```

This applies a form of bounded uniqueness in the face of an infinite input; tokens can be forgotten based on space and time constraints, meaning they can pass to the output again:
Expand Down Expand Up @@ -249,20 +252,10 @@ sed can't make a substitution if the target contains the delimiter (a newline ch
`ch_hist` is a bash function installed with choose. It allows a previous command to be re-run, like [fzf](https://github.com/junegunn/fzf).

```txt
git log --oneline
top
cat temp.txt
git commit --amend
git push
clear
cd ~/
ls
cd choose/
git pull
cd build
rm -rf *
cmake ..
sudo make install
make install
> choose -h
┌────────────────────────────────────────────────────────────────────────────────┐
│Select a line to edit then run. │
Expand All @@ -271,6 +264,7 @@ sed can't make a substitution if the target contains the delimiter (a newline ch
## Examples

```bash
ch_hist
ch_hist git
ch_hist hello there
```
1 change: 1 addition & 0 deletions src/algo_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ void stable_partial_sort(ExecutionPolicy&& policy, it begin, it middle, it end,
bool general_numeric_compare(const char* lhs_begin, const char* lhs_end, const char* rhs_begin, const char* rhs_end) { //
float lhs, rhs;
// if from_chars isn't found, get a newer compiler. e.g.
// add-apt-repository -y ppa:ubuntu-toolchain-r/test
// apt-get install g++-11
// cd build && cmake .. -DCMAKE_C_COMPILER=gcc-11 -DCMAKE_CXX_COMPILER=g++-11
std::from_chars_result lhs_ret = std::from_chars(lhs_begin, lhs_end, lhs, std::chars_format::general);
Expand Down
45 changes: 34 additions & 11 deletions src/args.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,13 @@ struct Arguments {
return is_direct_output() && !unique;
}

// reports whether the output vector is filled by bounded insertion, i.e.
// elements are inserted and any excess is discarded. this holds only when:
//  - an output end limit is set (out_end has a value), and
//  - the bound has not been disabled via truncate_no_bound, and
//  - either uniqueness is off, or sorting is on and uniqueness uses the same
//    comparison type as sorting
bool mem_is_bounded() const {
  if (!out_end.has_value() || truncate_no_bound) {
    return false;
  }
  if (!unique) {
    return true;
  }
  return sort && unique_type == sort_type;
}

void drop_warning() {
if (this->can_drop_warn) {
this->can_drop_warn = false;
Expand All @@ -134,8 +141,9 @@ struct Arguments {
namespace {

struct UncompiledCodes {
// all args must be parsed before the args are compiled
// the uncompiled args are stored here before transfer to the Arguments output.
// all args must be parsed before the args are compiled. the uncompiled args
// are stored here before transfer to the Arguments output. this also contains
// fields that aren't needed in the rest of the program, past the arg parsing
uint32_t re_options = PCRE2_LITERAL;
std::vector<uncompiled::UncompiledOrderedOp> ordered_ops;

Expand All @@ -152,6 +160,8 @@ struct UncompiledCodes {
bool bout_delimiter_set = false;
bool primary_set = false;

bool is_bounded_query = false;

void compile(Arguments& output) const {
for (const uncompiled::UncompiledOrderedOp& op : ordered_ops) {
OrderedOp oo = uncompiled::compile(op, re_options);
Expand Down Expand Up @@ -315,6 +325,9 @@ void print_help_message() {
" must have at least one digit. parse failures are smallest\n"
" -i, --ignore-case\n"
" make the positional argument case-insensitive\n"
" --is-bounded\n"
" prints a line indicating if the memory usage is bounded from\n"
" truncation (--out/--tail), then exits\n"
" --load-factor <positive float, default: " choose_xstr(UNIQUE_LOAD_FACTOR_DEFAULT) ">\n"
" if a hash table is used for uniqueness, set the max load factor\n"
" --locale <locale>\n"
Expand Down Expand Up @@ -375,19 +388,19 @@ void print_help_message() {
" on tui confirmed selection, do not exit; but still flush the\n"
" current selection to the output as a batch\n"
" --truncate-no-bound\n"
" if truncation is specified (--out/--tail) and uniqueness is not\n"
" specified, then choose only retains the relevant n values in\n"
" memory. This is only faster for small values of n, as elements\n"
" are shifted within this storage space. If n is large, this\n"
" option should be used to disable this optimization, leading to\n"
" faster speed but more space used\n"
" if truncation is specified (--out/--tail) then choose may retain\n"
" only the relevant n values in memory. see --is-bounded. this is\n"
" faster for small values of n, as elements are shifted within\n"
" this storage space. If n is large, this option should be used to\n"
" disable this optimization\n"
" -u, --unique\n"
" remove duplicate input tokens. leaves first occurrences. applied\n"
" before sorting\n"
" --uniq\n"
" unrelated to any other uniqueness options. after sorting, remove\n"
" consecutive duplicate elements. requires --sort. ignored by\n"
" truncation --out/--tail (use normal -u in these cases instead)\n"
" consecutive duplicate elements. requires --sort. ignored if\n"
" memory is bounded from truncation (see --is-bounded. use normal\n"
" -u in these cases instead)\n"
" --unique-numeric\n"
" apply uniqueness numerically. implies -u\n"
" --unique-expiry <# seconds>\n"
Expand All @@ -396,9 +409,11 @@ void print_help_message() {
" --unique-general-numeric\n"
" apply uniqueness general numerically. implies -u\n"
" --unique-limit <#tokens>\n"
" implies -u. forget least recently used tokens\n"
" implies -u. forget least recently used tokens. ignored if memory\n"
" is bounded from truncation (see --is-bounded)\n"
" --unique-use-set\n"
" implies -u. apply uniqueness with a tree instead of a hash table\n"
" ignored if memory is bounded from truncation (see --is-bounded)\n"
" --use-delimiter\n"
" don't ignore a delimiter at the end of the input\n"
" --utf\n"
Expand Down Expand Up @@ -527,6 +542,7 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out
{"flip", no_argument, NULL, 0},
{"flush", no_argument, NULL, 0},
{"ignore-case", no_argument, NULL, 'i'},
{"is-bounded", no_argument, NULL, 0},
{"multi", no_argument, NULL, 'm'},
{"multiline", no_argument, NULL, 0},
{"match", no_argument, NULL, 0},
Expand Down Expand Up @@ -776,6 +792,8 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out
} else if (strcmp("unique-general-numeric", name) == 0) {
ret.unique = true;
ret.unique_type = general_numeric;
} else if (strcmp("is-bounded", name) == 0) {
uncompiled_output.is_bounded_query = true;
} else if (strcmp("multiline", name) == 0) {
uncompiled_output.re_options &= ~PCRE2_LITERAL;
uncompiled_output.re_options |= PCRE2_MULTILINE;
Expand Down Expand Up @@ -998,6 +1016,11 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out
}
}

if (uncompiled_output.is_bounded_query) {
int exit_code = puts(ret.mem_is_bounded() ? "yes" : "no") < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
exit(exit_code);
}

if (isatty(fileno(ret.input))) {
int exit_code = puts("Try 'choose --help' for more information.") < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
exit(exit_code);
Expand Down
Loading

0 comments on commit ba7cf50

Please sign in to comment.