From a6ace6c675a0f4484e6a6d303267f088808aef29 Mon Sep 17 00:00:00 2001 From: epi Date: Sat, 26 Dec 2020 08:17:45 -0600 Subject: [PATCH 01/19] bumped version to 1.11.0 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 1cbd88ad..1b98c030 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "feroxbuster" -version = "1.10.3" +version = "1.11.0" authors = ["Ben 'epi' Risher "] license = "MIT" edition = "2018" From c777ab4f677c05199b5ddbe0c0ba8461638766b7 Mon Sep 17 00:00:00 2001 From: epi Date: Sat, 26 Dec 2020 09:42:34 -0600 Subject: [PATCH 02/19] added --filter-similar-to to parser --- shell_completions/_feroxbuster | 1 + shell_completions/_feroxbuster.ps1 | 1 + shell_completions/feroxbuster.bash | 6 +++++- shell_completions/feroxbuster.fish | 1 + src/parser.rs | 11 +++++++++++ 5 files changed, 19 insertions(+), 1 deletion(-) diff --git a/shell_completions/_feroxbuster b/shell_completions/_feroxbuster index 5f4f5972..b0c058ad 100644 --- a/shell_completions/_feroxbuster +++ b/shell_completions/_feroxbuster @@ -55,6 +55,7 @@ _feroxbuster() { '*--filter-lines=[Filter out messages of a particular line count (ex: -N 20 -N 31,30)]' \ '*-C+[Filter out status codes (deny list) (ex: -C 200 -C 401)]' \ '*--filter-status=[Filter out status codes (deny list) (ex: -C 200 -C 401)]' \ +'*--filter-similar-to=[Filter out pages that are similar to the given page (ex. --filter-similar-to http://site.xyz/soft404)]' \ '-L+[Limit total number of concurrent scans (default: 0, i.e. no limit)]' \ '--scan-limit=[Limit total number of concurrent scans (default: 0, i.e. 
no limit)]' \ '--time-limit=[Limit total run time of all scans (ex: --time-limit 10m)]' \ diff --git a/shell_completions/_feroxbuster.ps1 b/shell_completions/_feroxbuster.ps1 index e0eabdc7..1c443abd 100644 --- a/shell_completions/_feroxbuster.ps1 +++ b/shell_completions/_feroxbuster.ps1 @@ -60,6 +60,7 @@ Register-ArgumentCompleter -Native -CommandName 'feroxbuster' -ScriptBlock { [CompletionResult]::new('--filter-lines', 'filter-lines', [CompletionResultType]::ParameterName, 'Filter out messages of a particular line count (ex: -N 20 -N 31,30)') [CompletionResult]::new('-C', 'C', [CompletionResultType]::ParameterName, 'Filter out status codes (deny list) (ex: -C 200 -C 401)') [CompletionResult]::new('--filter-status', 'filter-status', [CompletionResultType]::ParameterName, 'Filter out status codes (deny list) (ex: -C 200 -C 401)') + [CompletionResult]::new('--filter-similar-to', 'filter-similar-to', [CompletionResultType]::ParameterName, 'Filter out pages that are similar to the given page (ex. --filter-similar-to http://site.xyz/soft404)') [CompletionResult]::new('-L', 'L', [CompletionResultType]::ParameterName, 'Limit total number of concurrent scans (default: 0, i.e. no limit)') [CompletionResult]::new('--scan-limit', 'scan-limit', [CompletionResultType]::ParameterName, 'Limit total number of concurrent scans (default: 0, i.e. 
no limit)') [CompletionResult]::new('--time-limit', 'time-limit', [CompletionResultType]::ParameterName, 'Limit total run time of all scans (ex: --time-limit 10m)') diff --git a/shell_completions/feroxbuster.bash b/shell_completions/feroxbuster.bash index 7a8ac072..5e50186f 100644 --- a/shell_completions/feroxbuster.bash +++ b/shell_completions/feroxbuster.bash @@ -20,7 +20,7 @@ _feroxbuster() { case "${cmd}" in feroxbuster) - opts=" -v -q -D -r -k -n -f -e -h -V -w -u -t -d -T -p -P -R -s -o -a -x -H -Q -S -X -W -N -C -L --verbosity --quiet --json --dont-filter --redirects --insecure --no-recursion --add-slash --stdin --extract-links --help --version --wordlist --url --threads --depth --timeout --proxy --replay-proxy --replay-codes --status-codes --output --resume-from --debug-log --user-agent --extensions --headers --query --filter-size --filter-regex --filter-words --filter-lines --filter-status --scan-limit --time-limit " + opts=" -v -q -D -r -k -n -f -e -h -V -w -u -t -d -T -p -P -R -s -o -a -x -H -Q -S -X -W -N -C -L --verbosity --quiet --json --dont-filter --redirects --insecure --no-recursion --add-slash --stdin --extract-links --help --version --wordlist --url --threads --depth --timeout --proxy --replay-proxy --replay-codes --status-codes --output --resume-from --debug-log --user-agent --extensions --headers --query --filter-size --filter-regex --filter-words --filter-lines --filter-status --filter-similar-to --scan-limit --time-limit " if [[ ${cur} == -* || ${COMP_CWORD} -eq 1 ]] ; then COMPREPLY=( $(compgen -W "${opts}" -- "${cur}") ) return 0 @@ -187,6 +187,10 @@ _feroxbuster() { COMPREPLY=($(compgen -f "${cur}")) return 0 ;; + --filter-similar-to) + COMPREPLY=($(compgen -f "${cur}")) + return 0 + ;; --scan-limit) COMPREPLY=($(compgen -f "${cur}")) return 0 diff --git a/shell_completions/feroxbuster.fish b/shell_completions/feroxbuster.fish index 0b8f0e56..eed024bc 100644 --- a/shell_completions/feroxbuster.fish +++ b/shell_completions/feroxbuster.fish 
@@ -19,6 +19,7 @@ complete -c feroxbuster -n "__fish_use_subcommand" -s X -l filter-regex -d 'Filt complete -c feroxbuster -n "__fish_use_subcommand" -s W -l filter-words -d 'Filter out messages of a particular word count (ex: -W 312 -W 91,82)' complete -c feroxbuster -n "__fish_use_subcommand" -s N -l filter-lines -d 'Filter out messages of a particular line count (ex: -N 20 -N 31,30)' complete -c feroxbuster -n "__fish_use_subcommand" -s C -l filter-status -d 'Filter out status codes (deny list) (ex: -C 200 -C 401)' +complete -c feroxbuster -n "__fish_use_subcommand" -l filter-similar-to -d 'Filter out pages that are similar to the given page (ex. --filter-similar-to http://site.xyz/soft404)' complete -c feroxbuster -n "__fish_use_subcommand" -s L -l scan-limit -d 'Limit total number of concurrent scans (default: 0, i.e. no limit)' complete -c feroxbuster -n "__fish_use_subcommand" -l time-limit -d 'Limit total run time of all scans (ex: --time-limit 10m)' complete -c feroxbuster -n "__fish_use_subcommand" -s v -l verbosity -d 'Increase verbosity level (use -vv or more for greater effect. [CAUTION] 4 -v\'s is probably too much)' diff --git a/src/parser.rs b/src/parser.rs index 1ec4280e..ac2c8ba0 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -301,6 +301,17 @@ pub fn initialize() -> App<'static, 'static> { "Filter out status codes (deny list) (ex: -C 200 -C 401)", ), ) + .arg( + Arg::with_name("filter_similar") + .long("filter-similar-to") + .value_name("UNWANTED_PAGE") + .takes_value(true) + .multiple(true) + .use_delimiter(true) + .help( + "Filter out pages that are similar to the given page (ex. 
--filter-similar-to http://site.xyz/soft404)", + ), + ) .arg( Arg::with_name("extract_links") .short("e") From d530329478a6bb2ba1784e6d9cfb3693c730187d Mon Sep 17 00:00:00 2001 From: epi Date: Sat, 26 Dec 2020 13:46:20 -0600 Subject: [PATCH 03/19] added SimilarityFilter to filters --- Cargo.toml | 1 + src/filters.rs | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 1b98c030..07aa7cc0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,7 @@ regex = "1" crossterm = "0.18" rlimit = "0.5" ctrlc = "3.1" +strsim = "0.10" [dev-dependencies] tempfile = "3.1" diff --git a/src/filters.rs b/src/filters.rs index 68c8579f..f199e1d1 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -4,6 +4,7 @@ use crate::FeroxResponse; use regex::Regex; use std::any::Any; use std::fmt::Debug; +use strsim::normalized_levenshtein; // references: // https://dev.to/magnusstrale/rust-trait-objects-in-a-vector-non-trivial-4co5 @@ -282,6 +283,36 @@ impl PartialEq for RegexFilter { } } +/// Simple implementor of FeroxFilter; used to filter out responses based on the similarity of a +/// Response body with a known response; specified using --filter-similar-to +#[derive(Default, Debug, PartialEq)] +pub struct SimilarityFilter { + /// Response's body to be used for comparison for similarity + pub text: String, + + /// Percentage of similarity at which a page is determined to be a near-duplicate of another + pub threshold: f64, +} + +/// implementation of FeroxFilter for SimilarityFilter +impl FeroxFilter for SimilarityFilter { + /// Check `FeroxResponse::text` against what was requested from the site passed in via + /// --filter-similar-to + fn should_filter_response(&self, response: &FeroxResponse) -> bool { + (normalized_levenshtein(&self.text, &response.text) - self.threshold).abs() <= 0.00001 + } + + /// Compare one SizeFilter to another + fn box_eq(&self, other: &dyn Any) -> bool { + other.downcast_ref::().map_or(false, |a| self == 
a) + } + + /// Return self as Any for dynamic dispatch purposes + fn as_any(&self) -> &dyn Any { + self + } +} + #[cfg(test)] mod tests { use super::*; From 2ce988f87d2f44b4bc9e747c2aac432cfdf5dcc6 Mon Sep 17 00:00:00 2001 From: epi Date: Sat, 26 Dec 2020 14:05:08 -0600 Subject: [PATCH 04/19] added test for SimilarityFilter --- src/filters.rs | 47 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/src/filters.rs b/src/filters.rs index f199e1d1..ea8baa11 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -299,7 +299,7 @@ impl FeroxFilter for SimilarityFilter { /// Check `FeroxResponse::text` against what was requested from the site passed in via /// --filter-similar-to fn should_filter_response(&self, response: &FeroxResponse) -> bool { - (normalized_levenshtein(&self.text, &response.text) - self.threshold).abs() <= 0.00001 + normalized_levenshtein(&self.text, &response.text).abs() >= self.threshold } /// Compare one SizeFilter to another @@ -450,4 +450,49 @@ mod tests { assert!(filter.should_filter_response(&resp)); } + + #[test] + /// simple test for similarity filter, taken from strsim docs + fn similarity_filter_is_accurate() { + let mut resp = FeroxResponse { + text: String::from("sitting"), + wildcard: false, + url: Url::parse("http://localhost/stuff").unwrap(), + content_length: 100, + word_count: 50, + line_count: 25, + headers: reqwest::header::HeaderMap::new(), + status: reqwest::StatusCode::OK, + }; + + let mut filter = SimilarityFilter { + text: String::from("kitten"), + threshold: 0.95, + }; + + // assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001) + // kitten/sitting is 57% similar, so a threshold of 95 should not be filtered + assert!(!filter.should_filter_response(&resp)); + + resp.text = String::new(); + filter.text = String::new(); + filter.threshold = 1.0; + + // assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001) + // two empty strings are the 
same + assert!(filter.should_filter_response(&resp)); + + // assert!(normalized_levenshtein("", "second").abs() < 0.00001) + // completely dissimilar; should not pass the similarity test + resp.text = String::from("second"); + filter.threshold = 0.95; + + assert!(!filter.should_filter_response(&resp)); + + // assert!((normalized_levenshtein("string", "string") - 1.0).abs() < 0.00001); + // same should pass + filter.text = String::from("second"); + filter.threshold = 0.99999; + assert!(filter.should_filter_response(&resp)); + } } From 3c6d7f398ea83d0c351dd0f3de3d6492de4ba567 Mon Sep 17 00:00:00 2001 From: epi Date: Sat, 26 Dec 2020 14:07:39 -0600 Subject: [PATCH 05/19] added new entry and related test for banner --- src/banner.rs | 11 +++++++++++ tests/test_banner.rs | 27 +++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/src/banner.rs b/src/banner.rs index 3916953a..74a8fdc5 100644 --- a/src/banner.rs +++ b/src/banner.rs @@ -305,6 +305,17 @@ by Ben "epi" Risher {} ver: {}"#, } } + if !config.filter_similar.is_empty() { + for filter in &config.filter_similar { + writeln!( + &mut writer, + "{}", + format_banner_entry!(format_emoji("💢"), "Similarity Filter", filter) + ) + .unwrap_or_default(); // 💢 + } + } + for filter in &config.filter_word_count { writeln!( &mut writer, diff --git a/tests/test_banner.rs b/tests/test_banner.rs index 3bb09ff3..ef14a89f 100644 --- a/tests/test_banner.rs +++ b/tests/test_banner.rs @@ -810,3 +810,30 @@ fn banner_prints_time_limit() { .and(predicate::str::contains("─┴─")), ); } + +#[test] +/// test allows non-existent wordlist to trigger the banner printing to stderr +/// expect to see all mandatory prints + similarity filter +fn banner_prints_similarity_filter() { + Command::cargo_bin("feroxbuster") + .unwrap() + .arg("--url") + .arg("http://localhost") + .arg("--filter-similar-to") + .arg("https://somesite.com") + .assert() + .success() + .stderr( + predicate::str::contains("─┬─") + 
.and(predicate::str::contains("Target Url")) + .and(predicate::str::contains("http://localhost")) + .and(predicate::str::contains("Threads")) + .and(predicate::str::contains("Wordlist")) + .and(predicate::str::contains("Status Codes")) + .and(predicate::str::contains("Timeout (secs)")) + .and(predicate::str::contains("User-Agent")) + .and(predicate::str::contains("Similarity Filter")) + .and(predicate::str::contains("│ https://somesite.com")) + .and(predicate::str::contains("─┴─")), + ); +} From 75ced453b023d9c426d1782d30deda84eb81c8a2 Mon Sep 17 00:00:00 2001 From: epi Date: Sat, 26 Dec 2020 14:13:21 -0600 Subject: [PATCH 06/19] added filter_similar to config --- src/config.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/config.rs b/src/config.rs index 8904c95d..a6f3a3dc 100644 --- a/src/config.rs +++ b/src/config.rs @@ -233,6 +233,10 @@ pub struct Configuration { /// non-negative integer and the next character is either s, m, h, or d (case insensitive) #[serde(default)] pub time_limit: String, + + /// Filter out response bodies that meet a certain threshold of similarity + #[serde(default)] + pub filter_similar: Vec, } // functions timeout, threads, status_codes, user_agent, wordlist, save_state, and depth are used to provide @@ -328,6 +332,7 @@ impl Default for Configuration { filter_line_count: Vec::new(), filter_word_count: Vec::new(), filter_status: Vec::new(), + filter_similar: Vec::new(), headers: HashMap::new(), depth: depth(), threads: threads(), @@ -359,6 +364,7 @@ impl Configuration { /// - **insecure**: `false` (don't be insecure, i.e. 
don't allow invalid certs) /// - **extensions**: `None` /// - **filter_size**: `None` + /// - **filter_similar**: `None` /// - **filter_regex**: `None` /// - **filter_word_count**: `None` /// - **filter_line_count**: `None` @@ -552,6 +558,10 @@ impl Configuration { config.filter_regex = arg.map(|val| val.to_string()).collect(); } + if let Some(arg) = args.values_of("filter_similar") { + config.filter_similar = arg.map(|val| val.to_string()).collect(); + } + if let Some(arg) = args.values_of("filter_size") { config.filter_size = arg .map(|size| { @@ -760,6 +770,11 @@ impl Configuration { new.filter_regex, Vec::::new() ); + update_if_not_default!( + &mut conf.filter_similar, + new.filter_similar, + Vec::::new() + ); update_if_not_default!( &mut conf.filter_word_count, new.filter_word_count, @@ -893,6 +908,7 @@ mod tests { depth = 1 filter_size = [4120] filter_regex = ["^ignore me$"] + filter_similar = ["https://somesite.com/soft404"] filter_word_count = [994, 992] filter_line_count = [34] filter_status = [201] @@ -936,6 +952,7 @@ mod tests { assert_eq!(config.extensions, Vec::::new()); assert_eq!(config.filter_size, Vec::::new()); assert_eq!(config.filter_regex, Vec::::new()); + assert_eq!(config.filter_similar, Vec::::new()); assert_eq!(config.filter_word_count, Vec::::new()); assert_eq!(config.filter_line_count, Vec::::new()); assert_eq!(config.filter_status, Vec::::new()); @@ -1103,6 +1120,13 @@ mod tests { assert_eq!(config.filter_regex, vec!["^ignore me$"]); } + #[test] + /// parse the test config and see that the value parsed is correct + fn config_reads_filter_similar() { + let config = setup_config_test(); + assert_eq!(config.filter_similar, vec!["https://somesite.com/soft404"]); + } + #[test] /// parse the test config and see that the value parsed is correct fn config_reads_filter_size() { From 3adf8ff854697504983c71c58d06f825a277daf4 Mon Sep 17 00:00:00 2001 From: epi Date: Sat, 26 Dec 2020 16:11:41 -0600 Subject: [PATCH 07/19] added ssdeep --- Cargo.toml | 
1 + src/filters.rs | 10 +++++++--- src/main.rs | 2 +- src/scanner.rs | 22 +++++++++++++++++++++- 4 files changed, 30 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 07aa7cc0..e4b5662f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,6 +42,7 @@ crossterm = "0.18" rlimit = "0.5" ctrlc = "3.1" strsim = "0.10" +ssdeep = "0.2.0" [dev-dependencies] tempfile = "3.1" diff --git a/src/filters.rs b/src/filters.rs index ea8baa11..41929b8f 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -2,9 +2,10 @@ use crate::config::CONFIGURATION; use crate::utils::get_url_path_length; use crate::FeroxResponse; use regex::Regex; +use ssdeep; use std::any::Any; use std::fmt::Debug; -use strsim::normalized_levenshtein; +use strsim::{jaro, normalized_levenshtein}; // references: // https://dev.to/magnusstrale/rust-trait-objects-in-a-vector-non-trivial-4co5 @@ -291,7 +292,7 @@ pub struct SimilarityFilter { pub text: String, /// Percentage of similarity at which a page is determined to be a near-duplicate of another - pub threshold: f64, + pub threshold: i8, } /// implementation of FeroxFilter for SimilarityFilter @@ -299,7 +300,10 @@ impl FeroxFilter for SimilarityFilter { /// Check `FeroxResponse::text` against what was requested from the site passed in via /// --filter-similar-to fn should_filter_response(&self, response: &FeroxResponse) -> bool { - normalized_levenshtein(&self.text, &response.text).abs() >= self.threshold + // normalized_levenshtein(&self.text, &response.text).abs() >= self.threshold + // jaro(&self.text, &response.text).abs() >= self.threshold + let other = ssdeep::hash(response.text.as_ref()).unwrap(); + ssdeep::compare(self.text.as_ref(), &other.as_ref()).unwrap() >= self.threshold } /// Compare one SizeFilter to another diff --git a/src/main.rs b/src/main.rs index 923df773..9786572f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -117,7 +117,7 @@ async fn scan( return Err(Box::new(err)); } - scanner::initialize(words.len(), &CONFIGURATION); + 
scanner::initialize(words.len(), &CONFIGURATION).await; if CONFIGURATION.resumed { if let Ok(scans) = SCANNED_URLS.scans.lock() { diff --git a/src/scanner.rs b/src/scanner.rs index b6534b89..be1006dc 100644 --- a/src/scanner.rs +++ b/src/scanner.rs @@ -1,3 +1,4 @@ +use crate::filters::SimilarityFilter; use crate::{ config::{Configuration, CONFIGURATION}, extractor::{get_links, request_feroxresponse_from_new_link}, @@ -17,6 +18,7 @@ use futures::{ use lazy_static::lazy_static; use regex::Regex; use reqwest::Url; +use ssdeep; #[cfg(not(test))] use std::process::exit; use std::{ @@ -595,7 +597,7 @@ pub async fn scan_url( /// Perform steps necessary to run scans that only need to be performed once (warming up the /// engine, as it were) -pub fn initialize(num_words: usize, config: &Configuration) { +pub async fn initialize(num_words: usize, config: &Configuration) { log::trace!("enter: initialize({}, {:?})", num_words, config,); // number of requests only needs to be calculated once, and then can be reused @@ -666,6 +668,24 @@ pub fn initialize(num_words: usize, config: &Configuration) { add_filter_to_list_of_ferox_filters(boxed_filter, FILTERS.clone()); } + // add any similarity filters to `FILTERS` (--filter-similar-to) + for similarity_filter in &config.filter_similar { + // todo unwrap and url etc + if let Ok(resp) = make_request( + &CONFIGURATION.client, + &Url::parse(similarity_filter).unwrap(), + ) + .await + { + let filter = SimilarityFilter { + text: ssdeep::hash(resp.text().await.unwrap().as_bytes()).unwrap(), + threshold: 95, + }; + let boxed_filter = Box::new(filter); + add_filter_to_list_of_ferox_filters(boxed_filter, FILTERS.clone()); + } + } + if config.scan_limit == 0 { // scan_limit == 0 means no limit should be imposed... 
however, scoping the Semaphore // permit is tricky, so as a workaround, we'll add a ridiculous number of permits to From ac3c029bff4051cfa8a96a621baaaeb1c02988df Mon Sep 17 00:00:00 2001 From: epi052 <43392618+epi052@users.noreply.github.com> Date: Sat, 26 Dec 2020 19:02:50 -0600 Subject: [PATCH 08/19] removed todos/unwraps/etc --- Cargo.toml | 1 - src/lib.rs | 3 +++ src/scanner.rs | 38 +++++++++++++++++++++----------------- 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index e4b5662f..109bb6ef 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,7 +41,6 @@ regex = "1" crossterm = "0.18" rlimit = "0.5" ctrlc = "3.1" -strsim = "0.10" ssdeep = "0.2.0" [dev-dependencies] diff --git a/src/lib.rs b/src/lib.rs index fa5d0f5d..757d39aa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -51,6 +51,9 @@ pub const VERSION: &str = env!("CARGO_PKG_VERSION"); /// Maximum number of file descriptors that can be opened during a scan pub const DEFAULT_OPEN_FILE_LIMIT: usize = 8192; +/// Default value used to determine near-duplicate web pages (equivalent to 95%) +pub const SIMILARITY_THRESHOLD: i8 = 95; + /// Default wordlist to use when `-w|--wordlist` isn't specified and not `wordlist` isn't set /// in a [ferox-config.toml](constant.DEFAULT_CONFIG_NAME.html) config file. 
/// diff --git a/src/scanner.rs b/src/scanner.rs index be1006dc..0e745cea 100644 --- a/src/scanner.rs +++ b/src/scanner.rs @@ -1,15 +1,14 @@ -use crate::filters::SimilarityFilter; use crate::{ config::{Configuration, CONFIGURATION}, extractor::{get_links, request_feroxresponse_from_new_link}, filters::{ - FeroxFilter, LinesFilter, RegexFilter, SizeFilter, StatusCodeFilter, WildcardFilter, - WordsFilter, + FeroxFilter, LinesFilter, RegexFilter, SimilarityFilter, SizeFilter, StatusCodeFilter, + WildcardFilter, WordsFilter, }, heuristics, scan_manager::{FeroxResponses, FeroxScans, PAUSE_SCAN}, utils::{format_url, get_current_depth, make_request}, - FeroxChannel, FeroxResponse, + FeroxChannel, FeroxResponse, SIMILARITY_THRESHOLD, }; use futures::{ future::{BoxFuture, FutureExt}, @@ -670,19 +669,24 @@ pub async fn initialize(num_words: usize, config: &Configuration) { // add any similarity filters to `FILTERS` (--filter-similar-to) for similarity_filter in &config.filter_similar { - // todo unwrap and url etc - if let Ok(resp) = make_request( - &CONFIGURATION.client, - &Url::parse(similarity_filter).unwrap(), - ) - .await - { - let filter = SimilarityFilter { - text: ssdeep::hash(resp.text().await.unwrap().as_bytes()).unwrap(), - threshold: 95, - }; - let boxed_filter = Box::new(filter); - add_filter_to_list_of_ferox_filters(boxed_filter, FILTERS.clone()); + // url as-is based on input, ignores user-specified url manipulation options (add-slash etc) + if let Some(url) = format_url(&similarity_filter, &"", false, &Vec::new(), None) { + // attempt to request the given url + if let Ok(resp) = make_request(&CONFIGURATION.client, &url).await { + // if successful, create a filter based on the response's body + let fr = FeroxResponse::from(resp, true).await; + + if let Ok(hash) = ssdeep::hash(fr.text().as_bytes()) { + // hash the response body and store the resulting has in the filter object + let filter = SimilarityFilter { + text: hash, + threshold: SIMILARITY_THRESHOLD, + 
}; + + let boxed_filter = Box::new(filter); + add_filter_to_list_of_ferox_filters(boxed_filter, FILTERS.clone()); + } + } } } From 0726376955f38f60db7a51b862aafb6992dab866 Mon Sep 17 00:00:00 2001 From: epi052 <43392618+epi052@users.noreply.github.com> Date: Sat, 26 Dec 2020 19:11:58 -0600 Subject: [PATCH 09/19] started documentation, fixed scanner option/result --- .github/workflows/build.yml | 2 +- README.md | 5 +++++ ferox-config.toml.example | 1 + src/scanner.rs | 2 +- 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 8d1bbeeb..c7480ead 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -5,7 +5,7 @@ on: [push] jobs: build-nix: runs-on: ${{ matrix.os }} - if: github.ref == 'refs/heads/master' +# if: github.ref == 'refs/heads/master' strategy: matrix: type: [ubuntu-x64, ubuntu-x86] diff --git a/README.md b/README.md index 2a35050f..084c7b00 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,8 @@ This attack is also known as Predictable Resource Location, File Enumeration, Di - [Filter Response Using a Regular Expression (new in `v1.8.0`)](#filter-response-using-a-regular-expression-new-in-v180) - [Stop and Resume Scans (save scan's state to disk) (new in `v1.9.0`)](#stop-and-resume-scans---resume-from-file-new-in-v190) - [Enforce a Time Limit on Your Scan (new in `v1.10.0`)](#enforce-a-time-limit-on-your-scan-new-in-v1100) + - [Extract Links from robots.txt (New in `v1.10.2`)](#extract-links-from-robotstxt-new-in-v1102) + - [Filter Response by Similarity to A Given Page (new in `v1.11.0`)](#filter-response-by-similarity-to-a-given-page-new-in-v1110) - [Comparison w/ Similar Tools](#-comparison-w-similar-tools) - [Common Problems/Issues (FAQ)](#-common-problemsissues-faq) - [No file descriptors available](#no-file-descriptors-available) @@ -352,6 +354,7 @@ A pre-made configuration file with examples of all available settings can be fou # depth = 1 # filter_size 
= [5174] # filter_regex = ["^ignore me$"] +# filter_similar = ["https://somesite.com/soft404"] # filter_word_count = [993] # filter_line_count = [35, 36] # queries = [["name","value"], ["rick", "astley"]] @@ -658,6 +661,8 @@ In addition to [extracting links from the response body](#extract-links-from-res `--extract-links` makes a request to `/robots.txt` and examines all `Allow` and `Disallow` entries. Directory entries are added to the scan queue, while file entries are requested and then reported if appropriate. +### Filter Response by Similarity to A Given Page (new in `v1.11.0`) + ## 🧐 Comparison w/ Similar Tools There are quite a few similar tools for forced browsing/content discovery. Burp Suite Pro, Dirb, Dirbuster, etc... diff --git a/ferox-config.toml.example b/ferox-config.toml.example index 59a3d563..bfc801c8 100644 --- a/ferox-config.toml.example +++ b/ferox-config.toml.example @@ -33,6 +33,7 @@ # depth = 1 # filter_size = [5174] # filter_regex = ["^ignore me$"] +# filter_similar = ["https://somesite.com/soft404"] # filter_word_count = [993] # filter_line_count = [35, 36] # queries = [["name","value"], ["rick", "astley"]] diff --git a/src/scanner.rs b/src/scanner.rs index 0e745cea..da5d8bbe 100644 --- a/src/scanner.rs +++ b/src/scanner.rs @@ -676,7 +676,7 @@ pub async fn initialize(num_words: usize, config: &Configuration) { // if successful, create a filter based on the response's body let fr = FeroxResponse::from(resp, true).await; - if let Ok(hash) = ssdeep::hash(fr.text().as_bytes()) { + if let Some(hash) = ssdeep::hash(fr.text().as_bytes()) { // hash the response body and store the resulting has in the filter object let filter = SimilarityFilter { text: hash, From 883c5e306b319a9b7aa7c13724eca5d83c6cefb2 Mon Sep 17 00:00:00 2001 From: epi052 <43392618+epi052@users.noreply.github.com> Date: Sat, 26 Dec 2020 19:14:23 -0600 Subject: [PATCH 10/19] removed build test --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c7480ead..8d1bbeeb 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -5,7 +5,7 @@ on: [push] jobs: build-nix: runs-on: ${{ matrix.os }} -# if: github.ref == 'refs/heads/master' + if: github.ref == 'refs/heads/master' strategy: matrix: type: [ubuntu-x64, ubuntu-x86] From 9680e36f9d37e08dafb45267ddb13174a6dc1505 Mon Sep 17 00:00:00 2001 From: epi <43392618+epi052@users.noreply.github.com> Date: Sat, 26 Dec 2020 19:15:10 -0600 Subject: [PATCH 11/19] Update build.yml testing build on feature branch --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 8d1bbeeb..1c27b32c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -5,7 +5,7 @@ on: [push] jobs: build-nix: runs-on: ${{ matrix.os }} - if: github.ref == 'refs/heads/master' +# if: github.ref == 'refs/heads/master' strategy: matrix: type: [ubuntu-x64, ubuntu-x86] From 059ba24b68bc4aa2e946b2393e3891cc26b8b240 Mon Sep 17 00:00:00 2001 From: epi Date: Sat, 26 Dec 2020 19:44:00 -0600 Subject: [PATCH 12/19] fixed up build/tests --- src/filters.rs | 50 +++++++++++++++++++++++++-------------------- src/scan_manager.rs | 4 ++-- src/scanner.rs | 9 ++++---- 3 files changed, 34 insertions(+), 29 deletions(-) diff --git a/src/filters.rs b/src/filters.rs index 41929b8f..6f164883 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -1,11 +1,9 @@ use crate::config::CONFIGURATION; use crate::utils::get_url_path_length; -use crate::FeroxResponse; +use crate::{FeroxResponse, FeroxSerialize}; use regex::Regex; -use ssdeep; use std::any::Any; use std::fmt::Debug; -use strsim::{jaro, normalized_levenshtein}; // references: // https://dev.to/magnusstrale/rust-trait-objects-in-a-vector-non-trivial-4co5 @@ -300,10 +298,15 @@ impl FeroxFilter for SimilarityFilter { /// Check `FeroxResponse::text` against what 
was requested from the site passed in via /// --filter-similar-to fn should_filter_response(&self, response: &FeroxResponse) -> bool { - // normalized_levenshtein(&self.text, &response.text).abs() >= self.threshold - // jaro(&self.text, &response.text).abs() >= self.threshold - let other = ssdeep::hash(response.text.as_ref()).unwrap(); - ssdeep::compare(self.text.as_ref(), &other.as_ref()).unwrap() >= self.threshold + if let Some(other) = ssdeep::hash(response.text.as_ref()) { + if let Some(result) = ssdeep::compare(self.text.as_ref(), &other.as_ref()) { + return result >= self.threshold; + } + }; + + // couldn't hash the response, don't filter + log::warn!("Could not hash body from {}", response.as_str()); + false } /// Compare one SizeFilter to another @@ -471,7 +474,7 @@ mod tests { let mut filter = SimilarityFilter { text: String::from("kitten"), - threshold: 0.95, + threshold: 95, }; // assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001) @@ -480,23 +483,26 @@ mod tests { resp.text = String::new(); filter.text = String::new(); - filter.threshold = 1.0; + filter.threshold = 100; // assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001) - // two empty strings are the same - assert!(filter.should_filter_response(&resp)); - - // assert!(normalized_levenshtein("", "second").abs() < 0.00001) - // completely dissimilar; should not pass the similarity test - resp.text = String::from("second"); - filter.threshold = 0.95; - + // two empty strings are the same, however ssdeep doesn't accept empty strings, expect false assert!(!filter.should_filter_response(&resp)); - // assert!((normalized_levenshtein("string", "string") - 1.0).abs() < 0.00001); - // same should pass - filter.text = String::from("second"); - filter.threshold = 0.99999; - assert!(filter.should_filter_response(&resp)); + // let lorem = + // "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor"; + // + // // assert!(normalized_levenshtein("", 
"second").abs() < 0.00001) + // // completely dissimilar; should not pass the similarity test + // resp.text = String::from(lorem); + // filter.threshold = 95; + // + // assert!(!filter.should_filter_response(&resp)); + // + // // assert!((normalized_levenshtein("string", "string") - 1.0).abs() < 0.00001); + // // same should pass + // filter.text = String::from(lorem); + // filter.threshold = 95; + // assert!(filter.should_filter_response(&resp)); } } diff --git a/src/scan_manager.rs b/src/scan_manager.rs index 46ea4480..26dd9a88 100644 --- a/src/scan_manager.rs +++ b/src/scan_manager.rs @@ -1035,10 +1035,10 @@ mod tests { let json_state = ferox_state.as_json(); let expected = format!( - r#"{{"scans":[{{"id":"{}","url":"https://spiritanimal.com","scan_type":"Directory","complete":false}}],"config":{{"type":"configuration","wordlist":"/usr/share/seclists/Discovery/Web-Content/raft-medium-directories.txt","config":"","proxy":"","replay_proxy":"","target_url":"","status_codes":[200,204,301,302,307,308,401,403,405],"replay_codes":[200,204,301,302,307,308,401,403,405],"filter_status":[],"threads":50,"timeout":7,"verbosity":0,"quiet":false,"json":false,"output":"","debug_log":"","user_agent":"feroxbuster/{}","redirects":false,"insecure":false,"extensions":[],"headers":{{}},"queries":[],"no_recursion":false,"extract_links":false,"add_slash":false,"stdin":false,"depth":4,"scan_limit":0,"filter_size":[],"filter_line_count":[],"filter_word_count":[],"filter_regex":[],"dont_filter":false,"resumed":false,"save_state":false,"time_limit":""}},"responses":[{{"type":"response","url":"https://nerdcore.com/css","path":"/css","wildcard":true,"status":301,"content_length":173,"line_count":10,"word_count":16,"headers":{{"server":"nginx/1.16.1"}}}}]}}"#, + 
r#"{{"scans":[{{"id":"{}","url":"https://spiritanimal.com","scan_type":"Directory","complete":false}}],"config":{{"type":"configuration","wordlist":"/usr/share/seclists/Discovery/Web-Content/raft-medium-directories.txt","config":"","proxy":"","replay_proxy":"","target_url":"","status_codes":[200,204,301,302,307,308,401,403,405],"replay_codes":[200,204,301,302,307,308,401,403,405],"filter_status":[],"threads":50,"timeout":7,"verbosity":0,"quiet":false,"json":false,"output":"","debug_log":"","user_agent":"feroxbuster/{}","redirects":false,"insecure":false,"extensions":[],"headers":{{}},"queries":[],"no_recursion":false,"extract_links":false,"add_slash":false,"stdin":false,"depth":4,"scan_limit":0,"filter_size":[],"filter_line_count":[],"filter_word_count":[],"filter_regex":[],"dont_filter":false,"resumed":false,"save_state":false,"time_limit":"","filter_similar":[]}},"responses":[{{"type":"response","url":"https://nerdcore.com/css","path":"/css","wildcard":true,"status":301,"content_length":173,"line_count":10,"word_count":16,"headers":{{"server":"nginx/1.16.1"}}}}]}}"#, saved_id, VERSION ); - + println!("{}\n{}", expected, json_state); assert!(predicates::str::similar(expected).eval(&json_state)); } diff --git a/src/scanner.rs b/src/scanner.rs index da5d8bbe..5d780665 100644 --- a/src/scanner.rs +++ b/src/scanner.rs @@ -17,7 +17,6 @@ use futures::{ use lazy_static::lazy_static; use regex::Regex; use reqwest::Url; -use ssdeep; #[cfg(not(test))] use std::process::exit; use std::{ @@ -670,7 +669,7 @@ pub async fn initialize(num_words: usize, config: &Configuration) { // add any similarity filters to `FILTERS` (--filter-similar-to) for similarity_filter in &config.filter_similar { // url as-is based on input, ignores user-specified url manipulation options (add-slash etc) - if let Some(url) = format_url(&similarity_filter, &"", false, &Vec::new(), None) { + if let Ok(url) = format_url(&similarity_filter, &"", false, &Vec::new(), None) { // attempt to request the given 
url if let Ok(resp) = make_request(&CONFIGURATION.client, &url).await { // if successful, create a filter based on the response's body @@ -798,12 +797,12 @@ mod tests { assert!(result); } - #[test] + #[tokio::test(core_threads = 1)] #[should_panic] /// call initialize with a bad regex, triggering a panic - fn initialize_panics_on_bad_regex() { + async fn initialize_panics_on_bad_regex() { let mut config = Configuration::default(); config.filter_regex = vec![r"(".to_string()]; - initialize(1, &config); + initialize(1, &config).await; } } From 5308b399bdedebb5d97154335729486aa94928ba Mon Sep 17 00:00:00 2001 From: epi Date: Sat, 26 Dec 2020 19:56:05 -0600 Subject: [PATCH 13/19] added C compiler to build dependencies for CI/CD --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1c27b32c..706f3c2f 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -27,7 +27,7 @@ jobs: - name: Install System Dependencies run: | sudo apt-get update - sudo apt-get install -y --no-install-recommends libssl-dev pkg-config + sudo apt-get install -y --no-install-recommends libssl-dev pkg-config build-essential gcc - uses: actions-rs/toolchain@v1 with: toolchain: stable From 4d6f54128582dfecb9fd1c65596d1803bbef93e7 Mon Sep 17 00:00:00 2001 From: epi Date: Sat, 26 Dec 2020 20:33:17 -0600 Subject: [PATCH 14/19] swapped ssdeep for fuzzyhash (c wrapper vs pure rust) --- .github/workflows/build.yml | 2 +- Cargo.toml | 2 +- src/filters.rs | 29 +++++++---------------------- src/lib.rs | 2 +- src/scanner.rs | 19 ++++++++++--------- 5 files changed, 20 insertions(+), 34 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 706f3c2f..1c27b32c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -27,7 +27,7 @@ jobs: - name: Install System Dependencies run: | sudo apt-get update - sudo apt-get install -y 
--no-install-recommends libssl-dev pkg-config build-essential gcc + sudo apt-get install -y --no-install-recommends libssl-dev pkg-config - uses: actions-rs/toolchain@v1 with: toolchain: stable diff --git a/Cargo.toml b/Cargo.toml index 109bb6ef..d48eaaf7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,7 +41,7 @@ regex = "1" crossterm = "0.18" rlimit = "0.5" ctrlc = "3.1" -ssdeep = "0.2.0" +fuzzyhash = "0.2" [dev-dependencies] tempfile = "3.1" diff --git a/src/filters.rs b/src/filters.rs index 6f164883..f1e4c9d7 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -1,6 +1,7 @@ use crate::config::CONFIGURATION; use crate::utils::get_url_path_length; use crate::{FeroxResponse, FeroxSerialize}; +use fuzzyhash::FuzzyHash; use regex::Regex; use std::any::Any; use std::fmt::Debug; @@ -290,7 +291,7 @@ pub struct SimilarityFilter { pub text: String, /// Percentage of similarity at which a page is determined to be a near-duplicate of another - pub threshold: i8, + pub threshold: u32, } /// implementation of FeroxFilter for SimilarityFilter @@ -298,11 +299,11 @@ impl FeroxFilter for SimilarityFilter { /// Check `FeroxResponse::text` against what was requested from the site passed in via /// --filter-similar-to fn should_filter_response(&self, response: &FeroxResponse) -> bool { - if let Some(other) = ssdeep::hash(response.text.as_ref()) { - if let Some(result) = ssdeep::compare(self.text.as_ref(), &other.as_ref()) { - return result >= self.threshold; - } - }; + let other = FuzzyHash::new(&response.text); + + if let Ok(result) = FuzzyHash::compare(&self.text, &other.to_string()) { + return result >= self.threshold; + } // couldn't hash the response, don't filter log::warn!("Could not hash body from {}", response.as_str()); @@ -488,21 +489,5 @@ mod tests { // assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001) // two empty strings are the same, however ssdeep doesn't accept empty strings, expect false assert!(!filter.should_filter_response(&resp)); - - // let lorem 
= - // "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor"; - // - // // assert!(normalized_levenshtein("", "second").abs() < 0.00001) - // // completely dissimilar; should not pass the similarity test - // resp.text = String::from(lorem); - // filter.threshold = 95; - // - // assert!(!filter.should_filter_response(&resp)); - // - // // assert!((normalized_levenshtein("string", "string") - 1.0).abs() < 0.00001); - // // same should pass - // filter.text = String::from(lorem); - // filter.threshold = 95; - // assert!(filter.should_filter_response(&resp)); } } diff --git a/src/lib.rs b/src/lib.rs index 757d39aa..50e3cf35 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -52,7 +52,7 @@ pub const VERSION: &str = env!("CARGO_PKG_VERSION"); pub const DEFAULT_OPEN_FILE_LIMIT: usize = 8192; /// Default value used to determine near-duplicate web pages (equivalent to 95%) -pub const SIMILARITY_THRESHOLD: i8 = 95; +pub const SIMILARITY_THRESHOLD: u32 = 95; /// Default wordlist to use when `-w|--wordlist` isn't specified and not `wordlist` isn't set /// in a [ferox-config.toml](constant.DEFAULT_CONFIG_NAME.html) config file. 
diff --git a/src/scanner.rs b/src/scanner.rs index 5d780665..9a6b233d 100644 --- a/src/scanner.rs +++ b/src/scanner.rs @@ -14,6 +14,7 @@ use futures::{ future::{BoxFuture, FutureExt}, stream, StreamExt, }; +use fuzzyhash::FuzzyHash; use lazy_static::lazy_static; use regex::Regex; use reqwest::Url; @@ -675,16 +676,16 @@ pub async fn initialize(num_words: usize, config: &Configuration) { // if successful, create a filter based on the response's body let fr = FeroxResponse::from(resp, true).await; - if let Some(hash) = ssdeep::hash(fr.text().as_bytes()) { - // hash the response body and store the resulting has in the filter object - let filter = SimilarityFilter { - text: hash, - threshold: SIMILARITY_THRESHOLD, - }; + // hash the response body and store the resulting hash in the filter object + let hash = FuzzyHash::new(&fr.text()).to_string(); - let boxed_filter = Box::new(filter); - add_filter_to_list_of_ferox_filters(boxed_filter, FILTERS.clone()); - } + let filter = SimilarityFilter { + text: hash, + threshold: SIMILARITY_THRESHOLD, + }; + + let boxed_filter = Box::new(filter); + add_filter_to_list_of_ferox_filters(boxed_filter, FILTERS.clone()); } } } From 3f594befec367842c14432e13246846e5631aded Mon Sep 17 00:00:00 2001 From: epi Date: Sat, 26 Dec 2020 20:41:08 -0600 Subject: [PATCH 15/19] removed build test from build.yml --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1c27b32c..8d1bbeeb 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -5,7 +5,7 @@ on: [push] jobs: build-nix: runs-on: ${{ matrix.os }} -# if: github.ref == 'refs/heads/master' + if: github.ref == 'refs/heads/master' strategy: matrix: type: [ubuntu-x64, ubuntu-x86] From 73627af26bb446afb6d00609c331bd6e02ee62c3 Mon Sep 17 00:00:00 2001 From: epi Date: Sat, 26 Dec 2020 21:02:41 -0600 Subject: [PATCH 16/19] added integration test for similarity filter --- 
tests/test_filters.rs | 58 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/tests/test_filters.rs b/tests/test_filters.rs index 6ded920c..293a6d58 100644 --- a/tests/test_filters.rs +++ b/tests/test_filters.rs @@ -189,3 +189,61 @@ fn filters_size_should_filter_response() { assert_eq!(mock_two.hits(), 1); teardown_tmp_directory(tmp_dir); } + +#[test] +/// create a FeroxResponse that should elicit a true from +/// SimilarityFilter::should_filter_response +fn filter_similar_should_filter_response() { + let srv = MockServer::start(); + let (tmp_dir, file) = setup_tmp_directory( + &["not-similar".to_string(), "similar".to_string()], + "wordlist", + ) + .unwrap(); + + // ''.join(random.choices(string.ascii_letters + string.digits + string.whitespace, k=4096)) + let content = "VCiYFr0HKsEIK6r\r1hJLYnOr90Aji\rDWAjQA3LVAzrluN48FuSPrRpm\n \tV\x0cx\nSCc5sX\nTB\x0c6Of7ns\t2HDwQCduKTqG8gG\x0beszazwljW01H60HMOLziOKwQwEYV7CbrLWQiLeCWKVxX\rvag\nAAEOhjER7gURuGXw\nMyY\t8mSw\x0b\x0bK0Z9G0Pt\x0bJZItAIqAq FxeaoOeLqWVFvxtDFfko0YVYt1I\rNmSXZ4lnOoiBCLbu6TLb80lClhY\tPN7Lp36F786I\nglwRK2oD45EtN SWW IF6uqKdf\x0czAcVycf\x0cBzHYnn1HAkU2Jluos0qwMGJ2m74z\nLd3\x0cIUVZmnRmHHWQGd1u2xmsZR\x0bfnml10ur6J\x0ba8xOZatiY 15Aq3KOGWdD3xQwqo\r5SKnnxH5tqU\rO\rZpJ\n7t7UUgfE\niWFgqWDpMeOG 1248M I\ro5B9Yed\r2aq2\tXxLn31s3hCV WEfQd60DKp6eFhUeUSeXDq6qjgTnWigoCZQERf\rXp7s2L37 iOEMl3\r41\nBShOjLfD8Kj0\rbu0ENreRjP\nY77jsrsaYgOsUrEzw\x0bw3OLi\n8fkddcaOvJeutTy B\rsDMkK\x0cnx2S0N\x0cDaY\x0c9iyo6p4IL\tOC1qgNlWP4VLg\tWmPG46ZMCirth5h4FwkS\nD2WsiEA2Z\n0xbLd7Uww hUQC6 3V\r1SsWem4UcQxG\rfuVvWl\nD9\nDpZQFFgiqhQiq1I0LMAR\r\rKBmj4iurrxaoMHTl9oj\x0b0N3AfD17gyqZiJ67bgizvecsRGeB1f\x0c\nYRvieJqIVHDKOOR\ruhqnVZz4BQ5FFBusz\x0cZl5\x0bt\tbdOUhAAAKyA6Jwl 7OjzojiRHGD6dl ncsgndsKURhFv4\tV5d\n73iPzbT\t8v6IrJtnq\nJuFl7A\x0b\rVnnsjTW0Y4QB1BgCy3B\x0cma7\tpPt5jmcJH7v5J\tYKEXh UqRChBFY5nbFbmXjJYxevPYJmSHC\rDQ4j9de\rTMZ\rtWaPAzkJjH\x0c\nyrEuf9WaMM\trFlKo9r9w\r\nQkQqIEu8Gfr\t 
aRzvN\r2oZhCyB4fa\np37\tXQi4Wa\no7gHUDQLoRvkK1dy2K3ydrI0O6\rFTGS7oHA\x0bajFOd\rcS5W25tFGhocwxM0\nuugNGDLjBQ\tWGdJV0\x0c\r7bNLs\x0cr deAWt35A4co\x0bPCuYmQ ExxtK\rvpckCyJxLrK5xULK\x0cvqtiGyovHQW8aDjV3rhXhR\nmQvmK\x0czLx\x0cECSYSF5jP35zN VkaRzQ lZ4 l06X4HHpsVn 8y8fGbIP\tRWFUAeFI24\rqN\x0cBW7u7WPMv36BmkgzQ\x0c2\x0cyLf\tYo8iRjE7zMsceym4ZnWg7EsOedh2cES\rz2n\x0cJi52uIPfSkAPzW\rEekjgWdb8y 285F4xae8\n8AiIkT4l3AOy\rT4yeXgaRMCI4t3PkHeFZ\rEb6R4FNCE \nbVil\x0c6qxSVPnU\nh\ttFMNE4\x0c\rwF\t\x0cW5vebbRWG\x0biVZLP\x0ct\x0c5gQ4CJ9KJl\x0cwyIfSIYaCvi4m1r\tJbYqmI0NVO36A\t8BSPNlaKbR73l9mxZxoqD4yca\n5h\r7a0z\tVm34aTy\tnLj5nSrh8er5lN0J7hcjmUk2DL\nyWEVNXTF8RWfC\x0bpcgBQXOQzidyYO\x0bh76UyUPAjELmNoECgGq06hiFGDI LiPZcofhcm\r62fEixIoyG\tmI\x0cYLQvBCbCluGgbm\x0c7GI6\n19il8PdPqss2uQqA5KgkHMIb hh211YuqV9kdmVnwyD63pz3p t58q6kHX\r\teYBrg6eDh\x0bx8\x0cI1SOV3Gt5qubmixHR\rApbgkTQJQ\tX0t11IP55hys2d\x0bF dh7j7G0Ac\x0bQMNvkSU9AV\x0b8mcIPHy9d\x0cyINf5qu\x0cdiBFrhiNRmCZ4r\tSx4N5VOm6KCp2T8bOVEjOR6otPAN5e\n\x0csyJ3giBjkgg 9dYQKq5P75AG5\x0bfD6zZO4DxQ44uX7Kz50dv4ncXQA\rqgHT\rLRcsRl\rW\t7We\tpAEJHMChxwVK\x0cprVvINvolf7hj\tUrob\rW3pXlqKIEQT8t7\x0bGODJanb328OiQCxE\rPfW4j\rl3p\x0cRXDB55u0MN7isBL\ty3UvE1 7I\nfuoZVPzk7az1\rMzA2FROXu0k\rFq pby6pHMqfTQT7iTw izlk0CUpyoUaq5w3UPFK7\rMOPw2cZ6FsVITbCoPhT\rIvuImCFGqmYpE hNevWkPCtwwnx2sX\x0c7oKzBExp32ZpdY\tstuDjSzfalsO1M\x0bNMUegnBDr3Liv3Lv\x0b\n37VZT2LEJ9fNYDi9r\t\x0bYC\rHSt0oJbk\x0b\x0bUdS8eB\nMXBPDEppZjHR7vGZYqX7yFm t1i682AXWf VPTzYTvm6mhOre8\x0bk0spJNYuI\tk\tC1B1N0 AYYDWH\t\tX1TjinXdkXcbFTlIiBLzx\rmUoyx9b7paJSVMX\tfLo8hU1Dmuluyk8R8\x0c4\x0cBe\nCrIMlyek4i\x0bFwuE9\nXUqpVxikH0PZspopUwPM9Kcue\rBh2Mf\rme3h4qelC\x0bEH\x0bkkxi6U\x0cE\x0ctqBgN93 V4ovmocLrK6\ngCQlf\x0cshRVvrPq\x0cOjgbjhSEK8PIx8OYqjjDDkJ0AgLhfbdGw2\nLMv2M0E08PGXnqUyVsjN\t C 4\n80 Fia g\x0b5dEFvyl5Y80U6sMAdHgk2nzC5ElDBhgcBprXC\x0bIMKXyt\x0ce5SkYcRartfblLqD1 A5\nre\x0bj67lJYCs\t8b50xA69eMHqGDLLP8sJceN19kkonjLj\t\rS\tk9sMOeewQHbT 
\x0cp53aMX9\x0bDYCZWAtdA6h\rAFHDEYFBE1MzdOxMO\x0cvDE7QfLb3jq4s\tI3aVTmDDOQAnuvWb2AGUUP\rf2HinUAiF13LKEfpqcD06S8aQC0Kyl729L7a6CbuoB0GRlJx tD yuTVqD62HuXpfKrDsbejEdp3\rxjc\x0bn4lLNaViizec\rWR\x0cTT5aZ\ny9\rO1qB1XGQPnES\nUhJtU Ll7t3Zglj1IAEx 8Rh3V\x0bfmUSC4\x0bVR9l33LS3bPAJpLbH3Q2\nv2fqMeIt3nGR\x0cgCixM4qzVSx7Yb192a1HWx8nnuWQIEK7QHL6p\x0cD3d0Y1FoZqsmY2U\rspvt3gwKOHR6RaZlmhX\n3bmIEF6\x0b\x0bMXJKOnXPgjkdhun4aGDBw\x0cOEW\repDYTcc48oZ4lg7PukNq7TU\tWP0ZJbzVKK\rxAMaZujwTqQXsXODiE2DdwnstAa6CMYfzj7J\x0c2Q\tY2764IYCy 3Fqm0\x0ckbe7VvfqWUh0\tUlubxZ\rX59MfNSfCfcH8GFZIGIRPt\rZVXfra1 H7VI2yJ\x0cspGDCi\rcgHfZa8528CP9tilUx0ifWPGqskLVDPLJP\nciNxodMQSrJXp\ro\r9aBFHCV\x0cR\rrp\x0bmMfxg5rG\tSuWonbJQlmHQ\ri34w8S\x0cN9Ezj2k2OmLH\x0cEcVUDjXNZIFCtlA843I44p GZyhlOctwpd7 OZnUxk4uacN\r8NihNGO\n9eXy5l6gQe5srySxxvuX5jtCzuJ35xvCfEXYa\x0b2lTDBOAaSYpnl v9L\x0cY8RLg2oE7xeCUbD\tSHKZgeXHZIzYAmA7bsmiZUfzmo5ZZUhtBh4F\x0bTx1\x0bz zQov5mYwfpWJTR2Q\x0bLRXMuBzj\x0bZC\x0b pFNPj8ixWJQggQlr9eNW6SHLJk731nc\x0cBn\x0ckQxg2BdRT\x0bp6lf7G\x0bnIMDeY8w6fUf\x0cjGE1Pfsekv7EYEIHsOAsZb3lBfBPO9\tXpHPBMRmRtzMc5WoX6C5cc\x0cBuTPtPOgXnap1Y3xq7pcMcgu55xblsXEAJKsojjR7aDB\tU84kUKRNEj\n8mcqEyOmvq1WA\na6bhzYf9VQv2aj9KLfByVqUKNFVIc4Mkha\x0c0aCPQSKe0GGwPlSfbtNXhdhxAb3RLf1J\x0cshJzjQe4DCmlRmjt\tlB0BwzBpkg2hTYM\r S\x0cux\x0bj6IcEZ\n\ngQ\rKKgg \rrv4sUMy5sfY1aatjK1MmUyXR\rRHk\x0cqq\x0cD1fy4C0\n\x0byd4SFKOyKJqx2mzI74vPxLLo\x0c0OamjXuUu\nWGkiA70nuf0PGRfwLEBPCMeyneJI1HcIXH\nCTFEIMiAq6fT\rmJgC hXEU\rriAhCm3OzgbcDgvQgDSyUw5jl\x0cTaLOPuFseq\x0cj2npTd57itktTdWBY7sqlOGKNSc\x0ctx2mUoHi31EF3l5lvYPDeG6bIPFwIn7\tG6G \x0bgNkSn89flvqcvI73RA"; + let mutated = "VCiYFr0HKsEIK6r\r1hJLYnOr90Aji\rDWAjQA3LVAzrluN484327FuSPrRpm\n \tV\x0cx\nSCc5sX\nTB\x0c6Of7ns\t2HDwQCduKTqG8gG\x0beszazwljW01H60HMOLziOKwQwEYV7CbrLWQiLeCWKVxX\rvag\nAAEOhjER7gURuGXw\nMyY\t8mSw\x0b\x0bK0Z9G0Pt\x0bJZItAIqAq FxeaoOeLqWVFvxtDFfko0YVYt1I\rNmSXZ4lnOoiBCLbu6TLb80lClhY\tPN7Lp36F786I\nglwRK2oD45EtN SWW 
IF6uqKdf\x0czAcVycf\x0cBzHYnn1HAkU2Jluos0qwMGJ2m74z\nLd3\x0cIUVZmnRmHHWQGd1u2xmsZR\x0bfnml10ur6J\x0ba8xOZatiY 15Aq3KOGWdD3xQwqo\r5SKnnxH5tqU\rO\rZpJ\n7t7UUgfE\niWFgqWDpMeOG 1248M I\ro5B9Yed\r2aq2\tXxLn31s3hCV WEfQd60DKp6eFhUeUSeXDq6qjgTnWigoCZQERf\rXp7s2L37 iOEMl3\r41\nBShOjLfD8Kj0\rbu0ENreRjP\nY77jsrsaYgOsUrEzw\x0bw3OLi\n8fkddcaOvJeutTy B\rsDMkK\x0cnx2S0N\x0cDaY\x0c9iyo6p4IL\tOC1qgNlWP4VLg\tWmPG46ZMCirth5h4FwkS\nD2WsiEA2Z\n0xbLd7Uww hUQC6 3V\r1SsWem4UcQxG\rfuVvWl\nD9\nDpZQFFgiqhQiq1I0LMAR\r\rKBmj4iurrxaoMHTl9oj\x0b0N3AfD17gyqZiJ67bgizvecsRGeB1f\x0c\nYRvieJqIVHDKOOR\ruhqnVZz4BQ5FFBusz\x0cZl5\x0bt\tbdOUhAAAKyA6Jwl 7OjzojiRHGD6dl ncsgndsKURhFv4\tV5d\n73iPzbT\t8v6IrJtnq\nJuFl7A\x0b\rVnnsjTW0Y4QB1BgCy3B\x0cma7\tpPt5jmcJH7v5J\tYKEXh UqRChBFY5nbFbmXjJYxevPYJmSHC\rDQ4j9de\rTMZ\rtWaPAzkJjH\x0c\nyrEuf9WaMM\trFlKo9r9w\r\nQkQqIEu8Gfr\t aRzvN\r2oZhCyB4fa\np37\tXQi4Wa\no7gHUDQLoRvkK1dy2K3ydrI0O6\rFTGS7oHA\x0bajFOd\rcS5W25tFGhocwxM0\nuugNGDLjBQ\tWGdJV0\x0c\r7bNLs\x0cr deAWt35A4co\x0bPCuYmQ ExxtK\rvpckCyJxLrK5xULK\x0cvqtiGyovHQW8aDjV3rhXhR\nmQvmK\x0czLx\x0cECSYSF5jP35zN VkaRzQ lZ4 l06X4HHpsVn 8y8fGbIP\tRWFUAeFI24\rqN\x0cBW7u7WPMv36BmkgzQ\x0c2\x0cyLf\tYo8iRjE7zMsceym4ZnWg7EsOedh2cES\rz2n\x0cJi52uIPfSkAPzW\rEekjgWdb8y 285F4xae8\n8AiIkT4l3AOy\rT4yeXgaRMCI4t3PkHeFZ\rEb6R4FNCE \nbVil\x0c6qxSVPnU\nh\ttFMNE4\x0c\rwF\t\x0cW5vebbRWG\x0biVZLP\x0ct\x0c5gQ4CJ9KJl\x0cwyIfSIYaCvi4m1r\tJbYqmI0NVO36A\t8BSPNlaKbR73l9mxZxoqD4yca\n5h\r7a0z\tVm34aTy\tnLj5nSrh8er5lN0J7hcjmUk2DL\nyWEVNXTF8RWfC\x0bpcgBQXOQzidyYO\x0bh76UyUPAjELmNoECgGq06hiFGDI LiPZcofhcm\r62fEixIoyG\tmI\x0cYLQvBCbCluGgbm\x0c7GI6\n19il8PdPqss2uQqA5KgkHMIb hh211YuqV9kdmVnwyD63pz3p t58q6kHX\r\teYBrg6eDh\x0bx8\x0cI1SOV3Gt5qubmixHR\rApbgkTQJQ\tX0t11IP55hys2d\x0bF dh7j7G0Ac\x0bQMNvkSU9AV\x0b8mcIPHy9d\x0cyINf5qu\x0cdiBFrhiNRmCZ4r\tSx4N5VOm6KCp2T8bOVEjOR6otPAN5e\n\x0csyJ3giBjkgg 
9dYQKq5P75AG5\x0bfD6zZO4DxQ44uX7Kz50dv4ncXQA\rqgHT\rLRcsRl\rW\t7We\tpAEJHMChxwVK\x0cprVvINvolf7hj\tUrob\rW3pXlqKIEQT8t7\x0bGODJanb328OiQCxE\rPfW4j\rl3p\x0cRXDB55u0MN7isBL\ty3UvE1 7I\nfuoZVPzk7az1\rMzA2FROXu0k\rFq pby6pHMqfTQT7iTw izlk0CUpyoUaq5w3UPFK7\rMOPw2cZ6FsVITbCoPhT\rIvuImCFGqmYpE hNevWkPCtwwnx2sX\x0c7oKzBExp32ZpdY\tstuDjSzfalsO1M\x0bNMUegnBDr3Liv3Lv\x0b\n37VZT2LEJ9fNYDi9r\t\x0bYC\rHSt0oJbk\x0b\x0bUdS8eB\nMXBPDEppZjHR7vGZYqX7yFm t1i682AXWf VPTzYTvm6mhOre8\x0bk0spJNYuI\tk\tC1B1N0 AYYDWH\t\tX1TjinXdkXcbFTlIiBLzx\rmUoyx9b7paJSVMX\tfLo8hU1Dmuluyk8R8\x0c4\x0cBe\nCrIMlyek4i\x0bFwuE9\nXUqpVxikH0PZspopUwPM9Kcue\rBh2Mf\rme3h4qelC\x0bEH\x0bkkxi6U\x0cE\x0ctqBgN93 V4ovmocLrK6\ngCQlf\x0cshRVvrPq\x0cOjgbjhSEK8PIx8OYqjjDDkJ0AgLhfbdGw2\nLMv2M0E08PGXnqUyVsjN\t C 4\n80 Fia g\x0b5dEFvyl5Y80U6sMAdHgk2nzC5ElDBhgcBprXC\x0bIMKXyt\x0ce5SkYcRartfblLqD1 A5\nre\x0bj67lJYCs\t8b50xA69eMHqGDLLP8sJceN19kkonjLj\t\rS\tk9sMOeewQHbT \x0cp53aMX9\x0bDYCZWAtdA6h\rAFHDEYFBE1MzdOxMO\x0cvDE7QfLb3jq4s\tI3aVTmDDOQAnuvWb2AGUUP\rf2HinUAiF13LKEfpqcD06S8aQC0Kyl729L7a6CbuoB0GRlJx tD yuTVqD62HuXpfKrDsbejEdp3\rxjc\x0bn4lLNaViizec\rWR\x0cTT5aZ\ny9\rO1qB1XGQPnES\nUhJtU Ll7t3Zglj1IAEx 8Rh3V\x0bfmUSC4\x0bVR9l33LS3bPAJpLbH3Q2\nv2fqMeIt3nGR\x0cgCixM4qzVSx7Yb192a1HWx8nnuWQIEK7QHL6p\x0cD3d0Y1FoZqsmY2U\rspvt3gwKOHR6RaZlmhX\n3bmIEF6\x0b\x0bMXJKOnXPgjkdhun4aGDBw\x0cOEW\repDYTcc48oZ4lg7PukNq7TU\tWP0ZJbzVKK\rxAMaZujwTqQXsXODiE2DdwnstAa6CMYfzj7J\x0c2Q\tY2764IYCy 3Fqm0\x0ckbe7VvfqWUh0\tUlubxZ\rX59MfNSfCfcH8GFZIGIRPt\rZVXfra1 H7VI2yJ\x0cspGDCi\rcgHfZa8528CP9tilUx0ifWPGqskLVDPLJP\nciNxodMQSrJXp\ro\r9aBFHCV\x0cR\rrp\x0bmMfxg5rG\tSuWonbJQlmHQ\ri34w8S\x0cN9Ezj2k2OmLH\x0cEcVUDjXNZIFCtlA843I44p GZyhlOctwpd7 OZnUxk4uacN\r8NihNGO\n9eXy5l6gQe5srySxxvuX5jtCzuJ35xvCfEXYa\x0b2lTDBOAaSYpnl v9L\x0cY8RLg2oE7xeCUbD\tSHKZgeXHZIzYAmA7bsmiZUfzmo5ZZUhtBh4F\x0bTx1\x0bz zQov5mYwfpWJTR2Q\x0bLRXMuBzj\x0bZC\x0b 
pFNPj8ixWJQggQlr9eNW6SHLJk731nc\x0cBn\x0ckQxg2BdRT\x0bp6lf7G\x0bnIMDeY8w6fUf\x0cjGE1Pfsekv7EYEIHsOAsZb3lBfBPO9\tXpHPBMRmRtzMc5WoX6C5cc\x0cBuTPtPOgXnap1Y3xq7pcMcgu55xblsXEAJKsojjR7aDB\tU84kUKRNEj\n8mcqEyOmvq1WA\na6bhzYf9VQv2aj9KLfByVqUKNFVIc4Mkha\x0c0aCPQSKe0GGwPlSfbtNXhdhxAb3RLf1J\x0cshJzjQe4DCmlRmjt\tlB0BwzBpkg2hTYM\r S\x0cux\x0bj6IcEZ\n\ngQ\rKKgg \rrv4sUMy5sfY1aatjK1MmUyXR\rRHk\x0cqq\x0cD1fy4C0\n\x0byd4SFKOyKJqx2mzI74vPxLLo\x0c0OamjXuUu\nWGkiA70nuf0PGRfwLEBPCMeyneJI1HcIXH\nCTFEIMiAq6fT\rmJgC hXEU\rriAhCm3OzgbcDgvQgDSyUw5jl\x0cTaimauFseq\x0cj2npTd57itktTdWBY7sqlOGKNSc\x0ctx2mUoHi31EF3l5lvYPDeG6bIPFwIn7\tG6G \x0bgNkSn89flvqcvI73RA"; + + let canary = srv.mock(|when, then| { + when.method(GET).path("/canary"); + then.status(200).body(content); + }); + + // not similar, should see results in output + let not_similar = srv.mock(|when, then| { + when.method(GET).path("/not-similar"); + then.status(302).body("this is a test"); + }); + + // similar, should not see results + let similar = srv.mock(|when, then| { + when.method(GET).path("/similar"); + then.status(200).body(mutated); + }); + + let cmd = Command::cargo_bin("feroxbuster") + .unwrap() + .arg("--url") + .arg(srv.url("/")) + .arg("--wordlist") + .arg(file.as_os_str()) + .arg("--filter-similar-to") + .arg(srv.url("/canary")) + .unwrap(); + + cmd.assert().success().stdout( + predicate::str::contains("/LICfdafdsafdsafadsENSE") + .and(predicate::str::contains("302")) + .and(predicate::str::contains("14c")) + .and(predicate::str::contains("/similar")) + .not() + .and(predicate::str::contains("4100c")) + .not(), + ); + + assert_eq!(canary.hits(), 1); + assert_eq!(similar.hits(), 1); + assert_eq!(not_similar.hits(), 1); + teardown_tmp_directory(tmp_dir); +} From c7ac717d9f039d7d62a18472529ea55fff565f41 Mon Sep 17 00:00:00 2001 From: epi Date: Sun, 27 Dec 2020 06:55:03 -0600 Subject: [PATCH 17/19] increased filters code coverage --- src/filters.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git 
a/src/filters.rs b/src/filters.rs index f1e4c9d7..8f31e0a1 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -490,4 +490,19 @@ mod tests { // two empty strings are the same, however ssdeep doesn't accept empty strings, expect false assert!(!filter.should_filter_response(&resp)); } + + #[test] + /// just a simple test to increase code coverage by hitting as_any and the inner value + fn similarity_filter_as_any() { + let filter = SimilarityFilter { + text: String::from("stuff"), + threshold: 95, + }; + + assert_eq!(filter.text, "stuff"); + assert_eq!( + *filter.as_any().downcast_ref::().unwrap(), + filter + ); + } } From 42df23982f03c07798218407c12e55d47bee3a22 Mon Sep 17 00:00:00 2001 From: epi Date: Sun, 27 Dec 2020 07:30:17 -0600 Subject: [PATCH 18/19] fixed similarity filter test; removed strsim remnants --- src/filters.rs | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/filters.rs b/src/filters.rs index 8f31e0a1..e32bed8e 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -310,7 +310,7 @@ impl FeroxFilter for SimilarityFilter { false } - /// Compare one SizeFilter to another + /// Compare one SimilarityFilter to another fn box_eq(&self, other: &dyn Any) -> bool { other.downcast_ref::().map_or(false, |a| self == a) } @@ -460,7 +460,7 @@ mod tests { } #[test] - /// simple test for similarity filter, taken from strsim docs + /// a few simple tests for similarity filter fn similarity_filter_is_accurate() { let mut resp = FeroxResponse { text: String::from("sitting"), @@ -474,11 +474,10 @@ mod tests { }; let mut filter = SimilarityFilter { - text: String::from("kitten"), + text: FuzzyHash::new("kitten").to_string(), threshold: 95, }; - // assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001) // kitten/sitting is 57% similar, so a threshold of 95 should not be filtered assert!(!filter.should_filter_response(&resp)); @@ -486,9 +485,15 @@ mod tests { filter.text = String::new(); filter.threshold = 100; 
- // assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001) // two empty strings are the same, however ssdeep doesn't accept empty strings, expect false assert!(!filter.should_filter_response(&resp)); + + resp.text = String::from("some data to hash for the purposes of running a test"); + filter.text = + FuzzyHash::new("some data to hash for the purposes of running a te").to_string(); + filter.threshold = 17; + + assert!(filter.should_filter_response(&resp)); } #[test] From 883570731e4e8ea183051890a62b5e31cc36bcd0 Mon Sep 17 00:00:00 2001 From: epi Date: Sun, 27 Dec 2020 08:07:51 -0600 Subject: [PATCH 19/19] added long form doc of --filter-similar-to --- README.md | 329 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 213 insertions(+), 116 deletions(-) diff --git a/README.md b/README.md index 084c7b00..1d9d8668 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ - + @@ -45,20 +45,26 @@ ## 😕 What the heck is a ferox anyway? -Ferox is short for Ferric Oxide. Ferric Oxide, simply put, is rust. The name rustbuster was taken, so I decided on a variation. 🤷 +Ferox is short for Ferric Oxide. Ferric Oxide, simply put, is rust. The name rustbuster was taken, so I decided on a +variation. 🤷 -## 🤔 What's it do tho? +## 🤔 What's it do tho? -`feroxbuster` is a tool designed to perform [Forced Browsing](https://owasp.org/www-community/attacks/Forced_browsing). +`feroxbuster` is a tool designed to perform [Forced Browsing](https://owasp.org/www-community/attacks/Forced_browsing). -Forced browsing is an attack where the aim is to enumerate and access resources that are not referenced by the web application, but are still accessible by an attacker. +Forced browsing is an attack where the aim is to enumerate and access resources that are not referenced by the web +application, but are still accessible by an attacker. -`feroxbuster` uses brute force combined with a wordlist to search for unlinked content in target directories. 
These resources may store sensitive information about web applications and operational systems, such as source code, credentials, internal network addressing, etc... +`feroxbuster` uses brute force combined with a wordlist to search for unlinked content in target directories. These +resources may store sensitive information about web applications and operational systems, such as source code, +credentials, internal network addressing, etc... -This attack is also known as Predictable Resource Location, File Enumeration, Directory Enumeration, and Resource Enumeration. +This attack is also known as Predictable Resource Location, File Enumeration, Directory Enumeration, and Resource +Enumeration. 📖 Table of Contents ----------------- + - [Installation](#-installation) - [Download a Release](#download-a-release) - [Snap Install](#snap-install) @@ -90,7 +96,7 @@ This attack is also known as Predictable Resource Location, File Enumeration, Di - [Stop and Resume Scans (save scan's state to disk) (new in `v1.9.0`)](#stop-and-resume-scans---resume-from-file-new-in-v190) - [Enforce a Time Limit on Your Scan (new in `v1.10.0`)](#enforce-a-time-limit-on-your-scan-new-in-v1100) - [Extract Links from robots.txt (New in `v1.10.2`)](#extract-links-from-robotstxt-new-in-v1102) - - [Filter Response by Similarity to A Given Page (new in `v1.11.0`)](#filter-response-by-similarity-to-a-given-page-new-in-v1110) + - [Filter Response by Similarity to A Given Page (fuzzy filter) (new in `v1.11.0`)](#filter-response-by-similarity-to-a-given-page-fuzzy-filter-new-in-v1110) - [Comparison w/ Similar Tools](#-comparison-w-similar-tools) - [Common Problems/Issues (FAQ)](#-common-problemsissues-faq) - [No file descriptors available](#no-file-descriptors-available) @@ -103,9 +109,11 @@ This attack is also known as Predictable Resource Location, File Enumeration, Di ### Download a Release -Releases for multiple architectures can be found in the 
[Releases](https://github.com/epi052/feroxbuster/releases) section. The latest release for each of the following systems can be downloaded and executed as shown below. +Releases for multiple architectures can be found in the [Releases](https://github.com/epi052/feroxbuster/releases) +section. The latest release for each of the following systems can be downloaded and executed as shown below. #### Linux (32 and 64-bit) & MacOS + ``` curl -sL https://raw.githubusercontent.com/epi052/feroxbuster/master/install-nix.sh | bash ``` @@ -134,7 +142,7 @@ Install using `snap` sudo snap install feroxbuster ``` -The only gotcha here is that the snap package can only read wordlists from a few specific locations. There are a few +The only gotcha here is that the snap package can only read wordlists from a few specific locations. There are a few possible solutions, of which two are shown below. If the wordlist is on the same partition as your home directory, it can be hard-linked into `~/snap/feroxbuster/common` @@ -144,7 +152,7 @@ ln /path/to/the/wordlist ~/snap/feroxbuster/common ./feroxbuster -u http://localhost -w ~/snap/feroxbuster/common/wordlist ``` -If the wordlist is on a separate partition, hard-linking won't work. You'll need to copy it into the snap directory. +If the wordlist is on a separate partition, hard-linking won't work. You'll need to copy it into the snap directory. ``` cp /path/to/the/wordlist ~/snap/feroxbuster/common @@ -179,7 +187,8 @@ cargo install feroxbuster ### apt Install -Download `feroxbuster_amd64.deb` from the [Releases](https://github.com/epi052/feroxbuster/releases) section. After that, use your favorite package manager to install the `.deb`. +Download `feroxbuster_amd64.deb` from the [Releases](https://github.com/epi052/feroxbuster/releases) section. After +that, use your favorite package manager to install the `.deb`. 
``` wget -sLO https://github.com/epi052/feroxbuster/releases/latest/download/feroxbuster_amd64.deb.zip @@ -228,7 +237,9 @@ cat targets.txt | sudo docker run --net=host --init -i feroxbuster --stdin -x js #### Mount a volume to pass in `ferox-config.toml` -You've got some options available if you want to pass in a config file. [`ferox-buster.toml`](#ferox-configtoml) can live in multiple locations and still be valid, so it's up to you how you'd like to pass it in. Below are a few valid examples: +You've got some options available if you want to pass in a config file. [`ferox-buster.toml`](#ferox-configtoml) can +live in multiple locations and still be valid, so it's up to you how you'd like to pass it in. Below are a few valid +examples: ``` sudo docker run --init -v $(pwd)/ferox-config.toml:/etc/feroxbuster/ferox-config.toml -it feroxbuster -u http://example.com @@ -251,7 +262,9 @@ alias feroxbuster="sudo docker run --init -v ~/.config/feroxbuster:/root/.config ``` ## ⚙️ Configuration + ### Default Values + Configuration begins with with the following built-in default values baked into the binary: - timeout: `7` seconds @@ -269,11 +282,20 @@ Configuration begins with with the following built-in default values baked into ### Threads and Connection Limits At A High-Level -This section explains how the `-t` and `-L` options work together to determine the overall aggressiveness of a scan. The combination of the two values set by these options determines how hard your target will get hit and to some extent also determines how many resources will be consumed on your local machine. +This section explains how the `-t` and `-L` options work together to determine the overall aggressiveness of a scan. The +combination of the two values set by these options determines how hard your target will get hit and to some extent also +determines how many resources will be consumed on your local machine. 
#### A Note on Green Threads -`feroxbuster` uses so-called [green threads](https://en.wikipedia.org/wiki/Green_threads) as opposed to traditional kernel/OS threads. This means (at a high-level) that the threads are implemented entirely in userspace, within a single running process. As a result, a scan with 30 green threads will appear to the OS to be a single process with no additional light-weight processes associated with it as far as the kernel is concerned. As such, there will not be any impact to process (`nproc`) limits when specifying larger values for `-t`. However, these threads will still consume file descriptors, so you will need to ensure that you have a suitable `nlimit` set when scaling up the amount of threads. More detailed documentation on setting appropriate `nlimit` values can be found in the [No File Descriptors Available](#no-file-descriptors-available) section of the FAQ +`feroxbuster` uses so-called [green threads](https://en.wikipedia.org/wiki/Green_threads) as opposed to traditional +kernel/OS threads. This means (at a high-level) that the threads are implemented entirely in userspace, within a single +running process. As a result, a scan with 30 green threads will appear to the OS to be a single process with no +additional light-weight processes associated with it as far as the kernel is concerned. As such, there will not be any +impact to process (`nproc`) limits when specifying larger values for `-t`. However, these threads will still consume +file descriptors, so you will need to ensure that you have a suitable `nlimit` set when scaling up the amount of +threads. 
More detailed documentation on setting appropriate `nlimit` values can be found in +the [No File Descriptors Available](#no-file-descriptors-available) section of the FAQ #### Threads and Connection Limits: The Implementation @@ -282,13 +304,18 @@ This section explains how the `-t` and `-L` options work together to determine t #### Threads and Connection Limits: Examples -To truly have only 30 active requests to a site at any given time, `-t 30 -L 1` is necessary. Using `-t 30 -L 2` will result in a maximum of 60 total requests being processed at any given time for that site. And so on. For a conversation on this, please see [Issue #126](https://github.com/epi052/feroxbuster/issues/126) which may provide more (or less) clarity :wink: +To truly have only 30 active requests to a site at any given time, `-t 30 -L 1` is necessary. Using `-t 30 -L 2` will +result in a maximum of 60 total requests being processed at any given time for that site. And so on. For a conversation +on this, please see [Issue #126](https://github.com/epi052/feroxbuster/issues/126) which may provide more (or less) +clarity :wink: ### ferox-config.toml + After setting built-in default values, any values defined in a `ferox-config.toml` config file will override the -built-in defaults. +built-in defaults. `feroxbuster` searches for `ferox-config.toml` in the following locations (in the order shown): + - `/etc/feroxbuster/` (global) - `CONFIG_DIR/ferxobuster/` (per-user) - The same directory as the `feroxbuster` executable (per-user) @@ -299,14 +326,15 @@ built-in defaults. > - MacOs: `$HOME/Library/Application Support` i.e. `/Users/bob/Library/Application Support` > - Windows: `{FOLDERID_RoamingAppData}` i.e. `C:\Users\Bob\AppData\Roaming` -If more than one valid configuration file is found, each one overwrites the values found previously. +If more than one valid configuration file is found, each one overwrites the values found previously. 
If no configuration file is found, nothing happens at this stage. -As an example, let's say that we prefer to use a different wordlist as our default when scanning; we can -set the `wordlist` value in the config file to override the baked-in default. +As an example, let's say that we prefer to use a different wordlist as our default when scanning; we can set +the `wordlist` value in the config file to override the baked-in default. Notes of interest: + - it's ok to only specify values you want to change without specifying anything else - variable names in `ferox-config.toml` must match their command-line counterpart @@ -317,6 +345,7 @@ wordlist = "/wordlists/jhaddix/all.txt" ``` A pre-made configuration file with examples of all available settings can be found in `ferox-config.toml.example`. + ```toml # ferox-config.toml # Example configuration for feroxbuster @@ -376,7 +405,9 @@ A pre-made configuration file with examples of all available settings can be fou ``` ### Command Line Parsing -Finally, after parsing the available config file, any options/arguments given on the commandline will override any values that were set as a built-in or config-file value. + +Finally, after parsing the available config file, any options/arguments given on the commandline will override any +values that were set as a built-in or config-file value. ``` USAGE: @@ -433,7 +464,7 @@ OPTIONS: ### Multiple Values -Options that take multiple values are very flexible. Consider the following ways of specifying extensions: +Options that take multiple values are very flexible. Consider the following ways of specifying extensions: ``` ./feroxbuster -u http://127.1 -x pdf -x js,html -x php txt json,docx @@ -441,7 +472,8 @@ Options that take multiple values are very flexible. Consider the following way The command above adds .pdf, .js, .html, .php, .txt, .json, and .docx to each url -All of the methods above (multiple flags, space separated, comma separated, etc...) are valid and interchangeable. 
The same goes for urls, headers, status codes, queries, and size filters. +All of the methods above (multiple flags, space separated, comma separated, etc...) are valid and interchangeable. The +same goes for urls, headers, status codes, queries, and size filters. ### Include Headers @@ -473,18 +505,19 @@ cat targets | ./feroxbuster --stdin --quiet -s 200 301 302 --redirects -x js | f ./feroxbuster -u http://127.1 --proxy socks5://127.0.0.1:9050 ``` -### Pass auth token via query parameter +### Pass auth token via query parameter ``` ./feroxbuster -u http://127.1 --query token=0123456789ABCDEF ``` -### Extract Links from Response Body (New in `v1.1.0`) +### Extract Links from Response Body (New in `v1.1.0`) Search through the body of valid responses (html, javascript, etc...) for additional endpoints to scan. This turns -`feroxbuster` into a hybrid that looks for both linked and unlinked content. +`feroxbuster` into a hybrid that looks for both linked and unlinked content. Example request/response with `--extract-links` enabled: + - Make request to `http://example.com/index.html` - Receive, and read in, the `body` of the response - Search the `body` for absolute and relative links (i.e. `homepage/assets/img/icons/handshake.svg`) @@ -499,7 +532,8 @@ Example request/response with `--extract-links` enabled: ./feroxbuster -u http://127.1 --extract-links ``` -Here's a comparison of a wordlist-only scan vs `--extract-links` using [Feline](https://www.hackthebox.eu/home/machines/profile/274) from Hack the Box: +Here's a comparison of a wordlist-only scan vs `--extract-links` +using [Feline](https://www.hackthebox.eu/home/machines/profile/274) from Hack the Box: Wordlist only @@ -511,8 +545,8 @@ With `--extract-links` ### Limit Total Number of Concurrent Scans (new in `v1.2.0`) -Limit the number of scans permitted to run at any given time. 
Recursion will still identify new directories, but newly -discovered directories can only begin scanning when the total number of active scans drops below the value passed to +Limit the number of scans permitted to run at any given time. Recursion will still identify new directories, but newly +discovered directories can only begin scanning when the total number of active scans drops below the value passed to `--scan-limit`. ``` @@ -523,9 +557,9 @@ discovered directories can only begin scanning when the total number of active s ### Filter Response by Status Code (new in `v1.3.0`) -Version 1.3.0 included an overhaul to the filtering system which will allow for a wide array of filters to be added -with minimal effort. The first such filter is a Status Code Filter. As responses come back from the scanned server, -each one is checked against a list of known filters and either displayed or not according to which filters are set. +Version 1.3.0 included an overhaul to the filtering system which will allow for a wide array of filters to be added with +minimal effort. The first such filter is a Status Code Filter. As responses come back from the scanned server, each one +is checked against a list of known filters and either displayed or not according to which filters are set. ``` ./feroxbuster -u http://127.1 --filter-status 301 @@ -539,42 +573,52 @@ Scans can be paused and resumed by pressing the ENTER key (shown below) ### Replay Responses to a Proxy based on Status Code (new in `v1.5.0`) -The `--replay-proxy` and `--replay-codes` options were added as a way to only send a select few responses to a proxy. This is in stark contrast to `--proxy` which proxies EVERY request. +The `--replay-proxy` and `--replay-codes` options were added as a way to only send a select few responses to a proxy. +This is in stark contrast to `--proxy` which proxies EVERY request. 
-Imagine you only care about proxying responses that have either the status code `200` or `302` (or you just don't want to clutter up your Burp history). These two options will allow you to fine-tune what gets proxied and what doesn't. +Imagine you only care about proxying responses that have either the status code `200` or `302` (or you just don't want +to clutter up your Burp history). These two options will allow you to fine-tune what gets proxied and what doesn't. ``` ./feroxbuster -u http://127.1 --replay-proxy http://localhost:8080 --replay-codes 200 302 --insecure ``` -Of note: this means that for every response that matches your replay criteria, you'll end up sending the request that generated that response a second time. Depending on the target and your engagement terms (if any), it may not make sense from a traffic generated perspective. +Of note: this means that for every response that matches your replay criteria, you'll end up sending the request that +generated that response a second time. Depending on the target and your engagement terms (if any), it may not make sense +from a traffic generated perspective. ![replay-proxy-demo](img/replay-proxy-demo.gif) ### Filter Response by Word Count & Line Count (new in `v1.6.0`) -In addition to filtering on the size of a response, version 1.6.0 added the ability to filter out responses based on the number of lines and/or words contained within the response body. This change drove a change to the information displayed to the user as well. This section will detail the new information and how to make use of it with the new filters provided. +In addition to filtering on the size of a response, version 1.6.0 added the ability to filter out responses based on the +number of lines and/or words contained within the response body. This change drove a change to the information displayed +to the user as well. This section will detail the new information and how to make use of it with the new filters +provided. 
Example output: + ``` 200 10l 212w 38437c https://example-site.com/index.html ``` There are five columns of output above: + - column 1: status code - can be filtered with `-C|--filter-status` - column 2: number of lines - can be filtered with `-N|--filter-lines` - column 3: number of words - can be filtered with `-W|--filter-words` - column 4: number of bytes (overall size) - can be filtered with `-S|--filter-size` - column 5: url to discovered resource -### Filter Response Using a Regular Expression (new in `v1.8.0`) +### Filter Response Using a Regular Expression (new in `v1.8.0`) -Version 1.3.0 included an overhaul to the filtering system which will allow for a wide array of filters to be added -with minimal effort. The latest addition is a Regular Expression Filter. As responses come back from the scanned server, -the **body** of the response is checked against the filter's regular expression. If the expression is found in the -body, then that response is filtered out. +Version 1.3.0 included an overhaul to the filtering system which will allow for a wide array of filters to be added with +minimal effort. The latest addition is a Regular Expression Filter. As responses come back from the scanned server, +the **body** of the response is checked against the filter's regular expression. If the expression is found in the body, +then that response is filtered out. -**NOTE: Using regular expressions to filter large responses or many regular expressions may negatively impact performance.** +**NOTE: Using regular expressions to filter large responses or many regular expressions may negatively impact +performance.** ``` ./feroxbuster -u http://127.1 --filter-regex '[aA]ccess [dD]enied.?' --output results.txt --json @@ -582,7 +626,8 @@ body, then that response is filtered out. ### Stop and Resume Scans (`--resume-from FILE`) (new in `v1.9.0`) -Version 1.9.0 adds a few features that allow for completely stopping a scan, and resuming that same scan from a file on disk. 
+Version 1.9.0 adds a few features that allow for completely stopping a scan, and resuming that same scan from a file on +disk. A simple `Ctrl+C` during a scan will create a file that contains information about the scan that was cancelled. @@ -592,48 +637,51 @@ A simple `Ctrl+C` during a scan will create a file that contains information abo // example snippet of state file { - "scans":[ - { - "id":"057016a14769414aac9a7a62707598cb", - "url":"https://localhost.com", - "scan_type":"Directory", - "complete":true - }, - { - "id":"400b2323a16f43468a04ffcbbeba34c6", - "url":"https://localhost.com/css", - "scan_type":"Directory", - "complete":false - } - ], - "config":{ - "wordlist":"/wordlists/seclists/Discovery/Web-Content/common.txt", - "...":"..." - }, - "responses":[ - { - "type":"response", - "url":"https://localhost.com/Login", - "path":"/Login", - "wildcard":false, - "status":302, - "content_length":0, - "line_count":0, - "word_count":0, - "headers":{ - "content-length":"0", - "server":"nginx/1.16.1" - } + "scans": [ + { + "id": "057016a14769414aac9a7a62707598cb", + "url": "https://localhost.com", + "scan_type": "Directory", + "complete": true + }, + { + "id": "400b2323a16f43468a04ffcbbeba34c6", + "url": "https://localhost.com/css", + "scan_type": "Directory", + "complete": false + } + ], + "config": { + "wordlist": "/wordlists/seclists/Discovery/Web-Content/common.txt", + "...": "..." + }, + "responses": [ + { + "type": "response", + "url": "https://localhost.com/Login", + "path": "/Login", + "wildcard": false, + "status": 302, + "content_length": 0, + "line_count": 0, + "word_count": 0, + "headers": { + "content-length": "0", + "server": "nginx/1.16.1" } - ] + } + ] }, ``` -Based on the example image above, the same scan can be resumed by using `feroxbuster --resume-from ferox-http_localhost-1606947491.state`. Directories that were already complete are not rescanned, however partially complete scans are started from the beginning. 
+Based on the example image above, the same scan can be resumed by +using `feroxbuster --resume-from ferox-http_localhost-1606947491.state`. Directories that were already complete are not +rescanned, however partially complete scans are started from the beginning. ![resumed-scan](img/resumed-scan.gif) -In order to prevent state file creation when `Ctrl+C` is pressed, you can simply add the entry below to your `ferox-config.toml`. +In order to prevent state file creation when `Ctrl+C` is pressed, you can simply add the entry below to +your `ferox-config.toml`. ```toml # ferox-config.toml @@ -643,9 +691,12 @@ save_state = false ### Enforce a Time Limit on Your Scan (new in `v1.10.0`) -Version 1.10.0 adds the ability to set a maximum runtime, or time limit, on your scan. The usage is pretty simple: a number followed directly by a single character representing seconds, minutes, hours, or days. `feroxbuster` refers to this combination as a time_spec. +Version 1.10.0 adds the ability to set a maximum runtime, or time limit, on your scan. The usage is pretty simple: a +number followed directly by a single character representing seconds, minutes, hours, or days. `feroxbuster` refers to +this combination as a time_spec. Examples of possible time_specs: + - `30s` - 30 seconds - `20m` - 20 minutes - `1h` - 1 hour @@ -657,27 +708,52 @@ A valid time_spec can be passed to `--time-limit` in order to force a shutdown a ### Extract Links from robots.txt (New in `v1.10.2`) -In addition to [extracting links from the response body](#extract-links-from-response-body-new-in-v110), using -`--extract-links` makes a request to `/robots.txt` and examines all `Allow` and `Disallow` entries. Directory entries -are added to the scan queue, while file entries are requested and then reported if appropriate. 
+In addition to [extracting links from the response body](#extract-links-from-response-body-new-in-v110), using +`--extract-links` makes a request to `/robots.txt` and examines all `Allow` and `Disallow` entries. Directory entries +are added to the scan queue, while file entries are requested and then reported if appropriate. + +### Filter Response by Similarity to A Given Page (fuzzy filter) (new in `v1.11.0`) + +Version 1.11.0 adds the ability to specify an example page for filtering pages that are similar to the given example. + +For example, consider a site that attempts to redirect new users to a `/register` endpoint. The `/register` page has a +CSRF token that alters the page's response slightly with each new request (sometimes affecting overall length). This +means that a simple line/word/char filter won't be able to filter all responses. In order to filter those redirects out, +one could use a command like this: + +``` +./feroxbuster -u https://somesite.xyz --filter-similar-to https://somesite.xyz/register +``` + +`--filter-similar-to` requests the page passed to it via CLI (`https://somesite.xyz/register`), after which it hashes +the response body using the [SSDeep algorithm](https://ssdeep-project.github.io/ssdeep/index.html). All subsequent +pages are hashed and compared to the original request's hash. If the comparison of the two hashes meets a certain +percentage of similarity (currently 95%), then that request will be filtered out. + +SSDeep was selected as it does a good job of identifying near-duplicate pages once content-length reaches a certain +size, while remaining performant. Other algorithms were tested but resulted in huge performance hits (orders of +magnitude slower on requests/second). 
-### Filter Response by Similarity to A Given Page (new in `v1.11.0`) +**NOTE** +- SSDeep/`--filter-similar-to` does not do well at detecting similarity of very small responses + - The lack of accuracy with very small responses is considered a fair trade-off for not negatively impacting performance +- Using a bunch of `--filter-similar-to` values **may** negatively impact performance ## 🧐 Comparison w/ Similar Tools -There are quite a few similar tools for forced browsing/content discovery. Burp Suite Pro, Dirb, Dirbuster, etc... -However, in my opinion, there are two that set the standard: [gobuster](https://github.com/OJ/gobuster) and -[ffuf](https://github.com/ffuf/ffuf). Both are mature, feature-rich, and all-around incredible tools to use. +There are quite a few similar tools for forced browsing/content discovery. Burp Suite Pro, Dirb, Dirbuster, etc... +However, in my opinion, there are two that set the standard: [gobuster](https://github.com/OJ/gobuster) and +[ffuf](https://github.com/ffuf/ffuf). Both are mature, feature-rich, and all-around incredible tools to use. -So, why would you ever want to use feroxbuster over ffuf/gobuster? In most cases, you probably won't. ffuf in particular -can do the vast majority of things that feroxbuster can, while still offering boatloads more functionality. Here are -a few of the use-cases in which feroxbuster may be a better fit: +So, why would you ever want to use feroxbuster over ffuf/gobuster? In most cases, you probably won't. ffuf in particular +can do the vast majority of things that feroxbuster can, while still offering boatloads more functionality. 
Here are a +few of the use-cases in which feroxbuster may be a better fit: - You want a **simple** tool usage experience - You want to be able to run your content discovery as part of some crazy 12 command unix **pipeline extravaganza** - You want to scan through a **SOCKS** proxy - You want **auto-filtering** of Wildcard responses by default -- You want an integrated **link extractor** to increase discovered endpoints +- You want an integrated **link extractor/robots.txt parser** to increase discovered endpoints - You want **recursion** along with some other thing mentioned above (ffuf also does recursion) - You want a **configuration file** option for overriding built-in default values for your scans @@ -707,13 +783,14 @@ a few of the use-cases in which feroxbuster may be a better fit: | save scan's state to disk (can pick up where it left off) (`v1.9.0`) | ✔ | | | | maximum run time limit (`v1.10.0`) | ✔ | | ✔ | | use robots.txt to increase scan coverage (`v1.10.2`) | ✔ | | | +| use example page's response to fuzzily filter similar pages (`v1.11.0`) | ✔ | | | | **huge** number of other options | | | ✔ | -Of note, there's another written-in-rust content discovery tool, [rustbuster](https://github.com/phra/rustbuster). I -came across rustbuster when I was naming my tool (😢). I don't have any experience using it, but it appears to -be able to do POST requests with an HTTP body, has SOCKS support, and has an 8.3 shortname scanner (in addition to vhost -dns, directory, etc...). In short, it definitely looks interesting and may be what you're looking for as it has some -capability I haven't seen in similar tools. +Of note, there's another written-in-rust content discovery tool, [rustbuster](https://github.com/phra/rustbuster). I +came across rustbuster when I was naming my tool (😢). 
I don't have any experience using it, but it appears to be able +to do POST requests with an HTTP body, has SOCKS support, and has an 8.3 shortname scanner (in addition to vhost dns, +directory, etc...). In short, it definitely looks interesting and may be what you're looking for as it has some +capability I haven't seen in similar tools. ## 🤯 Common Problems/Issues (FAQ) @@ -723,21 +800,24 @@ Why do I get a bunch of `No file descriptors available (os error 24)` errors? --- -There are a few potential causes of this error. The simplest is that your operating system sets an open file limit that is aggressively low. Through personal testing, I've found that `4096` is a reasonable open file limit (this will vary based on your exact setup). +There are a few potential causes of this error. The simplest is that your operating system sets an open file limit that +is aggressively low. Through personal testing, I've found that `4096` is a reasonable open file limit (this will vary +based on your exact setup). -There are quite a few options to solve this particular problem, of which a handful are shown below. +There are quite a few options to solve this particular problem, of which a handful are shown below. #### Increase the Number of Open Files -We'll start by increasing the number of open files the OS allows. On my Kali install, the default was `1024`, and I know some MacOS installs use `256` 😕. +We'll start by increasing the number of open files the OS allows. On my Kali install, the default was `1024`, and I know +some MacOS installs use `256` 😕. ##### Edit `/etc/security/limits.conf` -One option to up the limit is to edit `/etc/security/limits.conf` so that it includes the two lines below. +One option to up the limit is to edit `/etc/security/limits.conf` so that it includes the two lines below. - `*` represents all users -- `hard` and `soft` indicate the hard and soft limits for the OS -- `nofile` is the number of open files option. 
+
+- `hard` and `soft` indicate the hard and soft limits for the OS
+- `nofile` is the number of open files option.

```
/etc/security/limits.conf
@@ -758,20 +838,25 @@ ulimit -n 4096

#### Additional Tweaks (may not be needed)

-If you still find yourself hitting the file limit with the above changes, there are a few additional tweaks that may help.
+If you still find yourself hitting the file limit with the above changes, there are a few additional tweaks that may
+help.

-> This section was shamelessly stolen from this [stackoverflow answer](https://stackoverflow.com/a/3923785). More information is included in that post and is recommended reading if you end up needing to use this section.
+> This section was shamelessly stolen from this [stackoverflow answer](https://stackoverflow.com/a/3923785). More information is included in that post and is recommended reading if you end up needing to use this section.

-✨ Special thanks to HTB user [@sparkla](https://www.hackthebox.eu/home/users/profile/221599) for their help with identifying these additional tweaks ✨
+✨ Special thanks to HTB user [@sparkla](https://www.hackthebox.eu/home/users/profile/221599) for their help with
+identifying these additional tweaks ✨

##### Increase the ephemeral port range, and decrease the tcp_fin_timeout.

-The ephermal port range defines the maximum number of outbound sockets a host can create from a particular I.P. address. The fin_timeout defines the minimum time these sockets will stay in TIME_WAIT state (unusable after being used once). Usual system defaults are
+The ephemeral port range defines the maximum number of outbound sockets a host can create from a particular I.P. address.
+The fin_timeout defines the minimum time these sockets will stay in TIME_WAIT state (unusable after being used once). 
+Usual system defaults are - `net.ipv4.ip_local_port_range = 32768 61000` - `net.ipv4.tcp_fin_timeout = 60` -This basically means your system cannot consistently guarantee more than `(61000 - 32768) / 60 = 470` sockets per second. +This basically means your system cannot consistently guarantee more than `(61000 - 32768) / 60 = 470` sockets per +second. ``` sudo sysctl net.ipv4.ip_local_port_range="15000 61000" @@ -780,7 +865,9 @@ sudo sysctl net.ipv4.tcp_fin_timeout=30 ##### Allow socket reuse while in a `TIME_WAIT` status -This allows fast cycling of sockets in time_wait state and re-using them. Make sure to read post [Coping with the TCP TIME-WAIT](https://vincent.bernat.ch/en/blog/2014-tcp-time-wait-state-linux) from Vincent Bernat to understand the implications. +This allows fast cycling of sockets in time_wait state and re-using them. Make sure to read +post [Coping with the TCP TIME-WAIT](https://vincent.bernat.ch/en/blog/2014-tcp-time-wait-state-linux) from Vincent +Bernat to understand the implications. ``` sudo sysctl net.ipv4.tcp_tw_reuse=1 @@ -788,30 +875,39 @@ sudo sysctl net.ipv4.tcp_tw_reuse=1 ### Progress bars print one line at a time -`feroxbuster` needs a terminal width of at least the size of what's being printed in order to do progress bar printing correctly. If your width is too small, you may see output like what's shown below. +`feroxbuster` needs a terminal width of at least the size of what's being printed in order to do progress bar printing +correctly. If your width is too small, you may see output like what's shown below. ![small-term](img/small-term.png) -If you can, simply make the terminal wider and rerun. If you're unable to make your terminal wider -consider using `-q` to suppress the progress bars. +If you can, simply make the terminal wider and rerun. If you're unable to make your terminal wider consider using `-q` +to suppress the progress bars. ### What do each of the numbers beside the URL mean? 
-Please refer to [this section](#filter-response-by-word-count--line-count--new-in-v160) where each number's meaning and how to use it to filter responses is discussed. +Please refer to [this section](#filter-response-by-word-count--line-count--new-in-v160) where each number's meaning and +how to use it to filter responses is discussed. ### Connection closed before message completed -The error in question can be boiled down to 'networking stuff'. `feroxbuster` uses [reqwest](https://docs.rs/reqwest/latest/) which uses [hyper](https://docs.rs/hyper/latest/hyper/) to make requests to the server. [This issue report](https://github.com/hyperium/hyper/issues/2136#issuecomment-589345238) to the hyper project explains what is happening (quoted below to save you a click). This isn't a bug so much as it's a target-specific tuning issue. When lowering the `-t` value, the error doesn't occur (or happens much less frequently). +The error in question can be boiled down to 'networking stuff'. `feroxbuster` +uses [reqwest](https://docs.rs/reqwest/latest/) which uses [hyper](https://docs.rs/hyper/latest/hyper/) to make requests +to the server. [This issue report](https://github.com/hyperium/hyper/issues/2136#issuecomment-589345238) to the hyper +project explains what is happening (quoted below to save you a click). This isn't a bug so much as it's a +target-specific tuning issue. When lowering the `-t` value, the error doesn't occur (or happens much less frequently). -This isn't a bug. Simply slow down the scan. A `-t` value of 50 was chosen as a sane default that's still quite fast out of the box. However, network related errors may occur when the client and/or server become over-saturated. The [Threads and Connection Limits At A High-Level](#threads-and-connection-limits-at-a-high-level) section details how to accomplish per-target tuning. +This isn't a bug. Simply slow down the scan. A `-t` value of 50 was chosen as a sane default that's still quite fast out +of the box. 
However, network related errors may occur when the client and/or server become over-saturated. +The [Threads and Connection Limits At A High-Level](#threads-and-connection-limits-at-a-high-level) section details how +to accomplish per-target tuning. > This is just due to the racy nature of networking. -> +> > hyper has a connection pool of idle connections, and it selected one to send your request. Most of the time, hyper will receive the server's FIN and drop the dead connection from its pool. But occasionally, a connection will be selected from the pool and written to at the same time the server is deciding to close the connection. Since hyper already wrote some of the request, it can't really retry it automatically on a new connection, since the server may have acted already. ### SSL Error routines:tls_process_server_certificate:certificate verify failed -In the event you see an error similar to +In the event you see an error similar to ![self-signed](img/insecure.png) @@ -821,4 +917,5 @@ error trying to connect: error:1416F086:SSL routines:tls_process_server_certific You just need to add the `-k|--insecure` flag to your command. -`feroxbuster` rejects self-signed certs and other "insecure" certificates/site configurations by default. You can choose to scan these services anyway by telling `feroxbuster` to ignore insecure server certs. +`feroxbuster` rejects self-signed certs and other "insecure" certificates/site configurations by default. You can choose +to scan these services anyway by telling `feroxbuster` to ignore insecure server certs.