Merge pull request #163 from epi052/137-extract-robots-txt
add robots.txt extraction to increase scan coverage
epi052 authored Dec 19, 2020
2 parents b10c4ca + 4e49293 commit 7b3540e
Showing 6 changed files with 325 additions and 55 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "feroxbuster"
version = "1.10.1"
version = "1.10.2"
authors = ["Ben 'epi' Risher <[email protected]>"]
license = "MIT"
edition = "2018"
22 changes: 22 additions & 0 deletions README.md
@@ -95,6 +95,7 @@ This attack is also known as Predictable Resource Location, File Enumeration, Di
- [Progress bars print one line at a time](#progress-bars-print-one-line-at-a-time)
- [What do each of the numbers beside the URL mean?](#what-do-each-of-the-numbers-beside-the-url-mean)
- [Connection closed before message completed](#connection-closed-before-message-completed)
- [SSL Error routines:tls_process_server_certificate:certificate verify failed](#ssl-error-routinestls_process_server_certificatecertificate-verify-failed)

## 💿 Installation

@@ -651,6 +652,12 @@ A valid time_spec can be passed to `--time-limit` in order to force a shutdown a

![time-limit](img/time-limit.gif)

### Extract Links from robots.txt (New in `v1.10.2`)

In addition to [extracting links from the response body](#extract-links-from-response-body-new-in-v110), `--extract-links`
also makes a request to `/robots.txt` and examines all `Allow` and `Disallow` entries. Directory entries are added to
the scan queue, while file entries are requested and then reported if appropriate.
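
For illustration, given a hypothetical `robots.txt` like the one below, entries that resolve to directories (here `/admin/` and `/public/`) would be queued as new directory scans, while the file entry would simply be requested and reported if it exists.

```
User-agent: *
Disallow: /admin/
Disallow: /downloads/secret-notes.txt
Allow: /public/
```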

## 🧐 Comparison w/ Similar Tools

There are quite a few similar tools for forced browsing/content discovery. Burp Suite Pro, Dirb, Dirbuster, etc...
@@ -694,6 +701,7 @@ a few of the use-cases in which feroxbuster may be a better fit:
| filter out responses by regular expression (`v1.8.0`) || ||
| save scan's state to disk (can pick up where it left off) (`v1.9.0`) || | |
| maximum run time limit (`v1.10.0`) || ||
| use robots.txt to increase scan coverage (`v1.10.2`) || | |
| **huge** number of other options | | ||

Of note, there's another written-in-rust content discovery tool, [rustbuster](https://github.com/phra/rustbuster). I
@@ -795,3 +803,17 @@ This isn't a bug. Simply slow down the scan. A `-t` value of 50 was chosen as a
> This is just due to the racy nature of networking.
>
> hyper has a connection pool of idle connections, and it selected one to send your request. Most of the time, hyper will receive the server's FIN and drop the dead connection from its pool. But occasionally, a connection will be selected from the pool and written to at the same time the server is deciding to close the connection. Since hyper already wrote some of the request, it can't really retry it automatically on a new connection, since the server may have acted already.
### SSL Error routines:tls_process_server_certificate:certificate verify failed

In the event you see an error similar to

![self-signed](img/insecure.png)

```
error trying to connect: error:1416F086:SSL routines:tls_process_server_certificate:certificate verify failed:ssl/statem/statem_clnt.c:1913: (self signed certificate)
```

You just need to add the `-k|--insecure` flag to your command.

`feroxbuster` rejects self-signed certs and other "insecure" certificates/site configurations by default. You can choose to scan these services anyway by telling `feroxbuster` to ignore insecure server certs.
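
For example (the URL below is just a placeholder), the adjusted command might look like:

```
feroxbuster -u https://self-signed.example.com -k
```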
207 changes: 189 additions & 18 deletions src/extractor.rs
@@ -1,4 +1,10 @@
use crate::FeroxResponse;
use crate::{
client,
config::{Configuration, CONFIGURATION},
scanner::SCANNED_URLS,
utils::{format_url, make_request},
FeroxResponse,
};
use lazy_static::lazy_static;
use regex::Regex;
use reqwest::Url;
@@ -9,9 +15,18 @@ use std::collections::HashSet;
/// Incorporates change from this [Pull Request](https://github.com/GerbenJavado/LinkFinder/pull/66/files)
const LINKFINDER_REGEX: &str = r#"(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;| *()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-.]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')"#;

/// Regular expression to pull url paths from robots.txt
///
/// ref: https://developers.google.com/search/reference/robots_txt
const ROBOTS_TXT_REGEX: &str =
r#"(?m)^ *(Allow|Disallow): *(?P<url_path>[a-zA-Z0-9._/?#@!&'()+,;%=-]+?)$"#; // multi-line (?m)

lazy_static! {
/// `LINKFINDER_REGEX` as a regex::Regex type
static ref REGEX: Regex = Regex::new(LINKFINDER_REGEX).unwrap();
static ref LINKS_REGEX: Regex = Regex::new(LINKFINDER_REGEX).unwrap();

/// `ROBOTS_TXT_REGEX` as a regex::Regex type
static ref ROBOTS_REGEX: Regex = Regex::new(ROBOTS_TXT_REGEX).unwrap();
}

/// Iterate over a given path, return a list of every sub-path found
@@ -90,7 +105,7 @@ pub async fn get_links(response: &FeroxResponse) -> HashSet<String> {

let body = response.text();

for capture in REGEX.captures_iter(&body) {
for capture in LINKS_REGEX.captures_iter(&body) {
// remove single & double quotes from both ends of the capture
// capture[0] is the entire match, additional capture groups start at [1]
let link = capture[0].trim_matches(|c| c == '\'' || c == '"');
@@ -105,27 +120,14 @@ pub async fn get_links(response: &FeroxResponse) -> HashSet<String> {
continue;
}

for sub_path in get_sub_paths_from_path(absolute.path()) {
// take a url fragment like homepage/assets/img/icons/handshake.svg and
// incrementally add
// - homepage/assets/img/icons/
// - homepage/assets/img/
// - homepage/assets/
// - homepage/
log::debug!("Adding {} to {:?}", sub_path, links);
add_link_to_set_of_links(&sub_path, &response.url(), &mut links);
}
add_all_sub_paths(absolute.path(), &response, &mut links);
}
Err(e) => {
// this is the expected error that happens when we try to parse a url fragment
// ex: Url::parse("/login") -> Err("relative URL without a base")
// while this is technically an error, these are good results for us
if e.to_string().contains("relative URL without a base") {
for sub_path in get_sub_paths_from_path(link) {
// incrementally save all sub-paths that led to the relative url's resource
log::debug!("Adding {} to {:?}", sub_path, links);
add_link_to_set_of_links(&sub_path, &response.url(), &mut links);
}
add_all_sub_paths(link, &response, &mut links);
} else {
// unexpected error has occurred
log::error!("Could not parse given url: {}", e);
@@ -135,6 +137,152 @@ pub async fn get_links(response: &FeroxResponse) -> HashSet<String> {
}

log::trace!("exit: get_links -> {:?}", links);

links
}

/// take a url fragment like homepage/assets/img/icons/handshake.svg and
/// incrementally add
/// - homepage/assets/img/icons/
/// - homepage/assets/img/
/// - homepage/assets/
/// - homepage/
fn add_all_sub_paths(url_path: &str, response: &FeroxResponse, mut links: &mut HashSet<String>) {
log::trace!(
"enter: add_all_sub_paths({}, {}, {:?})",
url_path,
response,
links
);

for sub_path in get_sub_paths_from_path(url_path) {
log::debug!("Adding {} to {:?}", sub_path, links);
add_link_to_set_of_links(&sub_path, &response.url(), &mut links);
}

log::trace!("exit: add_all_sub_paths");
}

/// Wrapper around link extraction logic
/// currently used in two places:
/// - links from response bodies
/// - links from robots.txt responses
///
/// general steps taken:
/// - create a new Url object based on cli options/args
/// - check if the new Url has already been seen/scanned -> None
/// - make a request to the new Url ? -> Some(response) : None
pub async fn request_feroxresponse_from_new_link(url: &str) -> Option<FeroxResponse> {
log::trace!("enter: request_feroxresponse_from_new_link({})", url);

// create a url based on the given command line options, return None on error
let new_url = match format_url(
&url,
&"",
CONFIGURATION.add_slash,
&CONFIGURATION.queries,
None,
) {
Ok(url) => url,
Err(_) => {
log::trace!("exit: request_feroxresponse_from_new_link -> None");
return None;
}
};

if SCANNED_URLS.get_scan_by_url(&new_url.to_string()).is_some() {
//we've seen the url before and don't need to scan again
log::trace!("exit: request_feroxresponse_from_new_link -> None");
return None;
}

// make the request and store the response
let new_response = match make_request(&CONFIGURATION.client, &new_url).await {
Ok(resp) => resp,
Err(_) => {
log::trace!("exit: request_feroxresponse_from_new_link -> None");
return None;
}
};

let new_ferox_response = FeroxResponse::from(new_response, true).await;

log::trace!(
"exit: request_feroxresponse_from_new_link -> {:?}",
new_ferox_response
);
Some(new_ferox_response)
}

/// helper function that simply requests /robots.txt on the given url's base url
///
/// example:
/// http://localhost/api/users -> http://localhost/robots.txt
///
/// The length of the given path has no effect on what's requested; it's always
/// base url + /robots.txt
pub async fn request_robots_txt(base_url: &str, config: &Configuration) -> Option<FeroxResponse> {
log::trace!("enter: get_robots_file({})", base_url);

// more often than not, domain/robots.txt will redirect to www.domain/robots.txt or something
// similar; to account for that, create a client that will follow redirects, regardless of
// what the user specified for the scanning client. Other than redirects, it will respect
// all other user specified settings
let follow_redirects = true;

let proxy = if config.proxy.is_empty() {
None
} else {
Some(config.proxy.as_str())
};

let client = client::initialize(
config.timeout,
&config.user_agent,
follow_redirects,
config.insecure,
&config.headers,
proxy,
);

if let Ok(mut url) = Url::parse(base_url) {
url.set_path("/robots.txt"); // overwrite existing path with /robots.txt

if let Ok(response) = make_request(&client, &url).await {
let ferox_response = FeroxResponse::from(response, true).await;

log::trace!("exit: get_robots_file -> {}", ferox_response);
return Some(ferox_response);
}
}

None
}

/// Entry point to perform link extraction from robots.txt
///
/// `base_url` can have paths and subpaths, however robots.txt will be requested from the
/// root of the url
/// given the url:
/// http://localhost/stuff/things
/// this function requests:
/// http://localhost/robots.txt
pub async fn extract_robots_txt(base_url: &str, config: &Configuration) -> HashSet<String> {
log::trace!("enter: extract_robots_txt({}, CONFIGURATION)", base_url);
let mut links = HashSet::new();

if let Some(response) = request_robots_txt(&base_url, &config).await {
for capture in ROBOTS_REGEX.captures_iter(response.text.as_str()) {
if let Some(new_path) = capture.name("url_path") {
if let Ok(mut new_url) = Url::parse(base_url) {
new_url.set_path(new_path.as_str());
add_all_sub_paths(new_url.path(), &response, &mut links);
}
}
}
}

log::trace!("exit: extract_robots_txt -> {:?}", links);
links
}

@@ -266,4 +414,27 @@ mod tests {
assert_eq!(mock.hits(), 1);
Ok(())
}

#[tokio::test(core_threads = 1)]
/// test that /robots.txt is correctly requested given a base url (happy path)
async fn request_robots_txt_with_and_without_proxy() {
let srv = MockServer::start();

let mock = srv.mock(|when, then| {
when.method(GET).path("/robots.txt");
then.status(200).body("this is a test");
});

let mut config = Configuration::default();

request_robots_txt(&srv.url("/api/users/stuff/things"), &config).await;

// note: the proxy doesn't actually do anything other than hit a different code branch
// in this unit test; it would however have an effect on an integration test
config.proxy = srv.url("/ima-proxy");

request_robots_txt(&srv.url("/api/different/path"), &config).await;

assert_eq!(mock.hits(), 2);
}
}
41 changes: 37 additions & 4 deletions src/main.rs
@@ -1,11 +1,13 @@
use crossterm::event::{self, Event, KeyCode};
use feroxbuster::progress::add_bar;
use feroxbuster::{
banner,
config::{CONFIGURATION, PROGRESS_BAR, PROGRESS_PRINTER},
heuristics, logger, reporter,
extractor::{extract_robots_txt, request_feroxresponse_from_new_link},
heuristics, logger,
progress::add_bar,
reporter,
scan_manager::{self, PAUSE_SCAN},
scanner::{self, scan_url, RESPONSES, SCANNED_URLS},
scanner::{self, scan_url, send_report, RESPONSES, SCANNED_URLS},
utils::{ferox_print, get_current_depth, module_colorizer, status_colorizer},
FeroxError, FeroxResponse, FeroxResult, FeroxSerialize, SLEEP_DURATION, VERSION,
};
@@ -97,7 +99,7 @@ fn get_unique_words_from_wordlist(path: &str) -> FeroxResult<Arc<HashSet<String>

/// Determine whether it's a single url scan or urls are coming from stdin, then scan as needed
async fn scan(
targets: Vec<String>,
mut targets: Vec<String>,
tx_term: UnboundedSender<FeroxResponse>,
tx_file: UnboundedSender<FeroxResponse>,
) -> FeroxResult<()> {
@@ -142,6 +144,37 @@ async fn scan(
}
}

if CONFIGURATION.extract_links {
for target in targets.clone() {
// modifying the targets vector, so we can't have a reference to it while we borrow
// it as mutable; thus the clone
let robots_links = extract_robots_txt(&target, &CONFIGURATION).await;

for robot_link in robots_links {
// create a url based on the given command line options, continue on error
let ferox_response = match request_feroxresponse_from_new_link(&robot_link).await {
Some(resp) => resp,
None => continue,
};

if ferox_response.is_file() {
SCANNED_URLS.add_file_scan(&robot_link);
send_report(tx_term.clone(), ferox_response);
} else {
let (unknown, _) = SCANNED_URLS.add_directory_scan(&robot_link);

if !unknown {
// known directory; can skip (unlikely)
continue;
}

// unknown directory; add to targets for scanning
targets.push(robot_link);
}
}
}
}

let mut tasks = vec![];

for target in targets {