diff --git a/flathunter/abstract_crawler.py b/flathunter/abstract_crawler.py index 9e4bb90..33fafde 100644 --- a/flathunter/abstract_crawler.py +++ b/flathunter/abstract_crawler.py @@ -204,9 +204,8 @@ def resolve_awsawf(self, driver): # Intercept background network traffic via log sniffing sleep(2) - logs_raw = driver.get_log("performance") - logs = [json.loads(lr["message"])["message"] for lr in logs_raw] - + logs = [json.loads(lr["message"])["message"] for lr in driver.get_log("performance")] + def log_filter(log_): return ( # is an actual response @@ -214,31 +213,30 @@ def log_filter(log_): # and json and "json" in log_["params"]["response"]["mimeType"] ) - + for log in filter(log_filter, logs): request_id = log["params"]["requestId"] resp_url = log["params"]["response"]["url"] if "problem" in resp_url and "awswaf" in resp_url: - response = driver.execute_cdp_cmd("Network.getResponseBody", {"requestId": request_id}) + response = driver.execute_cdp_cmd( + "Network.getResponseBody", {"requestId": request_id} + ) response_json = json.loads(response["body"]) iv = response_json["state"]["iv"] context = response_json["state"]["payload"] sitekey = response_json["key"] - sitekey = re.findall( r"apiKey: \"(.*?)\"", driver.page_source)[0] - patternChallenge = r'src="([^"]*challenge\.js)"' - challenge_matches = re.findall(patternChallenge, driver.page_source) + challenge_matches = re.findall(r'src="([^"]*challenge\.js)"', driver.page_source) for match in challenge_matches: - print(f'Challenge SRC Value: {match}') + logger.debug('Challenge SRC Value: %s', match) challenge = match - patternJsApi = r'src="([^"]*jsapi\.js)"' - jsapi_matches = re.findall(patternJsApi, driver.page_source) + jsapi_matches = re.findall(r'src="([^"]*jsapi\.js)"', driver.page_source) for match in jsapi_matches: - print(f'JsApi SRC Value: {match}') + logger.debug('JsApi SRC Value: %s', match) jsapi = match try: diff --git a/flathunter/captcha/capmonster_solver.py b/flathunter/captcha/capmonster_solver.py index e393c92..b310702 100644 --- a/flathunter/captcha/capmonster_solver.py +++ b/flathunter/captcha/capmonster_solver.py @@ -17,9 +17,24 @@ class CapmonsterSolver(CaptchaSolver): """Implementation of Captcha solver for CapMonster""" - - - def solve_awswaf(self, sitekey: str, iv: str, context: str, challenge_script: str, captcha_script: str, page_url: str) -> AwsAwfResponse: + + def solve_geetest(self, geetest: str, challenge: str, page_url: str) -> GeetestResponse: + """Should be implemented in subclass""" + raise NotImplementedError("Geetest captcha solving is not implemented for CapMonster") + + def solve_recaptcha(self, google_site_key: str, page_url: str) -> RecaptchaResponse: + """Should be implemented in subclass""" + raise NotImplementedError("Recaptcha captcha solving is not implemented for Capmonster") + + def solve_awswaf( + self, + sitekey: str, + iv: str, + context: str, + challenge_script: str, + captcha_script: str, + page_url: str + ) -> AwsAwfResponse: """Solves AWS WAF Captcha""" logger.info("Trying to solve AWS WAF.") params = { @@ -49,7 +64,6 @@ def __submit_capmonster_request(self, params: Dict[str, str]) -> str: return response_json["taskId"] - @backoff.on_exception(**CaptchaSolver.backoff_options) def __retrieve_capmonster_result(self, captcha_id: str): retrieve_url = "https://api.capmonster.cloud/getTaskResult" @@ -70,4 +84,4 @@ def __retrieve_capmonster_result(self, captcha_id: str): sleep(5) continue if response_json["status"] == "ready": - return response_json["solution"]["cookies"]["aws-waf-token"] \ No newline at end of file + return response_json["solution"]["cookies"]["aws-waf-token"] diff --git a/flathunter/captcha/captcha_solver.py b/flathunter/captcha/captcha_solver.py index eb203ba..627caba 100644 --- a/flathunter/captcha/captcha_solver.py +++ b/flathunter/captcha/captcha_solver.py @@ -38,8 +38,16 @@ def __init__(self, api_key): def solve_geetest(self, geetest: str, challenge: str, page_url: str) -> GeetestResponse: """Should be implemented in subclass""" raise NotImplementedError() - - def solve_awswaf(self, sitekey: str, iv: str, context: str, page_url: str) -> AwsAwfResponse: + + def solve_awswaf( + self, + sitekey: str, + iv: str, + context: str, + challenge_script: str, + captcha_script: str, + page_url: str + ) -> AwsAwfResponse: """Should be implemented in subclass""" raise NotImplementedError() diff --git a/flathunter/captcha/imagetyperz_solver.py b/flathunter/captcha/imagetyperz_solver.py index fd5f142..8f93c03 100644 --- a/flathunter/captcha/imagetyperz_solver.py +++ b/flathunter/captcha/imagetyperz_solver.py @@ -11,6 +11,7 @@ CaptchaSolver, CaptchaUnsolvableError, GeetestResponse, + AwsAwfResponse, RecaptchaResponse, ) @@ -58,6 +59,17 @@ def solve_recaptcha(self, google_site_key: str, page_url: str) -> RecaptchaRespo ) return RecaptchaResponse(self.__retrieve_imagetyperz_result(captcha_id)) + def solve_awswaf( + self, + sitekey: str, + iv: str, + context: str, + challenge_script: str, + captcha_script: str, + page_url: str + ) -> AwsAwfResponse: + """Should be implemented at some point""" + raise NotImplementedError("AWS WAF captchas not supported for Imagetyperz") @backoff.on_exception(**CaptchaSolver.backoff_options) def __submit_imagetyperz_request(self, submit_url: str, params: Dict[str, str]) -> str: diff --git a/flathunter/captcha/twocaptcha_solver.py b/flathunter/captcha/twocaptcha_solver.py index 544ad3e..d034050 100644 --- a/flathunter/captcha/twocaptcha_solver.py +++ b/flathunter/captcha/twocaptcha_solver.py @@ -47,6 +47,17 @@ def solve_recaptcha(self, google_site_key: str, page_url: str) -> RecaptchaRespo captcha_id = self.__submit_2captcha_request(params) return RecaptchaResponse(self.__retrieve_2captcha_result(captcha_id)) + def solve_awswaf( + self, + sitekey: str, + iv: str, + context: str, + challenge_script: str, + captcha_script: str, + page_url: str + ) -> AwsAwfResponse: + """Should be implemented at some point""" + raise NotImplementedError("AWS WAF captchas not supported for 2Captcha") @backoff.on_exception(**CaptchaSolver.backoff_options) def __submit_2captcha_request(self, params: Dict[str, str]) -> str: @@ -89,4 +100,4 @@ def __retrieve_2captcha_result(self, captcha_id: str): if not retrieve_response.text.startswith("OK"): raise requests.HTTPError(response=retrieve_response) - return retrieve_response.text.split("|", 1)[1] \ No newline at end of file + return retrieve_response.text.split("|", 1)[1] diff --git a/flathunter/chrome_wrapper.py b/flathunter/chrome_wrapper.py index acdb664..a62e8a7 100644 --- a/flathunter/chrome_wrapper.py +++ b/flathunter/chrome_wrapper.py @@ -59,6 +59,8 @@ def get_chrome_driver(driver_arguments): """Configure Chrome WebDriver""" logger.info('Initializing Chrome WebDriver for crawler...') chrome_options = uc.ChromeOptions() # pylint: disable=no-member + if platform == "darwin": + chrome_options.add_argument("--headless") if driver_arguments is not None: for driver_argument in driver_arguments: chrome_options.add_argument(driver_argument) diff --git a/flathunter/config.py b/flathunter/config.py index 3c96fe0..1b1e0c6 100644 --- a/flathunter/config.py +++ b/flathunter/config.py @@ -302,7 +302,7 @@ def _get_imagetyperz_token(self): def get_twocaptcha_key(self) -> str: """API Token for 2captcha""" return self._read_yaml_path("captcha.2captcha.api_key", "") - + def get_capmonster_key(self) -> str: """API Token for Capmonster""" return self._read_yaml_path("captcha.capmonster.api_key", "") @@ -316,7 +316,7 @@ def _get_captcha_solver(self) -> Optional[CaptchaSolver]: twocaptcha_api_key = self.get_twocaptcha_key() if twocaptcha_api_key: return TwoCaptchaSolver(twocaptcha_api_key) - + capmonster_api_key = self.get_capmonster_key() if capmonster_api_key: return CapmonsterSolver(capmonster_api_key) @@ -409,7 +409,7 @@ def _get_imagetyperz_token(self): def get_twocaptcha_key(self) -> str: """Return the currently configured 2captcha API key""" return Env.FLATHUNTER_2CAPTCHA_KEY() or super().get_twocaptcha_key() # pylint: disable=no-member - + def get_capmonster_key(self) -> str: """Return the currently configured Capmonster API key""" return Env.FLATHUNTER_CAPMONSTER_KEY() or super().get_capmonster_key() diff --git a/flathunter/crawler/immobilienscout.py b/flathunter/crawler/immobilienscout.py index a7eb5c5..53086f6 100644 --- a/flathunter/crawler/immobilienscout.py +++ b/flathunter/crawler/immobilienscout.py @@ -124,7 +124,7 @@ def get_entries_from_javascript(self): logger.error( "IS24 bot detection has identified our script as a bot - we've been blocked" ) - logger.info(self.get_driver_force().page_source) + logger.debug(self.get_driver_force().page_source) return [] return self.get_entries_from_json(result_json)