diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3badf8d3..a015f466 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,7 @@ repos: hooks: - id: isort - repo: https://github.com/ambv/black - rev: 20.8b1 + rev: 22.3.0 hooks: - id: black language_version: python3.8 diff --git a/README.rst b/README.rst index ac8d6f3d..09b53cd6 100644 --- a/README.rst +++ b/README.rst @@ -18,23 +18,29 @@ scrapy-zyte-api :target: https://codecov.io/gh/scrapy-plugins/scrapy-zyte-api :alt: Coverage report + +Scrapy plugin for `Zyte API`_. + +.. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html + + Requirements ------------- +============ * Python 3.7+ * Scrapy 2.0.1+ + Installation ------------- +============ .. code-block:: pip install scrapy-zyte-api -This package requires Python 3.7+. Configuration -------------- +============= Replace the default ``http`` and ``https`` in Scrapy's `DOWNLOAD_HANDLERS `_ @@ -43,7 +49,7 @@ in the ``settings.py`` of your Scrapy project. You also need to set the ``ZYTE_API_KEY``. Lastly, make sure to `install the asyncio-based Twisted reactor -`_ +`_ in the ``settings.py`` file as well. Here's an example of the things needed inside a Scrapy project's ``settings.py`` file: @@ -60,12 +66,88 @@ Here's an example of the things needed inside a Scrapy project's ``settings.py`` TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" +The ``ZYTE_API_ENABLED`` setting, which is ``True`` by default, can be set to +``False`` to disable this plugin. + + Usage ------ +===== + +You can send requests through Zyte API in one of the following ways: + +- Send all request through Zyte API by default, letting Zyte API parameters + be chosen automatically based on your Scrapy request parameters. See + **Using transparent mode** below. + +- Send specific requests through Zyte API, setting all Zyte API parameters + manually, keeping full control of what is sent to Zyte API. See **Sending + requests with manually-defined parameters** below. + +- Send specific requests through Zyte API, letting Zyte API parameters be + chosen automatically based on your Scrapy request parameters. See **Sending + requests with automatically-mapped parameters** below. + +Zyte API response parameters are mapped into Scrapy response parameters where +possible. See **Response mapping** below for details. + +If multiple requests target the same URL with different Zyte API parameters, +pass ``dont_filter=True`` to ``Request`` to prevent the duplicate request +filter of Scrapy from dropping all but the first request. + + +Using transparent mode +---------------------- + +Set the ``ZYTE_API_TRANSPARENT_MODE`` `Scrapy setting`_ to ``True`` to handle +Scrapy requests as follows: + +- By default, requests are sent through Zyte API with automatically-mapped + parameters. See **Sending requests with automatically-mapped parameters** + below for details about automatic request parameter mapping. + + You do not need to set the ``zyte-api-automap`` request meta key to + ``True``, but you can set it to a dictionary to extend your Zyte API + request parameters. + +- Requests with the ``zyte_api`` request meta key set to a ``dict`` are sent + through Zyte API with manually-defined parameters. See **Sending requests + with manually-defined parameters** below. + +- Requests with the ``zyte_api_automap`` request meta key set to ``False`` + are *not* sent through Zyte API. + +For example: + +.. code-block:: python + + import scrapy + + + class SampleQuotesSpider(scrapy.Spider): + name = "sample_quotes" + start_urls = ["https://quotes.toscrape.com/"] + + custom_settings = { + "ZYTE_API_TRANSPARENT_MODE": True, + } + + def parse(self, response): + print(response.text) + # "…" + + +Sending requests with manually-defined parameters +------------------------------------------------- -To enable a ``scrapy.Request`` to go through Zyte API, the ``zyte_api`` key in +To send a Scrapy request through Zyte API with manually-defined parameters, +define your Zyte API parameters in the ``zyte_api`` key in `Request.meta `_ -must be present and contain a dict with Zyte API parameters: +as a ``dict``. + +The only exception is the ``url`` parameter, which should not be defined as a +Zyte API parameter. The value from ``Request.url`` is used automatically. + +For example: .. code-block:: python @@ -77,8 +159,7 @@ must be present and contain a dict with Zyte API parameters: def start_requests(self): yield scrapy.Request( - url="http://quotes.toscrape.com/", - callback=self.parse, + url="https://quotes.toscrape.com/", meta={ "zyte_api": { "browserHtml": True, @@ -87,58 +168,54 @@ must be present and contain a dict with Zyte API parameters: ) def parse(self, response): - yield {"URL": response.url, "HTML": response.body} + print(response.text) + # "…" - print(response.raw_api_response) - # { - # 'url': 'https://quotes.toscrape.com/', - # 'statusCode': 200, - # 'browserHtml': ' ... ', - # } +Note that response headers are necessary for raw response decoding. When +defining parameters manually and requesting ``httpResponseBody`` extraction, +remember to also request ``httpResponseHeaders`` extraction: -You can see the full list of parameters in the `Zyte API Specification -`_. -The ``url`` parameter is filled automatically from ``request.url``, other -parameters should be set explicitly. +.. code-block:: python -The raw Zyte API response can be accessed via the ``raw_api_response`` -attribute of the response object. + import scrapy -When you use the Zyte API parameters ``browserHtml``, ``httpResponseBody``, or -``httpResponseHeaders``, the response body and headers are set accordingly. -Note that, for Zyte API requests, the spider gets responses of -``ZyteAPIResponse`` and ``ZyteAPITextResponse`` types, -which are respectively subclasses of ``scrapy.http.Response`` -and ``scrapy.http.TextResponse``. + class SampleQuotesSpider(scrapy.Spider): + name = "sample_quotes" -If multiple requests target the same URL with different Zyte API parameters, -pass ``dont_filter=True`` to ``Request``. + def start_requests(self): + yield scrapy.Request( + url="https://quotes.toscrape.com/", + meta={ + "zyte_api": { + "httpResponseBody": True, + "httpResponseHeaders": True, + } + }, + ) -Setting default parameters --------------------------- -Often the same configuration needs to be used for all Zyte API requests. -For example, all requests may need to set the same geolocation, or -the spider only uses ``browserHtml`` requests. + def parse(self, response): + print(response.text) + # "…" -To set the default parameters for Zyte API enabled requests, you can set the -following in the ``settings.py`` file or `any other settings within Scrapy -`_: +To learn more about Zyte API parameters, see the `data extraction usage`_ and +`API reference`_ pages of the `Zyte API documentation`_. -.. code-block:: python +.. _API reference: https://docs.zyte.com/zyte-api/openapi.html +.. _data extraction usage: https://docs.zyte.com/zyte-api/usage/extract.html +.. _Zyte API documentation: https://docs.zyte.com/zyte-api/get-started.html - ZYTE_API_DEFAULT_PARAMS = { - "browserHtml": True, - "geolocation": "US", - } +Sending requests with automatically-mapped parameters +----------------------------------------------------- -``ZYTE_API_DEFAULT_PARAMS`` works if the ``zyte_api`` -key in `Request.meta `_ -is set, i.e. having ``ZYTE_API_DEFAULT_PARAMS`` doesn't make all requests -to go through Zyte API. Parameters in ``ZYTE_API_DEFAULT_PARAMS`` are merged -with parameters set via the ``zyte_api`` meta key, with the values in meta -taking priority. +To send a Scrapy request through Zyte API letting Zyte API parameters be +automatically chosen based on the parameters of that Scrapy request, set the +``zyte_api_automap`` key in +`Request.meta `_ +to ``True``. + +For example: .. code-block:: python @@ -148,94 +225,326 @@ taking priority. class SampleQuotesSpider(scrapy.Spider): name = "sample_quotes" - custom_settings = { - "ZYTE_API_DEFAULT_PARAMS": { - "geolocation": "US", # You can set any Geolocation region you want. - } - } - def start_requests(self): yield scrapy.Request( - url="http://quotes.toscrape.com/", - callback=self.parse, + url="https://quotes.toscrape.com/", meta={ "zyte_api": { - "browserHtml": True, - "javascript": True, - "echoData": {"some_value_I_could_track": 123}, + "zyte_api_automap": True, } }, ) def parse(self, response): - yield {"URL": response.url, "HTML": response.body} - - print(response.raw_api_response) - # { - # 'url': 'https://quotes.toscrape.com/', - # 'statusCode': 200, - # 'browserHtml': ' ... ', - # 'echoData': {'some_value_I_could_track': 123}, - # } - - print(response.request.meta) - # { - # 'zyte_api': { - # 'browserHtml': True, - # 'geolocation': 'US', - # 'javascript': True, - # 'echoData': {'some_value_I_could_track': 123} - # }, - # 'download_timeout': 180.0, - # 'download_slot': 'quotes.toscrape.com' - # } - -There is a shortcut, in case a request uses the same parameters as -defined in the ``ZYTE_API_DEFAULT_PARAMS`` setting, without any further -customization - the ``zyte_api`` meta key can be set to ``True`` or ``{}``: + print(response.text) + # "…" + +See also **Using transparent mode** above and **Automated request parameter +mapping** below. + + +Response mapping +---------------- + +Zyte API responses are mapped with one of the following classes: + +- ``scrapy_zyte_api.responses.ZyteAPITextResponse``, a subclass of + ``scrapy.http.TextResponse``, is used to map text responses, i.e. responses + with ``browserHtml`` or responses with both ``httpResponseBody`` and + ``httpResponseHeaders`` with a text body (e.g. plain text, HTML, JSON). + +- ``scrapy_zyte_api.responses.ZyteAPIResponse``, a subclass of + ``scrapy.http.Response``, is used to map any other response. + +Zyte API response parameters are mapped into response class attributes where +possible: + +- ``url`` becomes ``response.url``. + +- ``statusCode`` becomes ``response.status``. + +- ``httpResponseHeaders`` becomes ``response.headers``. + +- ``browserHtml`` and ``httpResponseBody`` are mapped into both + ``response.text`` (``str``) and ``response.body`` (``bytes``). + + If none of these parameters were present, e.g. if the only requested output + was ``screenshot``, ``response.text`` and ``response.body`` would be empty. + + If a future version of Zyte API supported requesting both outputs on the + same request, and both parameters were present, ``browserHtml`` would be + the one mapped into ``response.text`` and ``response.body``. + +Both response classes have a ``raw_zyte_api`` attribute that contains a +``dict`` with the complete, raw response from Zyte API, where you can find all +Zyte API response parameters, including those that are not mapped into other +response class atttributes. + +For example, for a request for ``httpResponseBody`` and +``httpResponseHeaders``, you would get: .. code-block:: python - import scrapy + def parse(self, response): + print(response.url) + # "https://quotes.toscrape.com/" + print(response.status) + # 200 + print(response.headers) + # {b"Content-Type": [b"text/html"], …} + print(response.text) + # "…" + print(response.body) + # b"…" + print(response.raw_api_response) + # { + # "url": "https://quotes.toscrape.com/", + # "statusCode": 200, + # "httpResponseBody": "PGh0bWw+4oCmPC9odG1sPg==", + # "httpResponseHeaders": […], + # } + +For a request for ``screenshot``, on the other hand, the response would look +as follows: +.. code-block:: python - class SampleQuotesSpider(scrapy.Spider): - name = "sample_quotes" + def parse(self, response): + print(response.url) + # "https://quotes.toscrape.com/" + print(response.status) + # 200 + print(response.headers) + # {} + print(response.text) + # "" + print(response.body) + # b"" + print(response.raw_api_response) + # { + # "url": "https://quotes.toscrape.com/", + # "statusCode": 200, + # "screenshot": "iVBORw0KGgoAAAANSUh…", + # } + from base64 import b64decode + print(b64decode(response.raw_api_response["screenshot"])) + # b'\x89PNG\r\n\x1a\n\x00\x00\x00\r…' + + +Automated request parameter mapping +----------------------------------- + +When you enable automated request parameter mapping, be it through transparent +mode (see **Using transparent mode** above) or for a specific request (see +**Sending requests with automatically-mapped parameters** above), Zyte API +parameters are chosen as follows by default: + +- ``httpResponseBody`` and ``httpResponseHeaders`` are set to ``True``. + + This is subject to change without prior notice in future versions of + scrapy-zyte-api, so please account for the following: + + - If you are requesting a binary resource, such as a PDF file or an + image file, set ``httpResponseBody`` to ``True`` explicitly in your + requests: + + .. code-block:: python + + Request( + url="https://toscrape.com/img/zyte.png", + meta={ + "zyte_api": { + "zyte_api_automap": {"httpResponseBody": True}, + } + }, + ) - custom_settings = { - "ZYTE_API_DEFAULT_PARAMS": { - "browserHtml": True, - } - } + In the future, we may stop setting ``httpResponseBody`` to ``True`` by + default, and instead use a different, new Zyte API parameter that only + works for non-binary responses (e.g. HMTL, JSON, plain text). - def start_requests(self): - yield scrapy.Request( - url="http://quotes.toscrape.com/", - callback=self.parse, - meta={"zyte_api": True}, + - If you need to access response headers, be it through + ``response.headers`` or through + ``response.raw_zyte_api["httpResponseHeaders"]``, set + ``httpResponseHeaders`` to ``True`` explicitly in your requests: + + .. code-block:: python + + Request( + url="https://toscrape.com/", + meta={ + "zyte_api": { + "zyte_api_automap": {"httpResponseHeaders": True}, + } + }, ) - def parse(self, response): - yield {"URL": response.url, "HTML": response.body} - - print(response.raw_api_response) - # { - # 'url': 'https://quotes.toscrape.com/', - # 'statusCode': 200, - # 'browserHtml': ' ... ', - # } - - print(response.request.meta) - # { - # 'zyte_api': { - # 'browserHtml': True, - # }, - # 'download_timeout': 180.0, - # 'download_slot': 'quotes.toscrape.com' - # } + At the moment we request response headers because some response headers + are necessary to properly decode the response body as text. In the + future, Zyte API may be able to handle this decoding automatically, so + we would stop setting ``httpResponseHeaders`` to ``True`` by default. + +- ``Request.url`` becomes ``url``, same as in requests with manually-defined + parameters. + +- If ``Request.method`` is something other than ``"GET"``, it becomes + ``httpRequestMethod``. + +- ``Request.headers`` become ``customHttpRequestHeaders``. + +- ``Request.body`` becomes ``httpRequestBody``. + +For example, the following Scrapy request: + +.. code-block:: python + + Request( + method="POST" + url="https://httpbin.org/anything", + headers={"Content-Type": "application/json"}, + body=b'{"foo": "bar"}', + ) + +Results in a request to the Zyte API data extraction endpoint with the +following parameters: + +.. code-block:: javascript + + { + "httpResponseBody": true, + "httpResponseHeaders": true, + "url": "https://httpbin.org/anything", + "httpRequestMethod": "POST", + "customHttpRequestHeaders": [{"name": "Content-Type", "value": "application/json"}], + "httpRequestBody": "eyJmb28iOiAiYmFyIn0=" + } + +You may set the ``zyte_api_automap`` key in +`Request.meta `_ +to a ``dict`` of Zyte API parameters to extend or override choices made by +automated request parameter mapping. + +Setting ``browserHtml`` or ``screenshot`` to ``True`` unsets +``httpResponseBody`` and ``httpResponseHeaders``, and makes ``Request.headers`` +become ``requestHeaders`` instead of ``customHttpRequestHeaders``. For example, +the following Scrapy request: + +.. code-block:: python + + Request( + url="https://quotes.toscrape.com", + headers={"Referer": "https://example.com/"}, + meta={"zyte_api_automap": {"browserHtml": True}}, + ) + +Results in a request to the Zyte API data extraction endpoint with the +following parameters: + +.. code-block:: javascript + + { + "browserHtml": true, + "url": "https://quotes.toscrape.com", + "requestHeaders": {"referer": "https://example.com/"}, + } + +When mapping headers, headers not supported by Zyte API are excluded from the +mapping by default. Use the following `Scrapy settings`_` to change which +headers are included or excluded from header mapping: + +.. _Scrapy settings: https://docs.scrapy.org/en/latest/topics/settings.html + +- ``ZYTE_API_SKIP_HEADERS`` determines headers that must *not* be mapped as + ``customHttpRequestHeaders``, and its default value is: + + .. code-block:: python + + ["Cookie", "User-Agent"] + +- ``ZYTE_API_BROWSER_HEADERS`` determines headers that *can* be mapped as + ``requestHeaders``. It is a ``dict``, where keys are header names and + values are the key that represents them in ``requestHeaders``. Its default + value is: + + .. code-block:: python + + {"Referer": "referer"} + +To maximize support for potential future changes in Zyte API, automated +request parameter mapping allows some parameter values and parameter +combinations that Zyte API does not currently support, and may never support: + +- ``Request.method`` becomes ``httpRequestMethod`` even for unsupported_ + ``httpRequestMethod`` values, and even if ``httpResponseBody`` is unset. + + .. _unsupported: https://docs.zyte.com/zyte-api/usage/extract.html#zyte-api-set-method + +- You can set ``customHttpRequestHeaders`` or ``requestHeaders`` to ``True`` + to force their mapping from ``Request.headers`` in scenarios where they + would not be mapped otherwise. + + Conversely, you can set ``customHttpRequestHeaders`` or ``requestHeaders`` + to ``False`` to prevent their mapping from ``Request.headers``. + +- ``Request.body`` becomes ``httpRequestBody`` even if ``httpResponseBody`` + is unset. + +- You can set ``httpResponseBody`` to ``False`` (which unsets the parameter), + and not set ``browserHtml`` or ``screenshot`` to ``True``. In this case, + ``Request.headers`` is mapped as ``requestHeaders``. + +- You can set ``httpResponseBody`` to ``True`` and also set ``browserHtml`` + or ``screenshot`` to ``True``. In this case, ``Request.headers`` is mapped + both as ``customHttpRequestHeaders`` and as ``requestHeaders``, and + ``browserHtml`` is used as the Scrapy response body. + + +Setting default parameters +========================== + +Often the same configuration needs to be used for all Zyte API requests. For +example, all requests may need to set the same geolocation, or the spider only +uses ``browserHtml`` requests. + +The following settings allow you to define Zyte API parameters to be included +in all requests: + +- ``ZYTE_API_DEFAULT_PARAMS`` is a ``dict`` of parameters to be combined with + manually-defined parameters. See **Sending requests with manually-defined + parameters** above. + + You may set the ``zyte_api`` request meta key to an empty ``dict`` to only + use default parameters for that request. + +- ``ZYTE_API_AUTOMAP_PARAMS`` is a ``dict`` of parameters to be combined with + automatically-mapped parameters. See **Sending requests with + automatically-mapped parameters** above. + +For example, if you set ``ZYTE_API_DEFAULT_PARAMS`` to +``{"geolocation": "US"}`` and ``zyte_api`` to ``{"browserHtml": True}``, +``{"url: "…", "geolocation": "US", "browserHtml": True}`` is sent to Zyte API. + +Parameters in these settings are merged with request-specific parameters, with +request-specific parameters taking precedence. + +``ZYTE_API_DEFAULT_PARAMS`` has no effect on requests that use automated +request parameter mapping, and ``ZYTE_API_AUTOMAP_PARAMS`` has no effect on +requests that use manually-defined parameters. + +When using transparent mode (see **Using transparent mode** above), be careful +of which parameters you define through ``ZYTE_API_AUTOMAP_PARAMS``. In +transparent mode, all Scrapy requests go through Zyte API, even requests that +Scrapy sends automatically, such as those for ``robots.txt`` files when +ROBOTSTXT_OBEY_ is ``True``, or those for sitemaps when using a `sitemap +spider`_. Certain parameters, like ``browserHtml`` or ``screenshot``, are not +meant to be used for every single request. + +.. _ROBOTSTXT_OBEY: https://docs.scrapy.org/en/latest/topics/settings.html#robotstxt-obey +.. _sitemap spider: https://docs.scrapy.org/en/latest/topics/spiders.html#sitemapspider + Customizing the retry policy ----------------------------- +============================ API requests are retried automatically using the default retry policy of `python-zyte-api`_. @@ -254,7 +563,7 @@ Scrapy settings must be picklable, which `retry policies are not policy objects directly to the ``ZYTE_API_RETRY_POLICY`` setting, and must use their import path string instead. -When setting a retry policy through request metadata, you can assign the +When setting a retry policy through request meta, you can assign the ``zyte_api_retry_policy`` request meta key either the retry policy object itself or its import path string. If you need your requests to be serializable, however, you may also need to use the import path string. @@ -300,7 +609,7 @@ subclass RetryFactory_ as follows: Stats ------ +===== Stats from python-zyte-api_ are exposed as Scrapy stats with the ``scrapy-zyte-api`` prefix. diff --git a/scrapy_zyte_api/_params.py b/scrapy_zyte_api/_params.py new file mode 100644 index 00000000..df1390d2 --- /dev/null +++ b/scrapy_zyte_api/_params.py @@ -0,0 +1,425 @@ +from base64 import b64decode, b64encode +from copy import copy +from logging import getLogger +from typing import Any, Dict, Mapping, Optional, Set +from warnings import warn + +from scrapy import Request +from scrapy.settings.default_settings import DEFAULT_REQUEST_HEADERS +from scrapy.settings.default_settings import USER_AGENT as DEFAULT_USER_AGENT + +logger = getLogger(__name__) + +_DEFAULT_API_PARAMS = { + "browserHtml": False, + "screenshot": False, +} + + +def _iter_headers( + *, + api_params: Dict[str, Any], + request: Request, + header_parameter: str, +): + headers = api_params.get(header_parameter) + if headers not in (None, True): + logger.warning( + f"Request {request} defines the Zyte API {header_parameter} " + f"parameter, overriding Request.headers. Use Request.headers " + f"instead." + ) + return + if not request.headers: + return + for k, v in request.headers.items(): + if not v: + continue + decoded_v = b",".join(v).decode() + lowercase_k = k.strip().lower() + yield k, lowercase_k, decoded_v + + +def _map_custom_http_request_headers( + *, + api_params: Dict[str, Any], + request: Request, + skip_headers: Set[str], +): + headers = [] + for k, lowercase_k, decoded_v in _iter_headers( + api_params=api_params, + request=request, + header_parameter="customHttpRequestHeaders", + ): + if lowercase_k in skip_headers: + if lowercase_k != b"user-agent" or decoded_v != DEFAULT_USER_AGENT: + logger.warning( + f"Request {request} defines header {k}, which " + f"cannot be mapped into the Zyte API " + f"customHttpRequestHeaders parameter." + ) + continue + headers.append({"name": k.decode(), "value": decoded_v}) + if headers: + api_params["customHttpRequestHeaders"] = headers + + +def _map_request_headers( + *, + api_params: Dict[str, Any], + request: Request, + browser_headers: Dict[str, str], +): + request_headers = {} + for k, lowercase_k, decoded_v in _iter_headers( + api_params=api_params, + request=request, + header_parameter="requestHeaders", + ): + key = browser_headers.get(lowercase_k) + if key is not None: + request_headers[key] = decoded_v + elif not ( + ( + lowercase_k == b"accept" + and decoded_v == DEFAULT_REQUEST_HEADERS["Accept"] + ) + or ( + lowercase_k == b"accept-language" + and decoded_v == DEFAULT_REQUEST_HEADERS["Accept-Language"] + ) + or (lowercase_k == b"user-agent" and decoded_v == DEFAULT_USER_AGENT) + ): + logger.warning( + f"Request {request} defines header {k}, which " + f"cannot be mapped into the Zyte API requestHeaders " + f"parameter." + ) + if request_headers: + api_params["requestHeaders"] = request_headers + + +def _set_request_headers_from_request( + *, + api_params: Dict[str, Any], + request: Request, + skip_headers: Set[str], + browser_headers: Dict[str, str], +): + """Updates *api_params*, in place, based on *request*.""" + custom_http_request_headers = api_params.get("customHttpRequestHeaders") + request_headers = api_params.get("requestHeaders") + response_body = api_params.get("httpResponseBody") + + if ( + response_body + and custom_http_request_headers is not False + or custom_http_request_headers is True + ): + _map_custom_http_request_headers( + api_params=api_params, + request=request, + skip_headers=skip_headers, + ) + elif custom_http_request_headers is False: + api_params.pop("customHttpRequestHeaders") + + if ( + ( + not response_body + or any(api_params.get(k) for k in ("browserHtml", "screenshot")) + ) + and request_headers is not False + or request_headers is True + ): + _map_request_headers( + api_params=api_params, + request=request, + browser_headers=browser_headers, + ) + elif request_headers is False: + api_params.pop("requestHeaders") + + +def _set_http_response_body_from_request( + *, + api_params: Dict[str, Any], + request: Request, +): + if not any( + api_params.get(k) for k in ("httpResponseBody", "browserHtml", "screenshot") + ): + api_params.setdefault("httpResponseBody", True) + elif api_params.get("httpResponseBody") is False: + logger.warning( + f"Request {request} unnecessarily defines the Zyte API " + f"'httpResponseBody' parameter with its default value, False. " + f"It will not be sent to the server." + ) + if api_params.get("httpResponseBody") is False: + api_params.pop("httpResponseBody") + + +def _set_http_response_headers_from_request( + *, + api_params: Dict[str, Any], + default_params: Dict[str, Any], + meta_params: Dict[str, Any], +): + if api_params.get("httpResponseBody"): + api_params.setdefault("httpResponseHeaders", True) + elif ( + api_params.get("httpResponseHeaders") is False + and not default_params.get("httpResponseHeaders") is False + ): + logger.warning( + "You do not need to set httpResponseHeaders to False if " + "neither httpResponseBody nor browserHtml are set to True. Note " + "that httpResponseBody is set to True automatically if " + "neither browserHtml nor screenshot are set to True." + ) + if api_params.get("httpResponseHeaders") is False: + api_params.pop("httpResponseHeaders") + + +def _set_http_request_method_from_request( + *, + api_params: Dict[str, Any], + request: Request, +): + method = api_params.get("httpRequestMethod") + if method: + logger.warning( + f"Request {request} uses the Zyte API httpRequestMethod " + f"parameter, overriding Request.method. Use Request.method " + f"instead." + ) + if method != request.method: + logger.warning( + f"The HTTP method of request {request} ({request.method}) " + f"does not match the Zyte API httpRequestMethod parameter " + f"({method})." + ) + elif request.method != "GET": + api_params["httpRequestMethod"] = request.method + + +def _set_http_request_body_from_request( + *, + api_params: Dict[str, Any], + request: Request, +): + body = api_params.get("httpRequestBody") + if body: + logger.warning( + f"Request {request} uses the Zyte API httpRequestBody parameter, " + f"overriding Request.body. Use Request.body instead." + ) + decoded_body = b64decode(body) + if decoded_body != request.body: + logger.warning( + f"The body of request {request} ({request.body!r}) " + f"does not match the Zyte API httpRequestBody parameter " + f"({body!r}; decoded: {decoded_body!r})." + ) + elif request.body != b"": + base64_body = b64encode(request.body).decode() + api_params["httpRequestBody"] = base64_body + + +def _unset_unneeded_api_params( + *, + api_params: Dict[str, Any], + default_params: Dict[str, Any], + request: Request, +): + for param, default_value in _DEFAULT_API_PARAMS.items(): + if api_params.get(param) != default_value: + continue + if param not in default_params or default_params.get(param) == default_value: + logger.warning( + f"Request {request} unnecessarily defines the Zyte API {param!r} " + f"parameter with its default value, {default_value!r}. It will " + f"not be sent to the server." + ) + api_params.pop(param) + + +def _update_api_params_from_request( + api_params: Dict[str, Any], + request: Request, + *, + default_params: Dict[str, Any], + meta_params: Dict[str, Any], + skip_headers: Set[str], + browser_headers: Dict[str, str], +): + _set_http_response_body_from_request(api_params=api_params, request=request) + _set_http_response_headers_from_request( + api_params=api_params, + default_params=default_params, + meta_params=meta_params, + ) + _set_http_request_method_from_request(api_params=api_params, request=request) + _set_request_headers_from_request( + api_params=api_params, + request=request, + skip_headers=skip_headers, + browser_headers=browser_headers, + ) + _set_http_request_body_from_request(api_params=api_params, request=request) + _unset_unneeded_api_params( + api_params=api_params, request=request, default_params=default_params + ) + return api_params + + +def _copy_meta_params_as_dict( + meta_params: Dict[str, Any], + *, + param: str, + request: Request, +): + if meta_params is True: + return {} + elif not isinstance(meta_params, Mapping): + raise ValueError( + f"'{param}' parameters in the request meta should be provided as " + f"a dictionary, got {type(meta_params)} instead in {request}." + ) + else: + return copy(meta_params) + + +def _merge_params( + *, + default_params: Dict[str, Any], + meta_params: Dict[str, Any], + param: str, + setting: str, + request: Request, +): + params = copy(default_params) + meta_params = copy(meta_params) + for k in list(meta_params): + if meta_params[k] is not None: + continue + meta_params.pop(k) + if k in params: + params.pop(k) + else: + logger.warning( + f"In request {request} {param!r} parameter {k} is None, " + f"which is a value reserved to unset parameters defined in " + f"the {setting} setting, but the setting does not define such " + f"a parameter." + ) + params.update(meta_params) + return params + + +def _get_raw_params( + request: Request, + *, + default_params: Dict[str, Any], +): + meta_params = request.meta.get("zyte_api", False) + if meta_params is False: + return None + + if not meta_params and meta_params != {}: + warn( + f"Setting the zyte_api request metadata key to " + f"{meta_params!r} is deprecated. Use False instead.", + DeprecationWarning, + ) + return None + + meta_params = _copy_meta_params_as_dict( + meta_params, + param="zyte_api", + request=request, + ) + + return _merge_params( + default_params=default_params, + meta_params=meta_params, + param="zyte_api", + setting="ZYTE_API_DEFAULT_PARAMS", + request=request, + ) + + +def _get_automap_params( + request: Request, + *, + default_enabled: bool, + default_params: Dict[str, Any], + skip_headers: Set[str], + browser_headers: Dict[str, str], +): + meta_params = request.meta.get("zyte_api_automap", default_enabled) + if meta_params is False: + return None + + meta_params = _copy_meta_params_as_dict( + meta_params, + param="zyte_api_automap", + request=request, + ) + + params = _merge_params( + default_params=default_params, + meta_params=meta_params, + param="zyte_api_automap", + setting="ZYTE_API_AUTOMAP_PARAMS", + request=request, + ) + + _update_api_params_from_request( + params, + request, + default_params=default_params, + meta_params=meta_params, + skip_headers=skip_headers, + browser_headers=browser_headers, + ) + + return params + + +def _get_api_params( + request: Request, + *, + default_params: Dict[str, Any], + transparent_mode: bool, + automap_params: Dict[str, Any], + skip_headers: Set[str], + browser_headers: Dict[str, str], + job_id: Optional[str], +) -> Optional[dict]: + """Returns a dictionary of API parameters that must be sent to Zyte API for + the specified request, or None if the request should not be sent through + Zyte API.""" + api_params = _get_raw_params(request, default_params=default_params) + if api_params is None: + api_params = _get_automap_params( + request, + default_enabled=transparent_mode, + default_params=automap_params, + skip_headers=skip_headers, + browser_headers=browser_headers, + ) + if api_params is None: + return None + elif request.meta.get("zyte_api_automap", False) is not False: + raise ValueError( + f"Request {request} combines manually-defined parameters and " + f"automatically-mapped parameters." + ) + + if job_id is not None: + api_params["jobId"] = job_id + + return api_params diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index bcebaa2a..0b5e044a 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -1,5 +1,5 @@ import logging -from typing import Any, Dict, Generator, Optional, Union +from typing import Generator, Optional, Union from scrapy import Spider from scrapy.core.downloader.handlers.http import HTTPDownloadHandler @@ -16,16 +16,57 @@ from zyte_api.apikey import NoApiKey from zyte_api.constants import API_URL +from ._params import _get_api_params from .responses import ZyteAPIResponse, ZyteAPITextResponse, _process_response logger = logging.getLogger(__name__) +def _load_default_params(settings, setting): + params = settings.getdict(setting) + for param in list(params): + if params[param] is not None: + continue + logger.warning( + f"Parameter {param!r} in the {setting} setting is None. Default " + f"parameters should never be None." + ) + params.pop(param) + return params + + +def _load_skip_headers(settings): + return { + header.strip().lower().encode() + for header in settings.getlist( + "ZYTE_API_SKIP_HEADERS", + ["Cookie", "User-Agent"], + ) + } + + +def _load_browser_headers(settings): + browser_headers = settings.getdict( + "ZYTE_API_BROWSER_HEADERS", + {"Referer": "referer"}, + ) + return {k.strip().lower().encode(): v for k, v in browser_headers.items()} + + +def _load_retry_policy(settings): + policy = settings.get("ZYTE_API_RETRY_POLICY") + if policy: + policy = load_object(policy) + return policy + + class ScrapyZyteAPIDownloadHandler(HTTPDownloadHandler): def __init__( self, settings: Settings, crawler: Crawler, client: AsyncClient = None ): super().__init__(settings=settings, crawler=crawler) + if not settings.getbool("ZYTE_API_ENABLED", True): + raise NotConfigured if not client: try: client = AsyncClient( @@ -51,41 +92,32 @@ def __init__( "twisted.internet.asyncioreactor.AsyncioSelectorReactor" ) self._stats = crawler.stats - self._job_id = crawler.settings.get("JOB") - self._zyte_api_default_params = settings.getdict("ZYTE_API_DEFAULT_PARAMS") self._session = create_session(connection_pool_size=self._client.n_conn) - self._retry_policy = settings.get("ZYTE_API_RETRY_POLICY") - if self._retry_policy: - self._retry_policy = load_object(self._retry_policy) + + self._automap_params = _load_default_params(settings, "ZYTE_API_AUTOMAP_PARAMS") + self._browser_headers = _load_browser_headers(settings) + self._default_params = _load_default_params(settings, "ZYTE_API_DEFAULT_PARAMS") + self._job_id = crawler.settings.get("JOB") + self._retry_policy = _load_retry_policy(settings) + self._transparent_mode = settings.getbool("ZYTE_API_TRANSPARENT_MODE", False) + self._skip_headers = _load_skip_headers(settings) def download_request(self, request: Request, spider: Spider) -> Deferred: - api_params = self._prepare_api_params(request) - if api_params: + api_params = _get_api_params( + request, + default_params=self._default_params, + transparent_mode=self._transparent_mode, + automap_params=self._automap_params, + skip_headers=self._skip_headers, + browser_headers=self._browser_headers, + job_id=self._job_id, + ) + if api_params is not None: return deferred_from_coro( self._download_request(api_params, request, spider) ) return super().download_request(request, spider) - def _prepare_api_params(self, request: Request) -> Optional[dict]: - meta_params = request.meta.get("zyte_api") - if not meta_params and meta_params != {}: - return None - - if meta_params is True: - meta_params = {} - - api_params: Dict[str, Any] = self._zyte_api_default_params or {} - try: - api_params.update(meta_params) - except (ValueError, TypeError): - logger.error( - f"'zyte_api' parameters in the request meta should be " - f"provided as dictionary, got {type(request.meta.get('zyte_api'))} " - f"instead. ({request})." - ) - raise - return api_params - def _update_stats(self): prefix = "scrapy-zyte-api" for stat in ( @@ -136,8 +168,6 @@ async def _download_request( ) -> Optional[Union[ZyteAPITextResponse, ZyteAPIResponse]]: # Define url by default api_data = {**{"url": request.url}, **api_params} - if self._job_id is not None: - api_data["jobId"] = self._job_id retrying = request.meta.get("zyte_api_retry_policy") if retrying: retrying = load_object(retrying) diff --git a/tests/__init__.py b/tests/__init__.py index 4626b36e..5a35538c 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -2,6 +2,7 @@ from os import environ from typing import Optional +from scrapy.exceptions import NotConfigured from scrapy.utils.misc import create_instance from scrapy.utils.test import get_crawler from zyte_api.aio.client import AsyncClient @@ -29,15 +30,19 @@ async def make_handler(settings: dict, api_url: Optional[str] = None): if api_url is not None: settings["ZYTE_API_URL"] = api_url crawler = get_crawler(settings_dict=settings) - handler = create_instance( - ScrapyZyteAPIDownloadHandler, - settings=None, - crawler=crawler, - ) + try: + handler = create_instance( + ScrapyZyteAPIDownloadHandler, + settings=None, + crawler=crawler, + ) + except NotConfigured: # i.e. ZYTE_API_ENABLED=False + handler = None try: yield handler finally: - await handler._close() # NOQA + if handler is not None: + await handler._close() # NOQA @contextmanager diff --git a/tests/mockserver.py b/tests/mockserver.py index e059458a..00ae7050 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -60,11 +60,7 @@ def render_POST(self, request): return json.dumps(response_data).encode() response_data["url"] = request_data["url"] - if request_data.get("jobId") is not None: - html = f"{request_data['jobId']}" - else: - html = "Hello

World!

" - + html = "Hello

World!

" if "browserHtml" in request_data: if "httpResponseBody" in request_data: request.setResponseCode(422) @@ -77,11 +73,21 @@ def render_POST(self, request): } ).encode() response_data["browserHtml"] = html - elif "httpResponseBody" in request_data: - base64_html = b64encode(html.encode()).decode() - response_data["httpResponseBody"] = base64_html - - if "httpResponseHeaders" in request_data: + if "httpResponseBody" in request_data: + headers = request_data.get("customHttpRequestHeaders", []) + for header in headers: + if header["name"].strip().lower() == "accept": + accept = header["value"] + break + else: + accept = None + if accept == "application/octet-stream": + body = b64encode(b"\x00").decode() + else: + body = b64encode(html.encode()).decode() + response_data["httpResponseBody"] = body + + if request_data.get("httpResponseHeaders") is True: response_data["httpResponseHeaders"] = [ {"name": "test_header", "value": "test_value"} ] @@ -148,10 +154,7 @@ def urljoin(self, path): async def make_handler(self, settings: dict = None): settings = settings or {} async with make_handler(settings, self.urljoin("/")) as handler: - try: - yield handler - finally: - await handler._close() # NOQA + yield handler def main(): diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index d3cf6b5e..a06db7a0 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -1,198 +1,147 @@ import sys from asyncio import iscoroutine +from copy import copy +from functools import partial +from inspect import isclass from typing import Any, Dict from unittest import mock +from unittest.mock import patch import pytest from _pytest.logging import LogCaptureFixture # NOQA from pytest_twisted import ensureDeferred from scrapy import Request, Spider -from scrapy.exceptions import NotSupported +from scrapy.exceptions import CloseSpider from scrapy.http import Response, TextResponse -from scrapy.utils.defer import deferred_from_coro +from scrapy.settings.default_settings import DEFAULT_REQUEST_HEADERS +from scrapy.settings.default_settings import USER_AGENT as DEFAULT_USER_AGENT from scrapy.utils.test import get_crawler from twisted.internet.defer import Deferred from zyte_api.aio.errors import RequestError +from scrapy_zyte_api.handler import _get_api_params + from . import DEFAULT_CLIENT_CONCURRENCY, SETTINGS from .mockserver import DelayedResource, MockServer, produce_request_response -@ensureDeferred -@pytest.mark.parametrize( - "meta", - [ - {"zyte_api": {"browserHtml": True}}, - {"zyte_api": {"browserHtml": True, "geolocation": "US"}}, - {"zyte_api": {"browserHtml": True, "geolocation": "US", "echoData": 123}}, - {"zyte_api": {"browserHtml": True, "randomParameter": None}}, - ], -) -async def test_browser_html_request(meta: Dict[str, Dict[str, Any]], mockserver): - req, resp = await produce_request_response(mockserver, meta) - assert isinstance(resp, TextResponse) - assert resp.request is req - assert resp.url == req.url - assert resp.status == 200 - assert "zyte-api" in resp.flags - assert resp.body == b"Hello

World!

" - assert resp.text == "Hello

World!

" - assert resp.css("h1 ::text").get() == "World!" - assert resp.xpath("//body/text()").getall() == ["Hello"] - - @pytest.mark.parametrize( "meta", [ - {"zyte_api": {"httpResponseBody": True}}, - {"zyte_api": {"httpResponseBody": True, "geolocation": "US"}}, { - "zyte_api": { - "httpResponseBody": True, - "geolocation": "US", - "echoData": 123, - } + "httpResponseBody": True, + "customHttpRequestHeaders": [ + {"name": "Accept", "value": "application/octet-stream"} + ], }, - {"zyte_api": {"httpResponseBody": True, "randomParameter": None}}, + pytest.param( + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "customHttpRequestHeaders": [ + {"name": "Accept", "value": "application/octet-stream"} + ], + }, + marks=pytest.mark.xfail( + reason="https://github.com/scrapy-plugins/scrapy-zyte-api/issues/47", + strict=True, + ), + ), ], ) @ensureDeferred -async def test_http_response_body_request(meta: Dict[str, Dict[str, Any]], mockserver): - req, resp = await produce_request_response(mockserver, meta) +async def test_response_binary(meta: Dict[str, Dict[str, Any]], mockserver): + """Test that binary (i.e. non-text) responses from Zyte API are + successfully mapped to a subclass of Response that is not also a subclass + of TextResponse. + + Whether response headers are retrieved or not should have no impact on the + outcome if the body is unequivocally binary. + """ + req, resp = await produce_request_response(mockserver, {"zyte_api": meta}) assert isinstance(resp, Response) + assert not isinstance(resp, TextResponse) assert resp.request is req assert resp.url == req.url assert resp.status == 200 assert "zyte-api" in resp.flags - assert resp.body == b"Hello

World!

" - - with pytest.raises(NotSupported): - assert resp.css("h1 ::text").get() == "World!" - with pytest.raises(NotSupported): - assert resp.xpath("//body/text()").getall() == ["Hello"] + assert resp.body == b"\x00" +@ensureDeferred @pytest.mark.parametrize( "meta", [ - {"zyte_api": {"httpResponseBody": True, "httpResponseHeaders": True}}, - {"zyte_api": {"browserHtml": True, "httpResponseHeaders": True}}, + {"browserHtml": True, "httpResponseHeaders": True}, + {"browserHtml": True}, + {"httpResponseBody": True, "httpResponseHeaders": True}, + pytest.param( + {"httpResponseBody": True}, + marks=pytest.mark.xfail( + reason="https://github.com/scrapy-plugins/scrapy-zyte-api/issues/47", + strict=True, + ), + ), ], ) -@ensureDeferred -async def test_http_response_headers_request( - meta: Dict[str, Dict[str, Any]], mockserver -): - req, resp = await produce_request_response(mockserver, meta) +async def test_response_html(meta: Dict[str, Dict[str, Any]], mockserver): + """Test that HTML responses from Zyte API are successfully mapped to a + subclass of TextResponse. + + Whether response headers are retrieved or not should have no impact on the + outcome if the body is unequivocally HTML. + """ + req, resp = await produce_request_response(mockserver, {"zyte_api": meta}) + assert isinstance(resp, TextResponse) assert resp.request is req assert resp.url == req.url assert resp.status == 200 assert "zyte-api" in resp.flags assert resp.body == b"Hello

World!

" - assert resp.headers == {b"Test_Header": [b"test_value"]} + assert resp.text == "Hello

World!

" + assert resp.css("h1 ::text").get() == "World!" + assert resp.xpath("//body/text()").getall() == ["Hello"] + if meta.get("httpResponseHeaders", False) is True: + assert resp.headers == {b"Test_Header": [b"test_value"]} + else: + assert not resp.headers + + +UNSET = object() @ensureDeferred -@pytest.mark.skipif(sys.version_info < (3, 8), reason="unittest.mock.AsyncMock") @pytest.mark.parametrize( - "meta,settings,expected,use_zyte_api", + "setting,enabled", [ - ({}, {}, {}, False), - ({"zyte_api": {}}, {}, {}, False), - ({"zyte_api": True}, {}, {}, False), - ({"zyte_api": False}, {}, {}, False), - ( - {}, - {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, - {"browserHtml": True, "geolocation": "CA"}, - False, - ), - ( - {"zyte_api": False}, - {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, - {}, - False, - ), - ( - {"zyte_api": None}, - {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, - {}, - False, - ), - ( - {"zyte_api": {}}, - {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, - {"browserHtml": True, "geolocation": "CA"}, - True, - ), - ( - {"zyte_api": True}, - {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, - {"browserHtml": True, "geolocation": "CA"}, - True, - ), - ( - {"zyte_api": {"javascript": True, "geolocation": "US"}}, - {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, - {"browserHtml": True, "geolocation": "US", "javascript": True}, - True, - ), + (UNSET, True), + (True, True), + (False, False), ], ) -async def test_zyte_api_request_meta( - meta: Dict[str, Dict[str, Any]], - settings: Dict[str, str], - expected: Dict[str, str], - use_zyte_api: bool, - mockserver, -): +async def test_enabled(setting, enabled, mockserver): + settings = {} + if setting is not UNSET: + settings["ZYTE_API_ENABLED"] = setting async with mockserver.make_handler(settings) as handler: - req = Request(mockserver.urljoin("/"), meta=meta) - unmocked_client = handler._client - handler._client = mock.AsyncMock(unmocked_client) - handler._client.request_raw.side_effect = unmocked_client.request_raw - await handler.download_request(req, None) - - # What we're interested in is the Request call in the API - request_call = [ - c for c in handler._client.mock_calls if "request_raw(" in str(c) - ] - - if not use_zyte_api: - assert request_call == [] - return + if enabled: + assert handler is not None + else: + assert handler is None - elif not request_call: - pytest.fail("The client's request_raw() method was not called.") - args_used = request_call[0].args[0] - args_used.pop("url") - - assert args_used == expected - - -@pytest.mark.parametrize( - "meta", - [ - {"zyte_api": {"waka": True}}, - {"zyte_api": True}, - {"zyte_api": {"browserHtml": True}}, - {"zyte_api": {}}, - {"zyte_api": None}, - {"randomParameter": True}, - {}, - None, - ], -) +@pytest.mark.parametrize("zyte_api", [True, False]) @ensureDeferred -async def test_coro_handling(meta: Dict[str, Dict[str, Any]], mockserver): +async def test_coro_handling(zyte_api: bool, mockserver): + """ScrapyZyteAPIDownloadHandler.download_request must return a deferred + both when using Zyte API and when using the regular downloader logic.""" settings = {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True}} async with mockserver.make_handler(settings) as handler: req = Request( # this should really be a URL to a website, not to the API server, # but API server URL works ok mockserver.urljoin("/"), - meta=meta, + meta={"zyte_api": zyte_api}, ) dfd = handler.download_request(req, Spider("test")) assert not iscoroutine(dfd) @@ -207,26 +156,21 @@ async def test_coro_handling(meta: Dict[str, Dict[str, Any]], mockserver): ( {"zyte_api": {"echoData": Request("http://test.com")}}, TypeError, - "Got an error when processing Zyte API request (http://example.com): " - "Object of type Request is not JSON serializable", - ), - ( - {"zyte_api": ["some", "bad", "non-dict", "value"]}, - ValueError, - "'zyte_api' parameters in the request meta should be provided as " - "dictionary, got instead. ().", - ), - ( - {"zyte_api": 1}, - TypeError, - "'zyte_api' parameters in the request meta should be provided as " - "dictionary, got instead. ().", + ( + "Got an error when processing Zyte API request " + "(http://example.com): Object of type Request is not JSON " + "serializable" + ), ), ( {"zyte_api": {"browserHtml": True, "httpResponseBody": True}}, RequestError, - "Got Zyte API error (status=422, type='/request/unprocessable') while processing URL (http://example.com): " - "Incompatible parameters were found in the request.", + ( + "Got Zyte API error (status=422, " + "type='/request/unprocessable') while processing URL " + "(http://example.com): Incompatible parameters were found in " + "the request." + ), ), ], ) @@ -239,51 +183,24 @@ async def test_exceptions( ): async with mockserver.make_handler() as handler: req = Request("http://example.com", method="POST", meta=meta) - - with pytest.raises(exception_type): # NOQA - api_params = handler._prepare_api_params(req) - await deferred_from_coro( - handler._download_request(api_params, req, Spider("test")) # NOQA - ) # NOQA + with pytest.raises(exception_type): + await handler.download_request(req, None) assert exception_text in caplog.text -@pytest.mark.parametrize( - "job_id", - ["547773/99/6"], -) -@ensureDeferred -async def test_job_id(job_id, mockserver): - settings = {"JOB": job_id} - async with mockserver.make_handler(settings) as handler: - req = Request( - "http://example.com", - method="POST", - meta={"zyte_api": {"browserHtml": True}}, - ) - api_params = handler._prepare_api_params(req) - resp = await deferred_from_coro( - handler._download_request(api_params, req, Spider("test")) # NOQA - ) - - assert resp.request is req - assert resp.url == req.url - assert resp.status == 200 - assert "zyte-api" in resp.flags - assert resp.body == f"{job_id}".encode("utf8") - - @ensureDeferred async def test_higher_concurrency(): - """Send DEFAULT_CLIENT_CONCURRENCY + 1 requests, the first and last taking - less time than the rest, and ensure that the first 2 responses are the - first and the last, verifying that a concurrency ≥ - DEFAULT_CLIENT_CONCURRENCY + 1 has been reached.""" + """Make sure that CONCURRENT_REQUESTS and CONCURRENT_REQUESTS_PER_DOMAIN + have an effect on Zyte API requests.""" + # Send DEFAULT_CLIENT_CONCURRENCY + 1 requests, the last one taking less + # time than the rest, and ensure that the first response comes from the + # last request, verifying that a concurrency ≥ DEFAULT_CLIENT_CONCURRENCY + # + 1 has been reached. concurrency = DEFAULT_CLIENT_CONCURRENCY + 1 response_indexes = [] - expected_first_indexes = {0, concurrency - 1} + expected_first_index = concurrency - 1 fast_seconds = 0.001 - slow_seconds = 0.1 + slow_seconds = 0.2 with MockServer(DelayedResource) as server: @@ -300,7 +217,7 @@ def start_requests(self): "browserHtml": True, "delay": ( fast_seconds - if index in expected_first_indexes + if index == expected_first_index else slow_seconds ), }, @@ -310,6 +227,7 @@ def start_requests(self): async def parse(self, response): response_indexes.append(response.meta["index"]) + raise CloseSpider crawler = get_crawler( TestSpider, @@ -322,6 +240,1440 @@ async def parse(self, response): ) await crawler.crawl() - assert ( - set(response_indexes[: len(expected_first_indexes)]) == expected_first_indexes + assert response_indexes[0] == expected_first_index + + +AUTOMAP_PARAMS: Dict[str, Any] = {} +BROWSER_HEADERS = {b"referer": "referer"} +DEFAULT_PARAMS: Dict[str, Any] = {} +TRANSPARENT_MODE = False +SKIP_HEADERS = {b"cookie", b"user-agent"} +JOB_ID = None +GET_API_PARAMS_KWARGS = { + "default_params": DEFAULT_PARAMS, + "transparent_mode": TRANSPARENT_MODE, + "automap_params": AUTOMAP_PARAMS, + "skip_headers": SKIP_HEADERS, + "browser_headers": BROWSER_HEADERS, + "job_id": JOB_ID, +} + + +@ensureDeferred +async def test_get_api_params_input_default(mockserver): + request = Request(url="https://example.com") + async with mockserver.make_handler() as handler: + patch_path = "scrapy_zyte_api.handler._get_api_params" + with patch(patch_path) as _get_api_params: + _get_api_params.side_effect = RuntimeError("That’s it!") + with pytest.raises(RuntimeError): + await handler.download_request(request, None) + _get_api_params.assert_called_once_with( + request, + **GET_API_PARAMS_KWARGS, + ) + + +@ensureDeferred +async def test_get_api_params_input_custom(mockserver): + request = Request(url="https://example.com") + settings = { + "JOB": "1/2/3", + "ZYTE_API_TRANSPARENT_MODE": True, + "ZYTE_API_BROWSER_HEADERS": {"B": "b"}, + "ZYTE_API_DEFAULT_PARAMS": {"a": "b"}, + "ZYTE_API_AUTOMAP_PARAMS": {"c": "d"}, + "ZYTE_API_SKIP_HEADERS": {"A"}, + } + async with mockserver.make_handler(settings) as handler: + patch_path = "scrapy_zyte_api.handler._get_api_params" + with patch(patch_path) as _get_api_params: + _get_api_params.side_effect = RuntimeError("That’s it!") + with pytest.raises(RuntimeError): + await handler.download_request(request, None) + _get_api_params.assert_called_once_with( + request, + default_params={"a": "b"}, + transparent_mode=True, + automap_params={"c": "d"}, + skip_headers={b"a"}, + browser_headers={b"b": "b"}, + job_id="1/2/3", + ) + + +@ensureDeferred +@pytest.mark.skipif(sys.version_info < (3, 8), reason="unittest.mock.AsyncMock") +@pytest.mark.parametrize( + "output,uses_zyte_api", + [ + (None, False), + ({}, True), + ({"a": "b"}, True), + ], +) +async def test_get_api_params_output_side_effects(output, uses_zyte_api, mockserver): + """If _get_api_params returns None, requests go outside Zyte API, but if it + returns a dictionary, even if empty, requests go through Zyte API.""" + request = Request(url=mockserver.urljoin("/")) + async with mockserver.make_handler() as handler: + patch_path = "scrapy_zyte_api.handler._get_api_params" + with patch(patch_path) as _get_api_params: + patch_path = "scrapy_zyte_api.handler.super" + with patch(patch_path) as super: + handler._download_request = mock.AsyncMock(side_effect=RuntimeError) + super_mock = mock.Mock() + super_mock.download_request = mock.AsyncMock(side_effect=RuntimeError) + super.return_value = super_mock + _get_api_params.return_value = output + with pytest.raises(RuntimeError): + await handler.download_request(request, None) + if uses_zyte_api: + handler._download_request.assert_called() + else: + super_mock.download_request.assert_called() + + +DEFAULT_AUTOMAP_PARAMS: Dict[str, Any] = { + "httpResponseBody": True, + "httpResponseHeaders": True, +} + + +@pytest.mark.parametrize( + "setting,meta,expected", + [ + (False, None, None), + (False, {}, None), + (False, {"a": "b"}, None), + (False, {"zyte_api": False}, None), + (False, {"zyte_api": True}, {}), + (False, {"zyte_api": {}}, {}), + (False, {"zyte_api": {"a": "b"}}, {"a": "b"}), + (False, {"zyte_api_automap": False}, None), + (False, {"zyte_api_automap": True}, DEFAULT_AUTOMAP_PARAMS), + (False, {"zyte_api_automap": {}}, DEFAULT_AUTOMAP_PARAMS), + (False, {"zyte_api_automap": {"a": "b"}}, {**DEFAULT_AUTOMAP_PARAMS, "a": "b"}), + (False, {"zyte_api": False, "zyte_api_automap": False}, None), + (False, {"zyte_api": False, "zyte_api_automap": True}, DEFAULT_AUTOMAP_PARAMS), + (False, {"zyte_api": False, "zyte_api_automap": {}}, DEFAULT_AUTOMAP_PARAMS), + ( + False, + {"zyte_api": False, "zyte_api_automap": {"a": "b"}}, + {**DEFAULT_AUTOMAP_PARAMS, "a": "b"}, + ), + (False, {"zyte_api": True, "zyte_api_automap": False}, {}), + (False, {"zyte_api": True, "zyte_api_automap": True}, ValueError), + (False, {"zyte_api": True, "zyte_api_automap": {}}, ValueError), + (False, {"zyte_api": True, "zyte_api_automap": {"a": "b"}}, ValueError), + (False, {"zyte_api": {}, "zyte_api_automap": False}, {}), + (False, {"zyte_api": {}, "zyte_api_automap": True}, ValueError), + (False, {"zyte_api": {}, "zyte_api_automap": {}}, ValueError), + (False, {"zyte_api": {}, "zyte_api_automap": {"a": "b"}}, ValueError), + (False, {"zyte_api": {"a": "b"}, "zyte_api_automap": False}, {"a": "b"}), + (False, {"zyte_api": {"a": "b"}, "zyte_api_automap": True}, ValueError), + (False, {"zyte_api": {"a": "b"}, "zyte_api_automap": {}}, ValueError), + (False, {"zyte_api": {"a": "b"}, "zyte_api_automap": {"a": "b"}}, ValueError), + (True, None, DEFAULT_AUTOMAP_PARAMS), + (True, {}, DEFAULT_AUTOMAP_PARAMS), + (True, {"a": "b"}, DEFAULT_AUTOMAP_PARAMS), + (True, {"zyte_api": False}, DEFAULT_AUTOMAP_PARAMS), + (True, {"zyte_api": True}, {}), + (True, {"zyte_api": {}}, {}), + (True, {"zyte_api": {"a": "b"}}, {"a": "b"}), + (True, {"zyte_api_automap": False}, None), + (True, {"zyte_api_automap": True}, DEFAULT_AUTOMAP_PARAMS), + (True, {"zyte_api_automap": {}}, DEFAULT_AUTOMAP_PARAMS), + (True, {"zyte_api_automap": {"a": "b"}}, {**DEFAULT_AUTOMAP_PARAMS, "a": "b"}), + (True, {"zyte_api": False, "zyte_api_automap": False}, None), + (True, {"zyte_api": False, "zyte_api_automap": True}, DEFAULT_AUTOMAP_PARAMS), + (True, {"zyte_api": False, "zyte_api_automap": {}}, DEFAULT_AUTOMAP_PARAMS), + ( + True, + {"zyte_api": False, "zyte_api_automap": {"a": "b"}}, + {**DEFAULT_AUTOMAP_PARAMS, "a": "b"}, + ), + (True, {"zyte_api": True, "zyte_api_automap": False}, {}), + (True, {"zyte_api": True, "zyte_api_automap": True}, ValueError), + (True, {"zyte_api": True, "zyte_api_automap": {}}, ValueError), + (True, {"zyte_api": True, "zyte_api_automap": {"a": "b"}}, ValueError), + (True, {"zyte_api": {}, "zyte_api_automap": False}, {}), + (True, {"zyte_api": {}, "zyte_api_automap": True}, ValueError), + (True, {"zyte_api": {}, "zyte_api_automap": {}}, ValueError), + (True, {"zyte_api": {}, "zyte_api_automap": {"a": "b"}}, ValueError), + (True, {"zyte_api": {"a": "b"}, "zyte_api_automap": False}, {"a": "b"}), + (True, {"zyte_api": {"a": "b"}, "zyte_api_automap": True}, ValueError), + (True, {"zyte_api": {"a": "b"}, "zyte_api_automap": {}}, ValueError), + (True, {"zyte_api": {"a": "b"}, "zyte_api_automap": {"a": "b"}}, ValueError), + ], +) +def test_transparent_mode_toggling(setting, meta, expected): + """Test how the value of the ``ZYTE_API_TRANSPARENT_MODE`` setting + (*setting*) in combination with request metadata (*meta*) determines what + Zyte API parameters are used (*expected*). + + Note that :func:`test_get_api_params_output_side_effects` already tests how + *expected* affects whether the request is sent through Zyte API or not, + and :func:`test_get_api_params_input_custom` tests how the + ``ZYTE_API_TRANSPARENT_MODE`` setting is mapped to the corresponding + :func:`~scrapy_zyte_api.handler._get_api_params` parameter. + """ + request = Request(url="https://example.com", meta=meta) + func = partial( + _get_api_params, + request, + **{ + **GET_API_PARAMS_KWARGS, + "transparent_mode": setting, + }, + ) + if isclass(expected): + with pytest.raises(expected): + func() + else: + assert func() == expected + + +@pytest.mark.parametrize("meta", [None, 0, "", b"", [], ()]) +def test_api_disabling_deprecated(meta): + """Test how undocumented falsy values of the ``zyte_api`` request metadata + key (*meta*) can be used to disable the use of Zyte API, but trigger a + deprecation warning asking to replace them with False.""" + request = Request(url="https://example.com") + request.meta["zyte_api"] = meta + with pytest.warns(DeprecationWarning, match=r".* Use False instead\.$"): + api_params = _get_api_params( + request, + **GET_API_PARAMS_KWARGS, + ) + assert api_params is None + + +@pytest.mark.parametrize("key", ["zyte_api", "zyte_api_automap"]) +@pytest.mark.parametrize("value", [1, ["a", "b"]]) +def test_bad_meta_type(key, value): + """Test how undocumented truthy values (*value*) for the ``zyte_api`` and + ``zyte_api_automap`` request metadata keys (*key*) trigger a + :exc:`ValueError` exception.""" + request = Request(url="https://example.com", meta={key: value}) + with pytest.raises(ValueError): + _get_api_params( + request, + **GET_API_PARAMS_KWARGS, + ) + + +@pytest.mark.parametrize("meta", ["zyte_api", "zyte_api_automap"]) +@ensureDeferred +async def test_job_id(meta, mockserver): + """Test how the value of the ``JOB`` setting is included as ``jobId`` among + the parameters sent to Zyte API, both with manually-defined parameters and + with automatically-mapped parameters. + + Note that :func:`test_get_api_params_input_custom` already tests how the + ``JOB`` setting is mapped to the corresponding + :func:`~scrapy_zyte_api.handler._get_api_params` parameter. + """ + request = Request(url="https://example.com", meta={meta: True}) + api_params = _get_api_params( + request, + **{ + **GET_API_PARAMS_KWARGS, + "job_id": "1/2/3", + }, + ) + assert api_params["jobId"] == "1/2/3" + + +@ensureDeferred +async def test_default_params_none(mockserver, caplog): + """Test how setting a value to ``None`` in the dictionary of the + ZYTE_API_DEFAULT_PARAMS and ZYTE_API_AUTOMAP_PARAMS settings causes a + warning, because that is not expected to be a valid value. + + Note that ``None`` is however a valid value for parameters defined in the + ``zyte_api`` and ``zyte_api_automap`` request metadata keys. It can be used + to unset parameters set in those settings for a specific request. + + Also note that :func:`test_get_api_params_input_custom` already tests how + the settings are mapped to the corresponding + :func:`~scrapy_zyte_api.handler._get_api_params` parameter. + """ + request = Request(url="https://example.com") + settings = { + "ZYTE_API_DEFAULT_PARAMS": {"a": None, "b": "c"}, + "ZYTE_API_AUTOMAP_PARAMS": {"d": None, "e": "f"}, + } + with caplog.at_level("WARNING"): + async with mockserver.make_handler(settings) as handler: + patch_path = "scrapy_zyte_api.handler._get_api_params" + with patch(patch_path) as _get_api_params: + _get_api_params.side_effect = RuntimeError("That’s it!") + with pytest.raises(RuntimeError): + await handler.download_request(request, None) + _get_api_params.assert_called_once_with( + request, + **{ + **GET_API_PARAMS_KWARGS, + "default_params": {"b": "c"}, + "automap_params": {"e": "f"}, + }, + ) + assert "Parameter 'a' in the ZYTE_API_DEFAULT_PARAMS setting is None" in caplog.text + assert "Parameter 'd' in the ZYTE_API_AUTOMAP_PARAMS setting is None" in caplog.text + + +@pytest.mark.parametrize( + "setting,meta,expected,warnings", + [ + ({}, {}, {}, []), + ({}, {"b": 2}, {"b": 2}, []), + ({}, {"b": None}, {}, ["does not define such a parameter"]), + ({"a": 1}, {}, {"a": 1}, []), + ({"a": 1}, {"b": 2}, {"a": 1, "b": 2}, []), + ({"a": 1}, {"b": None}, {"a": 1}, ["does not define such a parameter"]), + ({"a": 1}, {"a": 2}, {"a": 2}, []), + ({"a": 1}, {"a": None}, {}, []), + ], +) +@pytest.mark.parametrize( + "arg_key,meta_key,ignore_keys", + [ + ("default_params", "zyte_api", set()), + ( + "automap_params", + "zyte_api_automap", + {"httpResponseBody", "httpResponseHeaders"}, + ), + ], +) +def test_default_params_merging( + arg_key, meta_key, ignore_keys, setting, meta, expected, warnings, caplog +): + """Test how Zyte API parameters defined in the *arg_key* _get_api_params + parameter and those defined in the *meta_key* request metadata key are + combined. + + Request metadata takes precedence. Also, ``None`` values in request + metadata can be used to unset parameters defined in the setting. Request + metadata ``None`` values for keys that do not exist in the setting cause a + warning. + + This test also makes sure that, when `None` is used to unset a parameter, + the original request metadata key value is not modified. + """ + request = Request(url="https://example.com") + request.meta[meta_key] = meta + with caplog.at_level("WARNING"): + api_params = _get_api_params( + request, + **{ + **GET_API_PARAMS_KWARGS, + arg_key: setting, + }, + ) + for key in ignore_keys: + api_params.pop(key) + assert api_params == expected + if warnings: + for warning in warnings: + assert warning in caplog.text + else: + assert not caplog.records + + +@pytest.mark.parametrize( + "setting,meta", + [ + # append + ( + {"a": "b"}, + {"b": "c"}, + ), + # overwrite + ( + {"a": "b"}, + {"a": "c"}, + ), + # drop + ( + {"a": "b"}, + {"a": None}, + ), + ], +) +@pytest.mark.parametrize( + "arg_key,meta_key", + [ + ("default_params", "zyte_api"), + ( + "automap_params", + "zyte_api_automap", + ), + ], +) +def test_default_params_immutability(arg_key, meta_key, setting, meta): + """Make sure that the merging of Zyte API parameters from the *arg_key* + _get_api_params parameter with those from the *meta_key* request metadata + key does not affect the contents of the setting for later requests.""" + request = Request(url="https://example.com") + request.meta[meta_key] = meta + default_params = copy(setting) + _get_api_params( + request, + **{ + **GET_API_PARAMS_KWARGS, + arg_key: default_params, + }, ) + assert default_params == setting + + +def _test_automap(global_kwargs, request_kwargs, meta, expected, warnings, caplog): + request = Request(url="https://example.com", **request_kwargs) + request.meta["zyte_api_automap"] = meta + with caplog.at_level("WARNING"): + api_params = _get_api_params( + request, + **{ + **GET_API_PARAMS_KWARGS, + **global_kwargs, + "transparent_mode": True, + }, + ) + assert api_params == expected + if warnings: + for warning in warnings: + assert warning in caplog.text + else: + assert not caplog.records + + +@pytest.mark.parametrize( + "meta,expected,warnings", + [ + # If no other known main output is specified in meta, httpResponseBody + # is requested. + ({}, {"httpResponseBody": True, "httpResponseHeaders": True}, []), + ( + {"unknownMainOutput": True}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "unknownMainOutput": True, + }, + [], + ), + # httpResponseBody can be explicitly requested in meta, and should be + # in cases where a binary response is expected, since automated mapping + # may stop working for binary responses in the future. + ( + {"httpResponseBody": True}, + {"httpResponseBody": True, "httpResponseHeaders": True}, + [], + ), + # If other main outputs are specified in meta, httpRequestBody is not + # set. + ( + {"browserHtml": True}, + {"browserHtml": True}, + [], + ), + ( + {"screenshot": True}, + {"screenshot": True}, + [], + ), + ( + {"browserHtml": True, "screenshot": True}, + {"browserHtml": True, "screenshot": True}, + [], + ), + # If no known main output is specified, and httpResponseBody is + # explicitly set to False, httpResponseBody is unset and no main output + # is added. + ( + {"httpResponseBody": False}, + {}, + [], + ), + ( + {"httpResponseBody": False, "unknownMainOutput": True}, + {"unknownMainOutput": True}, + [], + ), + # We allow httpResponseBody and browserHtml to be both set to True, in + # case that becomes possible in the future. + ( + {"httpResponseBody": True, "browserHtml": True}, + { + "browserHtml": True, + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + ], +) +def test_automap_main_outputs(meta, expected, warnings, caplog): + _test_automap({}, {}, meta, expected, warnings, caplog) + + +@pytest.mark.parametrize( + "meta,expected,warnings", + [ + # Test cases where httpResponseHeaders is not specifically set to True + # or False, where it is automatically set to True if httpResponseBody + # is also True, are covered in test_automap_main_outputs. + # + # If httpResponseHeaders is set to True in a scenario where it would + # not be implicitly set to True, it is passed as such. + ( + {"httpResponseBody": False, "httpResponseHeaders": True}, + {"httpResponseHeaders": True}, + [], + ), + ( + {"browserHtml": True, "httpResponseHeaders": True}, + {"browserHtml": True, "httpResponseHeaders": True}, + [], + ), + ( + {"screenshot": True, "httpResponseHeaders": True}, + {"screenshot": True, "httpResponseHeaders": True}, + [], + ), + ( + { + "unknownMainOutput": True, + "httpResponseBody": False, + "httpResponseHeaders": True, + }, + {"unknownMainOutput": True, "httpResponseHeaders": True}, + [], + ), + # Setting httpResponseHeaders to True where it would be already True + # implicitly, i.e. where httpResponseBody is set to True implicitly or + # explicitly, is OK and should not generate any warning. It is a way + # to make code future-proof, in case in the future httpResponseHeaders + # stops being set to True by default in those scenarios. + ( + {"httpResponseHeaders": True}, + {"httpResponseBody": True, "httpResponseHeaders": True}, + [], + ), + ( + {"httpResponseBody": True, "httpResponseHeaders": True}, + {"httpResponseBody": True, "httpResponseHeaders": True}, + [], + ), + ( + { + "browserHtml": True, + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + { + "browserHtml": True, + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + {"unknownMainOutput": True, "httpResponseHeaders": True}, + { + "unknownMainOutput": True, + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + # If httpResponseHeaders is set to False, httpResponseHeaders is not + # defined, even if httpResponseBody is set to True, implicitly or + # explicitly. + ({"httpResponseHeaders": False}, {"httpResponseBody": True}, []), + ( + {"httpResponseBody": True, "httpResponseHeaders": False}, + {"httpResponseBody": True}, + [], + ), + ( + { + "httpResponseBody": True, + "browserHtml": True, + "httpResponseHeaders": False, + }, + {"browserHtml": True, "httpResponseBody": True}, + [], + ), + ( + {"unknownMainOutput": True, "httpResponseHeaders": False}, + {"unknownMainOutput": True, "httpResponseBody": True}, + [], + ), + # If httpResponseHeaders is unnecessarily set to False where + # httpResponseBody is set to False implicitly or explicitly, + # httpResponseHeaders is not defined, and a warning is + # logged. + ( + {"httpResponseBody": False, "httpResponseHeaders": False}, + {}, + ["do not need to set httpResponseHeaders to False"], + ), + ( + {"browserHtml": True, "httpResponseHeaders": False}, + {"browserHtml": True}, + ["do not need to set httpResponseHeaders to False"], + ), + ( + {"screenshot": True, "httpResponseHeaders": False}, + {"screenshot": True}, + ["do not need to set httpResponseHeaders to False"], + ), + ( + { + "unknownMainOutput": True, + "httpResponseBody": False, + "httpResponseHeaders": False, + }, + {"unknownMainOutput": True}, + ["do not need to set httpResponseHeaders to False"], + ), + ], +) +def test_automap_header_output(meta, expected, warnings, caplog): + _test_automap({}, {}, meta, expected, warnings, caplog) + + +@pytest.mark.parametrize( + "method,meta,expected,warnings", + [ + # The GET HTTP method is not mapped, since it is the default method. + ( + "GET", + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + # Other HTTP methods, regardless of whether they are supported, + # unsupported, or unknown, are mapped as httpRequestMethod, letting + # Zyte API decide whether or not they are allowed. + *( + ( + method, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "httpRequestMethod": method, + }, + [], + ) + for method in ( + "POST", + "PUT", + "DELETE", + "OPTIONS", + "TRACE", + "PATCH", + "HEAD", + "CONNECT", + "FOO", + ) + ), + # If httpRequestMethod is also specified in meta with the same value + # as Request.method, a warning is logged asking to use only + # Request.method. + *( + ( + request_method, + {"httpRequestMethod": meta_method}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "httpRequestMethod": meta_method, + }, + ["Use Request.method"], + ) + for request_method, meta_method in ( + ("GET", "GET"), + ("POST", "POST"), + ) + ), + # If httpRequestMethod is also specified in meta with a different value + # from Request.method, a warning is logged asking to use Request.meta, + # and the meta value takes precedence. + *( + ( + request_method, + {"httpRequestMethod": meta_method}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "httpRequestMethod": meta_method, + }, + [ + "Use Request.method", + "does not match the Zyte API httpRequestMethod", + ], + ) + for request_method, meta_method in ( + ("GET", "POST"), + ("PUT", "GET"), + ) + ), + # If httpResponseBody is not True, implicitly or explicitly, + # Request.method is still mapped for anything other than GET. + ( + "POST", + {"browserHtml": True}, + { + "browserHtml": True, + "httpRequestMethod": "POST", + }, + [], + ), + ( + "POST", + {"screenshot": True}, + { + "screenshot": True, + "httpRequestMethod": "POST", + }, + [], + ), + ], +) +def test_automap_method(method, meta, expected, warnings, caplog): + _test_automap({}, {"method": method}, meta, expected, warnings, caplog) + + +@pytest.mark.parametrize( + "headers,meta,expected,warnings", + [ + # If httpResponseBody is True, implicitly or explicitly, + # Request.headers are mapped as customHttpRequestHeaders. + ( + {"Referer": "a"}, + {}, + { + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + # If browserHtml or screenshot are True, Request.headers are mapped as + # requestHeaders. + ( + {"Referer": "a"}, + {"browserHtml": True}, + { + "browserHtml": True, + "requestHeaders": {"referer": "a"}, + }, + [], + ), + ( + {"Referer": "a"}, + {"screenshot": True}, + { + "requestHeaders": {"referer": "a"}, + "screenshot": True, + }, + [], + ), + # If both httpResponseBody and browserHtml (or screenshot, or both) are + # True, implicitly or explicitly, Request.headers are mapped both as + # customHttpRequestHeaders and as requestHeaders. + ( + {"Referer": "a"}, + {"browserHtml": True, "httpResponseBody": True}, + { + "browserHtml": True, + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + "httpResponseBody": True, + "httpResponseHeaders": True, + "requestHeaders": {"referer": "a"}, + }, + [], + ), + ( + {"Referer": "a"}, + {"screenshot": True, "httpResponseBody": True}, + { + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + "httpResponseBody": True, + "httpResponseHeaders": True, + "requestHeaders": {"referer": "a"}, + "screenshot": True, + }, + [], + ), + ( + {"Referer": "a"}, + {"browserHtml": True, "screenshot": True, "httpResponseBody": True}, + { + "browserHtml": True, + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + "httpResponseBody": True, + "httpResponseHeaders": True, + "requestHeaders": {"referer": "a"}, + "screenshot": True, + }, + [], + ), + # If httpResponseBody is True, implicitly or explicitly, and there is + # no other known main output parameter (browserHtml, screenshot), + # Request.headers are mapped as customHttpRequestHeaders only. + # + # While future main output parameters are likely to use requestHeaders + # instead, we cannot know if an unknown parameter is a main output + # parameter or a different type of parameter for httpRequestBody, and + # what we know for sure is that, at the time of writing, Zyte API does + # not allow requestHeaders to be combined with httpRequestBody. + ( + {"Referer": "a"}, + {"unknownMainOutput": True}, + { + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + "httpResponseBody": True, + "httpResponseHeaders": True, + "unknownMainOutput": True, + }, + [], + ), + # If no known main output is requested, implicitly or explicitly, we + # assume that some unknown main output is being requested, and we map + # Request.headers as requestHeaders because that is the most likely way + # headers will need to be mapped for a future main output. + ( + {"Referer": "a"}, + {"httpResponseBody": False}, + { + "requestHeaders": {"referer": "a"}, + }, + [], + ), + ( + {"Referer": "a"}, + {"unknownMainOutput": True, "httpResponseBody": False}, + { + "requestHeaders": {"referer": "a"}, + "unknownMainOutput": True, + }, + [], + ), + # False disables header mapping. + ( + {"Referer": "a"}, + {"customHttpRequestHeaders": False}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + {"Referer": "a"}, + {"browserHtml": True, "requestHeaders": False}, + { + "browserHtml": True, + }, + [], + ), + ( + {"Referer": "a"}, + { + "browserHtml": True, + "httpResponseBody": True, + "customHttpRequestHeaders": False, + }, + { + "browserHtml": True, + "httpResponseBody": True, + "httpResponseHeaders": True, + "requestHeaders": {"referer": "a"}, + }, + [], + ), + ( + {"Referer": "a"}, + {"browserHtml": True, "httpResponseBody": True, "requestHeaders": False}, + { + "browserHtml": True, + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + {"Referer": "a"}, + { + "browserHtml": True, + "httpResponseBody": True, + "customHttpRequestHeaders": False, + "requestHeaders": False, + }, + { + "browserHtml": True, + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + # True forces header mapping. + ( + {"Referer": "a"}, + {"requestHeaders": True}, + { + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + "httpResponseBody": True, + "httpResponseHeaders": True, + "requestHeaders": {"referer": "a"}, + }, + [], + ), + ( + {"Referer": "a"}, + {"browserHtml": True, "customHttpRequestHeaders": True}, + { + "browserHtml": True, + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + "requestHeaders": {"referer": "a"}, + }, + [], + ), + # Headers with None as value are not mapped. + ( + {"Referer": None}, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + {"Referer": None}, + {"browserHtml": True}, + { + "browserHtml": True, + }, + [], + ), + ( + {"Referer": None}, + {"browserHtml": True, "httpResponseBody": True}, + { + "browserHtml": True, + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + {"Referer": None}, + {"screenshot": True}, + { + "screenshot": True, + }, + [], + ), + ( + {"Referer": None}, + {"screenshot": True, "httpResponseBody": True}, + { + "screenshot": True, + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + {"Referer": None}, + {"unknownMainOutput": True}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "unknownMainOutput": True, + }, + [], + ), + ( + {"Referer": None}, + {"unknownMainOutput": True, "httpResponseBody": False}, + { + "unknownMainOutput": True, + }, + [], + ), + ( + {"Referer": None}, + {"httpResponseBody": False}, + {}, + [], + ), + # Warn if header parameters are used in meta, even if the values match + # request headers, and even if there are no request headers to match in + # the first place. If they do not match, meta takes precedence. + ( + {"Referer": "a"}, + { + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ] + }, + { + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["Use Request.headers instead"], + ), + ( + {"Referer": "a"}, + { + "browserHtml": True, + "requestHeaders": {"referer": "a"}, + }, + { + "browserHtml": True, + "requestHeaders": {"referer": "a"}, + }, + ["Use Request.headers instead"], + ), + ( + {"Referer": "a"}, + { + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "b"}, + ] + }, + { + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "b"}, + ], + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["Use Request.headers instead"], + ), + ( + {"Referer": "a"}, + { + "browserHtml": True, + "requestHeaders": {"referer": "b"}, + }, + { + "browserHtml": True, + "requestHeaders": {"referer": "b"}, + }, + ["Use Request.headers instead"], + ), + ( + {}, + { + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ] + }, + { + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["Use Request.headers instead"], + ), + ( + {}, + { + "browserHtml": True, + "requestHeaders": {"referer": "a"}, + }, + { + "browserHtml": True, + "requestHeaders": {"referer": "a"}, + }, + ["Use Request.headers instead"], + ), + # If httpRequestBody is True and requestHeaders is defined in meta, or + # if browserHtml is True and customHttpRequestHeaders is defined in + # meta, keep the meta parameters and do not issue a warning. There is + # no need for a warning because the request should get an error + # response from Zyte API. And if Zyte API were not to send an error + # response, that would mean the Zyte API has started supporting this + # scenario, all the more reason not to warn and let the parameters + # reach Zyte API. + ( + {}, + { + "requestHeaders": {"referer": "a"}, + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "requestHeaders": {"referer": "a"}, + }, + [], + ), + ( + {}, + { + "browserHtml": True, + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + }, + { + "browserHtml": True, + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + }, + [], + ), + # Unsupported headers not present in Scrapy requests by default are + # dropped with a warning. + # If all headers are unsupported, the header parameter is not even set. + ( + {"Cookie": "a=b"}, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + ( + {"a": "b"}, + {"browserHtml": True}, + { + "browserHtml": True, + }, + ["cannot be mapped"], + ), + # Headers with an empty string as value are not silently ignored. + ( + {"Cookie": ""}, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + ( + {"a": ""}, + {"browserHtml": True}, + { + "browserHtml": True, + }, + ["cannot be mapped"], + ), + # Unsupported headers are looked up case-insensitively. + ( + {"user-Agent": ""}, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + # The Accept and Accept-Language headers, when unsupported, are dropped + # silently if their value matches the default value of Scrapy for + # DEFAULT_REQUEST_HEADERS, or with a warning otherwise. + ( + { + k: v + for k, v in DEFAULT_REQUEST_HEADERS.items() + if k in {"Accept", "Accept-Language"} + }, + {"browserHtml": True}, + { + "browserHtml": True, + }, + [], + ), + ( + { + "Accept": "application/json", + "Accept-Language": "uk", + }, + {"browserHtml": True}, + { + "browserHtml": True, + }, + ["cannot be mapped"], + ), + # The User-Agent header, which Scrapy sets by default, is dropped + # silently if it matches the default value of the USER_AGENT setting, + # or with a warning otherwise. + ( + {"User-Agent": DEFAULT_USER_AGENT}, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + {"User-Agent": ""}, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + ( + {"User-Agent": DEFAULT_USER_AGENT}, + {"browserHtml": True}, + { + "browserHtml": True, + }, + [], + ), + ( + {"User-Agent": ""}, + {"browserHtml": True}, + { + "browserHtml": True, + }, + ["cannot be mapped"], + ), + ], +) +def test_automap_headers(headers, meta, expected, warnings, caplog): + _test_automap({}, {"headers": headers}, meta, expected, warnings, caplog) + + +@pytest.mark.parametrize( + "global_kwargs,headers,meta,expected,warnings", + [ + # You may update the ZYTE_API_SKIP_HEADERS setting to remove + # headers that the customHttpRequestHeaders parameter starts supporting + # in the future. + ( + { + "skip_headers": {b"cookie"}, + }, + { + "Cookie": "", + "User-Agent": "", + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "customHttpRequestHeaders": [ + {"name": "User-Agent", "value": ""}, + ], + }, + [ + "defines header b'Cookie', which cannot be mapped", + ], + ), + # You may update the ZYTE_API_BROWSER_HEADERS setting to extend support + # for new fields that the requestHeaders parameter may support in the + # future. + ( + { + "browser_headers": { + b"referer": "referer", + b"user-agent": "userAgent", + }, + }, + {"User-Agent": ""}, + {"browserHtml": True}, + { + "browserHtml": True, + "requestHeaders": {"userAgent": ""}, + }, + [], + ), + ], +) +def test_automap_header_settings( + global_kwargs, headers, meta, expected, warnings, caplog +): + _test_automap(global_kwargs, {"headers": headers}, meta, expected, warnings, caplog) + + +@pytest.mark.parametrize( + "body,meta,expected,warnings", + [ + # The body is copied into httpRequestBody, base64-encoded. + ( + "a", + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "httpRequestBody": "YQ==", + }, + [], + ), + # httpRequestBody defined in meta takes precedence, but it causes a + # warning. + ( + "a", + {"httpRequestBody": "Yg=="}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "httpRequestBody": "Yg==", + }, + [ + "Use Request.body instead", + "does not match the Zyte API httpRequestBody parameter", + ], + ), + # httpRequestBody defined in meta causes a warning even if it matches + # request.body. + ( + "a", + {"httpRequestBody": "YQ=="}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "httpRequestBody": "YQ==", + }, + ["Use Request.body instead"], + ), + # The body is mapped even if httpResponseBody is not used. + ( + "a", + {"browserHtml": True}, + { + "browserHtml": True, + "httpRequestBody": "YQ==", + }, + [], + ), + ( + "a", + {"screenshot": True}, + { + "httpRequestBody": "YQ==", + "screenshot": True, + }, + [], + ), + ], +) +def test_automap_body(body, meta, expected, warnings, caplog): + _test_automap({}, {"body": body}, meta, expected, warnings, caplog) + + +@pytest.mark.parametrize( + "meta,expected,warnings", + [ + # When httpResponseBody, browserHtml, screenshot, or + # httpResponseHeaders, are unnecessarily set to False, they are not + # defined in the parameters sent to Zyte API, and a warning is logged. + ( + { + "browserHtml": True, + "httpResponseBody": False, + }, + { + "browserHtml": True, + }, + ["unnecessarily defines"], + ), + ( + { + "browserHtml": False, + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["unnecessarily defines"], + ), + ( + { + "screenshot": False, + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["unnecessarily defines"], + ), + ( + { + "httpResponseHeaders": False, + "screenshot": True, + }, + { + "screenshot": True, + }, + ["do not need to set httpResponseHeaders to False"], + ), + ], +) +def test_automap_default_parameter_cleanup(meta, expected, warnings, caplog): + _test_automap({}, {}, meta, expected, warnings, caplog) + + +@pytest.mark.parametrize( + "default_params,meta,expected,warnings", + [ + ( + {"browserHtml": True}, + {"screenshot": True, "browserHtml": False}, + {"screenshot": True}, + [], + ), + ], +) +def test_default_params_automap(default_params, meta, expected, warnings, caplog): + """Warnings about unneeded parameters should not apply if those parameters + are needed to extend or override parameters set in the + ``ZYTE_API_AUTOMAP_PARAMS`` setting.""" + request = Request(url="https://example.com") + request.meta["zyte_api_automap"] = meta + with caplog.at_level("WARNING"): + api_params = _get_api_params( + request, + **{ + **GET_API_PARAMS_KWARGS, + "transparent_mode": True, + "automap_params": default_params, + }, + ) + assert api_params == expected + if warnings: + for warning in warnings: + assert warning in caplog.text + else: + assert not caplog.records diff --git a/tests/test_responses.py b/tests/test_responses.py index a0acb946..0a67e25e 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -13,6 +13,7 @@ from scrapy_zyte_api.utils import _RESPONSE_HAS_IP_ADDRESS, _RESPONSE_HAS_PROTOCOL PAGE_CONTENT = "The cake is a lie!" +PAGE_CONTENT_2 = "Ceci n’est pas une pipe" URL = "https://example.com" @@ -43,7 +44,20 @@ def raw_api_response_body(): } -EXPECTED_HEADERS = {b"Content-Type": [b"text/html"], b"Content-Length": [b"44"]} +def raw_api_response_mixed(): + return { + "url": URL, + "browserHtml": PAGE_CONTENT, + "httpResponseBody": b64encode(PAGE_CONTENT_2.encode("utf-8")), + "echoData": {"some_value": "here"}, + "httpResponseHeaders": [ + {"name": "Content-Type", "value": "text/html"}, + {"name": "Content-Length", "value": len(PAGE_CONTENT_2)}, + ], + "statusCode": 200, + } + + EXPECTED_BODY = PAGE_CONTENT.encode("utf-8") @@ -72,19 +86,24 @@ def test_init(api_response, cls): @pytest.mark.parametrize( - "api_response,cls", + "api_response,cls,content_length", [ - (raw_api_response_browser, ZyteAPITextResponse), - (raw_api_response_body, ZyteAPIResponse), + (raw_api_response_browser, ZyteAPITextResponse, 44), + (raw_api_response_body, ZyteAPIResponse, 44), + (raw_api_response_mixed, ZyteAPITextResponse, 49), ], ) -def test_text_from_api_response(api_response, cls): +def test_text_from_api_response(api_response, cls, content_length): response = cls.from_api_response(api_response()) assert response.raw_api_response == api_response() assert response.url == URL assert response.status == 200 - assert response.headers == EXPECTED_HEADERS + expected_headers = { + b"Content-Type": [b"text/html"], + b"Content-Length": [str(content_length).encode()], + } + assert response.headers == expected_headers assert response.body == EXPECTED_BODY assert response.flags == ["zyte-api"] assert response.request is None diff --git a/tox.ini b/tox.ini index 337ac6a1..b2530b78 100644 --- a/tox.ini +++ b/tox.ini @@ -23,6 +23,8 @@ deps = # https://stackoverflow.com/a/73046084 Twisted==21.7.0 + # https://github.com/scrapy/scrapy/issues/5635 + pyopenssl==22.0.0 # Earliest supported Scrapy version.