From ce6f9ba7d750c6032f86ea6a3dca5c327d21c5ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 12 Aug 2022 10:39:52 +0200 Subject: [PATCH 01/81] Add ZYTE_API_ENABLED --- README.rst | 67 +++++-------- scrapy_zyte_api/handler.py | 16 ++- tests/__init__.py | 17 ++-- tests/mockserver.py | 5 +- tests/test_api_requests.py | 198 ++++++++++++++++++++++++++++++++++++- 5 files changed, 249 insertions(+), 54 deletions(-) diff --git a/README.rst b/README.rst index 36d416b9..d58284cc 100644 --- a/README.rst +++ b/README.rst @@ -116,8 +116,12 @@ and ``scrapy.http.TextResponse``. If multiple requests target the same URL with different Zyte Data API parameters, pass ``dont_filter=True`` to ``Request``. + +.. _default-params: + Setting default parameters -------------------------- + Often the same configuration needs to be used for all Zyte API requests. For example, all requests may need to set the same geolocation, or the spider only uses ``browserHtml`` requests. @@ -133,13 +137,7 @@ following in the ``settings.py`` file or `any other settings within Scrapy "geolocation": "US", } - -``ZYTE_API_DEFAULT_PARAMS`` works if the ``zyte_api`` -key in `Request.meta `_ -is set, i.e. having ``ZYTE_API_DEFAULT_PARAMS`` doesn't make all requests -to go through Zyte Data API. Parameters in ``ZYTE_API_DEFAULT_PARAMS`` are -merged with parameters set via the ``zyte_api`` meta key, with the values in -meta taking priority. +For example: .. code-block:: python @@ -191,49 +189,36 @@ meta taking priority. # 'download_slot': 'quotes.toscrape.com' # } -There is a shortcut, in case a request uses the same parameters as -defined in the ``ZYTE_API_DEFAULT_PARAMS`` setting, without any further -customization - the ``zyte_api`` meta key can be set to ``True`` or ``{}``: +``ZYTE_API_DEFAULT_PARAMS`` does not make requests automatically go through +Zyte Data API. See :ref:`enabled`. -.. 
code-block:: python +Parameters in ``ZYTE_API_DEFAULT_PARAMS`` are merged with parameters set via +the ``zyte_api`` meta key, with the values in meta taking priority. - import scrapy +.. _enabled: - class SampleQuotesSpider(scrapy.Spider): - name = "sample_quotes" +Controlling which requests go through Zyte Data API +--------------------------------------------------- - custom_settings = { - "ZYTE_API_DEFAULT_PARAMS": { - "browserHtml": True, - } - } +The ``ZYTE_API_ENABLED`` setting can be used to control whether all, none, or +some requests go through Zyte Data API. It supports the following values: - def start_requests(self): - yield scrapy.Request( - url="http://quotes.toscrape.com/", - callback=self.parse, - meta={"zyte_api": True}, - ) +- ``None`` (default): only requests where the ``zyte_api`` key in + Request.meta_ is set to ``True`` or set to a dictionary go through Zyte + Data API. - def parse(self, response): - yield {"URL": response.url, "HTML": response.body} + .. _Request.meta: https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta - print(response.raw_api_response) - # { - # 'url': 'https://quotes.toscrape.com/', - # 'statusCode': 200, - # 'browserHtml': ' ... ', - # } +- ``True``: all requests go through Zyte Data API, unless the ``zyte_api`` + key in Request.meta_ is set to ``False``. + +- ``False``: disables this plugin. + +Zyte Data API requests need parameters. You must either set those parameters in +the ``zyte_api`` metadata key of every request or :ref:`set default parameters +`. 
- print(response.request.meta) - # { - # 'zyte_api': { - # 'browserHtml': True, - # }, - # 'download_timeout': 180.0, - # 'download_slot': 'quotes.toscrape.com' - # } Customizing the retry policy ---------------------------- diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index c85d31eb..5bb29fa2 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -1,5 +1,6 @@ import logging from typing import Any, Dict, Generator, Optional, Union +from warnings import warn from scrapy import Spider from scrapy.core.downloader.handlers.http import HTTPDownloadHandler @@ -25,6 +26,11 @@ def __init__( self, settings: Settings, crawler: Crawler, client: AsyncClient = None ): super().__init__(settings=settings, crawler=crawler) + enabled = settings.get("ZYTE_API_ENABLED") + if enabled is False: + raise NotConfigured + self._enabled_by_default = enabled or False + if not client: try: client = AsyncClient( @@ -66,10 +72,16 @@ def download_request(self, request: Request, spider: Spider) -> Deferred: return super().download_request(request, spider) def _prepare_api_params(self, request: Request) -> Optional[dict]: - meta_params = request.meta.get("zyte_api") + meta_params = request.meta.get("zyte_api", self._enabled_by_default) + if meta_params is False: + return None if not meta_params and meta_params != {}: + warn( + f"Setting the zyte_api request metadata key to " + f"{meta_params!r} is deprecated. 
Use False instead.", + DeprecationWarning, + ) return None - if meta_params is True: meta_params = {} diff --git a/tests/__init__.py b/tests/__init__.py index 7d753624..7e122c1d 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -2,6 +2,7 @@ from os import environ from typing import Optional +from scrapy.exceptions import NotConfigured from scrapy.utils.misc import create_instance from scrapy.utils.test import get_crawler from twisted.internet.asyncioreactor import AsyncioSelectorReactor @@ -30,15 +31,19 @@ async def make_handler(settings: dict, api_url: Optional[str] = None): if api_url is not None: settings["ZYTE_API_URL"] = api_url crawler = get_crawler(settings_dict=settings) - handler = create_instance( - ScrapyZyteAPIDownloadHandler, - settings=None, - crawler=crawler, - ) + try: + handler = create_instance( + ScrapyZyteAPIDownloadHandler, + settings=None, + crawler=crawler, + ) + except NotConfigured: + handler = None try: yield handler finally: - await handler._close() # NOQA + if handler is not None: + await handler._close() # NOQA @contextmanager diff --git a/tests/mockserver.py b/tests/mockserver.py index 9f06d04e..45444c9f 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -146,10 +146,7 @@ def urljoin(self, path): async def make_handler(self, settings: dict = None): settings = settings or {} async with make_handler(settings, self.urljoin("/")) as handler: - try: - yield handler - finally: - await handler._close() # NOQA + yield handler def main(): diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 4916aaca..55ae8414 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -91,9 +91,11 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo @ensureDeferred @pytest.mark.skipif(sys.version_info < (3, 8), reason="unittest.mock.AsyncMock") +@pytest.mark.filterwarnings("ignore:.*None is deprecated") @pytest.mark.parametrize( "meta,settings,expected,use_zyte_api", [ + # 
Undefined ZYTE_API_ENABLED ({}, {}, {}, False), ({"zyte_api": {}}, {}, {}, False), ({"zyte_api": True}, {}, {}, False), @@ -134,6 +136,186 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo {"browserHtml": True, "geolocation": "US", "javascript": True}, True, ), + + # ZYTE_API_ENABLED=None + ({}, {"ZYTE_API_ENABLED": None}, {}, False), + ({"zyte_api": {}}, {"ZYTE_API_ENABLED": None}, {}, False), + ({"zyte_api": True}, {"ZYTE_API_ENABLED": None}, {}, False), + ({"zyte_api": False}, {"ZYTE_API_ENABLED": None}, {}, False), + ( + {}, + { + "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, + "ZYTE_API_ENABLED": None, + }, + {"browserHtml": True, "geolocation": "CA"}, + False, + ), + ( + {"zyte_api": False}, + { + "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, + "ZYTE_API_ENABLED": None, + }, + {}, + False, + ), + ( + {"zyte_api": None}, + { + "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, + "ZYTE_API_ENABLED": None, + }, + {}, + False, + ), + ( + {"zyte_api": {}}, + { + "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, + "ZYTE_API_ENABLED": None, + }, + {"browserHtml": True, "geolocation": "CA"}, + True, + ), + ( + {"zyte_api": True}, + { + "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, + "ZYTE_API_ENABLED": None, + }, + {"browserHtml": True, "geolocation": "CA"}, + True, + ), + ( + {"zyte_api": {"javascript": True, "geolocation": "US"}}, + { + "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, + "ZYTE_API_ENABLED": None, + }, + {"browserHtml": True, "geolocation": "US", "javascript": True}, + True, + ), + + # ZYTE_API_ENABLED=True + ({}, {"ZYTE_API_ENABLED": True}, {}, False), + ({"zyte_api": {}}, {"ZYTE_API_ENABLED": True}, {}, False), + ({"zyte_api": True}, {"ZYTE_API_ENABLED": True}, {}, False), + ({"zyte_api": False}, {"ZYTE_API_ENABLED": True}, {}, False), + ( + {}, + { + "ZYTE_API_DEFAULT_PARAMS": 
{"browserHtml": True, "geolocation": "CA"}, + "ZYTE_API_ENABLED": True, + }, + {"browserHtml": True, "geolocation": "CA"}, + True, + ), + ( + {"zyte_api": False}, + { + "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, + "ZYTE_API_ENABLED": True, + }, + {}, + False, + ), + ( + {"zyte_api": None}, + { + "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, + "ZYTE_API_ENABLED": True, + }, + {}, + False, + ), + ( + {"zyte_api": {}}, + { + "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, + "ZYTE_API_ENABLED": True, + }, + {"browserHtml": True, "geolocation": "CA"}, + True, + ), + ( + {"zyte_api": True}, + { + "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, + "ZYTE_API_ENABLED": True, + }, + {"browserHtml": True, "geolocation": "CA"}, + True, + ), + ( + {"zyte_api": {"javascript": True, "geolocation": "US"}}, + { + "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, + "ZYTE_API_ENABLED": True, + }, + {"browserHtml": True, "geolocation": "US", "javascript": True}, + True, + ), + + # ZYTE_API_ENABLED=False + ({}, {"ZYTE_API_ENABLED": True}, {}, False), + ({"zyte_api": {}}, {"ZYTE_API_ENABLED": False}, {}, False), + ({"zyte_api": True}, {"ZYTE_API_ENABLED": False}, {}, False), + ({"zyte_api": False}, {"ZYTE_API_ENABLED": False}, {}, False), + ( + {}, + { + "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, + "ZYTE_API_ENABLED": False, + }, + {"browserHtml": True, "geolocation": "CA"}, + False, + ), + ( + {"zyte_api": False}, + { + "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, + "ZYTE_API_ENABLED": False, + }, + {}, + False, + ), + ( + {"zyte_api": None}, + { + "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, + "ZYTE_API_ENABLED": False, + }, + {}, + False, + ), + ( + {"zyte_api": {}}, + { + "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, + "ZYTE_API_ENABLED": False, + }, + 
{"browserHtml": True, "geolocation": "CA"}, + False, + ), + ( + {"zyte_api": True}, + { + "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, + "ZYTE_API_ENABLED": False, + }, + {"browserHtml": True, "geolocation": "CA"}, + False, + ), + ( + {"zyte_api": {"javascript": True, "geolocation": "US"}}, + { + "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, + "ZYTE_API_ENABLED": False, + }, + {"browserHtml": True, "geolocation": "US", "javascript": True}, + False, + ), ], ) async def test_zyte_api_request_meta( @@ -144,10 +326,14 @@ async def test_zyte_api_request_meta( mockserver, ): async with mockserver.make_handler(settings) as handler: + if handler is None: + assert not use_zyte_api + return req = Request(mockserver.urljoin("/"), meta=meta) unmocked_client = handler._client handler._client = mock.AsyncMock(unmocked_client) handler._client.request_raw.side_effect = unmocked_client.request_raw + await handler.download_request(req, None) # What we're interested in is the Request call in the API @@ -168,6 +354,16 @@ async def test_zyte_api_request_meta( assert args_used == expected +@ensureDeferred +@pytest.mark.skipif(sys.version_info < (3, 8), reason="unittest.mock.AsyncMock") +async def test_zyte_api_request_meta_none_deprecation(mockserver): + async with mockserver.make_handler() as handler: + req = Request(mockserver.urljoin("/"), meta={"zyte_api": None}) + handler._client = mock.AsyncMock(handler._client) + with pytest.warns(DeprecationWarning, match="None is deprecated"): + await handler.download_request(req, None) + + @pytest.mark.parametrize( "meta", [ @@ -175,7 +371,7 @@ async def test_zyte_api_request_meta( {"zyte_api": True}, {"zyte_api": {"browserHtml": True}}, {"zyte_api": {}}, - {"zyte_api": None}, + {"zyte_api": False}, {"randomParameter": True}, {}, None, From 6348e814b294ecac8c12ee84f0766ebf480a59a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 12 Aug 2022 11:24:22 +0200 Subject: 
[PATCH 02/81] Split ZYTE_API_ALL off of ZYTE_API_ENABLED --- README.rst | 22 +++---- scrapy_zyte_api/handler.py | 8 +-- tests/test_api_requests.py | 116 ++++++++++--------------------------- 3 files changed, 42 insertions(+), 104 deletions(-) diff --git a/README.rst b/README.rst index d58284cc..0f9c0b76 100644 --- a/README.rst +++ b/README.rst @@ -201,23 +201,19 @@ the ``zyte_api`` meta key, with the values in meta taking priority. Controlling which requests go through Zyte Data API --------------------------------------------------- -The ``ZYTE_API_ENABLED`` setting can be used to control whether all, none, or -some requests go through Zyte Data API. It supports the following values: +By default, only requests where the ``zyte_api`` key in Request.meta_ is set to +``True`` or set to a dictionary go through Zyte Data API. -- ``None`` (default): only requests where the ``zyte_api`` key in - Request.meta_ is set to ``True`` or set to a dictionary go through Zyte - Data API. +.. _Request.meta: https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta - .. _Request.meta: https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta +Set the ``ZYTE_API_ALL`` setting to ``True`` to make all requests go through +Zyte Data API unless the ``zyte_api`` key in Request.meta_ is set to ``False``. -- ``True``: all requests go through Zyte Data API, unless the ``zyte_api`` - key in Request.meta_ is set to ``False``. +Zyte Data API requests need parameters. If you set the ``zyte_api`` key in +Request.meta_ or the ``ZYTE_API_ALL`` setting to ``True``, you must also +:ref:`set default parameters `. -- ``False``: disables this plugin. - -Zyte Data API requests need parameters. You must either set those parameters in -the ``zyte_api`` metadata key of every request or :ref:`set default parameters -`. +Set the ``ZYTE_API_ENABLED`` setting to ``False`` to disable this plugin. 
Customizing the retry policy diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 5bb29fa2..188e462a 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -26,11 +26,8 @@ def __init__( self, settings: Settings, crawler: Crawler, client: AsyncClient = None ): super().__init__(settings=settings, crawler=crawler) - enabled = settings.get("ZYTE_API_ENABLED") - if enabled is False: + if not settings.getbool("ZYTE_API_ENABLED", True): raise NotConfigured - self._enabled_by_default = enabled or False - if not client: try: client = AsyncClient( @@ -62,6 +59,7 @@ def __init__( self._zyte_api_default_params = settings.getdict("ZYTE_API_DEFAULT_PARAMS") self._session = create_session(connection_pool_size=self._client.n_conn) self._retry_policy = settings.get("ZYTE_API_RETRY_POLICY") + self._all = settings.getbool("ZYTE_API_ALL") def download_request(self, request: Request, spider: Spider) -> Deferred: api_params = self._prepare_api_params(request) @@ -72,7 +70,7 @@ def download_request(self, request: Request, spider: Spider) -> Deferred: return super().download_request(request, spider) def _prepare_api_params(self, request: Request) -> Optional[dict]: - meta_params = request.meta.get("zyte_api", self._enabled_by_default) + meta_params = request.meta.get("zyte_api", self._all) if meta_params is False: return None if not meta_params and meta_params != {}: diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 55ae8414..8d731223 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -95,7 +95,7 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo @pytest.mark.parametrize( "meta,settings,expected,use_zyte_api", [ - # Undefined ZYTE_API_ENABLED + # Default ZYTE_API_ALL ({}, {}, {}, False), ({"zyte_api": {}}, {}, {}, False), ({"zyte_api": True}, {}, {}, False), @@ -137,16 +137,16 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo True, 
), - # ZYTE_API_ENABLED=None - ({}, {"ZYTE_API_ENABLED": None}, {}, False), - ({"zyte_api": {}}, {"ZYTE_API_ENABLED": None}, {}, False), - ({"zyte_api": True}, {"ZYTE_API_ENABLED": None}, {}, False), - ({"zyte_api": False}, {"ZYTE_API_ENABLED": None}, {}, False), + # ZYTE_API_ALL=False + ({}, {"ZYTE_API_ALL": False}, {}, False), + ({"zyte_api": {}}, {"ZYTE_API_ALL": False}, {}, False), + ({"zyte_api": True}, {"ZYTE_API_ALL": False}, {}, False), + ({"zyte_api": False}, {"ZYTE_API_ALL": False}, {}, False), ( {}, { "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ENABLED": None, + "ZYTE_API_ALL": False, }, {"browserHtml": True, "geolocation": "CA"}, False, @@ -155,7 +155,7 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo {"zyte_api": False}, { "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ENABLED": None, + "ZYTE_API_ALL": False, }, {}, False, @@ -164,7 +164,7 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo {"zyte_api": None}, { "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ENABLED": None, + "ZYTE_API_ALL": False, }, {}, False, @@ -173,7 +173,7 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo {"zyte_api": {}}, { "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ENABLED": None, + "ZYTE_API_ALL": False, }, {"browserHtml": True, "geolocation": "CA"}, True, @@ -182,7 +182,7 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo {"zyte_api": True}, { "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ENABLED": None, + "ZYTE_API_ALL": False, }, {"browserHtml": True, "geolocation": "CA"}, True, @@ -191,22 +191,22 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo {"zyte_api": {"javascript": True, "geolocation": "US"}}, { 
"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ENABLED": None, + "ZYTE_API_ALL": False, }, {"browserHtml": True, "geolocation": "US", "javascript": True}, True, ), - # ZYTE_API_ENABLED=True - ({}, {"ZYTE_API_ENABLED": True}, {}, False), - ({"zyte_api": {}}, {"ZYTE_API_ENABLED": True}, {}, False), - ({"zyte_api": True}, {"ZYTE_API_ENABLED": True}, {}, False), - ({"zyte_api": False}, {"ZYTE_API_ENABLED": True}, {}, False), + # ZYTE_API_ALL=True + ({}, {"ZYTE_API_ALL": True}, {}, False), + ({"zyte_api": {}}, {"ZYTE_API_ALL": True}, {}, False), + ({"zyte_api": True}, {"ZYTE_API_ALL": True}, {}, False), + ({"zyte_api": False}, {"ZYTE_API_ALL": True}, {}, False), ( {}, { "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ENABLED": True, + "ZYTE_API_ALL": True, }, {"browserHtml": True, "geolocation": "CA"}, True, @@ -215,7 +215,7 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo {"zyte_api": False}, { "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ENABLED": True, + "ZYTE_API_ALL": True, }, {}, False, @@ -224,7 +224,7 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo {"zyte_api": None}, { "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ENABLED": True, + "ZYTE_API_ALL": True, }, {}, False, @@ -233,7 +233,7 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo {"zyte_api": {}}, { "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ENABLED": True, + "ZYTE_API_ALL": True, }, {"browserHtml": True, "geolocation": "CA"}, True, @@ -242,7 +242,7 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo {"zyte_api": True}, { "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ENABLED": True, + "ZYTE_API_ALL": True, }, {"browserHtml": True, "geolocation": "CA"}, 
True, @@ -251,71 +251,11 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo {"zyte_api": {"javascript": True, "geolocation": "US"}}, { "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ENABLED": True, + "ZYTE_API_ALL": True, }, {"browserHtml": True, "geolocation": "US", "javascript": True}, True, ), - - # ZYTE_API_ENABLED=False - ({}, {"ZYTE_API_ENABLED": True}, {}, False), - ({"zyte_api": {}}, {"ZYTE_API_ENABLED": False}, {}, False), - ({"zyte_api": True}, {"ZYTE_API_ENABLED": False}, {}, False), - ({"zyte_api": False}, {"ZYTE_API_ENABLED": False}, {}, False), - ( - {}, - { - "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ENABLED": False, - }, - {"browserHtml": True, "geolocation": "CA"}, - False, - ), - ( - {"zyte_api": False}, - { - "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ENABLED": False, - }, - {}, - False, - ), - ( - {"zyte_api": None}, - { - "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ENABLED": False, - }, - {}, - False, - ), - ( - {"zyte_api": {}}, - { - "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ENABLED": False, - }, - {"browserHtml": True, "geolocation": "CA"}, - False, - ), - ( - {"zyte_api": True}, - { - "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ENABLED": False, - }, - {"browserHtml": True, "geolocation": "CA"}, - False, - ), - ( - {"zyte_api": {"javascript": True, "geolocation": "US"}}, - { - "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ENABLED": False, - }, - {"browserHtml": True, "geolocation": "US", "javascript": True}, - False, - ), ], ) async def test_zyte_api_request_meta( @@ -326,9 +266,6 @@ async def test_zyte_api_request_meta( mockserver, ): async with mockserver.make_handler(settings) as handler: - if handler is None: - assert not use_zyte_api - 
return req = Request(mockserver.urljoin("/"), meta=meta) unmocked_client = handler._client handler._client = mock.AsyncMock(unmocked_client) @@ -354,6 +291,13 @@ async def test_zyte_api_request_meta( assert args_used == expected +@ensureDeferred +async def test_disable(mockserver): + settings = {"ZYTE_API_ENABLED": False} + async with mockserver.make_handler(settings) as handler: + assert handler is None + + @ensureDeferred @pytest.mark.skipif(sys.version_info < (3, 8), reason="unittest.mock.AsyncMock") async def test_zyte_api_request_meta_none_deprecation(mockserver): From 5f78c55df9b230580e06911328061531c2c87417 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 12 Aug 2022 12:37:26 +0200 Subject: [PATCH 03/81] =?UTF-8?q?ZYTE=5FAPI=5FALL=20=E2=86=92=20ZYTE=5FAPI?= =?UTF-8?q?=5FON=5FALL=5FREQUESTS;=20mention=20setting=20default=20values?= =?UTF-8?q?=20explicitly?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.rst | 10 +++++---- scrapy_zyte_api/handler.py | 4 ++-- tests/test_api_requests.py | 46 +++++++++++++++++++------------------- 3 files changed, 31 insertions(+), 29 deletions(-) diff --git a/README.rst b/README.rst index 0f9c0b76..47d8ffeb 100644 --- a/README.rst +++ b/README.rst @@ -206,14 +206,16 @@ By default, only requests where the ``zyte_api`` key in Request.meta_ is set to .. _Request.meta: https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta -Set the ``ZYTE_API_ALL`` setting to ``True`` to make all requests go through -Zyte Data API unless the ``zyte_api`` key in Request.meta_ is set to ``False``. +Set the ``ZYTE_API_ON_ALL_REQUESTS`` setting to ``True`` to make all requests +go through Zyte Data API unless the ``zyte_api`` key in Request.meta_ is set to +``False``. ``ZYTE_API_ON_ALL_REQUESTS`` is ``False`` by default. Zyte Data API requests need parameters. 
If you set the ``zyte_api`` key in -Request.meta_ or the ``ZYTE_API_ALL`` setting to ``True``, you must also -:ref:`set default parameters `. +Request.meta_ or the ``ZYTE_API_ON_ALL_REQUESTS`` setting to ``True``, you must +also :ref:`set default parameters `. Set the ``ZYTE_API_ENABLED`` setting to ``False`` to disable this plugin. +``ZYTE_API_ENABLED`` is ``True`` by default. Customizing the retry policy diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 188e462a..c8d18fe1 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -59,7 +59,7 @@ def __init__( self._zyte_api_default_params = settings.getdict("ZYTE_API_DEFAULT_PARAMS") self._session = create_session(connection_pool_size=self._client.n_conn) self._retry_policy = settings.get("ZYTE_API_RETRY_POLICY") - self._all = settings.getbool("ZYTE_API_ALL") + self._on_all_requests = settings.getbool("ZYTE_API_ON_ALL_REQUESTS") def download_request(self, request: Request, spider: Spider) -> Deferred: api_params = self._prepare_api_params(request) @@ -70,7 +70,7 @@ def download_request(self, request: Request, spider: Spider) -> Deferred: return super().download_request(request, spider) def _prepare_api_params(self, request: Request) -> Optional[dict]: - meta_params = request.meta.get("zyte_api", self._all) + meta_params = request.meta.get("zyte_api", self._on_all_requests) if meta_params is False: return None if not meta_params and meta_params != {}: diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 8d731223..0b97b2b7 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -95,7 +95,7 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo @pytest.mark.parametrize( "meta,settings,expected,use_zyte_api", [ - # Default ZYTE_API_ALL + # Default ZYTE_API_ON_ALL_REQUESTS ({}, {}, {}, False), ({"zyte_api": {}}, {}, {}, False), ({"zyte_api": True}, {}, {}, False), @@ -137,16 +137,16 @@ async def 
test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo True, ), - # ZYTE_API_ALL=False - ({}, {"ZYTE_API_ALL": False}, {}, False), - ({"zyte_api": {}}, {"ZYTE_API_ALL": False}, {}, False), - ({"zyte_api": True}, {"ZYTE_API_ALL": False}, {}, False), - ({"zyte_api": False}, {"ZYTE_API_ALL": False}, {}, False), + # ZYTE_API_ON_ALL_REQUESTS=False + ({}, {"ZYTE_API_ON_ALL_REQUESTS": False}, {}, False), + ({"zyte_api": {}}, {"ZYTE_API_ON_ALL_REQUESTS": False}, {}, False), + ({"zyte_api": True}, {"ZYTE_API_ON_ALL_REQUESTS": False}, {}, False), + ({"zyte_api": False}, {"ZYTE_API_ON_ALL_REQUESTS": False}, {}, False), ( {}, { "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ALL": False, + "ZYTE_API_ON_ALL_REQUESTS": False, }, {"browserHtml": True, "geolocation": "CA"}, False, @@ -155,7 +155,7 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo {"zyte_api": False}, { "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ALL": False, + "ZYTE_API_ON_ALL_REQUESTS": False, }, {}, False, @@ -164,7 +164,7 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo {"zyte_api": None}, { "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ALL": False, + "ZYTE_API_ON_ALL_REQUESTS": False, }, {}, False, @@ -173,7 +173,7 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo {"zyte_api": {}}, { "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ALL": False, + "ZYTE_API_ON_ALL_REQUESTS": False, }, {"browserHtml": True, "geolocation": "CA"}, True, @@ -182,7 +182,7 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo {"zyte_api": True}, { "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ALL": False, + "ZYTE_API_ON_ALL_REQUESTS": False, }, {"browserHtml": True, "geolocation": "CA"}, True, @@ -191,22 
+191,22 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo {"zyte_api": {"javascript": True, "geolocation": "US"}}, { "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ALL": False, + "ZYTE_API_ON_ALL_REQUESTS": False, }, {"browserHtml": True, "geolocation": "US", "javascript": True}, True, ), - # ZYTE_API_ALL=True - ({}, {"ZYTE_API_ALL": True}, {}, False), - ({"zyte_api": {}}, {"ZYTE_API_ALL": True}, {}, False), - ({"zyte_api": True}, {"ZYTE_API_ALL": True}, {}, False), - ({"zyte_api": False}, {"ZYTE_API_ALL": True}, {}, False), + # ZYTE_API_ON_ALL_REQUESTS=True + ({}, {"ZYTE_API_ON_ALL_REQUESTS": True}, {}, False), + ({"zyte_api": {}}, {"ZYTE_API_ON_ALL_REQUESTS": True}, {}, False), + ({"zyte_api": True}, {"ZYTE_API_ON_ALL_REQUESTS": True}, {}, False), + ({"zyte_api": False}, {"ZYTE_API_ON_ALL_REQUESTS": True}, {}, False), ( {}, { "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ALL": True, + "ZYTE_API_ON_ALL_REQUESTS": True, }, {"browserHtml": True, "geolocation": "CA"}, True, @@ -215,7 +215,7 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo {"zyte_api": False}, { "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ALL": True, + "ZYTE_API_ON_ALL_REQUESTS": True, }, {}, False, @@ -224,7 +224,7 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo {"zyte_api": None}, { "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ALL": True, + "ZYTE_API_ON_ALL_REQUESTS": True, }, {}, False, @@ -233,7 +233,7 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo {"zyte_api": {}}, { "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ALL": True, + "ZYTE_API_ON_ALL_REQUESTS": True, }, {"browserHtml": True, "geolocation": "CA"}, True, @@ -242,7 +242,7 @@ async def 
test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo {"zyte_api": True}, { "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ALL": True, + "ZYTE_API_ON_ALL_REQUESTS": True, }, {"browserHtml": True, "geolocation": "CA"}, True, @@ -251,7 +251,7 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo {"zyte_api": {"javascript": True, "geolocation": "US"}}, { "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ALL": True, + "ZYTE_API_ON_ALL_REQUESTS": True, }, {"browserHtml": True, "geolocation": "US", "javascript": True}, True, From 68f43dc9e711c375104393db505eb6bd65bb248b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 12 Aug 2022 22:55:16 +0200 Subject: [PATCH 04/81] Initial proposal for transparent mapping --- scrapy_zyte_api/handler.py | 159 ++++++++ tests/mockserver.py | 7 +- tests/test_api_requests.py | 747 ++++++++++++++++++++++++++++++++++++- 3 files changed, 910 insertions(+), 3 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index c8d18fe1..83a676b0 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -1,4 +1,5 @@ import logging +from base64 import b64decode, b64encode from typing import Any, Dict, Generator, Optional, Union from warnings import warn @@ -8,6 +9,10 @@ from scrapy.exceptions import IgnoreRequest, NotConfigured from scrapy.http import Request from scrapy.settings import Settings +from scrapy.settings.default_settings import ( + DEFAULT_REQUEST_HEADERS, + USER_AGENT as DEFAULT_USER_AGENT +) from scrapy.utils.defer import deferred_from_coro from scrapy.utils.reactor import verify_installed_reactor from twisted.internet.defer import Deferred, inlineCallbacks @@ -60,6 +65,21 @@ def __init__( self._session = create_session(connection_pool_size=self._client.n_conn) self._retry_policy = settings.get("ZYTE_API_RETRY_POLICY") self._on_all_requests = 
settings.getbool("ZYTE_API_ON_ALL_REQUESTS") + self._automap = settings.getbool("ZYTE_API_AUTOMAP", True) + self._unsupported_headers = { + header.strip().lower().encode() for header in settings.getlist( + "ZYTE_API_UNSUPPORTED_HEADERS", + ["Cookie", "User-Agent"], + ) + } + browser_headers = settings.getdict( + "ZYTE_API_BROWSER_HEADERS", + {"Referer": "referer"}, + ) + self._browser_headers = { + k.strip().lower().encode(): v + for k, v in browser_headers.items() + } def download_request(self, request: Request, spider: Spider) -> Deferred: api_params = self._prepare_api_params(request) @@ -93,6 +113,145 @@ def _prepare_api_params(self, request: Request) -> Optional[dict]: f"instead ({request.url})." ) raise IgnoreRequest() + + if not self._automap: + return api_params + + if not any( + api_params.get(k) + for k in ("httpResponseBody", "browserHtml", "screenshot") + ): + api_params.setdefault("httpResponseBody", True) + response_body = api_params.get("httpResponseBody") + + if any(api_params.get(k) for k in ("httpResponseBody", "browserHtml")): + if api_params.get("httpResponseHeaders") is True: + logger.warning( + "You do not need to set httpResponseHeaders to True if " + "you httpResponseBody or browserHtml to True. Note that " + "httpResponseBody is set to True automatically if neither " + "browserHtml nor screenshot are set to True." + ) + api_params.setdefault("httpResponseHeaders", True) + + method = api_params.get("httpRequestMethod") + if method: + logger.warning( + f"Request {request} uses the Zyte Data API httpRequestMethod " + f"parameter. Use Request.method instead." + ) + if method != request.method: + logger.warning( + f"The HTTP method of request {request} ({request.method}) " + f"does not match the Zyte Data API httpRequestMethod " + f"parameter ({method})." 
+ ) + elif request.method != "GET": + if response_body: + api_params["httpRequestMethod"] = request.method + else: + logger.warning( + f"The HTTP method of request {request} ({request.method}) " + f"is being ignored. The httpRequestMethod parameter of " + f"Zyte Data API can only be set when the httpResponseBody " + f"parameter is True." + ) + + if response_body: + headers = api_params.get("customHttpRequestHeaders") + if headers is not None: + logger.warning( + f"Request {request} defines the Zyte Data API " + f"customHttpRequestHeaders parameter. Use Request.headers " + f"instead." + ) + elif request.headers: + headers = [] + for k, v in request.headers.items(): + if not v: + continue + v = b','.join(v).decode() + lowercase_k = k.strip().lower() + if lowercase_k in self._unsupported_headers: + if ( + lowercase_k != b'user-agent' + or v != DEFAULT_USER_AGENT + ): + logger.warning( + f"Request {request} defines header {k}, which " + f"cannot be mapped into the Zyte Data API " + f"customHttpRequestHeaders parameter." + ) + continue + k = k.decode() + headers.append({"name": k, "value": v}) + if headers: + api_params["customHttpRequestHeaders"] = headers + if ( + not response_body + or any(api_params.get(k) for k in ("browserHtml", "screenshot")) + ): + headers = api_params.get("requestHeaders") + if headers is not None: + logger.warning( + f"Request {request} defines the Zyte Data API " + f"requestHeaders parameter. Use Request.headers instead." 
+ ) + elif request.headers: + request_headers = {} + for k, v in request.headers.items(): + if not v: + continue + v = b','.join(v).decode() + lowercase_k = k.strip().lower() + key = self._browser_headers.get(lowercase_k) + if key is not None: + request_headers[key] = v + elif not ( + ( + lowercase_k == b'accept' + and v == DEFAULT_REQUEST_HEADERS['Accept'] + ) or ( + lowercase_k == b'accept-language' + and v == DEFAULT_REQUEST_HEADERS['Accept-Language'] + ) or ( + lowercase_k == b'user-agent' + and v == DEFAULT_USER_AGENT + ) + ): + logger.warning( + f"Request {request} defines header {k}, which " + f"cannot be mapped into the Zyte Data API " + f"requestHeaders parameter." + ) + if request_headers: + api_params["requestHeaders"] = request_headers + + body = api_params.get("httpRequestBody") + if body: + logger.warning( + f"Request {request} uses the Zyte Data API httpRequestBody " + f"parameter. Use Request.body instead." + ) + decoded_body = b64decode(body) + if decoded_body != request.body: + logger.warning( + f"The body of request {request} ({request.body!r}) " + f"does not match the Zyte Data API httpRequestBody " + f"parameter ({body!r}; decoded: {decoded_body!r})." + ) + elif request.body != b"": + if response_body: + base64_body = b64encode(request.body).decode() + api_params["httpRequestBody"] = base64_body + else: + logger.warning( + f"The body of request {request} ({request.body!r}) " + f"is being ignored. The httpRequestBody parameter of " + f"Zyte Data API can only be set when the httpResponseBody " + f"parameter is True." + ) + return api_params def _update_stats(self): diff --git a/tests/mockserver.py b/tests/mockserver.py index 45444c9f..7aed1445 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -66,7 +66,10 @@ def render_POST(self, request): html = "Hello

World!

" if "browserHtml" in request_data: - if "httpResponseBody" in request_data: + if ( + "httpResponseBody" in request_data + and not request_data.get("passThrough") + ): request.setResponseCode(422) return json.dumps({ "type": "/request/unprocessable", @@ -75,7 +78,7 @@ def render_POST(self, request): "detail": "Incompatible parameters were found in the request." }).encode() response_data["browserHtml"] = html - elif "httpResponseBody" in request_data: + if "httpResponseBody" in request_data: base64_html = b64encode(html.encode()).decode() response_data["httpResponseBody"] = base64_html diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 0b97b2b7..46b3c6e3 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -1,6 +1,6 @@ import sys from asyncio import iscoroutine -from typing import Any, Dict +from typing import Any, Dict, List, Literal, Union from unittest import mock import pytest @@ -9,6 +9,10 @@ from scrapy import Request, Spider from scrapy.exceptions import IgnoreRequest, NotSupported from scrapy.http import Response, TextResponse +from scrapy.settings.default_settings import ( + DEFAULT_REQUEST_HEADERS, + USER_AGENT as DEFAULT_USER_AGENT, +) from scrapy.utils.defer import deferred_from_coro from scrapy.utils.test import get_crawler from twisted.internet.defer import Deferred @@ -450,3 +454,744 @@ async def parse(self, response): assert ( set(response_indexes[: len(expected_first_indexes)]) == expected_first_indexes ) + + +@ensureDeferred +@pytest.mark.skipif(sys.version_info < (3, 8), reason="unittest.mock.AsyncMock") +@pytest.mark.parametrize( + "request_kwargs,settings,expected,warnings", + [ + # Automatic mapping of request parameters to Zyte Data API parameters + # is enabled by default, but can be disabled. + # + # httpResponseBody is set to True if no other main content is + # requested. 
+ *( + ( + {}, + settings, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ) + for settings in ( + {}, + {"ZYTE_API_AUTOMAP": True}, + ) + ), + ( + {}, + {"ZYTE_API_AUTOMAP": False}, + False, + [], + ), + *( + ( + {"meta": {"zyte_api": {"a": "b"}}}, + settings, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "a": "b", + }, + [], + ) + for settings in ( + {}, + {"ZYTE_API_AUTOMAP": True}, + ) + ), + ( + {"meta": {"zyte_api": {"a": "b"}}}, + {"ZYTE_API_AUTOMAP": False}, + { + "a": "b", + }, + [], + ), + # httpResponseBody can be unset through meta. That way, if a new main + # output type other than browserHtml and screenshot is implemented in + # the future, you can request the new output type and also prevent + # httpResponseBody from being enabled automatically, without the need + # to disable automated mapping completely. + ( + {"meta": {"zyte_api": {"httpResponseBody": False}}}, + {}, + { + "httpResponseBody": False, + }, + [], + ), + ( + { + "meta": {"zyte_api": {"httpResponseBody": False, "newOutputType": True}}, + }, + {}, + { + "httpResponseBody": False, + "newOutputType": True, + }, + [], + ), + # httpResponseHeaders is automatically set to True for httpResponseBody + # (shown in prior tests) and browserHtml. + ( + { + "meta": {"zyte_api": {"browserHtml": True}}, + }, + {}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + [], + ), + # httpResponseHeaders is not set for screenshot. + ( + { + "meta": {"zyte_api": {"screenshot": True}}, + }, + {}, + { + "screenshot": True, + }, + [], + ), + # httpResponseHeaders can be unset through meta. 
+ ( + { + "meta": {"zyte_api": {"httpResponseHeaders": False}}, + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": False, + }, + [], + ), + ( + { + "meta": { + "zyte_api": { + "browserHtml": True, + "httpResponseHeaders": False, + }, + }, + }, + {}, + { + "browserHtml": True, + "httpResponseHeaders": False, + }, + [], + ), + + # METHOD + + # Request.method is mapped as is. + *( + ( + {"method": method}, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "httpRequestMethod": method, + }, + [], + ) + for method in ( + "POST", + "PUT", + "DELETE", + "OPTIONS", + "TRACE", + "PATCH", + ) + ), + # Request.method is mapped even for methods that Zyte Data API does not + # support. + *( + ( + {"method": method}, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "httpRequestMethod": method, + }, + [], + ) + for method in ( + "HEAD", + "CONNECT", + "FOO", + ) + ), + # An exception is the default method (GET), which is not mapped. + ( + {"method": "GET"}, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + # httpRequestMethod should not be defined through meta. + ( + { + "meta": { + "zyte_api": { + "httpRequestMethod": "GET", + }, + }, + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "httpRequestMethod": "GET", + }, + ["Use Request.method instead"], + ), + # If defined through meta, httpRequestMethod takes precedence, warning + # about value mismatches. + ( + { + "method": "POST", + "meta": { + "zyte_api": { + "httpRequestMethod": "PATCH", + }, + }, + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "httpRequestMethod": "PATCH", + }, + [ + "Use Request.method instead", + "does not match the Zyte Data API httpRequestMethod parameter", + ], + ), + # A non-GET method should not be used unless httpResponseBody is also + # used. 
+ ( + { + "method": "POST", + "meta": {"zyte_api": {"browserHtml": True}}, + }, + {}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + ["can only be set when the httpResponseBody parameter"], + ), + ( + { + "method": "POST", + "meta": {"zyte_api": {"screenshot": True}}, + }, + {}, + { + "screenshot": True, + }, + ["can only be set when the httpResponseBody parameter"], + ), + + # HEADERS + + # Headers are mapped to requestHeaders or customHttpRequestHeaders + # depending on whether or not httpResponseBody is declared. + ( + { + "headers": {"Referer": "a"}, + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + }, + [], + ), + ( + { + "headers": {"Referer": "a"}, + "meta": {"zyte_api": {"browserHtml": True}}, + }, + {}, + { + "browserHtml": True, + "httpResponseHeaders": True, + "requestHeaders": {"referer": "a"}, + }, + [], + ), + # We intentionally generate requestHeaders even if browserHtml and + # screenshot are not used, assuming that future additional outputs are + # more likely to use requestHeaders than to use + # customHttpRequestHeaders. + ( + { + "headers": {"Referer": "a"}, + "meta": {"zyte_api": {"httpResponseBody": False}}, + }, + {}, + { + "httpResponseBody": False, + "requestHeaders": {"referer": "a"}, + }, + [], + ), + # If both httpResponseBody and currently-incompatible attributes + # (browserHtml, screenshot) are declared, both fields are generated. + # This is in case a single request is allowed to combine both in the + # future. + ( + { + "headers": {"Referer": "a"}, + "meta": { + "zyte_api": { + "httpResponseBody": True, + "browserHtml": True, + # Makes the mock API server return 200 despite the + # bad input. 
+ "passThrough": True, + }, + }, + }, + {}, + { + "httpResponseBody": True, + "browserHtml": True, + "httpResponseHeaders": True, + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + "requestHeaders": {"referer": "a"}, + "passThrough": True, + }, + [], + ), + # If requestHeaders or customHttpRequestHeaders are used, their value + # prevails, but a warning is issued. + ( + { + "headers": {"Referer": "a"}, + "meta": { + "zyte_api": { + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "b"}, + ], + }, + }, + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "b"}, + ], + }, + ["Use Request.headers instead"], + ), + ( + { + "headers": {"Referer": "a"}, + "meta": { + "zyte_api": { + "browserHtml": True, + "requestHeaders": {"referer": "b"}, + }, + }, + }, + {}, + { + "browserHtml": True, + "httpResponseHeaders": True, + "requestHeaders": {"referer": "b"}, + }, + ["Use Request.headers instead"], + ), + # A request should not have headers if requestHeaders or + # customHttpRequestHeaders are also used, even if they match. + ( + { + "headers": {"Referer": "b"}, + "meta": { + "zyte_api": { + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "b"}, + ], + }, + }, + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "b"}, + ], + }, + ["Use Request.headers instead"], + ), + ( + { + "headers": {"Referer": "b"}, + "meta": { + "zyte_api": { + "browserHtml": True, + "requestHeaders": {"referer": "b"}, + }, + }, + }, + {}, + { + "browserHtml": True, + "httpResponseHeaders": True, + "requestHeaders": {"referer": "b"}, + }, + ["Use Request.headers instead"], + ), + # Unsupported headers not present in Scrapy requests by default are + # dropped with a warning. + # If all headers are unsupported, the header parameter is not even set. 
+ ( + { + "headers": {"a": "b"}, + "meta": {"zyte_api": {"browserHtml": True}}, + }, + {}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + # Headers with None as value are silently ignored. + ( + { + "headers": {"a": None}, + "meta": {"zyte_api": {"browserHtml": True}}, + }, + {}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + [], + ), + # Headers with an empty string as value are not silently ignored. + ( + { + "headers": {"a": ""}, + "meta": {"zyte_api": {"browserHtml": True}}, + }, + {}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + # Unsupported headers are looked up case-insensitively. + ( + { + "headers": {"user-Agent": ""}, + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + # The Accept and Accept-Language headers, when unsupported, are dropped + # silently if their value matches the default value of Scrapy for + # DEFAULT_REQUEST_HEADERS, or with a warning otherwise. + ( + { + "headers": DEFAULT_REQUEST_HEADERS, + "meta": {"zyte_api": {"browserHtml": True}}, + }, + {}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + { + "headers": { + "Accept": "application/json", + "Accept-Language": "uk", + }, + "meta": {"zyte_api": {"browserHtml": True}}, + }, + {}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + # The Cookie header is dropped with a warning. 
+ ( + { + "headers": {"Cookie": "a=b",}, + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + ( + { + "headers": {"Cookie": "a=b",}, + "meta": {"zyte_api": {"browserHtml": True}}, + }, + {}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + # The User-Agent header, which Scrapy sets by default, is dropped + # silently if it matches the default value of the USER_AGENT setting, + # or with a warning otherwise. + ( + { + "headers": {"User-Agent": DEFAULT_USER_AGENT}, + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + { + "headers": {"User-Agent": ""}, + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + ( + { + "headers": {"User-Agent": DEFAULT_USER_AGENT}, + "meta": {"zyte_api": {"browserHtml": True}}, + }, + {}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + { + "headers": {"User-Agent": ""}, + "meta": {"zyte_api": {"browserHtml": True}}, + }, + {}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + # You may update the ZYTE_API_UNSUPPORTED_HEADERS setting to remove + # headers that the customHttpRequestHeaders parameter starts supporting + # in the future. + ( + { + "headers": {"User-Agent": ""}, + }, + { + "ZYTE_API_UNSUPPORTED_HEADERS": ["Cookie"], + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "customHttpRequestHeaders": [ + {"name": "User-Agent", "value": ""}, + ], + }, + [], + ), + # You may update the ZYTE_API_BROWSER_HEADERS setting to extend support + # for new fields that the requestHeaders parameter may support in the + # future. 
+ ( + { + "headers": {"User-Agent": ""}, + "meta": {"zyte_api": {"browserHtml": True}}, + }, + { + "ZYTE_API_BROWSER_HEADERS": { + "Referer": "referer", + "User-Agent": "userAgent", + }, + }, + { + "browserHtml": True, + "httpResponseHeaders": True, + "requestHeaders": {"userAgent": ""}, + }, + [], + ), + + # BODY + + # The body is copied into httpRequestBody, base64-encoded. + ( + { + "body": "a", + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "httpRequestBody": "YQ==", + }, + [], + ), + # httpRequestBody defined in meta takes precedence, but it causes a + # warning. + ( + { + "body": "a", + "meta": {"zyte_api": {"httpRequestBody": "Yg=="}}, + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "httpRequestBody": "Yg==", + }, + [ + "Use Request.body instead", + "does not match the Zyte Data API httpRequestBody parameter", + ], + ), + # httpRequestBody defined in meta causes a warning even if it matches + # request.body. + ( + { + "body": "a", + "meta": {"zyte_api": {"httpRequestBody": "YQ=="}}, + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "httpRequestBody": "YQ==", + }, + ["Use Request.body instead"], + ), + # A body should not be used unless httpResponseBody is also used. + ( + { + "body": "a", + "meta": {"zyte_api": {"browserHtml": True}}, + }, + {}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + ["can only be set when the httpResponseBody parameter"], + ), + ( + { + "body": "a", + "meta": {"zyte_api": {"screenshot": True}}, + }, + {}, + { + "screenshot": True, + }, + ["can only be set when the httpResponseBody parameter"], + ), + + # httpResponseHeaders + + # Warn if httpResponseHeaders is defined unnecessarily. 
+ ( + { + "meta": {"zyte_api": {"httpResponseHeaders": True}}, + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["do not need to set httpResponseHeaders"], + ), + ], +) +async def test_automap( + request_kwargs: Dict[str, Any], + settings: Dict[str, str], + expected: Union[Dict[str, str], Literal[False]], + warnings: List[str], + mockserver, + caplog, +): + settings.update({"ZYTE_API_ON_ALL_REQUESTS": True}) + async with mockserver.make_handler(settings) as handler: + if expected is False: + # Only the Zyte Data API client is mocked, meaning requests that + # do not go through Zyte Data API are actually sent, so we point + # them to the mock server to avoid internet connections in tests. + request_kwargs["url"] = mockserver.urljoin("/") + else: + request_kwargs["url"] = "https://toscrape.com" + request = Request(**request_kwargs) + unmocked_client = handler._client + handler._client = mock.AsyncMock(unmocked_client) + handler._client.request_raw.side_effect = unmocked_client.request_raw + with caplog.at_level("WARNING"): + await handler.download_request(request, None) + + # What we're interested in is the Request call in the API + request_call = [ + c for c in handler._client.mock_calls if "request_raw(" in str(c) + ] + + if expected is False: + assert request_call == [] + return + + if not request_call: + pytest.fail("The client's request_raw() method was not called.") + + args_used = request_call[0].args[0] + args_used.pop("url") + assert args_used == expected + + if warnings: + for warning in warnings: + assert warning in caplog.text + else: + assert not caplog.records From a2ad009301af451dcf79a94614d449b8c1cb83ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 12 Aug 2022 23:38:32 +0200 Subject: [PATCH 05/81] Update test_zyte_api_request_meta --- .pre-commit-config.yaml | 2 +- scrapy_zyte_api/responses.py | 1 - tests/mockserver.py | 21 ++--- tests/test_api_requests.py | 166 
+++++++++++++++++++++++------------ 4 files changed, 120 insertions(+), 70 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0b630578..2664b236 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,7 @@ repos: hooks: - id: isort - repo: https://github.com/ambv/black - rev: 20.8b1 + rev: 22.3.0 hooks: - id: black language_version: python3.8 diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index 241b43ca..0c758d5a 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -18,7 +18,6 @@ class ZyteAPIMixin: } def __init__(self, *args, raw_api_response: Dict = None, **kwargs): - super().__init__(*args, **kwargs) self._raw_api_response = raw_api_response def replace(self, *args, **kwargs): diff --git a/tests/mockserver.py b/tests/mockserver.py index 7aed1445..45b2738a 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -66,23 +66,24 @@ def render_POST(self, request): html = "Hello

World!

" if "browserHtml" in request_data: - if ( - "httpResponseBody" in request_data - and not request_data.get("passThrough") + if "httpResponseBody" in request_data and not request_data.get( + "passThrough" ): request.setResponseCode(422) - return json.dumps({ - "type": "/request/unprocessable", - "title": "Unprocessable Request", - "status": 422, - "detail": "Incompatible parameters were found in the request." - }).encode() + return json.dumps( + { + "type": "/request/unprocessable", + "title": "Unprocessable Request", + "status": 422, + "detail": "Incompatible parameters were found in the request.", + } + ).encode() response_data["browserHtml"] = html if "httpResponseBody" in request_data: base64_html = b64encode(html.encode()).decode() response_data["httpResponseBody"] = base64_html - if "httpResponseHeaders" in request_data: + if request_data.get("httpResponseHeaders") is True: response_data["httpResponseHeaders"] = [ {"name": "test_header", "value": "test_value"} ] diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 46b3c6e3..4d108423 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -9,10 +9,8 @@ from scrapy import Request, Spider from scrapy.exceptions import IgnoreRequest, NotSupported from scrapy.http import Response, TextResponse -from scrapy.settings.default_settings import ( - DEFAULT_REQUEST_HEADERS, - USER_AGENT as DEFAULT_USER_AGENT, -) +from scrapy.settings.default_settings import DEFAULT_REQUEST_HEADERS +from scrapy.settings.default_settings import USER_AGENT as DEFAULT_USER_AGENT from scrapy.utils.defer import deferred_from_coro from scrapy.utils.test import get_crawler from twisted.internet.defer import Deferred @@ -29,9 +27,19 @@ {"zyte_api": {"browserHtml": True, "geolocation": "US"}}, {"zyte_api": {"browserHtml": True, "geolocation": "US", "echoData": 123}}, {"zyte_api": {"browserHtml": True, "randomParameter": None}}, + {"zyte_api": {"httpResponseBody": True}}, + {"zyte_api": {"httpResponseBody": 
True, "geolocation": "US"}}, + { + "zyte_api": { + "httpResponseBody": True, + "geolocation": "US", + "echoData": 123, + } + }, + {"zyte_api": {"httpResponseBody": True, "randomParameter": None}}, ], ) -async def test_browser_html_request(meta: Dict[str, Dict[str, Any]], mockserver): +async def test_html_response_and_headers(meta: Dict[str, Dict[str, Any]], mockserver): req, resp = await produce_request_response(mockserver, meta) assert isinstance(resp, TextResponse) assert resp.request is req @@ -42,21 +50,35 @@ async def test_browser_html_request(meta: Dict[str, Dict[str, Any]], mockserver) assert resp.text == "Hello

World!

" assert resp.css("h1 ::text").get() == "World!" assert resp.xpath("//body/text()").getall() == ["Hello"] + assert resp.headers == {b"Test_Header": [b"test_value"]} @pytest.mark.parametrize( "meta", [ - {"zyte_api": {"httpResponseBody": True}}, - {"zyte_api": {"httpResponseBody": True, "geolocation": "US"}}, + {"zyte_api": {"httpResponseBody": True, "httpResponseHeaders": False}}, { "zyte_api": { "httpResponseBody": True, + "httpResponseHeaders": False, + "geolocation": "US", + }, + }, + { + "zyte_api": { + "httpResponseBody": True, + "httpResponseHeaders": False, "geolocation": "US", "echoData": 123, } }, - {"zyte_api": {"httpResponseBody": True, "randomParameter": None}}, + { + "zyte_api": { + "httpResponseBody": True, + "httpResponseHeaders": False, + "randomParameter": None, + }, + }, ], ) @ensureDeferred @@ -75,24 +97,6 @@ async def test_http_response_body_request(meta: Dict[str, Dict[str, Any]], mocks assert resp.xpath("//body/text()").getall() == ["Hello"] -@pytest.mark.parametrize( - "meta", - [ - {"zyte_api": {"httpResponseBody": True, "httpResponseHeaders": True}}, - {"zyte_api": {"browserHtml": True, "httpResponseHeaders": True}}, - ], -) -@ensureDeferred -async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mockserver): - req, resp = await produce_request_response(mockserver, meta) - assert resp.request is req - assert resp.url == req.url - assert resp.status == 200 - assert "zyte-api" in resp.flags - assert resp.body == b"Hello

World!

" - assert resp.headers == {b"Test_Header": [b"test_value"]} - - @ensureDeferred @pytest.mark.skipif(sys.version_info < (3, 8), reason="unittest.mock.AsyncMock") @pytest.mark.filterwarnings("ignore:.*None is deprecated") @@ -101,13 +105,23 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo [ # Default ZYTE_API_ON_ALL_REQUESTS ({}, {}, {}, False), - ({"zyte_api": {}}, {}, {}, False), - ({"zyte_api": True}, {}, {}, False), + ( + {"zyte_api": {}}, + {}, + {"httpResponseBody": True, "httpResponseHeaders": True}, + True, + ), + ( + {"zyte_api": True}, + {}, + {"httpResponseBody": True, "httpResponseHeaders": True}, + True, + ), ({"zyte_api": False}, {}, {}, False), ( {}, {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, - {"browserHtml": True, "geolocation": "CA"}, + {"browserHtml": True, "geolocation": "CA", "httpResponseHeaders": True}, False, ), ( @@ -125,26 +139,40 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo ( {"zyte_api": {}}, {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, - {"browserHtml": True, "geolocation": "CA"}, + {"browserHtml": True, "geolocation": "CA", "httpResponseHeaders": True}, True, ), ( {"zyte_api": True}, {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, - {"browserHtml": True, "geolocation": "CA"}, + {"browserHtml": True, "geolocation": "CA", "httpResponseHeaders": True}, True, ), ( {"zyte_api": {"javascript": True, "geolocation": "US"}}, {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, - {"browserHtml": True, "geolocation": "US", "javascript": True}, + { + "browserHtml": True, + "geolocation": "US", + "javascript": True, + "httpResponseHeaders": True, + }, True, ), - # ZYTE_API_ON_ALL_REQUESTS=False ({}, {"ZYTE_API_ON_ALL_REQUESTS": False}, {}, False), - ({"zyte_api": {}}, {"ZYTE_API_ON_ALL_REQUESTS": False}, {}, False), - ({"zyte_api": True}, {"ZYTE_API_ON_ALL_REQUESTS": False}, 
{}, False), + ( + {"zyte_api": {}}, + {"ZYTE_API_ON_ALL_REQUESTS": False}, + {"httpResponseBody": True, "httpResponseHeaders": True}, + True, + ), + ( + {"zyte_api": True}, + {"ZYTE_API_ON_ALL_REQUESTS": False}, + {"httpResponseBody": True, "httpResponseHeaders": True}, + True, + ), ({"zyte_api": False}, {"ZYTE_API_ON_ALL_REQUESTS": False}, {}, False), ( {}, @@ -179,7 +207,7 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, "ZYTE_API_ON_ALL_REQUESTS": False, }, - {"browserHtml": True, "geolocation": "CA"}, + {"browserHtml": True, "geolocation": "CA", "httpResponseHeaders": True}, True, ), ( @@ -188,7 +216,7 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, "ZYTE_API_ON_ALL_REQUESTS": False, }, - {"browserHtml": True, "geolocation": "CA"}, + {"browserHtml": True, "geolocation": "CA", "httpResponseHeaders": True}, True, ), ( @@ -197,14 +225,33 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, "ZYTE_API_ON_ALL_REQUESTS": False, }, - {"browserHtml": True, "geolocation": "US", "javascript": True}, + { + "browserHtml": True, + "geolocation": "US", + "javascript": True, + "httpResponseHeaders": True, + }, True, ), - # ZYTE_API_ON_ALL_REQUESTS=True - ({}, {"ZYTE_API_ON_ALL_REQUESTS": True}, {}, False), - ({"zyte_api": {}}, {"ZYTE_API_ON_ALL_REQUESTS": True}, {}, False), - ({"zyte_api": True}, {"ZYTE_API_ON_ALL_REQUESTS": True}, {}, False), + ( + {}, + {"ZYTE_API_ON_ALL_REQUESTS": True}, + {"httpResponseBody": True, "httpResponseHeaders": True}, + True, + ), + ( + {"zyte_api": {}}, + {"ZYTE_API_ON_ALL_REQUESTS": True}, + {"httpResponseBody": True, "httpResponseHeaders": True}, + True, + ), + ( + {"zyte_api": True}, + {"ZYTE_API_ON_ALL_REQUESTS": True}, + 
{"httpResponseBody": True, "httpResponseHeaders": True}, + True, + ), ({"zyte_api": False}, {"ZYTE_API_ON_ALL_REQUESTS": True}, {}, False), ( {}, @@ -212,7 +259,7 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, "ZYTE_API_ON_ALL_REQUESTS": True, }, - {"browserHtml": True, "geolocation": "CA"}, + {"browserHtml": True, "geolocation": "CA", "httpResponseHeaders": True}, True, ), ( @@ -239,7 +286,7 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, "ZYTE_API_ON_ALL_REQUESTS": True, }, - {"browserHtml": True, "geolocation": "CA"}, + {"browserHtml": True, "geolocation": "CA", "httpResponseHeaders": True}, True, ), ( @@ -248,7 +295,7 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, "ZYTE_API_ON_ALL_REQUESTS": True, }, - {"browserHtml": True, "geolocation": "CA"}, + {"browserHtml": True, "geolocation": "CA", "httpResponseHeaders": True}, True, ), ( @@ -257,7 +304,12 @@ async def test_http_response_headers_request(meta: Dict[str, Dict[str, Any]], mo "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, "ZYTE_API_ON_ALL_REQUESTS": True, }, - {"browserHtml": True, "geolocation": "US", "javascript": True}, + { + "browserHtml": True, + "geolocation": "US", + "javascript": True, + "httpResponseHeaders": True, + }, True, ), ], @@ -526,7 +578,9 @@ async def parse(self, response): ), ( { - "meta": {"zyte_api": {"httpResponseBody": False, "newOutputType": True}}, + "meta": { + "zyte_api": {"httpResponseBody": False, "newOutputType": True} + }, }, {}, { @@ -587,9 +641,7 @@ async def parse(self, response): }, [], ), - # METHOD - # Request.method is mapped as is. 
*( ( @@ -704,9 +756,7 @@ async def parse(self, response): }, ["can only be set when the httpResponseBody parameter"], ), - # HEADERS - # Headers are mapped to requestHeaders or customHttpRequestHeaders # depending on whether or not httpResponseBody is declared. ( @@ -950,7 +1000,9 @@ async def parse(self, response): # The Cookie header is dropped with a warning. ( { - "headers": {"Cookie": "a=b",}, + "headers": { + "Cookie": "a=b", + }, }, {}, { @@ -961,7 +1013,9 @@ async def parse(self, response): ), ( { - "headers": {"Cookie": "a=b",}, + "headers": { + "Cookie": "a=b", + }, "meta": {"zyte_api": {"browserHtml": True}}, }, {}, @@ -1060,9 +1114,7 @@ async def parse(self, response): }, [], ), - # BODY - # The body is copied into httpRequestBody, base64-encoded. ( { @@ -1133,9 +1185,7 @@ async def parse(self, response): }, ["can only be set when the httpResponseBody parameter"], ), - # httpResponseHeaders - # Warn if httpResponseHeaders is defined unnecessarily. ( { @@ -1152,7 +1202,7 @@ async def parse(self, response): ) async def test_automap( request_kwargs: Dict[str, Any], - settings: Dict[str, str], + settings: Dict[str, Any], expected: Union[Dict[str, str], Literal[False]], warnings: List[str], mockserver, From 4c44ebe7a0ab6e5b62db8c14f02df59e401221ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 12 Aug 2022 23:46:18 +0200 Subject: [PATCH 06/81] Run pre-commit hooks --- scrapy_zyte_api/handler.py | 156 +++++++++++++++++++------------------ tests/conftest.py | 3 +- tests/test_handler.py | 48 ++++++------ 3 files changed, 105 insertions(+), 102 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 83a676b0..39381dd8 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -9,10 +9,8 @@ from scrapy.exceptions import IgnoreRequest, NotConfigured from scrapy.http import Request from scrapy.settings import Settings -from scrapy.settings.default_settings import ( - 
DEFAULT_REQUEST_HEADERS, - USER_AGENT as DEFAULT_USER_AGENT -) +from scrapy.settings.default_settings import DEFAULT_REQUEST_HEADERS +from scrapy.settings.default_settings import USER_AGENT as DEFAULT_USER_AGENT from scrapy.utils.defer import deferred_from_coro from scrapy.utils.reactor import verify_installed_reactor from twisted.internet.defer import Deferred, inlineCallbacks @@ -67,7 +65,8 @@ def __init__( self._on_all_requests = settings.getbool("ZYTE_API_ON_ALL_REQUESTS") self._automap = settings.getbool("ZYTE_API_AUTOMAP", True) self._unsupported_headers = { - header.strip().lower().encode() for header in settings.getlist( + header.strip().lower().encode() + for header in settings.getlist( "ZYTE_API_UNSUPPORTED_HEADERS", ["Cookie", "User-Agent"], ) @@ -77,8 +76,7 @@ def __init__( {"Referer": "referer"}, ) self._browser_headers = { - k.strip().lower().encode(): v - for k, v in browser_headers.items() + k.strip().lower().encode(): v for k, v in browser_headers.items() } def download_request(self, request: Request, spider: Spider) -> Deferred: @@ -114,49 +112,13 @@ def _prepare_api_params(self, request: Request) -> Optional[dict]: ) raise IgnoreRequest() - if not self._automap: - return api_params + if self._automap: + self._map_params(api_params, request) - if not any( - api_params.get(k) - for k in ("httpResponseBody", "browserHtml", "screenshot") - ): - api_params.setdefault("httpResponseBody", True) - response_body = api_params.get("httpResponseBody") - - if any(api_params.get(k) for k in ("httpResponseBody", "browserHtml")): - if api_params.get("httpResponseHeaders") is True: - logger.warning( - "You do not need to set httpResponseHeaders to True if " - "you httpResponseBody or browserHtml to True. Note that " - "httpResponseBody is set to True automatically if neither " - "browserHtml nor screenshot are set to True." 
- ) - api_params.setdefault("httpResponseHeaders", True) - - method = api_params.get("httpRequestMethod") - if method: - logger.warning( - f"Request {request} uses the Zyte Data API httpRequestMethod " - f"parameter. Use Request.method instead." - ) - if method != request.method: - logger.warning( - f"The HTTP method of request {request} ({request.method}) " - f"does not match the Zyte Data API httpRequestMethod " - f"parameter ({method})." - ) - elif request.method != "GET": - if response_body: - api_params["httpRequestMethod"] = request.method - else: - logger.warning( - f"The HTTP method of request {request} ({request.method}) " - f"is being ignored. The httpRequestMethod parameter of " - f"Zyte Data API can only be set when the httpResponseBody " - f"parameter is True." - ) + return api_params + def _map_headers(self, api_params, request): + response_body = api_params.get("httpResponseBody") if response_body: headers = api_params.get("customHttpRequestHeaders") if headers is not None: @@ -170,13 +132,10 @@ def _prepare_api_params(self, request: Request) -> Optional[dict]: for k, v in request.headers.items(): if not v: continue - v = b','.join(v).decode() + v = b",".join(v).decode() lowercase_k = k.strip().lower() if lowercase_k in self._unsupported_headers: - if ( - lowercase_k != b'user-agent' - or v != DEFAULT_USER_AGENT - ): + if lowercase_k != b"user-agent" or v != DEFAULT_USER_AGENT: logger.warning( f"Request {request} defines header {k}, which " f"cannot be mapped into the Zyte Data API " @@ -187,9 +146,8 @@ def _prepare_api_params(self, request: Request) -> Optional[dict]: headers.append({"name": k, "value": v}) if headers: api_params["customHttpRequestHeaders"] = headers - if ( - not response_body - or any(api_params.get(k) for k in ("browserHtml", "screenshot")) + if not response_body or any( + api_params.get(k) for k in ("browserHtml", "screenshot") ): headers = api_params.get("requestHeaders") if headers is not None: @@ -202,22 +160,21 @@ def 
_prepare_api_params(self, request: Request) -> Optional[dict]: for k, v in request.headers.items(): if not v: continue - v = b','.join(v).decode() + v = b",".join(v).decode() lowercase_k = k.strip().lower() key = self._browser_headers.get(lowercase_k) if key is not None: request_headers[key] = v elif not ( ( - lowercase_k == b'accept' - and v == DEFAULT_REQUEST_HEADERS['Accept'] - ) or ( - lowercase_k == b'accept-language' - and v == DEFAULT_REQUEST_HEADERS['Accept-Language'] - ) or ( - lowercase_k == b'user-agent' - and v == DEFAULT_USER_AGENT + lowercase_k == b"accept" + and v == DEFAULT_REQUEST_HEADERS["Accept"] + ) + or ( + lowercase_k == b"accept-language" + and v == DEFAULT_REQUEST_HEADERS["Accept-Language"] ) + or (lowercase_k == b"user-agent" and v == DEFAULT_USER_AGENT) ): logger.warning( f"Request {request} defines header {k}, which " @@ -227,6 +184,48 @@ def _prepare_api_params(self, request: Request) -> Optional[dict]: if request_headers: api_params["requestHeaders"] = request_headers + def _map_params(self, api_params, request): + if not any( + api_params.get(k) for k in ("httpResponseBody", "browserHtml", "screenshot") + ): + api_params.setdefault("httpResponseBody", True) + response_body = api_params.get("httpResponseBody") + + if any(api_params.get(k) for k in ("httpResponseBody", "browserHtml")): + if api_params.get("httpResponseHeaders") is True: + logger.warning( + "You do not need to set httpResponseHeaders to True if " + "you httpResponseBody or browserHtml to True. Note that " + "httpResponseBody is set to True automatically if neither " + "browserHtml nor screenshot are set to True." + ) + api_params.setdefault("httpResponseHeaders", True) + + method = api_params.get("httpRequestMethod") + if method: + logger.warning( + f"Request {request} uses the Zyte Data API httpRequestMethod " + f"parameter. Use Request.method instead." 
+ ) + if method != request.method: + logger.warning( + f"The HTTP method of request {request} ({request.method}) " + f"does not match the Zyte Data API httpRequestMethod " + f"parameter ({method})." + ) + elif request.method != "GET": + if response_body: + api_params["httpRequestMethod"] = request.method + else: + logger.warning( + f"The HTTP method of request {request} ({request.method}) " + f"is being ignored. The httpRequestMethod parameter of " + f"Zyte Data API can only be set when the httpResponseBody " + f"parameter is True." + ) + + self._map_headers(api_params, request) + body = api_params.get("httpRequestBody") if body: logger.warning( @@ -257,29 +256,29 @@ def _prepare_api_params(self, request: Request) -> Optional[dict]: def _update_stats(self): prefix = "scrapy-zyte-api" for stat in ( - '429', - 'attempts', - 'errors', - 'fatal_errors', - 'processed', - 'success', + "429", + "attempts", + "errors", + "fatal_errors", + "processed", + "success", ): self._stats.set_value( f"{prefix}/{stat}", getattr(self._client.agg_stats, f"n_{stat}"), ) for stat in ( - 'error_ratio', - 'success_ratio', - 'throttle_ratio', + "error_ratio", + "success_ratio", + "throttle_ratio", ): self._stats.set_value( f"{prefix}/{stat}", getattr(self._client.agg_stats, stat)(), ) for source, target in ( - ('connect', 'connection'), - ('total', 'response'), + ("connect", "connection"), + ("total", "response"), ): self._stats.set_value( f"{prefix}/mean_{target}_seconds", @@ -292,7 +291,10 @@ def _update_stats(self): error_type = f"/{error_type}" self._stats.set_value(f"{prefix}/error_types{error_type}", count) - for counter in ('exception_types', 'status_codes',): + for counter in ( + "exception_types", + "status_codes", + ): for key, value in getattr(self._client.agg_stats, counter).items(): self._stats.set_value(f"{prefix}/{counter}/{key}", value) diff --git a/tests/conftest.py b/tests/conftest.py index 4eec4da9..db2b302e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,8 
+1,9 @@ import pytest -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def mockserver(): from .mockserver import MockServer + with MockServer() as server: yield server diff --git a/tests/test_handler.py b/tests/test_handler.py index 7d7d90d0..ceb00ed8 100644 --- a/tests/test_handler.py +++ b/tests/test_handler.py @@ -225,37 +225,37 @@ async def test_stats(mockserver): await handler.download_request(request, None) assert set(scrapy_stats.get_stats()) == { - f'scrapy-zyte-api/{stat}' + f"scrapy-zyte-api/{stat}" for stat in ( - '429', - 'attempts', - 'error_ratio', - 'errors', - 'fatal_errors', - 'mean_connection_seconds', - 'mean_response_seconds', - 'processed', - 'status_codes/200', - 'success_ratio', - 'success', - 'throttle_ratio', + "429", + "attempts", + "error_ratio", + "errors", + "fatal_errors", + "mean_connection_seconds", + "mean_response_seconds", + "processed", + "status_codes/200", + "success_ratio", + "success", + "throttle_ratio", ) } for suffix, value in ( - ('429', 0), - ('attempts', 1), - ('error_ratio', 0.0), - ('errors', 0), - ('fatal_errors', 0), - ('processed', 1), - ('status_codes/200', 1), - ('success_ratio', 1.0), - ('success', 1), - ('throttle_ratio', 0.0), + ("429", 0), + ("attempts", 1), + ("error_ratio", 0.0), + ("errors", 0), + ("fatal_errors", 0), + ("processed", 1), + ("status_codes/200", 1), + ("success_ratio", 1.0), + ("success", 1), + ("throttle_ratio", 0.0), ): stat = f"scrapy-zyte-api/{suffix}" assert scrapy_stats.get_value(stat) == value - for name in ('connection', 'response'): + for name in ("connection", "response"): stat = f"scrapy-zyte-api/mean_{name}_seconds" value = scrapy_stats.get_value(stat) assert isinstance(value, float) From 26b2ef080d751fb04607d1bead2a78af4b3be916 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 24 Aug 2022 10:01:43 +0200 Subject: [PATCH 07/81] README: move ZYTE_API_ENABLED under Configuration --- README.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 
deletions(-) diff --git a/README.rst b/README.rst index b36345e0..863baf7a 100644 --- a/README.rst +++ b/README.rst @@ -60,6 +60,9 @@ Here's an example of the things needed inside a Scrapy project's ``settings.py`` TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" +The ``ZYTE_API_ENABLED`` setting, which is ``True`` by default, can be set to +``False`` to disable this plugin. + Usage ----- @@ -214,9 +217,6 @@ Zyte Data API requests need parameters. If you set the ``zyte_api`` key in Request.meta_ or the ``ZYTE_API_ON_ALL_REQUESTS`` setting to ``True``, you must also :ref:`set default parameters `. -Set the ``ZYTE_API_ENABLED`` setting to ``False`` to disable this plugin. -``ZYTE_API_ENABLED`` is ``True`` by default. - Customizing the retry policy ---------------------------- From e78a499a85d6ef0c81a0ccbff1704724092615af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 24 Aug 2022 10:42:04 +0200 Subject: [PATCH 08/81] test_zyte_api_request_meta: update misleading values --- tests/test_api_requests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index b8d2ac9c..6b3092a6 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -106,7 +106,7 @@ async def test_http_response_headers_request( ( {}, {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, - {"browserHtml": True, "geolocation": "CA"}, + {}, False, ), ( @@ -150,7 +150,7 @@ async def test_http_response_headers_request( "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, "ZYTE_API_ON_ALL_REQUESTS": False, }, - {"browserHtml": True, "geolocation": "CA"}, + {}, False, ), ( From d9fbbf15016bd4914a1c7380b8114c67ce658c33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 24 Aug 2022 11:41:24 +0200 Subject: [PATCH 09/81] Clarify value precedence in warnings --- scrapy_zyte_api/handler.py | 19 
+++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 99b484e0..11719075 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -127,8 +127,8 @@ def _map_headers(self, api_params, request): if headers is not None: logger.warning( f"Request {request} defines the Zyte Data API " - f"customHttpRequestHeaders parameter. Use Request.headers " - f"instead." + f"customHttpRequestHeaders parameter, overriding " + f"Request.headers. Use Request.headers instead." ) elif request.headers: headers = [] @@ -156,7 +156,8 @@ def _map_headers(self, api_params, request): if headers is not None: logger.warning( f"Request {request} defines the Zyte Data API " - f"requestHeaders parameter. Use Request.headers instead." + f"requestHeaders parameter, overriding Request.headers. " + f"Use Request.headers instead." ) elif request.headers: request_headers = {} @@ -198,9 +199,9 @@ def _map_params(self, api_params, request): if api_params.get("httpResponseHeaders") is True: logger.warning( "You do not need to set httpResponseHeaders to True if " - "you httpResponseBody or browserHtml to True. Note that " - "httpResponseBody is set to True automatically if neither " - "browserHtml nor screenshot are set to True." + "you set httpResponseBody or browserHtml to True. Note " + "that httpResponseBody is set to True automatically if " + "neither browserHtml nor screenshot are set to True." ) api_params.setdefault("httpResponseHeaders", True) @@ -208,7 +209,8 @@ def _map_params(self, api_params, request): if method: logger.warning( f"Request {request} uses the Zyte Data API httpRequestMethod " - f"parameter. Use Request.method instead." + f"parameter, overriding Request.method. Use Request.method " + f"instead." 
) if method != request.method: logger.warning( @@ -233,7 +235,8 @@ def _map_params(self, api_params, request): if body: logger.warning( f"Request {request} uses the Zyte Data API httpRequestBody " - f"parameter. Use Request.body instead." + f"parameter, overriding Request.body. Use Request.body " + f"instead." ) decoded_body = b64decode(body) if decoded_body != request.body: From 4442ac9ff852dc8e851638f6aed9976355de05e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 24 Aug 2022 11:44:45 +0200 Subject: [PATCH 10/81] Include an unsupported header in the test for ZYTE_API_UNSUPPORTED_HEADERS --- tests/test_api_requests.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 3e38e82b..9ddadce1 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -1092,7 +1092,10 @@ async def parse(self, response): # in the future. ( { - "headers": {"User-Agent": ""}, + "headers": { + "Cookie": "", + "User-Agent": "", + }, }, { "ZYTE_API_UNSUPPORTED_HEADERS": ["Cookie"], @@ -1104,7 +1107,9 @@ async def parse(self, response): {"name": "User-Agent", "value": ""}, ], }, - [], + [ + "defines header b'Cookie', which cannot be mapped", + ], ), # You may update the ZYTE_API_BROWSER_HEADERS setting to extend support # for new fields that the requestHeaders parameter may support in the From a80b04547df3475d8f8a213ea7a5f7248790caff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 24 Aug 2022 11:49:28 +0200 Subject: [PATCH 11/81] Implement the zyte_api_automap request meta key --- scrapy_zyte_api/handler.py | 2 +- tests/test_api_requests.py | 21 ++++++++++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 11719075..5289b62b 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -115,7 +115,7 @@ def _prepare_api_params(self, request: 
Request) -> Optional[dict]: ) raise - if self._automap: + if request.meta.get("zyte_api_automap", self._automap): self._map_params(api_params, request) return api_params diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 9ddadce1..f9833f66 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -527,13 +527,14 @@ async def parse(self, response): "request_kwargs,settings,expected,warnings", [ # Automatic mapping of request parameters to Zyte Data API parameters - # is enabled by default, but can be disabled. + # is enabled by default, but can be disabled, either globally or per + # request. # # httpResponseBody is set to True if no other main content is # requested. *( ( - {}, + request_kwargs, settings, { "httpResponseBody": True, @@ -541,9 +542,13 @@ async def parse(self, response): }, [], ) - for settings in ( - {}, - {"ZYTE_API_AUTOMAP": True}, + for request_kwargs, settings in ( + ({}, {}), + ({}, {"ZYTE_API_AUTOMAP": True}), + ( + {"meta": {"zyte_api_automap": True}}, + {"ZYTE_API_AUTOMAP": False}, + ), ) ), ( @@ -552,6 +557,12 @@ async def parse(self, response): False, [], ), + ( + {"meta": {"zyte_api_automap": False}}, + {}, + False, + [], + ), *( ( {"meta": {"zyte_api": {"a": "b"}}}, From 9864273423a153974ffb52a0cc4106abeef9db7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 24 Aug 2022 12:13:39 +0200 Subject: [PATCH 12/81] Add a missing test case --- tests/test_api_requests.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index f9833f66..8c391d26 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -954,6 +954,17 @@ async def parse(self, response): ["cannot be mapped"], ), # Headers with None as value are silently ignored. 
+ ( + { + "headers": {"a": None}, + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), ( { "headers": {"a": None}, From afd9b6dff1e408a17ce0f3996c5633f4dbe2ce2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 24 Aug 2022 12:22:24 +0200 Subject: [PATCH 13/81] Restore Python 3.7 support --- setup.py | 6 +++++- tests/test_api_requests.py | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 22e5d8ac..68817309 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,11 @@ author_email="info@zyte.com", url="https://github.com/scrapy-plugins/scrapy-zyte-api", packages=["scrapy_zyte_api"], - install_requires=["zyte-api>=0.3.0", "scrapy>=2.6.0"], + install_requires=[ + "scrapy>=2.6.0", + "typing-extensions>=3.10", + "zyte-api>=0.3.0", + ], classifiers=[ "Development Status :: 4 - Beta", "Intended Audience :: Developers", diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 8c391d26..53c2efe9 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -1,6 +1,6 @@ import sys from asyncio import iscoroutine -from typing import Any, Dict, List, Literal, Union +from typing import Any, Dict, List, Union from unittest import mock import pytest @@ -14,6 +14,7 @@ from scrapy.utils.defer import deferred_from_coro from scrapy.utils.test import get_crawler from twisted.internet.defer import Deferred +from typing_extensions import Literal from zyte_api.aio.errors import RequestError from . 
import DEFAULT_CLIENT_CONCURRENCY, SETTINGS From c49e1b36bb4055e467a7fcc2f011b38af674dc8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 24 Aug 2022 22:17:34 +0200 Subject: [PATCH 14/81] Refactor parse_api_params and start refactoring tests --- scrapy_zyte_api/handler.py | 388 +++++++++++++++++++++---------------- tests/test_api_requests.py | 181 +++++++++++++++-- 2 files changed, 384 insertions(+), 185 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 5289b62b..8fb216e4 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -1,6 +1,6 @@ import logging from base64 import b64decode, b64encode -from typing import Any, Dict, Generator, Optional, Union +from typing import Any, Dict, Generator, Optional, Set, Union from warnings import warn from scrapy import Spider @@ -25,6 +25,214 @@ logger = logging.getLogger(__name__) +def _update_api_params_from_request_headers( + api_params: Dict[str, Any], + request: Request, + *, + unsupported_headers: Set[str], + browser_headers: Dict[str, str], +): + """Updates *api_params*, in place, based on *request*.""" + response_body = api_params.get("httpResponseBody") + if response_body: + headers = api_params.get("customHttpRequestHeaders") + if headers is not None: + logger.warning( + f"Request {request} defines the Zyte Data API " + f"customHttpRequestHeaders parameter, overriding " + f"Request.headers. Use Request.headers instead." + ) + elif request.headers: + headers = [] + for k, v in request.headers.items(): + if not v: + continue + v = b",".join(v).decode() + lowercase_k = k.strip().lower() + if lowercase_k in unsupported_headers: + if lowercase_k != b"user-agent" or v != DEFAULT_USER_AGENT: + logger.warning( + f"Request {request} defines header {k}, which " + f"cannot be mapped into the Zyte Data API " + f"customHttpRequestHeaders parameter." 
+ ) + continue + k = k.decode() + headers.append({"name": k, "value": v}) + if headers: + api_params["customHttpRequestHeaders"] = headers + if not response_body or any( + api_params.get(k) for k in ("browserHtml", "screenshot") + ): + headers = api_params.get("requestHeaders") + if headers is not None: + logger.warning( + f"Request {request} defines the Zyte Data API " + f"requestHeaders parameter, overriding Request.headers. " + f"Use Request.headers instead." + ) + elif request.headers: + request_headers = {} + for k, v in request.headers.items(): + if not v: + continue + v = b",".join(v).decode() + lowercase_k = k.strip().lower() + key = browser_headers.get(lowercase_k) + if key is not None: + request_headers[key] = v + elif not ( + ( + lowercase_k == b"accept" + and v == DEFAULT_REQUEST_HEADERS["Accept"] + ) + or ( + lowercase_k == b"accept-language" + and v == DEFAULT_REQUEST_HEADERS["Accept-Language"] + ) + or (lowercase_k == b"user-agent" and v == DEFAULT_USER_AGENT) + ): + logger.warning( + f"Request {request} defines header {k}, which " + f"cannot be mapped into the Zyte Data API " + f"requestHeaders parameter." + ) + if request_headers: + api_params["requestHeaders"] = request_headers + + +def _update_api_params_from_request( + api_params: Dict[str, Any], + request: Request, + *, + unsupported_headers: Set[str], + browser_headers: Dict[str, str], +): + if not any( + api_params.get(k) for k in ("httpResponseBody", "browserHtml", "screenshot") + ): + api_params.setdefault("httpResponseBody", True) + response_body = api_params.get("httpResponseBody") + + if any(api_params.get(k) for k in ("httpResponseBody", "browserHtml")): + if api_params.get("httpResponseHeaders") is True: + logger.warning( + "You do not need to set httpResponseHeaders to True if " + "you set httpResponseBody or browserHtml to True. Note " + "that httpResponseBody is set to True automatically if " + "neither browserHtml nor screenshot are set to True." 
+ ) + api_params.setdefault("httpResponseHeaders", True) + + method = api_params.get("httpRequestMethod") + if method: + logger.warning( + f"Request {request} uses the Zyte Data API httpRequestMethod " + f"parameter, overriding Request.method. Use Request.method " + f"instead." + ) + if method != request.method: + logger.warning( + f"The HTTP method of request {request} ({request.method}) " + f"does not match the Zyte Data API httpRequestMethod " + f"parameter ({method})." + ) + elif request.method != "GET": + if response_body: + api_params["httpRequestMethod"] = request.method + else: + logger.warning( + f"The HTTP method of request {request} ({request.method}) " + f"is being ignored. The httpRequestMethod parameter of " + f"Zyte Data API can only be set when the httpResponseBody " + f"parameter is True." + ) + + _update_api_params_from_request_headers( + api_params, + request, + unsupported_headers=unsupported_headers, + browser_headers=browser_headers, + ) + + body = api_params.get("httpRequestBody") + if body: + logger.warning( + f"Request {request} uses the Zyte Data API httpRequestBody " + f"parameter, overriding Request.body. Use Request.body " + f"instead." + ) + decoded_body = b64decode(body) + if decoded_body != request.body: + logger.warning( + f"The body of request {request} ({request.body!r}) " + f"does not match the Zyte Data API httpRequestBody " + f"parameter ({body!r}; decoded: {decoded_body!r})." + ) + elif request.body != b"": + if response_body: + base64_body = b64encode(request.body).decode() + api_params["httpRequestBody"] = base64_body + else: + logger.warning( + f"The body of request {request} ({request.body!r}) " + f"is being ignored. The httpRequestBody parameter of " + f"Zyte Data API can only be set when the httpResponseBody " + f"parameter is True." 
+ ) + + return api_params + + +def _get_api_params( + request: Request, + *, + use_api_by_default: bool, + automap_by_default: bool, + default_params: Optional[Dict[str, Any]], + unsupported_headers: Set[str], + browser_headers: Dict[str, str], +) -> Optional[dict]: + """Returns a dictionary of API parameters that must be sent to Zyte Data + API for the specified request, or None if the request should not be driven + through Zyte Data API.""" + meta_params = request.meta.get("zyte_api", use_api_by_default) + if meta_params is False: + return None + + if not meta_params and meta_params != {}: + warn( + f"Setting the zyte_api request metadata key to " + f"{meta_params!r} is deprecated. Use False instead.", + DeprecationWarning, + ) + return None + + if meta_params is True: + meta_params = {} + + api_params: Dict[str, Any] = default_params or {} + try: + api_params.update(meta_params) + except (ValueError, TypeError): + actual_type = type(request.meta.get("zyte_api")) + logger.error( + f"'zyte_api' parameters in the request meta should be provided as " + f"a dictionary, got {actual_type} instead ({request})." 
+ ) + raise + + if request.meta.get("zyte_api_automap", automap_by_default): + _update_api_params_from_request( + api_params, + request, + unsupported_headers=unsupported_headers, + browser_headers=browser_headers, + ) + + return api_params + + class ScrapyZyteAPIDownloadHandler(HTTPDownloadHandler): def __init__( self, settings: Settings, crawler: Crawler, client: AsyncClient = None @@ -83,182 +291,20 @@ def __init__( } def download_request(self, request: Request, spider: Spider) -> Deferred: - api_params = self._prepare_api_params(request) + api_params = _get_api_params( + request, + use_api_by_default=self._on_all_requests, + automap_by_default=self._automap, + default_params=self._zyte_api_default_params, + unsupported_headers=self._unsupported_headers, + browser_headers=self._browser_headers, + ) if api_params: return deferred_from_coro( self._download_request(api_params, request, spider) ) return super().download_request(request, spider) - def _prepare_api_params(self, request: Request) -> Optional[dict]: - meta_params = request.meta.get("zyte_api", self._on_all_requests) - if meta_params is False: - return None - if not meta_params and meta_params != {}: - warn( - f"Setting the zyte_api request metadata key to " - f"{meta_params!r} is deprecated. Use False instead.", - DeprecationWarning, - ) - return None - if meta_params is True: - meta_params = {} - - api_params: Dict[str, Any] = self._zyte_api_default_params or {} - try: - api_params.update(meta_params) - except (ValueError, TypeError): - logger.error( - f"'zyte_api' parameters in the request meta should be " - f"provided as dictionary, got {type(request.meta.get('zyte_api'))} " - f"instead. ({request})." 
- ) - raise - - if request.meta.get("zyte_api_automap", self._automap): - self._map_params(api_params, request) - - return api_params - - def _map_headers(self, api_params, request): - response_body = api_params.get("httpResponseBody") - if response_body: - headers = api_params.get("customHttpRequestHeaders") - if headers is not None: - logger.warning( - f"Request {request} defines the Zyte Data API " - f"customHttpRequestHeaders parameter, overriding " - f"Request.headers. Use Request.headers instead." - ) - elif request.headers: - headers = [] - for k, v in request.headers.items(): - if not v: - continue - v = b",".join(v).decode() - lowercase_k = k.strip().lower() - if lowercase_k in self._unsupported_headers: - if lowercase_k != b"user-agent" or v != DEFAULT_USER_AGENT: - logger.warning( - f"Request {request} defines header {k}, which " - f"cannot be mapped into the Zyte Data API " - f"customHttpRequestHeaders parameter." - ) - continue - k = k.decode() - headers.append({"name": k, "value": v}) - if headers: - api_params["customHttpRequestHeaders"] = headers - if not response_body or any( - api_params.get(k) for k in ("browserHtml", "screenshot") - ): - headers = api_params.get("requestHeaders") - if headers is not None: - logger.warning( - f"Request {request} defines the Zyte Data API " - f"requestHeaders parameter, overriding Request.headers. " - f"Use Request.headers instead." 
- ) - elif request.headers: - request_headers = {} - for k, v in request.headers.items(): - if not v: - continue - v = b",".join(v).decode() - lowercase_k = k.strip().lower() - key = self._browser_headers.get(lowercase_k) - if key is not None: - request_headers[key] = v - elif not ( - ( - lowercase_k == b"accept" - and v == DEFAULT_REQUEST_HEADERS["Accept"] - ) - or ( - lowercase_k == b"accept-language" - and v == DEFAULT_REQUEST_HEADERS["Accept-Language"] - ) - or (lowercase_k == b"user-agent" and v == DEFAULT_USER_AGENT) - ): - logger.warning( - f"Request {request} defines header {k}, which " - f"cannot be mapped into the Zyte Data API " - f"requestHeaders parameter." - ) - if request_headers: - api_params["requestHeaders"] = request_headers - - def _map_params(self, api_params, request): - if not any( - api_params.get(k) for k in ("httpResponseBody", "browserHtml", "screenshot") - ): - api_params.setdefault("httpResponseBody", True) - response_body = api_params.get("httpResponseBody") - - if any(api_params.get(k) for k in ("httpResponseBody", "browserHtml")): - if api_params.get("httpResponseHeaders") is True: - logger.warning( - "You do not need to set httpResponseHeaders to True if " - "you set httpResponseBody or browserHtml to True. Note " - "that httpResponseBody is set to True automatically if " - "neither browserHtml nor screenshot are set to True." - ) - api_params.setdefault("httpResponseHeaders", True) - - method = api_params.get("httpRequestMethod") - if method: - logger.warning( - f"Request {request} uses the Zyte Data API httpRequestMethod " - f"parameter, overriding Request.method. Use Request.method " - f"instead." - ) - if method != request.method: - logger.warning( - f"The HTTP method of request {request} ({request.method}) " - f"does not match the Zyte Data API httpRequestMethod " - f"parameter ({method})." 
- ) - elif request.method != "GET": - if response_body: - api_params["httpRequestMethod"] = request.method - else: - logger.warning( - f"The HTTP method of request {request} ({request.method}) " - f"is being ignored. The httpRequestMethod parameter of " - f"Zyte Data API can only be set when the httpResponseBody " - f"parameter is True." - ) - - self._map_headers(api_params, request) - - body = api_params.get("httpRequestBody") - if body: - logger.warning( - f"Request {request} uses the Zyte Data API httpRequestBody " - f"parameter, overriding Request.body. Use Request.body " - f"instead." - ) - decoded_body = b64decode(body) - if decoded_body != request.body: - logger.warning( - f"The body of request {request} ({request.body!r}) " - f"does not match the Zyte Data API httpRequestBody " - f"parameter ({body!r}; decoded: {decoded_body!r})." - ) - elif request.body != b"": - if response_body: - base64_body = b64encode(request.body).decode() - api_params["httpRequestBody"] = base64_body - else: - logger.warning( - f"The body of request {request} ({request.body!r}) " - f"is being ignored. The httpRequestBody parameter of " - f"Zyte Data API can only be set when the httpResponseBody " - f"parameter is True." - ) - - return api_params - def _update_stats(self): prefix = "scrapy-zyte-api" for stat in ( diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 53c2efe9..722af0ea 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -2,6 +2,7 @@ from asyncio import iscoroutine from typing import Any, Dict, List, Union from unittest import mock +from unittest.mock import patch import pytest from _pytest.logging import LogCaptureFixture # NOQA @@ -17,6 +18,8 @@ from typing_extensions import Literal from zyte_api.aio.errors import RequestError +from scrapy_zyte_api.handler import _get_api_params + from . 
import DEFAULT_CLIENT_CONCURRENCY, SETTINGS from .mockserver import DelayedResource, MockServer, produce_request_response @@ -405,18 +408,6 @@ async def test_coro_handling(meta: Dict[str, Dict[str, Any]], mockserver): "Got an error when processing Zyte API request (http://example.com): " "Object of type Request is not JSON serializable", ), - ( - {"zyte_api": ["some", "bad", "non-dict", "value"]}, - ValueError, - "'zyte_api' parameters in the request meta should be provided as " - "dictionary, got instead. ().", - ), - ( - {"zyte_api": 1}, - TypeError, - "'zyte_api' parameters in the request meta should be provided as " - "dictionary, got instead. ().", - ), ( {"zyte_api": {"browserHtml": True, "httpResponseBody": True}}, RequestError, @@ -436,7 +427,14 @@ async def test_exceptions( req = Request("http://example.com", method="POST", meta=meta) with pytest.raises(exception_type): # NOQA - api_params = handler._prepare_api_params(req) + api_params = _get_api_params( + req, + use_api_by_default=handler._on_all_requests, + automap_by_default=handler._automap, + default_params=handler._zyte_api_default_params, + unsupported_headers=handler._unsupported_headers, + browser_headers=handler._browser_headers, + ) await deferred_from_coro( handler._download_request(api_params, req, Spider("test")) # NOQA ) # NOQA @@ -456,7 +454,14 @@ async def test_job_id(job_id, mockserver): method="POST", meta={"zyte_api": {"browserHtml": True}}, ) - api_params = handler._prepare_api_params(req) + api_params = _get_api_params( + req, + use_api_by_default=handler._on_all_requests, + automap_by_default=handler._automap, + default_params=handler._zyte_api_default_params, + unsupported_headers=handler._unsupported_headers, + browser_headers=handler._browser_headers, + ) resp = await deferred_from_coro( handler._download_request(api_params, req, Spider("test")) # NOQA ) @@ -1286,3 +1291,151 @@ async def test_automap( assert warning in caplog.text else: assert not caplog.records + + 
+_UNSUPPORTED_HEADERS = {b"cookie", b"user-agent"} +_BROWSER_HEADERS = {b"referer": "referer"} + + +@ensureDeferred +async def test_get_api_params_input_default(mockserver): + request = Request(url="https://example.com") + async with mockserver.make_handler() as handler: + patch_path = "scrapy_zyte_api.handler._get_api_params" + with patch(patch_path) as _get_api_params: + _get_api_params.side_effect = RuntimeError("That’s it!") + with pytest.raises(RuntimeError): + await handler.download_request(request, None) + _get_api_params.assert_called_once_with( + request, + use_api_by_default=False, + automap_by_default=True, + default_params={}, + unsupported_headers=_UNSUPPORTED_HEADERS, + browser_headers=_BROWSER_HEADERS, + ) + + +@ensureDeferred +async def test_get_api_params_input_custom(mockserver): + request = Request(url="https://example.com") + settings = { + "ZYTE_API_AUTOMAP": False, + "ZYTE_API_BROWSER_HEADERS": {"B": "b"}, + "ZYTE_API_DEFAULT_PARAMS": {"a": "b"}, + "ZYTE_API_ON_ALL_REQUESTS": True, + "ZYTE_API_UNSUPPORTED_HEADERS": {"A"}, + } + async with mockserver.make_handler(settings) as handler: + patch_path = "scrapy_zyte_api.handler._get_api_params" + with patch(patch_path) as _get_api_params: + _get_api_params.side_effect = RuntimeError("That’s it!") + with pytest.raises(RuntimeError): + await handler.download_request(request, None) + _get_api_params.assert_called_once_with( + request, + use_api_by_default=True, + automap_by_default=False, + default_params={"a": "b"}, + unsupported_headers={b"a"}, + browser_headers={b"b": "b"}, + ) + + +_UNSET = object() + + +@ensureDeferred +@pytest.mark.parametrize( + "setting,meta,expected", + [ + (False, _UNSET, None), + (False, False, None), + (False, True, {}), + (False, {}, {}), + (False, {"a": "b"}, {"a": "b"}), + (True, _UNSET, {}), + (True, False, None), + (True, True, {}), + (True, {}, {}), + (True, {"a": "b"}, {"a": "b"}), + ], +) +async def test_get_api_params_toggling(setting, meta, expected): + request = 
Request(url="https://example.com") + if meta is not _UNSET: + request.meta["zyte_api"] = meta + api_params = _get_api_params( + request, + use_api_by_default=setting, + automap_by_default=False, + default_params={}, + unsupported_headers=_UNSUPPORTED_HEADERS, + browser_headers=_BROWSER_HEADERS, + ) + assert api_params == expected + + +@ensureDeferred +@pytest.mark.parametrize("setting", [False, True]) +@pytest.mark.parametrize("meta", [None, 0, "", b"", []]) +async def test_get_api_params_disabling_deprecated(setting, meta): + request = Request(url="https://example.com") + request.meta["zyte_api"] = meta + with pytest.warns(DeprecationWarning, match=r".* Use False instead\.$"): + api_params = _get_api_params( + request, + use_api_by_default=setting, + automap_by_default=False, + default_params={}, + unsupported_headers=_UNSUPPORTED_HEADERS, + browser_headers=_BROWSER_HEADERS, + ) + assert api_params is None + + +@ensureDeferred +@pytest.mark.parametrize( + "default_params,meta,expected", + [ + ({}, {}, {}), + ({}, {"b": 2}, {"b": 2}), + ({"a": 1}, {}, {"a": 1}), + ({"a": 1}, {"b": 2}, {"a": 1, "b": 2}), + ({"a": 1}, {"a": 2}, {"a": 2}), + ], +) +async def test_get_api_params_default_params_merging(default_params, meta, expected): + request = Request(url="https://example.com") + request.meta["zyte_api"] = meta + api_params = _get_api_params( + request, + use_api_by_default=False, + automap_by_default=False, + default_params=default_params, + unsupported_headers=_UNSUPPORTED_HEADERS, + browser_headers=_BROWSER_HEADERS, + ) + assert api_params == expected + + +@ensureDeferred +@pytest.mark.parametrize( + "meta,exception", + [ + (1, TypeError), + (["a", "b"], ValueError), + ], +) +async def test_get_api_params_bad_meta_type(meta, exception): + request = Request(url="https://example.com") + request.meta["zyte_api"] = meta + with pytest.raises(exception): + _get_api_params( + request, + use_api_by_default=False, + automap_by_default=False, + default_params={}, + 
unsupported_headers=_UNSUPPORTED_HEADERS, + browser_headers=_BROWSER_HEADERS, + ) From 400a4c2ddd1e130f9cbc7bcb4b493401760b7dca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 25 Aug 2022 20:41:35 +0200 Subject: [PATCH 15/81] Complete test refactoring --- scrapy_zyte_api/handler.py | 85 +- tests/test_api_requests.py | 1797 +++++++++++++++++------------------- 2 files changed, 940 insertions(+), 942 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 8fb216e4..3b8eee76 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -1,6 +1,7 @@ import logging from base64 import b64decode, b64encode -from typing import Any, Dict, Generator, Optional, Set, Union +from copy import copy +from typing import Any, Dict, Generator, Mapping, Optional, Set, Union from warnings import warn from scrapy import Spider @@ -25,6 +26,12 @@ logger = logging.getLogger(__name__) +_DEFAULT_API_PARAMS = { + "browserHtml": False, + "screenshot": False, +} + + def _update_api_params_from_request_headers( api_params: Dict[str, Any], request: Request, @@ -101,18 +108,34 @@ def _update_api_params_from_request_headers( api_params["requestHeaders"] = request_headers -def _update_api_params_from_request( +def _update_api_params_from_request( # NOQA api_params: Dict[str, Any], request: Request, *, unsupported_headers: Set[str], browser_headers: Dict[str, str], + default_params: Dict[str, Any], ): if not any( api_params.get(k) for k in ("httpResponseBody", "browserHtml", "screenshot") ): api_params.setdefault("httpResponseBody", True) + elif api_params.get("httpResponseBody") is True and not any( + api_params.get(k) for k in ("browserHtml", "screenshot") + ): + logger.warning( + "You do not need to set httpResponseBody to True if neither " + "browserHtml nor screenshot are set to True." 
+ ) + elif api_params.get("httpResponseBody") is False: + logging.warning( + f"Request {request} unnecessarily defines the Zyte Data API " + f"'httpResponseBody' parameter with its default value, False. " + f"It will not be sent to the server." + ) response_body = api_params.get("httpResponseBody") + if response_body is False: + api_params.pop("httpResponseBody") if any(api_params.get(k) for k in ("httpResponseBody", "browserHtml")): if api_params.get("httpResponseHeaders") is True: @@ -123,6 +146,15 @@ def _update_api_params_from_request( "neither browserHtml nor screenshot are set to True." ) api_params.setdefault("httpResponseHeaders", True) + elif api_params.get("httpResponseHeaders") is False: + logger.warning( + "You do not need to set httpResponseHeaders to False if " + "you do set httpResponseBody or browserHtml to True. Note " + "that httpResponseBody is set to True automatically if " + "neither browserHtml nor screenshot are set to True." + ) + if api_params.get("httpResponseHeaders") is False: + api_params.pop("httpResponseHeaders") method = api_params.get("httpRequestMethod") if method: @@ -181,6 +213,16 @@ def _update_api_params_from_request( f"parameter is True." ) + for param, default_value in _DEFAULT_API_PARAMS.items(): + if api_params.get(param) != default_value: + continue + logging.warning( + f"Request {request} unnecessarily defines the Zyte Data API " + f"{param!r} parameter with its default value, {default_value!r}. " + f"It will not be sent to the server." 
+ ) + api_params.pop(param) + return api_params @@ -189,7 +231,7 @@ def _get_api_params( *, use_api_by_default: bool, automap_by_default: bool, - default_params: Optional[Dict[str, Any]], + default_params: Dict[str, Any], unsupported_headers: Set[str], browser_headers: Dict[str, str], ) -> Optional[dict]: @@ -210,17 +252,28 @@ def _get_api_params( if meta_params is True: meta_params = {} - - api_params: Dict[str, Any] = default_params or {} - try: - api_params.update(meta_params) - except (ValueError, TypeError): - actual_type = type(request.meta.get("zyte_api")) + elif not isinstance(meta_params, Mapping): logger.error( f"'zyte_api' parameters in the request meta should be provided as " - f"a dictionary, got {actual_type} instead ({request})." + f"a dictionary, got {type(meta_params)} instead in {request}." ) - raise + raise ValueError("The value of the 'zyte_api' meta key of ") + + api_params = copy(default_params) + for k in list(meta_params): + if meta_params[k] is not None: + continue + meta_params.pop(k) + if k in api_params: + api_params.pop(k) + else: + logger.warning( + f"In request {request} 'zyte_api' parameter {k} is None, " + f"which is a value reserved to unset parameters defined in " + f"the ZYTE_API_DEFAULT_PARAMS setting, but the setting does " + f"not define such a parameter." 
+ ) + api_params.update(meta_params) if request.meta.get("zyte_api_automap", automap_by_default): _update_api_params_from_request( @@ -228,6 +281,7 @@ def _get_api_params( request, unsupported_headers=unsupported_headers, browser_headers=browser_headers, + default_params=default_params, ) return api_params @@ -269,6 +323,15 @@ def __init__( self._stats = crawler.stats self._job_id = crawler.settings.get("JOB") self._zyte_api_default_params = settings.getdict("ZYTE_API_DEFAULT_PARAMS") + for param in list(self._zyte_api_default_params): + if self._zyte_api_default_params[param] is not None: + continue + logger.warning( + f"Parameter {param!r} in the ZYTE_API_DEFAULT_PARAMS " + f"setting is None. Default parameters should never be " + f"None." + ) + self._zyte_api_default_params.pop(param) self._session = create_session(connection_pool_size=self._client.n_conn) self._retry_policy = settings.get("ZYTE_API_RETRY_POLICY") if self._retry_policy: diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 722af0ea..ca8da3cd 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -12,7 +12,6 @@ from scrapy.http import Response, TextResponse from scrapy.settings.default_settings import DEFAULT_REQUEST_HEADERS from scrapy.settings.default_settings import USER_AGENT as DEFAULT_USER_AGENT -from scrapy.utils.defer import deferred_from_coro from scrapy.utils.test import get_crawler from twisted.internet.defer import Deferred from typing_extensions import Literal @@ -102,256 +101,6 @@ async def test_http_response_body_request(meta: Dict[str, Dict[str, Any]], mocks assert resp.xpath("//body/text()").getall() == ["Hello"] -@ensureDeferred -@pytest.mark.skipif(sys.version_info < (3, 8), reason="unittest.mock.AsyncMock") -@pytest.mark.filterwarnings("ignore:.*None is deprecated") -@pytest.mark.parametrize( - "meta,settings,expected,use_zyte_api", - [ - # Default ZYTE_API_ON_ALL_REQUESTS - ({}, {}, {}, False), - ( - {"zyte_api": {}}, - {}, - 
{"httpResponseBody": True, "httpResponseHeaders": True}, - True, - ), - ( - {"zyte_api": True}, - {}, - {"httpResponseBody": True, "httpResponseHeaders": True}, - True, - ), - ({"zyte_api": False}, {}, {}, False), - ( - {}, - {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, - {}, - False, - ), - ( - {"zyte_api": False}, - {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, - {}, - False, - ), - ( - {"zyte_api": None}, - {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, - {}, - False, - ), - ( - {"zyte_api": {}}, - {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, - {"browserHtml": True, "geolocation": "CA", "httpResponseHeaders": True}, - True, - ), - ( - {"zyte_api": True}, - {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, - {"browserHtml": True, "geolocation": "CA", "httpResponseHeaders": True}, - True, - ), - ( - {"zyte_api": {"javascript": True, "geolocation": "US"}}, - {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, - { - "browserHtml": True, - "geolocation": "US", - "javascript": True, - "httpResponseHeaders": True, - }, - True, - ), - # ZYTE_API_ON_ALL_REQUESTS=False - ({}, {"ZYTE_API_ON_ALL_REQUESTS": False}, {}, False), - ( - {"zyte_api": {}}, - {"ZYTE_API_ON_ALL_REQUESTS": False}, - {"httpResponseBody": True, "httpResponseHeaders": True}, - True, - ), - ( - {"zyte_api": True}, - {"ZYTE_API_ON_ALL_REQUESTS": False}, - {"httpResponseBody": True, "httpResponseHeaders": True}, - True, - ), - ({"zyte_api": False}, {"ZYTE_API_ON_ALL_REQUESTS": False}, {}, False), - ( - {}, - { - "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ON_ALL_REQUESTS": False, - }, - {}, - False, - ), - ( - {"zyte_api": False}, - { - "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ON_ALL_REQUESTS": False, - }, - {}, - False, - ), - ( - {"zyte_api": None}, - { - 
"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ON_ALL_REQUESTS": False, - }, - {}, - False, - ), - ( - {"zyte_api": {}}, - { - "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ON_ALL_REQUESTS": False, - }, - {"browserHtml": True, "geolocation": "CA", "httpResponseHeaders": True}, - True, - ), - ( - {"zyte_api": True}, - { - "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ON_ALL_REQUESTS": False, - }, - {"browserHtml": True, "geolocation": "CA", "httpResponseHeaders": True}, - True, - ), - ( - {"zyte_api": {"javascript": True, "geolocation": "US"}}, - { - "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ON_ALL_REQUESTS": False, - }, - { - "browserHtml": True, - "geolocation": "US", - "javascript": True, - "httpResponseHeaders": True, - }, - True, - ), - # ZYTE_API_ON_ALL_REQUESTS=True - ( - {}, - {"ZYTE_API_ON_ALL_REQUESTS": True}, - {"httpResponseBody": True, "httpResponseHeaders": True}, - True, - ), - ( - {"zyte_api": {}}, - {"ZYTE_API_ON_ALL_REQUESTS": True}, - {"httpResponseBody": True, "httpResponseHeaders": True}, - True, - ), - ( - {"zyte_api": True}, - {"ZYTE_API_ON_ALL_REQUESTS": True}, - {"httpResponseBody": True, "httpResponseHeaders": True}, - True, - ), - ({"zyte_api": False}, {"ZYTE_API_ON_ALL_REQUESTS": True}, {}, False), - ( - {}, - { - "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ON_ALL_REQUESTS": True, - }, - {"browserHtml": True, "geolocation": "CA", "httpResponseHeaders": True}, - True, - ), - ( - {"zyte_api": False}, - { - "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ON_ALL_REQUESTS": True, - }, - {}, - False, - ), - ( - {"zyte_api": None}, - { - "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ON_ALL_REQUESTS": True, - }, - {}, - False, - ), - ( - {"zyte_api": {}}, - { - "ZYTE_API_DEFAULT_PARAMS": 
{"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ON_ALL_REQUESTS": True, - }, - {"browserHtml": True, "geolocation": "CA", "httpResponseHeaders": True}, - True, - ), - ( - {"zyte_api": True}, - { - "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ON_ALL_REQUESTS": True, - }, - {"browserHtml": True, "geolocation": "CA", "httpResponseHeaders": True}, - True, - ), - ( - {"zyte_api": {"javascript": True, "geolocation": "US"}}, - { - "ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}, - "ZYTE_API_ON_ALL_REQUESTS": True, - }, - { - "browserHtml": True, - "geolocation": "US", - "javascript": True, - "httpResponseHeaders": True, - }, - True, - ), - ], -) -async def test_zyte_api_request_meta( - meta: Dict[str, Dict[str, Any]], - settings: Dict[str, str], - expected: Dict[str, str], - use_zyte_api: bool, - mockserver, -): - async with mockserver.make_handler(settings) as handler: - req = Request(mockserver.urljoin("/"), meta=meta) - unmocked_client = handler._client - handler._client = mock.AsyncMock(unmocked_client) - handler._client.request_raw.side_effect = unmocked_client.request_raw - - await handler.download_request(req, None) - - # What we're interested in is the Request call in the API - request_call = [ - c for c in handler._client.mock_calls if "request_raw(" in str(c) - ] - - if not use_zyte_api: - assert request_call == [] - return - - elif not request_call: - pytest.fail("The client's request_raw() method was not called.") - - args_used = request_call[0].args[0] - args_used.pop("url") - - assert args_used == expected - - @ensureDeferred async def test_disable(mockserver): settings = {"ZYTE_API_ENABLED": False} @@ -359,38 +108,16 @@ async def test_disable(mockserver): assert handler is None +@pytest.mark.parametrize("zyte_api", [True, False]) @ensureDeferred -@pytest.mark.skipif(sys.version_info < (3, 8), reason="unittest.mock.AsyncMock") -async def 
test_zyte_api_request_meta_none_deprecation(mockserver): - async with mockserver.make_handler() as handler: - req = Request(mockserver.urljoin("/"), meta={"zyte_api": None}) - handler._client = mock.AsyncMock(handler._client) - with pytest.warns(DeprecationWarning, match="None is deprecated"): - await handler.download_request(req, None) - - -@pytest.mark.parametrize( - "meta", - [ - {"zyte_api": {"waka": True}}, - {"zyte_api": True}, - {"zyte_api": {"browserHtml": True}}, - {"zyte_api": {}}, - {"zyte_api": False}, - {"randomParameter": True}, - {}, - None, - ], -) -@ensureDeferred -async def test_coro_handling(meta: Dict[str, Dict[str, Any]], mockserver): +async def test_coro_handling(zyte_api: bool, mockserver): settings = {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True}} async with mockserver.make_handler(settings) as handler: req = Request( # this should really be a URL to a website, not to the API server, # but API server URL works ok mockserver.urljoin("/"), - meta=meta, + meta={"zyte_api": zyte_api}, ) dfd = handler.download_request(req, Spider("test")) assert not iscoroutine(dfd) @@ -425,28 +152,14 @@ async def test_exceptions( ): async with mockserver.make_handler() as handler: req = Request("http://example.com", method="POST", meta=meta) - - with pytest.raises(exception_type): # NOQA - api_params = _get_api_params( - req, - use_api_by_default=handler._on_all_requests, - automap_by_default=handler._automap, - default_params=handler._zyte_api_default_params, - unsupported_headers=handler._unsupported_headers, - browser_headers=handler._browser_headers, - ) - await deferred_from_coro( - handler._download_request(api_params, req, Spider("test")) # NOQA - ) # NOQA + with pytest.raises(exception_type): + await handler.download_request(req, None) assert exception_text in caplog.text -@pytest.mark.parametrize( - "job_id", - ["547773/99/6"], -) @ensureDeferred -async def test_job_id(job_id, mockserver): +async def test_job_id(mockserver): + job_id = "547773/99/6" 
settings = {"JOB": job_id} async with mockserver.make_handler(settings) as handler: req = Request( @@ -454,17 +167,7 @@ async def test_job_id(job_id, mockserver): method="POST", meta={"zyte_api": {"browserHtml": True}}, ) - api_params = _get_api_params( - req, - use_api_by_default=handler._on_all_requests, - automap_by_default=handler._automap, - default_params=handler._zyte_api_default_params, - unsupported_headers=handler._unsupported_headers, - browser_headers=handler._browser_headers, - ) - resp = await deferred_from_coro( - handler._download_request(api_params, req, Spider("test")) # NOQA - ) + resp = await handler.download_request(req, None) assert resp.request is req assert resp.url == req.url @@ -483,7 +186,7 @@ async def test_higher_concurrency(): response_indexes = [] expected_first_indexes = {0, concurrency - 1} fast_seconds = 0.001 - slow_seconds = 0.1 + slow_seconds = 0.2 with MockServer(DelayedResource) as server: @@ -532,97 +235,55 @@ async def parse(self, response): @pytest.mark.parametrize( "request_kwargs,settings,expected,warnings", [ - # Automatic mapping of request parameters to Zyte Data API parameters - # is enabled by default, but can be disabled, either globally or per - # request. - # - # httpResponseBody is set to True if no other main content is - # requested. - *( - ( - request_kwargs, - settings, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, - [], - ) - for request_kwargs, settings in ( - ({}, {}), - ({}, {"ZYTE_API_AUTOMAP": True}), - ( - {"meta": {"zyte_api_automap": True}}, - {"ZYTE_API_AUTOMAP": False}, - ), - ) - ), - ( - {}, - {"ZYTE_API_AUTOMAP": False}, - False, - [], - ), + # The Accept and Accept-Language headers, when unsupported, are dropped + # silently if their value matches the default value of Scrapy for + # DEFAULT_REQUEST_HEADERS, or with a warning otherwise. 
( - {"meta": {"zyte_api_automap": False}}, + { + "headers": DEFAULT_REQUEST_HEADERS, + "meta": {"zyte_api": {"browserHtml": True}}, + }, {}, - False, - [], - ), - *( - ( - {"meta": {"zyte_api": {"a": "b"}}}, - settings, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "a": "b", - }, - [], - ) - for settings in ( - {}, - {"ZYTE_API_AUTOMAP": True}, - ) - ), - ( - {"meta": {"zyte_api": {"a": "b"}}}, - {"ZYTE_API_AUTOMAP": False}, { - "a": "b", + "browserHtml": True, + "httpResponseHeaders": True, }, [], ), - # httpResponseBody can be unset through meta. That way, if a new main - # output type other than browserHtml and screenshot is implemented in - # the future, you can request the new output type and also prevent - # httpResponseBody from being enabled automatically, without the need - # to disable automated mapping completely. ( - {"meta": {"zyte_api": {"httpResponseBody": False}}}, + { + "headers": { + "Accept": "application/json", + "Accept-Language": "uk", + }, + "meta": {"zyte_api": {"browserHtml": True}}, + }, {}, { - "httpResponseBody": False, + "browserHtml": True, + "httpResponseHeaders": True, }, - [], + ["cannot be mapped"], ), + # The Cookie header is dropped with a warning. ( { - "meta": { - "zyte_api": {"httpResponseBody": False, "newOutputType": True} + "headers": { + "Cookie": "a=b", }, }, {}, { - "httpResponseBody": False, - "newOutputType": True, + "httpResponseBody": True, + "httpResponseHeaders": True, }, - [], + ["cannot be mapped"], ), - # httpResponseHeaders is automatically set to True for httpResponseBody - # (shown in prior tests) and browserHtml. ( { + "headers": { + "Cookie": "a=b", + }, "meta": {"zyte_api": {"browserHtml": True}}, }, {}, @@ -630,142 +291,48 @@ async def parse(self, response): "browserHtml": True, "httpResponseHeaders": True, }, - [], + ["cannot be mapped"], ), - # httpResponseHeaders is not set for screenshot. 
+ # The User-Agent header, which Scrapy sets by default, is dropped + # silently if it matches the default value of the USER_AGENT setting, + # or with a warning otherwise. ( { - "meta": {"zyte_api": {"screenshot": True}}, + "headers": {"User-Agent": DEFAULT_USER_AGENT}, }, {}, { - "screenshot": True, + "httpResponseBody": True, + "httpResponseHeaders": True, }, [], ), - # httpResponseHeaders can be unset through meta. ( { - "meta": {"zyte_api": {"httpResponseHeaders": False}}, + "headers": {"User-Agent": ""}, }, {}, { "httpResponseBody": True, - "httpResponseHeaders": False, + "httpResponseHeaders": True, }, - [], + ["cannot be mapped"], ), ( { - "meta": { - "zyte_api": { - "browserHtml": True, - "httpResponseHeaders": False, - }, - }, + "headers": {"User-Agent": DEFAULT_USER_AGENT}, + "meta": {"zyte_api": {"browserHtml": True}}, }, {}, { "browserHtml": True, - "httpResponseHeaders": False, + "httpResponseHeaders": True, }, [], ), - # METHOD - # Request.method is mapped as is. - *( - ( - {"method": method}, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "httpRequestMethod": method, - }, - [], - ) - for method in ( - "POST", - "PUT", - "DELETE", - "OPTIONS", - "TRACE", - "PATCH", - ) - ), - # Request.method is mapped even for methods that Zyte Data API does not - # support. - *( - ( - {"method": method}, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "httpRequestMethod": method, - }, - [], - ) - for method in ( - "HEAD", - "CONNECT", - "FOO", - ) - ), - # An exception is the default method (GET), which is not mapped. - ( - {"method": "GET"}, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, - [], - ), - # httpRequestMethod should not be defined through meta. 
- ( - { - "meta": { - "zyte_api": { - "httpRequestMethod": "GET", - }, - }, - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "httpRequestMethod": "GET", - }, - ["Use Request.method instead"], - ), - # If defined through meta, httpRequestMethod takes precedence, warning - # about value mismatches. - ( - { - "method": "POST", - "meta": { - "zyte_api": { - "httpRequestMethod": "PATCH", - }, - }, - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "httpRequestMethod": "PATCH", - }, - [ - "Use Request.method instead", - "does not match the Zyte Data API httpRequestMethod parameter", - ], - ), - # A non-GET method should not be used unless httpResponseBody is also - # used. ( { - "method": "POST", + "headers": {"User-Agent": ""}, "meta": {"zyte_api": {"browserHtml": True}}, }, {}, @@ -773,247 +340,104 @@ async def parse(self, response): "browserHtml": True, "httpResponseHeaders": True, }, - ["can only be set when the httpResponseBody parameter"], + ["cannot be mapped"], ), + # You may update the ZYTE_API_UNSUPPORTED_HEADERS setting to remove + # headers that the customHttpRequestHeaders parameter starts supporting + # in the future. ( { - "method": "POST", - "meta": {"zyte_api": {"screenshot": True}}, - }, - {}, - { - "screenshot": True, + "headers": { + "Cookie": "", + "User-Agent": "", + }, }, - ["can only be set when the httpResponseBody parameter"], - ), - # HEADERS - # Headers are mapped to requestHeaders or customHttpRequestHeaders - # depending on whether or not httpResponseBody is declared. 
- ( { - "headers": {"Referer": "a"}, + "ZYTE_API_UNSUPPORTED_HEADERS": ["Cookie"], }, - {}, { "httpResponseBody": True, "httpResponseHeaders": True, "customHttpRequestHeaders": [ - {"name": "Referer", "value": "a"}, + {"name": "User-Agent", "value": ""}, ], }, - [], + [ + "defines header b'Cookie', which cannot be mapped", + ], ), + # You may update the ZYTE_API_BROWSER_HEADERS setting to extend support + # for new fields that the requestHeaders parameter may support in the + # future. ( { - "headers": {"Referer": "a"}, + "headers": {"User-Agent": ""}, "meta": {"zyte_api": {"browserHtml": True}}, }, - {}, - { - "browserHtml": True, - "httpResponseHeaders": True, - "requestHeaders": {"referer": "a"}, - }, - [], - ), - # We intentionally generate requestHeaders even if browserHtml and - # screenshot are not used, assuming that future additional outputs are - # more likely to use requestHeaders than to use - # customHttpRequestHeaders. - ( - { - "headers": {"Referer": "a"}, - "meta": {"zyte_api": {"httpResponseBody": False}}, - }, - {}, - { - "httpResponseBody": False, - "requestHeaders": {"referer": "a"}, - }, - [], - ), - # If both httpResponseBody and currently-incompatible attributes - # (browserHtml, screenshot) are declared, both fields are generated. - # This is in case a single request is allowed to combine both in the - # future. - ( { - "headers": {"Referer": "a"}, - "meta": { - "zyte_api": { - "httpResponseBody": True, - "browserHtml": True, - # Makes the mock API server return 200 despite the - # bad input. 
- "passThrough": True, - }, + "ZYTE_API_BROWSER_HEADERS": { + "Referer": "referer", + "User-Agent": "userAgent", }, }, - {}, { - "httpResponseBody": True, "browserHtml": True, "httpResponseHeaders": True, - "customHttpRequestHeaders": [ - {"name": "Referer", "value": "a"}, - ], - "requestHeaders": {"referer": "a"}, - "passThrough": True, + "requestHeaders": {"userAgent": ""}, }, [], ), - # If requestHeaders or customHttpRequestHeaders are used, their value - # prevails, but a warning is issued. - ( - { - "headers": {"Referer": "a"}, - "meta": { - "zyte_api": { - "customHttpRequestHeaders": [ - {"name": "Referer", "value": "b"}, - ], - }, - }, - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "customHttpRequestHeaders": [ - {"name": "Referer", "value": "b"}, - ], - }, - ["Use Request.headers instead"], - ), - ( - { - "headers": {"Referer": "a"}, - "meta": { - "zyte_api": { - "browserHtml": True, - "requestHeaders": {"referer": "b"}, - }, - }, - }, - {}, - { - "browserHtml": True, - "httpResponseHeaders": True, - "requestHeaders": {"referer": "b"}, - }, - ["Use Request.headers instead"], - ), - # A request should not have headers if requestHeaders or - # customHttpRequestHeaders are also used, even if they match. - ( - { - "headers": {"Referer": "b"}, - "meta": { - "zyte_api": { - "customHttpRequestHeaders": [ - {"name": "Referer", "value": "b"}, - ], - }, - }, - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "customHttpRequestHeaders": [ - {"name": "Referer", "value": "b"}, - ], - }, - ["Use Request.headers instead"], - ), - ( - { - "headers": {"Referer": "b"}, - "meta": { - "zyte_api": { - "browserHtml": True, - "requestHeaders": {"referer": "b"}, - }, - }, - }, - {}, - { - "browserHtml": True, - "httpResponseHeaders": True, - "requestHeaders": {"referer": "b"}, - }, - ["Use Request.headers instead"], - ), - # Unsupported headers not present in Scrapy requests by default are - # dropped with a warning. 
- # If all headers are unsupported, the header parameter is not even set. - ( - { - "headers": {"a": "b"}, - "meta": {"zyte_api": {"browserHtml": True}}, - }, - {}, - { - "browserHtml": True, - "httpResponseHeaders": True, - }, - ["cannot be mapped"], - ), - # Headers with None as value are silently ignored. + # BODY + # The body is copied into httpRequestBody, base64-encoded. ( { - "headers": {"a": None}, + "body": "a", }, {}, { "httpResponseBody": True, "httpResponseHeaders": True, + "httpRequestBody": "YQ==", }, [], ), + # httpRequestBody defined in meta takes precedence, but it causes a + # warning. ( { - "headers": {"a": None}, - "meta": {"zyte_api": {"browserHtml": True}}, - }, - {}, - { - "browserHtml": True, - "httpResponseHeaders": True, - }, - [], - ), - # Headers with an empty string as value are not silently ignored. - ( - { - "headers": {"a": ""}, - "meta": {"zyte_api": {"browserHtml": True}}, + "body": "a", + "meta": {"zyte_api": {"httpRequestBody": "Yg=="}}, }, {}, { - "browserHtml": True, + "httpResponseBody": True, "httpResponseHeaders": True, + "httpRequestBody": "Yg==", }, - ["cannot be mapped"], + [ + "Use Request.body instead", + "does not match the Zyte Data API httpRequestBody parameter", + ], ), - # Unsupported headers are looked up case-insensitively. + # httpRequestBody defined in meta causes a warning even if it matches + # request.body. ( { - "headers": {"user-Agent": ""}, + "body": "a", + "meta": {"zyte_api": {"httpRequestBody": "YQ=="}}, }, {}, { "httpResponseBody": True, "httpResponseHeaders": True, + "httpRequestBody": "YQ==", }, - ["cannot be mapped"], + ["Use Request.body instead"], ), - # The Accept and Accept-Language headers, when unsupported, are dropped - # silently if their value matches the default value of Scrapy for - # DEFAULT_REQUEST_HEADERS, or with a warning otherwise. + # A body should not be used unless httpResponseBody is also used. 
( { - "headers": DEFAULT_REQUEST_HEADERS, + "body": "a", "meta": {"zyte_api": {"browserHtml": True}}, }, {}, @@ -1021,228 +445,31 @@ async def parse(self, response): "browserHtml": True, "httpResponseHeaders": True, }, - [], + ["can only be set when the httpResponseBody parameter"], ), ( { - "headers": { - "Accept": "application/json", - "Accept-Language": "uk", - }, - "meta": {"zyte_api": {"browserHtml": True}}, + "body": "a", + "meta": {"zyte_api": {"screenshot": True}}, }, {}, { - "browserHtml": True, - "httpResponseHeaders": True, + "screenshot": True, }, - ["cannot be mapped"], + ["can only be set when the httpResponseBody parameter"], ), - # The Cookie header is dropped with a warning. + # httpResponseHeaders + # Warn if httpResponseHeaders is defined unnecessarily. ( { - "headers": { - "Cookie": "a=b", - }, + "meta": {"zyte_api": {"httpResponseHeaders": True}}, }, {}, { "httpResponseBody": True, "httpResponseHeaders": True, }, - ["cannot be mapped"], - ), - ( - { - "headers": { - "Cookie": "a=b", - }, - "meta": {"zyte_api": {"browserHtml": True}}, - }, - {}, - { - "browserHtml": True, - "httpResponseHeaders": True, - }, - ["cannot be mapped"], - ), - # The User-Agent header, which Scrapy sets by default, is dropped - # silently if it matches the default value of the USER_AGENT setting, - # or with a warning otherwise. 
- ( - { - "headers": {"User-Agent": DEFAULT_USER_AGENT}, - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, - [], - ), - ( - { - "headers": {"User-Agent": ""}, - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, - ["cannot be mapped"], - ), - ( - { - "headers": {"User-Agent": DEFAULT_USER_AGENT}, - "meta": {"zyte_api": {"browserHtml": True}}, - }, - {}, - { - "browserHtml": True, - "httpResponseHeaders": True, - }, - [], - ), - ( - { - "headers": {"User-Agent": ""}, - "meta": {"zyte_api": {"browserHtml": True}}, - }, - {}, - { - "browserHtml": True, - "httpResponseHeaders": True, - }, - ["cannot be mapped"], - ), - # You may update the ZYTE_API_UNSUPPORTED_HEADERS setting to remove - # headers that the customHttpRequestHeaders parameter starts supporting - # in the future. - ( - { - "headers": { - "Cookie": "", - "User-Agent": "", - }, - }, - { - "ZYTE_API_UNSUPPORTED_HEADERS": ["Cookie"], - }, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "customHttpRequestHeaders": [ - {"name": "User-Agent", "value": ""}, - ], - }, - [ - "defines header b'Cookie', which cannot be mapped", - ], - ), - # You may update the ZYTE_API_BROWSER_HEADERS setting to extend support - # for new fields that the requestHeaders parameter may support in the - # future. - ( - { - "headers": {"User-Agent": ""}, - "meta": {"zyte_api": {"browserHtml": True}}, - }, - { - "ZYTE_API_BROWSER_HEADERS": { - "Referer": "referer", - "User-Agent": "userAgent", - }, - }, - { - "browserHtml": True, - "httpResponseHeaders": True, - "requestHeaders": {"userAgent": ""}, - }, - [], - ), - # BODY - # The body is copied into httpRequestBody, base64-encoded. - ( - { - "body": "a", - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "httpRequestBody": "YQ==", - }, - [], - ), - # httpRequestBody defined in meta takes precedence, but it causes a - # warning. 
- ( - { - "body": "a", - "meta": {"zyte_api": {"httpRequestBody": "Yg=="}}, - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "httpRequestBody": "Yg==", - }, - [ - "Use Request.body instead", - "does not match the Zyte Data API httpRequestBody parameter", - ], - ), - # httpRequestBody defined in meta causes a warning even if it matches - # request.body. - ( - { - "body": "a", - "meta": {"zyte_api": {"httpRequestBody": "YQ=="}}, - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "httpRequestBody": "YQ==", - }, - ["Use Request.body instead"], - ), - # A body should not be used unless httpResponseBody is also used. - ( - { - "body": "a", - "meta": {"zyte_api": {"browserHtml": True}}, - }, - {}, - { - "browserHtml": True, - "httpResponseHeaders": True, - }, - ["can only be set when the httpResponseBody parameter"], - ), - ( - { - "body": "a", - "meta": {"zyte_api": {"screenshot": True}}, - }, - {}, - { - "screenshot": True, - }, - ["can only be set when the httpResponseBody parameter"], - ), - # httpResponseHeaders - # Warn if httpResponseHeaders is defined unnecessarily. 
- ( - { - "meta": {"zyte_api": {"httpResponseHeaders": True}}, - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, - ["do not need to set httpResponseHeaders"], + ["do not need to set httpResponseHeaders"], ), ], ) @@ -1293,8 +520,11 @@ async def test_automap( assert not caplog.records -_UNSUPPORTED_HEADERS = {b"cookie", b"user-agent"} -_BROWSER_HEADERS = {b"referer": "referer"} +AUTOMAP_BY_DEFAULT = True +BROWSER_HEADERS = {b"referer": "referer"} +DEFAULT_PARAMS: Dict[str, Any] = {} +UNSUPPORTED_HEADERS = {b"cookie", b"user-agent"} +USE_API_BY_DEFAULT = False @ensureDeferred @@ -1308,11 +538,11 @@ async def test_get_api_params_input_default(mockserver): await handler.download_request(request, None) _get_api_params.assert_called_once_with( request, - use_api_by_default=False, - automap_by_default=True, - default_params={}, - unsupported_headers=_UNSUPPORTED_HEADERS, - browser_headers=_BROWSER_HEADERS, + use_api_by_default=USE_API_BY_DEFAULT, + automap_by_default=AUTOMAP_BY_DEFAULT, + default_params=DEFAULT_PARAMS, + unsupported_headers=UNSUPPORTED_HEADERS, + browser_headers=BROWSER_HEADERS, ) @@ -1342,44 +572,46 @@ async def test_get_api_params_input_custom(mockserver): ) -_UNSET = object() +UNSET = object() -@ensureDeferred @pytest.mark.parametrize( "setting,meta,expected", [ - (False, _UNSET, None), - (False, False, None), - (False, True, {}), - (False, {}, {}), - (False, {"a": "b"}, {"a": "b"}), - (True, _UNSET, {}), - (True, False, None), - (True, True, {}), + (False, None, None), + (False, {}, None), + (False, {"a": "b"}, None), + (False, {"zyte_api": False}, None), + (False, {"zyte_api": True}, {}), + (False, {"zyte_api": {}}, {}), + (False, {"zyte_api": {"a": "b"}}, {"a": "b"}), + (False, {"zyte_api": {"browserHtml": True}}, {"browserHtml": True}), + (True, None, {}), (True, {}, {}), - (True, {"a": "b"}, {"a": "b"}), + (True, {"a": "b"}, {}), + (True, {"zyte_api": False}, None), + (True, {"zyte_api": True}, {}), + (True, 
{"zyte_api": {}}, {}), + (True, {"zyte_api": {"a": "b"}}, {"a": "b"}), + (True, {"zyte_api": {"browserHtml": True}}, {"browserHtml": True}), ], ) -async def test_get_api_params_toggling(setting, meta, expected): - request = Request(url="https://example.com") - if meta is not _UNSET: - request.meta["zyte_api"] = meta +def test_api_toggling(setting, meta, expected): + request = Request(url="https://example.com", meta=meta) api_params = _get_api_params( request, use_api_by_default=setting, automap_by_default=False, - default_params={}, - unsupported_headers=_UNSUPPORTED_HEADERS, - browser_headers=_BROWSER_HEADERS, + default_params=DEFAULT_PARAMS, + unsupported_headers=UNSUPPORTED_HEADERS, + browser_headers=BROWSER_HEADERS, ) assert api_params == expected -@ensureDeferred @pytest.mark.parametrize("setting", [False, True]) @pytest.mark.parametrize("meta", [None, 0, "", b"", []]) -async def test_get_api_params_disabling_deprecated(setting, meta): +def test_api_disabling_deprecated(setting, meta): request = Request(url="https://example.com") request.meta["zyte_api"] = meta with pytest.warns(DeprecationWarning, match=r".* Use False instead\.$"): @@ -1387,55 +619,758 @@ async def test_get_api_params_disabling_deprecated(setting, meta): request, use_api_by_default=setting, automap_by_default=False, - default_params={}, - unsupported_headers=_UNSUPPORTED_HEADERS, - browser_headers=_BROWSER_HEADERS, + default_params=DEFAULT_PARAMS, + unsupported_headers=UNSUPPORTED_HEADERS, + browser_headers=BROWSER_HEADERS, ) assert api_params is None @ensureDeferred +async def test_default_params_none(mockserver, caplog): + request = Request(url="https://example.com") + settings = { + "ZYTE_API_DEFAULT_PARAMS": {"a": None, "b": "c"}, + } + with caplog.at_level("WARNING"): + async with mockserver.make_handler(settings) as handler: + patch_path = "scrapy_zyte_api.handler._get_api_params" + with patch(patch_path) as _get_api_params: + _get_api_params.side_effect = RuntimeError("That’s it!") + 
with pytest.raises(RuntimeError): + await handler.download_request(request, None) + _get_api_params.assert_called_once_with( + request, + use_api_by_default=USE_API_BY_DEFAULT, + automap_by_default=AUTOMAP_BY_DEFAULT, + default_params={"b": "c"}, + unsupported_headers=UNSUPPORTED_HEADERS, + browser_headers=BROWSER_HEADERS, + ) + assert "Parameter 'a' in the ZYTE_API_DEFAULT_PARAMS setting is None" in caplog.text + + @pytest.mark.parametrize( - "default_params,meta,expected", + "default_params,meta,expected,warnings", [ - ({}, {}, {}), - ({}, {"b": 2}, {"b": 2}), - ({"a": 1}, {}, {"a": 1}), - ({"a": 1}, {"b": 2}, {"a": 1, "b": 2}), - ({"a": 1}, {"a": 2}, {"a": 2}), + ({}, {}, {}, []), + ({}, {"b": 2}, {"b": 2}, []), + ({}, {"b": None}, {}, ["does not define such a parameter"]), + ({"a": 1}, {}, {"a": 1}, []), + ({"a": 1}, {"b": 2}, {"a": 1, "b": 2}, []), + ({"a": 1}, {"b": None}, {"a": 1}, ["does not define such a parameter"]), + ({"a": 1}, {"a": 2}, {"a": 2}, []), + ({"a": 1}, {"a": None}, {}, []), ], ) -async def test_get_api_params_default_params_merging(default_params, meta, expected): +def test_default_params_merging(default_params, meta, expected, warnings, caplog): request = Request(url="https://example.com") request.meta["zyte_api"] = meta - api_params = _get_api_params( - request, - use_api_by_default=False, - automap_by_default=False, - default_params=default_params, - unsupported_headers=_UNSUPPORTED_HEADERS, - browser_headers=_BROWSER_HEADERS, - ) + with caplog.at_level("WARNING"): + api_params = _get_api_params( + request, + use_api_by_default=USE_API_BY_DEFAULT, + automap_by_default=False, + default_params=default_params, + unsupported_headers=UNSUPPORTED_HEADERS, + browser_headers=BROWSER_HEADERS, + ) assert api_params == expected + if warnings: + for warning in warnings: + assert warning in caplog.text + else: + assert not caplog.records -@ensureDeferred +@pytest.mark.xfail(reason="To be implemented", strict=True) @pytest.mark.parametrize( - 
"meta,exception", + "default_params,meta,expected,warnings", [ - (1, TypeError), - (["a", "b"], ValueError), + ( + {"screenshot": True, "httpResponseHeaders": True}, + {"browserHtml": True}, + {"browserHtml": True, "httpResponseHeaders": True, "screenshot": True}, + [], + ), + ( + {"browserHtml": True, "httpResponseHeaders": False}, + {"screenshot": True, "browserHtml": False}, + {"screenshot": True}, + [], + ), ], ) -async def test_get_api_params_bad_meta_type(meta, exception): +def test_default_params_automap(default_params, meta, expected, warnings, caplog): + """Warnings about unneeded parameters should not apply if those parameters are needed to extend or override default parameters.""" + request = Request(url="https://example.com") + request.meta["zyte_api"] = meta + with caplog.at_level("WARNING"): + api_params = _get_api_params( + request, + use_api_by_default=USE_API_BY_DEFAULT, + automap_by_default=AUTOMAP_BY_DEFAULT, + default_params=default_params, + unsupported_headers=UNSUPPORTED_HEADERS, + browser_headers=BROWSER_HEADERS, + ) + assert api_params == expected + if warnings: + for warning in warnings: + assert warning in caplog.text + else: + assert not caplog.records + + +def test_default_params_immutability(): + request = Request(url="https://example.com") + request.meta["zyte_api"] = {"a": None} + default_params = {"a": "b"} + _get_api_params( + request, + use_api_by_default=USE_API_BY_DEFAULT, + automap_by_default=AUTOMAP_BY_DEFAULT, + default_params=default_params, + unsupported_headers=UNSUPPORTED_HEADERS, + browser_headers=BROWSER_HEADERS, + ) + assert default_params == {"a": "b"} + + +@pytest.mark.parametrize("meta", [1, ["a", "b"]]) +def test_bad_meta_type(meta): request = Request(url="https://example.com") request.meta["zyte_api"] = meta - with pytest.raises(exception): + with pytest.raises(ValueError): _get_api_params( request, - use_api_by_default=False, + use_api_by_default=USE_API_BY_DEFAULT, automap_by_default=False, - default_params={}, 
- unsupported_headers=_UNSUPPORTED_HEADERS, - browser_headers=_BROWSER_HEADERS, + default_params=DEFAULT_PARAMS, + unsupported_headers=UNSUPPORTED_HEADERS, + browser_headers=BROWSER_HEADERS, + ) + + +@pytest.mark.parametrize( + "setting,meta,expected", + [ + (False, UNSET, False), + (False, False, False), + (False, True, True), + (True, UNSET, True), + (True, False, False), + (True, True, True), + ], +) +def test_automap_toggling(setting, meta, expected): + request = Request(url="https://example.com") + if meta is not UNSET: + request.meta["zyte_api_automap"] = meta + api_params = _get_api_params( + request, + use_api_by_default=True, + automap_by_default=setting, + default_params=DEFAULT_PARAMS, + unsupported_headers=UNSUPPORTED_HEADERS, + browser_headers=BROWSER_HEADERS, + ) + assert bool(api_params) == expected + + +def _test_automap(request_kwargs, meta, expected, warnings, caplog): + request = Request(url="https://example.com", **request_kwargs) + request.meta["zyte_api"] = meta + with caplog.at_level("WARNING"): + api_params = _get_api_params( + request, + use_api_by_default=USE_API_BY_DEFAULT, + automap_by_default=AUTOMAP_BY_DEFAULT, + default_params=DEFAULT_PARAMS, + unsupported_headers=UNSUPPORTED_HEADERS, + browser_headers=BROWSER_HEADERS, ) + assert api_params == expected + if warnings: + for warning in warnings: + assert warning in caplog.text + else: + assert not caplog.records + + +@pytest.mark.parametrize( + "meta,expected,warnings", + [ + ({}, {"httpResponseBody": True, "httpResponseHeaders": True}, []), + ( + {"httpResponseBody": True}, + {"httpResponseBody": True, "httpResponseHeaders": True}, + ["do not need to set httpResponseBody to True"], + ), + ( + {"httpResponseBody": False}, + {}, + [], + ), + ( + {"httpResponseBody": True, "browserHtml": True}, + { + "browserHtml": True, + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + {"browserHtml": True}, + {"browserHtml": True, "httpResponseHeaders": True}, + [], + ), + 
( + {"screenshot": True}, + {"screenshot": True}, + [], + ), + ( + {"unknown": True}, + {"httpResponseBody": True, "httpResponseHeaders": True, "unknown": True}, + [], + ), + ( + {"unknown": True, "httpResponseBody": False}, + {"unknown": True}, + [], + ), + ], +) +def test_automap_main_outputs(meta, expected, warnings, caplog): + _test_automap({}, meta, expected, warnings, caplog) + + +@pytest.mark.parametrize( + "meta,expected,warnings", + [ + ({"httpResponseHeaders": False}, {"httpResponseBody": True}, []), + ( + {"httpResponseHeaders": True}, + {"httpResponseBody": True, "httpResponseHeaders": True}, + ["do not need to set httpResponseHeaders to True"], + ), + ( + {"httpResponseBody": True, "httpResponseHeaders": False}, + {"httpResponseBody": True}, + ["do not need to set httpResponseBody to True"], + ), + ( + {"httpResponseBody": True, "httpResponseHeaders": True}, + {"httpResponseBody": True, "httpResponseHeaders": True}, + [ + "do not need to set httpResponseHeaders to True", + "do not need to set httpResponseBody to True", + ], + ), + ( + {"httpResponseBody": False, "httpResponseHeaders": False}, + {}, + ["do not need to set httpResponseHeaders to False"], + ), + ( + {"httpResponseBody": False, "httpResponseHeaders": True}, + {"httpResponseHeaders": True}, + [], + ), + ( + {"browserHtml": True, "httpResponseHeaders": False}, + {"browserHtml": True}, + [], + ), + ( + {"browserHtml": True, "httpResponseHeaders": True}, + {"browserHtml": True, "httpResponseHeaders": True}, + ["do not need to set httpResponseHeaders to True"], + ), + ( + { + "httpResponseBody": True, + "browserHtml": True, + "httpResponseHeaders": False, + }, + {"browserHtml": True, "httpResponseBody": True}, + [], + ), + ( + { + "httpResponseBody": True, + "browserHtml": True, + "httpResponseHeaders": True, + }, + { + "browserHtml": True, + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["do not need to set httpResponseHeaders to True"], + ), + ( + {"screenshot": True, 
"httpResponseHeaders": False}, + {"screenshot": True}, + ["do not need to set httpResponseHeaders to False"], + ), + ( + {"screenshot": True, "httpResponseHeaders": True}, + {"screenshot": True, "httpResponseHeaders": True}, + [], + ), + ( + {"unknown": True, "httpResponseHeaders": True}, + {"unknown": True, "httpResponseBody": True, "httpResponseHeaders": True}, + ["do not need to set httpResponseHeaders to True"], + ), + ( + {"unknown": True, "httpResponseHeaders": False}, + {"unknown": True, "httpResponseBody": True}, + [], + ), + ( + {"unknown": True, "httpResponseBody": False, "httpResponseHeaders": True}, + {"unknown": True, "httpResponseHeaders": True}, + [], + ), + ( + {"unknown": True, "httpResponseBody": False, "httpResponseHeaders": False}, + {"unknown": True}, + ["do not need to set httpResponseHeaders to False"], + ), + ], +) +def test_automap_header_output(meta, expected, warnings, caplog): + _test_automap({}, meta, expected, warnings, caplog) + + +@pytest.mark.parametrize( + "method,meta,expected,warnings", + [ + ( + "GET", + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + *( + ( + method, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "httpRequestMethod": method, + }, + [], + ) + for method in ( + "POST", + "PUT", + "DELETE", + "OPTIONS", + "TRACE", + "PATCH", + "HEAD", + "CONNECT", + "FOO", + ) + ), + *( + ( + request_method, + {"httpRequestMethod": meta_method}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "httpRequestMethod": meta_method, + }, + ["Use Request.method"], + ) + for request_method, meta_method in ( + ("GET", "GET"), + ("POST", "POST"), + ) + ), + *( + ( + request_method, + {"httpRequestMethod": meta_method}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "httpRequestMethod": meta_method, + }, + [ + "Use Request.method", + "does not match the Zyte Data API httpRequestMethod", + ], + ) + for request_method, meta_method in ( + ("GET", 
"POST"), + ("PUT", "GET"), + ) + ), + ( + "POST", + {"browserHtml": True}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + ["can only be set when the httpResponseBody parameter"], + ), + ( + "POST", + {"screenshot": True}, + { + "screenshot": True, + }, + ["can only be set when the httpResponseBody parameter"], + ), + ], +) +def test_automap_method(method, meta, expected, warnings, caplog): + _test_automap({"method": method}, meta, expected, warnings, caplog) + + +@pytest.mark.parametrize( + "headers,meta,expected,warnings", + [ + # Base header mapping scenarios for a supported header. + ( + {"Referer": "a"}, + {}, + { + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + {"Referer": "a"}, + {"browserHtml": True}, + { + "browserHtml": True, + "httpResponseHeaders": True, + "requestHeaders": {"referer": "a"}, + }, + [], + ), + ( + {"Referer": "a"}, + {"browserHtml": True, "httpResponseBody": True}, + { + "browserHtml": True, + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + "httpResponseBody": True, + "httpResponseHeaders": True, + "requestHeaders": {"referer": "a"}, + }, + [], + ), + ( + {"Referer": "a"}, + {"screenshot": True}, + { + "screenshot": True, + "requestHeaders": {"referer": "a"}, + }, + [], + ), + ( + {"Referer": "a"}, + {"screenshot": True, "httpResponseBody": True}, + { + "screenshot": True, + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + "httpResponseBody": True, + "httpResponseHeaders": True, + "requestHeaders": {"referer": "a"}, + }, + [], + ), + ( + {"Referer": "a"}, + {"unknown": True}, + { + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + "httpResponseBody": True, + "httpResponseHeaders": True, + "unknown": True, + }, + [], + ), + ( + {"Referer": "a"}, + {"unknown": True, "httpResponseBody": False}, + { + "requestHeaders": {"referer": "a"}, + 
"unknown": True, + }, + [], + ), + ( + {"Referer": "a"}, + {"httpResponseBody": False}, + { + "requestHeaders": {"referer": "a"}, + }, + [], + ), + # Headers with None as value are ignored. + ( + {"Referer": None}, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + {"Referer": None}, + {"browserHtml": True}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + {"Referer": None}, + {"browserHtml": True, "httpResponseBody": True}, + { + "browserHtml": True, + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + {"Referer": None}, + {"screenshot": True}, + { + "screenshot": True, + }, + [], + ), + ( + {"Referer": None}, + {"screenshot": True, "httpResponseBody": True}, + { + "screenshot": True, + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + {"Referer": None}, + {"unknown": True}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "unknown": True, + }, + [], + ), + ( + {"Referer": None}, + {"unknown": True, "httpResponseBody": False}, + { + "unknown": True, + }, + [], + ), + ( + {"Referer": None}, + {"httpResponseBody": False}, + {}, + [], + ), + # Warn if header parameters are used, even if the values match request + # headers. 
+ ( + {"Referer": "a"}, + { + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "b"}, + ] + }, + { + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "b"}, + ], + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["Use Request.headers instead"], + ), + ( + {"Referer": "a"}, + { + "browserHtml": True, + "requestHeaders": {"referer": "b"}, + }, + { + "browserHtml": True, + "requestHeaders": {"referer": "b"}, + "httpResponseHeaders": True, + }, + ["Use Request.headers instead"], + ), + ( + {"Referer": "a"}, + { + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ] + }, + { + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["Use Request.headers instead"], + ), + ( + {"Referer": "a"}, + { + "browserHtml": True, + "requestHeaders": {"referer": "a"}, + }, + { + "browserHtml": True, + "requestHeaders": {"referer": "a"}, + "httpResponseHeaders": True, + }, + ["Use Request.headers instead"], + ), + # Unsupported headers not present in Scrapy requests by default are + # dropped with a warning. + # If all headers are unsupported, the header parameter is not even set. + ( + {"Cookie": "a=b"}, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + ( + {"a": "b"}, + {"browserHtml": True}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + # Headers with an empty string as value are not silently ignored. + ( + {"Cookie": ""}, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + ( + {"a": ""}, + {"browserHtml": True}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + # Unsupported headers are looked up case-insensitively. 
+ ( + {"user-Agent": ""}, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + ], +) +def test_automap_headers(headers, meta, expected, warnings, caplog): + _test_automap({"headers": headers}, meta, expected, warnings, caplog) + + +@pytest.mark.parametrize( + "meta,expected,warnings", + [ + ( + { + "httpResponseBody": False, + }, + {}, + [], + ), + ( + { + "browserHtml": True, + "httpResponseBody": False, + }, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + ["unnecessarily defines"], + ), + ( + { + "browserHtml": False, + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["unnecessarily defines"], + ), + ( + { + "screenshot": False, + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["unnecessarily defines"], + ), + ( + { + "httpResponseHeaders": False, + }, + { + "httpResponseBody": True, + }, + [], + ), + ], +) +def test_automap_default_parameter_cleanup(meta, expected, warnings, caplog): + _test_automap({}, meta, expected, warnings, caplog) From a19c0048ece26ae3f2d5f0d0f1df345e2df80fea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 29 Aug 2022 23:10:45 +0200 Subject: [PATCH 16/81] Revert change no longer needed --- tests/mockserver.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/mockserver.py b/tests/mockserver.py index 45b2738a..2035cf46 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -66,9 +66,7 @@ def render_POST(self, request): html = "Hello

World!

" if "browserHtml" in request_data: - if "httpResponseBody" in request_data and not request_data.get( - "passThrough" - ): + if "httpResponseBody" in request_data: request.setResponseCode(422) return json.dumps( { From c59fb8bf828816ffdc6833dfa1373764ba02493b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 29 Aug 2022 23:49:24 +0200 Subject: [PATCH 17/81] Refactor response mapping tests --- tests/mockserver.py | 9 +++- tests/test_api_requests.py | 94 ++++++++++++++++---------------------- 2 files changed, 47 insertions(+), 56 deletions(-) diff --git a/tests/mockserver.py b/tests/mockserver.py index 2035cf46..f41374a8 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -78,8 +78,13 @@ def render_POST(self, request): ).encode() response_data["browserHtml"] = html if "httpResponseBody" in request_data: - base64_html = b64encode(html.encode()).decode() - response_data["httpResponseBody"] = base64_html + headers = request_data.get("customHttpRequestHeaders", {}) + accept = headers.get("Accept", None) + if accept == "application/octet-stream": + body = b64encode(b"\x00").decode() + else: + body = b64encode(html.encode()).decode() + response_data["httpResponseBody"] = body if request_data.get("httpResponseHeaders") is True: response_data["httpResponseHeaders"] = [ diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index ca8da3cd..03476079 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -8,7 +8,6 @@ from _pytest.logging import LogCaptureFixture # NOQA from pytest_twisted import ensureDeferred from scrapy import Request, Spider -from scrapy.exceptions import NotSupported from scrapy.http import Response, TextResponse from scrapy.settings.default_settings import DEFAULT_REQUEST_HEADERS from scrapy.settings.default_settings import USER_AGENT as DEFAULT_USER_AGENT @@ -23,82 +22,69 @@ from .mockserver import DelayedResource, MockServer, produce_request_response -@ensureDeferred 
@pytest.mark.parametrize( "meta", [ - {"zyte_api": {"browserHtml": True}}, - {"zyte_api": {"browserHtml": True, "geolocation": "US"}}, - {"zyte_api": {"browserHtml": True, "geolocation": "US", "echoData": 123}}, - {"zyte_api": {"browserHtml": True, "randomParameter": None}}, - {"zyte_api": {"httpResponseBody": True}}, - {"zyte_api": {"httpResponseBody": True, "geolocation": "US"}}, { - "zyte_api": { - "httpResponseBody": True, - "geolocation": "US", - "echoData": 123, - } + "httpResponseBody": True, + "httpResponseHeaders": False, + "customHttpRequestHeaders": {"Accept": "application/octet-stream"}, }, - {"zyte_api": {"httpResponseBody": True, "randomParameter": None}}, + pytest.param( + { + "httpResponseBody": True, + "customHttpRequestHeaders": {"Accept": "application/octet-stream"}, + }, + marks=pytest.mark.xfail( + reason="https://github.com/scrapy-plugins/scrapy-zyte-api/issues/47", + strict=True, + ), + ), ], ) -async def test_html_response_and_headers(meta: Dict[str, Dict[str, Any]], mockserver): - req, resp = await produce_request_response(mockserver, meta) - assert isinstance(resp, TextResponse) +@ensureDeferred +async def test_response_binary(meta: Dict[str, Dict[str, Any]], mockserver): + req, resp = await produce_request_response(mockserver, {"zyte_api": meta}) + assert isinstance(resp, Response) + assert not isinstance(resp, TextResponse) assert resp.request is req assert resp.url == req.url assert resp.status == 200 assert "zyte-api" in resp.flags - assert resp.body == b"Hello

World!

" - assert resp.text == "Hello

World!

" - assert resp.css("h1 ::text").get() == "World!" - assert resp.xpath("//body/text()").getall() == ["Hello"] - assert resp.headers == {b"Test_Header": [b"test_value"]} + assert resp.body == b"\x00" +@ensureDeferred @pytest.mark.parametrize( "meta", [ - {"zyte_api": {"httpResponseBody": True, "httpResponseHeaders": False}}, - { - "zyte_api": { - "httpResponseBody": True, - "httpResponseHeaders": False, - "geolocation": "US", - }, - }, - { - "zyte_api": { - "httpResponseBody": True, - "httpResponseHeaders": False, - "geolocation": "US", - "echoData": 123, - } - }, - { - "zyte_api": { - "httpResponseBody": True, - "httpResponseHeaders": False, - "randomParameter": None, - }, - }, + {"browserHtml": True}, + {"browserHtml": True, "httpResponseHeaders": False}, + {"httpResponseBody": True}, + pytest.param( + {"httpResponseBody": True, "httpResponseHeaders": False}, + marks=pytest.mark.xfail( + reason="https://github.com/scrapy-plugins/scrapy-zyte-api/issues/47", + strict=True, + ), + ), ], ) -@ensureDeferred -async def test_http_response_body_request(meta: Dict[str, Dict[str, Any]], mockserver): - req, resp = await produce_request_response(mockserver, meta) - assert isinstance(resp, Response) +async def test_response_html(meta: Dict[str, Dict[str, Any]], mockserver): + req, resp = await produce_request_response(mockserver, {"zyte_api": meta}) + assert isinstance(resp, TextResponse) assert resp.request is req assert resp.url == req.url assert resp.status == 200 assert "zyte-api" in resp.flags assert resp.body == b"Hello

World!

" - - with pytest.raises(NotSupported): - assert resp.css("h1 ::text").get() == "World!" - with pytest.raises(NotSupported): - assert resp.xpath("//body/text()").getall() == ["Hello"] + assert resp.text == "Hello

World!

" + assert resp.css("h1 ::text").get() == "World!" + assert resp.xpath("//body/text()").getall() == ["Hello"] + if meta.get("httpResponseHeaders", True) is True: + assert resp.headers == {b"Test_Header": [b"test_value"]} + else: + assert not resp.headers @ensureDeferred From 0840fa84970b434a346635546c9b9eb130b4f2df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 29 Aug 2022 23:56:01 +0200 Subject: [PATCH 18/81] Fix tests --- tests/mockserver.py | 9 +++++++-- tests/test_api_requests.py | 8 ++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/tests/mockserver.py b/tests/mockserver.py index f41374a8..2796dcc8 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -78,8 +78,13 @@ def render_POST(self, request): ).encode() response_data["browserHtml"] = html if "httpResponseBody" in request_data: - headers = request_data.get("customHttpRequestHeaders", {}) - accept = headers.get("Accept", None) + headers = request_data.get("customHttpRequestHeaders", []) + for header in headers: + if header["name"].strip().lower() == "accept": + accept = header["value"] + break + else: + accept = None if accept == "application/octet-stream": body = b64encode(b"\x00").decode() else: diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 03476079..d4888df0 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -28,12 +28,16 @@ { "httpResponseBody": True, "httpResponseHeaders": False, - "customHttpRequestHeaders": {"Accept": "application/octet-stream"}, + "customHttpRequestHeaders": [ + {"name": "Accept", "value": "application/octet-stream"} + ], }, pytest.param( { "httpResponseBody": True, - "customHttpRequestHeaders": {"Accept": "application/octet-stream"}, + "customHttpRequestHeaders": [ + {"name": "Accept", "value": "application/octet-stream"} + ], }, marks=pytest.mark.xfail( reason="https://github.com/scrapy-plugins/scrapy-zyte-api/issues/47", From 08adc1408ed760ad0a4697ee093634ecdcbd0103 Mon 
Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 08:25:33 +0200 Subject: [PATCH 19/81] Make ZYTE_API_AUTOMAP False by default --- scrapy_zyte_api/handler.py | 2 +- tests/test_api_requests.py | 31 ++++++++++++++++++++++--------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 3b8eee76..3bae50aa 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -337,7 +337,7 @@ def __init__( if self._retry_policy: self._retry_policy = load_object(self._retry_policy) self._on_all_requests = settings.getbool("ZYTE_API_ON_ALL_REQUESTS") - self._automap = settings.getbool("ZYTE_API_AUTOMAP", True) + self._automap = settings.getbool("ZYTE_API_AUTOMAP", False) self._unsupported_headers = { header.strip().lower().encode() for header in settings.getlist( diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index d4888df0..7ddee62e 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -27,7 +27,6 @@ [ { "httpResponseBody": True, - "httpResponseHeaders": False, "customHttpRequestHeaders": [ {"name": "Accept", "value": "application/octet-stream"} ], @@ -35,6 +34,7 @@ pytest.param( { "httpResponseBody": True, + "httpResponseHeaders": True, "customHttpRequestHeaders": [ {"name": "Accept", "value": "application/octet-stream"} ], @@ -48,6 +48,13 @@ ) @ensureDeferred async def test_response_binary(meta: Dict[str, Dict[str, Any]], mockserver): + """Test that binary (i.e. non-text) responses from Zyte Data API are + successfully mapped to a subclass of Response that is not also a subclass + of TextResponse. + + Whether response headers are retrieved or not should have no impact on the + outcome if the body is unequivocally binary. 
+ """ req, resp = await produce_request_response(mockserver, {"zyte_api": meta}) assert isinstance(resp, Response) assert not isinstance(resp, TextResponse) @@ -62,11 +69,11 @@ async def test_response_binary(meta: Dict[str, Dict[str, Any]], mockserver): @pytest.mark.parametrize( "meta", [ + {"browserHtml": True, "httpResponseHeaders": True}, {"browserHtml": True}, - {"browserHtml": True, "httpResponseHeaders": False}, - {"httpResponseBody": True}, + {"httpResponseBody": True, "httpResponseHeaders": True}, pytest.param( - {"httpResponseBody": True, "httpResponseHeaders": False}, + {"httpResponseBody": True}, marks=pytest.mark.xfail( reason="https://github.com/scrapy-plugins/scrapy-zyte-api/issues/47", strict=True, @@ -75,6 +82,12 @@ async def test_response_binary(meta: Dict[str, Dict[str, Any]], mockserver): ], ) async def test_response_html(meta: Dict[str, Dict[str, Any]], mockserver): + """Test that HTML responses from Zyte Data API are successfully mapped to a + subclass of TextResponse. + + Whether response headers are retrieved or not should have no impact on the + outcome if the body is unequivocally HTML. + """ req, resp = await produce_request_response(mockserver, {"zyte_api": meta}) assert isinstance(resp, TextResponse) assert resp.request is req @@ -85,7 +98,7 @@ async def test_response_html(meta: Dict[str, Dict[str, Any]], mockserver): assert resp.text == "Hello

World!

" assert resp.css("h1 ::text").get() == "World!" assert resp.xpath("//body/text()").getall() == ["Hello"] - if meta.get("httpResponseHeaders", True) is True: + if meta.get("httpResponseHeaders", False) is True: assert resp.headers == {b"Test_Header": [b"test_value"]} else: assert not resp.headers @@ -471,7 +484,7 @@ async def test_automap( mockserver, caplog, ): - settings.update({"ZYTE_API_ON_ALL_REQUESTS": True}) + settings.update({"ZYTE_API_ON_ALL_REQUESTS": True, "ZYTE_API_AUTOMAP": True}) async with mockserver.make_handler(settings) as handler: if expected is False: # Only the Zyte Data API client is mocked, meaning requests that @@ -510,7 +523,7 @@ async def test_automap( assert not caplog.records -AUTOMAP_BY_DEFAULT = True +AUTOMAP_BY_DEFAULT = False BROWSER_HEADERS = {b"referer": "referer"} DEFAULT_PARAMS: Dict[str, Any] = {} UNSUPPORTED_HEADERS = {b"cookie", b"user-agent"} @@ -699,7 +712,7 @@ def test_default_params_automap(default_params, meta, expected, warnings, caplog api_params = _get_api_params( request, use_api_by_default=USE_API_BY_DEFAULT, - automap_by_default=AUTOMAP_BY_DEFAULT, + automap_by_default=True, default_params=default_params, unsupported_headers=UNSUPPORTED_HEADERS, browser_headers=BROWSER_HEADERS, @@ -775,7 +788,7 @@ def _test_automap(request_kwargs, meta, expected, warnings, caplog): api_params = _get_api_params( request, use_api_by_default=USE_API_BY_DEFAULT, - automap_by_default=AUTOMAP_BY_DEFAULT, + automap_by_default=True, default_params=DEFAULT_PARAMS, unsupported_headers=UNSUPPORTED_HEADERS, browser_headers=BROWSER_HEADERS, From a649756b50789d40d00c2b244c7053b60332336e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 08:30:28 +0200 Subject: [PATCH 20/81] Complete testing scenarios for ZYTE_API_ENABLED --- tests/test_api_requests.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 
7ddee62e..5626bece 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -104,11 +104,27 @@ async def test_response_html(meta: Dict[str, Dict[str, Any]], mockserver): assert not resp.headers +UNSET = object() + + @ensureDeferred -async def test_disable(mockserver): - settings = {"ZYTE_API_ENABLED": False} +@pytest.mark.parametrize( + "setting,enabled", + [ + (UNSET, True), + (True, True), + (False, False), + ], +) +async def test_enabled(setting, enabled, mockserver): + settings = {} + if setting is not UNSET: + settings["ZYTE_API_ENABLED"] = setting async with mockserver.make_handler(settings) as handler: - assert handler is None + if enabled: + assert handler is not None + else: + assert handler is None @pytest.mark.parametrize("zyte_api", [True, False]) @@ -575,9 +591,6 @@ async def test_get_api_params_input_custom(mockserver): ) -UNSET = object() - - @pytest.mark.parametrize( "setting,meta,expected", [ From f95e61d0c2a21bc189a14075cce57116f55e337d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 08:56:48 +0200 Subject: [PATCH 21/81] Refactor test_job_id --- scrapy_zyte_api/handler.py | 7 +- tests/mockserver.py | 6 +- tests/test_api_requests.py | 131 ++++++++++++++++++------------------- 3 files changed, 68 insertions(+), 76 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 3bae50aa..a684e924 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -234,6 +234,7 @@ def _get_api_params( default_params: Dict[str, Any], unsupported_headers: Set[str], browser_headers: Dict[str, str], + job_id: Optional[str], ) -> Optional[dict]: """Returns a dictionary of API parameters that must be sent to Zyte Data API for the specified request, or None if the request should not be driven @@ -284,6 +285,9 @@ def _get_api_params( default_params=default_params, ) + if job_id is not None: + api_params["jobId"] = job_id + return api_params @@ -361,6 +365,7 @@ def 
download_request(self, request: Request, spider: Spider) -> Deferred: default_params=self._zyte_api_default_params, unsupported_headers=self._unsupported_headers, browser_headers=self._browser_headers, + job_id=self._job_id, ) if api_params: return deferred_from_coro( @@ -418,8 +423,6 @@ async def _download_request( ) -> Optional[Union[ZyteAPITextResponse, ZyteAPIResponse]]: # Define url by default api_data = {**{"url": request.url}, **api_params} - if self._job_id is not None: - api_data["jobId"] = self._job_id retrying = request.meta.get("zyte_api_retry_policy") if retrying: retrying = load_object(retrying) diff --git a/tests/mockserver.py b/tests/mockserver.py index 2796dcc8..00ae7050 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -60,11 +60,7 @@ def render_POST(self, request): return json.dumps(response_data).encode() response_data["url"] = request_data["url"] - if request_data.get("jobId") is not None: - html = f"{request_data['jobId']}" - else: - html = "Hello

World!

" - + html = "Hello

World!

" if "browserHtml" in request_data: if "httpResponseBody" in request_data: request.setResponseCode(422) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 5626bece..5f855f0a 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -130,6 +130,9 @@ async def test_enabled(setting, enabled, mockserver): @pytest.mark.parametrize("zyte_api", [True, False]) @ensureDeferred async def test_coro_handling(zyte_api: bool, mockserver): + """ScrapyZyteAPIDownloadHandler.download_request must return a deferred + both when using Zyte Data API and when using the regular downloader + logic.""" settings = {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True}} async with mockserver.make_handler(settings) as handler: req = Request( @@ -176,25 +179,6 @@ async def test_exceptions( assert exception_text in caplog.text -@ensureDeferred -async def test_job_id(mockserver): - job_id = "547773/99/6" - settings = {"JOB": job_id} - async with mockserver.make_handler(settings) as handler: - req = Request( - "http://example.com", - method="POST", - meta={"zyte_api": {"browserHtml": True}}, - ) - resp = await handler.download_request(req, None) - - assert resp.request is req - assert resp.url == req.url - assert resp.status == 200 - assert "zyte-api" in resp.flags - assert resp.body == f"{job_id}".encode("utf8") - - @ensureDeferred async def test_higher_concurrency(): """Send DEFAULT_CLIENT_CONCURRENCY + 1 requests, the first and last taking @@ -544,6 +528,15 @@ async def test_automap( DEFAULT_PARAMS: Dict[str, Any] = {} UNSUPPORTED_HEADERS = {b"cookie", b"user-agent"} USE_API_BY_DEFAULT = False +JOB_ID = None +GET_API_PARAMS_KWARGS = { + "use_api_by_default": USE_API_BY_DEFAULT, + "automap_by_default": AUTOMAP_BY_DEFAULT, + "default_params": DEFAULT_PARAMS, + "unsupported_headers": UNSUPPORTED_HEADERS, + "browser_headers": BROWSER_HEADERS, + "job_id": JOB_ID, +} @ensureDeferred @@ -557,11 +550,7 @@ async def test_get_api_params_input_default(mockserver): 
await handler.download_request(request, None) _get_api_params.assert_called_once_with( request, - use_api_by_default=USE_API_BY_DEFAULT, - automap_by_default=AUTOMAP_BY_DEFAULT, - default_params=DEFAULT_PARAMS, - unsupported_headers=UNSUPPORTED_HEADERS, - browser_headers=BROWSER_HEADERS, + **GET_API_PARAMS_KWARGS, ) @@ -569,6 +558,7 @@ async def test_get_api_params_input_default(mockserver): async def test_get_api_params_input_custom(mockserver): request = Request(url="https://example.com") settings = { + "JOB": "1/2/3", "ZYTE_API_AUTOMAP": False, "ZYTE_API_BROWSER_HEADERS": {"B": "b"}, "ZYTE_API_DEFAULT_PARAMS": {"a": "b"}, @@ -588,6 +578,7 @@ async def test_get_api_params_input_custom(mockserver): default_params={"a": "b"}, unsupported_headers={b"a"}, browser_headers={b"b": "b"}, + job_id="1/2/3", ) @@ -616,11 +607,10 @@ def test_api_toggling(setting, meta, expected): request = Request(url="https://example.com", meta=meta) api_params = _get_api_params( request, - use_api_by_default=setting, - automap_by_default=False, - default_params=DEFAULT_PARAMS, - unsupported_headers=UNSUPPORTED_HEADERS, - browser_headers=BROWSER_HEADERS, + **{ + **GET_API_PARAMS_KWARGS, + "use_api_by_default": setting, + }, ) assert api_params == expected @@ -633,15 +623,27 @@ def test_api_disabling_deprecated(setting, meta): with pytest.warns(DeprecationWarning, match=r".* Use False instead\.$"): api_params = _get_api_params( request, - use_api_by_default=setting, - automap_by_default=False, - default_params=DEFAULT_PARAMS, - unsupported_headers=UNSUPPORTED_HEADERS, - browser_headers=BROWSER_HEADERS, + **{ + **GET_API_PARAMS_KWARGS, + "use_api_by_default": setting, + }, ) assert api_params is None +@ensureDeferred +async def test_job_id(mockserver): + request = Request(url="https://example.com", meta={"zyte_api": True}) + api_params = _get_api_params( + request, + **{ + **GET_API_PARAMS_KWARGS, + "job_id": "1/2/3", + }, + ) + assert api_params["jobId"] == "1/2/3" + + @ensureDeferred async 
def test_default_params_none(mockserver, caplog): request = Request(url="https://example.com") @@ -657,11 +659,10 @@ async def test_default_params_none(mockserver, caplog): await handler.download_request(request, None) _get_api_params.assert_called_once_with( request, - use_api_by_default=USE_API_BY_DEFAULT, - automap_by_default=AUTOMAP_BY_DEFAULT, - default_params={"b": "c"}, - unsupported_headers=UNSUPPORTED_HEADERS, - browser_headers=BROWSER_HEADERS, + **{ + **GET_API_PARAMS_KWARGS, + "default_params": {"b": "c"}, + }, ) assert "Parameter 'a' in the ZYTE_API_DEFAULT_PARAMS setting is None" in caplog.text @@ -685,11 +686,10 @@ def test_default_params_merging(default_params, meta, expected, warnings, caplog with caplog.at_level("WARNING"): api_params = _get_api_params( request, - use_api_by_default=USE_API_BY_DEFAULT, - automap_by_default=False, - default_params=default_params, - unsupported_headers=UNSUPPORTED_HEADERS, - browser_headers=BROWSER_HEADERS, + **{ + **GET_API_PARAMS_KWARGS, + "default_params": default_params, + }, ) assert api_params == expected if warnings: @@ -724,11 +724,10 @@ def test_default_params_automap(default_params, meta, expected, warnings, caplog with caplog.at_level("WARNING"): api_params = _get_api_params( request, - use_api_by_default=USE_API_BY_DEFAULT, - automap_by_default=True, - default_params=default_params, - unsupported_headers=UNSUPPORTED_HEADERS, - browser_headers=BROWSER_HEADERS, + **{ + **GET_API_PARAMS_KWARGS, + "automap_by_default": True, + }, ) assert api_params == expected if warnings: @@ -744,11 +743,10 @@ def test_default_params_immutability(): default_params = {"a": "b"} _get_api_params( request, - use_api_by_default=USE_API_BY_DEFAULT, - automap_by_default=AUTOMAP_BY_DEFAULT, - default_params=default_params, - unsupported_headers=UNSUPPORTED_HEADERS, - browser_headers=BROWSER_HEADERS, + **{ + **GET_API_PARAMS_KWARGS, + "default_params": default_params, + }, ) assert default_params == {"a": "b"} @@ -760,11 +758,7 @@ 
def test_bad_meta_type(meta): with pytest.raises(ValueError): _get_api_params( request, - use_api_by_default=USE_API_BY_DEFAULT, - automap_by_default=False, - default_params=DEFAULT_PARAMS, - unsupported_headers=UNSUPPORTED_HEADERS, - browser_headers=BROWSER_HEADERS, + **GET_API_PARAMS_KWARGS, ) @@ -785,11 +779,11 @@ def test_automap_toggling(setting, meta, expected): request.meta["zyte_api_automap"] = meta api_params = _get_api_params( request, - use_api_by_default=True, - automap_by_default=setting, - default_params=DEFAULT_PARAMS, - unsupported_headers=UNSUPPORTED_HEADERS, - browser_headers=BROWSER_HEADERS, + **{ + **GET_API_PARAMS_KWARGS, + "use_api_by_default": True, + "automap_by_default": setting, + }, ) assert bool(api_params) == expected @@ -800,11 +794,10 @@ def _test_automap(request_kwargs, meta, expected, warnings, caplog): with caplog.at_level("WARNING"): api_params = _get_api_params( request, - use_api_by_default=USE_API_BY_DEFAULT, - automap_by_default=True, - default_params=DEFAULT_PARAMS, - unsupported_headers=UNSUPPORTED_HEADERS, - browser_headers=BROWSER_HEADERS, + **{ + **GET_API_PARAMS_KWARGS, + "automap_by_default": True, + }, ) assert api_params == expected if warnings: From e0ac08956a363542b11b8ec2ce588092371b7083 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 09:11:59 +0200 Subject: [PATCH 22/81] Fix test_higher_concurrency Before it could technically succeed even if the tested feature was not working: in the time slow requests were processing, the first request could be resolved, and then the last request could be sent and resolved. 
--- tests/test_api_requests.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 5f855f0a..4665c076 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -8,6 +8,7 @@ from _pytest.logging import LogCaptureFixture # NOQA from pytest_twisted import ensureDeferred from scrapy import Request, Spider +from scrapy.exceptions import CloseSpider from scrapy.http import Response, TextResponse from scrapy.settings.default_settings import DEFAULT_REQUEST_HEADERS from scrapy.settings.default_settings import USER_AGENT as DEFAULT_USER_AGENT @@ -181,13 +182,15 @@ async def test_exceptions( @ensureDeferred async def test_higher_concurrency(): - """Send DEFAULT_CLIENT_CONCURRENCY + 1 requests, the first and last taking - less time than the rest, and ensure that the first 2 responses are the - first and the last, verifying that a concurrency ≥ - DEFAULT_CLIENT_CONCURRENCY + 1 has been reached.""" + """Make sure that CONCURRENT_REQUESTS and CONCURRENT_REQUESTS_PER_DOMAIN + have an effect on Zyte Data API requests.""" + # Send DEFAULT_CLIENT_CONCURRENCY + 1 requests, the last one taking less + # time than the rest, and ensure that the first response comes from the + # last request, verifying that a concurrency ≥ DEFAULT_CLIENT_CONCURRENCY + # + 1 has been reached. 
concurrency = DEFAULT_CLIENT_CONCURRENCY + 1 response_indexes = [] - expected_first_indexes = {0, concurrency - 1} + expected_first_index = concurrency - 1 fast_seconds = 0.001 slow_seconds = 0.2 @@ -206,7 +209,7 @@ def start_requests(self): "browserHtml": True, "delay": ( fast_seconds - if index in expected_first_indexes + if index == expected_first_index else slow_seconds ), }, @@ -216,6 +219,7 @@ def start_requests(self): async def parse(self, response): response_indexes.append(response.meta["index"]) + raise CloseSpider crawler = get_crawler( TestSpider, @@ -228,9 +232,7 @@ async def parse(self, response): ) await crawler.crawl() - assert ( - set(response_indexes[: len(expected_first_indexes)]) == expected_first_indexes - ) + assert response_indexes[0] == expected_first_index @ensureDeferred From 3508beefb43ffdc024e99b5687ffc5eeed5799ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 09:13:15 +0200 Subject: [PATCH 23/81] Move test_automap next to related tests --- tests/test_api_requests.py | 784 ++++++++++++++++++------------------- 1 file changed, 392 insertions(+), 392 deletions(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 4665c076..838905e0 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -235,296 +235,6 @@ async def parse(self, response): assert response_indexes[0] == expected_first_index -@ensureDeferred -@pytest.mark.skipif(sys.version_info < (3, 8), reason="unittest.mock.AsyncMock") -@pytest.mark.parametrize( - "request_kwargs,settings,expected,warnings", - [ - # The Accept and Accept-Language headers, when unsupported, are dropped - # silently if their value matches the default value of Scrapy for - # DEFAULT_REQUEST_HEADERS, or with a warning otherwise. 
- ( - { - "headers": DEFAULT_REQUEST_HEADERS, - "meta": {"zyte_api": {"browserHtml": True}}, - }, - {}, - { - "browserHtml": True, - "httpResponseHeaders": True, - }, - [], - ), - ( - { - "headers": { - "Accept": "application/json", - "Accept-Language": "uk", - }, - "meta": {"zyte_api": {"browserHtml": True}}, - }, - {}, - { - "browserHtml": True, - "httpResponseHeaders": True, - }, - ["cannot be mapped"], - ), - # The Cookie header is dropped with a warning. - ( - { - "headers": { - "Cookie": "a=b", - }, - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, - ["cannot be mapped"], - ), - ( - { - "headers": { - "Cookie": "a=b", - }, - "meta": {"zyte_api": {"browserHtml": True}}, - }, - {}, - { - "browserHtml": True, - "httpResponseHeaders": True, - }, - ["cannot be mapped"], - ), - # The User-Agent header, which Scrapy sets by default, is dropped - # silently if it matches the default value of the USER_AGENT setting, - # or with a warning otherwise. - ( - { - "headers": {"User-Agent": DEFAULT_USER_AGENT}, - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, - [], - ), - ( - { - "headers": {"User-Agent": ""}, - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, - ["cannot be mapped"], - ), - ( - { - "headers": {"User-Agent": DEFAULT_USER_AGENT}, - "meta": {"zyte_api": {"browserHtml": True}}, - }, - {}, - { - "browserHtml": True, - "httpResponseHeaders": True, - }, - [], - ), - ( - { - "headers": {"User-Agent": ""}, - "meta": {"zyte_api": {"browserHtml": True}}, - }, - {}, - { - "browserHtml": True, - "httpResponseHeaders": True, - }, - ["cannot be mapped"], - ), - # You may update the ZYTE_API_UNSUPPORTED_HEADERS setting to remove - # headers that the customHttpRequestHeaders parameter starts supporting - # in the future. 
- ( - { - "headers": { - "Cookie": "", - "User-Agent": "", - }, - }, - { - "ZYTE_API_UNSUPPORTED_HEADERS": ["Cookie"], - }, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "customHttpRequestHeaders": [ - {"name": "User-Agent", "value": ""}, - ], - }, - [ - "defines header b'Cookie', which cannot be mapped", - ], - ), - # You may update the ZYTE_API_BROWSER_HEADERS setting to extend support - # for new fields that the requestHeaders parameter may support in the - # future. - ( - { - "headers": {"User-Agent": ""}, - "meta": {"zyte_api": {"browserHtml": True}}, - }, - { - "ZYTE_API_BROWSER_HEADERS": { - "Referer": "referer", - "User-Agent": "userAgent", - }, - }, - { - "browserHtml": True, - "httpResponseHeaders": True, - "requestHeaders": {"userAgent": ""}, - }, - [], - ), - # BODY - # The body is copied into httpRequestBody, base64-encoded. - ( - { - "body": "a", - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "httpRequestBody": "YQ==", - }, - [], - ), - # httpRequestBody defined in meta takes precedence, but it causes a - # warning. - ( - { - "body": "a", - "meta": {"zyte_api": {"httpRequestBody": "Yg=="}}, - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "httpRequestBody": "Yg==", - }, - [ - "Use Request.body instead", - "does not match the Zyte Data API httpRequestBody parameter", - ], - ), - # httpRequestBody defined in meta causes a warning even if it matches - # request.body. - ( - { - "body": "a", - "meta": {"zyte_api": {"httpRequestBody": "YQ=="}}, - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "httpRequestBody": "YQ==", - }, - ["Use Request.body instead"], - ), - # A body should not be used unless httpResponseBody is also used. 
- ( - { - "body": "a", - "meta": {"zyte_api": {"browserHtml": True}}, - }, - {}, - { - "browserHtml": True, - "httpResponseHeaders": True, - }, - ["can only be set when the httpResponseBody parameter"], - ), - ( - { - "body": "a", - "meta": {"zyte_api": {"screenshot": True}}, - }, - {}, - { - "screenshot": True, - }, - ["can only be set when the httpResponseBody parameter"], - ), - # httpResponseHeaders - # Warn if httpResponseHeaders is defined unnecessarily. - ( - { - "meta": {"zyte_api": {"httpResponseHeaders": True}}, - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, - ["do not need to set httpResponseHeaders"], - ), - ], -) -async def test_automap( - request_kwargs: Dict[str, Any], - settings: Dict[str, Any], - expected: Union[Dict[str, str], Literal[False]], - warnings: List[str], - mockserver, - caplog, -): - settings.update({"ZYTE_API_ON_ALL_REQUESTS": True, "ZYTE_API_AUTOMAP": True}) - async with mockserver.make_handler(settings) as handler: - if expected is False: - # Only the Zyte Data API client is mocked, meaning requests that - # do not go through Zyte Data API are actually sent, so we point - # them to the mock server to avoid internet connections in tests. 
- request_kwargs["url"] = mockserver.urljoin("/") - else: - request_kwargs["url"] = "https://toscrape.com" - request = Request(**request_kwargs) - unmocked_client = handler._client - handler._client = mock.AsyncMock(unmocked_client) - handler._client.request_raw.side_effect = unmocked_client.request_raw - with caplog.at_level("WARNING"): - await handler.download_request(request, None) - - # What we're interested in is the Request call in the API - request_call = [ - c for c in handler._client.mock_calls if "request_raw(" in str(c) - ] - - if expected is False: - assert request_call == [] - return - - if not request_call: - pytest.fail("The client's request_raw() method was not called.") - - args_used = request_call[0].args[0] - args_used.pop("url") - assert args_used == expected - - if warnings: - for warning in warnings: - assert warning in caplog.text - else: - assert not caplog.records - - AUTOMAP_BY_DEFAULT = False BROWSER_HEADERS = {b"referer": "referer"} DEFAULT_PARAMS: Dict[str, Any] = {} @@ -692,121 +402,411 @@ def test_default_params_merging(default_params, meta, expected, warnings, caplog **GET_API_PARAMS_KWARGS, "default_params": default_params, }, - ) - assert api_params == expected - if warnings: - for warning in warnings: - assert warning in caplog.text - else: - assert not caplog.records - - -@pytest.mark.xfail(reason="To be implemented", strict=True) -@pytest.mark.parametrize( - "default_params,meta,expected,warnings", - [ + ) + assert api_params == expected + if warnings: + for warning in warnings: + assert warning in caplog.text + else: + assert not caplog.records + + +@pytest.mark.xfail(reason="To be implemented", strict=True) +@pytest.mark.parametrize( + "default_params,meta,expected,warnings", + [ + ( + {"screenshot": True, "httpResponseHeaders": True}, + {"browserHtml": True}, + {"browserHtml": True, "httpResponseHeaders": True, "screenshot": True}, + [], + ), + ( + {"browserHtml": True, "httpResponseHeaders": False}, + {"screenshot": True, 
"browserHtml": False}, + {"screenshot": True}, + [], + ), + ], +) +def test_default_params_automap(default_params, meta, expected, warnings, caplog): + """Warnings about unneeded parameters should not apply if those parameters are needed to extend or override default parameters.""" + request = Request(url="https://example.com") + request.meta["zyte_api"] = meta + with caplog.at_level("WARNING"): + api_params = _get_api_params( + request, + **{ + **GET_API_PARAMS_KWARGS, + "automap_by_default": True, + }, + ) + assert api_params == expected + if warnings: + for warning in warnings: + assert warning in caplog.text + else: + assert not caplog.records + + +def test_default_params_immutability(): + request = Request(url="https://example.com") + request.meta["zyte_api"] = {"a": None} + default_params = {"a": "b"} + _get_api_params( + request, + **{ + **GET_API_PARAMS_KWARGS, + "default_params": default_params, + }, + ) + assert default_params == {"a": "b"} + + +@pytest.mark.parametrize("meta", [1, ["a", "b"]]) +def test_bad_meta_type(meta): + request = Request(url="https://example.com") + request.meta["zyte_api"] = meta + with pytest.raises(ValueError): + _get_api_params( + request, + **GET_API_PARAMS_KWARGS, + ) + + +@pytest.mark.parametrize( + "setting,meta,expected", + [ + (False, UNSET, False), + (False, False, False), + (False, True, True), + (True, UNSET, True), + (True, False, False), + (True, True, True), + ], +) +def test_automap_toggling(setting, meta, expected): + request = Request(url="https://example.com") + if meta is not UNSET: + request.meta["zyte_api_automap"] = meta + api_params = _get_api_params( + request, + **{ + **GET_API_PARAMS_KWARGS, + "use_api_by_default": True, + "automap_by_default": setting, + }, + ) + assert bool(api_params) == expected + + +def _test_automap(request_kwargs, meta, expected, warnings, caplog): + request = Request(url="https://example.com", **request_kwargs) + request.meta["zyte_api"] = meta + with caplog.at_level("WARNING"): 
+ api_params = _get_api_params( + request, + **{ + **GET_API_PARAMS_KWARGS, + "automap_by_default": True, + }, + ) + assert api_params == expected + if warnings: + for warning in warnings: + assert warning in caplog.text + else: + assert not caplog.records + + +@ensureDeferred +@pytest.mark.skipif(sys.version_info < (3, 8), reason="unittest.mock.AsyncMock") +@pytest.mark.parametrize( + "request_kwargs,settings,expected,warnings", + [ + # The Accept and Accept-Language headers, when unsupported, are dropped + # silently if their value matches the default value of Scrapy for + # DEFAULT_REQUEST_HEADERS, or with a warning otherwise. + ( + { + "headers": DEFAULT_REQUEST_HEADERS, + "meta": {"zyte_api": {"browserHtml": True}}, + }, + {}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + { + "headers": { + "Accept": "application/json", + "Accept-Language": "uk", + }, + "meta": {"zyte_api": {"browserHtml": True}}, + }, + {}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + # The Cookie header is dropped with a warning. + ( + { + "headers": { + "Cookie": "a=b", + }, + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + ( + { + "headers": { + "Cookie": "a=b", + }, + "meta": {"zyte_api": {"browserHtml": True}}, + }, + {}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + # The User-Agent header, which Scrapy sets by default, is dropped + # silently if it matches the default value of the USER_AGENT setting, + # or with a warning otherwise. 
+ ( + { + "headers": {"User-Agent": DEFAULT_USER_AGENT}, + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + { + "headers": {"User-Agent": ""}, + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + ( + { + "headers": {"User-Agent": DEFAULT_USER_AGENT}, + "meta": {"zyte_api": {"browserHtml": True}}, + }, + {}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + { + "headers": {"User-Agent": ""}, + "meta": {"zyte_api": {"browserHtml": True}}, + }, + {}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + # You may update the ZYTE_API_UNSUPPORTED_HEADERS setting to remove + # headers that the customHttpRequestHeaders parameter starts supporting + # in the future. + ( + { + "headers": { + "Cookie": "", + "User-Agent": "", + }, + }, + { + "ZYTE_API_UNSUPPORTED_HEADERS": ["Cookie"], + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "customHttpRequestHeaders": [ + {"name": "User-Agent", "value": ""}, + ], + }, + [ + "defines header b'Cookie', which cannot be mapped", + ], + ), + # You may update the ZYTE_API_BROWSER_HEADERS setting to extend support + # for new fields that the requestHeaders parameter may support in the + # future. ( - {"screenshot": True, "httpResponseHeaders": True}, - {"browserHtml": True}, - {"browserHtml": True, "httpResponseHeaders": True, "screenshot": True}, + { + "headers": {"User-Agent": ""}, + "meta": {"zyte_api": {"browserHtml": True}}, + }, + { + "ZYTE_API_BROWSER_HEADERS": { + "Referer": "referer", + "User-Agent": "userAgent", + }, + }, + { + "browserHtml": True, + "httpResponseHeaders": True, + "requestHeaders": {"userAgent": ""}, + }, [], ), + # BODY + # The body is copied into httpRequestBody, base64-encoded. 
( - {"browserHtml": True, "httpResponseHeaders": False}, - {"screenshot": True, "browserHtml": False}, - {"screenshot": True}, + { + "body": "a", + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "httpRequestBody": "YQ==", + }, [], ), + # httpRequestBody defined in meta takes precedence, but it causes a + # warning. + ( + { + "body": "a", + "meta": {"zyte_api": {"httpRequestBody": "Yg=="}}, + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "httpRequestBody": "Yg==", + }, + [ + "Use Request.body instead", + "does not match the Zyte Data API httpRequestBody parameter", + ], + ), + # httpRequestBody defined in meta causes a warning even if it matches + # request.body. + ( + { + "body": "a", + "meta": {"zyte_api": {"httpRequestBody": "YQ=="}}, + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "httpRequestBody": "YQ==", + }, + ["Use Request.body instead"], + ), + # A body should not be used unless httpResponseBody is also used. + ( + { + "body": "a", + "meta": {"zyte_api": {"browserHtml": True}}, + }, + {}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + ["can only be set when the httpResponseBody parameter"], + ), + ( + { + "body": "a", + "meta": {"zyte_api": {"screenshot": True}}, + }, + {}, + { + "screenshot": True, + }, + ["can only be set when the httpResponseBody parameter"], + ), + # httpResponseHeaders + # Warn if httpResponseHeaders is defined unnecessarily. 
+ ( + { + "meta": {"zyte_api": {"httpResponseHeaders": True}}, + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["do not need to set httpResponseHeaders"], + ), ], ) -def test_default_params_automap(default_params, meta, expected, warnings, caplog): - """Warnings about unneeded parameters should not apply if those parameters are needed to extend or override default parameters.""" - request = Request(url="https://example.com") - request.meta["zyte_api"] = meta - with caplog.at_level("WARNING"): - api_params = _get_api_params( - request, - **{ - **GET_API_PARAMS_KWARGS, - "automap_by_default": True, - }, - ) - assert api_params == expected - if warnings: - for warning in warnings: - assert warning in caplog.text - else: - assert not caplog.records - - -def test_default_params_immutability(): - request = Request(url="https://example.com") - request.meta["zyte_api"] = {"a": None} - default_params = {"a": "b"} - _get_api_params( - request, - **{ - **GET_API_PARAMS_KWARGS, - "default_params": default_params, - }, - ) - assert default_params == {"a": "b"} - +async def test_automap( + request_kwargs: Dict[str, Any], + settings: Dict[str, Any], + expected: Union[Dict[str, str], Literal[False]], + warnings: List[str], + mockserver, + caplog, +): + settings.update({"ZYTE_API_ON_ALL_REQUESTS": True, "ZYTE_API_AUTOMAP": True}) + async with mockserver.make_handler(settings) as handler: + if expected is False: + # Only the Zyte Data API client is mocked, meaning requests that + # do not go through Zyte Data API are actually sent, so we point + # them to the mock server to avoid internet connections in tests. 
+ request_kwargs["url"] = mockserver.urljoin("/") + else: + request_kwargs["url"] = "https://toscrape.com" + request = Request(**request_kwargs) + unmocked_client = handler._client + handler._client = mock.AsyncMock(unmocked_client) + handler._client.request_raw.side_effect = unmocked_client.request_raw + with caplog.at_level("WARNING"): + await handler.download_request(request, None) -@pytest.mark.parametrize("meta", [1, ["a", "b"]]) -def test_bad_meta_type(meta): - request = Request(url="https://example.com") - request.meta["zyte_api"] = meta - with pytest.raises(ValueError): - _get_api_params( - request, - **GET_API_PARAMS_KWARGS, - ) + # What we're interested in is the Request call in the API + request_call = [ + c for c in handler._client.mock_calls if "request_raw(" in str(c) + ] + if expected is False: + assert request_call == [] + return -@pytest.mark.parametrize( - "setting,meta,expected", - [ - (False, UNSET, False), - (False, False, False), - (False, True, True), - (True, UNSET, True), - (True, False, False), - (True, True, True), - ], -) -def test_automap_toggling(setting, meta, expected): - request = Request(url="https://example.com") - if meta is not UNSET: - request.meta["zyte_api_automap"] = meta - api_params = _get_api_params( - request, - **{ - **GET_API_PARAMS_KWARGS, - "use_api_by_default": True, - "automap_by_default": setting, - }, - ) - assert bool(api_params) == expected + if not request_call: + pytest.fail("The client's request_raw() method was not called.") + args_used = request_call[0].args[0] + args_used.pop("url") + assert args_used == expected -def _test_automap(request_kwargs, meta, expected, warnings, caplog): - request = Request(url="https://example.com", **request_kwargs) - request.meta["zyte_api"] = meta - with caplog.at_level("WARNING"): - api_params = _get_api_params( - request, - **{ - **GET_API_PARAMS_KWARGS, - "automap_by_default": True, - }, - ) - assert api_params == expected - if warnings: - for warning in warnings: - 
assert warning in caplog.text - else: - assert not caplog.records + if warnings: + for warning in warnings: + assert warning in caplog.text + else: + assert not caplog.records @pytest.mark.parametrize( From 4f36bdaedb83087d12dd90ac8c08f8564c446b90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 09:47:22 +0200 Subject: [PATCH 24/81] Test the side effects of the output of _get_api_params --- scrapy_zyte_api/handler.py | 2 +- tests/test_api_requests.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index a684e924..d1fcfaa2 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -367,7 +367,7 @@ def download_request(self, request: Request, spider: Spider) -> Deferred: browser_headers=self._browser_headers, job_id=self._job_id, ) - if api_params: + if api_params is not None: return deferred_from_coro( self._download_request(api_params, request, spider) ) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 838905e0..77bde649 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -294,6 +294,38 @@ async def test_get_api_params_input_custom(mockserver): ) +@ensureDeferred +@pytest.mark.skipif(sys.version_info < (3, 8), reason="unittest.mock.AsyncMock") +@pytest.mark.parametrize( + "output,uses_zyte_api", + [ + (None, False), + ({}, True), + ({"a": "b"}, True), + ], +) +async def test_get_api_params_output_side_effects(output, uses_zyte_api, mockserver): + """If _get_api_params returns None, requests go outside Zyte API, but if it + returns a dictionary, even if empty, requests go through Zyte API.""" + request = Request(url=mockserver.urljoin("/")) + async with mockserver.make_handler() as handler: + patch_path = "scrapy_zyte_api.handler._get_api_params" + with patch(patch_path) as _get_api_params: + patch_path = "scrapy_zyte_api.handler.super" + with 
patch(patch_path) as super: + handler._download_request = mock.AsyncMock(side_effect=RuntimeError) + super_mock = mock.Mock() + super_mock.download_request = mock.AsyncMock(side_effect=RuntimeError) + super.return_value = super_mock + _get_api_params.return_value = output + with pytest.raises(RuntimeError): + await handler.download_request(request, None) + if uses_zyte_api: + handler._download_request.assert_called() + else: + super_mock.download_request.assert_called() + + @pytest.mark.parametrize( "setting,meta,expected", [ From 3bfb787420dc9004bffe515c8013d1ba4e2d79e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 09:57:06 +0200 Subject: [PATCH 25/81] test_use_api_by_default: add a docstring --- tests/test_api_requests.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 77bde649..ccbfcfad 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -336,7 +336,6 @@ async def test_get_api_params_output_side_effects(output, uses_zyte_api, mockser (False, {"zyte_api": True}, {}), (False, {"zyte_api": {}}, {}), (False, {"zyte_api": {"a": "b"}}, {"a": "b"}), - (False, {"zyte_api": {"browserHtml": True}}, {"browserHtml": True}), (True, None, {}), (True, {}, {}), (True, {"a": "b"}, {}), @@ -344,10 +343,19 @@ async def test_get_api_params_output_side_effects(output, uses_zyte_api, mockser (True, {"zyte_api": True}, {}), (True, {"zyte_api": {}}, {}), (True, {"zyte_api": {"a": "b"}}, {"a": "b"}), - (True, {"zyte_api": {"browserHtml": True}}, {"browserHtml": True}), ], ) -def test_api_toggling(setting, meta, expected): +def test_use_api_by_default(setting, meta, expected): + """Test how the value of the ZYTE_API_ON_ALL_REQUESTS (*setting*) in + combination with request metadata (*meta*) determines what Zyte Data API + parameters are used (*expected*). 
+ + Note that :func:`test_get_api_params_output_side_effects` already tests how + *expected* affects whether the request is sent through Zyte Data API or + not, and :func:`test_get_api_params_input_custom` tests how the + ZYTE_API_ON_ALL_REQUESTS setting is mapped to the corresponding + :func:`~scrapy_zyte_api.handler._get_api_params`` parameter. + """ request = Request(url="https://example.com", meta=meta) api_params = _get_api_params( request, From 3cf94b3e2f95643f32a251fdf1360368adf2b6ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 10:14:36 +0200 Subject: [PATCH 26/81] test_api_disabling_deprecated: add a docstring --- tests/test_api_requests.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index ccbfcfad..99913925 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -370,6 +370,9 @@ def test_use_api_by_default(setting, meta, expected): @pytest.mark.parametrize("setting", [False, True]) @pytest.mark.parametrize("meta", [None, 0, "", b"", []]) def test_api_disabling_deprecated(setting, meta): + """Test how undocumented falsy values of the ``zyte_api`` request metadata + key (*meta*) can be used to disable the use of Zyte Data API, but trigger a + deprecation warning asking to replace them with False.""" request = Request(url="https://example.com") request.meta["zyte_api"] = meta with pytest.warns(DeprecationWarning, match=r".* Use False instead\.$"): From 13bcde63fe7e5d26db872c18bb2c21a190aee47b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 10:18:42 +0200 Subject: [PATCH 27/81] test_job_id: add a docstring --- tests/test_api_requests.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 99913925..794615f2 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -345,16 +345,16 @@ async 
def test_get_api_params_output_side_effects(output, uses_zyte_api, mockser (True, {"zyte_api": {"a": "b"}}, {"a": "b"}), ], ) -def test_use_api_by_default(setting, meta, expected): - """Test how the value of the ZYTE_API_ON_ALL_REQUESTS (*setting*) in - combination with request metadata (*meta*) determines what Zyte Data API +def test_api_toggling(setting, meta, expected): + """Test how the value of the ZYTE_API_ON_ALL_REQUESTS setting (*setting*) + in combination with request metadata (*meta*) determines what Zyte Data API parameters are used (*expected*). Note that :func:`test_get_api_params_output_side_effects` already tests how *expected* affects whether the request is sent through Zyte Data API or not, and :func:`test_get_api_params_input_custom` tests how the ZYTE_API_ON_ALL_REQUESTS setting is mapped to the corresponding - :func:`~scrapy_zyte_api.handler._get_api_params`` parameter. + :func:`~scrapy_zyte_api.handler._get_api_params` parameter. """ request = Request(url="https://example.com", meta=meta) api_params = _get_api_params( @@ -388,6 +388,13 @@ def test_api_disabling_deprecated(setting, meta): @ensureDeferred async def test_job_id(mockserver): + """Test how the value of the JOB setting (*setting*) is included as + ``jobId`` among the parameters sent to Zyte Data API. + + Note that :func:`test_get_api_params_input_custom` already tests how the + JOB setting is mapped to the corresponding + :func:`~scrapy_zyte_api.handler._get_api_params` parameter. 
+ """ request = Request(url="https://example.com", meta={"zyte_api": True}) api_params = _get_api_params( request, From 871f178463992f2c9c7ddfedd52c18ecb9944abe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 10:23:11 +0200 Subject: [PATCH 28/81] test_default_params_none: add a docstring --- tests/test_api_requests.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 794615f2..e02e42e8 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -346,14 +346,14 @@ async def test_get_api_params_output_side_effects(output, uses_zyte_api, mockser ], ) def test_api_toggling(setting, meta, expected): - """Test how the value of the ZYTE_API_ON_ALL_REQUESTS setting (*setting*) - in combination with request metadata (*meta*) determines what Zyte Data API - parameters are used (*expected*). + """Test how the value of the ``ZYTE_API_ON_ALL_REQUESTS`` setting + (*setting*) in combination with request metadata (*meta*) determines what + Zyte Data API parameters are used (*expected*). Note that :func:`test_get_api_params_output_side_effects` already tests how *expected* affects whether the request is sent through Zyte Data API or not, and :func:`test_get_api_params_input_custom` tests how the - ZYTE_API_ON_ALL_REQUESTS setting is mapped to the corresponding + ``ZYTE_API_ON_ALL_REQUESTS`` setting is mapped to the corresponding :func:`~scrapy_zyte_api.handler._get_api_params` parameter. """ request = Request(url="https://example.com", meta=meta) @@ -388,11 +388,11 @@ def test_api_disabling_deprecated(setting, meta): @ensureDeferred async def test_job_id(mockserver): - """Test how the value of the JOB setting (*setting*) is included as + """Test how the value of the ``JOB`` setting (*setting*) is included as ``jobId`` among the parameters sent to Zyte Data API. 
Note that :func:`test_get_api_params_input_custom` already tests how the - JOB setting is mapped to the corresponding + ``JOB`` setting is mapped to the corresponding :func:`~scrapy_zyte_api.handler._get_api_params` parameter. """ request = Request(url="https://example.com", meta={"zyte_api": True}) @@ -408,6 +408,18 @@ async def test_job_id(mockserver): @ensureDeferred async def test_default_params_none(mockserver, caplog): + """Test how setting a value to ``None`` in the dictionary of the + ZYTE_API_DEFAULT_PARAMS setting causes a warning, because that is not + expected to be a valid value. + + Note that ``None`` is however a valid value for parameters defined in the + ``zyte_api`` request metadata key to unset parameters set in the + ZYTE_API_DEFAULT_PARAMS setting for a specific request. + + Also note that :func:`test_get_api_params_input_custom` already tests how + the ``ZYTE_API_DEFAULT_PARAMS`` setting is mapped to the corresponding + :func:`~scrapy_zyte_api.handler._get_api_params` parameter. + """ request = Request(url="https://example.com") settings = { "ZYTE_API_DEFAULT_PARAMS": {"a": None, "b": "c"}, From 558a32bb924a960cf02b285e17bed92ef839b16f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 10:31:24 +0200 Subject: [PATCH 29/81] test_default_params_merging: add a docstring --- tests/test_api_requests.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index e02e42e8..9149c729 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -413,8 +413,8 @@ async def test_default_params_none(mockserver, caplog): expected to be a valid value. Note that ``None`` is however a valid value for parameters defined in the - ``zyte_api`` request metadata key to unset parameters set in the - ZYTE_API_DEFAULT_PARAMS setting for a specific request. + ``zyte_api`` request metadata key. 
It can be used to unset parameters set + in the ``ZYTE_API_DEFAULT_PARAMS`` setting for that specific request. Also note that :func:`test_get_api_params_input_custom` already tests how the ``ZYTE_API_DEFAULT_PARAMS`` setting is mapped to the corresponding @@ -442,7 +442,7 @@ async def test_default_params_none(mockserver, caplog): @pytest.mark.parametrize( - "default_params,meta,expected,warnings", + "setting,meta,expected,warnings", [ ({}, {}, {}, []), ({}, {"b": 2}, {"b": 2}, []), @@ -454,7 +454,19 @@ async def test_default_params_none(mockserver, caplog): ({"a": 1}, {"a": None}, {}, []), ], ) -def test_default_params_merging(default_params, meta, expected, warnings, caplog): +def test_default_params_merging(setting, meta, expected, warnings, caplog): + """Test how Zyte Data API parameters defined in the + ``ZYTE_API_DEFAULT_PARAMS`` setting (*setting*) and those defined in the ``zyte_api`` request metadata key (*meta*) are combined. + + Request metadata takes precedence. Also, ``None`` values in request + metadata can be used to unset parameters defined in the setting. Request + metadata ``None`` values for keys that do not exist in the setting cause a + warning. + + Note that :func:`test_get_api_params_input_custom` already tests how the + ``ZYTE_API_DEFAULT_PARAMS`` setting is mapped to the corresponding + :func:`~scrapy_zyte_api.handler._get_api_params` parameter. 
+ """ request = Request(url="https://example.com") request.meta["zyte_api"] = meta with caplog.at_level("WARNING"): @@ -462,7 +474,7 @@ def test_default_params_merging(default_params, meta, expected, warnings, caplog request, **{ **GET_API_PARAMS_KWARGS, - "default_params": default_params, + "default_params": setting, }, ) assert api_params == expected From c661d384ab4f1bb95bc99d1845d8c4ab8c0eb785 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 10:32:57 +0200 Subject: [PATCH 30/81] Move test_default_params_automap after default params and automap tests --- tests/test_api_requests.py | 77 +++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 9149c729..afa496ad 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -485,44 +485,6 @@ def test_default_params_merging(setting, meta, expected, warnings, caplog): assert not caplog.records -@pytest.mark.xfail(reason="To be implemented", strict=True) -@pytest.mark.parametrize( - "default_params,meta,expected,warnings", - [ - ( - {"screenshot": True, "httpResponseHeaders": True}, - {"browserHtml": True}, - {"browserHtml": True, "httpResponseHeaders": True, "screenshot": True}, - [], - ), - ( - {"browserHtml": True, "httpResponseHeaders": False}, - {"screenshot": True, "browserHtml": False}, - {"screenshot": True}, - [], - ), - ], -) -def test_default_params_automap(default_params, meta, expected, warnings, caplog): - """Warnings about unneeded parameters should not apply if those parameters are needed to extend or override default parameters.""" - request = Request(url="https://example.com") - request.meta["zyte_api"] = meta - with caplog.at_level("WARNING"): - api_params = _get_api_params( - request, - **{ - **GET_API_PARAMS_KWARGS, - "automap_by_default": True, - }, - ) - assert api_params == expected - if warnings: - for warning in warnings: - assert 
warning in caplog.text - else: - assert not caplog.records - - def test_default_params_immutability(): request = Request(url="https://example.com") request.meta["zyte_api"] = {"a": None} @@ -1459,3 +1421,42 @@ def test_automap_headers(headers, meta, expected, warnings, caplog): ) def test_automap_default_parameter_cleanup(meta, expected, warnings, caplog): _test_automap({}, meta, expected, warnings, caplog) + + +@pytest.mark.xfail(reason="To be implemented", strict=True) +@pytest.mark.parametrize( + "default_params,meta,expected,warnings", + [ + ( + {"screenshot": True, "httpResponseHeaders": True}, + {"browserHtml": True}, + {"browserHtml": True, "httpResponseHeaders": True, "screenshot": True}, + [], + ), + ( + {"browserHtml": True, "httpResponseHeaders": False}, + {"screenshot": True, "browserHtml": False}, + {"screenshot": True}, + [], + ), + ], +) +def test_default_params_automap(default_params, meta, expected, warnings, caplog): + """Warnings about unneeded parameters should not apply if those parameters + are needed to extend or override default parameters.""" + request = Request(url="https://example.com") + request.meta["zyte_api"] = meta + with caplog.at_level("WARNING"): + api_params = _get_api_params( + request, + **{ + **GET_API_PARAMS_KWARGS, + "automap_by_default": True, + }, + ) + assert api_params == expected + if warnings: + for warning in warnings: + assert warning in caplog.text + else: + assert not caplog.records From a934bee7ff26518636747b4b24f68dd713113703 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 10:50:43 +0200 Subject: [PATCH 31/81] test_default_params_immutability: add a docstring --- tests/test_api_requests.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index afa496ad..b09870a0 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -486,6 +486,10 @@ def test_default_params_merging(setting, meta, expected, 
warnings, caplog): def test_default_params_immutability(): + """Make sure that the merging of Zyte Data API parameters from the + ``ZYTE_API_DEFAULT_PARAMS`` setting with those from the ``zyte_api`` + request metadata key does not affect the contents of the setting for later + requests.""" request = Request(url="https://example.com") request.meta["zyte_api"] = {"a": None} default_params = {"a": "b"} From 0bdf54e6839d02ebe038dba0873ac6f5a4f4190a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 10:52:25 +0200 Subject: [PATCH 32/81] test_bad_meta_type: reorder and add a docstring --- tests/test_api_requests.py | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index b09870a0..775e1932 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -386,6 +386,19 @@ def test_api_disabling_deprecated(setting, meta): assert api_params is None +@pytest.mark.parametrize("meta", [1, ["a", "b"]]) +def test_bad_meta_type(meta): + """Test how undocumented truthy values for the ``zyte_api`` request + metadata key (*meta*) trigger a :exc:`ValueError` exception.""" + request = Request(url="https://example.com") + request.meta["zyte_api"] = meta + with pytest.raises(ValueError): + _get_api_params( + request, + **GET_API_PARAMS_KWARGS, + ) + + @ensureDeferred async def test_job_id(mockserver): """Test how the value of the ``JOB`` setting (*setting*) is included as @@ -485,6 +498,19 @@ def test_default_params_merging(setting, meta, expected, warnings, caplog): assert not caplog.records +@pytest.mark.parametrize( + "meta,expected,warnings", + [ + ({}, {}, {}, []), + ({}, {"b": 2}, {"b": 2}, []), + ({}, {"b": None}, {}, ["does not define such a parameter"]), + ({"a": 1}, {}, {"a": 1}, []), + ({"a": 1}, {"b": 2}, {"a": 1, "b": 2}, []), + ({"a": 1}, {"b": None}, {"a": 1}, ["does not define such a parameter"]), + ({"a": 1}, {"a": 
2}, {"a": 2}, []), + ({"a": 1}, {"a": None}, {}, []), + ], +) def test_default_params_immutability(): """Make sure that the merging of Zyte Data API parameters from the ``ZYTE_API_DEFAULT_PARAMS`` setting with those from the ``zyte_api`` @@ -503,17 +529,6 @@ def test_default_params_immutability(): assert default_params == {"a": "b"} -@pytest.mark.parametrize("meta", [1, ["a", "b"]]) -def test_bad_meta_type(meta): - request = Request(url="https://example.com") - request.meta["zyte_api"] = meta - with pytest.raises(ValueError): - _get_api_params( - request, - **GET_API_PARAMS_KWARGS, - ) - - @pytest.mark.parametrize( "setting,meta,expected", [ From 4288101ffbe6f275f9c224a103d0ad974fde5c85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 10:57:34 +0200 Subject: [PATCH 33/81] test_automap_toggling: add a docstring --- tests/test_api_requests.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 775e1932..6810d613 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -541,6 +541,14 @@ def test_default_params_immutability(): ], ) def test_automap_toggling(setting, meta, expected): + """Test how the value of the ``ZYTE_API_AUTOMAP`` setting (*setting*) in combination with the ``zyte_api_automap`` request metadata key (*meta*) + determines whether or not automated mapping is enabled for a request + (*expected*). + + Note that :func:`test_get_api_params_input_custom` already tests how the + ``ZYTE_API_AUTOMAP`` setting is mapped to the corresponding + :func:`~scrapy_zyte_api.handler._get_api_params` parameter. 
+ """ request = Request(url="https://example.com") if meta is not UNSET: request.meta["zyte_api_automap"] = meta From 8aa623af91dcb4da9f992278aad9105f1e3ddb88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 11:07:16 +0200 Subject: [PATCH 34/81] test_default_params_immutability: fix, finish its refactoring --- tests/test_api_requests.py | 40 +++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 6810d613..f4ba7602 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -1,5 +1,6 @@ import sys from asyncio import iscoroutine +from copy import copy from typing import Any, Dict, List, Union from unittest import mock from unittest.mock import patch @@ -499,26 +500,33 @@ def test_default_params_merging(setting, meta, expected, warnings, caplog): @pytest.mark.parametrize( - "meta,expected,warnings", + "setting,meta", [ - ({}, {}, {}, []), - ({}, {"b": 2}, {"b": 2}, []), - ({}, {"b": None}, {}, ["does not define such a parameter"]), - ({"a": 1}, {}, {"a": 1}, []), - ({"a": 1}, {"b": 2}, {"a": 1, "b": 2}, []), - ({"a": 1}, {"b": None}, {"a": 1}, ["does not define such a parameter"]), - ({"a": 1}, {"a": 2}, {"a": 2}, []), - ({"a": 1}, {"a": None}, {}, []), + # append + ( + {"a": "b"}, + {"b": "c"}, + ), + # overwrite + ( + {"a": "b"}, + {"a": "c"}, + ), + # drop + ( + {"a": "b"}, + {"a": None}, + ), ], ) -def test_default_params_immutability(): +def test_default_params_immutability(setting, meta): """Make sure that the merging of Zyte Data API parameters from the - ``ZYTE_API_DEFAULT_PARAMS`` setting with those from the ``zyte_api`` - request metadata key does not affect the contents of the setting for later - requests.""" + ``ZYTE_API_DEFAULT_PARAMS`` setting (*setting*) with those from the + ``zyte_api`` request metadata key (*meta*) does not affect the contents of + the setting for later requests.""" 
request = Request(url="https://example.com") - request.meta["zyte_api"] = {"a": None} - default_params = {"a": "b"} + request.meta["zyte_api"] = meta + default_params = copy(setting) _get_api_params( request, **{ @@ -526,7 +534,7 @@ def test_default_params_immutability(): "default_params": default_params, }, ) - assert default_params == {"a": "b"} + assert default_params == setting @pytest.mark.parametrize( From 7191ae2d8a85956eaea9bd16e8a5f6f678b6d5b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 11:43:22 +0200 Subject: [PATCH 35/81] test_automap: finish refactoring --- setup.py | 1 - tests/test_api_requests.py | 487 ++++++++++++++----------------------- 2 files changed, 189 insertions(+), 299 deletions(-) diff --git a/setup.py b/setup.py index 68817309..956b0085 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,6 @@ packages=["scrapy_zyte_api"], install_requires=[ "scrapy>=2.6.0", - "typing-extensions>=3.10", "zyte-api>=0.3.0", ], classifiers=[ diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index f4ba7602..5b4877be 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -1,7 +1,7 @@ import sys from asyncio import iscoroutine from copy import copy -from typing import Any, Dict, List, Union +from typing import Any, Dict from unittest import mock from unittest.mock import patch @@ -15,7 +15,6 @@ from scrapy.settings.default_settings import USER_AGENT as DEFAULT_USER_AGENT from scrapy.utils.test import get_crawler from twisted.internet.defer import Deferred -from typing_extensions import Literal from zyte_api.aio.errors import RequestError from scrapy_zyte_api.handler import _get_api_params @@ -571,7 +570,7 @@ def test_automap_toggling(setting, meta, expected): assert bool(api_params) == expected -def _test_automap(request_kwargs, meta, expected, warnings, caplog): +def _test_automap(global_kwargs, request_kwargs, meta, expected, warnings, caplog): request = 
Request(url="https://example.com", **request_kwargs) request.meta["zyte_api"] = meta with caplog.at_level("WARNING"): @@ -579,6 +578,7 @@ def _test_automap(request_kwargs, meta, expected, warnings, caplog): request, **{ **GET_API_PARAMS_KWARGS, + **global_kwargs, "automap_by_default": True, }, ) @@ -590,296 +590,6 @@ def _test_automap(request_kwargs, meta, expected, warnings, caplog): assert not caplog.records -@ensureDeferred -@pytest.mark.skipif(sys.version_info < (3, 8), reason="unittest.mock.AsyncMock") -@pytest.mark.parametrize( - "request_kwargs,settings,expected,warnings", - [ - # The Accept and Accept-Language headers, when unsupported, are dropped - # silently if their value matches the default value of Scrapy for - # DEFAULT_REQUEST_HEADERS, or with a warning otherwise. - ( - { - "headers": DEFAULT_REQUEST_HEADERS, - "meta": {"zyte_api": {"browserHtml": True}}, - }, - {}, - { - "browserHtml": True, - "httpResponseHeaders": True, - }, - [], - ), - ( - { - "headers": { - "Accept": "application/json", - "Accept-Language": "uk", - }, - "meta": {"zyte_api": {"browserHtml": True}}, - }, - {}, - { - "browserHtml": True, - "httpResponseHeaders": True, - }, - ["cannot be mapped"], - ), - # The Cookie header is dropped with a warning. - ( - { - "headers": { - "Cookie": "a=b", - }, - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, - ["cannot be mapped"], - ), - ( - { - "headers": { - "Cookie": "a=b", - }, - "meta": {"zyte_api": {"browserHtml": True}}, - }, - {}, - { - "browserHtml": True, - "httpResponseHeaders": True, - }, - ["cannot be mapped"], - ), - # The User-Agent header, which Scrapy sets by default, is dropped - # silently if it matches the default value of the USER_AGENT setting, - # or with a warning otherwise. 
- ( - { - "headers": {"User-Agent": DEFAULT_USER_AGENT}, - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, - [], - ), - ( - { - "headers": {"User-Agent": ""}, - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, - ["cannot be mapped"], - ), - ( - { - "headers": {"User-Agent": DEFAULT_USER_AGENT}, - "meta": {"zyte_api": {"browserHtml": True}}, - }, - {}, - { - "browserHtml": True, - "httpResponseHeaders": True, - }, - [], - ), - ( - { - "headers": {"User-Agent": ""}, - "meta": {"zyte_api": {"browserHtml": True}}, - }, - {}, - { - "browserHtml": True, - "httpResponseHeaders": True, - }, - ["cannot be mapped"], - ), - # You may update the ZYTE_API_UNSUPPORTED_HEADERS setting to remove - # headers that the customHttpRequestHeaders parameter starts supporting - # in the future. - ( - { - "headers": { - "Cookie": "", - "User-Agent": "", - }, - }, - { - "ZYTE_API_UNSUPPORTED_HEADERS": ["Cookie"], - }, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "customHttpRequestHeaders": [ - {"name": "User-Agent", "value": ""}, - ], - }, - [ - "defines header b'Cookie', which cannot be mapped", - ], - ), - # You may update the ZYTE_API_BROWSER_HEADERS setting to extend support - # for new fields that the requestHeaders parameter may support in the - # future. - ( - { - "headers": {"User-Agent": ""}, - "meta": {"zyte_api": {"browserHtml": True}}, - }, - { - "ZYTE_API_BROWSER_HEADERS": { - "Referer": "referer", - "User-Agent": "userAgent", - }, - }, - { - "browserHtml": True, - "httpResponseHeaders": True, - "requestHeaders": {"userAgent": ""}, - }, - [], - ), - # BODY - # The body is copied into httpRequestBody, base64-encoded. - ( - { - "body": "a", - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "httpRequestBody": "YQ==", - }, - [], - ), - # httpRequestBody defined in meta takes precedence, but it causes a - # warning. 
- ( - { - "body": "a", - "meta": {"zyte_api": {"httpRequestBody": "Yg=="}}, - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "httpRequestBody": "Yg==", - }, - [ - "Use Request.body instead", - "does not match the Zyte Data API httpRequestBody parameter", - ], - ), - # httpRequestBody defined in meta causes a warning even if it matches - # request.body. - ( - { - "body": "a", - "meta": {"zyte_api": {"httpRequestBody": "YQ=="}}, - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - "httpRequestBody": "YQ==", - }, - ["Use Request.body instead"], - ), - # A body should not be used unless httpResponseBody is also used. - ( - { - "body": "a", - "meta": {"zyte_api": {"browserHtml": True}}, - }, - {}, - { - "browserHtml": True, - "httpResponseHeaders": True, - }, - ["can only be set when the httpResponseBody parameter"], - ), - ( - { - "body": "a", - "meta": {"zyte_api": {"screenshot": True}}, - }, - {}, - { - "screenshot": True, - }, - ["can only be set when the httpResponseBody parameter"], - ), - # httpResponseHeaders - # Warn if httpResponseHeaders is defined unnecessarily. - ( - { - "meta": {"zyte_api": {"httpResponseHeaders": True}}, - }, - {}, - { - "httpResponseBody": True, - "httpResponseHeaders": True, - }, - ["do not need to set httpResponseHeaders"], - ), - ], -) -async def test_automap( - request_kwargs: Dict[str, Any], - settings: Dict[str, Any], - expected: Union[Dict[str, str], Literal[False]], - warnings: List[str], - mockserver, - caplog, -): - settings.update({"ZYTE_API_ON_ALL_REQUESTS": True, "ZYTE_API_AUTOMAP": True}) - async with mockserver.make_handler(settings) as handler: - if expected is False: - # Only the Zyte Data API client is mocked, meaning requests that - # do not go through Zyte Data API are actually sent, so we point - # them to the mock server to avoid internet connections in tests. 
- request_kwargs["url"] = mockserver.urljoin("/") - else: - request_kwargs["url"] = "https://toscrape.com" - request = Request(**request_kwargs) - unmocked_client = handler._client - handler._client = mock.AsyncMock(unmocked_client) - handler._client.request_raw.side_effect = unmocked_client.request_raw - with caplog.at_level("WARNING"): - await handler.download_request(request, None) - - # What we're interested in is the Request call in the API - request_call = [ - c for c in handler._client.mock_calls if "request_raw(" in str(c) - ] - - if expected is False: - assert request_call == [] - return - - if not request_call: - pytest.fail("The client's request_raw() method was not called.") - - args_used = request_call[0].args[0] - args_used.pop("url") - assert args_used == expected - - if warnings: - for warning in warnings: - assert warning in caplog.text - else: - assert not caplog.records - - @pytest.mark.parametrize( "meta,expected,warnings", [ @@ -926,7 +636,7 @@ async def test_automap( ], ) def test_automap_main_outputs(meta, expected, warnings, caplog): - _test_automap({}, meta, expected, warnings, caplog) + _test_automap({}, {}, meta, expected, warnings, caplog) @pytest.mark.parametrize( @@ -1026,7 +736,7 @@ def test_automap_main_outputs(meta, expected, warnings, caplog): ], ) def test_automap_header_output(meta, expected, warnings, caplog): - _test_automap({}, meta, expected, warnings, caplog) + _test_automap({}, {}, meta, expected, warnings, caplog) @pytest.mark.parametrize( @@ -1119,7 +829,7 @@ def test_automap_header_output(meta, expected, warnings, caplog): ], ) def test_automap_method(method, meta, expected, warnings, caplog): - _test_automap({"method": method}, meta, expected, warnings, caplog) + _test_automap({}, {"method": method}, meta, expected, warnings, caplog) @pytest.mark.parametrize( @@ -1396,10 +1106,191 @@ def test_automap_method(method, meta, expected, warnings, caplog): }, ["cannot be mapped"], ), + # The Accept and Accept-Language headers, 
when unsupported, are dropped + # silently if their value matches the default value of Scrapy for + # DEFAULT_REQUEST_HEADERS, or with a warning otherwise. + ( + DEFAULT_REQUEST_HEADERS, + {"browserHtml": True}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + { + "Accept": "application/json", + "Accept-Language": "uk", + }, + {"browserHtml": True}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + # The User-Agent header, which Scrapy sets by default, is dropped + # silently if it matches the default value of the USER_AGENT setting, + # or with a warning otherwise. + ( + {"User-Agent": DEFAULT_USER_AGENT}, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + {"User-Agent": ""}, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), + ( + {"User-Agent": DEFAULT_USER_AGENT}, + {"browserHtml": True}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + {"User-Agent": ""}, + {"browserHtml": True}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + ["cannot be mapped"], + ), ], ) def test_automap_headers(headers, meta, expected, warnings, caplog): - _test_automap({"headers": headers}, meta, expected, warnings, caplog) + _test_automap({}, {"headers": headers}, meta, expected, warnings, caplog) + + +@pytest.mark.parametrize( + "global_kwargs,headers,meta,expected,warnings", + [ + # You may update the ZYTE_API_UNSUPPORTED_HEADERS setting to remove + # headers that the customHttpRequestHeaders parameter starts supporting + # in the future. 
+ ( + { + "unsupported_headers": {b"cookie"}, + }, + { + "Cookie": "", + "User-Agent": "", + }, + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "customHttpRequestHeaders": [ + {"name": "User-Agent", "value": ""}, + ], + }, + [ + "defines header b'Cookie', which cannot be mapped", + ], + ), + # You may update the ZYTE_API_BROWSER_HEADERS setting to extend support + # for new fields that the requestHeaders parameter may support in the + # future. + ( + { + "browser_headers": { + b"referer": "referer", + b"user-agent": "userAgent", + }, + }, + {"User-Agent": ""}, + {"browserHtml": True}, + { + "browserHtml": True, + "httpResponseHeaders": True, + "requestHeaders": {"userAgent": ""}, + }, + [], + ), + ], +) +def test_automap_header_settings( + global_kwargs, headers, meta, expected, warnings, caplog +): + _test_automap(global_kwargs, {"headers": headers}, meta, expected, warnings, caplog) + + +@pytest.mark.parametrize( + "body,meta,expected,warnings", + [ + # The body is copied into httpRequestBody, base64-encoded. + ( + "a", + {}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "httpRequestBody": "YQ==", + }, + [], + ), + # httpRequestBody defined in meta takes precedence, but it causes a + # warning. + ( + "a", + {"httpRequestBody": "Yg=="}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "httpRequestBody": "Yg==", + }, + [ + "Use Request.body instead", + "does not match the Zyte Data API httpRequestBody parameter", + ], + ), + # httpRequestBody defined in meta causes a warning even if it matches + # request.body. + ( + "a", + {"httpRequestBody": "YQ=="}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "httpRequestBody": "YQ==", + }, + ["Use Request.body instead"], + ), + # A body should not be used unless httpResponseBody is also used. 
+ ( + "a", + {"browserHtml": True}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + ["can only be set when the httpResponseBody parameter"], + ), + ( + "a", + {"screenshot": True}, + { + "screenshot": True, + }, + ["can only be set when the httpResponseBody parameter"], + ), + ], +) +def test_automap_body(body, meta, expected, warnings, caplog): + _test_automap({}, {"body": body}, meta, expected, warnings, caplog) @pytest.mark.parametrize( @@ -1455,7 +1346,7 @@ def test_automap_headers(headers, meta, expected, warnings, caplog): ], ) def test_automap_default_parameter_cleanup(meta, expected, warnings, caplog): - _test_automap({}, meta, expected, warnings, caplog) + _test_automap({}, {}, meta, expected, warnings, caplog) @pytest.mark.xfail(reason="To be implemented", strict=True) From efd61d01fa7da6dce8428cb2aeb02f57c8d0ea9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 11:53:00 +0200 Subject: [PATCH 36/81] test_automap_main_outputs: add comments --- tests/test_api_requests.py | 52 ++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 5b4877be..27b73270 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -593,26 +593,27 @@ def _test_automap(global_kwargs, request_kwargs, meta, expected, warnings, caplo @pytest.mark.parametrize( "meta,expected,warnings", [ + # If no other known main output is specified in meta, httpResponseBody + # is requested. 
({}, {"httpResponseBody": True, "httpResponseHeaders": True}, []), ( - {"httpResponseBody": True}, - {"httpResponseBody": True, "httpResponseHeaders": True}, - ["do not need to set httpResponseBody to True"], - ), - ( - {"httpResponseBody": False}, - {}, - [], - ), - ( - {"httpResponseBody": True, "browserHtml": True}, + {"unknownMainOutput": True}, { - "browserHtml": True, "httpResponseBody": True, "httpResponseHeaders": True, + "unknownMainOutput": True, }, [], ), + # If httpResponseBody is unnecessarily requested in meta, a warning is + # logged. + ( + {"httpResponseBody": True}, + {"httpResponseBody": True, "httpResponseHeaders": True}, + ["do not need to set httpResponseBody to True"], + ), + # If other main outputs are specified in meta, httpResponseBody is not + # set. ( {"browserHtml": True}, {"browserHtml": True, "httpResponseHeaders": True}, @@ -624,13 +625,32 @@ def _test_automap(global_kwargs, request_kwargs, meta, expected, warnings, caplo [], ), ( - {"unknown": True}, - {"httpResponseBody": True, "httpResponseHeaders": True, "unknown": True}, + {"browserHtml": True, "screenshot": True}, + {"browserHtml": True, "httpResponseHeaders": True, "screenshot": True}, [], ), + # If no known main output is specified, and httpResponseBody is + # explicitly set to False, httpResponseBody is unset and no main output + # is added. ( - {"unknown": True, "httpResponseBody": False}, - {"unknown": True}, + {"httpResponseBody": False}, + {}, + [], + ), + ( + {"httpResponseBody": False, "unknownMainOutput": True}, + {"unknownMainOutput": True}, + [], + ), + # We allow httpResponseBody and browserHtml to be both set to True, in + # case that becomes possible in the future.
+ ( + {"httpResponseBody": True, "browserHtml": True}, + { + "browserHtml": True, + "httpResponseBody": True, + "httpResponseHeaders": True, + }, [], ), ], From 5fd6a5f6a5c0a226b48f4a7f06e3bd0e88f42b72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 12:09:58 +0200 Subject: [PATCH 37/81] test_automap_header_output: add comments --- tests/test_api_requests.py | 132 +++++++++++++++++++++++-------------- 1 file changed, 81 insertions(+), 51 deletions(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 27b73270..71b03928 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -662,16 +662,40 @@ def test_automap_main_outputs(meta, expected, warnings, caplog): @pytest.mark.parametrize( "meta,expected,warnings", [ - ({"httpResponseHeaders": False}, {"httpResponseBody": True}, []), + # Test cases where httpResponseHeaders is not specifically set to True + # or False, where it is automatically set to True if httpResponseBody + # or browserHtml are also True, are covered in + # test_automap_main_outputs. + # + # If httpResponseHeaders is set to True in a scenario where it would + # not be implicitly set to True, it is passed as such. 
( + {"httpResponseBody": False, "httpResponseHeaders": True}, {"httpResponseHeaders": True}, - {"httpResponseBody": True, "httpResponseHeaders": True}, - ["do not need to set httpResponseHeaders to True"], + [], ), ( - {"httpResponseBody": True, "httpResponseHeaders": False}, - {"httpResponseBody": True}, - ["do not need to set httpResponseBody to True"], + {"screenshot": True, "httpResponseHeaders": True}, + {"screenshot": True, "httpResponseHeaders": True}, + [], + ), + ( + { + "unknownMainOutput": True, + "httpResponseBody": False, + "httpResponseHeaders": True, + }, + {"unknownMainOutput": True, "httpResponseHeaders": True}, + [], + ), + # If httpResponseHeaders is unnecessarily set to True where + # httpResponseBody or browserHtml are set to True implicitly or + # explicitly, httpResponseHeaders is set to True, and a warning is + # logged. + ( + {"httpResponseHeaders": True}, + {"httpResponseBody": True, "httpResponseHeaders": True}, + ["do not need to set httpResponseHeaders to True"], ), ( {"httpResponseBody": True, "httpResponseHeaders": True}, @@ -681,21 +705,6 @@ def test_automap_main_outputs(meta, expected, warnings, caplog): "do not need to set httpResponseBody to True", ], ), - ( - {"httpResponseBody": False, "httpResponseHeaders": False}, - {}, - ["do not need to set httpResponseHeaders to False"], - ), - ( - {"httpResponseBody": False, "httpResponseHeaders": True}, - {"httpResponseHeaders": True}, - [], - ), - ( - {"browserHtml": True, "httpResponseHeaders": False}, - {"browserHtml": True}, - [], - ), ( {"browserHtml": True, "httpResponseHeaders": True}, {"browserHtml": True, "httpResponseHeaders": True}, @@ -705,52 +714,73 @@ def test_automap_main_outputs(meta, expected, warnings, caplog): { "httpResponseBody": True, "browserHtml": True, - "httpResponseHeaders": False, + "httpResponseHeaders": True, }, - {"browserHtml": True, "httpResponseBody": True}, - [], - ), - ( { - "httpResponseBody": True, "browserHtml": True, + "httpResponseBody": True, 
"httpResponseHeaders": True, }, + ["do not need to set httpResponseHeaders to True"], + ), + ( + {"unknownMainOutput": True, "httpResponseHeaders": True}, { - "browserHtml": True, + "unknownMainOutput": True, "httpResponseBody": True, "httpResponseHeaders": True, }, ["do not need to set httpResponseHeaders to True"], ), + # If httpResponseHeaders is set to False, httpResponseHeaders is not + # defined, even if httpResponseBody or browserHtml are set to True, + # implicitly or explicitly. + ({"httpResponseHeaders": False}, {"httpResponseBody": True}, []), ( - {"screenshot": True, "httpResponseHeaders": False}, - {"screenshot": True}, - ["do not need to set httpResponseHeaders to False"], + {"httpResponseBody": True, "httpResponseHeaders": False}, + {"httpResponseBody": True}, + ["do not need to set httpResponseBody to True"], ), ( - {"screenshot": True, "httpResponseHeaders": True}, - {"screenshot": True, "httpResponseHeaders": True}, + {"browserHtml": True, "httpResponseHeaders": False}, + {"browserHtml": True}, [], ), ( - {"unknown": True, "httpResponseHeaders": True}, - {"unknown": True, "httpResponseBody": True, "httpResponseHeaders": True}, - ["do not need to set httpResponseHeaders to True"], + { + "httpResponseBody": True, + "browserHtml": True, + "httpResponseHeaders": False, + }, + {"browserHtml": True, "httpResponseBody": True}, + [], ), ( - {"unknown": True, "httpResponseHeaders": False}, - {"unknown": True, "httpResponseBody": True}, + {"unknownMainOutput": True, "httpResponseHeaders": False}, + {"unknownMainOutput": True, "httpResponseBody": True}, [], ), + # If httpResponseHeaders is unnecessarily set to False where + # httpResponseBody and browserHtml are set to False implicitly or + # explicitly, httpResponseHeaders is not defined, and a warning is + # logged. 
( - {"unknown": True, "httpResponseBody": False, "httpResponseHeaders": True}, - {"unknown": True, "httpResponseHeaders": True}, - [], + {"httpResponseBody": False, "httpResponseHeaders": False}, + {}, + ["do not need to set httpResponseHeaders to False"], ), ( - {"unknown": True, "httpResponseBody": False, "httpResponseHeaders": False}, - {"unknown": True}, + {"screenshot": True, "httpResponseHeaders": False}, + {"screenshot": True}, + ["do not need to set httpResponseHeaders to False"], + ), + ( + { + "unknownMainOutput": True, + "httpResponseBody": False, + "httpResponseHeaders": False, + }, + {"unknownMainOutput": True}, ["do not need to set httpResponseHeaders to False"], ), ], @@ -917,23 +947,23 @@ def test_automap_method(method, meta, expected, warnings, caplog): ), ( {"Referer": "a"}, - {"unknown": True}, + {"unknownMainOutput": True}, { "customHttpRequestHeaders": [ {"name": "Referer", "value": "a"}, ], "httpResponseBody": True, "httpResponseHeaders": True, - "unknown": True, + "unknownMainOutput": True, }, [], ), ( {"Referer": "a"}, - {"unknown": True, "httpResponseBody": False}, + {"unknownMainOutput": True, "httpResponseBody": False}, { "requestHeaders": {"referer": "a"}, - "unknown": True, + "unknownMainOutput": True, }, [], ), @@ -994,19 +1024,19 @@ def test_automap_method(method, meta, expected, warnings, caplog): ), ( {"Referer": None}, - {"unknown": True}, + {"unknownMainOutput": True}, { "httpResponseBody": True, "httpResponseHeaders": True, - "unknown": True, + "unknownMainOutput": True, }, [], ), ( {"Referer": None}, - {"unknown": True, "httpResponseBody": False}, + {"unknownMainOutput": True, "httpResponseBody": False}, { - "unknown": True, + "unknownMainOutput": True, }, [], ), From dca7e4bcd9d3c4bf76162497c11d1b388c0522d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 12:16:30 +0200 Subject: [PATCH 38/81] test_automap_method: add comments --- tests/test_api_requests.py | 13 +++++++++++++ 1 file changed, 
13 insertions(+) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 71b03928..7e5ee8a8 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -792,6 +792,7 @@ def test_automap_header_output(meta, expected, warnings, caplog): @pytest.mark.parametrize( "method,meta,expected,warnings", [ + # The GET HTTP method is not mapped, since it is the default method. ( "GET", {}, @@ -801,6 +802,9 @@ def test_automap_header_output(meta, expected, warnings, caplog): }, [], ), + # Other HTTP methods, regardless of whether they are supported, + # unsupported, or unknown, are mapped as httpRequestMethod, letting + # Zyte Data API decide whether or not they are allowed. *( ( method, @@ -824,6 +828,9 @@ def test_automap_header_output(meta, expected, warnings, caplog): "FOO", ) ), + # If httpRequestMethod is also specified in meta with the same value + # as Request.method, a warning is logged asking to use only + # Request.method. *( ( request_method, @@ -840,6 +847,9 @@ def test_automap_header_output(meta, expected, warnings, caplog): ("POST", "POST"), ) ), + # If httpRequestMethod is also specified in meta with a different value + # from Request.method, a warning is logged asking to use Request.method, + # and the meta value takes precedence. *( ( request_method, @@ -859,6 +869,9 @@ def test_automap_header_output(meta, expected, warnings, caplog): ("PUT", "GET"), ) ), + # If httpResponseBody is not True, implicitly or explicitly, + # Request.method is not mapped, and a warning is issued if its value + # is anything other than GET.
( "POST", {"browserHtml": True}, From bfee2bab0d04a195e7b7c88e2cd34298a42fb589 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 12:38:43 +0200 Subject: [PATCH 39/81] test_automap_headers: add comments --- tests/test_api_requests.py | 78 ++++++++++++++++++++++++++++---------- 1 file changed, 58 insertions(+), 20 deletions(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 7e5ee8a8..02d6fddc 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -898,7 +898,8 @@ def test_automap_method(method, meta, expected, warnings, caplog): @pytest.mark.parametrize( "headers,meta,expected,warnings", [ - # Base header mapping scenarios for a supported header. + # If httpResponseBody is True, implicitly or explicitly, + # Request.headers are mapped as customHttpRequestHeaders. ( {"Referer": "a"}, {}, @@ -911,6 +912,8 @@ def test_automap_method(method, meta, expected, warnings, caplog): }, [], ), + # If browserHtml or screenshot are True, Request.headers are mapped as + # requestHeaders. ( {"Referer": "a"}, {"browserHtml": True}, @@ -921,6 +924,18 @@ def test_automap_method(method, meta, expected, warnings, caplog): }, [], ), + ( + {"Referer": "a"}, + {"screenshot": True}, + { + "requestHeaders": {"referer": "a"}, + "screenshot": True, + }, + [], + ), + # If both httpResponseBody and browserHtml (or screenshot, or both) are + # True, implicitly or explicitly, Request.headers are mapped both as + # customHttpRequestHeaders and as requestHeaders. 
( {"Referer": "a"}, {"browserHtml": True, "httpResponseBody": True}, @@ -937,27 +952,42 @@ def test_automap_method(method, meta, expected, warnings, caplog): ), ( {"Referer": "a"}, - {"screenshot": True}, + {"screenshot": True, "httpResponseBody": True}, { - "screenshot": True, + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + "httpResponseBody": True, + "httpResponseHeaders": True, "requestHeaders": {"referer": "a"}, + "screenshot": True, }, [], ), ( {"Referer": "a"}, - {"screenshot": True, "httpResponseBody": True}, + {"browserHtml": True, "screenshot": True, "httpResponseBody": True}, { - "screenshot": True, + "browserHtml": True, "customHttpRequestHeaders": [ {"name": "Referer", "value": "a"}, ], "httpResponseBody": True, "httpResponseHeaders": True, "requestHeaders": {"referer": "a"}, + "screenshot": True, }, [], ), + # If httpResponseBody is True, implicitly or explicitly, and there is + # no other known main output parameter (browserHtml, screenshot), + # Request.headers are mapped as customHttpRequestHeaders only. + # + # While future main output parameters are likely to use requestHeaders + # instead, we cannot know if an unknown parameter is a main output + # parameter or a different type of parameter for httpResponseBody, and + # what we know for sure is that, at the time of writing, Zyte Data API + # does not allow requestHeaders to be combined with httpResponseBody. ( {"Referer": "a"}, {"unknownMainOutput": True}, @@ -971,24 +1001,28 @@ def test_automap_method(method, meta, expected, warnings, caplog): }, [], ), + # If no known main output is requested, implicitly or explicitly, we + # assume that some unknown main output is being requested, and we map + # Request.headers as requestHeaders because that is the most likely way + # headers will need to be mapped for a future main output.
( {"Referer": "a"}, - {"unknownMainOutput": True, "httpResponseBody": False}, + {"httpResponseBody": False}, { "requestHeaders": {"referer": "a"}, - "unknownMainOutput": True, }, [], ), ( {"Referer": "a"}, - {"httpResponseBody": False}, + {"unknownMainOutput": True, "httpResponseBody": False}, { "requestHeaders": {"referer": "a"}, + "unknownMainOutput": True, }, [], ), - # Headers with None as value are ignored. + # Headers with None as value are not mapped. ( {"Referer": None}, {}, @@ -1059,18 +1093,18 @@ def test_automap_method(method, meta, expected, warnings, caplog): {}, [], ), - # Warn if header parameters are used, even if the values match request - # headers. + # Warn if header parameters are used in meta, even if the values match + # request headers. If they do not match, meta takes precedence. ( {"Referer": "a"}, { "customHttpRequestHeaders": [ - {"name": "Referer", "value": "b"}, + {"name": "Referer", "value": "a"}, ] }, { "customHttpRequestHeaders": [ - {"name": "Referer", "value": "b"}, + {"name": "Referer", "value": "a"}, ], "httpResponseBody": True, "httpResponseHeaders": True, @@ -1081,11 +1115,11 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"Referer": "a"}, { "browserHtml": True, - "requestHeaders": {"referer": "b"}, + "requestHeaders": {"referer": "a"}, }, { "browserHtml": True, - "requestHeaders": {"referer": "b"}, + "requestHeaders": {"referer": "a"}, "httpResponseHeaders": True, }, ["Use Request.headers instead"], @@ -1094,12 +1128,12 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"Referer": "a"}, { "customHttpRequestHeaders": [ - {"name": "Referer", "value": "a"}, + {"name": "Referer", "value": "b"}, ] }, { "customHttpRequestHeaders": [ - {"name": "Referer", "value": "a"}, + {"name": "Referer", "value": "b"}, ], "httpResponseBody": True, "httpResponseHeaders": True, @@ -1110,11 +1144,11 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"Referer": "a"}, { "browserHtml": 
True, - "requestHeaders": {"referer": "a"}, + "requestHeaders": {"referer": "b"}, }, { "browserHtml": True, - "requestHeaders": {"referer": "a"}, + "requestHeaders": {"referer": "b"}, "httpResponseHeaders": True, }, ["Use Request.headers instead"], @@ -1173,7 +1207,11 @@ def test_automap_method(method, meta, expected, warnings, caplog): # silently if their value matches the default value of Scrapy for # DEFAULT_REQUEST_HEADERS, or with a warning otherwise. ( - DEFAULT_REQUEST_HEADERS, + { + k: v + for k, v in DEFAULT_REQUEST_HEADERS.items() + if k in {"Accept", "Accept-Language"} + }, {"browserHtml": True}, { "browserHtml": True, From 30df7c8adfdb0320965f989c0e5832466df36934 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 12:48:52 +0200 Subject: [PATCH 40/81] test_automap_default_parameter_cleanup: add comments --- tests/test_api_requests.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 02d6fddc..ae62fb2c 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -1397,13 +1397,10 @@ def test_automap_body(body, meta, expected, warnings, caplog): @pytest.mark.parametrize( "meta,expected,warnings", [ - ( - { - "httpResponseBody": False, - }, - {}, - [], - ), + # When httpResponseBody, browserHtml, screenshot, or + # httpResponseHeaders, are unnecessarily set to False, they are not + # defined in the parameters sent to Zyte Data API, and a warning is + # logged. 
( { "browserHtml": True, @@ -1438,11 +1435,12 @@ def test_automap_body(body, meta, expected, warnings, caplog): ( { "httpResponseHeaders": False, + "screenshot": True, }, { - "httpResponseBody": True, + "screenshot": True, }, - [], + ["do not need to set httpResponseHeaders to False"], ), ], ) From 1bad96bb9b5a3664f24a778e68e435004abb759e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 13:01:56 +0200 Subject: [PATCH 41/81] Extend test_automap_headers --- tests/test_api_requests.py | 72 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 70 insertions(+), 2 deletions(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index ae62fb2c..53c5a9a8 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -1094,7 +1094,8 @@ def test_automap_method(method, meta, expected, warnings, caplog): [], ), # Warn if header parameters are used in meta, even if the values match - # request headers. If they do not match, meta takes precedence. + # request headers, and even if there are no request headers to match in + # the first place. If they do not match, meta takes precedence. 
( {"Referer": "a"}, { @@ -1153,6 +1154,72 @@ def test_automap_method(method, meta, expected, warnings, caplog): }, ["Use Request.headers instead"], ), + ( + {}, + { + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ] + }, + { + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + ["Use Request.headers instead"], + ), + ( + {}, + { + "browserHtml": True, + "requestHeaders": {"referer": "a"}, + }, + { + "browserHtml": True, + "requestHeaders": {"referer": "a"}, + "httpResponseHeaders": True, + }, + ["Use Request.headers instead"], + ), + # If httpResponseBody is True and requestHeaders is defined in meta, or + # if browserHtml is True and customHttpRequestHeaders is defined in + # meta, keep the meta parameters and do not issue a warning. There is + # no need for a warning because the request should get an error + # response from Zyte Data API. And if Zyte Data API were not to send an + # error response, that would mean the Zyte Data API has started + # supporting this scenario, all the more reason not to warn and let the + # parameters reach Zyte Data API. + ( + {}, + { + "requestHeaders": {"referer": "a"}, + }, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + "requestHeaders": {"referer": "a"}, + }, + [], + ), + ( + {}, + { + "browserHtml": True, + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + }, + { + "browserHtml": True, + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + "httpResponseHeaders": True, + }, + [], + ), # Unsupported headers not present in Scrapy requests by default are # dropped with a warning. # If all headers are unsupported, the header parameter is not even set.
@@ -1468,7 +1535,8 @@ def test_automap_default_parameter_cleanup(meta, expected, warnings, caplog): ) def test_default_params_automap(default_params, meta, expected, warnings, caplog): """Warnings about unneeded parameters should not apply if those parameters - are needed to extend or override default parameters.""" + are needed to extend or override parameters set in the + ``ZYTE_API_DEFAULT_PARAMS`` setting.""" request = Request(url="https://example.com") request.meta["zyte_api"] = meta with caplog.at_level("WARNING"): From 325c10533fbab103063e25686c7fce506ff22fdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Aug 2022 13:06:21 +0200 Subject: [PATCH 42/81] setup.py: revert unneeded change --- setup.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 6f00e9a8..c6436ab1 100644 --- a/setup.py +++ b/setup.py @@ -10,10 +10,7 @@ author_email="info@zyte.com", url="https://github.com/scrapy-plugins/scrapy-zyte-api", packages=["scrapy_zyte_api"], - install_requires=[ - "scrapy>=2.6.0", - "zyte-api>=0.3.0", - ], + install_requires=["zyte-api>=0.3.0", "scrapy>=2.6.0"], classifiers=[ "Development Status :: 4 - Beta", "Intended Audience :: Developers", From a56c1f04cbcfeabc11cd277810595c6be041af0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 20 Sep 2022 14:06:30 +0200 Subject: [PATCH 43/81] Update the documentation --- README.rst | 264 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 164 insertions(+), 100 deletions(-) diff --git a/README.rst b/README.rst index d6612ea4..cf82c8b9 100644 --- a/README.rst +++ b/README.rst @@ -18,23 +18,29 @@ scrapy-zyte-api :target: https://codecov.io/gh/scrapy-plugins/scrapy-zyte-api :alt: Coverage report + +Scrapy plugin for `Zyte API`_. + +.. 
_Zyte API: https://docs.zyte.com/zyte-api/get-started.html + + Requirements ------------- +============ * Python 3.7+ * Scrapy 2.0.1+ + Installation ------------- +============ .. code-block:: pip install scrapy-zyte-api -This package requires Python 3.7+. Configuration -------------- +============= Replace the default ``http`` and ``https`` in Scrapy's `DOWNLOAD_HANDLERS `_ @@ -63,12 +69,49 @@ Here's an example of the things needed inside a Scrapy project's ``settings.py`` The ``ZYTE_API_ENABLED`` setting, which is ``True`` by default, can be set to ``False`` to disable this plugin. + Usage ------ +===== + +You can send a request through Zyte API in one of the following ways: + +- Setting all Zyte API parameters :ref:`manually `, keeping full + control of what is sent to Zyte API. + +- Letting Zyte API parameters be chosen :ref:`automatically ` based + on your Scrapy request parameters where possible. + + You can :ref:`make this the default behavior for all requests + `. + +The raw Zyte API response can be accessed via the ``raw_api_response`` +attribute of the response object. + +When you use the Zyte API parameters ``browserHtml``, ``httpResponseBody``, or +``httpResponseHeaders``, the response body and headers are set accordingly. + +Note that, for Zyte API requests, the spider gets responses of +``ZyteAPIResponse`` and ``ZyteAPITextResponse`` types, which are respectively +subclasses of ``scrapy.http.Response`` and ``scrapy.http.TextResponse``. + +If multiple requests target the same URL with different Zyte API parameters, +pass ``dont_filter=True`` to ``Request``. -To enable a ``scrapy.Request`` to go through Zyte API, the ``zyte_api`` key in + +.. 
_manual: + +Sending requests with manually-defined parameters +------------------------------------------------- + +To send a Scrapy request through Zyte API with manually-defined parameters, +define your Zyte API parameters in the ``zyte_api`` key in `Request.meta `_ -must be present and contain a dict with Zyte API parameters: +as a ``dict``. + +The only exception is the ``url`` parameter, which should not be defined as a +Zyte API parameter. The value from ``Request.url`` is used automatically. + +For example: .. code-block:: python @@ -81,7 +124,6 @@ must be present and contain a dict with Zyte API parameters: def start_requests(self): yield scrapy.Request( url="http://quotes.toscrape.com/", - callback=self.parse, meta={ "zyte_api": { "browserHtml": True, @@ -90,135 +132,157 @@ must be present and contain a dict with Zyte API parameters: ) def parse(self, response): - yield {"URL": response.url, "HTML": response.body} - print(response.raw_api_response) # { # 'url': 'https://quotes.toscrape.com/', # 'statusCode': 200, - # 'browserHtml': ' ... ', + # 'browserHtml': '…', # } -You can see the full list of parameters in the `Zyte API Specification -`_. -The ``url`` parameter is filled automatically from ``request.url``, other -parameters should be set explicitly. +See the `Zyte API documentation`_ to learn about Zyte API parameters. -The raw Zyte API response can be accessed via the ``raw_api_response`` -attribute of the response object. +.. _Zyte API documentation: https://docs.zyte.com/zyte-api/get-started.html -When you use the Zyte API parameters ``browserHtml``, ``httpResponseBody``, or -``httpResponseHeaders``, the response body and headers are set accordingly. -Note that, for Zyte API requests, the spider gets responses of -``ZyteAPIResponse`` and ``ZyteAPITextResponse`` types, -which are respectively subclasses of ``scrapy.http.Response`` -and ``scrapy.http.TextResponse``. +.. 
_automap: -If multiple requests target the same URL with different Zyte API parameters, -pass ``dont_filter=True`` to ``Request``. +Sending requests with automatically-mapped parameters +----------------------------------------------------- +To send a Scrapy request through Zyte API letting Zyte API parameters be +automatically chosen based on the parameters of that Scrapy request, set the +``zyte_api_automap`` key in +`Request.meta `_ +to ``True``. See also :ref:`transparent-mode`. -.. _default-params: +Automated parameter mapping chooses Zyte API parameters as follows by default: -Setting default parameters --------------------------- +- ``httpResponseBody`` and ``httpResponseHeaders`` are set to ``True``. -Often the same configuration needs to be used for all Zyte API requests. -For example, all requests may need to set the same geolocation, or -the spider only uses ``browserHtml`` requests. +- ``Rerquest.url`` becomes ``url``, same as in requests with manually-defined + parameters. -To set the default parameters for Zyte API enabled requests, you can set the -following in the ``settings.py`` file or `any other settings within Scrapy -`_: +- If ``Rerquest.method`` is something other than ``"GET"``, it becomes + ``httpRequestMethod``. -.. code-block:: python +- ``Rerquest.headers`` become ``customHttpRequestHeaders``. - ZYTE_API_DEFAULT_PARAMS = { - "browserHtml": True, - "geolocation": "US", - } +- ``Rerquest.body`` is base64-encoded as ``httpRequestBody``. -For example: +Instead of setting ``zyte_api_automap`` to ``True``, you may set it to a +``dict`` of Zyte API parameters to extend or override choices made by automated +parameter mapping. Some parameters modify the result of automated parameter +mapping as a side effect: -.. code-block:: python +- Setting ``browserHtml`` or ``screenshot`` to ``True`` unsets + ``httpResponseBody``, and makes ``Request.headers`` become + ``requestHeaders`` instead of ``customHttpRequestHeaders``. 
- import scrapy +- Setting ``screenshot`` to ``True`` without also setting ``browserHtml`` to + ``True`` unsets ``httpResponseHeaders``. +When mapping headers, unsupported headers are excluded from the mapping. Use +the following settings to change which headers are mapped and how they are +mapped: - class SampleQuotesSpider(scrapy.Spider): - name = "sample_quotes" +- ``ZYTE_API_UNSUPPORTED_HEADERS`` determines headers that *cannot* be mapped + as ``customHttpRequestHeaders``, and its default value is: - custom_settings = { - "ZYTE_API_DEFAULT_PARAMS": { - "geolocation": "US", # You can set any Geolocation region you want. - } - } + .. code-block:: python - def start_requests(self): - yield scrapy.Request( - url="http://quotes.toscrape.com/", - callback=self.parse, - meta={ - "zyte_api": { - "browserHtml": True, - "javascript": True, - "echoData": {"some_value_I_could_track": 123}, - } - }, - ) + ["Cookie", "User-Agent"] - def parse(self, response): - yield {"URL": response.url, "HTML": response.body} +- ``ZYTE_API_BROWSER_HEADERS`` determines headers that *can* be mapped as + ``requestHeaders``. It is a ``dict``, where keys are header names and + values are the key that represents them in ``requestHeaders``. Its default + value is: - print(response.raw_api_response) - # { - # 'url': 'https://quotes.toscrape.com/', - # 'statusCode': 200, - # 'browserHtml': ' ... ', - # 'echoData': {'some_value_I_could_track': 123}, - # } + .. code-block:: python + + {"Referer": "referer"} + +To maximize support for potential future changes in Zyte API, automated +parameter mapping allows some parameter values and parameter combinations that +Zyte API does not currently support, and may never support: + +- ``Rerquest.method`` becomes ``httpRequestMethod`` even for unsupported_ + ``httpRequestMethod`` values, and even if ``httpResponseBody`` is unset. + + .. 
_unsupported: https://docs.zyte.com/zyte-api/usage/extract.html#zyte-api-set-method + +- You can set ``customHttpRequestHeaders`` or ``requestHeaders`` to ``True`` + to force their mapping from ``Request.headers`` in scenarios where they + would not be mapped otherwise. + + Conversely, you can set ``customHttpRequestHeaders`` or ``requestHeaders`` + to ``False`` to prevent their mapping from ``Request.headers``. + +- ``Rerquest.body`` becomes ``httpRequestBody`` even if ``httpResponseBody`` + is unset. + +- You can set ``httpResponseBody`` to ``False`` (which unsets the parameter), + and not set ``browserHtml`` or ``screenshot`` to ``True``. In this case, + ``Request.headers`` is mapped as ``requestHeaders``. + +- You can set ``httpResponseBody`` to ``True`` and also set ``browserHtml`` + or ``screenshot`` to ``True``. In this case, ``Request.headers`` is mapped + both as ``customHttpRequestHeaders`` and as ``requestHeaders``, and + ``browserHtml`` is used as the Scrapy response body. - print(response.request.meta) - # { - # 'zyte_api': { - # 'browserHtml': True, - # 'geolocation': 'US', - # 'javascript': True, - # 'echoData': {'some_value_I_could_track': 123} - # }, - # 'download_timeout': 180.0, - # 'download_slot': 'quotes.toscrape.com' - # } -``ZYTE_API_DEFAULT_PARAMS`` does not make requests automatically go through -Zyte Data API. See :ref:`enabled`. +.. _transparent-mode: -Parameters in ``ZYTE_API_DEFAULT_PARAMS`` are merged with parameters set via -the ``zyte_api`` meta key, with the values in meta taking priority. +Using transparent mode +---------------------- + +Set the ``ZYTE_API_TRANSPARENT_MODE`` setting to ``True`` to handle Scrapy +requests as follows: + +- Requests with the ``zyte_api_automap`` request meta key set to ``False`` + are *not* sent through Zyte API. + +- Requests with the ``zyte_api`` request meta key set to a ``dict`` are sent + through Zyte API with :ref:`manually-defined parameters `. 
+ +- All other requests are sent through Zyte API with + :ref:`automatically-mapped parameters `. + + You do not need to set the ``zyte-api-automap`` request meta key to + ``True``, but you can set it to a dictionary to extend your request + parameters. + + +.. _default-params: + +Setting default parameters +========================== +Often the same configuration needs to be used for all Zyte API requests. For +example, all requests may need to set the same geolocation, or the spider only +uses ``browserHtml`` requests. -.. _enabled: +The following settings allow you to define Zyte API parameters to be included +in all requests: -Controlling which requests go through Zyte Data API ---------------------------------------------------- +- ``ZYTE_API_DEFAULT_PARAMS`` is a ``dict`` of parameters to be combined with + :ref:`manually-defined parameters `. -By default, only requests where the ``zyte_api`` key in Request.meta_ is set to -``True`` or set to a dictionary go through Zyte Data API. + You may set the ``zyte_api`` request meta key to an empty ``dict`` to only + use default parameters for that request. -.. _Request.meta: https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta +- ``ZYTE_API_AUTOMAP_PARAMS`` is a ``dict`` of parameters to be combined with + :ref:`automatically-mapped parameters `. -Set the ``ZYTE_API_ON_ALL_REQUESTS`` setting to ``True`` to make all requests -go through Zyte Data API unless the ``zyte_api`` key in Request.meta_ is set to -``False``. ``ZYTE_API_ON_ALL_REQUESTS`` is ``False`` by default. +For example, if you set ``ZYTE_API_DEFAULT_PARAMS`` to +``{"geolocation": "US"}`` and ``zyte_api`` to ``{"browserHtml": True}``, +``{"url: "…", "geolocation": "US", "browserHtml": True}`` is sent to Zyte API. -Zyte Data API requests need parameters. If you set the ``zyte_api`` key in -Request.meta_ or the ``ZYTE_API_ON_ALL_REQUESTS`` setting to ``True``, you must -also :ref:`set default parameters `. 
+Parameters in these settings are merged with request-specific parameters, with +request-specific parameters taking precedence. Customizing the retry policy ----------------------------- +============================ API requests are retried automatically using the default retry policy of `python-zyte-api`_. @@ -237,7 +301,7 @@ Scrapy settings must be picklable, which `retry policies are not policy objects directly to the ``ZYTE_API_RETRY_POLICY`` setting, and must use their import path string instead. -When setting a retry policy through request metadata, you can assign the +When setting a retry policy through request meta, you can assign the ``zyte_api_retry_policy`` request meta key either the retry policy object itself or its import path string. If you need your requests to be serializable, however, you may also need to use the import path string. @@ -283,7 +347,7 @@ subclass RetryFactory_ as follows: Stats ------ +===== Stats from python-zyte-api_ are exposed as Scrapy stats with the ``scrapy-zyte-api`` prefix. 
From 46f498e9e2adcb3c978da5598b50a38bf0c210e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 20 Sep 2022 14:06:41 +0200 Subject: [PATCH 44/81] =?UTF-8?q?Zyte=20Data=20API=20=E2=86=92=20Zyte=20AP?= =?UTF-8?q?I?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scrapy_zyte_api/handler.py | 47 +++++++++++++++++++------------------- tests/test_api_requests.py | 47 +++++++++++++++++++------------------- 2 files changed, 46 insertions(+), 48 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 94020602..200ddaa2 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -45,7 +45,7 @@ def _update_api_params_from_request_headers( headers = api_params.get("customHttpRequestHeaders") if headers is not None: logger.warning( - f"Request {request} defines the Zyte Data API " + f"Request {request} defines the Zyte API " f"customHttpRequestHeaders parameter, overriding " f"Request.headers. Use Request.headers instead." ) @@ -60,7 +60,7 @@ def _update_api_params_from_request_headers( if lowercase_k != b"user-agent" or v != DEFAULT_USER_AGENT: logger.warning( f"Request {request} defines header {k}, which " - f"cannot be mapped into the Zyte Data API " + f"cannot be mapped into the Zyte API " f"customHttpRequestHeaders parameter." ) continue @@ -74,9 +74,9 @@ def _update_api_params_from_request_headers( headers = api_params.get("requestHeaders") if headers is not None: logger.warning( - f"Request {request} defines the Zyte Data API " - f"requestHeaders parameter, overriding Request.headers. " - f"Use Request.headers instead." + f"Request {request} defines the Zyte API requestHeaders " + f"parameter, overriding Request.headers. Use Request.headers " + f"instead." 
) elif request.headers: request_headers = {} @@ -101,8 +101,8 @@ def _update_api_params_from_request_headers( ): logger.warning( f"Request {request} defines header {k}, which " - f"cannot be mapped into the Zyte Data API " - f"requestHeaders parameter." + f"cannot be mapped into the Zyte API requestHeaders " + f"parameter." ) if request_headers: api_params["requestHeaders"] = request_headers @@ -129,7 +129,7 @@ def _update_api_params_from_request( # NOQA ) elif api_params.get("httpResponseBody") is False: logging.warning( - f"Request {request} unnecessarily defines the Zyte Data API " + f"Request {request} unnecessarily defines the Zyte API " f"'httpResponseBody' parameter with its default value, False. " f"It will not be sent to the server." ) @@ -159,15 +159,15 @@ def _update_api_params_from_request( # NOQA method = api_params.get("httpRequestMethod") if method: logger.warning( - f"Request {request} uses the Zyte Data API httpRequestMethod " + f"Request {request} uses the Zyte API httpRequestMethod " f"parameter, overriding Request.method. Use Request.method " f"instead." ) if method != request.method: logger.warning( f"The HTTP method of request {request} ({request.method}) " - f"does not match the Zyte Data API httpRequestMethod " - f"parameter ({method})." + f"does not match the Zyte API httpRequestMethod parameter " + f"({method})." ) elif request.method != "GET": if response_body: @@ -176,7 +176,7 @@ def _update_api_params_from_request( # NOQA logger.warning( f"The HTTP method of request {request} ({request.method}) " f"is being ignored. The httpRequestMethod parameter of " - f"Zyte Data API can only be set when the httpResponseBody " + f"Zyte API can only be set when the httpResponseBody " f"parameter is True." ) @@ -190,16 +190,15 @@ def _update_api_params_from_request( # NOQA body = api_params.get("httpRequestBody") if body: logger.warning( - f"Request {request} uses the Zyte Data API httpRequestBody " - f"parameter, overriding Request.body. 
Use Request.body " - f"instead." + f"Request {request} uses the Zyte API httpRequestBody parameter, " + f"overriding Request.body. Use Request.body instead." ) decoded_body = b64decode(body) if decoded_body != request.body: logger.warning( f"The body of request {request} ({request.body!r}) " - f"does not match the Zyte Data API httpRequestBody " - f"parameter ({body!r}; decoded: {decoded_body!r})." + f"does not match the Zyte API httpRequestBody parameter " + f"({body!r}; decoded: {decoded_body!r})." ) elif request.body != b"": if response_body: @@ -209,7 +208,7 @@ def _update_api_params_from_request( # NOQA logger.warning( f"The body of request {request} ({request.body!r}) " f"is being ignored. The httpRequestBody parameter of " - f"Zyte Data API can only be set when the httpResponseBody " + f"Zyte API can only be set when the httpResponseBody " f"parameter is True." ) @@ -217,9 +216,9 @@ def _update_api_params_from_request( # NOQA if api_params.get(param) != default_value: continue logging.warning( - f"Request {request} unnecessarily defines the Zyte Data API " - f"{param!r} parameter with its default value, {default_value!r}. " - f"It will not be sent to the server." + f"Request {request} unnecessarily defines the Zyte API {param!r} " + f"parameter with its default value, {default_value!r}. It will " + f"not be sent to the server." 
) api_params.pop(param) @@ -236,9 +235,9 @@ def _get_api_params( browser_headers: Dict[str, str], job_id: Optional[str], ) -> Optional[dict]: - """Returns a dictionary of API parameters that must be sent to Zyte Data - API for the specified request, or None if the request should not be driven - through Zyte Data API.""" + """Returns a dictionary of API parameters that must be sent to Zyte API for + the specified request, or None if the request should not be sent through + Zyte API.""" meta_params = request.meta.get("zyte_api", use_api_by_default) if meta_params is False: return None diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 53c5a9a8..7602b74d 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -49,7 +49,7 @@ ) @ensureDeferred async def test_response_binary(meta: Dict[str, Dict[str, Any]], mockserver): - """Test that binary (i.e. non-text) responses from Zyte Data API are + """Test that binary (i.e. non-text) responses from Zyte API are successfully mapped to a subclass of Response that is not also a subclass of TextResponse. @@ -83,7 +83,7 @@ async def test_response_binary(meta: Dict[str, Dict[str, Any]], mockserver): ], ) async def test_response_html(meta: Dict[str, Dict[str, Any]], mockserver): - """Test that HTML responses from Zyte Data API are successfully mapped to a + """Test that HTML responses from Zyte API are successfully mapped to a subclass of TextResponse. 
Whether response headers are retrieved or not should have no impact on the @@ -132,8 +132,7 @@ async def test_enabled(setting, enabled, mockserver): @ensureDeferred async def test_coro_handling(zyte_api: bool, mockserver): """ScrapyZyteAPIDownloadHandler.download_request must return a deferred - both when using Zyte Data API and when using the regular downloader - logic.""" + both when using Zyte API and when using the regular downloader logic.""" settings = {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True}} async with mockserver.make_handler(settings) as handler: req = Request( @@ -183,7 +182,7 @@ async def test_exceptions( @ensureDeferred async def test_higher_concurrency(): """Make sure that CONCURRENT_REQUESTS and CONCURRENT_REQUESTS_PER_DOMAIN - have an effect on Zyte Data API requests.""" + have an effect on Zyte API requests.""" # Send DEFAULT_CLIENT_CONCURRENCY + 1 requests, the last one taking less # time than the rest, and ensure that the first response comes from the # last request, verifying that a concurrency ≥ DEFAULT_CLIENT_CONCURRENCY @@ -348,11 +347,11 @@ async def test_get_api_params_output_side_effects(output, uses_zyte_api, mockser def test_api_toggling(setting, meta, expected): """Test how the value of the ``ZYTE_API_ON_ALL_REQUESTS`` setting (*setting*) in combination with request metadata (*meta*) determines what - Zyte Data API parameters are used (*expected*). + Zyte API parameters are used (*expected*). Note that :func:`test_get_api_params_output_side_effects` already tests how - *expected* affects whether the request is sent through Zyte Data API or - not, and :func:`test_get_api_params_input_custom` tests how the + *expected* affects whether the request is sent through Zyte API or not, + and :func:`test_get_api_params_input_custom` tests how the ``ZYTE_API_ON_ALL_REQUESTS`` setting is mapped to the corresponding :func:`~scrapy_zyte_api.handler._get_api_params` parameter. 
""" @@ -371,7 +370,7 @@ def test_api_toggling(setting, meta, expected): @pytest.mark.parametrize("meta", [None, 0, "", b"", []]) def test_api_disabling_deprecated(setting, meta): """Test how undocumented falsy values of the ``zyte_api`` request metadata - key (*meta*) can be used to disable the use of Zyte Data API, but trigger a + key (*meta*) can be used to disable the use of Zyte API, but trigger a deprecation warning asking to replace them with False.""" request = Request(url="https://example.com") request.meta["zyte_api"] = meta @@ -402,7 +401,7 @@ def test_bad_meta_type(meta): @ensureDeferred async def test_job_id(mockserver): """Test how the value of the ``JOB`` setting (*setting*) is included as - ``jobId`` among the parameters sent to Zyte Data API. + ``jobId`` among the parameters sent to Zyte API. Note that :func:`test_get_api_params_input_custom` already tests how the ``JOB`` setting is mapped to the corresponding @@ -468,8 +467,9 @@ async def test_default_params_none(mockserver, caplog): ], ) def test_default_params_merging(setting, meta, expected, warnings, caplog): - """Test how Zyte Data API parameters defined in the - ``ZYTE_API_DEFAULT_PARAMS`` setting (*setting*) and those defined in the ``zyte_api`` request metadata key (*meta*) are combined. + """Test how Zyte API parameters defined in the ``ZYTE_API_DEFAULT_PARAMS`` + setting (*setting*) and those defined in the ``zyte_api`` request metadata + key (*meta*) are combined. Request metadata takes precedence. Also, ``None`` values in request metadata can be used to unset parameters defined in the setting. 
Request @@ -519,7 +519,7 @@ def test_default_params_merging(setting, meta, expected, warnings, caplog): ], ) def test_default_params_immutability(setting, meta): - """Make sure that the merging of Zyte Data API parameters from the + """Make sure that the merging of Zyte API parameters from the ``ZYTE_API_DEFAULT_PARAMS`` setting (*setting*) with those from the ``zyte_api`` request metadata key (*meta*) does not affect the contents of the setting for later requests.""" @@ -804,7 +804,7 @@ def test_automap_header_output(meta, expected, warnings, caplog): ), # Other HTTP methods, regardless of whether they are supported, # unsupported, or unknown, are mapped as httpRequestMethod, letting - # Zyte Data API decide whether or not they are allowed. + # Zyte API decide whether or not they are allowed. *( ( method, @@ -861,7 +861,7 @@ def test_automap_header_output(meta, expected, warnings, caplog): }, [ "Use Request.method", - "does not match the Zyte Data API httpRequestMethod", + "does not match the Zyte API httpRequestMethod", ], ) for request_method, meta_method in ( @@ -986,8 +986,8 @@ def test_automap_method(method, meta, expected, warnings, caplog): # While future main output parameters are likely to use requestHeaders # instead, we cannot known if an unknown parameter is a main output # parameter or a different type of parameter for httpRequestBody, and - # what we know for sure is that, at the time of writing, Zyte Data API - # does not allow requestHeaders to be combined with httpRequestBody. + # what we know for sure is that, at the time of writing, Zyte API does + # not allow requestHeaders to be combined with httpRequestBody. ( {"Referer": "a"}, {"unknownMainOutput": True}, @@ -1187,10 +1187,10 @@ def test_automap_method(method, meta, expected, warnings, caplog): # if browserHtml is True and customHttpRequestHeaders is defined in # meta, keep the meta parameters and do not issue a warning. 
There is # no need for a warning because the request should get an error - # response from Zyte Data API. And if Zyte Data API were not to send an - # error response, that would mean the Zyte Data API has started - # supporting this scenario, all the more reason not to warn and let the - # parameters reach Zyte Data API. + # response from Zyte API. And if Zyte API were not to send an error + # response, that would mean the Zyte API has started supporting this + # scenario, all the more reason not to warn and let the parameters + # reach Zyte API. ( {}, { @@ -1422,7 +1422,7 @@ def test_automap_header_settings( }, [ "Use Request.body instead", - "does not match the Zyte Data API httpRequestBody parameter", + "does not match the Zyte API httpRequestBody parameter", ], ), # httpRequestBody defined in meta causes a warning even if it matches @@ -1466,8 +1466,7 @@ def test_automap_body(body, meta, expected, warnings, caplog): [ # When httpResponseBody, browserHtml, screenshot, or # httpResponseHeaders, are unnecessarily set to False, they are not - # defined in the parameters sent to Zyte Data API, and a warning is - # logged. + # defined in the parameters sent to Zyte API, and a warning is logged. ( { "browserHtml": True, From 2c28cac7d543150a75227df6b200e99192d84043 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 20 Sep 2022 14:13:51 +0200 Subject: [PATCH 45/81] README: do not use Sphinx references --- README.rst | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/README.rst b/README.rst index cf82c8b9..acbf6d33 100644 --- a/README.rst +++ b/README.rst @@ -49,7 +49,7 @@ in the ``settings.py`` of your Scrapy project. You also need to set the ``ZYTE_API_KEY``. Lastly, make sure to `install the asyncio-based Twisted reactor -`_ +`_ in the ``settings.py`` file as well. 
Here's an example of the things needed inside a Scrapy project's ``settings.py`` file: @@ -75,14 +75,13 @@ Usage You can send a request through Zyte API in one of the following ways: -- Setting all Zyte API parameters :ref:`manually `, keeping full - control of what is sent to Zyte API. +- Setting all Zyte API parameters manually, keeping full control of what is + sent to Zyte API. See **Sending requests with manually-defined parameters** + below. -- Letting Zyte API parameters be chosen :ref:`automatically ` based - on your Scrapy request parameters where possible. - - You can :ref:`make this the default behavior for all requests - `. +- Letting Zyte API parameters be chosen automatically based on your Scrapy + request parameters where possible. See **Sending requests with + automatically-mapped parameters** below. The raw Zyte API response can be accessed via the ``raw_api_response`` attribute of the response object. @@ -98,8 +97,6 @@ If multiple requests target the same URL with different Zyte API parameters, pass ``dont_filter=True`` to ``Request``. -.. _manual: - Sending requests with manually-defined parameters ------------------------------------------------- @@ -144,8 +141,6 @@ See the `Zyte API documentation`_ to learn about Zyte API parameters. .. _Zyte API documentation: https://docs.zyte.com/zyte-api/get-started.html -.. _automap: - Sending requests with automatically-mapped parameters ----------------------------------------------------- @@ -153,7 +148,7 @@ To send a Scrapy request through Zyte API letting Zyte API parameters be automatically chosen based on the parameters of that Scrapy request, set the ``zyte_api_automap`` key in `Request.meta `_ -to ``True``. See also :ref:`transparent-mode`. +to ``True``. See also **Using transparent mode** below. 
Automated parameter mapping chooses Zyte API parameters as follows by default: @@ -230,8 +225,6 @@ Zyte API does not currently support, and may never support: ``browserHtml`` is used as the Scrapy response body. -.. _transparent-mode: - Using transparent mode ---------------------- @@ -242,18 +235,18 @@ requests as follows: are *not* sent through Zyte API. - Requests with the ``zyte_api`` request meta key set to a ``dict`` are sent - through Zyte API with :ref:`manually-defined parameters `. + through Zyte API with manually-defined parameters. See **Sending requests + with manually-defined parameters** above. -- All other requests are sent through Zyte API with - :ref:`automatically-mapped parameters `. +- All other requests are sent through Zyte API with automatically-mapped + parameters. See **Sending requests with automatically-mapped parameters** + above. You do not need to set the ``zyte-api-automap`` request meta key to ``True``, but you can set it to a dictionary to extend your request parameters. -.. _default-params: - Setting default parameters ========================== @@ -265,13 +258,15 @@ The following settings allow you to define Zyte API parameters to be included in all requests: - ``ZYTE_API_DEFAULT_PARAMS`` is a ``dict`` of parameters to be combined with - :ref:`manually-defined parameters `. + manually-defined parameters. See **Sending requests with manually-defined + parameters** above. You may set the ``zyte_api`` request meta key to an empty ``dict`` to only use default parameters for that request. - ``ZYTE_API_AUTOMAP_PARAMS`` is a ``dict`` of parameters to be combined with - :ref:`automatically-mapped parameters `. + automatically-mapped parameters. See **Sending requests with + automatically-mapped parameters** above. 
For example, if you set ``ZYTE_API_DEFAULT_PARAMS`` to ``{"geolocation": "US"}`` and ``zyte_api`` to ``{"browserHtml": True}``, From 3241a4cb2a733659c5a12bebc0adcdd87e516c96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 20 Sep 2022 14:14:52 +0200 Subject: [PATCH 46/81] =?UTF-8?q?Fix=20typo:=20Rerquest=20=E2=86=92=20Requ?= =?UTF-8?q?est?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index acbf6d33..f9bb0aad 100644 --- a/README.rst +++ b/README.rst @@ -154,15 +154,15 @@ Automated parameter mapping chooses Zyte API parameters as follows by default: - ``httpResponseBody`` and ``httpResponseHeaders`` are set to ``True``. -- ``Rerquest.url`` becomes ``url``, same as in requests with manually-defined +- ``Request.url`` becomes ``url``, same as in requests with manually-defined parameters. -- If ``Rerquest.method`` is something other than ``"GET"``, it becomes +- If ``Request.method`` is something other than ``"GET"``, it becomes ``httpRequestMethod``. -- ``Rerquest.headers`` become ``customHttpRequestHeaders``. +- ``Request.headers`` become ``customHttpRequestHeaders``. -- ``Rerquest.body`` is base64-encoded as ``httpRequestBody``. +- ``Request.body`` is base64-encoded as ``httpRequestBody``. 
Instead of setting ``zyte_api_automap`` to ``True``, you may set it to a ``dict`` of Zyte API parameters to extend or override choices made by automated @@ -200,7 +200,7 @@ To maximize support for potential future changes in Zyte API, automated parameter mapping allows some parameter values and parameter combinations that Zyte API does not currently support, and may never support: -- ``Rerquest.method`` becomes ``httpRequestMethod`` even for unsupported_ +- ``Request.method`` becomes ``httpRequestMethod`` even for unsupported_ ``httpRequestMethod`` values, and even if ``httpResponseBody`` is unset. .. _unsupported: https://docs.zyte.com/zyte-api/usage/extract.html#zyte-api-set-method @@ -212,7 +212,7 @@ Zyte API does not currently support, and may never support: Conversely, you can set ``customHttpRequestHeaders`` or ``requestHeaders`` to ``False`` to prevent their mapping from ``Request.headers``. -- ``Rerquest.body`` becomes ``httpRequestBody`` even if ``httpResponseBody`` +- ``Request.body`` becomes ``httpRequestBody`` even if ``httpResponseBody`` is unset. - You can set ``httpResponseBody`` to ``False`` (which unsets the parameter), From a89efbbc3f4071919a01dee850bc0db78baeec0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 20 Sep 2022 14:15:41 +0200 Subject: [PATCH 47/81] Remove unneeded detail --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index f9bb0aad..e1f2cb3a 100644 --- a/README.rst +++ b/README.rst @@ -162,7 +162,7 @@ Automated parameter mapping chooses Zyte API parameters as follows by default: - ``Request.headers`` become ``customHttpRequestHeaders``. -- ``Request.body`` is base64-encoded as ``httpRequestBody``. +- ``Request.body`` becomes ``httpRequestBody``. 
Instead of setting ``zyte_api_automap`` to ``True``, you may set it to a ``dict`` of Zyte API parameters to extend or override choices made by automated From 75918bc5e1ed3a25c43b0bb66ec2bcc70a019f87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 20 Sep 2022 14:19:56 +0200 Subject: [PATCH 48/81] Clarify when to use header settings --- README.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index e1f2cb3a..1eb65652 100644 --- a/README.rst +++ b/README.rst @@ -176,9 +176,10 @@ mapping as a side effect: - Setting ``screenshot`` to ``True`` without also setting ``browserHtml`` to ``True`` unsets ``httpResponseHeaders``. -When mapping headers, unsupported headers are excluded from the mapping. Use -the following settings to change which headers are mapped and how they are -mapped: +When mapping headers, unsupported headers are excluded from the mapping. If +Zyte API header support changes in the future, and you cannot upgrade to a +version of scrapy-zyte-api that includes such changes, use the following +settings to change which headers are mapped and how they are mapped: - ``ZYTE_API_UNSUPPORTED_HEADERS`` determines headers that *cannot* be mapped as ``customHttpRequestHeaders``, and its default value is: From 15d89d2d8011b9d31f166691bd51fb02f5ffaf0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 20 Sep 2022 18:18:49 +0200 Subject: [PATCH 49/81] =?UTF-8?q?test=5Fapi=5Ftoggling=20=E2=86=92=20test?= =?UTF-8?q?=5Ftransparent=5Fmode=5Ftoggling?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scrapy_zyte_api/handler.py | 140 +++++++++++++++++++++++++++---------- tests/test_api_requests.py | 97 ++++++++++++++++++++----- 2 files changed, 182 insertions(+), 55 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 200ddaa2..03be4a31 100644 --- a/scrapy_zyte_api/handler.py +++ 
b/scrapy_zyte_api/handler.py @@ -114,7 +114,6 @@ def _update_api_params_from_request( # NOQA *, unsupported_headers: Set[str], browser_headers: Dict[str, str], - default_params: Dict[str, Any], ): if not any( api_params.get(k) for k in ("httpResponseBody", "browserHtml", "screenshot") @@ -225,20 +224,12 @@ def _update_api_params_from_request( # NOQA return api_params -def _get_api_params( +def _get_raw_params( request: Request, *, - use_api_by_default: bool, - automap_by_default: bool, default_params: Dict[str, Any], - unsupported_headers: Set[str], - browser_headers: Dict[str, str], - job_id: Optional[str], -) -> Optional[dict]: - """Returns a dictionary of API parameters that must be sent to Zyte API for - the specified request, or None if the request should not be sent through - Zyte API.""" - meta_params = request.meta.get("zyte_api", use_api_by_default) +): + meta_params = request.meta.get("zyte_api", False) if meta_params is False: return None @@ -253,19 +244,18 @@ def _get_api_params( if meta_params is True: meta_params = {} elif not isinstance(meta_params, Mapping): - logger.error( + raise ValueError( f"'zyte_api' parameters in the request meta should be provided as " f"a dictionary, got {type(meta_params)} instead in {request}." ) - raise ValueError("The value of the 'zyte_api' meta key of ") - api_params = copy(default_params) + params = copy(default_params) for k in list(meta_params): if meta_params[k] is not None: continue meta_params.pop(k) - if k in api_params: - api_params.pop(k) + if k in params: + params.pop(k) else: logger.warning( f"In request {request} 'zyte_api' parameter {k} is None, " @@ -273,16 +263,88 @@ def _get_api_params( f"the ZYTE_API_DEFAULT_PARAMS setting, but the setting does " f"not define such a parameter." 
) - api_params.update(meta_params) + params.update(meta_params) + + return params + + +def _get_automap_params( + request: Request, + *, + default_enabled: bool, + default_params: Dict[str, Any], + unsupported_headers: Set[str], + browser_headers: Dict[str, str], +): + meta_params = request.meta.get("zyte_api_automap", default_enabled) + if meta_params is False: + return None + + if meta_params is True: + meta_params = {} + elif not isinstance(meta_params, Mapping): + raise ValueError( + f"'zyte_api_automap' parameters in the request meta should be " + f"provided as a dictionary, got {type(meta_params)} instead in " + f"{request}." + ) + + params = copy(default_params) + + _update_api_params_from_request( + params, + request, + unsupported_headers=unsupported_headers, + browser_headers=browser_headers, + ) + + for k in list(meta_params): + if meta_params[k] is not None: + continue + meta_params.pop(k) + if k in params: + params.pop(k) + else: + logger.warning( + f"In request {request} 'zyte_api_automap' parameter {k} is " + f"None, which is a value reserved to unset parameters defined " + f"in the ZYTE_API_AUTOMAP_PARAMS setting, but the setting " + f"does not define such a parameter." 
+ ) + params.update(meta_params) + + return params - if request.meta.get("zyte_api_automap", automap_by_default): - _update_api_params_from_request( - api_params, + +def _get_api_params( + request: Request, + *, + default_params: Dict[str, Any], + transparent_mode: bool, + automap_params: Dict[str, Any], + unsupported_headers: Set[str], + browser_headers: Dict[str, str], + job_id: Optional[str], +) -> Optional[dict]: + """Returns a dictionary of API parameters that must be sent to Zyte API for + the specified request, or None if the request should not be sent through + Zyte API.""" + api_params = _get_raw_params(request, default_params=default_params) + if api_params is None: + api_params = _get_automap_params( request, + default_enabled=transparent_mode, + default_params=automap_params, unsupported_headers=unsupported_headers, browser_headers=browser_headers, - default_params=default_params, ) + elif request.meta.get("zyte_api_automap", False) is not False: + raise ValueError( + f"Request {request} combines manually-defined parameters and " + f"automatically-mapped parameters." + ) + if api_params is None: + return None if job_id is not None: api_params["jobId"] = job_id @@ -290,6 +352,19 @@ def _get_api_params( return api_params +def _load_default_params(settings, setting): + params = settings.getdict(setting) + for param in list(params): + if params[param] is not None: + continue + logger.warning( + f"Parameter {param!r} in the {setting} setting is None. Default " + f"parameters should never be None." 
+ ) + params.pop(param) + return params + + class ScrapyZyteAPIDownloadHandler(HTTPDownloadHandler): def __init__( self, settings: Settings, crawler: Crawler, client: AsyncClient = None @@ -323,22 +398,13 @@ def __init__( ) self._stats = crawler.stats self._job_id = crawler.settings.get("JOB") - self._zyte_api_default_params = settings.getdict("ZYTE_API_DEFAULT_PARAMS") - for param in list(self._zyte_api_default_params): - if self._zyte_api_default_params[param] is not None: - continue - logger.warning( - f"Parameter {param!r} in the ZYTE_API_DEFAULT_PARAMS " - f"setting is None. Default parameters should never be " - f"None." - ) - self._zyte_api_default_params.pop(param) + self._default_params = _load_default_params(settings, "ZYTE_API_DEFAULT_PARAMS") + self._automap_params = _load_default_params(settings, "ZYTE_API_AUTOMAP_PARAMS") self._session = create_session(connection_pool_size=self._client.n_conn) self._retry_policy = settings.get("ZYTE_API_RETRY_POLICY") if self._retry_policy: self._retry_policy = load_object(self._retry_policy) - self._on_all_requests = settings.getbool("ZYTE_API_ON_ALL_REQUESTS") - self._automap = settings.getbool("ZYTE_API_AUTOMAP", False) + self._transparent_mode = settings.getbool("ZYTE_API_TRANSPARENT_MODE", False) self._unsupported_headers = { header.strip().lower().encode() for header in settings.getlist( @@ -357,9 +423,9 @@ def __init__( def download_request(self, request: Request, spider: Spider) -> Deferred: api_params = _get_api_params( request, - use_api_by_default=self._on_all_requests, - automap_by_default=self._automap, - default_params=self._zyte_api_default_params, + default_params=self._default_params, + transparent_mode=self._transparent_mode, + automap_params=self._automap_params, unsupported_headers=self._unsupported_headers, browser_headers=self._browser_headers, job_id=self._job_id, diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 7602b74d..6c9e5560 100644 --- a/tests/test_api_requests.py 
+++ b/tests/test_api_requests.py @@ -1,6 +1,8 @@ import sys from asyncio import iscoroutine from copy import copy +from functools import partial +from inspect import isclass from typing import Any, Dict from unittest import mock from unittest.mock import patch @@ -234,16 +236,16 @@ async def parse(self, response): assert response_indexes[0] == expected_first_index -AUTOMAP_BY_DEFAULT = False +AUTOMAP_PARAMS: Dict[str, Any] = {} BROWSER_HEADERS = {b"referer": "referer"} DEFAULT_PARAMS: Dict[str, Any] = {} +TRANSPARENT_MODE = False UNSUPPORTED_HEADERS = {b"cookie", b"user-agent"} -USE_API_BY_DEFAULT = False JOB_ID = None GET_API_PARAMS_KWARGS = { - "use_api_by_default": USE_API_BY_DEFAULT, - "automap_by_default": AUTOMAP_BY_DEFAULT, "default_params": DEFAULT_PARAMS, + "transparent_mode": TRANSPARENT_MODE, + "automap_params": AUTOMAP_PARAMS, "unsupported_headers": UNSUPPORTED_HEADERS, "browser_headers": BROWSER_HEADERS, "job_id": JOB_ID, @@ -270,10 +272,10 @@ async def test_get_api_params_input_custom(mockserver): request = Request(url="https://example.com") settings = { "JOB": "1/2/3", - "ZYTE_API_AUTOMAP": False, + "ZYTE_API_TRANSPARENT_MODE": True, "ZYTE_API_BROWSER_HEADERS": {"B": "b"}, "ZYTE_API_DEFAULT_PARAMS": {"a": "b"}, - "ZYTE_API_ON_ALL_REQUESTS": True, + "ZYTE_API_AUTOMAP_PARAMS": {"c": "d"}, "ZYTE_API_UNSUPPORTED_HEADERS": {"A"}, } async with mockserver.make_handler(settings) as handler: @@ -284,9 +286,9 @@ async def test_get_api_params_input_custom(mockserver): await handler.download_request(request, None) _get_api_params.assert_called_once_with( request, - use_api_by_default=True, - automap_by_default=False, default_params={"a": "b"}, + transparent_mode=True, + automap_params={"c": "d"}, unsupported_headers={b"a"}, browser_headers={b"b": "b"}, job_id="1/2/3", @@ -325,6 +327,12 @@ async def test_get_api_params_output_side_effects(output, uses_zyte_api, mockser super_mock.download_request.assert_called() +DEFAULT_AUTOMAP_PARAMS: Dict[str, Any] = { + 
"httpResponseBody": True, + "httpResponseHeaders": True, +} + + @pytest.mark.parametrize( "setting,meta,expected", [ @@ -335,35 +343,88 @@ async def test_get_api_params_output_side_effects(output, uses_zyte_api, mockser (False, {"zyte_api": True}, {}), (False, {"zyte_api": {}}, {}), (False, {"zyte_api": {"a": "b"}}, {"a": "b"}), - (True, None, {}), - (True, {}, {}), - (True, {"a": "b"}, {}), - (True, {"zyte_api": False}, None), + (False, {"zyte_api_automap": False}, None), + (False, {"zyte_api_automap": True}, DEFAULT_AUTOMAP_PARAMS), + (False, {"zyte_api_automap": {}}, DEFAULT_AUTOMAP_PARAMS), + (False, {"zyte_api_automap": {"a": "b"}}, {**DEFAULT_AUTOMAP_PARAMS, "a": "b"}), + (False, {"zyte_api": False, "zyte_api_automap": False}, None), + (False, {"zyte_api": False, "zyte_api_automap": True}, DEFAULT_AUTOMAP_PARAMS), + (False, {"zyte_api": False, "zyte_api_automap": {}}, DEFAULT_AUTOMAP_PARAMS), + ( + False, + {"zyte_api": False, "zyte_api_automap": {"a": "b"}}, + {**DEFAULT_AUTOMAP_PARAMS, "a": "b"}, + ), + (False, {"zyte_api": True, "zyte_api_automap": False}, {}), + (False, {"zyte_api": True, "zyte_api_automap": True}, ValueError), + (False, {"zyte_api": True, "zyte_api_automap": {}}, ValueError), + (False, {"zyte_api": True, "zyte_api_automap": {"a": "b"}}, ValueError), + (False, {"zyte_api": {}, "zyte_api_automap": False}, {}), + (False, {"zyte_api": {}, "zyte_api_automap": True}, ValueError), + (False, {"zyte_api": {}, "zyte_api_automap": {}}, ValueError), + (False, {"zyte_api": {}, "zyte_api_automap": {"a": "b"}}, ValueError), + (False, {"zyte_api": {"a": "b"}, "zyte_api_automap": False}, {"a": "b"}), + (False, {"zyte_api": {"a": "b"}, "zyte_api_automap": True}, ValueError), + (False, {"zyte_api": {"a": "b"}, "zyte_api_automap": {}}, ValueError), + (False, {"zyte_api": {"a": "b"}, "zyte_api_automap": {"a": "b"}}, ValueError), + (True, None, DEFAULT_AUTOMAP_PARAMS), + (True, {}, DEFAULT_AUTOMAP_PARAMS), + (True, {"a": "b"}, DEFAULT_AUTOMAP_PARAMS), + 
(True, {"zyte_api": False}, DEFAULT_AUTOMAP_PARAMS), (True, {"zyte_api": True}, {}), (True, {"zyte_api": {}}, {}), (True, {"zyte_api": {"a": "b"}}, {"a": "b"}), + (True, {"zyte_api_automap": False}, None), + (True, {"zyte_api_automap": True}, DEFAULT_AUTOMAP_PARAMS), + (True, {"zyte_api_automap": {}}, DEFAULT_AUTOMAP_PARAMS), + (True, {"zyte_api_automap": {"a": "b"}}, {**DEFAULT_AUTOMAP_PARAMS, "a": "b"}), + (True, {"zyte_api": False, "zyte_api_automap": False}, None), + (True, {"zyte_api": False, "zyte_api_automap": True}, DEFAULT_AUTOMAP_PARAMS), + (True, {"zyte_api": False, "zyte_api_automap": {}}, DEFAULT_AUTOMAP_PARAMS), + ( + True, + {"zyte_api": False, "zyte_api_automap": {"a": "b"}}, + {**DEFAULT_AUTOMAP_PARAMS, "a": "b"}, + ), + (True, {"zyte_api": True, "zyte_api_automap": False}, {}), + (True, {"zyte_api": True, "zyte_api_automap": True}, ValueError), + (True, {"zyte_api": True, "zyte_api_automap": {}}, ValueError), + (True, {"zyte_api": True, "zyte_api_automap": {"a": "b"}}, ValueError), + (True, {"zyte_api": {}, "zyte_api_automap": False}, {}), + (True, {"zyte_api": {}, "zyte_api_automap": True}, ValueError), + (True, {"zyte_api": {}, "zyte_api_automap": {}}, ValueError), + (True, {"zyte_api": {}, "zyte_api_automap": {"a": "b"}}, ValueError), + (True, {"zyte_api": {"a": "b"}, "zyte_api_automap": False}, {"a": "b"}), + (True, {"zyte_api": {"a": "b"}, "zyte_api_automap": True}, ValueError), + (True, {"zyte_api": {"a": "b"}, "zyte_api_automap": {}}, ValueError), + (True, {"zyte_api": {"a": "b"}, "zyte_api_automap": {"a": "b"}}, ValueError), ], ) -def test_api_toggling(setting, meta, expected): - """Test how the value of the ``ZYTE_API_ON_ALL_REQUESTS`` setting +def test_transparent_mode_toggling(setting, meta, expected): + """Test how the value of the ``ZYTE_API_TRANSPARENT_MODE`` setting (*setting*) in combination with request metadata (*meta*) determines what Zyte API parameters are used (*expected*). 
Note that :func:`test_get_api_params_output_side_effects` already tests how *expected* affects whether the request is sent through Zyte API or not, and :func:`test_get_api_params_input_custom` tests how the - ``ZYTE_API_ON_ALL_REQUESTS`` setting is mapped to the corresponding + ``ZYTE_API_TRANSPARENT_MODE`` setting is mapped to the corresponding :func:`~scrapy_zyte_api.handler._get_api_params` parameter. """ request = Request(url="https://example.com", meta=meta) - api_params = _get_api_params( + func = partial( + _get_api_params, request, **{ **GET_API_PARAMS_KWARGS, - "use_api_by_default": setting, + "transparent_mode": setting, }, ) - assert api_params == expected + if isclass(expected): + with pytest.raises(expected): + func() + else: + assert func() == expected @pytest.mark.parametrize("setting", [False, True]) From 24920363472da8bd284857f597b1900781d1da51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 20 Sep 2022 19:15:04 +0200 Subject: [PATCH 50/81] Update some tests --- scrapy_zyte_api/handler.py | 4 +++ tests/test_api_requests.py | 73 ++++++++++++++++++++++++-------------- 2 files changed, 50 insertions(+), 27 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 03be4a31..d101010f 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -248,6 +248,8 @@ def _get_raw_params( f"'zyte_api' parameters in the request meta should be provided as " f"a dictionary, got {type(meta_params)} instead in {request}." ) + else: + meta_params = copy(meta_params) params = copy(default_params) for k in list(meta_params): @@ -288,6 +290,8 @@ def _get_automap_params( f"provided as a dictionary, got {type(meta_params)} instead in " f"{request}." 
) + else: + meta_params = copy(meta_params) params = copy(default_params) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 6c9e5560..7c41980f 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -427,9 +427,8 @@ def test_transparent_mode_toggling(setting, meta, expected): assert func() == expected -@pytest.mark.parametrize("setting", [False, True]) @pytest.mark.parametrize("meta", [None, 0, "", b"", []]) -def test_api_disabling_deprecated(setting, meta): +def test_api_disabling_deprecated(meta): """Test how undocumented falsy values of the ``zyte_api`` request metadata key (*meta*) can be used to disable the use of Zyte API, but trigger a deprecation warning asking to replace them with False.""" @@ -438,20 +437,18 @@ def test_api_disabling_deprecated(setting, meta): with pytest.warns(DeprecationWarning, match=r".* Use False instead\.$"): api_params = _get_api_params( request, - **{ - **GET_API_PARAMS_KWARGS, - "use_api_by_default": setting, - }, + **GET_API_PARAMS_KWARGS, ) assert api_params is None -@pytest.mark.parametrize("meta", [1, ["a", "b"]]) -def test_bad_meta_type(meta): - """Test how undocumented truthy values for the ``zyte_api`` request - metadata key (*meta*) trigger a :exc:`ValueError` exception.""" - request = Request(url="https://example.com") - request.meta["zyte_api"] = meta +@pytest.mark.parametrize("key", ["zyte_api", "zyte_api_automap"]) +@pytest.mark.parametrize("value", [1, ["a", "b"]]) +def test_bad_meta_type(key, value): + """Test how undocumented truthy values (*value*) for the ``zyte_api`` and + ``zyte_api_automap`` request metadata keys (*key*) trigger a + :exc:`ValueError` exception.""" + request = Request(url="https://example.com", meta={key: value}) with pytest.raises(ValueError): _get_api_params( request, @@ -459,16 +456,18 @@ def test_bad_meta_type(meta): ) +@pytest.mark.parametrize("meta", ["zyte_api", "zyte_api_automap"]) @ensureDeferred -async def test_job_id(mockserver): - """Test 
how the value of the ``JOB`` setting (*setting*) is included as - ``jobId`` among the parameters sent to Zyte API. +async def test_job_id(meta, mockserver): + """Test how the value of the ``JOB`` setting is included as ``jobId`` among + the parameters sent to Zyte API, both with manually-defined parameters and + with automatically-mapped parameters. Note that :func:`test_get_api_params_input_custom` already tests how the ``JOB`` setting is mapped to the corresponding :func:`~scrapy_zyte_api.handler._get_api_params` parameter. """ - request = Request(url="https://example.com", meta={"zyte_api": True}) + request = Request(url="https://example.com", meta={meta: True}) api_params = _get_api_params( request, **{ @@ -482,20 +481,21 @@ async def test_job_id(mockserver): @ensureDeferred async def test_default_params_none(mockserver, caplog): """Test how setting a value to ``None`` in the dictionary of the - ZYTE_API_DEFAULT_PARAMS setting causes a warning, because that is not - expected to be a valid value. + ZYTE_API_DEFAULT_PARAMS and ZYTE_API_AUTOMAP_PARAMS settings causes a + warning, because that is not expected to be a valid value. Note that ``None`` is however a valid value for parameters defined in the - ``zyte_api`` request metadata key. It can be used to unset parameters set - in the ``ZYTE_API_DEFAULT_PARAMS`` setting for that specific request. + ``zyte_api`` and ``zyte_api_automap`` request metadata keys. It can be used + to unset parameters set in those settings for a specific request. Also note that :func:`test_get_api_params_input_custom` already tests how - the ``ZYTE_API_DEFAULT_PARAMS`` setting is mapped to the corresponding + the settings are mapped to the corresponding :func:`~scrapy_zyte_api.handler._get_api_params` parameter. 
""" request = Request(url="https://example.com") settings = { "ZYTE_API_DEFAULT_PARAMS": {"a": None, "b": "c"}, + "ZYTE_API_AUTOMAP_PARAMS": {"d": None, "e": "f"}, } with caplog.at_level("WARNING"): async with mockserver.make_handler(settings) as handler: @@ -509,9 +509,11 @@ async def test_default_params_none(mockserver, caplog): **{ **GET_API_PARAMS_KWARGS, "default_params": {"b": "c"}, + "automap_params": {"e": "f"}, }, ) assert "Parameter 'a' in the ZYTE_API_DEFAULT_PARAMS setting is None" in caplog.text + assert "Parameter 'd' in the ZYTE_API_AUTOMAP_PARAMS setting is None" in caplog.text @pytest.mark.parametrize( @@ -527,30 +529,47 @@ async def test_default_params_none(mockserver, caplog): ({"a": 1}, {"a": None}, {}, []), ], ) -def test_default_params_merging(setting, meta, expected, warnings, caplog): - """Test how Zyte API parameters defined in the ``ZYTE_API_DEFAULT_PARAMS`` - setting (*setting*) and those defined in the ``zyte_api`` request metadata - key (*meta*) are combined. +@pytest.mark.parametrize( + "arg_key,meta_key,ignore_keys", + [ + ("default_params", "zyte_api", set()), + ( + "automap_params", + "zyte_api_automap", + {"httpResponseBody", "httpResponseHeaders"}, + ), + ], +) +def test_default_params_merging( + arg_key, meta_key, ignore_keys, setting, meta, expected, warnings, caplog +): + """Test how Zyte API parameters defined in the *arg_key* setting and those + defined in the *meta_key* request metadata key are combined. Request metadata takes precedence. Also, ``None`` values in request metadata can be used to unset parameters defined in the setting. Request metadata ``None`` values for keys that do not exist in the setting cause a warning. + This test also makes sure that, when `None` is used to unset a parameter, + the original request metadata key value is not modified. 
+ Note that :func:`test_get_api_params_input_custom` already tests how the ``ZYTE_API_DEFAULT_PARAMS`` setting is mapped to the corresponding :func:`~scrapy_zyte_api.handler._get_api_params` parameter. """ request = Request(url="https://example.com") - request.meta["zyte_api"] = meta + request.meta[meta_key] = meta with caplog.at_level("WARNING"): api_params = _get_api_params( request, **{ **GET_API_PARAMS_KWARGS, - "default_params": setting, + arg_key: setting, }, ) + for key in ignore_keys: + api_params.pop(key) assert api_params == expected if warnings: for warning in warnings: From 8e9113f3a90d35c0071fc995fd4e258922b1ebd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 20 Sep 2022 20:25:47 +0200 Subject: [PATCH 51/81] Complete test update --- scrapy_zyte_api/handler.py | 41 +++++++++++++------- tests/test_api_requests.py | 77 +++++++++++++------------------------- 2 files changed, 52 insertions(+), 66 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index d101010f..c816eb52 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -112,6 +112,8 @@ def _update_api_params_from_request( # NOQA api_params: Dict[str, Any], request: Request, *, + default_params: Dict[str, Any], + meta_params: Dict[str, Any], unsupported_headers: Set[str], browser_headers: Dict[str, str], ): @@ -137,7 +139,11 @@ def _update_api_params_from_request( # NOQA api_params.pop("httpResponseBody") if any(api_params.get(k) for k in ("httpResponseBody", "browserHtml")): - if api_params.get("httpResponseHeaders") is True: + if api_params.get("httpResponseHeaders") is True and not ( + default_params.get("httpResponseHeaders") is True + and "httpResponseHeaders" not in meta_params + ): + logger.error(default_params) logger.warning( "You do not need to set httpResponseHeaders to True if " "you set httpResponseBody or browserHtml to True. 
Note " @@ -145,7 +151,10 @@ def _update_api_params_from_request( # NOQA "neither browserHtml nor screenshot are set to True." ) api_params.setdefault("httpResponseHeaders", True) - elif api_params.get("httpResponseHeaders") is False: + elif ( + api_params.get("httpResponseHeaders") is False + and not default_params.get("httpResponseHeaders") is False + ): logger.warning( "You do not need to set httpResponseHeaders to False if " "you do set httpResponseBody or browserHtml to True. Note " @@ -214,11 +223,12 @@ def _update_api_params_from_request( # NOQA for param, default_value in _DEFAULT_API_PARAMS.items(): if api_params.get(param) != default_value: continue - logging.warning( - f"Request {request} unnecessarily defines the Zyte API {param!r} " - f"parameter with its default value, {default_value!r}. It will " - f"not be sent to the server." - ) + if param not in default_params or default_params.get(param) == default_value: + logging.warning( + f"Request {request} unnecessarily defines the Zyte API {param!r} " + f"parameter with its default value, {default_value!r}. It will " + f"not be sent to the server." 
+ ) api_params.pop(param) return api_params @@ -292,16 +302,10 @@ def _get_automap_params( ) else: meta_params = copy(meta_params) + original_meta_params = copy(meta_params) params = copy(default_params) - _update_api_params_from_request( - params, - request, - unsupported_headers=unsupported_headers, - browser_headers=browser_headers, - ) - for k in list(meta_params): if meta_params[k] is not None: continue @@ -317,6 +321,15 @@ def _get_automap_params( ) params.update(meta_params) + _update_api_params_from_request( + params, + request, + default_params=default_params, + meta_params=original_meta_params, + unsupported_headers=unsupported_headers, + browser_headers=browser_headers, + ) + return params diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 7c41980f..52fe4cf8 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -543,8 +543,9 @@ async def test_default_params_none(mockserver, caplog): def test_default_params_merging( arg_key, meta_key, ignore_keys, setting, meta, expected, warnings, caplog ): - """Test how Zyte API parameters defined in the *arg_key* setting and those - defined in the *meta_key* request metadata key are combined. + """Test how Zyte API parameters defined in the *arg_key* _get_api_params + parameter and those defined in the *meta_key* request metadata key are + combined. Request metadata takes precedence. Also, ``None`` values in request metadata can be used to unset parameters defined in the setting. Request @@ -553,10 +554,6 @@ def test_default_params_merging( This test also makes sure that, when `None` is used to unset a parameter, the original request metadata key value is not modified. - - Note that :func:`test_get_api_params_input_custom` already tests how the - ``ZYTE_API_DEFAULT_PARAMS`` setting is mapped to the corresponding - :func:`~scrapy_zyte_api.handler._get_api_params` parameter. 
""" request = Request(url="https://example.com") request.meta[meta_key] = meta @@ -598,68 +595,43 @@ def test_default_params_merging( ), ], ) -def test_default_params_immutability(setting, meta): - """Make sure that the merging of Zyte API parameters from the - ``ZYTE_API_DEFAULT_PARAMS`` setting (*setting*) with those from the - ``zyte_api`` request metadata key (*meta*) does not affect the contents of - the setting for later requests.""" - request = Request(url="https://example.com") - request.meta["zyte_api"] = meta - default_params = copy(setting) - _get_api_params( - request, - **{ - **GET_API_PARAMS_KWARGS, - "default_params": default_params, - }, - ) - assert default_params == setting - - @pytest.mark.parametrize( - "setting,meta,expected", + "arg_key,meta_key", [ - (False, UNSET, False), - (False, False, False), - (False, True, True), - (True, UNSET, True), - (True, False, False), - (True, True, True), + ("default_params", "zyte_api"), + ( + "automap_params", + "zyte_api_automap", + ), ], ) -def test_automap_toggling(setting, meta, expected): - """Test how the value of the ``ZYTE_API_AUTOMAP`` setting (*setting*) in combination with the ``zyte_api_automap`` request metadata key (*meta*) - determines whether or not automated mapping is enabled for a request - (*expected*). - - Note that :func:`test_get_api_params_input_custom` already tests how the - ``ZYTE_API_AUTOMAP`` setting is mapped to the corresponding - :func:`~scrapy_zyte_api.handler._get_api_params` parameter. 
- """ +def test_default_params_immutability(arg_key, meta_key, setting, meta): + """Make sure that the merging of Zyte API parameters from the *arg_key* + _get_api_params parameter with those from the *meta_key* request metadata + key does not affect the contents of the setting for later requests.""" request = Request(url="https://example.com") - if meta is not UNSET: - request.meta["zyte_api_automap"] = meta - api_params = _get_api_params( + request.meta[meta_key] = meta + default_params = copy(setting) + _get_api_params( request, **{ **GET_API_PARAMS_KWARGS, - "use_api_by_default": True, - "automap_by_default": setting, + arg_key: default_params, }, ) - assert bool(api_params) == expected + assert default_params == setting def _test_automap(global_kwargs, request_kwargs, meta, expected, warnings, caplog): request = Request(url="https://example.com", **request_kwargs) - request.meta["zyte_api"] = meta + request.meta["zyte_api_automap"] = meta with caplog.at_level("WARNING"): api_params = _get_api_params( request, **{ **GET_API_PARAMS_KWARGS, **global_kwargs, - "automap_by_default": True, + "transparent_mode": True, }, ) assert api_params == expected @@ -1594,7 +1566,7 @@ def test_automap_default_parameter_cleanup(meta, expected, warnings, caplog): _test_automap({}, {}, meta, expected, warnings, caplog) -@pytest.mark.xfail(reason="To be implemented", strict=True) +# @pytest.mark.xfail(reason="To be implemented", strict=True) @pytest.mark.parametrize( "default_params,meta,expected,warnings", [ @@ -1615,15 +1587,16 @@ def test_automap_default_parameter_cleanup(meta, expected, warnings, caplog): def test_default_params_automap(default_params, meta, expected, warnings, caplog): """Warnings about unneeded parameters should not apply if those parameters are needed to extend or override parameters set in the - ``ZYTE_API_DEFAULT_PARAMS`` setting.""" + ``ZYTE_API_AUTOMAP_PARAMS`` setting.""" request = Request(url="https://example.com") - request.meta["zyte_api"] = meta + 
request.meta["zyte_api_automap"] = meta with caplog.at_level("WARNING"): api_params = _get_api_params( request, **{ **GET_API_PARAMS_KWARGS, - "automap_by_default": True, + "transparent_mode": True, + "automap_params": default_params, }, ) assert api_params == expected From ebe3b4f9ed141e2571b91f04fdbb72abf6b7cdd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 20 Sep 2022 20:55:38 +0200 Subject: [PATCH 52/81] Refactor _update_api_params_from_request_headers --- scrapy_zyte_api/handler.py | 154 ++++++++++++++++++++++--------------- 1 file changed, 93 insertions(+), 61 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index c816eb52..6de35c97 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -32,6 +32,89 @@ } +def _iter_headers( + *, + api_params: Dict[str, Any], + request: Request, + parameter: str, +): + headers = api_params.get(parameter) + if headers is not None: + logger.warning( + f"Request {request} defines the Zyte API {parameter} parameter, " + f"overriding Request.headers. Use Request.headers instead." + ) + return + if not request.headers: + return + for k, v in request.headers.items(): + if not v: + continue + decoded_v = b",".join(v).decode() + lowercase_k = k.strip().lower() + yield k, lowercase_k, decoded_v + + +def _map_custom_http_request_headers( + *, + api_params: Dict[str, Any], + request: Request, + unsupported_headers: Set[str], +): + headers = [] + for k, lowercase_k, decoded_v in _iter_headers( + api_params=api_params, + request=request, + parameter="customHttpRequestHeaders", + ): + if lowercase_k in unsupported_headers: + if lowercase_k != b"user-agent" or decoded_v != DEFAULT_USER_AGENT: + logger.warning( + f"Request {request} defines header {k}, which " + f"cannot be mapped into the Zyte API " + f"customHttpRequestHeaders parameter." 
+ ) + continue + headers.append({"name": k.decode(), "value": decoded_v}) + if headers: + api_params["customHttpRequestHeaders"] = headers + + +def _map_request_headers( + *, + api_params: Dict[str, Any], + request: Request, + browser_headers: Dict[str, str], +): + request_headers = {} + for k, lowercase_k, decoded_v in _iter_headers( + api_params=api_params, + request=request, + parameter="requestHeaders", + ): + key = browser_headers.get(lowercase_k) + if key is not None: + request_headers[key] = decoded_v + elif not ( + ( + lowercase_k == b"accept" + and decoded_v == DEFAULT_REQUEST_HEADERS["Accept"] + ) + or ( + lowercase_k == b"accept-language" + and decoded_v == DEFAULT_REQUEST_HEADERS["Accept-Language"] + ) + or (lowercase_k == b"user-agent" and decoded_v == DEFAULT_USER_AGENT) + ): + logger.warning( + f"Request {request} defines header {k}, which " + f"cannot be mapped into the Zyte API requestHeaders " + f"parameter." + ) + if request_headers: + api_params["requestHeaders"] = request_headers + + def _update_api_params_from_request_headers( api_params: Dict[str, Any], request: Request, @@ -42,70 +125,19 @@ def _update_api_params_from_request_headers( """Updates *api_params*, in place, based on *request*.""" response_body = api_params.get("httpResponseBody") if response_body: - headers = api_params.get("customHttpRequestHeaders") - if headers is not None: - logger.warning( - f"Request {request} defines the Zyte API " - f"customHttpRequestHeaders parameter, overriding " - f"Request.headers. Use Request.headers instead." - ) - elif request.headers: - headers = [] - for k, v in request.headers.items(): - if not v: - continue - v = b",".join(v).decode() - lowercase_k = k.strip().lower() - if lowercase_k in unsupported_headers: - if lowercase_k != b"user-agent" or v != DEFAULT_USER_AGENT: - logger.warning( - f"Request {request} defines header {k}, which " - f"cannot be mapped into the Zyte API " - f"customHttpRequestHeaders parameter." 
- ) - continue - k = k.decode() - headers.append({"name": k, "value": v}) - if headers: - api_params["customHttpRequestHeaders"] = headers + _map_custom_http_request_headers( + api_params=api_params, + request=request, + unsupported_headers=unsupported_headers, + ) if not response_body or any( api_params.get(k) for k in ("browserHtml", "screenshot") ): - headers = api_params.get("requestHeaders") - if headers is not None: - logger.warning( - f"Request {request} defines the Zyte API requestHeaders " - f"parameter, overriding Request.headers. Use Request.headers " - f"instead." - ) - elif request.headers: - request_headers = {} - for k, v in request.headers.items(): - if not v: - continue - v = b",".join(v).decode() - lowercase_k = k.strip().lower() - key = browser_headers.get(lowercase_k) - if key is not None: - request_headers[key] = v - elif not ( - ( - lowercase_k == b"accept" - and v == DEFAULT_REQUEST_HEADERS["Accept"] - ) - or ( - lowercase_k == b"accept-language" - and v == DEFAULT_REQUEST_HEADERS["Accept-Language"] - ) - or (lowercase_k == b"user-agent" and v == DEFAULT_USER_AGENT) - ): - logger.warning( - f"Request {request} defines header {k}, which " - f"cannot be mapped into the Zyte API requestHeaders " - f"parameter." 
- ) - if request_headers: - api_params["requestHeaders"] = request_headers + _map_request_headers( + api_params=api_params, + request=request, + browser_headers=browser_headers, + ) def _update_api_params_from_request( # NOQA From e0ed243120db15b25b63e9581ec0d79fb5128084 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 20 Sep 2022 21:08:41 +0200 Subject: [PATCH 53/81] Refactor _update_api_params_from_request --- scrapy_zyte_api/handler.py | 79 +++++++++++++++++++++++++++++--------- 1 file changed, 61 insertions(+), 18 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 6de35c97..818be878 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -115,10 +115,10 @@ def _map_request_headers( api_params["requestHeaders"] = request_headers -def _update_api_params_from_request_headers( +def _set_request_headers_from_request( + *, api_params: Dict[str, Any], request: Request, - *, unsupported_headers: Set[str], browser_headers: Dict[str, str], ): @@ -140,14 +140,10 @@ def _update_api_params_from_request_headers( ) -def _update_api_params_from_request( # NOQA +def _set_http_response_body_from_request( + *, api_params: Dict[str, Any], request: Request, - *, - default_params: Dict[str, Any], - meta_params: Dict[str, Any], - unsupported_headers: Set[str], - browser_headers: Dict[str, str], ): if not any( api_params.get(k) for k in ("httpResponseBody", "browserHtml", "screenshot") @@ -166,10 +162,17 @@ def _update_api_params_from_request( # NOQA f"'httpResponseBody' parameter with its default value, False. " f"It will not be sent to the server." 
) - response_body = api_params.get("httpResponseBody") - if response_body is False: + if api_params.get("httpResponseBody") is False: api_params.pop("httpResponseBody") + +def _set_http_response_headers_from_request( + *, + api_params: Dict[str, Any], + default_params: Dict[str, Any], + meta_params: Dict[str, Any], + request: Request, +): if any(api_params.get(k) for k in ("httpResponseBody", "browserHtml")): if api_params.get("httpResponseHeaders") is True and not ( default_params.get("httpResponseHeaders") is True @@ -196,6 +199,12 @@ def _update_api_params_from_request( # NOQA if api_params.get("httpResponseHeaders") is False: api_params.pop("httpResponseHeaders") + +def _set_http_request_method_from_request( + *, + api_params: Dict[str, Any], + request: Request, +): method = api_params.get("httpRequestMethod") if method: logger.warning( @@ -210,7 +219,7 @@ def _update_api_params_from_request( # NOQA f"({method})." ) elif request.method != "GET": - if response_body: + if api_params.get("httpResponseBody"): api_params["httpRequestMethod"] = request.method else: logger.warning( @@ -220,13 +229,12 @@ def _update_api_params_from_request( # NOQA f"parameter is True." ) - _update_api_params_from_request_headers( - api_params, - request, - unsupported_headers=unsupported_headers, - browser_headers=browser_headers, - ) +def _set_http_request_body_from_request( + *, + api_params: Dict[str, Any], + request: Request, +): body = api_params.get("httpRequestBody") if body: logger.warning( @@ -241,7 +249,7 @@ def _update_api_params_from_request( # NOQA f"({body!r}; decoded: {decoded_body!r})." ) elif request.body != b"": - if response_body: + if api_params.get("httpResponseBody"): base64_body = b64encode(request.body).decode() api_params["httpRequestBody"] = base64_body else: @@ -252,6 +260,13 @@ def _update_api_params_from_request( # NOQA f"parameter is True." 
) + +def _unset_unneeded_api_params( + *, + api_params: Dict[str, Any], + default_params: Dict[str, Any], + request: Request, +): for param, default_value in _DEFAULT_API_PARAMS.items(): if api_params.get(param) != default_value: continue @@ -263,6 +278,34 @@ def _update_api_params_from_request( # NOQA ) api_params.pop(param) + +def _update_api_params_from_request( + api_params: Dict[str, Any], + request: Request, + *, + default_params: Dict[str, Any], + meta_params: Dict[str, Any], + unsupported_headers: Set[str], + browser_headers: Dict[str, str], +): + _set_http_response_body_from_request(api_params=api_params, request=request) + _set_http_response_headers_from_request( + api_params=api_params, + request=request, + default_params=default_params, + meta_params=meta_params, + ) + _set_http_request_method_from_request(api_params=api_params, request=request) + _set_request_headers_from_request( + api_params=api_params, + request=request, + unsupported_headers=unsupported_headers, + browser_headers=browser_headers, + ) + _set_http_request_body_from_request(api_params=api_params, request=request) + _unset_unneeded_api_params( + api_params=api_params, request=request, default_params=default_params + ) return api_params From 2afc1043b8393818e9cdf85f1aa8a3cd71f01df1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 20 Sep 2022 21:20:58 +0200 Subject: [PATCH 54/81] Refactor _get_raw_params and _get_automap_params --- scrapy_zyte_api/handler.py | 112 +++++++++++++++++++++---------------- 1 file changed, 64 insertions(+), 48 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 818be878..79b30c51 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -309,33 +309,31 @@ def _update_api_params_from_request( return api_params -def _get_raw_params( - request: Request, +def _copy_meta_params_as_dict( + meta_params: Dict[str, Any], *, - default_params: Dict[str, Any], + param: str, + request: Request, 
): - meta_params = request.meta.get("zyte_api", False) - if meta_params is False: - return None - - if not meta_params and meta_params != {}: - warn( - f"Setting the zyte_api request metadata key to " - f"{meta_params!r} is deprecated. Use False instead.", - DeprecationWarning, - ) - return None - if meta_params is True: - meta_params = {} + return {} elif not isinstance(meta_params, Mapping): raise ValueError( - f"'zyte_api' parameters in the request meta should be provided as " + f"'{param}' parameters in the request meta should be provided as " f"a dictionary, got {type(meta_params)} instead in {request}." ) else: - meta_params = copy(meta_params) + return copy(meta_params) + +def _merge_params( + *, + default_params: Dict[str, Any], + meta_params: Dict[str, Any], + param: str, + setting: str, + request: Request, +): params = copy(default_params) for k in list(meta_params): if meta_params[k] is not None: @@ -345,16 +343,47 @@ def _get_raw_params( params.pop(k) else: logger.warning( - f"In request {request} 'zyte_api' parameter {k} is None, " + f"In request {request} {param!r} parameter {k} is None, " f"which is a value reserved to unset parameters defined in " - f"the ZYTE_API_DEFAULT_PARAMS setting, but the setting does " - f"not define such a parameter." + f"the {setting} setting, but the setting does not define such " + f"a parameter." ) params.update(meta_params) - return params +def _get_raw_params( + request: Request, + *, + default_params: Dict[str, Any], +): + meta_params = request.meta.get("zyte_api", False) + if meta_params is False: + return None + + if not meta_params and meta_params != {}: + warn( + f"Setting the zyte_api request metadata key to " + f"{meta_params!r} is deprecated. 
Use False instead.", + DeprecationWarning, + ) + return None + + meta_params = _copy_meta_params_as_dict( + meta_params, + param="zyte_api", + request=request, + ) + + return _merge_params( + default_params=default_params, + meta_params=meta_params, + param="zyte_api", + setting="ZYTE_API_DEFAULT_PARAMS", + request=request, + ) + + def _get_automap_params( request: Request, *, @@ -367,34 +396,21 @@ def _get_automap_params( if meta_params is False: return None - if meta_params is True: - meta_params = {} - elif not isinstance(meta_params, Mapping): - raise ValueError( - f"'zyte_api_automap' parameters in the request meta should be " - f"provided as a dictionary, got {type(meta_params)} instead in " - f"{request}." - ) - else: - meta_params = copy(meta_params) - original_meta_params = copy(meta_params) + meta_params = _copy_meta_params_as_dict( + meta_params, + param="zyte_api_automap", + request=request, + ) - params = copy(default_params) + original_meta_params = copy(meta_params) - for k in list(meta_params): - if meta_params[k] is not None: - continue - meta_params.pop(k) - if k in params: - params.pop(k) - else: - logger.warning( - f"In request {request} 'zyte_api_automap' parameter {k} is " - f"None, which is a value reserved to unset parameters defined " - f"in the ZYTE_API_AUTOMAP_PARAMS setting, but the setting " - f"does not define such a parameter." 
- ) - params.update(meta_params) + params = _merge_params( + default_params=default_params, + meta_params=meta_params, + param="zyte_api_automap", + setting="ZYTE_API_AUTOMAP_PARAMS", + request=request, + ) _update_api_params_from_request( params, From 414ccc227e74132f330810800f897b875dca6ec4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 20 Sep 2022 21:23:54 +0200 Subject: [PATCH 55/81] _merge_params: do not modify meta_params --- scrapy_zyte_api/handler.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 79b30c51..49174deb 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -335,6 +335,7 @@ def _merge_params( request: Request, ): params = copy(default_params) + meta_params = copy(meta_params) for k in list(meta_params): if meta_params[k] is not None: continue @@ -402,8 +403,6 @@ def _get_automap_params( request=request, ) - original_meta_params = copy(meta_params) - params = _merge_params( default_params=default_params, meta_params=meta_params, @@ -416,7 +415,7 @@ def _get_automap_params( params, request, default_params=default_params, - meta_params=original_meta_params, + meta_params=meta_params, unsupported_headers=unsupported_headers, browser_headers=browser_headers, ) From 41d6430c5e262a9842653dbd60d72598d01abe64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 20 Sep 2022 21:32:24 +0200 Subject: [PATCH 56/81] Refactor downloader __init__ --- scrapy_zyte_api/handler.py | 56 +++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 49174deb..a78c696a 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -445,13 +445,13 @@ def _get_api_params( unsupported_headers=unsupported_headers, browser_headers=browser_headers, ) + if api_params is None: + return None elif 
request.meta.get("zyte_api_automap", False) is not False: raise ValueError( f"Request {request} combines manually-defined parameters and " f"automatically-mapped parameters." ) - if api_params is None: - return None if job_id is not None: api_params["jobId"] = job_id @@ -472,6 +472,31 @@ def _load_default_params(settings, setting): return params +def _load_unsupported_headers(settings): + return { + header.strip().lower().encode() + for header in settings.getlist( + "ZYTE_API_UNSUPPORTED_HEADERS", + ["Cookie", "User-Agent"], + ) + } + + +def _load_browser_headers(settings): + browser_headers = settings.getdict( + "ZYTE_API_BROWSER_HEADERS", + {"Referer": "referer"}, + ) + return {k.strip().lower().encode(): v for k, v in browser_headers.items()} + + +def _load_retry_policy(settings): + policy = settings.get("ZYTE_API_RETRY_POLICY") + if policy: + policy = load_object(policy) + return policy + + class ScrapyZyteAPIDownloadHandler(HTTPDownloadHandler): def __init__( self, settings: Settings, crawler: Crawler, client: AsyncClient = None @@ -504,28 +529,15 @@ def __init__( "twisted.internet.asyncioreactor.AsyncioSelectorReactor" ) self._stats = crawler.stats - self._job_id = crawler.settings.get("JOB") - self._default_params = _load_default_params(settings, "ZYTE_API_DEFAULT_PARAMS") - self._automap_params = _load_default_params(settings, "ZYTE_API_AUTOMAP_PARAMS") self._session = create_session(connection_pool_size=self._client.n_conn) - self._retry_policy = settings.get("ZYTE_API_RETRY_POLICY") - if self._retry_policy: - self._retry_policy = load_object(self._retry_policy) + + self._automap_params = _load_default_params(settings, "ZYTE_API_AUTOMAP_PARAMS") + self._browser_headers = _load_browser_headers(settings) + self._default_params = _load_default_params(settings, "ZYTE_API_DEFAULT_PARAMS") + self._job_id = crawler.settings.get("JOB") + self._retry_policy = _load_retry_policy(settings) self._transparent_mode = settings.getbool("ZYTE_API_TRANSPARENT_MODE", False) 
- self._unsupported_headers = { - header.strip().lower().encode() - for header in settings.getlist( - "ZYTE_API_UNSUPPORTED_HEADERS", - ["Cookie", "User-Agent"], - ) - } - browser_headers = settings.getdict( - "ZYTE_API_BROWSER_HEADERS", - {"Referer": "referer"}, - ) - self._browser_headers = { - k.strip().lower().encode(): v for k, v in browser_headers.items() - } + self._unsupported_headers = _load_unsupported_headers(settings) def download_request(self, request: Request, spider: Spider) -> Deferred: api_params = _get_api_params( From d7a02c70649fa099f01fbef6cd72750b24a4e739 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 20 Sep 2022 21:39:12 +0200 Subject: [PATCH 57/81] When automapping, always set httpRequestMethod for non-GET values --- scrapy_zyte_api/handler.py | 10 +--------- tests/test_api_requests.py | 9 +++++---- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index a78c696a..bf4aca65 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -219,15 +219,7 @@ def _set_http_request_method_from_request( f"({method})." ) elif request.method != "GET": - if api_params.get("httpResponseBody"): - api_params["httpRequestMethod"] = request.method - else: - logger.warning( - f"The HTTP method of request {request} ({request.method}) " - f"is being ignored. The httpRequestMethod parameter of " - f"Zyte API can only be set when the httpResponseBody " - f"parameter is True." 
- ) + api_params["httpRequestMethod"] = request.method def _set_http_request_body_from_request( diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 52fe4cf8..a05c2450 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -922,24 +922,25 @@ def test_automap_header_output(meta, expected, warnings, caplog): ) ), # If httpResponseBody is not True, implicitly or explicitly, - # Request.method is not mapped, and a warning is issued if its value - # is anything other than GET. + # Request.method is still mapped for anything other than GET. ( "POST", {"browserHtml": True}, { "browserHtml": True, + "httpRequestMethod": "POST", "httpResponseHeaders": True, }, - ["can only be set when the httpResponseBody parameter"], + [], ), ( "POST", {"screenshot": True}, { "screenshot": True, + "httpRequestMethod": "POST", }, - ["can only be set when the httpResponseBody parameter"], + [], ), ], ) From 1b0a29205d2228f50a078fa7317f7a6167cd84e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 20 Sep 2022 21:41:35 +0200 Subject: [PATCH 58/81] When automapping, always set httpRequestBody for non-empty values --- scrapy_zyte_api/handler.py | 12 ++---------- tests/test_api_requests.py | 8 +++++--- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index bf4aca65..9dcef371 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -241,16 +241,8 @@ def _set_http_request_body_from_request( f"({body!r}; decoded: {decoded_body!r})." ) elif request.body != b"": - if api_params.get("httpResponseBody"): - base64_body = b64encode(request.body).decode() - api_params["httpRequestBody"] = base64_body - else: - logger.warning( - f"The body of request {request} ({request.body!r}) " - f"is being ignored. The httpRequestBody parameter of " - f"Zyte API can only be set when the httpResponseBody " - f"parameter is True." 
- ) + base64_body = b64encode(request.body).decode() + api_params["httpRequestBody"] = base64_body def _unset_unneeded_api_params( diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index a05c2450..df6ef9fe 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -1490,23 +1490,25 @@ def test_automap_header_settings( }, ["Use Request.body instead"], ), - # A body should not be used unless httpResponseBody is also used. + # The body is mapped even if httpResponseBody is not used. ( "a", {"browserHtml": True}, { "browserHtml": True, + "httpRequestBody": "YQ==", "httpResponseHeaders": True, }, - ["can only be set when the httpResponseBody parameter"], + [], ), ( "a", {"screenshot": True}, { + "httpRequestBody": "YQ==", "screenshot": True, }, - ["can only be set when the httpResponseBody parameter"], + [], ), ], ) From 6889a254561bb3c2ddd6073c6471fa8174457e7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 20 Sep 2022 22:07:04 +0200 Subject: [PATCH 59/81] Allow disabling or forcing header mapping --- scrapy_zyte_api/handler.py | 25 +++++++++-- tests/test_api_requests.py | 89 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+), 4 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 9dcef371..7a6223ab 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -39,7 +39,7 @@ def _iter_headers( parameter: str, ): headers = api_params.get(parameter) - if headers is not None: + if headers not in (None, True): logger.warning( f"Request {request} defines the Zyte API {parameter} parameter, " f"overriding Request.headers. Use Request.headers instead." 
@@ -123,21 +123,38 @@ def _set_request_headers_from_request( browser_headers: Dict[str, str], ): """Updates *api_params*, in place, based on *request*.""" + custom_http_request_headers = api_params.get("customHttpRequestHeaders") + request_headers = api_params.get("requestHeaders") response_body = api_params.get("httpResponseBody") - if response_body: + + if ( + response_body + and custom_http_request_headers is not False + or custom_http_request_headers is True + ): _map_custom_http_request_headers( api_params=api_params, request=request, unsupported_headers=unsupported_headers, ) - if not response_body or any( - api_params.get(k) for k in ("browserHtml", "screenshot") + elif custom_http_request_headers is False: + api_params.pop("customHttpRequestHeaders") + + if ( + ( + not response_body + or any(api_params.get(k) for k in ("browserHtml", "screenshot")) + ) + and request_headers is not False + or request_headers is True ): _map_request_headers( api_params=api_params, request=request, browser_headers=browser_headers, ) + elif request_headers is False: + api_params.pop("requestHeaders") def _set_http_response_body_from_request( diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index df6ef9fe..becdbc65 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -1075,6 +1075,95 @@ def test_automap_method(method, meta, expected, warnings, caplog): }, [], ), + # False disables header mapping. 
+ ( + {"Referer": "a"}, + {"customHttpRequestHeaders": False}, + { + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + {"Referer": "a"}, + {"browserHtml": True, "requestHeaders": False}, + { + "browserHtml": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + {"Referer": "a"}, + { + "browserHtml": True, + "httpResponseBody": True, + "customHttpRequestHeaders": False, + }, + { + "browserHtml": True, + "httpResponseBody": True, + "httpResponseHeaders": True, + "requestHeaders": {"referer": "a"}, + }, + [], + ), + ( + {"Referer": "a"}, + {"browserHtml": True, "httpResponseBody": True, "requestHeaders": False}, + { + "browserHtml": True, + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + ( + {"Referer": "a"}, + { + "browserHtml": True, + "httpResponseBody": True, + "customHttpRequestHeaders": False, + "requestHeaders": False, + }, + { + "browserHtml": True, + "httpResponseBody": True, + "httpResponseHeaders": True, + }, + [], + ), + # True forces header mapping. + ( + {"Referer": "a"}, + {"requestHeaders": True}, + { + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + "httpResponseBody": True, + "httpResponseHeaders": True, + "requestHeaders": {"referer": "a"}, + }, + [], + ), + ( + {"Referer": "a"}, + {"browserHtml": True, "customHttpRequestHeaders": True}, + { + "browserHtml": True, + "customHttpRequestHeaders": [ + {"name": "Referer", "value": "a"}, + ], + "httpResponseHeaders": True, + "requestHeaders": {"referer": "a"}, + }, + [], + ), # Headers with None as value are not mapped. 
( {"Referer": None}, From db6c3aee7f292db550d89df4b758d870545b5d20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 20 Sep 2022 22:14:43 +0200 Subject: [PATCH 60/81] Test that browserHtml takes precedence over httpResponseBody for Response.body --- tests/test_responses.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_responses.py b/tests/test_responses.py index a0acb946..a3cda820 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -13,6 +13,7 @@ from scrapy_zyte_api.utils import _RESPONSE_HAS_IP_ADDRESS, _RESPONSE_HAS_PROTOCOL PAGE_CONTENT = "The cake is a lie!" +PAGE_CONTENT_2 = "Ceci n’est pas une pipe" URL = "https://example.com" @@ -43,6 +44,20 @@ def raw_api_response_body(): } +def raw_api_response_mixed(): + return { + "url": URL, + "browserHtml": PAGE_CONTENT, + "httpResponseBody": b64encode(PAGE_CONTENT_2.encode("utf-8")), + "echoData": {"some_value": "here"}, + "httpResponseHeaders": [ + {"name": "Content-Type", "value": "text/html"}, + {"name": "Content-Length", "value": len(PAGE_CONTENT)}, + ], + "statusCode": 200, + } + + EXPECTED_HEADERS = {b"Content-Type": [b"text/html"], b"Content-Length": [b"44"]} EXPECTED_BODY = PAGE_CONTENT.encode("utf-8") @@ -76,6 +91,7 @@ def test_init(api_response, cls): [ (raw_api_response_browser, ZyteAPITextResponse), (raw_api_response_body, ZyteAPIResponse), + (raw_api_response_mixed, ZyteAPITextResponse), ], ) def test_text_from_api_response(api_response, cls): From 550cff3eeaaf66a411f5fe4ff084dffb1a7bd429 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 22 Sep 2022 11:56:59 +0200 Subject: [PATCH 61/81] =?UTF-8?q?cannot=20known=20=E2=86=92=20cannot=20kno?= =?UTF-8?q?w?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Kevin Lloyd Bernal --- tests/test_api_requests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_api_requests.py 
b/tests/test_api_requests.py index becdbc65..3833257a 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -1037,7 +1037,7 @@ def test_automap_method(method, meta, expected, warnings, caplog): # Request.headers are mapped as customHttpRequestHeaders only. # # While future main output parameters are likely to use requestHeaders - # instead, we cannot known if an unknown parameter is a main output + # instead, we cannot know if an unknown parameter is a main output # parameter or a different type of parameter for httpRequestBody, and # what we know for sure is that, at the time of writing, Zyte API does # not allow requestHeaders to be combined with httpRequestBody. From d454c9b2416e322f4250ab48c53e482bd9d3dbc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 27 Sep 2022 16:32:03 +0200 Subject: [PATCH 62/81] Update tests/test_api_requests.py Co-authored-by: Kevin Lloyd Bernal --- tests/test_api_requests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 3833257a..603a831f 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -427,7 +427,7 @@ def test_transparent_mode_toggling(setting, meta, expected): assert func() == expected -@pytest.mark.parametrize("meta", [None, 0, "", b"", []]) +@pytest.mark.parametrize("meta", [None, 0, "", b"", [], ()]) def test_api_disabling_deprecated(meta): """Test how undocumented falsy values of the ``zyte_api`` request metadata key (*meta*) can be used to disable the use of Zyte API, but trigger a From 1c46849082df44c5f66c1a305c579630f4f11d0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 13 Oct 2022 11:51:12 +0200 Subject: [PATCH 63/81] README: sort usage approaches by relevance --- README.rst | 80 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 34 deletions(-) diff --git a/README.rst b/README.rst index 
1eb65652..74b229b8 100644 --- a/README.rst +++ b/README.rst @@ -73,15 +73,19 @@ The ``ZYTE_API_ENABLED`` setting, which is ``True`` by default, can be set to Usage ===== -You can send a request through Zyte API in one of the following ways: +You can send requests through Zyte API in one of the following ways: -- Setting all Zyte API parameters manually, keeping full control of what is - sent to Zyte API. See **Sending requests with manually-defined parameters** - below. +- Send all request through Zyte API by default, letting Zyte API parameters + be chosen automatically based on your Scrapy request parameters. See + **Using transparent mode** below. -- Letting Zyte API parameters be chosen automatically based on your Scrapy - request parameters where possible. See **Sending requests with - automatically-mapped parameters** below. +- Send specific requests through Zyte API, setting all Zyte API parameters + manually, keeping full control of what is sent to Zyte API. See **Sending + requests with manually-defined parameters** below. + +- Send specific requests through Zyte API, letting Zyte API parameters be + chosen automatically based on your Scrapy request parameters. See **Sending + requests with automatically-mapped parameters** below. The raw Zyte API response can be accessed via the ``raw_api_response`` attribute of the response object. @@ -97,6 +101,28 @@ If multiple requests target the same URL with different Zyte API parameters, pass ``dont_filter=True`` to ``Request``. +Using transparent mode +---------------------- + +Set the ``ZYTE_API_TRANSPARENT_MODE`` setting to ``True`` to handle Scrapy +requests as follows: + +- By default, requests are sent through Zyte API with automatically-mapped + parameters. See **Sending requests with automatically-mapped parameters** + below for details about automatic parameter mapping. 
+ + You do not need to set the ``zyte-api-automap`` request meta key to + ``True``, but you can set it to a dictionary to extend your Zyte API + request parameters. + +- Requests with the ``zyte_api`` request meta key set to a ``dict`` are sent + through Zyte API with manually-defined parameters. See **Sending requests + with manually-defined parameters** below. + +- Requests with the ``zyte_api_automap`` request meta key set to ``False`` + are *not* sent through Zyte API. + + Sending requests with manually-defined parameters ------------------------------------------------- @@ -148,7 +174,14 @@ To send a Scrapy request through Zyte API letting Zyte API parameters be automatically chosen based on the parameters of that Scrapy request, set the ``zyte_api_automap`` key in `Request.meta `_ -to ``True``. See also **Using transparent mode** below. +to ``True``. + +See also **Using transparent mode** above and **Automated parameter mapping** +below. + + +Automated parameter mapping +--------------------------- Automated parameter mapping chooses Zyte API parameters as follows by default: @@ -164,10 +197,11 @@ Automated parameter mapping chooses Zyte API parameters as follows by default: - ``Request.body`` becomes ``httpRequestBody``. -Instead of setting ``zyte_api_automap`` to ``True``, you may set it to a -``dict`` of Zyte API parameters to extend or override choices made by automated -parameter mapping. Some parameters modify the result of automated parameter -mapping as a side effect: +You may set the ``zyte_api_automap`` key in +`Request.meta `_ +to a ``dict`` of Zyte API parameters to extend or override choices made by +automated parameter mapping. 
Some parameters modify the result of automated +parameter mapping as a side effect: - Setting ``browserHtml`` or ``screenshot`` to ``True`` unsets ``httpResponseBody``, and makes ``Request.headers`` become @@ -226,28 +260,6 @@ Zyte API does not currently support, and may never support: ``browserHtml`` is used as the Scrapy response body. -Using transparent mode ----------------------- - -Set the ``ZYTE_API_TRANSPARENT_MODE`` setting to ``True`` to handle Scrapy -requests as follows: - -- Requests with the ``zyte_api_automap`` request meta key set to ``False`` - are *not* sent through Zyte API. - -- Requests with the ``zyte_api`` request meta key set to a ``dict`` are sent - through Zyte API with manually-defined parameters. See **Sending requests - with manually-defined parameters** above. - -- All other requests are sent through Zyte API with automatically-mapped - parameters. See **Sending requests with automatically-mapped parameters** - above. - - You do not need to set the ``zyte-api-automap`` request meta key to - ``True``, but you can set it to a dictionary to extend your request - parameters. - - Setting default parameters ========================== From 987ef16998d26f14a3358be90d104eac9339fb7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 14 Oct 2022 10:17:18 +0200 Subject: [PATCH 64/81] README: include more code examples --- README.rst | 78 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 69 insertions(+), 9 deletions(-) diff --git a/README.rst b/README.rst index 74b229b8..6c1a2827 100644 --- a/README.rst +++ b/README.rst @@ -87,8 +87,28 @@ You can send requests through Zyte API in one of the following ways: chosen automatically based on your Scrapy request parameters. See **Sending requests with automatically-mapped parameters** below. -The raw Zyte API response can be accessed via the ``raw_api_response`` -attribute of the response object. 
+Zyte API response parameters are mapped into Scrapy response parameters where +possible. The raw Zyte API response can be accessed via the +``raw_api_response`` attribute of the response object: + +.. code-block:: python + + def parse(self, response): + print(response.url) + # "https://quotes.toscrape.com/" + print(response.status) + # 200 + print(response.headers) + # {b"Content-Type": [b"text/html"], …} + print(response.text) + # "…" + print(response.raw_api_response) + # { + # "url": "https://quotes.toscrape.com/", + # "statusCode": 200, + # "httpResponseBody": "PGh0bWw+4oCmPC9odG1sPg==", + # "httpResponseHeaders": […], + # } When you use the Zyte API parameters ``browserHtml``, ``httpResponseBody``, or ``httpResponseHeaders``, the response body and headers are set accordingly. @@ -122,6 +142,25 @@ requests as follows: - Requests with the ``zyte_api_automap`` request meta key set to ``False`` are *not* sent through Zyte API. +For example: + +.. code-block:: python + + import scrapy + + + class SampleQuotesSpider(scrapy.Spider): + name = "sample_quotes" + start_urls = ["http://quotes.toscrape.com/"] + + custom_settings = { + "ZYTE_API_TRANSPARENT_MODE": True, + } + + def parse(self, response): + print(response.text) + # "…" + Sending requests with manually-defined parameters ------------------------------------------------- @@ -149,18 +188,15 @@ For example: url="http://quotes.toscrape.com/", meta={ "zyte_api": { - "browserHtml": True, + "httpResponseBody": True, + "httpResponseHeaders": True, } }, ) def parse(self, response): - print(response.raw_api_response) - # { - # 'url': 'https://quotes.toscrape.com/', - # 'statusCode': 200, - # 'browserHtml': '…', - # } + print(response.text) + # "…" See the `Zyte API documentation`_ to learn about Zyte API parameters. @@ -176,6 +212,30 @@ automatically chosen based on the parameters of that Scrapy request, set the `Request.meta `_ to ``True``. +For example: + +.. 
code-block:: python + + import scrapy + + + class SampleQuotesSpider(scrapy.Spider): + name = "sample_quotes" + + def start_requests(self): + yield scrapy.Request( + url="http://quotes.toscrape.com/", + meta={ + "zyte_api": { + "zyte_api_automap": True, + } + }, + ) + + def parse(self, response): + print(response.text) + # "…" + See also **Using transparent mode** above and **Automated parameter mapping** below. From a4fa769dde5a50ac387e63f2da711bff4c631827 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 14 Oct 2022 10:20:48 +0200 Subject: [PATCH 65/81] README: provide more specific links to the Zyte API documentation --- README.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 6c1a2827..0ec1bb43 100644 --- a/README.rst +++ b/README.rst @@ -198,8 +198,11 @@ For example: print(response.text) # "…" -See the `Zyte API documentation`_ to learn about Zyte API parameters. +To learn about Zyte API parameters, see the `data extraction usage`_ and +`API reference`_ pages of the `Zyte API documentation`_. +.. _API reference: https://docs.zyte.com/zyte-api/openapi.html +.. _data extraction usage: https://docs.zyte.com/zyte-api/usage/extract.html .. 
_Zyte API documentation: https://docs.zyte.com/zyte-api/get-started.html From 6098b697c8e892e288f22d8b19823db831e560f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 14 Oct 2022 10:29:44 +0200 Subject: [PATCH 66/81] =?UTF-8?q?unsupported=20headers=20=E2=86=92=20skip?= =?UTF-8?q?=20headers?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.rst | 11 +++++------ scrapy_zyte_api/handler.py | 28 ++++++++++++++-------------- tests/test_api_requests.py | 12 ++++++------ 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/README.rst b/README.rst index 0ec1bb43..fb05a539 100644 --- a/README.rst +++ b/README.rst @@ -273,13 +273,12 @@ parameter mapping as a side effect: - Setting ``screenshot`` to ``True`` without also setting ``browserHtml`` to ``True`` unsets ``httpResponseHeaders``. -When mapping headers, unsupported headers are excluded from the mapping. If -Zyte API header support changes in the future, and you cannot upgrade to a -version of scrapy-zyte-api that includes such changes, use the following -settings to change which headers are mapped and how they are mapped: +When mapping headers, headers not supported by Zyte API are excluded from the +mapping by default. Use the following settings to change which headers are +included or excluded from header mapping: -- ``ZYTE_API_UNSUPPORTED_HEADERS`` determines headers that *cannot* be mapped - as ``customHttpRequestHeaders``, and its default value is: +- ``ZYTE_API_SKIP_HEADERS`` determines headers that must *not* be mapped as + ``customHttpRequestHeaders``, and its default value is: .. 
code-block:: python diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 7a6223ab..aef29318 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -59,7 +59,7 @@ def _map_custom_http_request_headers( *, api_params: Dict[str, Any], request: Request, - unsupported_headers: Set[str], + skip_headers: Set[str], ): headers = [] for k, lowercase_k, decoded_v in _iter_headers( @@ -67,7 +67,7 @@ def _map_custom_http_request_headers( request=request, parameter="customHttpRequestHeaders", ): - if lowercase_k in unsupported_headers: + if lowercase_k in skip_headers: if lowercase_k != b"user-agent" or decoded_v != DEFAULT_USER_AGENT: logger.warning( f"Request {request} defines header {k}, which " @@ -119,7 +119,7 @@ def _set_request_headers_from_request( *, api_params: Dict[str, Any], request: Request, - unsupported_headers: Set[str], + skip_headers: Set[str], browser_headers: Dict[str, str], ): """Updates *api_params*, in place, based on *request*.""" @@ -135,7 +135,7 @@ def _set_request_headers_from_request( _map_custom_http_request_headers( api_params=api_params, request=request, - unsupported_headers=unsupported_headers, + skip_headers=skip_headers, ) elif custom_http_request_headers is False: api_params.pop("customHttpRequestHeaders") @@ -286,7 +286,7 @@ def _update_api_params_from_request( *, default_params: Dict[str, Any], meta_params: Dict[str, Any], - unsupported_headers: Set[str], + skip_headers: Set[str], browser_headers: Dict[str, str], ): _set_http_response_body_from_request(api_params=api_params, request=request) @@ -300,7 +300,7 @@ def _update_api_params_from_request( _set_request_headers_from_request( api_params=api_params, request=request, - unsupported_headers=unsupported_headers, + skip_headers=skip_headers, browser_headers=browser_headers, ) _set_http_request_body_from_request(api_params=api_params, request=request) @@ -391,7 +391,7 @@ def _get_automap_params( *, default_enabled: bool, default_params: Dict[str, 
Any], - unsupported_headers: Set[str], + skip_headers: Set[str], browser_headers: Dict[str, str], ): meta_params = request.meta.get("zyte_api_automap", default_enabled) @@ -417,7 +417,7 @@ def _get_automap_params( request, default_params=default_params, meta_params=meta_params, - unsupported_headers=unsupported_headers, + skip_headers=skip_headers, browser_headers=browser_headers, ) @@ -430,7 +430,7 @@ def _get_api_params( default_params: Dict[str, Any], transparent_mode: bool, automap_params: Dict[str, Any], - unsupported_headers: Set[str], + skip_headers: Set[str], browser_headers: Dict[str, str], job_id: Optional[str], ) -> Optional[dict]: @@ -443,7 +443,7 @@ def _get_api_params( request, default_enabled=transparent_mode, default_params=automap_params, - unsupported_headers=unsupported_headers, + skip_headers=skip_headers, browser_headers=browser_headers, ) if api_params is None: @@ -473,11 +473,11 @@ def _load_default_params(settings, setting): return params -def _load_unsupported_headers(settings): +def _load_skip_headers(settings): return { header.strip().lower().encode() for header in settings.getlist( - "ZYTE_API_UNSUPPORTED_HEADERS", + "ZYTE_API_SKIP_HEADERS", ["Cookie", "User-Agent"], ) } @@ -538,7 +538,7 @@ def __init__( self._job_id = crawler.settings.get("JOB") self._retry_policy = _load_retry_policy(settings) self._transparent_mode = settings.getbool("ZYTE_API_TRANSPARENT_MODE", False) - self._unsupported_headers = _load_unsupported_headers(settings) + self._skip_headers = _load_skip_headers(settings) def download_request(self, request: Request, spider: Spider) -> Deferred: api_params = _get_api_params( @@ -546,7 +546,7 @@ def download_request(self, request: Request, spider: Spider) -> Deferred: default_params=self._default_params, transparent_mode=self._transparent_mode, automap_params=self._automap_params, - unsupported_headers=self._unsupported_headers, + skip_headers=self._skip_headers, browser_headers=self._browser_headers, job_id=self._job_id, ) 
diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 603a831f..f91614f1 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -240,13 +240,13 @@ async def parse(self, response): BROWSER_HEADERS = {b"referer": "referer"} DEFAULT_PARAMS: Dict[str, Any] = {} TRANSPARENT_MODE = False -UNSUPPORTED_HEADERS = {b"cookie", b"user-agent"} +SKIP_HEADERS = {b"cookie", b"user-agent"} JOB_ID = None GET_API_PARAMS_KWARGS = { "default_params": DEFAULT_PARAMS, "transparent_mode": TRANSPARENT_MODE, "automap_params": AUTOMAP_PARAMS, - "unsupported_headers": UNSUPPORTED_HEADERS, + "skip_headers": SKIP_HEADERS, "browser_headers": BROWSER_HEADERS, "job_id": JOB_ID, } @@ -276,7 +276,7 @@ async def test_get_api_params_input_custom(mockserver): "ZYTE_API_BROWSER_HEADERS": {"B": "b"}, "ZYTE_API_DEFAULT_PARAMS": {"a": "b"}, "ZYTE_API_AUTOMAP_PARAMS": {"c": "d"}, - "ZYTE_API_UNSUPPORTED_HEADERS": {"A"}, + "ZYTE_API_SKIP_HEADERS": {"A"}, } async with mockserver.make_handler(settings) as handler: patch_path = "scrapy_zyte_api.handler._get_api_params" @@ -289,7 +289,7 @@ async def test_get_api_params_input_custom(mockserver): default_params={"a": "b"}, transparent_mode=True, automap_params={"c": "d"}, - unsupported_headers={b"a"}, + skip_headers={b"a"}, browser_headers={b"b": "b"}, job_id="1/2/3", ) @@ -1488,12 +1488,12 @@ def test_automap_headers(headers, meta, expected, warnings, caplog): @pytest.mark.parametrize( "global_kwargs,headers,meta,expected,warnings", [ - # You may update the ZYTE_API_UNSUPPORTED_HEADERS setting to remove + # You may update the ZYTE_API_SKIP_HEADERS setting to remove # headers that the customHttpRequestHeaders parameter starts supporting # in the future. 
( { - "unsupported_headers": {b"cookie"}, + "skip_headers": {b"cookie"}, }, { "Cookie": "", From 41d9a705ab257f3eeae5e2c0b16c3b017dda592e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 14 Oct 2022 11:28:03 +0200 Subject: [PATCH 67/81] README: add code examples of automated parameter mapping --- README.rst | 67 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 59 insertions(+), 8 deletions(-) diff --git a/README.rst b/README.rst index fb05a539..3ac12a56 100644 --- a/README.rst +++ b/README.rst @@ -124,8 +124,8 @@ pass ``dont_filter=True`` to ``Request``. Using transparent mode ---------------------- -Set the ``ZYTE_API_TRANSPARENT_MODE`` setting to ``True`` to handle Scrapy -requests as follows: +Set the ``ZYTE_API_TRANSPARENT_MODE`` `Scrapy setting`_ to ``True`` to handle +Scrapy requests as follows: - By default, requests are sent through Zyte API with automatically-mapped parameters. See **Sending requests with automatically-mapped parameters** @@ -151,7 +151,7 @@ For example: class SampleQuotesSpider(scrapy.Spider): name = "sample_quotes" - start_urls = ["http://quotes.toscrape.com/"] + start_urls = ["https://quotes.toscrape.com/"] custom_settings = { "ZYTE_API_TRANSPARENT_MODE": True, @@ -185,7 +185,7 @@ For example: def start_requests(self): yield scrapy.Request( - url="http://quotes.toscrape.com/", + url="https://quotes.toscrape.com/", meta={ "zyte_api": { "httpResponseBody": True, @@ -227,7 +227,7 @@ For example: def start_requests(self): yield scrapy.Request( - url="http://quotes.toscrape.com/", + url="https://quotes.toscrape.com/", meta={ "zyte_api": { "zyte_api_automap": True, @@ -246,7 +246,10 @@ below. 
Automated parameter mapping --------------------------- -Automated parameter mapping chooses Zyte API parameters as follows by default: +When you enable automated parameter mapping, be it through transparent mode +(see **Using transparent mode** above) or for a specific request (see **Sending +requests with automatically-mapped parameters** above), Zyte API parameters are +chosen as follows by default: - ``httpResponseBody`` and ``httpResponseHeaders`` are set to ``True``. @@ -260,6 +263,31 @@ Automated parameter mapping chooses Zyte API parameters as follows by default: - ``Request.body`` becomes ``httpRequestBody``. +For example, the following Scrapy request: + +.. code-block:: python + + Request( + method="POST", + url="https://httpbin.org/anything", + headers={"Content-Type": "application/json"}, + body=b'{"foo": "bar"}', + ) + +Results in a request to the Zyte API data extraction endpoint with the +following parameters: + +.. code-block:: javascript + + { + "httpResponseBody": true, + "httpResponseHeaders": true, + "url": "https://httpbin.org/anything", + "httpRequestMethod": "POST", + "customHttpRequestHeaders": [{"name": "Content-Type", "value": "application/json"}], + "httpRequestBody": "eyJmb28iOiAiYmFyIn0=" + } + You may set the ``zyte_api_automap`` key in `Request.meta `_ to a ``dict`` of Zyte API parameters to extend or override choices made by @@ -273,9 +301,32 @@ parameter mapping as a side effect: - Setting ``screenshot`` to ``True`` without also setting ``browserHtml`` to ``True`` unsets ``httpResponseHeaders``. +For example, the following Scrapy request: + +.. code-block:: python + + Request( + url="https://quotes.toscrape.com", + headers={"Referer": "https://example.com/"}, + meta={"zyte_api_automap": {"screenshot": True}}, + ) + +Results in a request to the Zyte API data extraction endpoint with the +following parameters: + +.. 
code-block:: javascript + + { + "screenshot": true, + "url": "https://quotes.toscrape.com", + "requestHeaders": {"referer": "https://example.com/"}, + } + When mapping headers, headers not supported by Zyte API are excluded from the -mapping by default. Use the following settings to change which headers are -included or excluded from header mapping: +mapping by default. Use the following `Scrapy settings`_ to change which +headers are included or excluded from header mapping: + +.. _Scrapy settings: https://docs.scrapy.org/en/latest/topics/settings.html - ``ZYTE_API_SKIP_HEADERS`` determines headers that must *not* be mapped as ``customHttpRequestHeaders``, and its default value is: From 8e93e00d97e17c85f833b5f040aaff9fc8389924 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 14 Oct 2022 11:30:35 +0200 Subject: [PATCH 68/81] Clarify how default parameter settings affect unrelated requests --- README.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.rst b/README.rst index 3ac12a56..70a3d4a1 100644 --- a/README.rst +++ b/README.rst @@ -401,6 +401,10 @@ For example, if you set ``ZYTE_API_DEFAULT_PARAMS`` to Parameters in these settings are merged with request-specific parameters, with request-specific parameters taking precedence. +``ZYTE_API_DEFAULT_PARAMS`` has no effect on requests that use automated +parameter mapping, and ``ZYTE_API_AUTOMAP_PARAMS`` has no effect on requests +that use manually-defined parameters. 
+ Customizing the retry policy ============================ From 0019ea28560d0e046b787f8b103a928b119f9381 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 14 Oct 2022 11:32:05 +0200 Subject: [PATCH 69/81] =?UTF-8?q?=5Fiter=5Fheaders:=20parameter=20?= =?UTF-8?q?=E2=86=92=20header=5Fparameter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scrapy_zyte_api/handler.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index aef29318..f6145017 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -36,13 +36,14 @@ def _iter_headers( *, api_params: Dict[str, Any], request: Request, - parameter: str, + header_parameter: str, ): - headers = api_params.get(parameter) + headers = api_params.get(header_parameter) if headers not in (None, True): logger.warning( - f"Request {request} defines the Zyte API {parameter} parameter, " - f"overriding Request.headers. Use Request.headers instead." + f"Request {request} defines the Zyte API {header_parameter} " + f"parameter, overriding Request.headers. Use Request.headers " + f"instead." 
) return if not request.headers: @@ -65,7 +66,7 @@ def _map_custom_http_request_headers( for k, lowercase_k, decoded_v in _iter_headers( api_params=api_params, request=request, - parameter="customHttpRequestHeaders", + header_parameter="customHttpRequestHeaders", ): if lowercase_k in skip_headers: if lowercase_k != b"user-agent" or decoded_v != DEFAULT_USER_AGENT: @@ -90,7 +91,7 @@ def _map_request_headers( for k, lowercase_k, decoded_v in _iter_headers( api_params=api_params, request=request, - parameter="requestHeaders", + header_parameter="requestHeaders", ): key = browser_headers.get(lowercase_k) if key is not None: From 82cbc5430c121ceca6df063629153623bf7ec281 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 14 Oct 2022 11:51:41 +0200 Subject: [PATCH 70/81] Move parameter handling to its own module --- scrapy_zyte_api/_params.py | 443 +++++++++++++++++++++++++++++++++++++ scrapy_zyte_api/handler.py | 443 +------------------------------------ 2 files changed, 445 insertions(+), 441 deletions(-) create mode 100644 scrapy_zyte_api/_params.py diff --git a/scrapy_zyte_api/_params.py b/scrapy_zyte_api/_params.py new file mode 100644 index 00000000..dd140cd6 --- /dev/null +++ b/scrapy_zyte_api/_params.py @@ -0,0 +1,443 @@ +from base64 import b64decode, b64encode +from copy import copy +from logging import getLogger +from typing import Any, Dict, Mapping, Optional, Set +from warnings import warn + +from scrapy import Request +from scrapy.settings.default_settings import DEFAULT_REQUEST_HEADERS +from scrapy.settings.default_settings import USER_AGENT as DEFAULT_USER_AGENT + +logger = getLogger(__name__) + +_DEFAULT_API_PARAMS = { + "browserHtml": False, + "screenshot": False, +} + + +def _iter_headers( + *, + api_params: Dict[str, Any], + request: Request, + header_parameter: str, +): + headers = api_params.get(header_parameter) + if headers not in (None, True): + logger.warning( + f"Request {request} defines the Zyte API {header_parameter} " 
+ f"parameter, overriding Request.headers. Use Request.headers " + f"instead." + ) + return + if not request.headers: + return + for k, v in request.headers.items(): + if not v: + continue + decoded_v = b",".join(v).decode() + lowercase_k = k.strip().lower() + yield k, lowercase_k, decoded_v + + +def _map_custom_http_request_headers( + *, + api_params: Dict[str, Any], + request: Request, + skip_headers: Set[str], +): + headers = [] + for k, lowercase_k, decoded_v in _iter_headers( + api_params=api_params, + request=request, + header_parameter="customHttpRequestHeaders", + ): + if lowercase_k in skip_headers: + if lowercase_k != b"user-agent" or decoded_v != DEFAULT_USER_AGENT: + logger.warning( + f"Request {request} defines header {k}, which " + f"cannot be mapped into the Zyte API " + f"customHttpRequestHeaders parameter." + ) + continue + headers.append({"name": k.decode(), "value": decoded_v}) + if headers: + api_params["customHttpRequestHeaders"] = headers + + +def _map_request_headers( + *, + api_params: Dict[str, Any], + request: Request, + browser_headers: Dict[str, str], +): + request_headers = {} + for k, lowercase_k, decoded_v in _iter_headers( + api_params=api_params, + request=request, + header_parameter="requestHeaders", + ): + key = browser_headers.get(lowercase_k) + if key is not None: + request_headers[key] = decoded_v + elif not ( + ( + lowercase_k == b"accept" + and decoded_v == DEFAULT_REQUEST_HEADERS["Accept"] + ) + or ( + lowercase_k == b"accept-language" + and decoded_v == DEFAULT_REQUEST_HEADERS["Accept-Language"] + ) + or (lowercase_k == b"user-agent" and decoded_v == DEFAULT_USER_AGENT) + ): + logger.warning( + f"Request {request} defines header {k}, which " + f"cannot be mapped into the Zyte API requestHeaders " + f"parameter." 
+ ) + if request_headers: + api_params["requestHeaders"] = request_headers + + +def _set_request_headers_from_request( + *, + api_params: Dict[str, Any], + request: Request, + skip_headers: Set[str], + browser_headers: Dict[str, str], +): + """Updates *api_params*, in place, based on *request*.""" + custom_http_request_headers = api_params.get("customHttpRequestHeaders") + request_headers = api_params.get("requestHeaders") + response_body = api_params.get("httpResponseBody") + + if ( + response_body + and custom_http_request_headers is not False + or custom_http_request_headers is True + ): + _map_custom_http_request_headers( + api_params=api_params, + request=request, + skip_headers=skip_headers, + ) + elif custom_http_request_headers is False: + api_params.pop("customHttpRequestHeaders") + + if ( + ( + not response_body + or any(api_params.get(k) for k in ("browserHtml", "screenshot")) + ) + and request_headers is not False + or request_headers is True + ): + _map_request_headers( + api_params=api_params, + request=request, + browser_headers=browser_headers, + ) + elif request_headers is False: + api_params.pop("requestHeaders") + + +def _set_http_response_body_from_request( + *, + api_params: Dict[str, Any], + request: Request, +): + if not any( + api_params.get(k) for k in ("httpResponseBody", "browserHtml", "screenshot") + ): + api_params.setdefault("httpResponseBody", True) + elif api_params.get("httpResponseBody") is True and not any( + api_params.get(k) for k in ("browserHtml", "screenshot") + ): + logger.warning( + "You do not need to set httpResponseBody to True if neither " + "browserHtml nor screenshot are set to True." + ) + elif api_params.get("httpResponseBody") is False: + logger.warning( + f"Request {request} unnecessarily defines the Zyte API " + f"'httpResponseBody' parameter with its default value, False. " + f"It will not be sent to the server." 
+ ) + if api_params.get("httpResponseBody") is False: + api_params.pop("httpResponseBody") + + +def _set_http_response_headers_from_request( + *, + api_params: Dict[str, Any], + default_params: Dict[str, Any], + meta_params: Dict[str, Any], +): + if any(api_params.get(k) for k in ("httpResponseBody", "browserHtml")): + if api_params.get("httpResponseHeaders") is True and not ( + default_params.get("httpResponseHeaders") is True + and "httpResponseHeaders" not in meta_params + ): + logger.error(default_params) + logger.warning( + "You do not need to set httpResponseHeaders to True if " + "you set httpResponseBody or browserHtml to True. Note " + "that httpResponseBody is set to True automatically if " + "neither browserHtml nor screenshot are set to True." + ) + api_params.setdefault("httpResponseHeaders", True) + elif ( + api_params.get("httpResponseHeaders") is False + and not default_params.get("httpResponseHeaders") is False + ): + logger.warning( + "You do not need to set httpResponseHeaders to False if " + "you do set httpResponseBody or browserHtml to True. Note " + "that httpResponseBody is set to True automatically if " + "neither browserHtml nor screenshot are set to True." + ) + if api_params.get("httpResponseHeaders") is False: + api_params.pop("httpResponseHeaders") + + +def _set_http_request_method_from_request( + *, + api_params: Dict[str, Any], + request: Request, +): + method = api_params.get("httpRequestMethod") + if method: + logger.warning( + f"Request {request} uses the Zyte API httpRequestMethod " + f"parameter, overriding Request.method. Use Request.method " + f"instead." + ) + if method != request.method: + logger.warning( + f"The HTTP method of request {request} ({request.method}) " + f"does not match the Zyte API httpRequestMethod parameter " + f"({method})." 
+ ) + elif request.method != "GET": + api_params["httpRequestMethod"] = request.method + + +def _set_http_request_body_from_request( + *, + api_params: Dict[str, Any], + request: Request, +): + body = api_params.get("httpRequestBody") + if body: + logger.warning( + f"Request {request} uses the Zyte API httpRequestBody parameter, " + f"overriding Request.body. Use Request.body instead." + ) + decoded_body = b64decode(body) + if decoded_body != request.body: + logger.warning( + f"The body of request {request} ({request.body!r}) " + f"does not match the Zyte API httpRequestBody parameter " + f"({body!r}; decoded: {decoded_body!r})." + ) + elif request.body != b"": + base64_body = b64encode(request.body).decode() + api_params["httpRequestBody"] = base64_body + + +def _unset_unneeded_api_params( + *, + api_params: Dict[str, Any], + default_params: Dict[str, Any], + request: Request, +): + for param, default_value in _DEFAULT_API_PARAMS.items(): + if api_params.get(param) != default_value: + continue + if param not in default_params or default_params.get(param) == default_value: + logger.warning( + f"Request {request} unnecessarily defines the Zyte API {param!r} " + f"parameter with its default value, {default_value!r}. It will " + f"not be sent to the server." 
+ ) + api_params.pop(param) + + +def _update_api_params_from_request( + api_params: Dict[str, Any], + request: Request, + *, + default_params: Dict[str, Any], + meta_params: Dict[str, Any], + skip_headers: Set[str], + browser_headers: Dict[str, str], +): + _set_http_response_body_from_request(api_params=api_params, request=request) + _set_http_response_headers_from_request( + api_params=api_params, + default_params=default_params, + meta_params=meta_params, + ) + _set_http_request_method_from_request(api_params=api_params, request=request) + _set_request_headers_from_request( + api_params=api_params, + request=request, + skip_headers=skip_headers, + browser_headers=browser_headers, + ) + _set_http_request_body_from_request(api_params=api_params, request=request) + _unset_unneeded_api_params( + api_params=api_params, request=request, default_params=default_params + ) + return api_params + + +def _copy_meta_params_as_dict( + meta_params: Dict[str, Any], + *, + param: str, + request: Request, +): + if meta_params is True: + return {} + elif not isinstance(meta_params, Mapping): + raise ValueError( + f"'{param}' parameters in the request meta should be provided as " + f"a dictionary, got {type(meta_params)} instead in {request}." + ) + else: + return copy(meta_params) + + +def _merge_params( + *, + default_params: Dict[str, Any], + meta_params: Dict[str, Any], + param: str, + setting: str, + request: Request, +): + params = copy(default_params) + meta_params = copy(meta_params) + for k in list(meta_params): + if meta_params[k] is not None: + continue + meta_params.pop(k) + if k in params: + params.pop(k) + else: + logger.warning( + f"In request {request} {param!r} parameter {k} is None, " + f"which is a value reserved to unset parameters defined in " + f"the {setting} setting, but the setting does not define such " + f"a parameter." 
+ ) + params.update(meta_params) + return params + + +def _get_raw_params( + request: Request, + *, + default_params: Dict[str, Any], +): + meta_params = request.meta.get("zyte_api", False) + if meta_params is False: + return None + + if not meta_params and meta_params != {}: + warn( + f"Setting the zyte_api request metadata key to " + f"{meta_params!r} is deprecated. Use False instead.", + DeprecationWarning, + ) + return None + + meta_params = _copy_meta_params_as_dict( + meta_params, + param="zyte_api", + request=request, + ) + + return _merge_params( + default_params=default_params, + meta_params=meta_params, + param="zyte_api", + setting="ZYTE_API_DEFAULT_PARAMS", + request=request, + ) + + +def _get_automap_params( + request: Request, + *, + default_enabled: bool, + default_params: Dict[str, Any], + skip_headers: Set[str], + browser_headers: Dict[str, str], +): + meta_params = request.meta.get("zyte_api_automap", default_enabled) + if meta_params is False: + return None + + meta_params = _copy_meta_params_as_dict( + meta_params, + param="zyte_api_automap", + request=request, + ) + + params = _merge_params( + default_params=default_params, + meta_params=meta_params, + param="zyte_api_automap", + setting="ZYTE_API_AUTOMAP_PARAMS", + request=request, + ) + + _update_api_params_from_request( + params, + request, + default_params=default_params, + meta_params=meta_params, + skip_headers=skip_headers, + browser_headers=browser_headers, + ) + + return params + + +def _get_api_params( + request: Request, + *, + default_params: Dict[str, Any], + transparent_mode: bool, + automap_params: Dict[str, Any], + skip_headers: Set[str], + browser_headers: Dict[str, str], + job_id: Optional[str], +) -> Optional[dict]: + """Returns a dictionary of API parameters that must be sent to Zyte API for + the specified request, or None if the request should not be sent through + Zyte API.""" + api_params = _get_raw_params(request, default_params=default_params) + if api_params is None: 
+ api_params = _get_automap_params( + request, + default_enabled=transparent_mode, + default_params=automap_params, + skip_headers=skip_headers, + browser_headers=browser_headers, + ) + if api_params is None: + return None + elif request.meta.get("zyte_api_automap", False) is not False: + raise ValueError( + f"Request {request} combines manually-defined parameters and " + f"automatically-mapped parameters." + ) + + if job_id is not None: + api_params["jobId"] = job_id + + return api_params diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index f6145017..0b5e044a 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -1,8 +1,5 @@ import logging -from base64 import b64decode, b64encode -from copy import copy -from typing import Any, Dict, Generator, Mapping, Optional, Set, Union -from warnings import warn +from typing import Generator, Optional, Union from scrapy import Spider from scrapy.core.downloader.handlers.http import HTTPDownloadHandler @@ -10,8 +7,6 @@ from scrapy.exceptions import NotConfigured from scrapy.http import Request from scrapy.settings import Settings -from scrapy.settings.default_settings import DEFAULT_REQUEST_HEADERS -from scrapy.settings.default_settings import USER_AGENT as DEFAULT_USER_AGENT from scrapy.utils.defer import deferred_from_coro from scrapy.utils.misc import load_object from scrapy.utils.reactor import verify_installed_reactor @@ -21,446 +16,12 @@ from zyte_api.apikey import NoApiKey from zyte_api.constants import API_URL +from ._params import _get_api_params from .responses import ZyteAPIResponse, ZyteAPITextResponse, _process_response logger = logging.getLogger(__name__) -_DEFAULT_API_PARAMS = { - "browserHtml": False, - "screenshot": False, -} - - -def _iter_headers( - *, - api_params: Dict[str, Any], - request: Request, - header_parameter: str, -): - headers = api_params.get(header_parameter) - if headers not in (None, True): - logger.warning( - f"Request {request} defines the Zyte 
API {header_parameter} " - f"parameter, overriding Request.headers. Use Request.headers " - f"instead." - ) - return - if not request.headers: - return - for k, v in request.headers.items(): - if not v: - continue - decoded_v = b",".join(v).decode() - lowercase_k = k.strip().lower() - yield k, lowercase_k, decoded_v - - -def _map_custom_http_request_headers( - *, - api_params: Dict[str, Any], - request: Request, - skip_headers: Set[str], -): - headers = [] - for k, lowercase_k, decoded_v in _iter_headers( - api_params=api_params, - request=request, - header_parameter="customHttpRequestHeaders", - ): - if lowercase_k in skip_headers: - if lowercase_k != b"user-agent" or decoded_v != DEFAULT_USER_AGENT: - logger.warning( - f"Request {request} defines header {k}, which " - f"cannot be mapped into the Zyte API " - f"customHttpRequestHeaders parameter." - ) - continue - headers.append({"name": k.decode(), "value": decoded_v}) - if headers: - api_params["customHttpRequestHeaders"] = headers - - -def _map_request_headers( - *, - api_params: Dict[str, Any], - request: Request, - browser_headers: Dict[str, str], -): - request_headers = {} - for k, lowercase_k, decoded_v in _iter_headers( - api_params=api_params, - request=request, - header_parameter="requestHeaders", - ): - key = browser_headers.get(lowercase_k) - if key is not None: - request_headers[key] = decoded_v - elif not ( - ( - lowercase_k == b"accept" - and decoded_v == DEFAULT_REQUEST_HEADERS["Accept"] - ) - or ( - lowercase_k == b"accept-language" - and decoded_v == DEFAULT_REQUEST_HEADERS["Accept-Language"] - ) - or (lowercase_k == b"user-agent" and decoded_v == DEFAULT_USER_AGENT) - ): - logger.warning( - f"Request {request} defines header {k}, which " - f"cannot be mapped into the Zyte API requestHeaders " - f"parameter." 
- ) - if request_headers: - api_params["requestHeaders"] = request_headers - - -def _set_request_headers_from_request( - *, - api_params: Dict[str, Any], - request: Request, - skip_headers: Set[str], - browser_headers: Dict[str, str], -): - """Updates *api_params*, in place, based on *request*.""" - custom_http_request_headers = api_params.get("customHttpRequestHeaders") - request_headers = api_params.get("requestHeaders") - response_body = api_params.get("httpResponseBody") - - if ( - response_body - and custom_http_request_headers is not False - or custom_http_request_headers is True - ): - _map_custom_http_request_headers( - api_params=api_params, - request=request, - skip_headers=skip_headers, - ) - elif custom_http_request_headers is False: - api_params.pop("customHttpRequestHeaders") - - if ( - ( - not response_body - or any(api_params.get(k) for k in ("browserHtml", "screenshot")) - ) - and request_headers is not False - or request_headers is True - ): - _map_request_headers( - api_params=api_params, - request=request, - browser_headers=browser_headers, - ) - elif request_headers is False: - api_params.pop("requestHeaders") - - -def _set_http_response_body_from_request( - *, - api_params: Dict[str, Any], - request: Request, -): - if not any( - api_params.get(k) for k in ("httpResponseBody", "browserHtml", "screenshot") - ): - api_params.setdefault("httpResponseBody", True) - elif api_params.get("httpResponseBody") is True and not any( - api_params.get(k) for k in ("browserHtml", "screenshot") - ): - logger.warning( - "You do not need to set httpResponseBody to True if neither " - "browserHtml nor screenshot are set to True." - ) - elif api_params.get("httpResponseBody") is False: - logging.warning( - f"Request {request} unnecessarily defines the Zyte API " - f"'httpResponseBody' parameter with its default value, False. " - f"It will not be sent to the server." 
- ) - if api_params.get("httpResponseBody") is False: - api_params.pop("httpResponseBody") - - -def _set_http_response_headers_from_request( - *, - api_params: Dict[str, Any], - default_params: Dict[str, Any], - meta_params: Dict[str, Any], - request: Request, -): - if any(api_params.get(k) for k in ("httpResponseBody", "browserHtml")): - if api_params.get("httpResponseHeaders") is True and not ( - default_params.get("httpResponseHeaders") is True - and "httpResponseHeaders" not in meta_params - ): - logger.error(default_params) - logger.warning( - "You do not need to set httpResponseHeaders to True if " - "you set httpResponseBody or browserHtml to True. Note " - "that httpResponseBody is set to True automatically if " - "neither browserHtml nor screenshot are set to True." - ) - api_params.setdefault("httpResponseHeaders", True) - elif ( - api_params.get("httpResponseHeaders") is False - and not default_params.get("httpResponseHeaders") is False - ): - logger.warning( - "You do not need to set httpResponseHeaders to False if " - "you do set httpResponseBody or browserHtml to True. Note " - "that httpResponseBody is set to True automatically if " - "neither browserHtml nor screenshot are set to True." - ) - if api_params.get("httpResponseHeaders") is False: - api_params.pop("httpResponseHeaders") - - -def _set_http_request_method_from_request( - *, - api_params: Dict[str, Any], - request: Request, -): - method = api_params.get("httpRequestMethod") - if method: - logger.warning( - f"Request {request} uses the Zyte API httpRequestMethod " - f"parameter, overriding Request.method. Use Request.method " - f"instead." - ) - if method != request.method: - logger.warning( - f"The HTTP method of request {request} ({request.method}) " - f"does not match the Zyte API httpRequestMethod parameter " - f"({method})." 
- ) - elif request.method != "GET": - api_params["httpRequestMethod"] = request.method - - -def _set_http_request_body_from_request( - *, - api_params: Dict[str, Any], - request: Request, -): - body = api_params.get("httpRequestBody") - if body: - logger.warning( - f"Request {request} uses the Zyte API httpRequestBody parameter, " - f"overriding Request.body. Use Request.body instead." - ) - decoded_body = b64decode(body) - if decoded_body != request.body: - logger.warning( - f"The body of request {request} ({request.body!r}) " - f"does not match the Zyte API httpRequestBody parameter " - f"({body!r}; decoded: {decoded_body!r})." - ) - elif request.body != b"": - base64_body = b64encode(request.body).decode() - api_params["httpRequestBody"] = base64_body - - -def _unset_unneeded_api_params( - *, - api_params: Dict[str, Any], - default_params: Dict[str, Any], - request: Request, -): - for param, default_value in _DEFAULT_API_PARAMS.items(): - if api_params.get(param) != default_value: - continue - if param not in default_params or default_params.get(param) == default_value: - logging.warning( - f"Request {request} unnecessarily defines the Zyte API {param!r} " - f"parameter with its default value, {default_value!r}. It will " - f"not be sent to the server." 
- ) - api_params.pop(param) - - -def _update_api_params_from_request( - api_params: Dict[str, Any], - request: Request, - *, - default_params: Dict[str, Any], - meta_params: Dict[str, Any], - skip_headers: Set[str], - browser_headers: Dict[str, str], -): - _set_http_response_body_from_request(api_params=api_params, request=request) - _set_http_response_headers_from_request( - api_params=api_params, - request=request, - default_params=default_params, - meta_params=meta_params, - ) - _set_http_request_method_from_request(api_params=api_params, request=request) - _set_request_headers_from_request( - api_params=api_params, - request=request, - skip_headers=skip_headers, - browser_headers=browser_headers, - ) - _set_http_request_body_from_request(api_params=api_params, request=request) - _unset_unneeded_api_params( - api_params=api_params, request=request, default_params=default_params - ) - return api_params - - -def _copy_meta_params_as_dict( - meta_params: Dict[str, Any], - *, - param: str, - request: Request, -): - if meta_params is True: - return {} - elif not isinstance(meta_params, Mapping): - raise ValueError( - f"'{param}' parameters in the request meta should be provided as " - f"a dictionary, got {type(meta_params)} instead in {request}." - ) - else: - return copy(meta_params) - - -def _merge_params( - *, - default_params: Dict[str, Any], - meta_params: Dict[str, Any], - param: str, - setting: str, - request: Request, -): - params = copy(default_params) - meta_params = copy(meta_params) - for k in list(meta_params): - if meta_params[k] is not None: - continue - meta_params.pop(k) - if k in params: - params.pop(k) - else: - logger.warning( - f"In request {request} {param!r} parameter {k} is None, " - f"which is a value reserved to unset parameters defined in " - f"the {setting} setting, but the setting does not define such " - f"a parameter." 
- ) - params.update(meta_params) - return params - - -def _get_raw_params( - request: Request, - *, - default_params: Dict[str, Any], -): - meta_params = request.meta.get("zyte_api", False) - if meta_params is False: - return None - - if not meta_params and meta_params != {}: - warn( - f"Setting the zyte_api request metadata key to " - f"{meta_params!r} is deprecated. Use False instead.", - DeprecationWarning, - ) - return None - - meta_params = _copy_meta_params_as_dict( - meta_params, - param="zyte_api", - request=request, - ) - - return _merge_params( - default_params=default_params, - meta_params=meta_params, - param="zyte_api", - setting="ZYTE_API_DEFAULT_PARAMS", - request=request, - ) - - -def _get_automap_params( - request: Request, - *, - default_enabled: bool, - default_params: Dict[str, Any], - skip_headers: Set[str], - browser_headers: Dict[str, str], -): - meta_params = request.meta.get("zyte_api_automap", default_enabled) - if meta_params is False: - return None - - meta_params = _copy_meta_params_as_dict( - meta_params, - param="zyte_api_automap", - request=request, - ) - - params = _merge_params( - default_params=default_params, - meta_params=meta_params, - param="zyte_api_automap", - setting="ZYTE_API_AUTOMAP_PARAMS", - request=request, - ) - - _update_api_params_from_request( - params, - request, - default_params=default_params, - meta_params=meta_params, - skip_headers=skip_headers, - browser_headers=browser_headers, - ) - - return params - - -def _get_api_params( - request: Request, - *, - default_params: Dict[str, Any], - transparent_mode: bool, - automap_params: Dict[str, Any], - skip_headers: Set[str], - browser_headers: Dict[str, str], - job_id: Optional[str], -) -> Optional[dict]: - """Returns a dictionary of API parameters that must be sent to Zyte API for - the specified request, or None if the request should not be sent through - Zyte API.""" - api_params = _get_raw_params(request, default_params=default_params) - if api_params is None: 
- api_params = _get_automap_params( - request, - default_enabled=transparent_mode, - default_params=automap_params, - skip_headers=skip_headers, - browser_headers=browser_headers, - ) - if api_params is None: - return None - elif request.meta.get("zyte_api_automap", False) is not False: - raise ValueError( - f"Request {request} combines manually-defined parameters and " - f"automatically-mapped parameters." - ) - - if job_id is not None: - api_params["jobId"] = job_id - - return api_params - - def _load_default_params(settings, setting): params = settings.getdict(setting) for param in list(params): From 109ed8764b3fe5e1ff4a35990fbaa4819a07c50e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 14 Oct 2022 11:55:52 +0200 Subject: [PATCH 71/81] make_handler: clarify when handler is set to None --- tests/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/__init__.py b/tests/__init__.py index 5775fee7..5a35538c 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -36,7 +36,7 @@ async def make_handler(settings: dict, api_url: Optional[str] = None): settings=None, crawler=crawler, ) - except NotConfigured: + except NotConfigured: # i.e. 
ZYTE_API_ENABLED=False handler = None try: yield handler From 33a050189b993aaf90ce1f5bd716eaec3b3ef679 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 14 Oct 2022 11:57:26 +0200 Subject: [PATCH 72/81] =?UTF-8?q?tests:=20fix=20typo=20(Request.meta=20?= =?UTF-8?q?=E2=86=92=20Request.method)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_api_requests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index f91614f1..8680a69b 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -882,7 +882,7 @@ def test_automap_header_output(meta, expected, warnings, caplog): ), # If httpRequestMethod is also specified in meta with the same value # as Request.method, a warning is logged asking to use only - # Request.meta. + # Request.method. *( ( request_method, From 9f7fb4bd4685227ba52a93d654bba1280031a734 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 14 Oct 2022 12:01:00 +0200 Subject: [PATCH 73/81] tests: Fix example Content-Length value --- tests/test_responses.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_responses.py b/tests/test_responses.py index a3cda820..df55e035 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -52,7 +52,7 @@ def raw_api_response_mixed(): "echoData": {"some_value": "here"}, "httpResponseHeaders": [ {"name": "Content-Type", "value": "text/html"}, - {"name": "Content-Length", "value": len(PAGE_CONTENT)}, + {"name": "Content-Length", "value": len(PAGE_CONTENT_2)}, ], "statusCode": 200, } From 6a1a4969dc18626278b4eb3286e0459ffcdbc42c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 14 Oct 2022 12:05:23 +0200 Subject: [PATCH 74/81] Clarify message about httpResponseHeaders=False being unnecessary --- scrapy_zyte_api/_params.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/scrapy_zyte_api/_params.py b/scrapy_zyte_api/_params.py index dd140cd6..89e4620e 100644 --- a/scrapy_zyte_api/_params.py +++ b/scrapy_zyte_api/_params.py @@ -193,7 +193,7 @@ def _set_http_response_headers_from_request( ): logger.warning( "You do not need to set httpResponseHeaders to False if " - "you do set httpResponseBody or browserHtml to True. Note " + "neither httpResponseBody nor browserHtml are set to True. Note " "that httpResponseBody is set to True automatically if " "neither browserHtml nor screenshot are set to True." ) From dcfc5b7a72a147d1398ff78a3b31893540c343ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 14 Oct 2022 12:05:57 +0200 Subject: [PATCH 75/81] Remove commented out code --- tests/test_api_requests.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 8680a69b..c9c30fe0 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -1658,7 +1658,6 @@ def test_automap_default_parameter_cleanup(meta, expected, warnings, caplog): _test_automap({}, {}, meta, expected, warnings, caplog) -# @pytest.mark.xfail(reason="To be implemented", strict=True) @pytest.mark.parametrize( "default_params,meta,expected,warnings", [ From 40e708eb0f97ee294e1abe2d9016d74d7195b266 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 14 Oct 2022 12:28:18 +0200 Subject: [PATCH 76/81] tests: fix Content-Length expectations --- tests/test_responses.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/test_responses.py b/tests/test_responses.py index df55e035..0a67e25e 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -58,7 +58,6 @@ def raw_api_response_mixed(): } -EXPECTED_HEADERS = {b"Content-Type": [b"text/html"], b"Content-Length": [b"44"]} EXPECTED_BODY = PAGE_CONTENT.encode("utf-8") @@ -87,20 +86,24 @@ def test_init(api_response, cls): 
@pytest.mark.parametrize( - "api_response,cls", + "api_response,cls,content_length", [ - (raw_api_response_browser, ZyteAPITextResponse), - (raw_api_response_body, ZyteAPIResponse), - (raw_api_response_mixed, ZyteAPITextResponse), + (raw_api_response_browser, ZyteAPITextResponse, 44), + (raw_api_response_body, ZyteAPIResponse, 44), + (raw_api_response_mixed, ZyteAPITextResponse, 49), ], ) -def test_text_from_api_response(api_response, cls): +def test_text_from_api_response(api_response, cls, content_length): response = cls.from_api_response(api_response()) assert response.raw_api_response == api_response() assert response.url == URL assert response.status == 200 - assert response.headers == EXPECTED_HEADERS + expected_headers = { + b"Content-Type": [b"text/html"], + b"Content-Length": [str(content_length).encode()], + } + assert response.headers == expected_headers assert response.body == EXPECTED_BODY assert response.flags == ["zyte-api"] assert response.request is None From 329ba58a2857ead395df82411e2b3ac77f7f7db2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 14 Oct 2022 12:34:05 +0200 Subject: [PATCH 77/81] Pin pyopenssl==22.0.0 on the pinned Tox environment --- tox.ini | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tox.ini b/tox.ini index 337ac6a1..b2530b78 100644 --- a/tox.ini +++ b/tox.ini @@ -23,6 +23,8 @@ deps = # https://stackoverflow.com/a/73046084 Twisted==21.7.0 + # https://github.com/scrapy/scrapy/issues/5635 + pyopenssl==22.0.0 # Earliest supported Scrapy version. 
From 79ecfd1f4930ca37e2b91285a7756ea23e4de969 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 17 Oct 2022 09:59:09 +0200 Subject: [PATCH 78/81] Document how httpResponseBody should be accompanied by httpResponseHeaders when defining parameters manually --- README.rst | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 70a3d4a1..90a5fea2 100644 --- a/README.rst +++ b/README.rst @@ -180,6 +180,32 @@ For example: import scrapy + class SampleQuotesSpider(scrapy.Spider): + name = "sample_quotes" + + def start_requests(self): + yield scrapy.Request( + url="https://quotes.toscrape.com/", + meta={ + "zyte_api": { + "browserHtml": True, + } + }, + ) + + def parse(self, response): + print(response.text) + # "…" + +Note that response headers are necessary for raw response decoding. When +defining parameters manually and requesting ``httpResponseBody`` extraction, +remember to also request ``httpResponseHeaders`` extraction: + +.. code-block:: python + + import scrapy + + class SampleQuotesSpider(scrapy.Spider): name = "sample_quotes" @@ -198,7 +224,7 @@ For example: print(response.text) # "…" -To learn about Zyte API parameters, see the `data extraction usage`_ and +To learn more about Zyte API parameters, see the `data extraction usage`_ and `API reference`_ pages of the `Zyte API documentation`_. .. 
_API reference: https://docs.zyte.com/zyte-api/openapi.html From dd67e12deb9be5dc527e328f528908c7f94528f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 17 Oct 2022 10:42:23 +0200 Subject: [PATCH 79/81] Add a section about response mapping --- README.rst | 150 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 106 insertions(+), 44 deletions(-) diff --git a/README.rst b/README.rst index 90a5fea2..e210de92 100644 --- a/README.rst +++ b/README.rst @@ -88,37 +88,11 @@ You can send requests through Zyte API in one of the following ways: requests with automatically-mapped parameters** below. Zyte API response parameters are mapped into Scrapy response parameters where -possible. The raw Zyte API response can be accessed via the -``raw_api_response`` attribute of the response object: - -.. code-block:: python - - def parse(self, response): - print(response.url) - # "https://quotes.toscrape.com/" - print(response.status) - # 200 - print(response.headers) - # {b"Content-Type": [b"text/html"], …} - print(response.text) - # "…" - print(response.raw_api_response) - # { - # "url": "https://quotes.toscrape.com/", - # "statusCode": 200, - # "httpResponseBody": "PGh0bWw+4oCmPC9odG1sPg==", - # "httpResponseHeaders": […], - # } - -When you use the Zyte API parameters ``browserHtml``, ``httpResponseBody``, or -``httpResponseHeaders``, the response body and headers are set accordingly. - -Note that, for Zyte API requests, the spider gets responses of -``ZyteAPIResponse`` and ``ZyteAPITextResponse`` types, which are respectively -subclasses of ``scrapy.http.Response`` and ``scrapy.http.TextResponse``. +possible. See **Response mapping** below for details. If multiple requests target the same URL with different Zyte API parameters, -pass ``dont_filter=True`` to ``Request``. +pass ``dont_filter=True`` to ``Request`` to prevent the duplicate request +filter of Scrapy from dropping all but the first request. 
Using transparent mode @@ -129,7 +103,7 @@ Scrapy requests as follows: - By default, requests are sent through Zyte API with automatically-mapped parameters. See **Sending requests with automatically-mapped parameters** - below for details about automatic parameter mapping. + below for details about automatic request parameter mapping. You do not need to set the ``zyte-api-automap`` request meta key to ``True``, but you can set it to a dictionary to extend your Zyte API @@ -265,17 +239,105 @@ For example: print(response.text) # "…" -See also **Using transparent mode** above and **Automated parameter mapping** -below. +See also **Using transparent mode** above and **Automated request parameter +mapping** below. + + +Response mapping +---------------- + +Zyte API responses are mapped with one of the following classes: + +- ``scrapy_zyte_api.responses.ZyteAPITextResponse``, a subclass of + ``scrapy.http.TextResponse``, is used to map text responses, i.e. responses + with ``browserHtml`` or responses with both ``httpResponseBody`` and + ``httpResponseHeaders`` with a text body (e.g. plain text, HTML, JSON). + +- ``scrapy_zyte_api.responses.ZyteAPIResponse``, a subclass of + ``scrapy.http.Response``, is used to map any other response. + +Zyte API response parameters are mapped into response class attributes where +possible: + +- ``url`` becomes ``response.url``. + +- ``statusCode`` becomes ``response.status``. + +- ``httpResponseHeaders`` becomes ``response.headers``. + +- ``browserHtml`` and ``httpResponseBody`` are mapped into both + ``response.text`` (``str``) and ``response.body`` (``bytes``). + + If none of these parameters were present, e.g. if the only requested output + was ``screenshot``, ``response.text`` and ``response.body`` would be empty. + + If a future version of Zyte API supported requesting both outputs on the + same request, and both parameters were present, ``browserHtml`` would be + the one mapped into ``response.text`` and ``response.body``. 
+ +Both response classes have a ``raw_api_response`` attribute that contains a +``dict`` with the complete, raw response from Zyte API, where you can find all +Zyte API response parameters, including those that are not mapped into other +response class attributes. + +For example, for a request for ``httpResponseBody`` and +``httpResponseHeaders``, you would get: + +.. code-block:: python + + def parse(self, response): + print(response.url) + # "https://quotes.toscrape.com/" + print(response.status) + # 200 + print(response.headers) + # {b"Content-Type": [b"text/html"], …} + print(response.text) + # "…" + print(response.body) + # b"…" + print(response.raw_api_response) + # { + # "url": "https://quotes.toscrape.com/", + # "statusCode": 200, + # "httpResponseBody": "PGh0bWw+4oCmPC9odG1sPg==", + # "httpResponseHeaders": […], + # } + +For a request for ``screenshot``, on the other hand, the response would look +as follows: + +.. code-block:: python + + def parse(self, response): + print(response.url) + # "https://quotes.toscrape.com/" + print(response.status) + # 200 + print(response.headers) + # {} + print(response.text) + # "" + print(response.body) + # b"" + print(response.raw_api_response) + # { + # "url": "https://quotes.toscrape.com/", + # "statusCode": 200, + # "screenshot": "iVBORw0KGgoAAAANSUh…", + # } + from base64 import b64decode + print(b64decode(response.raw_api_response["screenshot"])) + # b'\x89PNG\r\n\x1a\n\x00\x00\x00\r…' -Automated parameter mapping ---------------------------- +Automated request parameter mapping +----------------------------------- -When you enable automated parameter mapping, be it through transparent mode -(see **Using transparent mode** above) or for a speicfic request (see **Sending -requests with automatically-mapped parameters** above), Zyte API parameters are -chosen as follows by default: +When you enable automated request parameter mapping, be it through transparent +mode (see **Using transparent mode** above) or for a
specific request (see +**Sending requests with automatically-mapped parameters** above), Zyte API +parameters are chosen as follows by default: - ``httpResponseBody`` and ``httpResponseHeaders`` are set to ``True``. @@ -317,8 +379,8 @@ following parameters: You may set the ``zyte_api_automap`` key in `Request.meta `_ to a ``dict`` of Zyte API parameters to extend or override choices made by -automated parameter mapping. Some parameters modify the result of automated -parameter mapping as a side effect: +automated request parameter mapping. Some parameters modify the result of +automated request parameter mapping as a side effect: - Setting ``browserHtml`` or ``screenshot`` to ``True`` unsets ``httpResponseBody``, and makes ``Request.headers`` become @@ -371,8 +433,8 @@ headers are included or excluded from header mapping: {"Referer": "referer"} To maximize support for potential future changes in Zyte API, automated -parameter mapping allows some parameter values and parameter combinations that -Zyte API does not currently support, and may never support: +request parameter mapping allows some parameter values and parameter +combinations that Zyte API does not currently support, and may never support: - ``Request.method`` becomes ``httpRequestMethod`` even for unsupported_ ``httpRequestMethod`` values, and even if ``httpResponseBody`` is unset. @@ -428,8 +490,8 @@ Parameters in these settings are merged with request-specific parameters, with request-specific parameters taking precedence. ``ZYTE_API_DEFAULT_PARAMS`` has no effect on requests that use automated -parameter mapping, and ``ZYTE_API_AUTOMAP_PARAMS`` has no effect on requests -that use manually-defined parameters. +request parameter mapping, and ``ZYTE_API_AUTOMAP_PARAMS`` has no effect on +requests that use manually-defined parameters. 
Customizing the retry policy From c386036567375dd8f0187cdb771e878425aa54ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 17 Oct 2022 11:00:17 +0200 Subject: [PATCH 80/81] Discourage using certain parameters on ZYTE_API_AUTOMAP_PARAMS --- README.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.rst b/README.rst index e210de92..3367f847 100644 --- a/README.rst +++ b/README.rst @@ -493,6 +493,17 @@ request-specific parameters taking precedence. request parameter mapping, and ``ZYTE_API_AUTOMAP_PARAMS`` has no effect on requests that use manually-defined parameters. +When using transparent mode (see **Using transparent mode** above), be careful +of which parameters you define through ``ZYTE_API_AUTOMAP_PARAMS``. In +transparent mode, all Scrapy requests go through Zyte API, even requests that +Scrapy sends automatically, such as those for ``robots.txt`` files when +ROBOTSTXT_OBEY_ is ``True``, or those for sitemaps when using a `sitemap +spider`_. Certain parameters, like ``browserHtml`` or ``screenshot``, are not +meant to be used for every single request. + +.. _ROBOTSTXT_OBEY: https://docs.scrapy.org/en/latest/topics/settings.html#robotstxt-obey +.. 
_sitemap spider: https://docs.scrapy.org/en/latest/topics/spiders.html#sitemapspider + Customizing the retry policy ============================ From 80f7b46053055360f2377e9ba18f6934ffa49e88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 18 Oct 2022 09:54:20 +0200 Subject: [PATCH 81/81] Do not warn about explicit httpResponseBody or httpResponseHeaders, and do not enable httpResponseHeaders as a side effect of browserHtml --- README.rst | 62 +++++++++++++++++----- scrapy_zyte_api/_params.py | 20 +------ tests/test_api_requests.py | 103 +++++++++++++++---------------------- 3 files changed, 93 insertions(+), 92 deletions(-) diff --git a/README.rst b/README.rst index 3367f847..09b53cd6 100644 --- a/README.rst +++ b/README.rst @@ -341,6 +341,49 @@ parameters are chosen as follows by default: - ``httpResponseBody`` and ``httpResponseHeaders`` are set to ``True``. + This is subject to change without prior notice in future versions of + scrapy-zyte-api, so please account for the following: + + - If you are requesting a binary resource, such as a PDF file or an + image file, set ``httpResponseBody`` to ``True`` explicitly in your + requests: + + .. code-block:: python + + Request( + url="https://toscrape.com/img/zyte.png", + meta={ + "zyte_api": { + "zyte_api_automap": {"httpResponseBody": True}, + } + }, + ) + + In the future, we may stop setting ``httpResponseBody`` to ``True`` by + default, and instead use a different, new Zyte API parameter that only + works for non-binary responses (e.g. HTML, JSON, plain text). + + - If you need to access response headers, be it through + ``response.headers`` or through + ``response.raw_api_response["httpResponseHeaders"]``, set + ``httpResponseHeaders`` to ``True`` explicitly in your requests: + + ..
code-block:: python + + Request( + url="https://toscrape.com/", + meta={ + "zyte_api": { + "zyte_api_automap": {"httpResponseHeaders": True}, + } + }, + ) + + At the moment we request response headers because some response headers + are necessary to properly decode the response body as text. In the + future, Zyte API may be able to handle this decoding automatically, so + we would stop setting ``httpResponseHeaders`` to ``True`` by default. + - ``Request.url`` becomes ``url``, same as in requests with manually-defined parameters. @@ -379,24 +422,19 @@ following parameters: You may set the ``zyte_api_automap`` key in `Request.meta `_ to a ``dict`` of Zyte API parameters to extend or override choices made by -automated request parameter mapping. Some parameters modify the result of -automated request parameter mapping as a side effect: - -- Setting ``browserHtml`` or ``screenshot`` to ``True`` unsets - ``httpResponseBody``, and makes ``Request.headers`` become - ``requestHeaders`` instead of ``customHttpRequestHeaders``. +automated request parameter mapping. -- Setting ``screenshot`` to ``True`` without also setting ``browserHtml`` to - ``True`` unsets ``httpResponseHeaders``. - -For example, the following Scrapy request: +Setting ``browserHtml`` or ``screenshot`` to ``True`` unsets +``httpResponseBody`` and ``httpResponseHeaders``, and makes ``Request.headers`` +become ``requestHeaders`` instead of ``customHttpRequestHeaders``. For example, +the following Scrapy request: .. code-block:: python Request( url="https://quotes.toscrape.com", headers={"Referer": "https://example.com/"}, - meta={"zyte_api_automap": {"screenshot": True}}, + meta={"zyte_api_automap": {"browserHtml": True}}, ) Results in a request to the Zyte API data extraction endpoint with the @@ -405,7 +443,7 @@ following parameters: .. 
code-block:: javascript { - "screenshot": true, + "browserHtml": true, "url": "https://quotes.toscrape.com", "requestHeaders": {"referer": "https://example.com/"}, } diff --git a/scrapy_zyte_api/_params.py b/scrapy_zyte_api/_params.py index 89e4620e..df1390d2 100644 --- a/scrapy_zyte_api/_params.py +++ b/scrapy_zyte_api/_params.py @@ -151,13 +151,6 @@ def _set_http_response_body_from_request( api_params.get(k) for k in ("httpResponseBody", "browserHtml", "screenshot") ): api_params.setdefault("httpResponseBody", True) - elif api_params.get("httpResponseBody") is True and not any( - api_params.get(k) for k in ("browserHtml", "screenshot") - ): - logger.warning( - "You do not need to set httpResponseBody to True if neither " - "browserHtml nor screenshot are set to True." - ) elif api_params.get("httpResponseBody") is False: logger.warning( f"Request {request} unnecessarily defines the Zyte API " @@ -174,18 +167,7 @@ def _set_http_response_headers_from_request( default_params: Dict[str, Any], meta_params: Dict[str, Any], ): - if any(api_params.get(k) for k in ("httpResponseBody", "browserHtml")): - if api_params.get("httpResponseHeaders") is True and not ( - default_params.get("httpResponseHeaders") is True - and "httpResponseHeaders" not in meta_params - ): - logger.error(default_params) - logger.warning( - "You do not need to set httpResponseHeaders to True if " - "you set httpResponseBody or browserHtml to True. Note " - "that httpResponseBody is set to True automatically if " - "neither browserHtml nor screenshot are set to True." 
- ) + if api_params.get("httpResponseBody"): api_params.setdefault("httpResponseHeaders", True) elif ( api_params.get("httpResponseHeaders") is False diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index c9c30fe0..a06db7a0 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -156,14 +156,21 @@ async def test_coro_handling(zyte_api: bool, mockserver): ( {"zyte_api": {"echoData": Request("http://test.com")}}, TypeError, - "Got an error when processing Zyte API request (http://example.com): " - "Object of type Request is not JSON serializable", + ( + "Got an error when processing Zyte API request " + "(http://example.com): Object of type Request is not JSON " + "serializable" + ), ), ( {"zyte_api": {"browserHtml": True, "httpResponseBody": True}}, RequestError, - "Got Zyte API error (status=422, type='/request/unprocessable') while processing URL (http://example.com): " - "Incompatible parameters were found in the request.", + ( + "Got Zyte API error (status=422, " + "type='/request/unprocessable') while processing URL " + "(http://example.com): Incompatible parameters were found in " + "the request." + ), ), ], ) @@ -657,18 +664,19 @@ def _test_automap(global_kwargs, request_kwargs, meta, expected, warnings, caplo }, [], ), - # If httpResponseBody is unnecessarily requested in meta, a warning is - # logged. + # httpResponseBody can be explicitly requested in meta, and should be + # in cases where a binary response is expected, since automated mapping + # may stop working for binary responses in the future. ( {"httpResponseBody": True}, {"httpResponseBody": True, "httpResponseHeaders": True}, - ["do not need to set httpResponseBody to True"], + [], ), # If other main outputs are specified in meta, httpRequestBody is not # set. 
( {"browserHtml": True}, - {"browserHtml": True, "httpResponseHeaders": True}, + {"browserHtml": True}, [], ), ( @@ -678,7 +686,7 @@ def _test_automap(global_kwargs, request_kwargs, meta, expected, warnings, caplo ), ( {"browserHtml": True, "screenshot": True}, - {"browserHtml": True, "httpResponseHeaders": True, "screenshot": True}, + {"browserHtml": True, "screenshot": True}, [], ), # If no known main output is specified, and httpResponseBody is @@ -716,8 +724,7 @@ def test_automap_main_outputs(meta, expected, warnings, caplog): [ # Test cases where httpResponseHeaders is not specifically set to True # or False, where it is automatically set to True if httpResponseBody - # or browserHtml are also True, are covered in - # test_automap_main_outputs. + # is also True, are covered in test_automap_main_outputs. # # If httpResponseHeaders is set to True in a scenario where it would # not be implicitly set to True, it is passed as such. @@ -726,6 +733,11 @@ def test_automap_main_outputs(meta, expected, warnings, caplog): {"httpResponseHeaders": True}, [], ), + ( + {"browserHtml": True, "httpResponseHeaders": True}, + {"browserHtml": True, "httpResponseHeaders": True}, + [], + ), ( {"screenshot": True, "httpResponseHeaders": True}, {"screenshot": True, "httpResponseHeaders": True}, @@ -740,32 +752,25 @@ def test_automap_main_outputs(meta, expected, warnings, caplog): {"unknownMainOutput": True, "httpResponseHeaders": True}, [], ), - # If httpResponseHeaders is unnecessarily set to True where - # httpResponseBody or browserHtml are set to True implicitly or - # explicitly, httpResponseHeaders is set to True, and a warning is - # logged. + # Setting httpResponseHeaders to True where it would be already True + # implicitly, i.e. where httpResponseBody is set to True implicitly or + # explicitly, is OK and should not generate any warning. 
It is a way + # to make code future-proof, in case in the future httpResponseHeaders + # stops being set to True by default in those scenarios. ( {"httpResponseHeaders": True}, {"httpResponseBody": True, "httpResponseHeaders": True}, - ["do not need to set httpResponseHeaders to True"], + [], ), ( {"httpResponseBody": True, "httpResponseHeaders": True}, {"httpResponseBody": True, "httpResponseHeaders": True}, - [ - "do not need to set httpResponseHeaders to True", - "do not need to set httpResponseBody to True", - ], - ), - ( - {"browserHtml": True, "httpResponseHeaders": True}, - {"browserHtml": True, "httpResponseHeaders": True}, - ["do not need to set httpResponseHeaders to True"], + [], ), ( { - "httpResponseBody": True, "browserHtml": True, + "httpResponseBody": True, "httpResponseHeaders": True, }, { @@ -773,7 +778,7 @@ def test_automap_main_outputs(meta, expected, warnings, caplog): "httpResponseBody": True, "httpResponseHeaders": True, }, - ["do not need to set httpResponseHeaders to True"], + [], ), ( {"unknownMainOutput": True, "httpResponseHeaders": True}, @@ -782,20 +787,15 @@ def test_automap_main_outputs(meta, expected, warnings, caplog): "httpResponseBody": True, "httpResponseHeaders": True, }, - ["do not need to set httpResponseHeaders to True"], + [], ), # If httpResponseHeaders is set to False, httpResponseHeaders is not - # defined, even if httpResponseBody or browserHtml are set to True, - # implicitly or explicitly. + # defined, even if httpResponseBody is set to True, implicitly or + # explicitly. 
({"httpResponseHeaders": False}, {"httpResponseBody": True}, []), ( {"httpResponseBody": True, "httpResponseHeaders": False}, {"httpResponseBody": True}, - ["do not need to set httpResponseBody to True"], - ), - ( - {"browserHtml": True, "httpResponseHeaders": False}, - {"browserHtml": True}, [], ), ( @@ -813,14 +813,19 @@ def test_automap_main_outputs(meta, expected, warnings, caplog): [], ), # If httpResponseHeaders is unnecessarily set to False where - # httpResponseBody and browserHtml are set to False implicitly or - # explicitly, httpResponseHeaders is not defined, and a warning is + # httpResponseBody is set to False implicitly or explicitly, + # httpResponseHeaders is not defined, and a warning is # logged. ( {"httpResponseBody": False, "httpResponseHeaders": False}, {}, ["do not need to set httpResponseHeaders to False"], ), + ( + {"browserHtml": True, "httpResponseHeaders": False}, + {"browserHtml": True}, + ["do not need to set httpResponseHeaders to False"], + ), ( {"screenshot": True, "httpResponseHeaders": False}, {"screenshot": True}, @@ -929,7 +934,6 @@ def test_automap_header_output(meta, expected, warnings, caplog): { "browserHtml": True, "httpRequestMethod": "POST", - "httpResponseHeaders": True, }, [], ), @@ -972,7 +976,6 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"browserHtml": True}, { "browserHtml": True, - "httpResponseHeaders": True, "requestHeaders": {"referer": "a"}, }, [], @@ -1090,7 +1093,6 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"browserHtml": True, "requestHeaders": False}, { "browserHtml": True, - "httpResponseHeaders": True, }, [], ), @@ -1159,7 +1161,6 @@ def test_automap_method(method, meta, expected, warnings, caplog): "customHttpRequestHeaders": [ {"name": "Referer", "value": "a"}, ], - "httpResponseHeaders": True, "requestHeaders": {"referer": "a"}, }, [], @@ -1179,7 +1180,6 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"browserHtml": True}, { 
"browserHtml": True, - "httpResponseHeaders": True, }, [], ), @@ -1263,7 +1263,6 @@ def test_automap_method(method, meta, expected, warnings, caplog): { "browserHtml": True, "requestHeaders": {"referer": "a"}, - "httpResponseHeaders": True, }, ["Use Request.headers instead"], ), @@ -1292,7 +1291,6 @@ def test_automap_method(method, meta, expected, warnings, caplog): { "browserHtml": True, "requestHeaders": {"referer": "b"}, - "httpResponseHeaders": True, }, ["Use Request.headers instead"], ), @@ -1321,7 +1319,6 @@ def test_automap_method(method, meta, expected, warnings, caplog): { "browserHtml": True, "requestHeaders": {"referer": "a"}, - "httpResponseHeaders": True, }, ["Use Request.headers instead"], ), @@ -1358,7 +1355,6 @@ def test_automap_method(method, meta, expected, warnings, caplog): "customHttpRequestHeaders": [ {"name": "Referer", "value": "a"}, ], - "httpResponseHeaders": True, }, [], ), @@ -1379,7 +1375,6 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"browserHtml": True}, { "browserHtml": True, - "httpResponseHeaders": True, }, ["cannot be mapped"], ), @@ -1398,7 +1393,6 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"browserHtml": True}, { "browserHtml": True, - "httpResponseHeaders": True, }, ["cannot be mapped"], ), @@ -1424,7 +1418,6 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"browserHtml": True}, { "browserHtml": True, - "httpResponseHeaders": True, }, [], ), @@ -1436,7 +1429,6 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"browserHtml": True}, { "browserHtml": True, - "httpResponseHeaders": True, }, ["cannot be mapped"], ), @@ -1466,7 +1458,6 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"browserHtml": True}, { "browserHtml": True, - "httpResponseHeaders": True, }, [], ), @@ -1475,7 +1466,6 @@ def test_automap_method(method, meta, expected, warnings, caplog): {"browserHtml": True}, { "browserHtml": True, - 
"httpResponseHeaders": True, }, ["cannot be mapped"], ), @@ -1525,7 +1515,6 @@ def test_automap_headers(headers, meta, expected, warnings, caplog): {"browserHtml": True}, { "browserHtml": True, - "httpResponseHeaders": True, "requestHeaders": {"userAgent": ""}, }, [], @@ -1586,7 +1575,6 @@ def test_automap_header_settings( { "browserHtml": True, "httpRequestBody": "YQ==", - "httpResponseHeaders": True, }, [], ), @@ -1618,7 +1606,6 @@ def test_automap_body(body, meta, expected, warnings, caplog): }, { "browserHtml": True, - "httpResponseHeaders": True, }, ["unnecessarily defines"], ), @@ -1662,13 +1649,7 @@ def test_automap_default_parameter_cleanup(meta, expected, warnings, caplog): "default_params,meta,expected,warnings", [ ( - {"screenshot": True, "httpResponseHeaders": True}, {"browserHtml": True}, - {"browserHtml": True, "httpResponseHeaders": True, "screenshot": True}, - [], - ), - ( - {"browserHtml": True, "httpResponseHeaders": False}, {"screenshot": True, "browserHtml": False}, {"screenshot": True}, [],