scrapy-plugins · kmike · May 30, 2022 · Apr 26, 2022 · Apr 27, 2022 · Apr 28, 2022
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -1,6 +1,14 @@
 Changes
 =======
 
+TBD
+---
+
+* Introduce ``ZyteAPIResponse`` and ``ZyteAPITextResponse`` which are subclasses
+  of ``scrapy.http.Response`` and ``scrapy.http.TextResponse`` respectively.
+  These new response classes hold the raw Zyte Data API response in the
+  ``raw_api_response`` attribute.
+
 0.1.0 (2022-02-03)
 ------------------
 

diff --git a/README.rst b/README.rst
@@ -33,8 +33,8 @@ Installation
 
 This package requires Python 3.7+.
 
-How to configure
-----------------
+Configuration
+-------------
 
 Replace the default ``http`` and ``https`` in Scrapy's
 `DOWNLOAD_HANDLERS <https://docs.scrapy.org/en/latest/topics/settings.html#std-setting-DOWNLOAD_HANDLERS>`_
@@ -46,7 +46,7 @@ Lastly, make sure to `install the asyncio-based Twisted reactor
 <https://docs.scrapy.org/en/latest/topics/asyncio.html#installing-the-asyncio-reactor)>`_
 in the ``settings.py`` file as well:
 
-Here's example of the things needed inside a Scrapy project's ``settings.py`` file:
+Here's an example of the things needed inside a Scrapy project's ``settings.py`` file:
 
 .. code-block:: python
 
@@ -60,37 +60,83 @@ Here's example of the things needed inside a Scrapy project's ``settings.py`` fi
 
     TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
 
-How to use
-----------
+Usage
+-----
 
-Set the ``zyte_api`` `Request.meta
-<https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta>`_
-key to download a request using Zyte API. Full list of parameters is provided in the
-`Zyte API Specification <https://docs.zyte.com/zyte-api/openapi.html#zyte-openapi-spec>`_.
+To enable a ``scrapy.Request`` to go through Zyte Data API, the ``zyte_api`` key in
+`Request.meta <https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta>`_
+must be present and have dict-like contents.
 
-.. code-block:: python
+To set the default parameters for Zyte API enabled requests, you can set the
+following in the ``settings.py`` file or `any other settings within Scrapy
+<https://docs.scrapy.org/en/latest/topics/settings.html#populating-the-settings>`_:
 
-   import scrapy
+.. code-block:: python
 
+    ZYTE_API_DEFAULT_PARAMS = {
+        "browserHtml": True,
+        "geolocation": "US",
+    }
 
-   class TestSpider(scrapy.Spider):
-       name = "test"
+You can see the full list of parameters in the `Zyte Data API Specification
+<https://docs.zyte.com/zyte-api/openapi.html#zyte-openapi-spec>`_.
 
-       def start_requests(self):
+Note that ``ZYTE_API_DEFAULT_PARAMS`` only takes effect when the ``zyte_api``
+key in `Request.meta <https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta>`_
+is set. Parameters given in ``zyte_api`` override the ones with the same name in
+the ``ZYTE_API_DEFAULT_PARAMS`` setting.
 
-           yield scrapy.Request(
-               url="http://books.toscrape.com/",
-               callback=self.parse,
-               meta={
-                   "zyte_api": {
-                       "browserHtml": True,
-                       # You can set any GEOLocation region you want.
-                       "geolocation": "US",
-                       "javascript": True,
-                       "echoData": {"something": True},
-                   }
-               },
-           )
+.. code-block:: python
 
-       def parse(self, response):
-           yield {"URL": response.url, "status": response.status, "HTML": response.body}
+    import scrapy
+
+
+    class SampleQuotesSpider(scrapy.Spider):
+        name = "sample_quotes"
+
+        custom_settings = {
+            "ZYTE_API_DEFAULT_PARAMS": {
+                "geolocation": "US",  # You can set any Geolocation region you want.
+            }
+        }
+
+        def start_requests(self):
+            yield scrapy.Request(
+                url="https://quotes.toscrape.com/",
+                callback=self.parse,
+                meta={
+                    "zyte_api": {
+                        "browserHtml": True,
+                        "javascript": True,
+                        "echoData": {"some_value_I_could_track": 123},
+                    }
+                },
+            )
+
+        def parse(self, response):
+            yield {"URL": response.url, "status": response.status, "HTML": response.body}
+
+            print(response.raw_api_response)
+            # {
+            #     'url': 'https://quotes.toscrape.com/',
+            #     'browserHtml': '<html> ... </html>',
+            #     'echoData': {'some_value_I_could_track': 123},
+            # }
+
+            print(response.request.meta)
+            # {
+            #     'zyte_api': {
+            #         'browserHtml': True,
+            #         'geolocation': 'US',
+            #         'javascript': True,
+            #         'echoData': {'some_value_I_could_track': 123}
+            #     },
+            #     'download_timeout': 180.0,
+            #     'download_slot': 'quotes.toscrape.com'
+            # }
+
+The raw Zyte Data API response can be accessed via the ``raw_api_response`` attribute
+of the response object. Note that such responses are of ``ZyteAPIResponse`` and
+``ZyteAPITextResponse`` types, which are respectively subclasses of ``scrapy.http.Response``
+and ``scrapy.http.TextResponse``. Such classes are needed to hold the raw Zyte Data API
+responses.
diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py
@@ -1,21 +1,22 @@
 import json
 import logging
 import os
-from base64 import b64decode
-from typing import Any, Dict, Generator, List, Optional
+from typing import Any, Dict, Generator, Optional, Union
 
 from scrapy import Spider
 from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
 from scrapy.crawler import Crawler
 from scrapy.exceptions import IgnoreRequest, NotConfigured
-from scrapy.http import Request, Response, TextResponse
+from scrapy.http import Request
 from scrapy.settings import Settings
 from scrapy.utils.defer import deferred_from_coro
 from scrapy.utils.reactor import verify_installed_reactor
 from twisted.internet.defer import Deferred, inlineCallbacks
 from zyte_api.aio.client import AsyncClient, create_session
 from zyte_api.aio.errors import RequestError
 
+from .responses import ZyteAPIResponse, ZyteAPITextResponse, _process_response
+
 logger = logging.getLogger(__name__)
 
 
@@ -30,8 +31,8 @@ def __init__(
         )
         self._stats = crawler.stats
         self._job_id = crawler.settings.get("JOB")
+        self._zyte_api_default_params = settings.getdict("ZYTE_API_DEFAULT_PARAMS")
         self._session = create_session()
-        self._encoding = "utf-8"
 
     @classmethod
     def from_crawler(cls, crawler):
@@ -48,19 +49,36 @@ def from_crawler(cls, crawler):
         return cls(crawler.settings, crawler, client)
 
     def download_request(self, request: Request, spider: Spider) -> Deferred:
-        if request.meta.get("zyte_api"):
-            return deferred_from_coro(self._download_request(request, spider))
-        else:
-            return super().download_request(request, spider)
+        api_params = self._prepare_api_params(request)
+        if api_params:  # None or an empty dict falls through to the regular handler
+            return deferred_from_coro(
+                self._download_request(api_params, request, spider)
+            )
+        return super().download_request(request, spider)
+
+    def _prepare_api_params(self, request: Request) -> Optional[dict]:
+        """Return a fresh dict of defaults overridden by ``zyte_api`` meta, or None."""
+        meta_params = request.meta.get("zyte_api")
+        if not meta_params and meta_params != {}:
+            return None
+        if meta_params is True:
+            meta_params = {}
 
-    async def _download_request(self, request: Request, spider: Spider) -> Response:
-        api_params: Dict[str, Any] = request.meta["zyte_api"]
-        if not isinstance(api_params, dict):
+        api_params: Dict[str, Any] = dict(self._zyte_api_default_params or {})
+        try:
+            api_params.update(meta_params)
+        except (TypeError, ValueError):  # ValueError: e.g. a str passed as params
             logger.error(
-                "zyte_api parameters in the request meta should be "
-                f"provided as dictionary, got {type(api_params)} instead ({request.url})."
+                f"zyte_api parameters in the request meta should be "
+                f"provided as dictionary, got {type(request.meta.get('zyte_api'))} "
+                f"instead ({request.url})."
             )
             raise IgnoreRequest()
+        return api_params
+
+    async def _download_request(
+        self, api_params: dict, request: Request, spider: Spider
+    ) -> Optional[Union[ZyteAPITextResponse, ZyteAPIResponse]]:
         # Define url by default
         api_data = {**{"url": request.url}, **api_params}
         if self._job_id is not None:
@@ -80,31 +98,9 @@ async def _download_request(self, request: Request, spider: Spider) -> Response:
                 f"Got an error when processing Zyte API request ({request.url}): {er}"
             )
             raise IgnoreRequest()
+
         self._stats.inc_value("scrapy-zyte-api/request_count")
-        headers = self._prepare_headers(api_response.get("httpResponseHeaders"))
-        # browserHtml and httpResponseBody are not allowed at the same time,
-        # but at least one of them should be present
-        if api_response.get("browserHtml"):
-            # Using TextResponse because browserHtml always returns a browser-rendered page
-            # even when requesting files (like images)
-            return TextResponse(
-                url=api_response["url"],
-                status=200,
-                body=api_response["browserHtml"].encode(self._encoding),
-                encoding=self._encoding,
-                request=request,
-                flags=["zyte-api"],
-                headers=headers,
-            )
-        else:
-            return Response(
-                url=api_response["url"],
-                status=200,
-                body=b64decode(api_response["httpResponseBody"]),
-                request=request,
-                flags=["zyte-api"],
-                headers=headers,
-            )
+        return _process_response(api_response, request)
 
     @inlineCallbacks
     def close(self) -> Generator:
@@ -129,9 +125,3 @@ def _get_request_error_message(error: RequestError) -> str:
         if error_data.get("detail"):
             return error_data["detail"]
         return base_message
-
-    @staticmethod
-    def _prepare_headers(init_headers: Optional[List[Dict[str, str]]]):
-        if not init_headers:
-            return None
-        return {h["name"]: h["value"] for h in init_headers}
diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py
@@ -0,0 +1,128 @@
+from base64 import b64decode
+from typing import Dict, List, Optional, Tuple, Union
+
+from scrapy import Request
+from scrapy.http import Headers, Response, TextResponse
+from scrapy.responsetypes import responsetypes
+
+_DEFAULT_ENCODING = "utf-8"
+
+
+class ZyteAPIMixin:
+    """Mixin that stores the raw Zyte API response on a Scrapy response object."""
+    REMOVE_HEADERS = {
+        # Zyte API already decompresses the HTTP Response Body. Scrapy's
+        # HttpCompressionMiddleware will error out when it attempts to
+        # decompress an already decompressed body based on this header.
+        "content-encoding"
+    }
+
+    def __init__(self, *args, raw_api_response: Dict = None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._raw_api_response = raw_api_response
+
+    def replace(self, *args, **kwargs):
+        if kwargs.get("raw_api_response"):  # only a truthy replacement is rejected
+            raise ValueError("Replacing the value of 'raw_api_response' isn't allowed.")
+        return super().replace(*args, **kwargs)
+
+    @property
+    def raw_api_response(self) -> Optional[Dict]:
+        """Contains the raw API response from Zyte API.
+
+        To see the full list of parameters and their description, kindly refer to the
+        `Zyte API Specification <https://docs.zyte.com/zyte-api/openapi.html#zyte-openapi-spec>`_.
+        """
+        return self._raw_api_response
+
+    @classmethod
+    def _prepare_headers(cls, init_headers: Optional[List[Dict[str, str]]]):
+        if not init_headers:
+            return None
+        return {  # name -> value; names are lowercased only for the REMOVE_HEADERS check
+            h["name"]: h["value"]
+            for h in init_headers
+            if h["name"].lower() not in cls.REMOVE_HEADERS
+        }
+
+
+class ZyteAPITextResponse(ZyteAPIMixin, TextResponse):
+    """``TextResponse`` subclass that also carries the raw Zyte API response."""
+    attributes: Tuple[str, ...] = TextResponse.attributes + ("raw_api_response",)
+
+    @classmethod
+    def from_api_response(cls, api_response: Dict, *, request: Request = None):
+        """Alternative constructor to instantiate the response from the raw
+        Zyte API response (``browserHtml`` if present, else ``httpResponseBody``).
+        """
+        body = None
+        encoding = None
+
+        if api_response.get("browserHtml"):
+            encoding = _DEFAULT_ENCODING  # Zyte API has "utf-8" by default
+            body = api_response["browserHtml"].encode(encoding)
+        elif api_response.get("httpResponseBody"):
+            body = b64decode(api_response["httpResponseBody"])
+
+        return cls(
+            url=api_response["url"],
+            status=200,
+            body=body,
+            encoding=encoding,
+            request=request,
+            flags=["zyte-api"],
+            headers=cls._prepare_headers(api_response.get("httpResponseHeaders")),
+            raw_api_response=api_response,
+        )
+
+
+class ZyteAPIResponse(ZyteAPIMixin, Response):
+    """``Response`` subclass that also carries the raw Zyte API response."""
+    attributes: Tuple[str, ...] = Response.attributes + ("raw_api_response",)
+
+    @classmethod
+    def from_api_response(cls, api_response: Dict, *, request: Request = None):
+        """Alternative constructor to instantiate the response from the raw
+        Zyte API response; a missing ``httpResponseBody`` yields an empty body.
+        """
+        return cls(
+            url=api_response["url"],
+            status=200,
+            body=b64decode(api_response.get("httpResponseBody") or ""),
+            request=request,
+            flags=["zyte-api"],
+            headers=cls._prepare_headers(api_response.get("httpResponseHeaders")),
+            raw_api_response=api_response,
+        )
+
+
+def _process_response(
+    api_response: Dict[str, Union[List[Dict], str]], request: Request
+) -> Optional[Union[ZyteAPITextResponse, ZyteAPIResponse]]:
+    """Given a Zyte API Response and the ``scrapy.Request`` that asked for it,
+    return a ``ZyteAPITextResponse`` when ``browserHtml`` is present or the HTTP
+    response is detected to be text, and a ``ZyteAPIResponse`` otherwise.
+    """
+    # NOTE: Currently, Zyte API does NOT allow both 'browserHtml' and
+    # 'httpResponseBody' to be present at the same time. The support for both
+    # will be addressed in the future. Reference:
+    # - https://github.com/scrapy-plugins/scrapy-zyte-api/pull/10#issuecomment-1131406460
+    # For now, at least one of them should be present.
+    if api_response.get("browserHtml"):
+        # Using TextResponse because browserHtml always returns a browser-rendered page
+        # even when requesting files (like images)
+        return ZyteAPITextResponse.from_api_response(api_response, request=request)
+
+    raw_headers = api_response.get("httpResponseHeaders")
+    if raw_headers and api_response.get("httpResponseBody"):
+        response_cls = responsetypes.from_args(
+            # from_args() needs a Headers mapping; the raw name/value list is ignored.
+            headers=Headers(ZyteAPIMixin._prepare_headers(raw_headers)),  # type: ignore
+            url=api_response["url"],
+            # FIXME: update this when python-zyte-api supports base64 decoding
+            body=b64decode(api_response["httpResponseBody"]),  # type: ignore
+        )
+        if issubclass(response_cls, TextResponse):
+            return ZyteAPITextResponse.from_api_response(api_response, request=request)
+
+    return ZyteAPIResponse.from_api_response(api_response, request=request)