Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

introduce new settings: ZYTE_API_DEFAULT_PARAMS #13

Merged
merged 6 commits into from
May 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 19 additions & 5 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ Lastly, make sure to `install the asyncio-based Twisted reactor
<https://docs.scrapy.org/en/latest/topics/asyncio.html#installing-the-asyncio-reactor>`_
in the ``settings.py`` file as well:

Here's example of the things needed inside a Scrapy project's ``settings.py`` file:
Here's an example of the things needed inside a Scrapy project's ``settings.py`` file:

.. code-block:: python

Expand All @@ -63,10 +63,24 @@ Here's example of the things needed inside a Scrapy project's ``settings.py`` fi
Usage
-----

Set the ``zyte_api`` `Request.meta
<https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta>`_
key to download a request using Zyte API. Full list of parameters is provided in the
`Zyte API Specification <https://docs.zyte.com/zyte-api/openapi.html#zyte-openapi-spec>`_.
To enable every request to be sent through Zyte API, you can set the following
in the ``settings.py`` file or `any other settings within Scrapy
<https://docs.scrapy.org/en/latest/topics/settings.html#populating-the-settings>`_:

.. code-block:: python

ZYTE_API_DEFAULT_PARAMS = {
"browserHtml": True,
"geolocation": "US",
}

You can see the full list of parameters in the `Zyte API Specification
<https://docs.zyte.com/zyte-api/openapi.html#zyte-openapi-spec>`_.

On the other hand, you could also control it on a per-request basis by setting the
``zyte_api`` key in `Request.meta <https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta>`_.
When doing so, it will override any parameters that were set in the
``ZYTE_API_DEFAULT_PARAMS`` setting.
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved

.. code-block:: python

Expand Down
12 changes: 8 additions & 4 deletions scrapy_zyte_api/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def __init__(
)
self._stats = crawler.stats
self._job_id = crawler.settings.get("JOB")
self._zyte_api_default_params = settings.getdict("ZYTE_API_DEFAULT_PARAMS")
self._session = create_session()

@classmethod
Expand All @@ -56,11 +57,14 @@ def download_request(self, request: Request, spider: Spider) -> Deferred:
async def _download_request(
self, request: Request, spider: Spider
) -> Union[ZyteAPITextResponse, ZyteAPIResponse]:
api_params: Dict[str, Any] = request.meta["zyte_api"]
if not isinstance(api_params, dict):
api_params: Dict[str, Any] = self._zyte_api_default_params or {}
try:
api_params.update(request.meta.get("zyte_api") or {})
except TypeError:
logger.error(
"zyte_api parameters in the request meta should be "
f"provided as dictionary, got {type(api_params)} instead ({request.url})."
f"zyte_api parameters in the request meta should be "
f"provided as dictionary, got {type(request.meta.get('zyte_api'))} "
f"instead ({request.url})."
)
raise IgnoreRequest()
# Define url by default
Expand Down
143 changes: 89 additions & 54 deletions tests/test_api_requests.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os
import sys
from asyncio import iscoroutine
from typing import Any, Dict
from unittest import mock

import pytest
from _pytest.logging import LogCaptureFixture # NOQA
Expand All @@ -23,6 +25,21 @@


class TestAPI:
@staticmethod
async def produce_request_response(meta, custom_settings=None):
    """Issue one POST request through a freshly built handler.

    A ``Request`` for ``http://example.com`` carrying ``meta`` is passed
    straight to ``handler._download_request``; the handler is created by
    ``make_handler`` from ``custom_settings`` and the mock server's root URL.

    Returns the ``(request, response)`` pair so callers can assert on both.
    """
    with MockServer() as mock_server:
        base_url = mock_server.urljoin("/")
        async with make_handler(custom_settings, base_url) as handler:
            request = Request("http://example.com", method="POST", meta=meta)
            pending = handler._download_request(request, None)
            # The private entry point must yield a bare coroutine, not a Deferred.
            assert iscoroutine(pending)
            assert not isinstance(pending, Deferred)
            response = await pending  # type: ignore
            return request, response

@pytest.mark.parametrize(
"meta",
[
Expand All @@ -34,25 +51,14 @@ class TestAPI:
)
@pytest.mark.asyncio
async def test_browser_html_request(self, meta: Dict[str, Dict[str, Any]]):
with MockServer() as server:
async with make_handler({}, server.urljoin("/")) as handler:
req = Request(
"http://example.com",
method="POST",
meta=meta,
)
coro = handler._download_request(req, Spider("test"))
assert iscoroutine(coro)
assert not isinstance(coro, Deferred)
resp = await coro # type: ignore

assert isinstance(resp, TextResponse)
assert resp.request is req
assert resp.url == req.url
assert resp.status == 200
assert "zyte-api" in resp.flags
assert resp.body == b"<html></html>"
assert resp.text == "<html></html>"
req, resp = await self.produce_request_response(meta)
assert isinstance(resp, TextResponse)
assert resp.request is req
assert resp.url == req.url
assert resp.status == 200
assert "zyte-api" in resp.flags
assert resp.body == b"<html></html>"
assert resp.text == "<html></html>"

@pytest.mark.parametrize(
"meta",
Expand All @@ -71,24 +77,13 @@ async def test_browser_html_request(self, meta: Dict[str, Dict[str, Any]]):
)
@pytest.mark.asyncio
async def test_http_response_body_request(self, meta: Dict[str, Dict[str, Any]]):
with MockServer() as server:
async with make_handler({}, server.urljoin("/")) as handler:
req = Request(
"http://example.com",
method="POST",
meta=meta,
)
coro = handler._download_request(req, Spider("test"))
assert iscoroutine(coro)
assert not isinstance(coro, Deferred)
resp = await coro # type: ignore

assert isinstance(resp, Response)
assert resp.request is req
assert resp.url == req.url
assert resp.status == 200
assert "zyte-api" in resp.flags
assert resp.body == b"<html></html>"
req, resp = await self.produce_request_response(meta)
assert isinstance(resp, Response)
assert resp.request is req
assert resp.url == req.url
assert resp.status == 200
assert "zyte-api" in resp.flags
assert resp.body == b"<html></html>"

@pytest.mark.parametrize(
"meta",
Expand All @@ -99,24 +94,64 @@ async def test_http_response_body_request(self, meta: Dict[str, Dict[str, Any]])
)
@pytest.mark.asyncio
async def test_http_response_headers_request(self, meta: Dict[str, Dict[str, Any]]):
with MockServer() as server:
async with make_handler({}, server.urljoin("/")) as handler:
req = Request(
"http://example.com",
method="POST",
meta=meta,
)
coro = handler._download_request(req, Spider("test"))
assert iscoroutine(coro)
assert not isinstance(coro, Deferred)
resp = await coro # type: ignore
req, resp = await self.produce_request_response(meta)
assert resp.request is req
assert resp.url == req.url
assert resp.status == 200
assert "zyte-api" in resp.flags
assert resp.body == b"<html></html>"
assert resp.headers == {b"Test_Header": [b"test_value"]}

assert resp.request is req
assert resp.url == req.url
assert resp.status == 200
assert "zyte-api" in resp.flags
assert resp.body == b"<html></html>"
assert resp.headers == {b"Test_Header": [b"test_value"]}
@pytest.mark.skipif(
    sys.version_info < (3, 8), reason="Python3.7 has poor support for AsyncMocks"
)
@pytest.mark.parametrize(
    "meta,custom_settings,expected",
    [
        # Nothing in meta, nothing in settings.
        ({}, {}, {}),
        # Empty per-request params, nothing in settings.
        ({"zyte_api": {}}, {}, {}),
        # Defaults only.
        (
            {},
            {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}},
            {"browserHtml": True, "geolocation": "CA"},
        ),
        # Empty per-request params leave the defaults as they are.
        (
            {"zyte_api": {}},
            {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}},
            {"browserHtml": True, "geolocation": "CA"},
        ),
        # Per-request params are merged in and win on conflicting keys.
        (
            {"zyte_api": {"javascript": True, "geolocation": "US"}},
            {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}},
            {"browserHtml": True, "geolocation": "US", "javascript": True},
        ),
    ],
)
@mock.patch("tests.AsyncClient")
@pytest.mark.asyncio
async def test_empty_zyte_api_request_meta(
    self,
    mock_client,
    meta: Dict[str, Dict[str, Any]],
    custom_settings: Dict[str, str],
    expected: Dict[str, str],
):
    """Check which parameters reach the client's ``request_raw()`` call.

    Each case combines request ``meta`` with ``ZYTE_API_DEFAULT_PARAMS`` and
    compares the first positional argument of the recorded ``request_raw()``
    call (minus its ``url`` entry) against ``expected``.
    """
    try:
        # The patched client cannot produce a real API response, so the
        # download is expected to blow up; only the recorded call matters.
        await self.produce_request_response(meta, custom_settings=custom_settings)
    except Exception:
        pass

    # Pick the first recorded call that targets request_raw().
    raw_call = next(
        (c for c in mock_client.mock_calls if "request_raw(" in str(c)),
        None,
    )
    if raw_call is None:
        pytest.fail("The client's request_raw() method was not called.")

    sent_params = raw_call.args[0]
    sent_params.pop("url")

    assert sent_params == expected

@pytest.mark.parametrize(
"meta, api_relevant",
Expand Down