Skip to content

Commit

Permalink
Define standard parameters through individual mixins (#46)
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio authored Apr 5, 2024
1 parent 9ddb62a commit 281f1f5
Show file tree
Hide file tree
Showing 7 changed files with 235 additions and 163 deletions.
18 changes: 18 additions & 0 deletions docs/customization/spiders.rst
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,24 @@ URL:
class MySpider(EcommerceSpider, Args[MyParams]):
name = "my_spider"
A mixin class exists for every spider parameter (see :ref:`parameter-mixins`),
so you can use any combination of them in any order you like in your custom
classes, while enjoying future improvements to validation, documentation or
UI integration for Scrapy Cloud:

.. code-block:: python
from scrapy_spider_metadata import Args
from zyte_spider_templates.params import GeolocationParam, UrlParam
class MyParams(GeolocationParam, UrlParam):
pass
class MySpider(Args[MyParams]):
name = "my_spider"
.. _custom-crawl:

Expand Down
49 changes: 29 additions & 20 deletions docs/reference/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,33 +2,42 @@
Reference
=========

Base classes
============
Spiders
=======

.. autopydantic_model:: zyte_spider_templates.spiders.base.BaseSpiderParams
:inherited-members: BaseModel
.. autoclass:: zyte_spider_templates.BaseSpider

.. autoclass:: zyte_spider_templates.spiders.base.BaseSpider
.. autoclass:: zyte_spider_templates.EcommerceSpider

.. autoenum:: zyte_spider_templates.spiders.base.ExtractFrom
:noindex:

.. autoenum:: zyte_spider_templates.spiders.base.Geolocation
:noindex:
Pages
=====

E-commerce
==========
.. autoclass:: zyte_spider_templates.pages.HeuristicsProductNavigationPage

.. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams
:noindex:
:inherited-members: BaseModel

.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
:noindex:
.. _parameter-mixins:

.. autoclass:: zyte_spider_templates.spiders.ecommerce.EcommerceSpider
Parameter mixins
================

Pages
=====
.. autopydantic_model:: zyte_spider_templates.params.ExtractFromParam
:exclude-members: model_computed_fields

.. autoclass:: zyte_spider_templates.pages.HeuristicsProductNavigationPage
.. autoenum:: zyte_spider_templates.params.ExtractFrom

.. autopydantic_model:: zyte_spider_templates.params.GeolocationParam
:exclude-members: model_computed_fields

.. autoenum:: zyte_spider_templates.params.Geolocation

.. autopydantic_model:: zyte_spider_templates.params.MaxRequestsParam
:exclude-members: model_computed_fields

.. autopydantic_model:: zyte_spider_templates.params.UrlParam
:exclude-members: model_computed_fields

.. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategyParam
:exclude-members: model_computed_fields

.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
7 changes: 1 addition & 6 deletions docs/templates/e-commerce.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,4 @@ Parameters

.. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams
:inherited-members: BaseModel

.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy

.. autoenum:: zyte_spider_templates.spiders.base.ExtractFrom

.. autoenum:: zyte_spider_templates.spiders.base.Geolocation
:exclude-members: model_computed_fields
145 changes: 83 additions & 62 deletions tests/test_ecommerce.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import logging
import re
from unittest.mock import MagicMock, call
Expand Down Expand Up @@ -348,111 +349,131 @@ def test_arguments():
assert spider.allowed_domains == ["example.com"]


def assertEqualJson(actual, expected):
    """Assert that two Python objects serialize to identical JSON text.

    Comparing the serialized text instead of the objects themselves makes the
    check sensitive to the order of key-value pairs in dictionaries, which a
    direct dictionary comparison would ignore.

    Diffing indented JSON text also produces more readable pytest output when
    enums are involved, e.g. geolocation values.
    """
    # Serialize "actual" first, then "expected"; the names are kept separate
    # so that a failing assertion reports both sides by name.
    actual_json, expected_json = (
        json.dumps(obj, indent=2) for obj in (actual, expected)
    )
    assert actual_json == expected_json


def test_metadata():
metadata = get_spider_metadata(EcommerceSpider, normalize=True)
assert metadata == {
actual_metadata = get_spider_metadata(EcommerceSpider, normalize=True)
expected_metadata = {
"template": True,
"title": "E-commerce",
"description": "Template for spiders that extract product data from e-commerce websites.",
"param_schema": {
"properties": {
"crawl_strategy": {
"default": "full",
"title": "Crawl strategy",
"description": "Determines how the start URL and follow-up URLs are crawled.",
"type": "string",
"enum": ["full", "navigation", "pagination_only"],
"enumMeta": {
"full": {
"description": "Follow most links within the domain of URL in an attempt to discover and extract as many products as possible.",
"title": "Full",
},
"navigation": {
"description": (
"Follow pagination, subcategories, and "
"product detail pages. Pagination Only is a "
"better choice if the target URL does not "
"have subcategories, or if Zyte API is "
"misidentifying some URLs as subcategories."
),
"title": "Navigation",
},
"pagination_only": {
"description": (
"Follow pagination and product detail pages. Subcategory links are ignored."
),
"title": "Pagination Only",
},
},
},
"extract_from": {
"anyOf": [{"type": "string"}, {"type": "null"}],
"default": None,
"title": "Extraction source",
"url": {
"description": (
"Whether to perform extraction using a browser request "
"(browserHtml) or an HTTP request (httpResponseBody)."
"Initial URL for the crawl. Enter the full URL including http(s), "
"you can copy and paste it from your browser. Example: https://toscrape.com/"
),
"enum": ["httpResponseBody", "browserHtml"],
"enumMeta": {
"httpResponseBody": {
"title": "httpResponseBody",
"description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.",
},
"browserHtml": {
"title": "browserHtml",
"description": "Use browser rendering. Often provides the best quality.",
},
},
"pattern": r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$",
"title": "URL",
"type": "string",
},
"geolocation": {
"anyOf": [
{"type": "string"},
{"type": "null"},
],
"default": None,
"title": "Geolocation",
"description": "ISO 3166-1 alpha-2 2-character string specified in "
"https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/request/geolocation.",
"enum": list(
sorted(GEOLOCATION_OPTIONS, key=GEOLOCATION_OPTIONS.__getitem__)
"description": (
"ISO 3166-1 alpha-2 2-character string specified in "
"https://docs.zyte.com/zyte-api/usage/reference.html"
"#operation/extract/request/geolocation."
),
"enumMeta": {
code: {
"title": GEOLOCATION_OPTIONS_WITH_CODE[code],
}
for code in Geolocation
for code in sorted(Geolocation)
},
"title": "Geolocation",
"enum": list(
sorted(GEOLOCATION_OPTIONS, key=GEOLOCATION_OPTIONS.__getitem__)
),
},
"max_requests": {
"anyOf": [{"type": "integer"}, {"type": "null"}],
"default": 100,
"title": "Max Requests",
"description": (
"The maximum number of Zyte API requests allowed for the crawl.\n"
"\n"
"Requests with error responses that cannot be retried or exceed "
"their retry limit also count here, but they incur in no costs "
"and do not increase the request count in Scrapy Cloud."
),
"title": "Max Requests",
"widget": "request-limit",
},
"url": {
"type": "string",
"title": "URL",
"extract_from": {
"anyOf": [{"type": "string"}, {"type": "null"}],
"default": None,
"description": (
"Initial URL for the crawl. Enter the full URL including http(s), "
"you can copy and paste it from your browser. Example: https://toscrape.com/"
"Whether to perform extraction using a browser request "
"(browserHtml) or an HTTP request (httpResponseBody)."
),
"pattern": r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$",
"enumMeta": {
"browserHtml": {
"description": "Use browser rendering. Often provides the best quality.",
"title": "browserHtml",
},
"httpResponseBody": {
"description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.",
"title": "httpResponseBody",
},
},
"title": "Extraction source",
"enum": ["httpResponseBody", "browserHtml"],
},
"crawl_strategy": {
"default": "full",
"description": "Determines how the start URL and follow-up URLs are crawled.",
"enumMeta": {
"full": {
"description": "Follow most links within the domain of URL in an attempt to discover and extract as many products as possible.",
"title": "Full",
},
"navigation": {
"description": (
"Follow pagination, subcategories, and "
"product detail pages. Pagination Only is a "
"better choice if the target URL does not "
"have subcategories, or if Zyte API is "
"misidentifying some URLs as subcategories."
),
"title": "Navigation",
},
"pagination_only": {
"description": (
"Follow pagination and product detail pages. Subcategory links are ignored."
),
"title": "Pagination Only",
},
},
"title": "Crawl strategy",
"enum": ["full", "navigation", "pagination_only"],
"type": "string",
},
},
"required": ["url"],
"title": "EcommerceSpiderParams",
"type": "object",
},
}
geolocation = metadata["param_schema"]["properties"]["geolocation"]
assertEqualJson(actual_metadata, expected_metadata)

geolocation = actual_metadata["param_schema"]["properties"]["geolocation"]
assert geolocation["enum"][0] == "AF"
assert geolocation["enumMeta"]["UY"] == {"title": "Uruguay (UY)"}
assert set(geolocation["enum"]) == set(geolocation["enumMeta"])
Expand Down
85 changes: 85 additions & 0 deletions zyte_spider_templates/params.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from enum import Enum
from typing import Optional

from pydantic import BaseModel, Field

from zyte_spider_templates._geolocations import (
GEOLOCATION_OPTIONS_WITH_CODE,
Geolocation,
)
from zyte_spider_templates.documentation import document_enum


# String-valued enum of the extraction sources a spider can be asked to use.
# Each member's value is identical to its name, and each member is followed by
# a string literal describing it.
# NOTE(review): document_enum is imported from
# zyte_spider_templates.documentation; presumably it exposes the member
# strings below as documentation -- confirm there before editing them.
@document_enum
class ExtractFrom(str, Enum):
    httpResponseBody: str = "httpResponseBody"
    """Use HTTP responses. Cost-efficient and fast extraction method, which
    works well on many websites."""

    browserHtml: str = "browserHtml"
    """Use browser rendering. Often provides the best quality."""


# Parameter mixin that declares the optional ``extract_from`` spider
# parameter. The field accepts an ExtractFrom member or None and defaults to
# None; how None is interpreted is decided by the code consuming the
# parameter, which is not part of this class.
class ExtractFromParam(BaseModel):
    extract_from: Optional[ExtractFrom] = Field(
        title="Extraction source",
        description=(
            "Whether to perform extraction using a browser request "
            "(browserHtml) or an HTTP request (httpResponseBody)."
        ),
        default=None,
        json_schema_extra={
            # Extra schema metadata: a title and description for each
            # option, keyed by enum member.
            # NOTE(review): the descriptions repeat the text of the
            # ExtractFrom member strings; keep both in sync when editing.
            "enumMeta": {
                ExtractFrom.browserHtml: {
                    "title": "browserHtml",
                    "description": "Use browser rendering. Often provides the best quality.",
                },
                ExtractFrom.httpResponseBody: {
                    "title": "httpResponseBody",
                    "description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.",
                },
            },
        },
    )


# Parameter mixin that declares the optional ``geolocation`` spider
# parameter. The field accepts a Geolocation member or None and defaults to
# None.
class GeolocationParam(BaseModel):
    geolocation: Optional[Geolocation] = Field(
        title="Geolocation",
        description="ISO 3166-1 alpha-2 2-character string specified in "
        "https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/request/geolocation.",
        default=None,
        json_schema_extra={
            # Extra schema metadata: one entry per Geolocation member, in
            # the enum's iteration order, whose title is looked up in
            # GEOLOCATION_OPTIONS_WITH_CODE.
            "enumMeta": {
                code: {
                    "title": GEOLOCATION_OPTIONS_WITH_CODE[code],
                }
                for code in Geolocation
            }
        },
    )


# Parameter mixin that declares the optional ``max_requests`` spider
# parameter: an integer or None, defaulting to 100. No explicit title is
# passed to Field here.
class MaxRequestsParam(BaseModel):
    max_requests: Optional[int] = Field(
        description=(
            "The maximum number of Zyte API requests allowed for the crawl.\n"
            "\n"
            "Requests with error responses that cannot be retried or exceed "
            "their retry limit also count here, but they incur in no costs "
            "and do not increase the request count in Scrapy Cloud."
        ),
        default=100,
        json_schema_extra={
            # Extra schema metadata.
            # NOTE(review): presumably a hint telling a UI which input
            # widget to render for this field -- confirm against the
            # consumer of the schema.
            "widget": "request-limit",
        },
    )


# Parameter mixin that declares the ``url`` spider parameter. The field is a
# string with no default, so a value must always be provided.
class UrlParam(BaseModel):
    url: str = Field(
        title="URL",
        description="Initial URL for the crawl. Enter the full URL including http(s), "
        "you can copy and paste it from your browser. Example: https://toscrape.com/",
        # Pattern, piece by piece:
        #   ^https?://      "http://" or "https://" at the start
        #   [^:/\s]+        one or more characters other than ":", "/" or
        #                   whitespace (the host part)
        #   (:\d{1,5})?     optional ":" followed by 1 to 5 digits (a port)
        #   (/[^\s]*)*      zero or more "/"-led runs without whitespace
        #   (#[^\s]*)?$     optional "#" fragment without whitespace, then end
        pattern=r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$",
    )
Loading

0 comments on commit 281f1f5

Please sign in to comment.