Define standard parameters through individual mixins (#46)

zytedata · Apr 5, 2024 · 281f1f5 · 281f1f5
1 parent 9ddb62a
commit 281f1f5
Show file tree

Hide file tree

Showing 7 changed files with 235 additions and 163 deletions.
diff --git a/docs/customization/spiders.rst b/docs/customization/spiders.rst
@@ -90,6 +90,24 @@ URL:
     class MySpider(EcommerceSpider, Args[MyParams]):
         name = "my_spider"
 
+A mixin class exists for every spider parameter (see :ref:`parameter-mixins`),
+so you can use any combination of them, in any order, in your custom
+parameter classes, and still benefit from future improvements to validation,
+documentation or UI integration for Scrapy Cloud:
+
+.. code-block:: python
+
+    from scrapy_spider_metadata import Args
+    from zyte_spider_templates.params import GeolocationParam, UrlParam
+
+
+    class MyParams(GeolocationParam, UrlParam):
+        pass
+
+
+    class MySpider(Args[MyParams]):
+        name = "my_spider"
+
 
 .. _custom-crawl:
 

diff --git a/docs/reference/index.rst b/docs/reference/index.rst
@@ -2,33 +2,42 @@
 Reference
 =========
 
-Base classes
-============
+Spiders
+=======
 
-.. autopydantic_model:: zyte_spider_templates.spiders.base.BaseSpiderParams
-    :inherited-members: BaseModel
+.. autoclass:: zyte_spider_templates.BaseSpider
 
-.. autoclass:: zyte_spider_templates.spiders.base.BaseSpider
+.. autoclass:: zyte_spider_templates.EcommerceSpider
 
-.. autoenum:: zyte_spider_templates.spiders.base.ExtractFrom
-    :noindex:
 
-.. autoenum:: zyte_spider_templates.spiders.base.Geolocation
-    :noindex:
+Pages
+=====
 
-E-commerce
-==========
+.. autoclass:: zyte_spider_templates.pages.HeuristicsProductNavigationPage
 
-.. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams
-    :noindex:
-    :inherited-members: BaseModel
 
-.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
-    :noindex:
+.. _parameter-mixins:
 
-.. autoclass:: zyte_spider_templates.spiders.ecommerce.EcommerceSpider
+Parameter mixins
+================
 
-Pages
-=====
+.. autopydantic_model:: zyte_spider_templates.params.ExtractFromParam
+    :exclude-members: model_computed_fields
 
-.. autoclass:: zyte_spider_templates.pages.HeuristicsProductNavigationPage
+.. autoenum:: zyte_spider_templates.params.ExtractFrom
+
+.. autopydantic_model:: zyte_spider_templates.params.GeolocationParam
+    :exclude-members: model_computed_fields
+
+.. autoenum:: zyte_spider_templates.params.Geolocation
+
+.. autopydantic_model:: zyte_spider_templates.params.MaxRequestsParam
+    :exclude-members: model_computed_fields
+
+.. autopydantic_model:: zyte_spider_templates.params.UrlParam
+    :exclude-members: model_computed_fields
+
+.. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategyParam
+    :exclude-members: model_computed_fields
+
+.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
diff --git a/docs/templates/e-commerce.rst b/docs/templates/e-commerce.rst
@@ -16,9 +16,4 @@ Parameters
 
 .. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams
     :inherited-members: BaseModel
-
-.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
-
-.. autoenum:: zyte_spider_templates.spiders.base.ExtractFrom
-
-.. autoenum:: zyte_spider_templates.spiders.base.Geolocation
+    :exclude-members: model_computed_fields
diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py
@@ -1,3 +1,4 @@
+import json
 import logging
 import re
 from unittest.mock import MagicMock, call
@@ -348,111 +349,131 @@ def test_arguments():
         assert spider.allowed_domains == ["example.com"]
 
 
+def assertEqualJson(actual, expected):
+    """Compare the JSON representation of 2 Python objects.
+
+    This makes the comparison sensitive to things like the order of key-value
+    pairs in dictionaries, which is ignored when comparing dictionaries
+    directly.
+
+    It also generates a better diff in pytest output when enums are involved,
+    e.g. geolocation values.
+    """
+    actual_json = json.dumps(actual, indent=2)
+    expected_json = json.dumps(expected, indent=2)
+    assert actual_json == expected_json
+
+
 def test_metadata():
-    metadata = get_spider_metadata(EcommerceSpider, normalize=True)
-    assert metadata == {
+    actual_metadata = get_spider_metadata(EcommerceSpider, normalize=True)
+    expected_metadata = {
         "template": True,
         "title": "E-commerce",
         "description": "Template for spiders that extract product data from e-commerce websites.",
         "param_schema": {
             "properties": {
-                "crawl_strategy": {
-                    "default": "full",
-                    "title": "Crawl strategy",
-                    "description": "Determines how the start URL and follow-up URLs are crawled.",
-                    "type": "string",
-                    "enum": ["full", "navigation", "pagination_only"],
-                    "enumMeta": {
-                        "full": {
-                            "description": "Follow most links within the domain of URL in an attempt to discover and extract as many products as possible.",
-                            "title": "Full",
-                        },
-                        "navigation": {
-                            "description": (
-                                "Follow pagination, subcategories, and "
-                                "product detail pages. Pagination Only is a "
-                                "better choice if the target URL does not "
-                                "have subcategories, or if Zyte API is "
-                                "misidentifying some URLs as subcategories."
-                            ),
-                            "title": "Navigation",
-                        },
-                        "pagination_only": {
-                            "description": (
-                                "Follow pagination and product detail pages. Subcategory links are ignored."
-                            ),
-                            "title": "Pagination Only",
-                        },
-                    },
-                },
-                "extract_from": {
-                    "anyOf": [{"type": "string"}, {"type": "null"}],
-                    "default": None,
-                    "title": "Extraction source",
+                "url": {
                     "description": (
-                        "Whether to perform extraction using a browser request "
-                        "(browserHtml) or an HTTP request (httpResponseBody)."
+                        "Initial URL for the crawl. Enter the full URL including http(s), "
+                        "you can copy and paste it from your browser. Example: https://toscrape.com/"
                     ),
-                    "enum": ["httpResponseBody", "browserHtml"],
-                    "enumMeta": {
-                        "httpResponseBody": {
-                            "title": "httpResponseBody",
-                            "description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.",
-                        },
-                        "browserHtml": {
-                            "title": "browserHtml",
-                            "description": "Use browser rendering. Often provides the best quality.",
-                        },
-                    },
+                    "pattern": r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$",
+                    "title": "URL",
+                    "type": "string",
                 },
                 "geolocation": {
                     "anyOf": [
                         {"type": "string"},
                         {"type": "null"},
                     ],
                     "default": None,
-                    "title": "Geolocation",
-                    "description": "ISO 3166-1 alpha-2 2-character string specified in "
-                    "https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/request/geolocation.",
-                    "enum": list(
-                        sorted(GEOLOCATION_OPTIONS, key=GEOLOCATION_OPTIONS.__getitem__)
+                    "description": (
+                        "ISO 3166-1 alpha-2 2-character string specified in "
+                        "https://docs.zyte.com/zyte-api/usage/reference.html"
+                        "#operation/extract/request/geolocation."
                     ),
                     "enumMeta": {
                         code: {
                             "title": GEOLOCATION_OPTIONS_WITH_CODE[code],
                         }
-                        for code in Geolocation
+                        for code in sorted(Geolocation)
                     },
+                    "title": "Geolocation",
+                    "enum": list(
+                        sorted(GEOLOCATION_OPTIONS, key=GEOLOCATION_OPTIONS.__getitem__)
+                    ),
                 },
                 "max_requests": {
                     "anyOf": [{"type": "integer"}, {"type": "null"}],
                     "default": 100,
-                    "title": "Max Requests",
                     "description": (
                         "The maximum number of Zyte API requests allowed for the crawl.\n"
                         "\n"
                         "Requests with error responses that cannot be retried or exceed "
                         "their retry limit also count here, but they incur in no costs "
                         "and do not increase the request count in Scrapy Cloud."
                     ),
+                    "title": "Max Requests",
                     "widget": "request-limit",
                 },
-                "url": {
-                    "type": "string",
-                    "title": "URL",
+                "extract_from": {
+                    "anyOf": [{"type": "string"}, {"type": "null"}],
+                    "default": None,
                     "description": (
-                        "Initial URL for the crawl. Enter the full URL including http(s), "
-                        "you can copy and paste it from your browser. Example: https://toscrape.com/"
+                        "Whether to perform extraction using a browser request "
+                        "(browserHtml) or an HTTP request (httpResponseBody)."
                     ),
-                    "pattern": r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$",
+                    "enumMeta": {
+                        "browserHtml": {
+                            "description": "Use browser rendering. Often provides the best quality.",
+                            "title": "browserHtml",
+                        },
+                        "httpResponseBody": {
+                            "description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.",
+                            "title": "httpResponseBody",
+                        },
+                    },
+                    "title": "Extraction source",
+                    "enum": ["httpResponseBody", "browserHtml"],
+                },
+                "crawl_strategy": {
+                    "default": "full",
+                    "description": "Determines how the start URL and follow-up URLs are crawled.",
+                    "enumMeta": {
+                        "full": {
+                            "description": "Follow most links within the domain of URL in an attempt to discover and extract as many products as possible.",
+                            "title": "Full",
+                        },
+                        "navigation": {
+                            "description": (
+                                "Follow pagination, subcategories, and "
+                                "product detail pages. Pagination Only is a "
+                                "better choice if the target URL does not "
+                                "have subcategories, or if Zyte API is "
+                                "misidentifying some URLs as subcategories."
+                            ),
+                            "title": "Navigation",
+                        },
+                        "pagination_only": {
+                            "description": (
+                                "Follow pagination and product detail pages. Subcategory links are ignored."
+                            ),
+                            "title": "Pagination Only",
+                        },
+                    },
+                    "title": "Crawl strategy",
+                    "enum": ["full", "navigation", "pagination_only"],
+                    "type": "string",
                 },
             },
             "required": ["url"],
             "title": "EcommerceSpiderParams",
             "type": "object",
         },
     }
-    geolocation = metadata["param_schema"]["properties"]["geolocation"]
+    assertEqualJson(actual_metadata, expected_metadata)
+
+    geolocation = actual_metadata["param_schema"]["properties"]["geolocation"]
     assert geolocation["enum"][0] == "AF"
     assert geolocation["enumMeta"]["UY"] == {"title": "Uruguay (UY)"}
     assert set(geolocation["enum"]) == set(geolocation["enumMeta"])

diff --git a/zyte_spider_templates/params.py b/zyte_spider_templates/params.py
@@ -0,0 +1,85 @@
+from enum import Enum
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+from zyte_spider_templates._geolocations import (
+    GEOLOCATION_OPTIONS_WITH_CODE,
+    Geolocation,
+)
+from zyte_spider_templates.documentation import document_enum
+
+
+@document_enum
+class ExtractFrom(str, Enum):
+    httpResponseBody: str = "httpResponseBody"
+    """Use HTTP responses. Cost-efficient and fast extraction method, which
+    works well on many websites."""
+
+    browserHtml: str = "browserHtml"
+    """Use browser rendering. Often provides the best quality."""
+
+
+class ExtractFromParam(BaseModel):
+    extract_from: Optional[ExtractFrom] = Field(
+        title="Extraction source",
+        description=(
+            "Whether to perform extraction using a browser request "
+            "(browserHtml) or an HTTP request (httpResponseBody)."
+        ),
+        default=None,
+        json_schema_extra={
+            "enumMeta": {
+                ExtractFrom.browserHtml: {
+                    "title": "browserHtml",
+                    "description": "Use browser rendering. Often provides the best quality.",
+                },
+                ExtractFrom.httpResponseBody: {
+                    "title": "httpResponseBody",
+                    "description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.",
+                },
+            },
+        },
+    )
+
+
+class GeolocationParam(BaseModel):
+    geolocation: Optional[Geolocation] = Field(
+        title="Geolocation",
+        description="ISO 3166-1 alpha-2 2-character string specified in "
+        "https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/request/geolocation.",
+        default=None,
+        json_schema_extra={
+            "enumMeta": {
+                code: {
+                    "title": GEOLOCATION_OPTIONS_WITH_CODE[code],
+                }
+                for code in Geolocation
+            }
+        },
+    )
+
+
+class MaxRequestsParam(BaseModel):
+    max_requests: Optional[int] = Field(
+        description=(
+            "The maximum number of Zyte API requests allowed for the crawl.\n"
+            "\n"
+            "Requests with error responses that cannot be retried or exceed "
+            "their retry limit also count here, but they incur in no costs "
+            "and do not increase the request count in Scrapy Cloud."
+        ),
+        default=100,
+        json_schema_extra={
+            "widget": "request-limit",
+        },
+    )
+
+
+class UrlParam(BaseModel):
+    url: str = Field(
+        title="URL",
+        description="Initial URL for the crawl. Enter the full URL including http(s), "
+        "you can copy and paste it from your browser. Example: https://toscrape.com/",
+        pattern=r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$",
+    )