IBM · hmtbr · Oct 18, 2024 · Oct 18, 2024 · matouma · Oct 18, 2024
diff --git a/data-connector-lib/src/dpk_connector/core/crawler.py b/data-connector-lib/src/dpk_connector/core/crawler.py
@@ -74,6 +74,7 @@ def async_crawl(
  user_agent: str = "",
  headers: dict[str, str] = {},
  allow_domains: Collection[str] = (),
+ subdomain_focus: bool = False,
  path_focus: bool = False,
  allow_mime_types: Collection[str] = (
  "application/pdf",
@@ -96,6 +97,7 @@ def async_crawl(
  user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
  headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
  allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
+ subdomain_focus (bool): If True, the crawl is restricted to the fully qualified domain names of the seed URLs (e.g. `blog.example.com`) instead of their registered domains (e.g. `example.com`). Ignored if `allow_domains` is specified.
  path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
  allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
  disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
@@ -140,6 +142,7 @@ def async_crawl(
  seed_urls=seed_urls,
  callback=on_downloaded,
  allow_domains=allow_domains,
+ subdomain_focus=subdomain_focus,
  path_focus=path_focus,
  allow_mime_types=allow_mime_types,
  disallow_mime_types=disallow_mime_types,
@@ -155,6 +158,7 @@ def crawl(
  user_agent: str = "",
  headers: dict[str, str] = {},
  allow_domains: Collection[str] = (),
+ subdomain_focus: bool = False,
  path_focus: bool = False,
  allow_mime_types: Collection[str] = (
  "application/pdf",
@@ -177,6 +181,7 @@ def crawl(
  user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
  headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
  allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
+ subdomain_focus (bool): If True, the crawl is restricted to the fully qualified domain names of the seed URLs (e.g. `blog.example.com`) instead of their registered domains (e.g. `example.com`). Ignored if `allow_domains` is specified.
  path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
  allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
  disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
@@ -198,6 +203,7 @@ def on_completed(result: Any):
  user_agent,
  headers,
  allow_domains,
+ subdomain_focus,
  path_focus,
  allow_mime_types,
  disallow_mime_types,

diff --git a/data-connector-lib/src/dpk_connector/core/spiders/sitemap.py b/data-connector-lib/src/dpk_connector/core/spiders/sitemap.py
@@ -28,6 +28,7 @@
  get_content_type,
  get_etld1,
  get_focus_path,
+ get_fqdn,
  is_allowed_path,
  urlparse_cached,
 )
@@ -42,6 +43,7 @@ def __init__(
  self,
  seed_urls: Collection[str],
  allow_domains: Collection[str] = (),
+ subdomain_focus: bool = False,
  path_focus: bool = False,
  allow_mime_types: Collection[str] = (),
  disallow_mime_types: Collection[str] = (),
@@ -88,11 +90,15 @@ def __init__(
  self.focus_paths.add(path)
 
  # Domains and mime types filtering
- self.allowed_domains = set(
- allow_domains
- if len(allow_domains) > 0
- else [get_etld1(url) for url in seed_urls]
- )
+ if allow_domains:
+ self.allowed_domains = set(allow_domains)
+ elif subdomain_focus:
+ self.allowed_domains = set()
+ for url in seed_urls:
+ if fqdn := get_fqdn(url):
+ self.allowed_domains.add(fqdn)
+ else:
+ self.allowed_domains = set(get_etld1(url) for url in seed_urls)
  self.allow_mime_types = set(
  [m.lower() for m in allow_mime_types] if len(allow_mime_types) > 0 else ()
  )
@@ -155,7 +161,9 @@ def start_requests(self):
  )
 
  def _parse_sitemap(self, response: Response):
- yield ConnectorItem(dropped=False, downloaded=False, system_request=True, sitemap=True)
+ yield ConnectorItem(
+ dropped=False, downloaded=False, system_request=True, sitemap=True
+ )
 
  seed_url = response.meta["seed_url"]
 

diff --git a/data-connector-lib/src/dpk_connector/core/utils.py b/data-connector-lib/src/dpk_connector/core/utils.py
@@ -57,6 +57,11 @@ def get_etld1(url: str) -> str:
  return f"{ext.domain}.{ext.suffix}"
 
 
+def get_fqdn(url: str) -> str:  # Return the `fqdn` attribute of tldextract's parse of `url`.
+ ext = tldextract.extract(url)
+ return ext.fqdn  # NOTE(review): test_get_fqdn in this patch expects "" for "http://localhost:8080/"; callers should treat "" as "no FQDN".
+
+
 def get_focus_path(url: str) -> str | None:
  parts = urlparse_cached(url)
  if len(parts.path.split("/")) > 2:

diff --git a/data-connector-lib/test/dpk_connector/core/test_sitemap_spider.py b/data-connector-lib/test/dpk_connector/core/test_sitemap_spider.py
@@ -1,13 +1,12 @@
 from pathlib import Path
 
 import pytest
+from dpk_connector.core.item import ConnectorItem
+from dpk_connector.core.spiders.sitemap import BaseSitemapSpider, ConnectorSitemapSpider
 from scrapy import Request
 from scrapy.crawler import Crawler
 from scrapy.http import HtmlResponse
 
-from dpk_connector.core.item import ConnectorItem
-from dpk_connector.core.spiders.sitemap import BaseSitemapSpider, ConnectorSitemapSpider
-
 
 @pytest.fixture
 def crawler() -> Crawler:
@@ -22,6 +21,21 @@ def crawler() -> Crawler:
  return crawler
 
 
+def test_init_subdomain_focus():  # With subdomain_focus=True and no allow_domains, allowed_domains holds each seed's full host name.
+ spider = BaseSitemapSpider(
+ seed_urls=(
+ "http://blog.example.com/",
+ "http://contents.example.com/",
+ ),
+ subdomain_focus=True,
+ )
+ assert spider.seed_urls == {  # seeds are stored as given
+ "http://blog.example.com/",
+ "http://contents.example.com/",
+ }
+ assert spider.allowed_domains == {"blog.example.com", "contents.example.com"}  # per-seed FQDNs, not the shared "example.com"
+
+
 def test_init_path_focus():
  spider = BaseSitemapSpider(
  seed_urls=(

diff --git a/data-connector-lib/test/dpk_connector/core/test_utils.py b/data-connector-lib/test/dpk_connector/core/test_utils.py
@@ -7,6 +7,7 @@
  get_content_type,
  get_etld1,
  get_focus_path,
+ get_fqdn,
  get_header_value,
  get_mime_type,
  is_allowed_path,
@@ -83,6 +84,21 @@ def test_get_etld1(url: str, expected: str):
  assert get_etld1(url) == expected
 
 
+@pytest.mark.parametrize(
+ "url,expected",
+ [
+ ("http://www.example.com", "www.example.com"),  # plain host
+ ("https://www.example.co.uk", "www.example.co.uk"),  # multi-label suffix
+ ("http://www.example.com/path?query=string#fragment", "www.example.com"),  # path, query and fragment are not part of the result
+ ("http://localhost:8080/", ""),  # localhost case: expected result is the empty string
+ ("http://www.example.com:8080/", "www.example.com"),  # port is not part of the result
+ ("http://www.sub.example.com:8080/", "www.sub.example.com"),  # nested subdomain labels are kept
+ ],
+)
+def test_get_fqdn(url: str, expected: str):  # Check get_fqdn against each (url, expected) pair above.
+ assert get_fqdn(url) == expected
+
+
 @pytest.mark.parametrize(
  "url,expected",
  [